From fe2b3ba4ad21679f8e4d57cc8b82004a22fb723f Mon Sep 17 00:00:00 2001 From: plaguss Date: Thu, 7 Nov 2024 08:31:00 +0000 Subject: [PATCH] Deployed 1997ef23 to pr-1053 with MkDocs 1.6.1 and mike 2.1.3 --- pr-1053/api/models/llm/llm_gallery/index.html | 2017 +++++++++-------- pr-1053/search/search_index.json | 2 +- 2 files changed, 1092 insertions(+), 927 deletions(-) diff --git a/pr-1053/api/models/llm/llm_gallery/index.html b/pr-1053/api/models/llm/llm_gallery/index.html index 0d52455e95..32e1f5672c 100644 --- a/pr-1053/api/models/llm/llm_gallery/index.html +++ b/pr-1053/api/models/llm/llm_gallery/index.html @@ -21946,8 +21946,7 @@

684 685 686 -687 -688
class OpenAILLM(AsyncLLM):
+687
class OpenAILLM(AsyncLLM):
     """OpenAI LLM implementation running the async API client.
 
     Attributes:
@@ -22148,453 +22147,452 @@ 

response_format: Optional[Dict[str, str]] = None, ) -> GenerateOutput: """Generates `num_generations` responses for the given input using the OpenAI async - client. + client. - Args: - input: a single input in chat format to generate responses for. - num_generations: the number of generations to create per input. Defaults to - `1`. - max_new_tokens: the maximum number of new tokens that the model will generate. - Defaults to `128`. - frequency_penalty: the repetition penalty to use for the generation. Defaults - to `0.0`. - presence_penalty: the presence penalty to use for the generation. Defaults to - `0.0`. - temperature: the temperature to use for the generation. Defaults to `0.1`. - top_p: the top-p value to use for the generation. Defaults to `1.0`. - stop: a string or a list of strings to use as a stop sequence for the generation. - Defaults to `None`. - response_format: the format of the response to return. Must be one of - "text" or "json". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode) - for more information on how to use the JSON model from OpenAI. Defaults to None - which returns text. To return JSON, use {"type": "json_object"}. - ) - - Note: - If response_format - - Returns: - A list of lists of strings containing the generated responses for each input. 
- """ - - structured_output = None - if isinstance(input, tuple): - input, structured_output = input - result = self._prepare_structured_output( - structured_output=structured_output, # type: ignore - client=self._aclient, - framework="openai", - ) - self._aclient = result.get("client") # type: ignore - - if structured_output is None and self.structured_output is not None: - structured_output = self.structured_output - - kwargs = { - "messages": input, # type: ignore - "model": self.model, - "max_tokens": max_new_tokens, - "n": num_generations, - "frequency_penalty": frequency_penalty, - "presence_penalty": presence_penalty, - "temperature": temperature, - "top_p": top_p, - "stop": stop, - } - - if response_format is not None: - kwargs["response_format"] = response_format - - if structured_output: - kwargs = self._prepare_kwargs(kwargs, structured_output) # type: ignore - - completion = await self._aclient.chat.completions.create(**kwargs) # type: ignore - - if structured_output: - return self._generations_from_structured_output(completion) - - return self._generations_from_openai_completion(completion) - - def _generations_from_structured_output( - self, completion: "BaseModel" - ) -> "GenerateOutput": - """Get the generations from the structured output object. - - Args: - completion: an instance of `pydantic.BaseModel` with the content of the structuted - output. - - Returns: - A list with the content of the structured output. - """ - return [completion.model_dump_json()] - - def _generations_from_openai_completion( - self, completion: "OpenAIChatCompletion" - ) -> "GenerateOutput": - """Get the generations from the OpenAI Chat Completion object. - - Args: - completion: the completion object to get the generations from. - - Returns: - A list of strings containing the generated responses for the input. 
- """ - generations = [] - for choice in completion.choices: - if (content := choice.message.content) is None: - self._logger.warning( # type: ignore - f"Received no response using OpenAI client (model: '{self.model}')." - f" Finish reason was: {choice.finish_reason}" - ) - generations.append(content) - return generations - - def offline_batch_generate( - self, - inputs: Union[List["FormattedInput"], None] = None, - num_generations: int = 1, - max_new_tokens: int = 128, - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - temperature: float = 1.0, - top_p: float = 1.0, - stop: Optional[Union[str, List[str]]] = None, - response_format: Optional[str] = None, - **kwargs: Any, - ) -> List["GenerateOutput"]: - """Uses the OpenAI batch API to generate `num_generations` responses for the given - inputs. - - Args: - inputs: a list of inputs in chat format to generate responses for. - num_generations: the number of generations to create per input. Defaults to - `1`. - max_new_tokens: the maximum number of new tokens that the model will generate. - Defaults to `128`. - frequency_penalty: the repetition penalty to use for the generation. Defaults - to `0.0`. - presence_penalty: the presence penalty to use for the generation. Defaults to - `0.0`. - temperature: the temperature to use for the generation. Defaults to `0.1`. - top_p: the top-p value to use for the generation. Defaults to `1.0`. - stop: a string or a list of strings to use as a stop sequence for the generation. - Defaults to `None`. - response_format: the format of the response to return. Must be one of - "text" or "json". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode) - for more information on how to use the JSON model from OpenAI. Defaults to `text`. - - Returns: - A list of lists of strings containing the generated responses for each input - in `inputs`. 
- - Raises: - DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation - is not finished yet. - ValueError: if no job IDs were found to retrieve the results from. - """ - if self.jobs_ids: - return self._check_and_get_batch_results() - - if inputs: - self.jobs_ids = self._create_jobs( - inputs=inputs, - **{ - "model": self.model, - "max_tokens": max_new_tokens, - "n": num_generations, - "frequency_penalty": frequency_penalty, - "presence_penalty": presence_penalty, - "temperature": temperature, - "top_p": top_p, - "stop": stop, - "response_format": response_format, - }, - ) - raise DistilabelOfflineBatchGenerationNotFinishedException( - jobs_ids=self.jobs_ids - ) - - raise ValueError("No `inputs` were provided and no `jobs_ids` were found.") - - def _check_and_get_batch_results(self) -> List["GenerateOutput"]: - """Checks the status of the batch jobs and retrieves the results from the OpenAI - Batch API. - - Returns: - A list of lists of strings containing the generated responses for each input. - - Raises: - ValueError: if no job IDs were found to retrieve the results from. - DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation - is not finished yet. - RuntimeError: if the only batch job found failed. - """ - if not self.jobs_ids: - raise ValueError("No job IDs were found to retrieve the results from.") - - outputs = [] - for batch_id in self.jobs_ids: - batch = self._get_openai_batch(batch_id) - - if batch.status in ("validating", "in_progress", "finalizing"): - raise DistilabelOfflineBatchGenerationNotFinishedException( - jobs_ids=self.jobs_ids - ) - - if batch.status in ("failed", "expired", "cancelled", "cancelling"): - self._logger.error( # type: ignore - f"OpenAI API batch with ID '{batch_id}' failed with status '{batch.status}'." 
- ) - if len(self.jobs_ids) == 1: - self.jobs_ids = None - raise RuntimeError( - f"The only OpenAI API Batch that was created with ID '{batch_id}'" - f" failed with status '{batch.status}'." - ) - - continue - - outputs.extend(self._retrieve_batch_results(batch)) - - # sort by `custom_id` to return the results in the same order as the inputs - outputs = sorted(outputs, key=lambda x: int(x["custom_id"])) - return [self._parse_output(output) for output in outputs] - - def _parse_output(self, output: Dict[str, Any]) -> "GenerateOutput": - """Parses the output from the OpenAI Batch API into a list of strings. - - Args: - output: the output to parse. - - Returns: - A list of strings containing the generated responses for the input. - """ - from openai.types.chat import ChatCompletion as OpenAIChatCompletion - - if "response" not in output: - return [] - - if output["response"]["status_code"] != 200: - return [] - - return self._generations_from_openai_completion( - OpenAIChatCompletion(**output["response"]["body"]) - ) - - def _get_openai_batch(self, batch_id: str) -> "OpenAIBatch": - """Gets a batch from the OpenAI Batch API. - - Args: - batch_id: the ID of the batch to retrieve. - - Returns: - The batch retrieved from the OpenAI Batch API. - - Raises: - openai.OpenAIError: if there was an error while retrieving the batch from the - OpenAI Batch API. - """ - import openai - - try: - return self._client.batches.retrieve(batch_id) - except openai.OpenAIError as e: - self._logger.error( # type: ignore - f"Error while retrieving batch '{batch_id}' from OpenAI: {e}" - ) - raise e - - def _retrieve_batch_results(self, batch: "OpenAIBatch") -> List[Dict[str, Any]]: - """Retrieves the results of a batch from its output file, parsing the JSONL content - into a list of dictionaries. - - Args: - batch: the batch to retrieve the results from. - - Returns: - A list of dictionaries containing the results of the batch. 
- - Raises: - AssertionError: if no output file ID was found in the batch. - """ - import openai - - assert batch.output_file_id, "No output file ID was found in the batch." - - try: - file_response = self._client.files.content(batch.output_file_id) - return [orjson.loads(line) for line in file_response.text.splitlines()] - except openai.OpenAIError as e: - self._logger.error( # type: ignore - f"Error while retrieving batch results from file '{batch.output_file_id}': {e}" - ) - return [] - - def _create_jobs( - self, inputs: List["FormattedInput"], **kwargs: Any - ) -> Tuple[str, ...]: - """Creates jobs in the OpenAI Batch API to generate responses for the given inputs. - - Args: - inputs: a list of inputs in chat format to generate responses for. - kwargs: the keyword arguments to use for the generation. - - Returns: - A list of job IDs created in the OpenAI Batch API. - """ - batch_input_files = self._create_batch_files(inputs=inputs, **kwargs) - jobs = [] - for batch_input_file in batch_input_files: - if batch := self._create_batch_api_job(batch_input_file): - jobs.append(batch.id) - return tuple(jobs) - - def _create_batch_api_job( - self, batch_input_file: "OpenAIFileObject" - ) -> Union["OpenAIBatch", None]: - """Creates a job in the OpenAI Batch API to generate responses for the given input - file. - - Args: - batch_input_file: the input file to generate responses for. - - Returns: - The batch job created in the OpenAI Batch API. 
- """ - import openai - - metadata = {"description": "distilabel"} - - if distilabel_pipeline_name := envs.DISTILABEL_PIPELINE_NAME: - metadata["distilabel_pipeline_name"] = distilabel_pipeline_name - - if distilabel_pipeline_cache_id := envs.DISTILABEL_PIPELINE_CACHE_ID: - metadata["distilabel_pipeline_cache_id"] = distilabel_pipeline_cache_id - - batch = None - try: - batch = self._client.batches.create( - completion_window="24h", - endpoint="/v1/chat/completions", - input_file_id=batch_input_file.id, - metadata=metadata, - ) - except openai.OpenAIError as e: - self._logger.error( # type: ignore - f"Error while creating OpenAI Batch API job for file with ID" - f" '{batch_input_file.id}': {e}." - ) - raise e - return batch - - def _create_batch_files( - self, inputs: List["FormattedInput"], **kwargs: Any - ) -> List["OpenAIFileObject"]: - """Creates the necessary input files for the batch API to generate responses. The - maximum size of each file so the OpenAI Batch API can process it is 100MB, so we - need to split the inputs into multiple files if necessary. - - More information: https://platform.openai.com/docs/api-reference/files/create - - Args: - inputs: a list of inputs in chat format to generate responses for, optionally - including structured output. - kwargs: the keyword arguments to use for the generation. - - Returns: - The list of file objects created for the OpenAI Batch API. - - Raises: - openai.OpenAIError: if there was an error while creating the batch input file - in the OpenAI Batch API. 
- """ - import openai - - files = [] - for file_no, buffer in enumerate( - self._create_jsonl_buffers(inputs=inputs, **kwargs) - ): - try: - # TODO: add distilabel pipeline name and id - batch_input_file = self._client.files.create( - file=(self._name_for_openai_files(file_no), buffer), - purpose="batch", - ) - files.append(batch_input_file) - except openai.OpenAIError as e: - self._logger.error( # type: ignore - f"Error while creating OpenAI batch input file: {e}" - ) - raise e - return files - - def _create_jsonl_buffers( - self, inputs: List["FormattedInput"], **kwargs: Any - ) -> Generator[io.BytesIO, None, None]: - """Creates a generator of buffers containing the JSONL formatted inputs to be - used by the OpenAI Batch API. The buffers created are of size 100MB or less. - - Args: - inputs: a list of inputs in chat format to generate responses for, optionally - including structured output. - kwargs: the keyword arguments to use for the generation. - - Yields: - A buffer containing the JSONL formatted inputs to be used by the OpenAI Batch - API. - """ - buffer = io.BytesIO() - buffer_current_size = 0 - for i, input in enumerate(inputs): - # We create the smallest `custom_id` so we don't increase the size of the file - # to much, but we can still sort the results with the order of the inputs. - row = self._create_jsonl_row(input=input, custom_id=str(i), **kwargs) - row_size = len(row) - if row_size + buffer_current_size > _OPENAI_BATCH_API_MAX_FILE_SIZE: - buffer.seek(0) - yield buffer - buffer = io.BytesIO() - buffer_current_size = 0 - buffer.write(row) - buffer_current_size += row_size - - if buffer_current_size > 0: - buffer.seek(0) - yield buffer - - def _create_jsonl_row( - self, input: "FormattedInput", custom_id: str, **kwargs: Any - ) -> bytes: - """Creates a JSONL formatted row to be used by the OpenAI Batch API. - - Args: - input: a list of inputs in chat format to generate responses for, optionally - including structured output. 
- custom_id: a custom ID to use for the row. - kwargs: the keyword arguments to use for the generation. - - Returns: - A JSONL formatted row to be used by the OpenAI Batch API. - """ - # TODO: depending on the format of the input, add `response_format` to the kwargs - row = { - "custom_id": custom_id, - "method": "POST", - "url": "/v1/chat/completions", - "body": {"messages": input, **kwargs}, - } - json_row = orjson.dumps(row) - return json_row + b"\n" - - def _name_for_openai_files(self, file_no: int) -> str: - if ( - envs.DISTILABEL_PIPELINE_NAME is None - or envs.DISTILABEL_PIPELINE_CACHE_ID is None - ): - return f"distilabel-pipeline-fileno-{file_no}.jsonl" - - return f"distilabel-pipeline-{envs.DISTILABEL_PIPELINE_NAME}-{envs.DISTILABEL_PIPELINE_CACHE_ID}-fileno-{file_no}.jsonl" + Args: + input: a single input in chat format to generate responses for. + num_generations: the number of generations to create per input. Defaults to + `1`. + max_new_tokens: the maximum number of new tokens that the model will generate. + Defaults to `128`. + frequency_penalty: the repetition penalty to use for the generation. Defaults + to `0.0`. + presence_penalty: the presence penalty to use for the generation. Defaults to + `0.0`. + temperature: the temperature to use for the generation. Defaults to `0.1`. + top_p: the top-p value to use for the generation. Defaults to `1.0`. + stop: a string or a list of strings to use as a stop sequence for the generation. + Defaults to `None`. + response_format: the format of the response to return. Must be one of + "text" or "json". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode) + for more information on how to use the JSON model from OpenAI. Defaults to None + which returns text. To return JSON, use {"type": "json_object"}. + + Note: + If response_format + + Returns: + A list of lists of strings containing the generated responses for each input. 
+ """ + + structured_output = None + if isinstance(input, tuple): + input, structured_output = input + result = self._prepare_structured_output( + structured_output=structured_output, # type: ignore + client=self._aclient, + framework="openai", + ) + self._aclient = result.get("client") # type: ignore + + if structured_output is None and self.structured_output is not None: + structured_output = self.structured_output + + kwargs = { + "messages": input, # type: ignore + "model": self.model, + "max_tokens": max_new_tokens, + "n": num_generations, + "frequency_penalty": frequency_penalty, + "presence_penalty": presence_penalty, + "temperature": temperature, + "top_p": top_p, + "stop": stop, + } + + if response_format is not None: + kwargs["response_format"] = response_format + + if structured_output: + kwargs = self._prepare_kwargs(kwargs, structured_output) # type: ignore + + completion = await self._aclient.chat.completions.create(**kwargs) # type: ignore + + if structured_output: + return self._generations_from_structured_output(completion) + + return self._generations_from_openai_completion(completion) + + def _generations_from_structured_output( + self, completion: "BaseModel" + ) -> "GenerateOutput": + """Get the generations from the structured output object. + + Args: + completion: an instance of `pydantic.BaseModel` with the content of the structuted + output. + + Returns: + A list with the content of the structured output. + """ + return [completion.model_dump_json()] + + def _generations_from_openai_completion( + self, completion: "OpenAIChatCompletion" + ) -> "GenerateOutput": + """Get the generations from the OpenAI Chat Completion object. + + Args: + completion: the completion object to get the generations from. + + Returns: + A list of strings containing the generated responses for the input. 
+ """ + generations = [] + for choice in completion.choices: + if (content := choice.message.content) is None: + self._logger.warning( # type: ignore + f"Received no response using OpenAI client (model: '{self.model}')." + f" Finish reason was: {choice.finish_reason}" + ) + generations.append(content) + return generations + + def offline_batch_generate( + self, + inputs: Union[List["FormattedInput"], None] = None, + num_generations: int = 1, + max_new_tokens: int = 128, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + temperature: float = 1.0, + top_p: float = 1.0, + stop: Optional[Union[str, List[str]]] = None, + response_format: Optional[str] = None, + **kwargs: Any, + ) -> List["GenerateOutput"]: + """Uses the OpenAI batch API to generate `num_generations` responses for the given + inputs. + + Args: + inputs: a list of inputs in chat format to generate responses for. + num_generations: the number of generations to create per input. Defaults to + `1`. + max_new_tokens: the maximum number of new tokens that the model will generate. + Defaults to `128`. + frequency_penalty: the repetition penalty to use for the generation. Defaults + to `0.0`. + presence_penalty: the presence penalty to use for the generation. Defaults to + `0.0`. + temperature: the temperature to use for the generation. Defaults to `0.1`. + top_p: the top-p value to use for the generation. Defaults to `1.0`. + stop: a string or a list of strings to use as a stop sequence for the generation. + Defaults to `None`. + response_format: the format of the response to return. Must be one of + "text" or "json". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode) + for more information on how to use the JSON model from OpenAI. Defaults to `text`. + + Returns: + A list of lists of strings containing the generated responses for each input + in `inputs`. 
+ + Raises: + DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation + is not finished yet. + ValueError: if no job IDs were found to retrieve the results from. + """ + if self.jobs_ids: + return self._check_and_get_batch_results() + + if inputs: + self.jobs_ids = self._create_jobs( + inputs=inputs, + **{ + "model": self.model, + "max_tokens": max_new_tokens, + "n": num_generations, + "frequency_penalty": frequency_penalty, + "presence_penalty": presence_penalty, + "temperature": temperature, + "top_p": top_p, + "stop": stop, + "response_format": response_format, + }, + ) + raise DistilabelOfflineBatchGenerationNotFinishedException( + jobs_ids=self.jobs_ids + ) + + raise ValueError("No `inputs` were provided and no `jobs_ids` were found.") + + def _check_and_get_batch_results(self) -> List["GenerateOutput"]: + """Checks the status of the batch jobs and retrieves the results from the OpenAI + Batch API. + + Returns: + A list of lists of strings containing the generated responses for each input. + + Raises: + ValueError: if no job IDs were found to retrieve the results from. + DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation + is not finished yet. + RuntimeError: if the only batch job found failed. + """ + if not self.jobs_ids: + raise ValueError("No job IDs were found to retrieve the results from.") + + outputs = [] + for batch_id in self.jobs_ids: + batch = self._get_openai_batch(batch_id) + + if batch.status in ("validating", "in_progress", "finalizing"): + raise DistilabelOfflineBatchGenerationNotFinishedException( + jobs_ids=self.jobs_ids + ) + + if batch.status in ("failed", "expired", "cancelled", "cancelling"): + self._logger.error( # type: ignore + f"OpenAI API batch with ID '{batch_id}' failed with status '{batch.status}'." 
+ ) + if len(self.jobs_ids) == 1: + self.jobs_ids = None + raise RuntimeError( + f"The only OpenAI API Batch that was created with ID '{batch_id}'" + f" failed with status '{batch.status}'." + ) + + continue + + outputs.extend(self._retrieve_batch_results(batch)) + + # sort by `custom_id` to return the results in the same order as the inputs + outputs = sorted(outputs, key=lambda x: int(x["custom_id"])) + return [self._parse_output(output) for output in outputs] + + def _parse_output(self, output: Dict[str, Any]) -> "GenerateOutput": + """Parses the output from the OpenAI Batch API into a list of strings. + + Args: + output: the output to parse. + + Returns: + A list of strings containing the generated responses for the input. + """ + from openai.types.chat import ChatCompletion as OpenAIChatCompletion + + if "response" not in output: + return [] + + if output["response"]["status_code"] != 200: + return [] + + return self._generations_from_openai_completion( + OpenAIChatCompletion(**output["response"]["body"]) + ) + + def _get_openai_batch(self, batch_id: str) -> "OpenAIBatch": + """Gets a batch from the OpenAI Batch API. + + Args: + batch_id: the ID of the batch to retrieve. + + Returns: + The batch retrieved from the OpenAI Batch API. + + Raises: + openai.OpenAIError: if there was an error while retrieving the batch from the + OpenAI Batch API. + """ + import openai + + try: + return self._client.batches.retrieve(batch_id) + except openai.OpenAIError as e: + self._logger.error( # type: ignore + f"Error while retrieving batch '{batch_id}' from OpenAI: {e}" + ) + raise e + + def _retrieve_batch_results(self, batch: "OpenAIBatch") -> List[Dict[str, Any]]: + """Retrieves the results of a batch from its output file, parsing the JSONL content + into a list of dictionaries. + + Args: + batch: the batch to retrieve the results from. + + Returns: + A list of dictionaries containing the results of the batch. 
+ + Raises: + AssertionError: if no output file ID was found in the batch. + """ + import openai + + assert batch.output_file_id, "No output file ID was found in the batch." + + try: + file_response = self._client.files.content(batch.output_file_id) + return [orjson.loads(line) for line in file_response.text.splitlines()] + except openai.OpenAIError as e: + self._logger.error( # type: ignore + f"Error while retrieving batch results from file '{batch.output_file_id}': {e}" + ) + return [] + + def _create_jobs( + self, inputs: List["FormattedInput"], **kwargs: Any + ) -> Tuple[str, ...]: + """Creates jobs in the OpenAI Batch API to generate responses for the given inputs. + + Args: + inputs: a list of inputs in chat format to generate responses for. + kwargs: the keyword arguments to use for the generation. + + Returns: + A list of job IDs created in the OpenAI Batch API. + """ + batch_input_files = self._create_batch_files(inputs=inputs, **kwargs) + jobs = [] + for batch_input_file in batch_input_files: + if batch := self._create_batch_api_job(batch_input_file): + jobs.append(batch.id) + return tuple(jobs) + + def _create_batch_api_job( + self, batch_input_file: "OpenAIFileObject" + ) -> Union["OpenAIBatch", None]: + """Creates a job in the OpenAI Batch API to generate responses for the given input + file. + + Args: + batch_input_file: the input file to generate responses for. + + Returns: + The batch job created in the OpenAI Batch API. 
+ """ + import openai + + metadata = {"description": "distilabel"} + + if distilabel_pipeline_name := envs.DISTILABEL_PIPELINE_NAME: + metadata["distilabel_pipeline_name"] = distilabel_pipeline_name + + if distilabel_pipeline_cache_id := envs.DISTILABEL_PIPELINE_CACHE_ID: + metadata["distilabel_pipeline_cache_id"] = distilabel_pipeline_cache_id + + batch = None + try: + batch = self._client.batches.create( + completion_window="24h", + endpoint="/v1/chat/completions", + input_file_id=batch_input_file.id, + metadata=metadata, + ) + except openai.OpenAIError as e: + self._logger.error( # type: ignore + f"Error while creating OpenAI Batch API job for file with ID" + f" '{batch_input_file.id}': {e}." + ) + raise e + return batch + + def _create_batch_files( + self, inputs: List["FormattedInput"], **kwargs: Any + ) -> List["OpenAIFileObject"]: + """Creates the necessary input files for the batch API to generate responses. The + maximum size of each file so the OpenAI Batch API can process it is 100MB, so we + need to split the inputs into multiple files if necessary. + + More information: https://platform.openai.com/docs/api-reference/files/create + + Args: + inputs: a list of inputs in chat format to generate responses for, optionally + including structured output. + kwargs: the keyword arguments to use for the generation. + + Returns: + The list of file objects created for the OpenAI Batch API. + + Raises: + openai.OpenAIError: if there was an error while creating the batch input file + in the OpenAI Batch API. 
+ """ + import openai + + files = [] + for file_no, buffer in enumerate( + self._create_jsonl_buffers(inputs=inputs, **kwargs) + ): + try: + # TODO: add distilabel pipeline name and id + batch_input_file = self._client.files.create( + file=(self._name_for_openai_files(file_no), buffer), + purpose="batch", + ) + files.append(batch_input_file) + except openai.OpenAIError as e: + self._logger.error( # type: ignore + f"Error while creating OpenAI batch input file: {e}" + ) + raise e + return files + + def _create_jsonl_buffers( + self, inputs: List["FormattedInput"], **kwargs: Any + ) -> Generator[io.BytesIO, None, None]: + """Creates a generator of buffers containing the JSONL formatted inputs to be + used by the OpenAI Batch API. The buffers created are of size 100MB or less. + + Args: + inputs: a list of inputs in chat format to generate responses for, optionally + including structured output. + kwargs: the keyword arguments to use for the generation. + + Yields: + A buffer containing the JSONL formatted inputs to be used by the OpenAI Batch + API. + """ + buffer = io.BytesIO() + buffer_current_size = 0 + for i, input in enumerate(inputs): + # We create the smallest `custom_id` so we don't increase the size of the file + # to much, but we can still sort the results with the order of the inputs. + row = self._create_jsonl_row(input=input, custom_id=str(i), **kwargs) + row_size = len(row) + if row_size + buffer_current_size > _OPENAI_BATCH_API_MAX_FILE_SIZE: + buffer.seek(0) + yield buffer + buffer = io.BytesIO() + buffer_current_size = 0 + buffer.write(row) + buffer_current_size += row_size + + if buffer_current_size > 0: + buffer.seek(0) + yield buffer + + def _create_jsonl_row( + self, input: "FormattedInput", custom_id: str, **kwargs: Any + ) -> bytes: + """Creates a JSONL formatted row to be used by the OpenAI Batch API. + + Args: + input: a list of inputs in chat format to generate responses for, optionally + including structured output. 
+ custom_id: a custom ID to use for the row. + kwargs: the keyword arguments to use for the generation. + + Returns: + A JSONL formatted row to be used by the OpenAI Batch API. + """ + # TODO: depending on the format of the input, add `response_format` to the kwargs + row = { + "custom_id": custom_id, + "method": "POST", + "url": "/v1/chat/completions", + "body": {"messages": input, **kwargs}, + } + json_row = orjson.dumps(row) + return json_row + b"\n" + + def _name_for_openai_files(self, file_no: int) -> str: + if ( + envs.DISTILABEL_PIPELINE_NAME is None + or envs.DISTILABEL_PIPELINE_CACHE_ID is None + ): + return f"distilabel-pipeline-fileno-{file_no}.jsonl" + + return f"distilabel-pipeline-{envs.DISTILABEL_PIPELINE_NAME}-{envs.DISTILABEL_PIPELINE_CACHE_ID}-fileno-{file_no}.jsonl"

@@ -22786,33 +22784,202 @@

Generates num_generations responses for the given input using the OpenAI async - client.

-
    Args:
-        input: a single input in chat format to generate responses for.
-        num_generations: the number of generations to create per input. Defaults to
-            `1`.
-        max_new_tokens: the maximum number of new tokens that the model will generate.
-            Defaults to `128`.
-        frequency_penalty: the repetition penalty to use for the generation. Defaults
-            to `0.0`.
-        presence_penalty: the presence penalty to use for the generation. Defaults to
-            `0.0`.
-        temperature: the temperature to use for the generation. Defaults to `0.1`.
-        top_p: the top-p value to use for the generation. Defaults to `1.0`.
-        stop: a string or a list of strings to use as a stop sequence for the generation.
-            Defaults to `None`.
-        response_format: the format of the response to return. Must be one of
-            "text" or "json". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)
-            for more information on how to use the JSON model from OpenAI. Defaults to None
-            which returns text. To return JSON, use {"type": "json_object"}.
-
-

)

-
    Note:
-        If response_format
-
-    Returns:
-        A list of lists of strings containing the generated responses for each input.
-
+client.

+ + +

Parameters:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
+ input + + FormattedInput + +
+

a single input in chat format to generate responses for.

+
+
+ required +
+ num_generations + + int + +
+

the number of generations to create per input. Defaults to +1.

+
+
+ 1 +
+ max_new_tokens + + int + +
+

the maximum number of new tokens that the model will generate. +Defaults to 128.

+
+
+ 128 +
+ frequency_penalty + + float + +
+

the repetition penalty to use for the generation. Defaults +to 0.0.

+
+
+ 0.0 +
+ presence_penalty + + float + +
+

the presence penalty to use for the generation. Defaults to +0.0.

+
+
+ 0.0 +
+ temperature + + float + +
+

the temperature to use for the generation. Defaults to 0.1.

+
+
+ 1.0 +
+ top_p + + float + +
+

the top-p value to use for the generation. Defaults to 1.0.

+
+
+ 1.0 +
+ stop + + Optional[Union[str, List[str]]] + +
+

a string or a list of strings to use as a stop sequence for the generation. +Defaults to None.

+
+
+ None +
+ response_format + + Optional[Dict[str, str]] + +
+

the format of the response to return. Must be one of +"text" or "json". Read the documentation here +for more information on how to use the JSON model from OpenAI. Defaults to None +which returns text. To return JSON, use {"type": "json_object"}.

+
+
+ None +
+ + +
+ Note +

If response_format

+
+ +

Returns:

+ + + + + + + + + + + + + +
TypeDescription
+ GenerateOutput + +
+

A list of lists of strings containing the generated responses for each input.

+
+
Source code in src/distilabel/models/llms/openai.py @@ -22894,8 +23061,7 @@
303 304 305 -306 -307
@validate_call
+306
@validate_call
 async def agenerate(  # type: ignore
     self,
     input: FormattedInput,
@@ -22909,72 +23075,71 @@ 
response_format: Optional[Dict[str, str]] = None, ) -> GenerateOutput: """Generates `num_generations` responses for the given input using the OpenAI async - client. + client. - Args: - input: a single input in chat format to generate responses for. - num_generations: the number of generations to create per input. Defaults to - `1`. - max_new_tokens: the maximum number of new tokens that the model will generate. - Defaults to `128`. - frequency_penalty: the repetition penalty to use for the generation. Defaults - to `0.0`. - presence_penalty: the presence penalty to use for the generation. Defaults to - `0.0`. - temperature: the temperature to use for the generation. Defaults to `0.1`. - top_p: the top-p value to use for the generation. Defaults to `1.0`. - stop: a string or a list of strings to use as a stop sequence for the generation. - Defaults to `None`. - response_format: the format of the response to return. Must be one of - "text" or "json". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode) - for more information on how to use the JSON model from OpenAI. Defaults to None - which returns text. To return JSON, use {"type": "json_object"}. - ) - - Note: - If response_format - - Returns: - A list of lists of strings containing the generated responses for each input. 
- """ - - structured_output = None - if isinstance(input, tuple): - input, structured_output = input - result = self._prepare_structured_output( - structured_output=structured_output, # type: ignore - client=self._aclient, - framework="openai", - ) - self._aclient = result.get("client") # type: ignore - - if structured_output is None and self.structured_output is not None: - structured_output = self.structured_output - - kwargs = { - "messages": input, # type: ignore - "model": self.model, - "max_tokens": max_new_tokens, - "n": num_generations, - "frequency_penalty": frequency_penalty, - "presence_penalty": presence_penalty, - "temperature": temperature, - "top_p": top_p, - "stop": stop, - } - - if response_format is not None: - kwargs["response_format"] = response_format - - if structured_output: - kwargs = self._prepare_kwargs(kwargs, structured_output) # type: ignore - - completion = await self._aclient.chat.completions.create(**kwargs) # type: ignore - - if structured_output: - return self._generations_from_structured_output(completion) - - return self._generations_from_openai_completion(completion) + Args: + input: a single input in chat format to generate responses for. + num_generations: the number of generations to create per input. Defaults to + `1`. + max_new_tokens: the maximum number of new tokens that the model will generate. + Defaults to `128`. + frequency_penalty: the repetition penalty to use for the generation. Defaults + to `0.0`. + presence_penalty: the presence penalty to use for the generation. Defaults to + `0.0`. + temperature: the temperature to use for the generation. Defaults to `0.1`. + top_p: the top-p value to use for the generation. Defaults to `1.0`. + stop: a string or a list of strings to use as a stop sequence for the generation. + Defaults to `None`. + response_format: the format of the response to return. Must be one of + "text" or "json". 
Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode) + for more information on how to use the JSON model from OpenAI. Defaults to None + which returns text. To return JSON, use {"type": "json_object"}. + + Note: + If response_format + + Returns: + A list of lists of strings containing the generated responses for each input. + """ + + structured_output = None + if isinstance(input, tuple): + input, structured_output = input + result = self._prepare_structured_output( + structured_output=structured_output, # type: ignore + client=self._aclient, + framework="openai", + ) + self._aclient = result.get("client") # type: ignore + + if structured_output is None and self.structured_output is not None: + structured_output = self.structured_output + + kwargs = { + "messages": input, # type: ignore + "model": self.model, + "max_tokens": max_new_tokens, + "n": num_generations, + "frequency_penalty": frequency_penalty, + "presence_penalty": presence_penalty, + "temperature": temperature, + "top_p": top_p, + "stop": stop, + } + + if response_format is not None: + kwargs["response_format"] = response_format + + if structured_output: + kwargs = self._prepare_kwargs(kwargs, structured_output) # type: ignore + + completion = await self._aclient.chat.completions.create(**kwargs) # type: ignore + + if structured_output: + return self._generations_from_structured_output(completion) + + return self._generations_from_openai_completion(completion)
@@ -23051,7 +23216,8 @@
Source code in src/distilabel/models/llms/openai.py -
309
+              
308
+309
 310
 311
 312
@@ -23062,20 +23228,19 @@ 
317 318 319 -320 -321
def _generations_from_structured_output(
-    self, completion: "BaseModel"
-) -> "GenerateOutput":
-    """Get the generations from the structured output object.
-
-    Args:
-        completion: an instance of `pydantic.BaseModel` with the content of the structuted
-            output.
-
-    Returns:
-        A list with the content of the structured output.
-    """
-    return [completion.model_dump_json()]
+320
def _generations_from_structured_output(
+    self, completion: "BaseModel"
+) -> "GenerateOutput":
+    """Get the generations from the structured output object.
+
+    Args:
+        completion: an instance of `pydantic.BaseModel` with the content of the structuted
+            output.
+
+    Returns:
+        A list with the content of the structured output.
+    """
+    return [completion.model_dump_json()]
 
@@ -23151,7 +23316,8 @@
Source code in src/distilabel/models/llms/openai.py -
323
+              
322
+323
 324
 325
 326
@@ -23169,27 +23335,26 @@ 
338 339 340 -341 -342
def _generations_from_openai_completion(
-    self, completion: "OpenAIChatCompletion"
-) -> "GenerateOutput":
-    """Get the generations from the OpenAI Chat Completion object.
-
-    Args:
-        completion: the completion object to get the generations from.
-
-    Returns:
-        A list of strings containing the generated responses for the input.
-    """
-    generations = []
-    for choice in completion.choices:
-        if (content := choice.message.content) is None:
-            self._logger.warning(  # type: ignore
-                f"Received no response using OpenAI client (model: '{self.model}')."
-                f" Finish reason was: {choice.finish_reason}"
-            )
-        generations.append(content)
-    return generations
+341
def _generations_from_openai_completion(
+    self, completion: "OpenAIChatCompletion"
+) -> "GenerateOutput":
+    """Get the generations from the OpenAI Chat Completion object.
+
+    Args:
+        completion: the completion object to get the generations from.
+
+    Returns:
+        A list of strings containing the generated responses for the input.
+    """
+    generations = []
+    for choice in completion.choices:
+        if (content := choice.message.content) is None:
+            self._logger.warning(  # type: ignore
+                f"Received no response using OpenAI client (model: '{self.model}')."
+                f" Finish reason was: {choice.finish_reason}"
+            )
+        generations.append(content)
+    return generations
 
@@ -23445,7 +23610,8 @@
Source code in src/distilabel/models/llms/openai.py -
344
+              
343
+344
 345
 346
 347
@@ -23509,73 +23675,72 @@ 
405 406 407 -408 -409
def offline_batch_generate(
-    self,
-    inputs: Union[List["FormattedInput"], None] = None,
-    num_generations: int = 1,
-    max_new_tokens: int = 128,
-    frequency_penalty: float = 0.0,
-    presence_penalty: float = 0.0,
-    temperature: float = 1.0,
-    top_p: float = 1.0,
-    stop: Optional[Union[str, List[str]]] = None,
-    response_format: Optional[str] = None,
-    **kwargs: Any,
-) -> List["GenerateOutput"]:
-    """Uses the OpenAI batch API to generate `num_generations` responses for the given
-    inputs.
-
-    Args:
-        inputs: a list of inputs in chat format to generate responses for.
-        num_generations: the number of generations to create per input. Defaults to
-            `1`.
-        max_new_tokens: the maximum number of new tokens that the model will generate.
-            Defaults to `128`.
-        frequency_penalty: the repetition penalty to use for the generation. Defaults
-            to `0.0`.
-        presence_penalty: the presence penalty to use for the generation. Defaults to
-            `0.0`.
-        temperature: the temperature to use for the generation. Defaults to `0.1`.
-        top_p: the top-p value to use for the generation. Defaults to `1.0`.
-        stop: a string or a list of strings to use as a stop sequence for the generation.
-            Defaults to `None`.
-        response_format: the format of the response to return. Must be one of
-            "text" or "json". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)
-            for more information on how to use the JSON model from OpenAI. Defaults to `text`.
-
-    Returns:
-        A list of lists of strings containing the generated responses for each input
-        in `inputs`.
-
-    Raises:
-        DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation
-            is not finished yet.
-        ValueError: if no job IDs were found to retrieve the results from.
-    """
-    if self.jobs_ids:
-        return self._check_and_get_batch_results()
-
-    if inputs:
-        self.jobs_ids = self._create_jobs(
-            inputs=inputs,
-            **{
-                "model": self.model,
-                "max_tokens": max_new_tokens,
-                "n": num_generations,
-                "frequency_penalty": frequency_penalty,
-                "presence_penalty": presence_penalty,
-                "temperature": temperature,
-                "top_p": top_p,
-                "stop": stop,
-                "response_format": response_format,
-            },
-        )
-        raise DistilabelOfflineBatchGenerationNotFinishedException(
-            jobs_ids=self.jobs_ids
-        )
-
-    raise ValueError("No `inputs` were provided and no `jobs_ids` were found.")
+408
def offline_batch_generate(
+    self,
+    inputs: Union[List["FormattedInput"], None] = None,
+    num_generations: int = 1,
+    max_new_tokens: int = 128,
+    frequency_penalty: float = 0.0,
+    presence_penalty: float = 0.0,
+    temperature: float = 1.0,
+    top_p: float = 1.0,
+    stop: Optional[Union[str, List[str]]] = None,
+    response_format: Optional[str] = None,
+    **kwargs: Any,
+) -> List["GenerateOutput"]:
+    """Uses the OpenAI batch API to generate `num_generations` responses for the given
+    inputs.
+
+    Args:
+        inputs: a list of inputs in chat format to generate responses for.
+        num_generations: the number of generations to create per input. Defaults to
+            `1`.
+        max_new_tokens: the maximum number of new tokens that the model will generate.
+            Defaults to `128`.
+        frequency_penalty: the repetition penalty to use for the generation. Defaults
+            to `0.0`.
+        presence_penalty: the presence penalty to use for the generation. Defaults to
+            `0.0`.
+        temperature: the temperature to use for the generation. Defaults to `0.1`.
+        top_p: the top-p value to use for the generation. Defaults to `1.0`.
+        stop: a string or a list of strings to use as a stop sequence for the generation.
+            Defaults to `None`.
+        response_format: the format of the response to return. Must be one of
+            "text" or "json". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)
+            for more information on how to use the JSON model from OpenAI. Defaults to `text`.
+
+    Returns:
+        A list of lists of strings containing the generated responses for each input
+        in `inputs`.
+
+    Raises:
+        DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation
+            is not finished yet.
+        ValueError: if no job IDs were found to retrieve the results from.
+    """
+    if self.jobs_ids:
+        return self._check_and_get_batch_results()
+
+    if inputs:
+        self.jobs_ids = self._create_jobs(
+            inputs=inputs,
+            **{
+                "model": self.model,
+                "max_tokens": max_new_tokens,
+                "n": num_generations,
+                "frequency_penalty": frequency_penalty,
+                "presence_penalty": presence_penalty,
+                "temperature": temperature,
+                "top_p": top_p,
+                "stop": stop,
+                "response_format": response_format,
+            },
+        )
+        raise DistilabelOfflineBatchGenerationNotFinishedException(
+            jobs_ids=self.jobs_ids
+        )
+
+    raise ValueError("No `inputs` were provided and no `jobs_ids` were found.")
 
@@ -23665,7 +23830,8 @@
Source code in src/distilabel/models/llms/openai.py -
411
+              
410
+411
 412
 413
 414
@@ -23706,50 +23872,49 @@ 
449 450 451 -452 -453
def _check_and_get_batch_results(self) -> List["GenerateOutput"]:
-    """Checks the status of the batch jobs and retrieves the results from the OpenAI
-    Batch API.
-
-    Returns:
-        A list of lists of strings containing the generated responses for each input.
-
-    Raises:
-        ValueError: if no job IDs were found to retrieve the results from.
-        DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation
-            is not finished yet.
-        RuntimeError: if the only batch job found failed.
-    """
-    if not self.jobs_ids:
-        raise ValueError("No job IDs were found to retrieve the results from.")
-
-    outputs = []
-    for batch_id in self.jobs_ids:
-        batch = self._get_openai_batch(batch_id)
-
-        if batch.status in ("validating", "in_progress", "finalizing"):
-            raise DistilabelOfflineBatchGenerationNotFinishedException(
-                jobs_ids=self.jobs_ids
-            )
-
-        if batch.status in ("failed", "expired", "cancelled", "cancelling"):
-            self._logger.error(  # type: ignore
-                f"OpenAI API batch with ID '{batch_id}' failed with status '{batch.status}'."
-            )
-            if len(self.jobs_ids) == 1:
-                self.jobs_ids = None
-                raise RuntimeError(
-                    f"The only OpenAI API Batch that was created with ID '{batch_id}'"
-                    f" failed with status '{batch.status}'."
-                )
-
-            continue
-
-        outputs.extend(self._retrieve_batch_results(batch))
-
-    # sort by `custom_id` to return the results in the same order as the inputs
-    outputs = sorted(outputs, key=lambda x: int(x["custom_id"]))
-    return [self._parse_output(output) for output in outputs]
+452
def _check_and_get_batch_results(self) -> List["GenerateOutput"]:
+    """Checks the status of the batch jobs and retrieves the results from the OpenAI
+    Batch API.
+
+    Returns:
+        A list of lists of strings containing the generated responses for each input.
+
+    Raises:
+        ValueError: if no job IDs were found to retrieve the results from.
+        DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation
+            is not finished yet.
+        RuntimeError: if the only batch job found failed.
+    """
+    if not self.jobs_ids:
+        raise ValueError("No job IDs were found to retrieve the results from.")
+
+    outputs = []
+    for batch_id in self.jobs_ids:
+        batch = self._get_openai_batch(batch_id)
+
+        if batch.status in ("validating", "in_progress", "finalizing"):
+            raise DistilabelOfflineBatchGenerationNotFinishedException(
+                jobs_ids=self.jobs_ids
+            )
+
+        if batch.status in ("failed", "expired", "cancelled", "cancelling"):
+            self._logger.error(  # type: ignore
+                f"OpenAI API batch with ID '{batch_id}' failed with status '{batch.status}'."
+            )
+            if len(self.jobs_ids) == 1:
+                self.jobs_ids = None
+                raise RuntimeError(
+                    f"The only OpenAI API Batch that was created with ID '{batch_id}'"
+                    f" failed with status '{batch.status}'."
+                )
+
+            continue
+
+        outputs.extend(self._retrieve_batch_results(batch))
+
+    # sort by `custom_id` to return the results in the same order as the inputs
+    outputs = sorted(outputs, key=lambda x: int(x["custom_id"]))
+    return [self._parse_output(output) for output in outputs]
 
@@ -23825,7 +23990,8 @@
Source code in src/distilabel/models/llms/openai.py -
455
+              
454
+455
 456
 457
 458
@@ -23843,27 +24009,26 @@ 
470 471 472 -473 -474
def _parse_output(self, output: Dict[str, Any]) -> "GenerateOutput":
-    """Parses the output from the OpenAI Batch API into a list of strings.
-
-    Args:
-        output: the output to parse.
-
-    Returns:
-        A list of strings containing the generated responses for the input.
-    """
-    from openai.types.chat import ChatCompletion as OpenAIChatCompletion
-
-    if "response" not in output:
-        return []
-
-    if output["response"]["status_code"] != 200:
-        return []
-
-    return self._generations_from_openai_completion(
-        OpenAIChatCompletion(**output["response"]["body"])
-    )
+473
def _parse_output(self, output: Dict[str, Any]) -> "GenerateOutput":
+    """Parses the output from the OpenAI Batch API into a list of strings.
+
+    Args:
+        output: the output to parse.
+
+    Returns:
+        A list of strings containing the generated responses for the input.
+    """
+    from openai.types.chat import ChatCompletion as OpenAIChatCompletion
+
+    if "response" not in output:
+        return []
+
+    if output["response"]["status_code"] != 200:
+        return []
+
+    return self._generations_from_openai_completion(
+        OpenAIChatCompletion(**output["response"]["body"])
+    )
 
@@ -23963,7 +24128,8 @@
Source code in src/distilabel/models/llms/openai.py -
476
+              
475
+476
 477
 478
 479
@@ -23983,29 +24149,28 @@ 
493 494 495 -496 -497
def _get_openai_batch(self, batch_id: str) -> "OpenAIBatch":
-    """Gets a batch from the OpenAI Batch API.
-
-    Args:
-        batch_id: the ID of the batch to retrieve.
-
-    Returns:
-        The batch retrieved from the OpenAI Batch API.
-
-    Raises:
-        openai.OpenAIError: if there was an error while retrieving the batch from the
-            OpenAI Batch API.
-    """
-    import openai
-
-    try:
-        return self._client.batches.retrieve(batch_id)
-    except openai.OpenAIError as e:
-        self._logger.error(  # type: ignore
-            f"Error while retrieving batch '{batch_id}' from OpenAI: {e}"
-        )
-        raise e
+496
def _get_openai_batch(self, batch_id: str) -> "OpenAIBatch":
+    """Gets a batch from the OpenAI Batch API.
+
+    Args:
+        batch_id: the ID of the batch to retrieve.
+
+    Returns:
+        The batch retrieved from the OpenAI Batch API.
+
+    Raises:
+        openai.OpenAIError: if there was an error while retrieving the batch from the
+            OpenAI Batch API.
+    """
+    import openai
+
+    try:
+        return self._client.batches.retrieve(batch_id)
+    except openai.OpenAIError as e:
+        self._logger.error(  # type: ignore
+            f"Error while retrieving batch '{batch_id}' from OpenAI: {e}"
+        )
+        raise e
 
@@ -24105,7 +24270,8 @@
Source code in src/distilabel/models/llms/openai.py -
499
+              
498
+499
 500
 501
 502
@@ -24128,32 +24294,31 @@ 
519 520 521 -522 -523
def _retrieve_batch_results(self, batch: "OpenAIBatch") -> List[Dict[str, Any]]:
-    """Retrieves the results of a batch from its output file, parsing the JSONL content
-    into a list of dictionaries.
-
-    Args:
-        batch: the batch to retrieve the results from.
-
-    Returns:
-        A list of dictionaries containing the results of the batch.
-
-    Raises:
-        AssertionError: if no output file ID was found in the batch.
-    """
-    import openai
-
-    assert batch.output_file_id, "No output file ID was found in the batch."
-
-    try:
-        file_response = self._client.files.content(batch.output_file_id)
-        return [orjson.loads(line) for line in file_response.text.splitlines()]
-    except openai.OpenAIError as e:
-        self._logger.error(  # type: ignore
-            f"Error while retrieving batch results from file '{batch.output_file_id}': {e}"
-        )
-        return []
+522
def _retrieve_batch_results(self, batch: "OpenAIBatch") -> List[Dict[str, Any]]:
+    """Retrieves the results of a batch from its output file, parsing the JSONL content
+    into a list of dictionaries.
+
+    Args:
+        batch: the batch to retrieve the results from.
+
+    Returns:
+        A list of dictionaries containing the results of the batch.
+
+    Raises:
+        AssertionError: if no output file ID was found in the batch.
+    """
+    import openai
+
+    assert batch.output_file_id, "No output file ID was found in the batch."
+
+    try:
+        file_response = self._client.files.content(batch.output_file_id)
+        return [orjson.loads(line) for line in file_response.text.splitlines()]
+    except openai.OpenAIError as e:
+        self._logger.error(  # type: ignore
+            f"Error while retrieving batch results from file '{batch.output_file_id}': {e}"
+        )
+        return []
 
@@ -24245,7 +24410,8 @@
Source code in src/distilabel/models/llms/openai.py -
525
+              
524
+525
 526
 527
 528
@@ -24261,25 +24427,24 @@ 
538 539 540 -541 -542
def _create_jobs(
-    self, inputs: List["FormattedInput"], **kwargs: Any
-) -> Tuple[str, ...]:
-    """Creates jobs in the OpenAI Batch API to generate responses for the given inputs.
-
-    Args:
-        inputs: a list of inputs in chat format to generate responses for.
-        kwargs: the keyword arguments to use for the generation.
-
-    Returns:
-        A list of job IDs created in the OpenAI Batch API.
-    """
-    batch_input_files = self._create_batch_files(inputs=inputs, **kwargs)
-    jobs = []
-    for batch_input_file in batch_input_files:
-        if batch := self._create_batch_api_job(batch_input_file):
-            jobs.append(batch.id)
-    return tuple(jobs)
+541
def _create_jobs(
+    self, inputs: List["FormattedInput"], **kwargs: Any
+) -> Tuple[str, ...]:
+    """Creates jobs in the OpenAI Batch API to generate responses for the given inputs.
+
+    Args:
+        inputs: a list of inputs in chat format to generate responses for.
+        kwargs: the keyword arguments to use for the generation.
+
+    Returns:
+        A list of job IDs created in the OpenAI Batch API.
+    """
+    batch_input_files = self._create_batch_files(inputs=inputs, **kwargs)
+    jobs = []
+    for batch_input_file in batch_input_files:
+        if batch := self._create_batch_api_job(batch_input_file):
+            jobs.append(batch.id)
+    return tuple(jobs)
 
@@ -24356,7 +24521,8 @@
Source code in src/distilabel/models/llms/openai.py -
544
+              
543
+544
 545
 546
 547
@@ -24391,44 +24557,43 @@ 
576 577 578 -579 -580
def _create_batch_api_job(
-    self, batch_input_file: "OpenAIFileObject"
-) -> Union["OpenAIBatch", None]:
-    """Creates a job in the OpenAI Batch API to generate responses for the given input
-    file.
-
-    Args:
-        batch_input_file: the input file to generate responses for.
-
-    Returns:
-        The batch job created in the OpenAI Batch API.
-    """
-    import openai
-
-    metadata = {"description": "distilabel"}
-
-    if distilabel_pipeline_name := envs.DISTILABEL_PIPELINE_NAME:
-        metadata["distilabel_pipeline_name"] = distilabel_pipeline_name
-
-    if distilabel_pipeline_cache_id := envs.DISTILABEL_PIPELINE_CACHE_ID:
-        metadata["distilabel_pipeline_cache_id"] = distilabel_pipeline_cache_id
-
-    batch = None
-    try:
-        batch = self._client.batches.create(
-            completion_window="24h",
-            endpoint="/v1/chat/completions",
-            input_file_id=batch_input_file.id,
-            metadata=metadata,
-        )
-    except openai.OpenAIError as e:
-        self._logger.error(  # type: ignore
-            f"Error while creating OpenAI Batch API job for file with ID"
-            f" '{batch_input_file.id}': {e}."
-        )
-        raise e
-    return batch
+579
def _create_batch_api_job(
+    self, batch_input_file: "OpenAIFileObject"
+) -> Union["OpenAIBatch", None]:
+    """Creates a job in the OpenAI Batch API to generate responses for the given input
+    file.
+
+    Args:
+        batch_input_file: the input file to generate responses for.
+
+    Returns:
+        The batch job created in the OpenAI Batch API.
+    """
+    import openai
+
+    metadata = {"description": "distilabel"}
+
+    if distilabel_pipeline_name := envs.DISTILABEL_PIPELINE_NAME:
+        metadata["distilabel_pipeline_name"] = distilabel_pipeline_name
+
+    if distilabel_pipeline_cache_id := envs.DISTILABEL_PIPELINE_CACHE_ID:
+        metadata["distilabel_pipeline_cache_id"] = distilabel_pipeline_cache_id
+
+    batch = None
+    try:
+        batch = self._client.batches.create(
+            completion_window="24h",
+            endpoint="/v1/chat/completions",
+            input_file_id=batch_input_file.id,
+            metadata=metadata,
+        )
+    except openai.OpenAIError as e:
+        self._logger.error(  # type: ignore
+            f"Error while creating OpenAI Batch API job for file with ID"
+            f" '{batch_input_file.id}': {e}."
+        )
+        raise e
+    return batch
 
@@ -24548,7 +24713,8 @@
Source code in src/distilabel/models/llms/openai.py -
582
+              
581
+582
 583
 584
 585
@@ -24586,47 +24752,46 @@ 
617 618 619 -620 -621
def _create_batch_files(
-    self, inputs: List["FormattedInput"], **kwargs: Any
-) -> List["OpenAIFileObject"]:
-    """Creates the necessary input files for the batch API to generate responses. The
-    maximum size of each file so the OpenAI Batch API can process it is 100MB, so we
-    need to split the inputs into multiple files if necessary.
-
-    More information: https://platform.openai.com/docs/api-reference/files/create
-
-    Args:
-        inputs: a list of inputs in chat format to generate responses for, optionally
-            including structured output.
-        kwargs: the keyword arguments to use for the generation.
-
-    Returns:
-        The list of file objects created for the OpenAI Batch API.
-
-    Raises:
-        openai.OpenAIError: if there was an error while creating the batch input file
-            in the OpenAI Batch API.
-    """
-    import openai
-
-    files = []
-    for file_no, buffer in enumerate(
-        self._create_jsonl_buffers(inputs=inputs, **kwargs)
-    ):
-        try:
-            # TODO: add distilabel pipeline name and id
-            batch_input_file = self._client.files.create(
-                file=(self._name_for_openai_files(file_no), buffer),
-                purpose="batch",
-            )
-            files.append(batch_input_file)
-        except openai.OpenAIError as e:
-            self._logger.error(  # type: ignore
-                f"Error while creating OpenAI batch input file: {e}"
-            )
-            raise e
-    return files
+620
def _create_batch_files(
+    self, inputs: List["FormattedInput"], **kwargs: Any
+) -> List["OpenAIFileObject"]:
+    """Creates the necessary input files for the batch API to generate responses. The
+    maximum size of each file so the OpenAI Batch API can process it is 100MB, so we
+    need to split the inputs into multiple files if necessary.
+
+    More information: https://platform.openai.com/docs/api-reference/files/create
+
+    Args:
+        inputs: a list of inputs in chat format to generate responses for, optionally
+            including structured output.
+        kwargs: the keyword arguments to use for the generation.
+
+    Returns:
+        The list of file objects created for the OpenAI Batch API.
+
+    Raises:
+        openai.OpenAIError: if there was an error while creating the batch input file
+            in the OpenAI Batch API.
+    """
+    import openai
+
+    files = []
+    for file_no, buffer in enumerate(
+        self._create_jsonl_buffers(inputs=inputs, **kwargs)
+    ):
+        try:
+            # TODO: add distilabel pipeline name and id
+            batch_input_file = self._client.files.create(
+                file=(self._name_for_openai_files(file_no), buffer),
+                purpose="batch",
+            )
+            files.append(batch_input_file)
+        except openai.OpenAIError as e:
+            self._logger.error(  # type: ignore
+                f"Error while creating OpenAI batch input file: {e}"
+            )
+            raise e
+    return files
 
@@ -24730,7 +24895,8 @@
Source code in src/distilabel/models/llms/openai.py -
623
+              
622
+623
 624
 625
 626
@@ -24761,40 +24927,39 @@ 
651 652 653 -654 -655
def _create_jsonl_buffers(
-    self, inputs: List["FormattedInput"], **kwargs: Any
-) -> Generator[io.BytesIO, None, None]:
-    """Creates a generator of buffers containing the JSONL formatted inputs to be
-    used by the OpenAI Batch API. The buffers created are of size 100MB or less.
-
-    Args:
-        inputs: a list of inputs in chat format to generate responses for, optionally
-            including structured output.
-        kwargs: the keyword arguments to use for the generation.
-
-    Yields:
-        A buffer containing the JSONL formatted inputs to be used by the OpenAI Batch
-        API.
-    """
-    buffer = io.BytesIO()
-    buffer_current_size = 0
-    for i, input in enumerate(inputs):
-        # We create the smallest `custom_id` so we don't  increase the size of the file
-        # to much, but we can still sort the results with the order of the inputs.
-        row = self._create_jsonl_row(input=input, custom_id=str(i), **kwargs)
-        row_size = len(row)
-        if row_size + buffer_current_size > _OPENAI_BATCH_API_MAX_FILE_SIZE:
-            buffer.seek(0)
-            yield buffer
-            buffer = io.BytesIO()
-            buffer_current_size = 0
-        buffer.write(row)
-        buffer_current_size += row_size
-
-    if buffer_current_size > 0:
-        buffer.seek(0)
-        yield buffer
+654
def _create_jsonl_buffers(
+    self, inputs: List["FormattedInput"], **kwargs: Any
+) -> Generator[io.BytesIO, None, None]:
+    """Creates a generator of buffers containing the JSONL formatted inputs to be
+    used by the OpenAI Batch API. The buffers created are of size 100MB or less.
+
+    Args:
+        inputs: a list of inputs in chat format to generate responses for, optionally
+            including structured output.
+        kwargs: the keyword arguments to use for the generation.
+
+    Yields:
+        A buffer containing the JSONL formatted inputs to be used by the OpenAI Batch
+        API.
+    """
+    buffer = io.BytesIO()
+    buffer_current_size = 0
+    for i, input in enumerate(inputs):
+        # We create the smallest `custom_id` so we don't  increase the size of the file
+        # to much, but we can still sort the results with the order of the inputs.
+        row = self._create_jsonl_row(input=input, custom_id=str(i), **kwargs)
+        row_size = len(row)
+        if row_size + buffer_current_size > _OPENAI_BATCH_API_MAX_FILE_SIZE:
+            buffer.seek(0)
+            yield buffer
+            buffer = io.BytesIO()
+            buffer_current_size = 0
+        buffer.write(row)
+        buffer_current_size += row_size
+
+    if buffer_current_size > 0:
+        buffer.seek(0)
+        yield buffer
 
@@ -24903,7 +25068,8 @@
Source code in src/distilabel/models/llms/openai.py -
657
+              
656
+657
 658
 659
 660
@@ -24924,30 +25090,29 @@ 
675 676 677 -678 -679
def _create_jsonl_row(
-    self, input: "FormattedInput", custom_id: str, **kwargs: Any
-) -> bytes:
-    """Creates a JSONL formatted row to be used by the OpenAI Batch API.
-
-    Args:
-        input: a list of inputs in chat format to generate responses for, optionally
-            including structured output.
-        custom_id: a custom ID to use for the row.
-        kwargs: the keyword arguments to use for the generation.
-
-    Returns:
-        A JSONL formatted row to be used by the OpenAI Batch API.
-    """
-    # TODO: depending on the format of the input, add `response_format` to the kwargs
-    row = {
-        "custom_id": custom_id,
-        "method": "POST",
-        "url": "/v1/chat/completions",
-        "body": {"messages": input, **kwargs},
-    }
-    json_row = orjson.dumps(row)
-    return json_row + b"\n"
+678
def _create_jsonl_row(
+    self, input: "FormattedInput", custom_id: str, **kwargs: Any
+) -> bytes:
+    """Creates a JSONL formatted row to be used by the OpenAI Batch API.
+
+    Args:
+        input: a list of inputs in chat format to generate responses for, optionally
+            including structured output.
+        custom_id: a custom ID to use for the row.
+        kwargs: the keyword arguments to use for the generation.
+
+    Returns:
+        A JSONL formatted row to be used by the OpenAI Batch API.
+    """
+    # TODO: depending on the format of the input, add `response_format` to the kwargs
+    row = {
+        "custom_id": custom_id,
+        "method": "POST",
+        "url": "/v1/chat/completions",
+        "body": {"messages": input, **kwargs},
+    }
+    json_row = orjson.dumps(row)
+    return json_row + b"\n"
 
diff --git a/pr-1053/search/search_index.json b/pr-1053/search/search_index.json index b52492ed3e..88bad038d3 100644 --- a/pr-1053/search/search_index.json +++ b/pr-1053/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Distilabel","text":"Synthesize data for AI and add feedback on the fly!

Distilabel is the framework for synthetic data and AI feedback for engineers who need fast, reliable and scalable pipelines based on verified research papers.

  • Get started in 5 minutes!

    Install distilabel with pip and run your first Pipeline to generate and evaluate synthetic data.

    Quickstart

  • How-to guides

    Get familiar with the basics of distilabel. Learn how to define steps, tasks and llms and run your Pipeline.

    Learn more

"},{"location":"#why-use-distilabel","title":"Why use distilabel?","text":"

Distilabel can be used for generating synthetic data and AI feedback for a wide variety of projects including traditional predictive NLP (classification, extraction, etc.), or generative and large language model scenarios (instruction following, dialogue generation, judging etc.). Distilabel's programmatic approach allows you to build scalable pipelines for data generation and AI feedback. The goal of distilabel is to accelerate your AI development by quickly generating high-quality, diverse datasets based on verified research methodologies for generating and judging with AI feedback.

Improve your AI output quality through data quality

Compute is expensive and output quality is important. We help you focus on data quality, which tackles the root cause of both of these problems at once. Distilabel helps you to synthesize and judge data to let you spend your valuable time achieving and keeping high-quality standards for your synthetic data.

Take control of your data and models

Ownership of data for fine-tuning your own LLMs is not easy but distilabel can help you to get started. We integrate AI feedback from any LLM provider out there using one unified API.

Improve efficiency by quickly iterating on the right data and models

Synthesize and judge data with latest research papers while ensuring flexibility, scalability and fault tolerance. So you can focus on improving your data and training your models.

"},{"location":"#what-do-people-build-with-distilabel","title":"What do people build with distilabel?","text":"

The Argilla community uses distilabel to create amazing datasets and models.

  • The 1M OpenHermesPreference is a dataset of ~1 million AI preferences derived from teknium/OpenHermes-2.5. It shows how we can use Distilabel to synthesize data on an immense scale.
  • Our distilabeled Intel Orca DPO dataset and the improved OpenHermes model, show how we improve model performance by filtering out 50% of the original dataset through AI feedback.
  • The haiku DPO data outlines how anyone can create a dataset for a specific task and the latest research papers to improve the quality of the dataset.
"},{"location":"api/cli/","title":"Command Line Interface (CLI)","text":"

This section contains the API reference for the CLI. For more information on how to use the CLI, see Tutorial - CLI.

"},{"location":"api/cli/#utility-functions-for-the-distilabel-pipeline-sub-commands","title":"Utility functions for the distilabel pipeline sub-commands","text":"

Here are some utility functions to help working with the pipelines in the console.

"},{"location":"api/cli/#distilabel.cli.pipeline.utils","title":"utils","text":""},{"location":"api/cli/#distilabel.cli.pipeline.utils.parse_runtime_parameters","title":"parse_runtime_parameters(params)","text":"

Parses the runtime parameters from the CLI format to the format expected by the Pipeline.run method. The CLI format is a list of tuples, where the first element is a list of keys and the second element is the value.

Parameters:

Name Type Description Default params List[Tuple[List[str], str]]

A list of tuples, where the first element is a list of keys and the second element is the value.

required

Returns:

Type Description Dict[str, Dict[str, Any]]

A dictionary with the runtime parameters in the format expected by the

Dict[str, Dict[str, Any]]

Pipeline.run method.

Source code in src/distilabel/cli/pipeline/utils.py
def parse_runtime_parameters(\n    params: List[Tuple[List[str], str]],\n) -> Dict[str, Dict[str, Any]]:\n    \"\"\"Parses the runtime parameters from the CLI format to the format expected by the\n    `Pipeline.run` method. The CLI format is a list of tuples, where the first element is\n    a list of keys and the second element is the value.\n\n    Args:\n        params: A list of tuples, where the first element is a list of keys and the\n            second element is the value.\n\n    Returns:\n        A dictionary with the runtime parameters in the format expected by the\n        `Pipeline.run` method.\n    \"\"\"\n    runtime_params = {}\n    for keys, value in params:\n        current = runtime_params\n        for i, key in enumerate(keys):\n            if i == len(keys) - 1:\n                current[key] = value\n            else:\n                current = current.setdefault(key, {})\n    return runtime_params\n
"},{"location":"api/cli/#distilabel.cli.pipeline.utils.valid_http_url","title":"valid_http_url(url)","text":"

Check if the URL is a valid HTTP URL.

Parameters:

Name Type Description Default url str

The URL to check.

required

Returns:

Type Description bool

True, if the URL is a valid HTTP URL. False, otherwise.

Source code in src/distilabel/cli/pipeline/utils.py
def valid_http_url(url: str) -> bool:\n    \"\"\"Check if the URL is a valid HTTP URL.\n\n    Args:\n        url: The URL to check.\n\n    Returns:\n        `True`, if the URL is a valid HTTP URL. `False`, otherwise.\n    \"\"\"\n    try:\n        TypeAdapter(HttpUrl).validate_python(url)  # type: ignore\n    except ValidationError:\n        return False\n\n    return True\n
"},{"location":"api/cli/#distilabel.cli.pipeline.utils.get_config_from_url","title":"get_config_from_url(url)","text":"

Loads the pipeline configuration from a URL pointing to a JSON or YAML file.

Parameters:

Name Type Description Default url str

The URL pointing to the pipeline configuration file.

required

Returns:

Type Description Dict[str, Any]

The pipeline configuration as a dictionary.

Raises:

Type Description ValueError

If the file format is not supported.

Source code in src/distilabel/cli/pipeline/utils.py
def get_config_from_url(url: str) -> Dict[str, Any]:\n    \"\"\"Loads the pipeline configuration from a URL pointing to a JSON or YAML file.\n\n    Args:\n        url: The URL pointing to the pipeline configuration file.\n\n    Returns:\n        The pipeline configuration as a dictionary.\n\n    Raises:\n        ValueError: If the file format is not supported.\n    \"\"\"\n    if not url.endswith((\".json\", \".yaml\", \".yml\")):\n        raise DistilabelUserError(\n            f\"Unsupported file format for '{url}'. Only JSON and YAML are supported\",\n            page=\"sections/how_to_guides/basic/pipeline/?h=seriali#serializing-the-pipeline\",\n        )\n    response = _download_remote_file(url)\n\n    if url.endswith((\".yaml\", \".yml\")):\n        content = response.content.decode(\"utf-8\")\n        return yaml.safe_load(content)\n\n    return response.json()\n
"},{"location":"api/cli/#distilabel.cli.pipeline.utils.get_pipeline_from_url","title":"get_pipeline_from_url(url, pipeline_name='pipeline')","text":"

Downloads the file to the current working directory and loads the pipeline object from a python script.

Parameters:

Name Type Description Default url str

The URL pointing to the python script with the pipeline definition.

required pipeline_name str

The name of the pipeline in the script. I.e: with Pipeline(...) as pipeline:....

'pipeline'

Returns:

Type Description BasePipeline

The pipeline instantiated.

Raises:

Type Description ValueError

If the file format is not supported.

Source code in src/distilabel/cli/pipeline/utils.py
def get_pipeline_from_url(url: str, pipeline_name: str = \"pipeline\") -> \"BasePipeline\":\n    \"\"\"Downloads the file to the current working directory and loads the pipeline object\n    from a python script.\n\n    Args:\n        url: The URL pointing to the python script with the pipeline definition.\n        pipeline_name: The name of the pipeline in the script.\n            I.e: `with Pipeline(...) as pipeline:...`.\n\n    Returns:\n        The pipeline instantiated.\n\n    Raises:\n        ValueError: If the file format is not supported.\n    \"\"\"\n    if not url.endswith(\".py\"):\n        raise DistilabelUserError(\n            f\"Unsupported file format for '{url}'. It must be a python file.\",\n            page=\"sections/how_to_guides/advanced/cli/#distilabel-pipeline-run\",\n        )\n    response = _download_remote_file(url)\n\n    content = response.content.decode(\"utf-8\")\n    script_local = Path.cwd() / Path(url).name\n    script_local.write_text(content)\n\n    # Add the current working directory to sys.path\n    sys.path.insert(0, os.getcwd())\n    module = importlib.import_module(str(Path(url).stem))\n    pipeline = getattr(module, pipeline_name, None)\n    if not pipeline:\n        raise ImportError(\n            f\"The script must contain an object with the pipeline named: '{pipeline_name}' that can be imported\"\n        )\n\n    return pipeline\n
"},{"location":"api/cli/#distilabel.cli.pipeline.utils.get_pipeline","title":"get_pipeline(config_or_script, pipeline_name='pipeline')","text":"

Get a pipeline from a configuration file or a remote python script.

Parameters:

Name Type Description Default config_or_script str

The path or URL to the pipeline configuration file or URL to a python script.

required pipeline_name str

The name of the pipeline in the script. I.e: with Pipeline(...) as pipeline:....

'pipeline'

Returns:

Type Description BasePipeline

The pipeline.

Raises:

Type Description ValueError

If the file format is not supported.

FileNotFoundError

If the configuration file does not exist.

Source code in src/distilabel/cli/pipeline/utils.py
def get_pipeline(\n    config_or_script: str, pipeline_name: str = \"pipeline\"\n) -> \"BasePipeline\":\n    \"\"\"Get a pipeline from a configuration file or a remote python script.\n\n    Args:\n        config_or_script: The path or URL to the pipeline configuration file\n            or URL to a python script.\n        pipeline_name: The name of the pipeline in the script.\n            I.e: `with Pipeline(...) as pipeline:...`.\n\n    Returns:\n        The pipeline.\n\n    Raises:\n        ValueError: If the file format is not supported.\n        FileNotFoundError: If the configuration file does not exist.\n    \"\"\"\n    config = script = None\n    if config_or_script.endswith((\".json\", \".yaml\", \".yml\")):\n        config = config_or_script\n    elif config_or_script.endswith(\".py\"):\n        script = config_or_script\n    else:\n        raise DistilabelUserError(\n            \"The file must be a valid config file or python script with a pipeline.\",\n            page=\"sections/how_to_guides/advanced/cli/#distilabel-pipeline-run\",\n        )\n\n    if valid_http_url(config_or_script):\n        if config:\n            data = get_config_from_url(config)\n            return Pipeline.from_dict(data)\n        return get_pipeline_from_url(script, pipeline_name=pipeline_name)\n\n    if not config:\n        raise ValueError(\n            f\"To run a pipeline from a python script, run it as `python {script}`\"\n        )\n\n    if Path(config).is_file():\n        return Pipeline.from_file(config)\n\n    raise FileNotFoundError(f\"File '{config_or_script}' does not exist.\")\n
"},{"location":"api/cli/#distilabel.cli.pipeline.utils.display_pipeline_information","title":"display_pipeline_information(pipeline)","text":"

Displays the pipeline information to the console.

Parameters:

Name Type Description Default pipeline BasePipeline

The pipeline.

required Source code in src/distilabel/cli/pipeline/utils.py
def display_pipeline_information(pipeline: \"BasePipeline\") -> None:\n    \"\"\"Displays the pipeline information to the console.\n\n    Args:\n        pipeline: The pipeline.\n    \"\"\"\n    from rich.console import Console\n\n    Console().print(_build_pipeline_panel(pipeline))\n
"},{"location":"api/distiset/","title":"Distiset","text":"

This section contains the API reference for the Distiset. For more information on how to use the CLI, see Tutorial - CLI.

"},{"location":"api/distiset/#distilabel.distiset.Distiset","title":"Distiset","text":"

Bases: dict

Convenient wrapper around datasets.Dataset to push to the Hugging Face Hub.

It's a dictionary where the keys correspond to the different leaf_steps from the internal DAG and the values are datasets.Dataset.

Attributes:

Name Type Description _pipeline_path Optional[Path]

Optional path to the pipeline.yaml file that generated the dataset. Defaults to None.

_artifacts_path Optional[Path]

Optional path to the directory containing the generated artifacts by the pipeline steps. Defaults to None.

_log_filename_path Optional[Path]

Optional path to the pipeline.log file that generated was written by the pipeline. Defaults to None.

_citations Optional[List[str]]

Optional list containing citations that will be included in the dataset card. Defaults to None.

Source code in src/distilabel/distiset.py
class Distiset(dict):\n    \"\"\"Convenient wrapper around `datasets.Dataset` to push to the Hugging Face Hub.\n\n    It's a dictionary where the keys correspond to the different leaf_steps from the internal\n    `DAG` and the values are `datasets.Dataset`.\n\n    Attributes:\n        _pipeline_path: Optional path to the `pipeline.yaml` file that generated the dataset.\n            Defaults to `None`.\n        _artifacts_path: Optional path to the directory containing the generated artifacts\n            by the pipeline steps. Defaults to `None`.\n        _log_filename_path: Optional path to the `pipeline.log` file that generated was written\n            by the pipeline. Defaults to `None`.\n        _citations: Optional list containing citations that will be included in the dataset\n            card. Defaults to `None`.\n    \"\"\"\n\n    _pipeline_path: Optional[Path] = None\n    _artifacts_path: Optional[Path] = None\n    _log_filename_path: Optional[Path] = None\n    _citations: Optional[List[str]] = None\n\n    def push_to_hub(\n        self,\n        repo_id: str,\n        private: bool = False,\n        token: Optional[str] = None,\n        generate_card: bool = True,\n        include_script: bool = False,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Pushes the `Distiset` to the Hugging Face Hub, each dataset will be pushed as a different configuration\n        corresponding to the leaf step that generated it.\n\n        Args:\n            repo_id:\n                The ID of the repository to push to in the following format: `<user>/<dataset_name>` or\n                `<org>/<dataset_name>`. Also accepts `<dataset_name>`, which will default to the namespace\n                of the logged-in user.\n            private:\n                Whether the dataset repository should be set to private or not. 
Only affects repository creation:\n                a repository that already exists will not be affected by that parameter.\n            token:\n                An optional authentication token for the Hugging Face Hub. If no token is passed, will default\n                to the token saved locally when logging in with `huggingface-cli login`. Will raise an error\n                if no token is passed and the user is not logged-in.\n            generate_card:\n                Whether to generate a dataset card or not. Defaults to True.\n            include_script:\n                Whether you want to push the pipeline script to the hugging face hub to share it.\n                If set to True, the name of the script that was run to create the distiset will be\n                automatically determined, and that will be the name of the file uploaded to your\n                repository. Take into account, this operation only makes sense for a distiset obtained\n                from calling `Pipeline.run()` method. 
Defaults to False.\n            **kwargs:\n                Additional keyword arguments to pass to the `push_to_hub` method of the `datasets.Dataset` object.\n\n        Raises:\n            ValueError: If no token is provided and couldn't be retrieved automatically.\n        \"\"\"\n        script_filename = sys.argv[0]\n        filename_py = (\n            script_filename.split(\"/\")[-1]\n            if \"/\" in script_filename\n            else script_filename\n        )\n        script_path = Path.cwd() / script_filename\n\n        if token is None:\n            token = get_hf_token(self.__class__.__name__, \"token\")\n\n        for name, dataset in self.items():\n            dataset.push_to_hub(\n                repo_id=repo_id,\n                config_name=name,\n                private=private,\n                token=token,\n                **kwargs,\n            )\n\n        if self.artifacts_path:\n            upload_folder(\n                repo_id=repo_id,\n                folder_path=self.artifacts_path,\n                path_in_repo=\"artifacts\",\n                token=token,\n                repo_type=\"dataset\",\n                commit_message=\"Include pipeline artifacts\",\n            )\n\n        if include_script and script_path.exists():\n            upload_file(\n                path_or_fileobj=script_path,\n                path_in_repo=filename_py,\n                repo_id=repo_id,\n                repo_type=\"dataset\",\n                token=token,\n                commit_message=\"Include pipeline script\",\n            )\n\n        if generate_card:\n            self._generate_card(\n                repo_id, token, include_script=include_script, filename_py=filename_py\n            )\n\n    def _get_card(\n        self,\n        repo_id: str,\n        token: Optional[str] = None,\n        include_script: bool = False,\n        filename_py: Optional[str] = None,\n    ) -> DistilabelDatasetCard:\n        \"\"\"Generates the dataset card 
for the `Distiset`.\n\n        Note:\n            If `repo_id` and `token` are provided, it will extract the metadata from the README.md file\n            on the hub.\n\n        Args:\n            repo_id: Name of the repository to push to, or the path for the distiset if saved to disk.\n            token: The token to authenticate with the Hugging Face Hub.\n                We assume that if it's provided, the dataset will be in the Hugging Face Hub,\n                so the README metadata will be extracted from there.\n            include_script: Whether to upload the script to the hugging face repository.\n            filename_py: The name of the script. If `include_script` is True, the script will\n                be uploaded to the repository using this name, otherwise it won't be used.\n\n        Returns:\n            The dataset card for the `Distiset`.\n        \"\"\"\n        sample_records = {}\n        for name, dataset in self.items():\n            sample_records[name] = (\n                dataset[0] if not isinstance(dataset, dict) else dataset[\"train\"][0]\n            )\n\n        readme_metadata = {}\n        if repo_id and token:\n            readme_metadata = self._extract_readme_metadata(repo_id, token)\n\n        metadata = {\n            **readme_metadata,\n            \"size_categories\": size_categories_parser(\n                max(len(dataset) for dataset in self.values())\n            ),\n            \"tags\": [\"synthetic\", \"distilabel\", \"rlaif\"],\n        }\n\n        card = DistilabelDatasetCard.from_template(\n            card_data=DatasetCardData(**metadata),\n            repo_id=repo_id,\n            sample_records=sample_records,\n            include_script=include_script,\n            filename_py=filename_py,\n            artifacts=self._get_artifacts_metadata(),\n            references=self.citations,\n        )\n\n        return card\n\n    def _get_artifacts_metadata(self) -> Dict[str, List[Dict[str, Any]]]:\n        
\"\"\"Gets a dictionary with the metadata of the artifacts generated by the pipeline steps.\n\n        Returns:\n            A dictionary in which the key is the name of the step and the value is a list\n            of dictionaries, each of them containing the name and metadata of the step artifact.\n        \"\"\"\n        if not self.artifacts_path:\n            return {}\n\n        def iterdir_ignore_hidden(path: Path) -> Generator[Path, None, None]:\n            return (f for f in Path(path).iterdir() if not f.name.startswith(\".\"))\n\n        artifacts_metadata = defaultdict(list)\n        for step_artifacts_dir in iterdir_ignore_hidden(self.artifacts_path):\n            step_name = step_artifacts_dir.stem\n            for artifact_dir in iterdir_ignore_hidden(step_artifacts_dir):\n                artifact_name = artifact_dir.stem\n                metadata_path = artifact_dir / \"metadata.json\"\n                metadata = json.loads(metadata_path.read_text())\n                artifacts_metadata[step_name].append(\n                    {\"name\": artifact_name, \"metadata\": metadata}\n                )\n\n        return dict(artifacts_metadata)\n\n    def _extract_readme_metadata(\n        self, repo_id: str, token: Optional[str]\n    ) -> Dict[str, Any]:\n        \"\"\"Extracts the metadata from the README.md file of the dataset repository.\n\n        We have to download the previous README.md file in the repo, extract the metadata from it,\n        and generate a dict again to be passed thorough the `DatasetCardData` object.\n\n        Args:\n            repo_id: The ID of the repository to push to, from the `push_to_hub` method.\n            token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.\n\n        Returns:\n            The metadata extracted from the README.md file of the dataset repository as a dict.\n        \"\"\"\n        readme_path = Path(\n            hf_hub_download(repo_id, \"README.md\", 
repo_type=\"dataset\", token=token)\n        )\n        # Remove the '---' from the metadata\n        metadata = re.findall(r\"---\\n(.*?)\\n---\", readme_path.read_text(), re.DOTALL)[0]\n        metadata = yaml.safe_load(metadata)\n        return metadata\n\n    def _generate_card(\n        self,\n        repo_id: str,\n        token: str,\n        include_script: bool = False,\n        filename_py: Optional[str] = None,\n    ) -> None:\n        \"\"\"Generates a dataset card and pushes it to the Hugging Face Hub, and\n        if the `pipeline.yaml` path is available in the `Distiset`, uploads that\n        to the same repository.\n\n        Args:\n            repo_id: The ID of the repository to push to, from the `push_to_hub` method.\n            token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.\n            include_script: Whether to upload the script to the hugging face repository.\n            filename_py: The name of the script. If `include_script` is True, the script will\n                be uploaded to the repository using this name, otherwise it won't be used.\n        \"\"\"\n        card = self._get_card(\n            repo_id=repo_id,\n            token=token,\n            include_script=include_script,\n            filename_py=filename_py,\n        )\n\n        card.push_to_hub(\n            repo_id,\n            repo_type=\"dataset\",\n            token=token,\n        )\n\n        if self.pipeline_path:\n            # If the pipeline.yaml is available, upload it to the Hugging Face Hub as well.\n            HfApi().upload_file(\n                path_or_fileobj=self.pipeline_path,\n                path_in_repo=PIPELINE_CONFIG_FILENAME,\n                repo_id=repo_id,\n                repo_type=\"dataset\",\n                token=token,\n            )\n\n        if self.log_filename_path:\n            # The same we had with \"pipeline.yaml\" but with the log file.\n            HfApi().upload_file(\n            
    path_or_fileobj=self.log_filename_path,\n                path_in_repo=PIPELINE_LOG_FILENAME,\n                repo_id=repo_id,\n                repo_type=\"dataset\",\n                token=token,\n            )\n\n    def train_test_split(\n        self,\n        train_size: float,\n        shuffle: bool = True,\n        seed: Optional[int] = None,\n    ) -> Self:\n        \"\"\"Return a `Distiset` whose values will be a `datasets.DatasetDict` with two random train and test subsets.\n        Splits are created from the dataset according to `train_size` and `shuffle`.\n\n        Args:\n            train_size:\n                Float between `0.0` and `1.0` representing the proportion of the dataset to include in the test split.\n                It will be applied to all the datasets in the `Distiset`.\n            shuffle: Whether or not to shuffle the data before splitting\n            seed:\n                A seed to initialize the default BitGenerator, passed to the underlying method.\n\n        Returns:\n            The `Distiset` with the train-test split applied to all the datasets.\n        \"\"\"\n        assert 0 < train_size < 1, \"train_size must be a float between 0 and 1\"\n        for name, dataset in self.items():\n            self[name] = dataset.train_test_split(\n                train_size=train_size,\n                shuffle=shuffle,\n                seed=seed,\n            )\n        return self\n\n    def save_to_disk(\n        self,\n        distiset_path: PathLike,\n        max_shard_size: Optional[Union[str, int]] = None,\n        num_shards: Optional[int] = None,\n        num_proc: Optional[int] = None,\n        storage_options: Optional[dict] = None,\n        save_card: bool = True,\n        save_pipeline_config: bool = True,\n        save_pipeline_log: bool = True,\n    ) -> None:\n        r\"\"\"\n        Saves a `Distiset` to a dataset directory, or in a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n     
   In case you want to save the `Distiset` in a remote filesystem, you can pass the `storage_options` parameter\n        as you would do with `datasets`'s `Dataset.save_to_disk` method: [see example](https://huggingface.co/docs/datasets/filesystems#saving-serialized-datasets)\n\n        Args:\n            distiset_path: Path where you want to save the `Distiset`. It can be a local path\n                (e.g. `dataset/train`) or remote URI (e.g. `s3://my-bucket/dataset/train`)\n            max_shard_size: The maximum size of the dataset shards to be uploaded to the hub.\n                If expressed as a string, needs to be digits followed by a unit (like `\"50MB\"`).\n                Defaults to `None`.\n            num_shards: Number of shards to write. By default the number of shards depends on\n                `max_shard_size` and `num_proc`. Defaults to `None`.\n            num_proc: Number of processes when downloading and generating the dataset locally.\n                Multiprocessing is disabled by default. Defaults to `None`.\n            storage_options: Key/value pairs to be passed on to the file-system backend, if any.\n                Defaults to `None`.\n            save_card: Whether to save the dataset card. 
Defaults to `True`.\n            save_pipeline_config: Whether to save the pipeline configuration file (aka the `pipeline.yaml` file).\n                Defaults to `True`.\n            save_pipeline_log: Whether to save the pipeline log file (aka the `pipeline.log` file).\n                Defaults to `True`.\n\n        Examples:\n            ```python\n            # Save your distiset in a local folder:\n            distiset.save_to_disk(distiset_path=\"my-distiset\")\n            # Save your distiset in a remote storage:\n            storage_options = {\n                \"key\": os.environ[\"S3_ACCESS_KEY\"],\n                \"secret\": os.environ[\"S3_SECRET_KEY\"],\n                \"client_kwargs\": {\n                    \"endpoint_url\": os.environ[\"S3_ENDPOINT_URL\"],\n                    \"region_name\": os.environ[\"S3_REGION\"],\n                },\n            }\n            distiset.save_to_disk(distiset_path=\"my-distiset\", storage_options=storage_options)\n            ```\n        \"\"\"\n        distiset_path = str(distiset_path)\n        for name, dataset in self.items():\n            dataset.save_to_disk(\n                f\"{distiset_path}/{name}\",\n                max_shard_size=max_shard_size,\n                num_shards=num_shards,\n                num_proc=num_proc,\n                storage_options=storage_options,\n            )\n\n        distiset_config_folder = posixpath.join(distiset_path, DISTISET_CONFIG_FOLDER)\n\n        fs: fsspec.AbstractFileSystem\n        fs, _, _ = fsspec.get_fs_token_paths(\n            distiset_config_folder, storage_options=storage_options\n        )\n        fs.makedirs(distiset_config_folder, exist_ok=True)\n\n        if self.artifacts_path:\n            distiset_artifacts_folder = posixpath.join(\n                distiset_path, DISTISET_ARTIFACTS_FOLDER\n            )\n            fs.copy(str(self.artifacts_path), distiset_artifacts_folder, recursive=True)\n\n        if save_card:\n            # 
NOTE:\u00a0Currently the card is not the same if we write to disk or push to the HF hub,\n            # as we aren't generating the README copying/updating the data from the dataset repo.\n            card = self._get_card(repo_id=Path(distiset_path).stem, token=None)\n            new_filename = posixpath.join(distiset_config_folder, \"README.md\")\n            if storage_options:\n                # Write the card the same way as DatasetCard.save does:\n                with fs.open(new_filename, \"w\", newline=\"\", encoding=\"utf-8\") as f:\n                    f.write(str(card))\n            else:\n                card.save(new_filename)\n\n        # Write our internal files to the distiset folder by copying them to the distiset folder.\n        if save_pipeline_config and self.pipeline_path:\n            new_filename = posixpath.join(\n                distiset_config_folder, PIPELINE_CONFIG_FILENAME\n            )\n            if self.pipeline_path.exists() and (not fs.isfile(new_filename)):\n                data = yaml.safe_load(self.pipeline_path.read_text())\n                with fs.open(new_filename, \"w\", encoding=\"utf-8\") as f:\n                    yaml.dump(data, f, default_flow_style=False)\n\n        if save_pipeline_log and self.log_filename_path:\n            new_filename = posixpath.join(distiset_config_folder, PIPELINE_LOG_FILENAME)\n            if self.log_filename_path.exists() and (not fs.isfile(new_filename)):\n                data = self.log_filename_path.read_text()\n                with fs.open(new_filename, \"w\", encoding=\"utf-8\") as f:\n                    f.write(data)\n\n    @classmethod\n    def load_from_disk(\n        cls,\n        distiset_path: PathLike,\n        keep_in_memory: Optional[bool] = None,\n        storage_options: Optional[Dict[str, Any]] = None,\n        download_dir: Optional[PathLike] = None,\n    ) -> Self:\n        \"\"\"Loads a dataset that was previously saved using `Distiset.save_to_disk` from a dataset\n   
     directory, or from a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n        Args:\n            distiset_path: Path (\"dataset/train\") or remote URI (\"s3://bucket/dataset/train\").\n            keep_in_memory: Whether to copy the dataset in-memory, see `datasets.Dataset.load_from_disk``\n                for more information. Defaults to `None`.\n            storage_options: Key/value pairs to be passed on to the file-system backend, if any.\n                Defaults to `None`.\n            download_dir: Optional directory to download the dataset to. Defaults to None,\n                in which case it will create a temporary directory.\n\n        Returns:\n            A `Distiset` loaded from disk, it should be a `Distiset` object created using `Distiset.save_to_disk`.\n        \"\"\"\n        original_distiset_path = str(distiset_path)\n\n        fs: fsspec.AbstractFileSystem\n        fs, _, [distiset_path] = fsspec.get_fs_token_paths(  # type: ignore\n            original_distiset_path, storage_options=storage_options\n        )\n        dest_distiset_path = distiset_path\n\n        assert fs.isdir(\n            original_distiset_path\n        ), \"`distiset_path` must be a `PathLike` object pointing to a folder or a URI of a remote filesystem.\"\n\n        has_config = False\n        has_artifacts = False\n        distiset = cls()\n\n        if is_remote_filesystem(fs):\n            src_dataset_path = distiset_path\n            if download_dir:\n                dest_distiset_path = download_dir\n            else:\n                dest_distiset_path = Dataset._build_local_temp_path(src_dataset_path)  # type: ignore\n            fs.download(src_dataset_path, dest_distiset_path.as_posix(), recursive=True)  # type: ignore\n\n        # Now we should have the distiset locally, so we can read those files\n        for folder in Path(dest_distiset_path).iterdir():\n            if folder.stem == DISTISET_CONFIG_FOLDER:\n                
has_config = True\n                continue\n            elif folder.stem == DISTISET_ARTIFACTS_FOLDER:\n                has_artifacts = True\n                continue\n            distiset[folder.stem] = load_from_disk(\n                str(folder),\n                keep_in_memory=keep_in_memory,\n            )\n\n        # From the config folder we just need to point to the files. Once downloaded we set the path to point to point to the files. Once downloaded we set the path\n        # to wherever they are.\n        if has_config:\n            distiset_config_folder = posixpath.join(\n                dest_distiset_path, DISTISET_CONFIG_FOLDER\n            )\n\n            pipeline_path = posixpath.join(\n                distiset_config_folder, PIPELINE_CONFIG_FILENAME\n            )\n            if Path(pipeline_path).exists():\n                distiset.pipeline_path = Path(pipeline_path)\n\n            log_filename_path = posixpath.join(\n                distiset_config_folder, PIPELINE_LOG_FILENAME\n            )\n            if Path(log_filename_path).exists():\n                distiset.log_filename_path = Path(log_filename_path)\n\n        if has_artifacts:\n            distiset.artifacts_path = Path(\n                posixpath.join(dest_distiset_path, DISTISET_ARTIFACTS_FOLDER)\n            )\n\n        return distiset\n\n    @property\n    def pipeline_path(self) -> Union[Path, None]:\n        \"\"\"Returns the path to the `pipeline.yaml` file that generated the `Pipeline`.\"\"\"\n        return self._pipeline_path\n\n    @pipeline_path.setter\n    def pipeline_path(self, path: PathLike) -> None:\n        self._pipeline_path = Path(path)\n\n    @property\n    def artifacts_path(self) -> Union[Path, None]:\n        \"\"\"Returns the path to the directory containing the artifacts generated by the steps\n        of the pipeline.\"\"\"\n        return self._artifacts_path\n\n    @artifacts_path.setter\n    def artifacts_path(self, path: PathLike) -> None:\n     
   self._artifacts_path = Path(path)\n\n    @property\n    def log_filename_path(self) -> Union[Path, None]:\n        \"\"\"Returns the path to the `pipeline.log` file that generated the `Pipeline`.\"\"\"\n        return self._log_filename_path\n\n    @log_filename_path.setter\n    def log_filename_path(self, path: PathLike) -> None:\n        self._log_filename_path = Path(path)\n\n    @property\n    def citations(self) -> Union[List[str], None]:\n        \"\"\"Bibtex references to be included in the README.\"\"\"\n        return self._citations\n\n    @citations.setter\n    def citations(self, citations_: List[str]) -> None:\n        self._citations = sorted(set(citations_))\n\n    def __repr__(self):\n        # Copy from `datasets.DatasetDict.__repr__`.\n        repr = \"\\n\".join([f\"{k}: {v}\" for k, v in self.items()])\n        repr = re.sub(r\"^\", \" \" * 4, repr, count=0, flags=re.M)\n        return f\"Distiset({{\\n{repr}\\n}})\"\n
"},{"location":"api/distiset/#distilabel.distiset.Distiset.pipeline_path","title":"pipeline_path: Union[Path, None] property writable","text":"

Returns the path to the pipeline.yaml file that generated the Pipeline.

"},{"location":"api/distiset/#distilabel.distiset.Distiset.artifacts_path","title":"artifacts_path: Union[Path, None] property writable","text":"

Returns the path to the directory containing the artifacts generated by the steps of the pipeline.

"},{"location":"api/distiset/#distilabel.distiset.Distiset.log_filename_path","title":"log_filename_path: Union[Path, None] property writable","text":"

Returns the path to the pipeline.log file that was written by the pipeline.

"},{"location":"api/distiset/#distilabel.distiset.Distiset.citations","title":"citations: Union[List[str], None] property writable","text":"

Bibtex references to be included in the README.

"},{"location":"api/distiset/#distilabel.distiset.Distiset.push_to_hub","title":"push_to_hub(repo_id, private=False, token=None, generate_card=True, include_script=False, **kwargs)","text":"

Pushes the Distiset to the Hugging Face Hub, each dataset will be pushed as a different configuration corresponding to the leaf step that generated it.

Parameters:

Name Type Description Default repo_id str

The ID of the repository to push to in the following format: <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.

required private bool

Whether the dataset repository should be set to private or not. Only affects repository creation: a repository that already exists will not be affected by that parameter.

False token Optional[str]

An optional authentication token for the Hugging Face Hub. If no token is passed, will default to the token saved locally when logging in with huggingface-cli login. Will raise an error if no token is passed and the user is not logged-in.

None generate_card bool

Whether to generate a dataset card or not. Defaults to True.

True include_script bool

Whether you want to push the pipeline script to the hugging face hub to share it. If set to True, the name of the script that was run to create the distiset will be automatically determined, and that will be the name of the file uploaded to your repository. Take into account, this operation only makes sense for a distiset obtained from calling Pipeline.run() method. Defaults to False.

False **kwargs Any

Additional keyword arguments to pass to the push_to_hub method of the datasets.Dataset object.

{}

Raises:

Type Description ValueError

If no token is provided and couldn't be retrieved automatically.

Source code in src/distilabel/distiset.py
def push_to_hub(\n    self,\n    repo_id: str,\n    private: bool = False,\n    token: Optional[str] = None,\n    generate_card: bool = True,\n    include_script: bool = False,\n    **kwargs: Any,\n) -> None:\n    \"\"\"Pushes the `Distiset` to the Hugging Face Hub, each dataset will be pushed as a different configuration\n    corresponding to the leaf step that generated it.\n\n    Args:\n        repo_id:\n            The ID of the repository to push to in the following format: `<user>/<dataset_name>` or\n            `<org>/<dataset_name>`. Also accepts `<dataset_name>`, which will default to the namespace\n            of the logged-in user.\n        private:\n            Whether the dataset repository should be set to private or not. Only affects repository creation:\n            a repository that already exists will not be affected by that parameter.\n        token:\n            An optional authentication token for the Hugging Face Hub. If no token is passed, will default\n            to the token saved locally when logging in with `huggingface-cli login`. Will raise an error\n            if no token is passed and the user is not logged-in.\n        generate_card:\n            Whether to generate a dataset card or not. Defaults to True.\n        include_script:\n            Whether you want to push the pipeline script to the hugging face hub to share it.\n            If set to True, the name of the script that was run to create the distiset will be\n            automatically determined, and that will be the name of the file uploaded to your\n            repository. Take into account, this operation only makes sense for a distiset obtained\n            from calling `Pipeline.run()` method. 
Defaults to False.\n        **kwargs:\n            Additional keyword arguments to pass to the `push_to_hub` method of the `datasets.Dataset` object.\n\n    Raises:\n        ValueError: If no token is provided and couldn't be retrieved automatically.\n    \"\"\"\n    script_filename = sys.argv[0]\n    filename_py = (\n        script_filename.split(\"/\")[-1]\n        if \"/\" in script_filename\n        else script_filename\n    )\n    script_path = Path.cwd() / script_filename\n\n    if token is None:\n        token = get_hf_token(self.__class__.__name__, \"token\")\n\n    for name, dataset in self.items():\n        dataset.push_to_hub(\n            repo_id=repo_id,\n            config_name=name,\n            private=private,\n            token=token,\n            **kwargs,\n        )\n\n    if self.artifacts_path:\n        upload_folder(\n            repo_id=repo_id,\n            folder_path=self.artifacts_path,\n            path_in_repo=\"artifacts\",\n            token=token,\n            repo_type=\"dataset\",\n            commit_message=\"Include pipeline artifacts\",\n        )\n\n    if include_script and script_path.exists():\n        upload_file(\n            path_or_fileobj=script_path,\n            path_in_repo=filename_py,\n            repo_id=repo_id,\n            repo_type=\"dataset\",\n            token=token,\n            commit_message=\"Include pipeline script\",\n        )\n\n    if generate_card:\n        self._generate_card(\n            repo_id, token, include_script=include_script, filename_py=filename_py\n        )\n
"},{"location":"api/distiset/#distilabel.distiset.Distiset.train_test_split","title":"train_test_split(train_size, shuffle=True, seed=None)","text":"

Return a Distiset whose values will be a datasets.DatasetDict with two random train and test subsets. Splits are created from the dataset according to train_size and shuffle.

Parameters:

Name Type Description Default train_size float

Float between 0.0 and 1.0 representing the proportion of the dataset to include in the train split. It will be applied to all the datasets in the Distiset.

required shuffle bool

Whether or not to shuffle the data before splitting

True seed Optional[int]

A seed to initialize the default BitGenerator, passed to the underlying method.

None

Returns:

Type Description Self

The Distiset with the train-test split applied to all the datasets.

Source code in src/distilabel/distiset.py
def train_test_split(\n    self,\n    train_size: float,\n    shuffle: bool = True,\n    seed: Optional[int] = None,\n) -> Self:\n    \"\"\"Return a `Distiset` whose values will be a `datasets.DatasetDict` with two random train and test subsets.\n    Splits are created from the dataset according to `train_size` and `shuffle`.\n\n    Args:\n        train_size:\n            Float between `0.0` and `1.0` representing the proportion of the dataset to include in the test split.\n            It will be applied to all the datasets in the `Distiset`.\n        shuffle: Whether or not to shuffle the data before splitting\n        seed:\n            A seed to initialize the default BitGenerator, passed to the underlying method.\n\n    Returns:\n        The `Distiset` with the train-test split applied to all the datasets.\n    \"\"\"\n    assert 0 < train_size < 1, \"train_size must be a float between 0 and 1\"\n    for name, dataset in self.items():\n        self[name] = dataset.train_test_split(\n            train_size=train_size,\n            shuffle=shuffle,\n            seed=seed,\n        )\n    return self\n
"},{"location":"api/distiset/#distilabel.distiset.Distiset.save_to_disk","title":"save_to_disk(distiset_path, max_shard_size=None, num_shards=None, num_proc=None, storage_options=None, save_card=True, save_pipeline_config=True, save_pipeline_log=True)","text":"

Saves a Distiset to a dataset directory, or in a filesystem using any implementation of fsspec.spec.AbstractFileSystem.

In case you want to save the Distiset in a remote filesystem, you can pass the storage_options parameter as you would do with datasets's Dataset.save_to_disk method: see example

Parameters:

Name Type Description Default distiset_path PathLike

Path where you want to save the Distiset. It can be a local path (e.g. dataset/train) or remote URI (e.g. s3://my-bucket/dataset/train)

required max_shard_size Optional[Union[str, int]]

The maximum size of the dataset shards to be written to disk. If expressed as a string, needs to be digits followed by a unit (like \"50MB\"). Defaults to None.

None num_shards Optional[int]

Number of shards to write. By default the number of shards depends on max_shard_size and num_proc. Defaults to None.

None num_proc Optional[int]

Number of processes when downloading and generating the dataset locally. Multiprocessing is disabled by default. Defaults to None.

None storage_options Optional[dict]

Key/value pairs to be passed on to the file-system backend, if any. Defaults to None.

None save_card bool

Whether to save the dataset card. Defaults to True.

True save_pipeline_config bool

Whether to save the pipeline configuration file (aka the pipeline.yaml file). Defaults to True.

True save_pipeline_log bool

Whether to save the pipeline log file (aka the pipeline.log file). Defaults to True.

True

Examples:

# Save your distiset in a local folder:\ndistiset.save_to_disk(distiset_path=\"my-distiset\")\n# Save your distiset in a remote storage:\nstorage_options = {\n    \"key\": os.environ[\"S3_ACCESS_KEY\"],\n    \"secret\": os.environ[\"S3_SECRET_KEY\"],\n    \"client_kwargs\": {\n        \"endpoint_url\": os.environ[\"S3_ENDPOINT_URL\"],\n        \"region_name\": os.environ[\"S3_REGION\"],\n    },\n}\ndistiset.save_to_disk(distiset_path=\"my-distiset\", storage_options=storage_options)\n
Source code in src/distilabel/distiset.py
def save_to_disk(\n    self,\n    distiset_path: PathLike,\n    max_shard_size: Optional[Union[str, int]] = None,\n    num_shards: Optional[int] = None,\n    num_proc: Optional[int] = None,\n    storage_options: Optional[dict] = None,\n    save_card: bool = True,\n    save_pipeline_config: bool = True,\n    save_pipeline_log: bool = True,\n) -> None:\n    r\"\"\"\n    Saves a `Distiset` to a dataset directory, or in a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n    In case you want to save the `Distiset` in a remote filesystem, you can pass the `storage_options` parameter\n    as you would do with `datasets`'s `Dataset.save_to_disk` method: [see example](https://huggingface.co/docs/datasets/filesystems#saving-serialized-datasets)\n\n    Args:\n        distiset_path: Path where you want to save the `Distiset`. It can be a local path\n            (e.g. `dataset/train`) or remote URI (e.g. `s3://my-bucket/dataset/train`)\n        max_shard_size: The maximum size of the dataset shards to be uploaded to the hub.\n            If expressed as a string, needs to be digits followed by a unit (like `\"50MB\"`).\n            Defaults to `None`.\n        num_shards: Number of shards to write. By default the number of shards depends on\n            `max_shard_size` and `num_proc`. Defaults to `None`.\n        num_proc: Number of processes when downloading and generating the dataset locally.\n            Multiprocessing is disabled by default. Defaults to `None`.\n        storage_options: Key/value pairs to be passed on to the file-system backend, if any.\n            Defaults to `None`.\n        save_card: Whether to save the dataset card. 
Defaults to `True`.\n        save_pipeline_config: Whether to save the pipeline configuration file (aka the `pipeline.yaml` file).\n            Defaults to `True`.\n        save_pipeline_log: Whether to save the pipeline log file (aka the `pipeline.log` file).\n            Defaults to `True`.\n\n    Examples:\n        ```python\n        # Save your distiset in a local folder:\n        distiset.save_to_disk(distiset_path=\"my-distiset\")\n        # Save your distiset in a remote storage:\n        storage_options = {\n            \"key\": os.environ[\"S3_ACCESS_KEY\"],\n            \"secret\": os.environ[\"S3_SECRET_KEY\"],\n            \"client_kwargs\": {\n                \"endpoint_url\": os.environ[\"S3_ENDPOINT_URL\"],\n                \"region_name\": os.environ[\"S3_REGION\"],\n            },\n        }\n        distiset.save_to_disk(distiset_path=\"my-distiset\", storage_options=storage_options)\n        ```\n    \"\"\"\n    distiset_path = str(distiset_path)\n    for name, dataset in self.items():\n        dataset.save_to_disk(\n            f\"{distiset_path}/{name}\",\n            max_shard_size=max_shard_size,\n            num_shards=num_shards,\n            num_proc=num_proc,\n            storage_options=storage_options,\n        )\n\n    distiset_config_folder = posixpath.join(distiset_path, DISTISET_CONFIG_FOLDER)\n\n    fs: fsspec.AbstractFileSystem\n    fs, _, _ = fsspec.get_fs_token_paths(\n        distiset_config_folder, storage_options=storage_options\n    )\n    fs.makedirs(distiset_config_folder, exist_ok=True)\n\n    if self.artifacts_path:\n        distiset_artifacts_folder = posixpath.join(\n            distiset_path, DISTISET_ARTIFACTS_FOLDER\n        )\n        fs.copy(str(self.artifacts_path), distiset_artifacts_folder, recursive=True)\n\n    if save_card:\n        # NOTE:\u00a0Currently the card is not the same if we write to disk or push to the HF hub,\n        # as we aren't generating the README copying/updating the data from the 
dataset repo.\n        card = self._get_card(repo_id=Path(distiset_path).stem, token=None)\n        new_filename = posixpath.join(distiset_config_folder, \"README.md\")\n        if storage_options:\n            # Write the card the same way as DatasetCard.save does:\n            with fs.open(new_filename, \"w\", newline=\"\", encoding=\"utf-8\") as f:\n                f.write(str(card))\n        else:\n            card.save(new_filename)\n\n    # Write our internal files to the distiset folder by copying them to the distiset folder.\n    if save_pipeline_config and self.pipeline_path:\n        new_filename = posixpath.join(\n            distiset_config_folder, PIPELINE_CONFIG_FILENAME\n        )\n        if self.pipeline_path.exists() and (not fs.isfile(new_filename)):\n            data = yaml.safe_load(self.pipeline_path.read_text())\n            with fs.open(new_filename, \"w\", encoding=\"utf-8\") as f:\n                yaml.dump(data, f, default_flow_style=False)\n\n    if save_pipeline_log and self.log_filename_path:\n        new_filename = posixpath.join(distiset_config_folder, PIPELINE_LOG_FILENAME)\n        if self.log_filename_path.exists() and (not fs.isfile(new_filename)):\n            data = self.log_filename_path.read_text()\n            with fs.open(new_filename, \"w\", encoding=\"utf-8\") as f:\n                f.write(data)\n
"},{"location":"api/distiset/#distilabel.distiset.Distiset.load_from_disk","title":"load_from_disk(distiset_path, keep_in_memory=None, storage_options=None, download_dir=None) classmethod","text":"

Loads a dataset that was previously saved using Distiset.save_to_disk from a dataset directory, or from a filesystem using any implementation of fsspec.spec.AbstractFileSystem.

Parameters:

Name Type Description Default distiset_path PathLike

Path (\"dataset/train\") or remote URI (\"s3://bucket/dataset/train\").

required keep_in_memory Optional[bool]

Whether to copy the dataset in-memory, see datasets.Dataset.load_from_disk for more information. Defaults to None.

None storage_options Optional[Dict[str, Any]]

Key/value pairs to be passed on to the file-system backend, if any. Defaults to None.

None download_dir Optional[PathLike]

Optional directory to download the dataset to. Defaults to None, in which case it will create a temporary directory.

None

Returns:

Type Description Self

A Distiset loaded from disk, it should be a Distiset object created using Distiset.save_to_disk.

Source code in src/distilabel/distiset.py
@classmethod\ndef load_from_disk(\n    cls,\n    distiset_path: PathLike,\n    keep_in_memory: Optional[bool] = None,\n    storage_options: Optional[Dict[str, Any]] = None,\n    download_dir: Optional[PathLike] = None,\n) -> Self:\n    \"\"\"Loads a dataset that was previously saved using `Distiset.save_to_disk` from a dataset\n    directory, or from a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n    Args:\n        distiset_path: Path (\"dataset/train\") or remote URI (\"s3://bucket/dataset/train\").\n        keep_in_memory: Whether to copy the dataset in-memory, see `datasets.Dataset.load_from_disk``\n            for more information. Defaults to `None`.\n        storage_options: Key/value pairs to be passed on to the file-system backend, if any.\n            Defaults to `None`.\n        download_dir: Optional directory to download the dataset to. Defaults to None,\n            in which case it will create a temporary directory.\n\n    Returns:\n        A `Distiset` loaded from disk, it should be a `Distiset` object created using `Distiset.save_to_disk`.\n    \"\"\"\n    original_distiset_path = str(distiset_path)\n\n    fs: fsspec.AbstractFileSystem\n    fs, _, [distiset_path] = fsspec.get_fs_token_paths(  # type: ignore\n        original_distiset_path, storage_options=storage_options\n    )\n    dest_distiset_path = distiset_path\n\n    assert fs.isdir(\n        original_distiset_path\n    ), \"`distiset_path` must be a `PathLike` object pointing to a folder or a URI of a remote filesystem.\"\n\n    has_config = False\n    has_artifacts = False\n    distiset = cls()\n\n    if is_remote_filesystem(fs):\n        src_dataset_path = distiset_path\n        if download_dir:\n            dest_distiset_path = download_dir\n        else:\n            dest_distiset_path = Dataset._build_local_temp_path(src_dataset_path)  # type: ignore\n        fs.download(src_dataset_path, dest_distiset_path.as_posix(), recursive=True)  # type: ignore\n\n   
 # Now we should have the distiset locally, so we can read those files\n    for folder in Path(dest_distiset_path).iterdir():\n        if folder.stem == DISTISET_CONFIG_FOLDER:\n            has_config = True\n            continue\n        elif folder.stem == DISTISET_ARTIFACTS_FOLDER:\n            has_artifacts = True\n            continue\n        distiset[folder.stem] = load_from_disk(\n            str(folder),\n            keep_in_memory=keep_in_memory,\n        )\n\n    # From the config folder we just need to point to the files. Once downloaded we set the path to point to point to the files. Once downloaded we set the path\n    # to wherever they are.\n    if has_config:\n        distiset_config_folder = posixpath.join(\n            dest_distiset_path, DISTISET_CONFIG_FOLDER\n        )\n\n        pipeline_path = posixpath.join(\n            distiset_config_folder, PIPELINE_CONFIG_FILENAME\n        )\n        if Path(pipeline_path).exists():\n            distiset.pipeline_path = Path(pipeline_path)\n\n        log_filename_path = posixpath.join(\n            distiset_config_folder, PIPELINE_LOG_FILENAME\n        )\n        if Path(log_filename_path).exists():\n            distiset.log_filename_path = Path(log_filename_path)\n\n    if has_artifacts:\n        distiset.artifacts_path = Path(\n            posixpath.join(dest_distiset_path, DISTISET_ARTIFACTS_FOLDER)\n        )\n\n    return distiset\n
"},{"location":"api/distiset/#distilabel.distiset.create_distiset","title":"create_distiset(data_dir, pipeline_path=None, log_filename_path=None, enable_metadata=False, dag=None)","text":"

Creates a Distiset from the buffer folder.

This function is intended to be used as a helper to create a Distiset from the folder where the cached data was written by the _WriteBuffer.

Parameters:

Name Type Description Default data_dir Path

Folder where the data buffers were written by the _WriteBuffer. It should correspond to CacheLocation.data.

required pipeline_path Optional[Path]

Optional path to the pipeline.yaml file that generated the dataset. Internally this will be passed to the Distiset object on creation to allow uploading the pipeline.yaml file to the repo upon Distiset.push_to_hub.

None log_filename_path Optional[Path]

Optional path to the pipeline.log file that was generated during the pipeline run. Internally this will be passed to the Distiset object on creation to allow uploading the pipeline.log file to the repo upon Distiset.push_to_hub.

None enable_metadata bool

Whether to include the distilabel metadata column in the dataset or not. Defaults to False.

False dag Optional[DAG]

DAG contained in a Pipeline. If provided, it will be used to extract the references/citations from it.

None

Returns:

Type Description Distiset

The dataset created from the buffer folder, where the different leaf steps will correspond to different configurations of the dataset.

Examples:

from pathlib import Path\ndistiset = create_distiset(Path.home() / \".cache/distilabel/pipelines/path-to-pipe-hashname\")\n
Source code in src/distilabel/distiset.py
def create_distiset(  # noqa: C901\n    data_dir: Path,\n    pipeline_path: Optional[Path] = None,\n    log_filename_path: Optional[Path] = None,\n    enable_metadata: bool = False,\n    dag: Optional[\"DAG\"] = None,\n) -> Distiset:\n    \"\"\"Creates a `Distiset` from the buffer folder.\n\n    This function is intended to be used as a helper to create a `Distiset` from from the folder\n    where the cached data was written by the `_WriteBuffer`.\n\n    Args:\n        data_dir: Folder where the data buffers were written by the `_WriteBuffer`.\n            It should correspond to `CacheLocation.data`.\n        pipeline_path: Optional path to the pipeline.yaml file that generated the dataset.\n            Internally this will be passed to the `Distiset` object on creation to allow\n            uploading the `pipeline.yaml` file to the repo upon `Distiset.push_to_hub`.\n        log_filename_path: Optional path to the pipeline.log file that was generated during the pipeline run.\n            Internally this will be passed to the `Distiset` object on creation to allow\n            uploading the `pipeline.log` file to the repo upon `Distiset.push_to_hub`.\n        enable_metadata: Whether to include the distilabel metadata column in the dataset or not.\n            Defaults to `False`.\n        dag: DAG contained in a `Pipeline`. 
If informed, will be used to extract the references/\n            citations from it.\n\n    Returns:\n        The dataset created from the buffer folder, where the different leaf steps will\n        correspond to different configurations of the dataset.\n\n    Examples:\n        ```python\n        from pathlib import Path\n        distiset = create_distiset(Path.home() / \".cache/distilabel/pipelines/path-to-pipe-hashname\")\n        ```\n    \"\"\"\n    from distilabel.constants import DISTILABEL_METADATA_KEY\n\n    logger = logging.getLogger(\"distilabel.distiset\")\n\n    steps_outputs_dir = data_dir / STEPS_OUTPUTS_PATH\n\n    distiset = Distiset()\n    for file in steps_outputs_dir.iterdir():\n        if file.is_file():\n            continue\n\n        files = [str(file) for file in list_files_in_dir(file)]\n        if files:\n            try:\n                ds = load_dataset(\n                    \"parquet\", name=file.stem, data_files={\"train\": files}\n                )\n                if not enable_metadata and DISTILABEL_METADATA_KEY in ds.column_names:\n                    ds = ds.remove_columns(DISTILABEL_METADATA_KEY)\n                distiset[file.stem] = ds\n            except ArrowInvalid:\n                logger.warning(f\"\u274c Failed to load the subset from '{file}' directory.\")\n                continue\n        else:\n            logger.warning(\n                f\"No output files for step '{file.stem}', can't create a dataset.\"\n                \" Did the step produce any data?\"\n            )\n\n    # If there's only one dataset i.e. 
one config, then set the config name to `default`\n    if len(distiset.keys()) == 1:\n        distiset[\"default\"] = distiset.pop(list(distiset.keys())[0])\n\n    # If there's any artifact set the `artifacts_path` so they can be uploaded\n    steps_artifacts_dir = data_dir / STEPS_ARTIFACTS_PATH\n    if any(steps_artifacts_dir.rglob(\"*\")):\n        distiset.artifacts_path = steps_artifacts_dir\n\n    # Include `pipeline.yaml` if exists\n    if pipeline_path:\n        distiset.pipeline_path = pipeline_path\n    else:\n        # If the pipeline path is not provided, try to find it in the parent directory\n        # and assume that's the wanted file.\n        pipeline_path = steps_outputs_dir.parent / \"pipeline.yaml\"\n        if pipeline_path.exists():\n            distiset.pipeline_path = pipeline_path\n\n    # Include `pipeline.log` if exists\n    if log_filename_path:\n        distiset.log_filename_path = log_filename_path\n    else:\n        log_filename_path = steps_outputs_dir.parent / \"pipeline.log\"\n        if log_filename_path.exists():\n            distiset.log_filename_path = log_filename_path\n\n    if dag:\n        distiset._citations = _grab_citations(dag)\n\n    return distiset\n
"},{"location":"api/errors/","title":"Errors","text":"

This section contains the distilabel custom errors. Unlike exceptions, errors in distilabel are used to handle unexpected situations that can't be anticipated and that can't be handled in a controlled way.

"},{"location":"api/errors/#distilabel.errors.DistilabelError","title":"DistilabelError","text":"

A mixin class for common functionality shared by all Distilabel-specific errors.

Attributes:

Name Type Description message

A message describing the error.

page

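An optional path to a page of the distilabel documentation with further information about the error; when set, it is appended to the documentation base URL in the error message.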

Examples:

raise DistilabelUserError(\"This is an error message.\")\nThis is an error message.\n\nraise DistilabelUserError(\"This is an error message.\", page=\"sections/getting_started/faq/\")\nThis is an error message.\nFor further information visit 'https://distilabel.argilla.io/latest/sections/getting_started/faq/'\n
Source code in src/distilabel/errors.py
class DistilabelError:\n    \"\"\"A mixin class for common functionality shared by all Distilabel-specific errors.\n\n    Attributes:\n        message: A message describing the error.\n        page: An optional error code from PydanticErrorCodes enum.\n\n    Examples:\n        ```python\n        raise DistilabelUserError(\"This is an error message.\")\n        This is an error message.\n\n        raise DistilabelUserError(\"This is an error message.\", page=\"sections/getting_started/faq/\")\n        This is an error message.\n        For further information visit 'https://distilabel.argilla.io/latest/sections/getting_started/faq/'\n        ```\n    \"\"\"\n\n    def __init__(self, message: str, *, page: Optional[str] = None) -> None:\n        self.message = message\n        self.page = page\n\n    def __str__(self) -> str:\n        if self.page is None:\n            return self.message\n        else:\n            return f\"{self.message}\\n\\nFor further information visit '{DISTILABEL_DOCS_URL}{self.page}'\"\n
"},{"location":"api/errors/#distilabel.errors.DistilabelUserError","title":"DistilabelUserError","text":"

Bases: DistilabelError, ValueError

ValueError that we can redirect to a given page in the documentation.

Source code in src/distilabel/errors.py
class DistilabelUserError(DistilabelError, ValueError):\n    \"\"\"ValueError that we can redirect to a given page in the documentation.\"\"\"\n\n    pass\n
"},{"location":"api/errors/#distilabel.errors.DistilabelTypeError","title":"DistilabelTypeError","text":"

Bases: DistilabelError, TypeError

TypeError that we can redirect to a given page in the documentation.

Source code in src/distilabel/errors.py
class DistilabelTypeError(DistilabelError, TypeError):\n    \"\"\"TypeError that we can redirect to a given page in the documentation.\"\"\"\n\n    pass\n
"},{"location":"api/errors/#distilabel.errors.DistilabelNotImplementedError","title":"DistilabelNotImplementedError","text":"

Bases: DistilabelError, NotImplementedError

NotImplementedError that we can redirect to a given page in the documentation.

Source code in src/distilabel/errors.py
class DistilabelNotImplementedError(DistilabelError, NotImplementedError):\n    \"\"\"NotImplementedError that we can redirect to a given page in the documentation.\"\"\"\n\n    pass\n
"},{"location":"api/exceptions/","title":"Exceptions","text":"

This section contains the distilabel custom exceptions. Unlike errors, exceptions in distilabel are used to handle specific situations that can be anticipated and that can be handled in a controlled way internally by the library.

"},{"location":"api/exceptions/#distilabel.exceptions.DistilabelException","title":"DistilabelException","text":"

Bases: Exception

Base exception (can be gracefully handled) for distilabel framework.

Source code in src/distilabel/exceptions.py
class DistilabelException(Exception):\n    \"\"\"Base exception (can be gracefully handled) for `distilabel` framework.\"\"\"\n\n    pass\n
"},{"location":"api/exceptions/#distilabel.exceptions.DistilabelGenerationException","title":"DistilabelGenerationException","text":"

Bases: DistilabelException

Base exception for LLM generation errors.

Source code in src/distilabel/exceptions.py
class DistilabelGenerationException(DistilabelException):\n    \"\"\"Base exception for `LLM` generation errors.\"\"\"\n\n    pass\n
"},{"location":"api/exceptions/#distilabel.exceptions.DistilabelOfflineBatchGenerationNotFinishedException","title":"DistilabelOfflineBatchGenerationNotFinishedException","text":"

Bases: DistilabelGenerationException

Exception raised when a batch generation is not finished.

Source code in src/distilabel/exceptions.py
class DistilabelOfflineBatchGenerationNotFinishedException(\n    DistilabelGenerationException\n):\n    \"\"\"Exception raised when a batch generation is not finished.\"\"\"\n\n    jobs_ids: Tuple[str, ...]\n\n    def __init__(self, jobs_ids: Tuple[str, ...]) -> None:\n        self.jobs_ids = jobs_ids\n        super().__init__(f\"Batch generation with jobs_ids={jobs_ids} is not finished\")\n
"},{"location":"api/mixins/requirements/","title":"RequirementsMixin","text":""},{"location":"api/mixins/requirements/#distilabel.mixins.requirements.RequirementsMixin","title":"RequirementsMixin","text":"

Mixin for classes that have requirements attribute.

Used to add requirements to a Step and a Pipeline.

Source code in src/distilabel/mixins/requirements.py
class RequirementsMixin:\n    \"\"\"Mixin for classes that have `requirements` attribute.\n\n    Used to add requirements to a `Step` and a `Pipeline`.\n    \"\"\"\n\n    _requirements: Union[List[Requirement], None] = []\n\n    def _gather_requirements(self) -> List[str]:\n        \"\"\"This method will be overwritten in the `BasePipeline` class to gather the requirements\n        from each step.\n        \"\"\"\n        return []\n\n    @property\n    def requirements(self) -> List[str]:\n        \"\"\"Return a list of requirements that must be installed to run the `Pipeline`.\n\n        The requirements in a Pipeline will include the requirements from all the steps (if any).\n\n        Returns:\n            List of requirements that must be installed to run the `Pipeline`, sorted alphabetically.\n        \"\"\"\n        self.requirements = self._gather_requirements()\n\n        return [str(r) for r in self._requirements]\n\n    @requirements.setter\n    def requirements(self, _requirements: List[str]) -> None:\n        requirements = []\n        if not isinstance(_requirements, list):\n            _requirements = [_requirements]\n\n        for r in _requirements:\n            try:\n                requirements.append(Requirement(r))\n            except InvalidRequirement:\n                self._logger.warning(f\"Invalid requirement: `{r}`\")\n\n        self._requirements = sorted(\n            set(self._requirements).union(set(requirements)), key=lambda x: str(x)\n        )\n\n    def requirements_to_install(self) -> List[str]:\n        \"\"\"Check if the requirements are installed in the current environment, and returns the ones that aren't.\n\n        Returns:\n            List of requirements required to run the pipeline that are not installed in the current environment.\n        \"\"\"\n\n        to_install = []\n        for req in self.requirements:\n            requirement = Requirement(req)\n            if importlib.util.find_spec(requirement.name):\n     
           if (str(requirement.specifier) != \"\") and (\n                    version(requirement.name) != str(requirement.specifier)\n                ):\n                    to_install.append(req)\n            else:\n                to_install.append(req)\n        return to_install\n
"},{"location":"api/mixins/requirements/#distilabel.mixins.requirements.RequirementsMixin.requirements","title":"requirements: List[str] property writable","text":"

Return a list of requirements that must be installed to run the Pipeline.

The requirements in a Pipeline will include the requirements from all the steps (if any).

Returns:

Type Description List[str]

List of requirements that must be installed to run the Pipeline, sorted alphabetically.

"},{"location":"api/mixins/requirements/#distilabel.mixins.requirements.RequirementsMixin.requirements_to_install","title":"requirements_to_install()","text":"

Checks whether the requirements are installed in the current environment and returns the ones that aren't.

Returns:

Type Description List[str]

List of requirements required to run the pipeline that are not installed in the current environment.

Source code in src/distilabel/mixins/requirements.py
def requirements_to_install(self) -> List[str]:\n    \"\"\"Check if the requirements are installed in the current environment, and returns the ones that aren't.\n\n    Returns:\n        List of requirements required to run the pipeline that are not installed in the current environment.\n    \"\"\"\n\n    to_install = []\n    for req in self.requirements:\n        requirement = Requirement(req)\n        if importlib.util.find_spec(requirement.name):\n            if (str(requirement.specifier) != \"\") and (\n                version(requirement.name) != str(requirement.specifier)\n            ):\n                to_install.append(req)\n        else:\n            to_install.append(req)\n    return to_install\n
"},{"location":"api/mixins/runtime_parameters/","title":"RuntimeParametersMixin","text":""},{"location":"api/mixins/runtime_parameters/#distilabel.mixins.runtime_parameters.RuntimeParametersMixin","title":"RuntimeParametersMixin","text":"

Bases: BaseModel

Mixin for classes that have RuntimeParameters attributes.

Attributes:

Name Type Description _runtime_parameters Dict[str, Any]

A dictionary containing the values of the runtime parameters of the class. This attribute is meant to be used internally and should not be accessed directly.

Source code in src/distilabel/mixins/runtime_parameters.py
class RuntimeParametersMixin(BaseModel):\n    \"\"\"Mixin for classes that have `RuntimeParameter`s attributes.\n\n    Attributes:\n        _runtime_parameters: A dictionary containing the values of the runtime parameters\n            of the class. This attribute is meant to be used internally and should not be\n            accessed directly.\n    \"\"\"\n\n    _runtime_parameters: Dict[str, Any] = PrivateAttr(default_factory=dict)\n\n    @property\n    def runtime_parameters_names(self) -> \"RuntimeParametersNames\":\n        \"\"\"Returns a dictionary containing the name of the runtime parameters of the class\n        as keys and whether the parameter is required or not as values.\n\n        Returns:\n            A dictionary containing the name of the runtime parameters of the class as keys\n            and whether the parameter is required or not as values.\n        \"\"\"\n\n        runtime_parameters = {}\n\n        for name, field_info in self.model_fields.items():  # type: ignore\n            # `field: RuntimeParameter[Any]` or `field: Optional[RuntimeParameter[Any]]`\n            is_runtime_param, is_optional = _is_runtime_parameter(field_info)\n            if is_runtime_param:\n                runtime_parameters[name] = is_optional\n                continue\n\n            attr = getattr(self, name)\n\n            # `field: RuntimeParametersMixin`\n            if isinstance(attr, RuntimeParametersMixin):\n                runtime_parameters[name] = attr.runtime_parameters_names\n\n            # `field: List[RuntimeParametersMixin]`\n            if (\n                isinstance(attr, list)\n                and attr\n                and isinstance(attr[0], RuntimeParametersMixin)\n            ):\n                runtime_parameters[name] = {\n                    str(i): item.runtime_parameters_names for i, item in enumerate(attr)\n                }\n\n        return runtime_parameters\n\n    def get_runtime_parameters_info(self) -> 
List[\"RuntimeParameterInfo\"]:\n        \"\"\"Gets the information of the runtime parameters of the class such as the name and\n        the description. This function is meant to include the information of the runtime\n        parameters in the serialized data of the class.\n\n        Returns:\n            A list containing the information for each runtime parameter of the class.\n        \"\"\"\n        runtime_parameters_info = []\n        for name, field_info in self.model_fields.items():  # type: ignore\n            if name not in self.runtime_parameters_names:\n                continue\n\n            attr = getattr(self, name)\n\n            # Get runtime parameters info for `RuntimeParametersMixin` field\n            if isinstance(attr, RuntimeParametersMixin):\n                runtime_parameters_info.append(\n                    {\n                        \"name\": name,\n                        \"runtime_parameters_info\": attr.get_runtime_parameters_info(),\n                    }\n                )\n                continue\n\n            # Get runtime parameters info for `List[RuntimeParametersMixin]` field\n            if isinstance(attr, list) and isinstance(attr[0], RuntimeParametersMixin):\n                runtime_parameters_info.append(\n                    {\n                        \"name\": name,\n                        \"runtime_parameters_info\": {\n                            str(i): item.get_runtime_parameters_info()\n                            for i, item in enumerate(attr)\n                        },\n                    }\n                )\n                continue\n\n            info = {\"name\": name, \"optional\": self.runtime_parameters_names[name]}\n            if field_info.description is not None:\n                info[\"description\"] = field_info.description\n            runtime_parameters_info.append(info)\n        return runtime_parameters_info\n\n    def set_runtime_parameters(self, runtime_parameters: Dict[str, Any]) -> 
None:\n        \"\"\"Sets the runtime parameters of the class using the provided values. If the attr\n        to be set is a `RuntimeParametersMixin`, it will call `set_runtime_parameters` on\n        the attr.\n\n        Args:\n            runtime_parameters: A dictionary containing the values of the runtime parameters\n                to set.\n        \"\"\"\n        runtime_parameters_names = list(self.runtime_parameters_names.keys())\n        for name, value in runtime_parameters.items():\n            if name not in self.runtime_parameters_names:\n                # Check done just to ensure the unit tests for the mixin run\n                if getattr(self, \"pipeline\", None):\n                    closest = difflib.get_close_matches(\n                        name, runtime_parameters_names, cutoff=0.5\n                    )\n                    msg = (\n                        f\"\u26a0\ufe0f  Runtime parameter '{name}' unknown in step '{self.name}'.\"  # type: ignore\n                    )\n                    if closest:\n                        msg += f\" Did you mean any of: {closest}\"\n                    else:\n                        msg += f\" Available runtime parameters for the step: {runtime_parameters_names}.\"\n                    self.pipeline._logger.warning(msg)  # type: ignore\n                continue\n\n            attr = getattr(self, name)\n\n            # Set runtime parameters for `RuntimeParametersMixin` field\n            if isinstance(attr, RuntimeParametersMixin):\n                attr.set_runtime_parameters(value)\n                self._runtime_parameters[name] = value\n                continue\n\n            # Set runtime parameters for `List[RuntimeParametersMixin]` field\n            if isinstance(attr, list) and isinstance(attr[0], RuntimeParametersMixin):\n                for i, item in enumerate(attr):\n                    item_value = value.get(str(i), {})\n                    item.set_runtime_parameters(item_value)\n          
      self._runtime_parameters[name] = value\n                continue\n\n            # Handle settings values for `_SecretField`\n            field_info = self.model_fields[name]\n            inner_type = extract_annotation_inner_type(field_info.annotation)\n            if is_type_pydantic_secret_field(inner_type):\n                value = inner_type(value)\n\n            # Set the value of the runtime parameter\n            setattr(self, name, value)\n            self._runtime_parameters[name] = value\n
"},{"location":"api/mixins/runtime_parameters/#distilabel.mixins.runtime_parameters.RuntimeParametersMixin.runtime_parameters_names","title":"runtime_parameters_names: RuntimeParametersNames property","text":"

Returns a dictionary containing the name of the runtime parameters of the class as keys and whether the parameter is optional or not as values.

Returns:

Type Description RuntimeParametersNames

A dictionary containing the name of the runtime parameters of the class as keys and whether the parameter is optional or not as values.

"},{"location":"api/mixins/runtime_parameters/#distilabel.mixins.runtime_parameters.RuntimeParametersMixin.get_runtime_parameters_info","title":"get_runtime_parameters_info()","text":"

Gets the information of the runtime parameters of the class such as the name and the description. This function is meant to include the information of the runtime parameters in the serialized data of the class.

Returns:

Type Description List[RuntimeParameterInfo]

A list containing the information for each runtime parameter of the class.

Source code in src/distilabel/mixins/runtime_parameters.py
def get_runtime_parameters_info(self) -> List[\"RuntimeParameterInfo\"]:\n    \"\"\"Gets the information of the runtime parameters of the class such as the name and\n    the description. This function is meant to include the information of the runtime\n    parameters in the serialized data of the class.\n\n    Returns:\n        A list containing the information for each runtime parameter of the class.\n    \"\"\"\n    runtime_parameters_info = []\n    for name, field_info in self.model_fields.items():  # type: ignore\n        if name not in self.runtime_parameters_names:\n            continue\n\n        attr = getattr(self, name)\n\n        # Get runtime parameters info for `RuntimeParametersMixin` field\n        if isinstance(attr, RuntimeParametersMixin):\n            runtime_parameters_info.append(\n                {\n                    \"name\": name,\n                    \"runtime_parameters_info\": attr.get_runtime_parameters_info(),\n                }\n            )\n            continue\n\n        # Get runtime parameters info for `List[RuntimeParametersMixin]` field\n        if isinstance(attr, list) and isinstance(attr[0], RuntimeParametersMixin):\n            runtime_parameters_info.append(\n                {\n                    \"name\": name,\n                    \"runtime_parameters_info\": {\n                        str(i): item.get_runtime_parameters_info()\n                        for i, item in enumerate(attr)\n                    },\n                }\n            )\n            continue\n\n        info = {\"name\": name, \"optional\": self.runtime_parameters_names[name]}\n        if field_info.description is not None:\n            info[\"description\"] = field_info.description\n        runtime_parameters_info.append(info)\n    return runtime_parameters_info\n
"},{"location":"api/mixins/runtime_parameters/#distilabel.mixins.runtime_parameters.RuntimeParametersMixin.set_runtime_parameters","title":"set_runtime_parameters(runtime_parameters)","text":"

Sets the runtime parameters of the class using the provided values. If the attr to be set is a RuntimeParametersMixin, it will call set_runtime_parameters on the attr.

Parameters:

Name Type Description Default runtime_parameters Dict[str, Any]

A dictionary containing the values of the runtime parameters to set.

required Source code in src/distilabel/mixins/runtime_parameters.py
def set_runtime_parameters(self, runtime_parameters: Dict[str, Any]) -> None:\n    \"\"\"Sets the runtime parameters of the class using the provided values. If the attr\n    to be set is a `RuntimeParametersMixin`, it will call `set_runtime_parameters` on\n    the attr.\n\n    Args:\n        runtime_parameters: A dictionary containing the values of the runtime parameters\n            to set.\n    \"\"\"\n    runtime_parameters_names = list(self.runtime_parameters_names.keys())\n    for name, value in runtime_parameters.items():\n        if name not in self.runtime_parameters_names:\n            # Check done just to ensure the unit tests for the mixin run\n            if getattr(self, \"pipeline\", None):\n                closest = difflib.get_close_matches(\n                    name, runtime_parameters_names, cutoff=0.5\n                )\n                msg = (\n                    f\"\u26a0\ufe0f  Runtime parameter '{name}' unknown in step '{self.name}'.\"  # type: ignore\n                )\n                if closest:\n                    msg += f\" Did you mean any of: {closest}\"\n                else:\n                    msg += f\" Available runtime parameters for the step: {runtime_parameters_names}.\"\n                self.pipeline._logger.warning(msg)  # type: ignore\n            continue\n\n        attr = getattr(self, name)\n\n        # Set runtime parameters for `RuntimeParametersMixin` field\n        if isinstance(attr, RuntimeParametersMixin):\n            attr.set_runtime_parameters(value)\n            self._runtime_parameters[name] = value\n            continue\n\n        # Set runtime parameters for `List[RuntimeParametersMixin]` field\n        if isinstance(attr, list) and isinstance(attr[0], RuntimeParametersMixin):\n            for i, item in enumerate(attr):\n                item_value = value.get(str(i), {})\n                item.set_runtime_parameters(item_value)\n            self._runtime_parameters[name] = value\n            continue\n\n  
      # Handle settings values for `_SecretField`\n        field_info = self.model_fields[name]\n        inner_type = extract_annotation_inner_type(field_info.annotation)\n        if is_type_pydantic_secret_field(inner_type):\n            value = inner_type(value)\n\n        # Set the value of the runtime parameter\n        setattr(self, name, value)\n        self._runtime_parameters[name] = value\n
"},{"location":"api/models/embedding/","title":"Embedding","text":"

This section contains the API reference for the distilabel embeddings.

For more information on how Embeddings work, and to see some examples, refer to the related guides elsewhere in this documentation.

"},{"location":"api/models/embedding/#distilabel.models.embeddings.base","title":"base","text":""},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings","title":"Embeddings","text":"

Bases: RuntimeParametersMixin, BaseModel, _Serializable, ABC

Base class for Embeddings models.

To implement an Embeddings subclass, you need to subclass this class and implement: - load method to load the Embeddings model. Don't forget to call super().load(), so the _logger attribute is initialized. - model_name property to return the model name used for the Embeddings. - encode method to generate the sentence embeddings.

Attributes:

Name Type Description _logger Logger

the logger to be used for the Embeddings model. It will be initialized when the load method is called.

Source code in src/distilabel/models/embeddings/base.py
class Embeddings(RuntimeParametersMixin, BaseModel, _Serializable, ABC):\n    \"\"\"Base class for `Embeddings` models.\n\n    To implement an `Embeddings` subclass, you need to subclass this class and implement:\n        - `load` method to load the `Embeddings` model. Don't forget to call `super().load()`,\n            so the `_logger` attribute is initialized.\n        - `model_name` property to return the model name used for the `Embeddings`.\n        - `encode` method to generate the sentence embeddings.\n\n    Attributes:\n        _logger: the logger to be used for the `Embeddings` model. It will be initialized\n            when the `load` method is called.\n    \"\"\"\n\n    model_config = ConfigDict(\n        arbitrary_types_allowed=True,\n        protected_namespaces=(),\n        validate_default=True,\n        validate_assignment=True,\n        extra=\"forbid\",\n    )\n    _logger: \"Logger\" = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Method to be called to initialize the `Embeddings`\"\"\"\n        self._logger = logging.getLogger(f\"distilabel.llm.{self.model_name}\")\n\n    def unload(self) -> None:\n        \"\"\"Method to be called to unload the `Embeddings` and release any resources.\"\"\"\n        pass\n\n    @property\n    @abstractmethod\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the `Embeddings`.\"\"\"\n        pass\n\n    @abstractmethod\n    def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n        \"\"\"Generates embeddings for the provided inputs.\n\n        Args:\n            inputs: a list of texts for which an embedding has to be generated.\n\n        Returns:\n            The generated embeddings.\n        \"\"\"\n        pass\n
"},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings.model_name","title":"model_name: str abstractmethod property","text":"

Returns the model name used for the Embeddings.

"},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings.load","title":"load()","text":"

Method to be called to initialize the Embeddings.

Source code in src/distilabel/models/embeddings/base.py
def load(self) -> None:\n    \"\"\"Method to be called to initialize the `Embeddings`\"\"\"\n    self._logger = logging.getLogger(f\"distilabel.llm.{self.model_name}\")\n
"},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings.unload","title":"unload()","text":"

Method to be called to unload the Embeddings and release any resources.

Source code in src/distilabel/models/embeddings/base.py
def unload(self) -> None:\n    \"\"\"Method to be called to unload the `Embeddings` and release any resources.\"\"\"\n    pass\n
"},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings.encode","title":"encode(inputs) abstractmethod","text":"

Generates embeddings for the provided inputs.

Parameters:

Name Type Description Default inputs List[str]

a list of texts for which an embedding has to be generated.

required

Returns:

Type Description List[List[Union[int, float]]]

The generated embeddings.

Source code in src/distilabel/models/embeddings/base.py
@abstractmethod\ndef encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n    \"\"\"Generates embeddings for the provided inputs.\n\n    Args:\n        inputs: a list of texts for which an embedding has to be generated.\n\n    Returns:\n        The generated embeddings.\n    \"\"\"\n    pass\n
"},{"location":"api/models/embedding/embedding_gallery/","title":"Embedding Gallery","text":"

This section contains the existing Embeddings subclasses implemented in distilabel.

"},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings","title":"embeddings","text":""},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.SentenceTransformerEmbeddings","title":"SentenceTransformerEmbeddings","text":"

Bases: Embeddings, CudaDevicePlacementMixin

sentence-transformers library implementation for embedding generation.

Attributes:

Name Type Description model str

the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

device Optional[RuntimeParameter[str]]

the name of the device used to load the model e.g. \"cuda\", \"mps\", etc. Defaults to None.

prompts Optional[Dict[str, str]]

a dictionary containing prompts to be used with the model. Defaults to None.

default_prompt_name Optional[str]

the default prompt (in prompts) that will be applied to the inputs. If not provided, then no prompt will be used. Defaults to None.

trust_remote_code bool

whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False.

revision Optional[str]

if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to None.

token Optional[str]

the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment variable or the huggingface_hub package local configuration will be used. Defaults to None.

truncate_dim Optional[int]

the dimension to truncate the sentence embeddings. Defaults to None.

model_kwargs Optional[Dict[str, Any]]

extra kwargs that will be passed to the Hugging Face transformers model class. Defaults to None.

tokenizer_kwargs Optional[Dict[str, Any]]

extra kwargs that will be passed to the Hugging Face transformers tokenizer class. Defaults to None.

config_kwargs Optional[Dict[str, Any]]

extra kwargs that will be passed to the Hugging Face transformers configuration class. Defaults to None.

precision Optional[Literal['float32', 'int8', 'uint8', 'binary', 'ubinary']]

the dtype of the resulting embeddings. Defaults to \"float32\".

normalize_embeddings RuntimeParameter[bool]

whether to normalize the embeddings so they have a length of 1. Defaults to True.

Examples:

Generating sentence embeddings:

from distilabel.models import SentenceTransformerEmbeddings\n\nembeddings = SentenceTransformerEmbeddings(model=\"mixedbread-ai/mxbai-embed-large-v1\")\n\nembeddings.load()\n\nresults = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n# [\n#   [-0.05447685346007347, -0.01623094454407692, ...],\n#   [4.4889533455716446e-05, 0.044016145169734955, ...],\n# ]\n
Source code in src/distilabel/models/embeddings/sentence_transformers.py
class SentenceTransformerEmbeddings(Embeddings, CudaDevicePlacementMixin):\n    \"\"\"`sentence-transformers` library implementation for embedding generation.\n\n    Attributes:\n        model: the model Hugging Face Hub repo id or a path to a directory containing the\n            model weights and configuration files.\n        device: the name of the device used to load the model e.g. \"cuda\", \"mps\", etc.\n            Defaults to `None`.\n        prompts: a dictionary containing prompts to be used with the model. Defaults to\n            `None`.\n        default_prompt_name: the default prompt (in `prompts`) that will be applied to the\n            inputs. If not provided, then no prompt will be used. Defaults to `None`.\n        trust_remote_code: whether to allow fetching and executing remote code fetched\n            from the repository in the Hub. Defaults to `False`.\n        revision: if `model` refers to a Hugging Face Hub repository, then the revision\n            (e.g. a branch name or a commit id) to use. Defaults to `\"main\"`.\n        token: the Hugging Face Hub token that will be used to authenticate to the Hugging\n            Face Hub. If not provided, the `HF_TOKEN` environment or `huggingface_hub` package\n            local configuration will be used. Defaults to `None`.\n        truncate_dim: the dimension to truncate the sentence embeddings. Defaults to `None`.\n        model_kwargs: extra kwargs that will be passed to the Hugging Face `transformers`\n            model class. Defaults to `None`.\n        tokenizer_kwargs: extra kwargs that will be passed to the Hugging Face `transformers`\n            tokenizer class. Defaults to `None`.\n        config_kwargs: extra kwargs that will be passed to the Hugging Face `transformers`\n            configuration class. Defaults to `None`.\n        precision: the dtype that will have the resulting embeddings. 
Defaults to `\"float32\"`.\n        normalize_embeddings: whether to normalize the embeddings so they have a length\n            of 1. Defaults to `None`.\n\n    Examples:\n        Generating sentence embeddings:\n\n        ```python\n        from distilabel.models import SentenceTransformerEmbeddings\n\n        embeddings = SentenceTransformerEmbeddings(model=\"mixedbread-ai/mxbai-embed-large-v1\")\n\n        embeddings.load()\n\n        results = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n        # [\n        #   [-0.05447685346007347, -0.01623094454407692, ...],\n        #   [4.4889533455716446e-05, 0.044016145169734955, ...],\n        # ]\n        ```\n    \"\"\"\n\n    model: str\n    device: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The device to be used to load the model. If `None`, then it\"\n        \" will check if a GPU can be used.\",\n    )\n    prompts: Optional[Dict[str, str]] = None\n    default_prompt_name: Optional[str] = None\n    trust_remote_code: bool = False\n    revision: Optional[str] = None\n    token: Optional[str] = None\n    truncate_dim: Optional[int] = None\n    model_kwargs: Optional[Dict[str, Any]] = None\n    tokenizer_kwargs: Optional[Dict[str, Any]] = None\n    config_kwargs: Optional[Dict[str, Any]] = None\n    precision: Optional[Literal[\"float32\", \"int8\", \"uint8\", \"binary\", \"ubinary\"]] = (\n        \"float32\"\n    )\n    normalize_embeddings: RuntimeParameter[bool] = Field(\n        default=True,\n        description=\"Whether to normalize the embeddings so the generated vectors\"\n        \" have a length of 1 or not.\",\n    )\n\n    _model: Union[\"SentenceTransformer\", None] = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Loads the Sentence Transformer model\"\"\"\n        super().load()\n\n        if self.device == \"cuda\":\n            CudaDevicePlacementMixin.load(self)\n\n        try:\n            from 
sentence_transformers import SentenceTransformer\n        except ImportError as e:\n            raise ImportError(\n                \"`sentence-transformers` package is not installed. Please install it using\"\n                \" `pip install sentence-transformers`.\"\n            ) from e\n\n        self._model = SentenceTransformer(\n            model_name_or_path=self.model,\n            device=self.device,\n            prompts=self.prompts,\n            default_prompt_name=self.default_prompt_name,\n            trust_remote_code=self.trust_remote_code,\n            revision=self.revision,\n            token=self.token,\n            truncate_dim=self.truncate_dim,\n            model_kwargs=self.model_kwargs,\n            tokenizer_kwargs=self.tokenizer_kwargs,\n            config_kwargs=self.config_kwargs,\n        )\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the name of the model.\"\"\"\n        return self.model\n\n    def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n        \"\"\"Generates embeddings for the provided inputs.\n\n        Args:\n            inputs: a list of texts for which an embedding has to be generated.\n\n        Returns:\n            The generated embeddings.\n        \"\"\"\n        return self._model.encode(  # type: ignore\n            sentences=inputs,\n            batch_size=len(inputs),\n            convert_to_numpy=True,\n            precision=self.precision,  # type: ignore\n            normalize_embeddings=self.normalize_embeddings,  # type: ignore\n        ).tolist()  # type: ignore\n\n    def unload(self) -> None:\n        del self._model\n        if self.device == \"cuda\":\n            CudaDevicePlacementMixin.unload(self)\n        super().unload()\n
"},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.SentenceTransformerEmbeddings.model_name","title":"model_name: str property","text":"

Returns the name of the model.

"},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.SentenceTransformerEmbeddings.load","title":"load()","text":"

Loads the Sentence Transformer model

Source code in src/distilabel/models/embeddings/sentence_transformers.py
def load(self) -> None:\n    \"\"\"Loads the Sentence Transformer model\"\"\"\n    super().load()\n\n    if self.device == \"cuda\":\n        CudaDevicePlacementMixin.load(self)\n\n    try:\n        from sentence_transformers import SentenceTransformer\n    except ImportError as e:\n        raise ImportError(\n            \"`sentence-transformers` package is not installed. Please install it using\"\n            \" `pip install sentence-transformers`.\"\n        ) from e\n\n    self._model = SentenceTransformer(\n        model_name_or_path=self.model,\n        device=self.device,\n        prompts=self.prompts,\n        default_prompt_name=self.default_prompt_name,\n        trust_remote_code=self.trust_remote_code,\n        revision=self.revision,\n        token=self.token,\n        truncate_dim=self.truncate_dim,\n        model_kwargs=self.model_kwargs,\n        tokenizer_kwargs=self.tokenizer_kwargs,\n        config_kwargs=self.config_kwargs,\n    )\n
"},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.SentenceTransformerEmbeddings.encode","title":"encode(inputs)","text":"

Generates embeddings for the provided inputs.

Parameters:

Name Type Description Default inputs List[str]

a list of texts for which an embedding has to be generated.

required

Returns:

Type Description List[List[Union[int, float]]]

The generated embeddings.

Source code in src/distilabel/models/embeddings/sentence_transformers.py
def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n    \"\"\"Generates embeddings for the provided inputs.\n\n    Args:\n        inputs: a list of texts for which an embedding has to be generated.\n\n    Returns:\n        The generated embeddings.\n    \"\"\"\n    return self._model.encode(  # type: ignore\n        sentences=inputs,\n        batch_size=len(inputs),\n        convert_to_numpy=True,\n        precision=self.precision,  # type: ignore\n        normalize_embeddings=self.normalize_embeddings,  # type: ignore\n    ).tolist()  # type: ignore\n
"},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings","title":"vLLMEmbeddings","text":"

Bases: Embeddings, CudaDevicePlacementMixin

vllm library implementation for embedding generation.

Attributes:

Name Type Description model str

the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

dtype str

the data type to use for the model. Defaults to auto.

trust_remote_code bool

whether to trust the remote code when loading the model. Defaults to False.

quantization Optional[str]

the quantization mode to use for the model. Defaults to None.

revision Optional[str]

the revision of the model to load. Defaults to None.

enforce_eager bool

whether to enforce eager execution. Defaults to True.

seed int

the seed to use for the random number generator. Defaults to 0.

extra_kwargs Optional[RuntimeParameter[Dict[str, Any]]]

additional dictionary of keyword arguments that will be passed to the LLM class of vllm library. Defaults to {}.

_model LLM

the vLLM model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method.

References
  • Offline inference embeddings

Examples:

Generating sentence embeddings:

from distilabel.models import vLLMEmbeddings\n\nembeddings = vLLMEmbeddings(model=\"intfloat/e5-mistral-7b-instruct\")\n\nembeddings.load()\n\nresults = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n# [\n#   [-0.05447685346007347, -0.01623094454407692, ...],\n#   [4.4889533455716446e-05, 0.044016145169734955, ...],\n# ]\n
Source code in src/distilabel/models/embeddings/vllm.py
class vLLMEmbeddings(Embeddings, CudaDevicePlacementMixin):\n    \"\"\"`vllm` library implementation for embedding generation.\n\n    Attributes:\n        model: the model Hugging Face Hub repo id or a path to a directory containing the\n            model weights and configuration files.\n        dtype: the data type to use for the model. Defaults to `auto`.\n        trust_remote_code: whether to trust the remote code when loading the model. Defaults\n            to `False`.\n        quantization: the quantization mode to use for the model. Defaults to `None`.\n        revision: the revision of the model to load. Defaults to `None`.\n        enforce_eager: whether to enforce eager execution. Defaults to `True`.\n        seed: the seed to use for the random number generator. Defaults to `0`.\n        extra_kwargs: additional dictionary of keyword arguments that will be passed to the\n            `LLM` class of `vllm` library. Defaults to `{}`.\n        _model: the `vLLM` model instance. This attribute is meant to be used internally\n            and should not be accessed directly. 
It will be set in the `load` method.\n\n    References:\n        - [Offline inference embeddings](https://docs.vllm.ai/en/latest/getting_started/examples/offline_inference_embedding.html)\n\n    Examples:\n        Generating sentence embeddings:\n\n        ```python\n        from distilabel.models import vLLMEmbeddings\n\n        embeddings = vLLMEmbeddings(model=\"intfloat/e5-mistral-7b-instruct\")\n\n        embeddings.load()\n\n        results = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n        # [\n        #   [-0.05447685346007347, -0.01623094454407692, ...],\n        #   [4.4889533455716446e-05, 0.044016145169734955, ...],\n        # ]\n        ```\n    \"\"\"\n\n    model: str\n    dtype: str = \"auto\"\n    trust_remote_code: bool = False\n    quantization: Optional[str] = None\n    revision: Optional[str] = None\n\n    enforce_eager: bool = True\n\n    seed: int = 0\n\n    extra_kwargs: Optional[RuntimeParameter[Dict[str, Any]]] = Field(\n        default_factory=dict,\n        description=\"Additional dictionary of keyword arguments that will be passed to the\"\n        \" `vLLM` class of `vllm` library. See all the supported arguments at: \"\n        \"https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py\",\n    )\n\n    _model: \"_vLLM\" = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Loads the `vLLM` model using either the path or the Hugging Face Hub repository id.\"\"\"\n        super().load()\n\n        CudaDevicePlacementMixin.load(self)\n\n        try:\n            from vllm import LLM as _vLLM\n\n        except ImportError as ie:\n            raise ImportError(\n                \"vLLM is not installed. 
Please install it using `pip install vllm`.\"\n            ) from ie\n\n        self._model = _vLLM(\n            self.model,\n            dtype=self.dtype,\n            trust_remote_code=self.trust_remote_code,\n            quantization=self.quantization,\n            revision=self.revision,\n            enforce_eager=self.enforce_eager,\n            seed=self.seed,\n            **self.extra_kwargs,  # type: ignore\n        )\n\n    def unload(self) -> None:\n        \"\"\"Unloads the `vLLM` model.\"\"\"\n        CudaDevicePlacementMixin.unload(self)\n        super().unload()\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the name of the model.\"\"\"\n        return self.model\n\n    def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n        \"\"\"Generates embeddings for the provided inputs.\n\n        Args:\n            inputs: a list of texts for which an embedding has to be generated.\n\n        Returns:\n            The generated embeddings.\n        \"\"\"\n        return [output.outputs.embedding for output in self._model.encode(inputs)]\n
"},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings.model_name","title":"model_name: str property","text":"

Returns the name of the model.

"},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings.load","title":"load()","text":"

Loads the vLLM model using either the path or the Hugging Face Hub repository id.

Source code in src/distilabel/models/embeddings/vllm.py
def load(self) -> None:\n    \"\"\"Loads the `vLLM` model using either the path or the Hugging Face Hub repository id.\"\"\"\n    super().load()\n\n    CudaDevicePlacementMixin.load(self)\n\n    try:\n        from vllm import LLM as _vLLM\n\n    except ImportError as ie:\n        raise ImportError(\n            \"vLLM is not installed. Please install it using `pip install vllm`.\"\n        ) from ie\n\n    self._model = _vLLM(\n        self.model,\n        dtype=self.dtype,\n        trust_remote_code=self.trust_remote_code,\n        quantization=self.quantization,\n        revision=self.revision,\n        enforce_eager=self.enforce_eager,\n        seed=self.seed,\n        **self.extra_kwargs,  # type: ignore\n    )\n
"},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings.unload","title":"unload()","text":"

Unloads the vLLM model.

Source code in src/distilabel/models/embeddings/vllm.py
def unload(self) -> None:\n    \"\"\"Unloads the `vLLM` model.\"\"\"\n    CudaDevicePlacementMixin.unload(self)\n    super().unload()\n
"},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings.encode","title":"encode(inputs)","text":"

Generates embeddings for the provided inputs.

Parameters:

Name Type Description Default inputs List[str]

a list of texts for which an embedding has to be generated.

required

Returns:

Type Description List[List[Union[int, float]]]

The generated embeddings.

Source code in src/distilabel/models/embeddings/vllm.py
def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n    \"\"\"Generates embeddings for the provided inputs.\n\n    Args:\n        inputs: a list of texts for which an embedding has to be generated.\n\n    Returns:\n        The generated embeddings.\n    \"\"\"\n    return [output.outputs.embedding for output in self._model.encode(inputs)]\n
"},{"location":"api/models/llm/","title":"LLM","text":"

This section contains the API reference for the distilabel LLMs, both for the LLM synchronous implementation, and for the AsyncLLM asynchronous one.

For more information and examples on how to use existing LLMs or create custom ones, please refer to Tutorial - LLM.

"},{"location":"api/models/llm/#distilabel.models.llms.base","title":"base","text":""},{"location":"api/models/llm/#distilabel.models.llms.base.LLM","title":"LLM","text":"

Bases: RuntimeParametersMixin, BaseModel, _Serializable, ABC

Base class for LLMs to be used in distilabel framework.

To implement an LLM subclass, you need to subclass this class and implement: (1) the load method, to load the LLM if needed (don't forget to call super().load(), so the _logger attribute is initialized); (2) the model_name property, to return the model name used for the LLM; and (3) the generate method, to generate num_generations per input in inputs.

Attributes:

Name Type Description generation_kwargs Optional[RuntimeParameter[Dict[str, Any]]]

the kwargs to be propagated to either generate or agenerate methods within each LLM.

use_offline_batch_generation Optional[RuntimeParameter[bool]]

whether to use the offline_batch_generate method to generate the responses.

offline_batch_generation_block_until_done Optional[RuntimeParameter[int]]

if provided, then polling will be done until the offline_batch_generate method is able to retrieve the results. The value indicates the time in seconds to wait between each poll.

jobs_ids Union[Tuple[str, ...], None]

the job ids generated by the offline_batch_generate method. This attribute is used to store the job ids generated by the offline_batch_generate method so later they can be used to retrieve the results. It is not meant to be set by the user.

_logger Logger

the logger to be used for the LLM. It will be initialized when the load method is called.

Source code in src/distilabel/models/llms/base.py
class LLM(RuntimeParametersMixin, BaseModel, _Serializable, ABC):\n    \"\"\"Base class for `LLM`s to be used in `distilabel` framework.\n\n    To implement an `LLM` subclass, you need to subclass this class and implement:\n        - `load` method to load the `LLM` if needed. Don't forget to call `super().load()`,\n            so the `_logger` attribute is initialized.\n        - `model_name` property to return the model name used for the LLM.\n        - `generate` method to generate `num_generations` per input in `inputs`.\n\n    Attributes:\n        generation_kwargs: the kwargs to be propagated to either `generate` or `agenerate`\n            methods within each `LLM`.\n        use_offline_batch_generation: whether to use the `offline_batch_generate` method to\n            generate the responses.\n        offline_batch_generation_block_until_done: if provided, then polling will be done until\n            the `ofline_batch_generate` method is able to retrieve the results. The value indicate\n            the time to wait between each polling.\n        jobs_ids: the job ids generated by the `offline_batch_generate` method. This attribute\n            is used to store the job ids generated by the `offline_batch_generate` method\n            so later they can be used to retrieve the results. It is not meant to be set by\n            the user.\n        _logger: the logger to be used for the `LLM`. 
It will be initialized when the `load`\n            method is called.\n    \"\"\"\n\n    model_config = ConfigDict(\n        arbitrary_types_allowed=True,\n        protected_namespaces=(),\n        validate_default=True,\n        validate_assignment=True,\n        extra=\"forbid\",\n    )\n\n    generation_kwargs: Optional[RuntimeParameter[Dict[str, Any]]] = Field(\n        default_factory=dict,\n        description=\"The kwargs to be propagated to either `generate` or `agenerate`\"\n        \" methods within each `LLM`.\",\n    )\n    use_offline_batch_generation: Optional[RuntimeParameter[bool]] = Field(\n        default=False,\n        description=\"Whether to use the `offline_batch_generate` method to generate\"\n        \" the responses.\",\n    )\n    offline_batch_generation_block_until_done: Optional[RuntimeParameter[int]] = Field(\n        default=None,\n        description=\"If provided, then polling will be done until the `ofline_batch_generate`\"\n        \" method is able to retrieve the results. The value indicate the time to wait between\"\n        \" each polling.\",\n    )\n\n    jobs_ids: Union[Tuple[str, ...], None] = Field(default=None)\n    _logger: \"Logger\" = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Method to be called to initialize the `LLM`, its logger and optionally the\n        structured output generator.\"\"\"\n        self._logger = logging.getLogger(f\"distilabel.llm.{self.model_name}\")\n\n    def unload(self) -> None:\n        \"\"\"Method to be called to unload the `LLM` and release any resources.\"\"\"\n        pass\n\n    @property\n    @abstractmethod\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        pass\n\n    def get_generation_kwargs(self) -> Dict[str, Any]:\n        \"\"\"Returns the generation kwargs to be used for the generation. 
This method can\n        be overridden to provide a more complex logic for the generation kwargs.\n\n        Returns:\n            The kwargs to be used for the generation.\n        \"\"\"\n        return self.generation_kwargs  # type: ignore\n\n    @abstractmethod\n    def generate(\n        self,\n        inputs: List[\"FormattedInput\"],\n        num_generations: int = 1,\n        **kwargs: Any,\n    ) -> List[\"GenerateOutput\"]:\n        \"\"\"Abstract method to be implemented by each LLM to generate `num_generations`\n        per input in `inputs`.\n\n        Args:\n            inputs: the list of inputs to generate responses for which follows OpenAI's\n                API format:\n\n                ```python\n                [\n                    {\"role\": \"system\", \"content\": \"You're a helpful assistant...\"},\n                    {\"role\": \"user\", \"content\": \"Give a template email for B2B communications...\"},\n                    {\"role\": \"assistant\", \"content\": \"Sure, here's a template you can use...\"},\n                    {\"role\": \"user\", \"content\": \"Modify the second paragraph...\"}\n                ]\n                ```\n            num_generations: the number of generations to generate per input.\n            **kwargs: the additional kwargs to be used for the generation.\n        \"\"\"\n        pass\n\n    def generate_outputs(\n        self,\n        inputs: List[\"FormattedInput\"],\n        num_generations: int = 1,\n        **kwargs: Any,\n    ) -> List[\"GenerateOutput\"]:\n        \"\"\"Generates outputs for the given inputs using either `generate` method or the\n        `offine_batch_generate` method if `use_offline_\n        \"\"\"\n        if self.use_offline_batch_generation:\n            if self.offline_batch_generation_block_until_done is not None:\n                return self._offline_batch_generate_polling(\n                    inputs=inputs,\n                    num_generations=num_generations,\n         
           **kwargs,\n                )\n\n            # This will raise `DistilabelOfflineBatchGenerationNotFinishedException` right away\n            # if the batch generation is not finished.\n            return self.offline_batch_generate(\n                inputs=inputs,\n                num_generations=num_generations,\n                **kwargs,\n            )\n\n        return self.generate(inputs=inputs, num_generations=num_generations, **kwargs)\n\n    def _offline_batch_generate_polling(\n        self,\n        inputs: List[\"FormattedInput\"],\n        num_generations: int = 1,\n        **kwargs: Any,\n    ) -> List[\"GenerateOutput\"]:\n        \"\"\"Method to poll the `offline_batch_generate` method until the batch generation\n        is finished.\n\n        Args:\n            inputs: the list of inputs to generate responses for.\n            num_generations: the number of generations to generate per input.\n            **kwargs: the additional kwargs to be used for the generation.\n\n        Returns:\n            A list containing the generations for each input.\n        \"\"\"\n        while True:\n            try:\n                return self.offline_batch_generate(\n                    inputs=inputs,\n                    num_generations=num_generations,\n                    **kwargs,\n                )\n            except DistilabelOfflineBatchGenerationNotFinishedException as e:\n                self._logger.info(\n                    f\"Waiting for the offline batch generation to finish: {e}. 
Sleeping\"\n                    f\" for {self.offline_batch_generation_block_until_done} seconds before\"\n                    \" trying to get the results again.\"\n                )\n                # When running a `Step` in a child process, SIGINT is overriden so the child\n                # process doesn't stop when the parent process receives a SIGINT signal.\n                # The new handler sets an environment variable that is checked here to stop\n                # the polling.\n                if os.getenv(SIGINT_HANDLER_CALLED_ENV_NAME) is not None:\n                    self._logger.info(\n                        \"Received a KeyboardInterrupt. Stopping polling for checking if the\"\n                        \" offline batch generation is finished...\"\n                    )\n                    raise e\n                time.sleep(self.offline_batch_generation_block_until_done)  # type: ignore\n            except KeyboardInterrupt as e:\n                # This is for the case the `LLM` is being executed outside a pipeline\n                self._logger.info(\n                    \"Received a KeyboardInterrupt. 
Stopping polling for checking if the\"\n                    \" offline batch generation is finished...\"\n                )\n                raise DistilabelOfflineBatchGenerationNotFinishedException(\n                    jobs_ids=self.jobs_ids  # type: ignore\n                ) from e\n\n    @property\n    def generate_parameters(self) -> List[\"inspect.Parameter\"]:\n        \"\"\"Returns the parameters of the `generate` method.\n\n        Returns:\n            A list containing the parameters of the `generate` method.\n        \"\"\"\n        return list(inspect.signature(self.generate).parameters.values())\n\n    @property\n    def runtime_parameters_names(self) -> \"RuntimeParametersNames\":\n        \"\"\"Returns the runtime parameters of the `LLM`, which are combination of the\n        attributes of the `LLM` type hinted with `RuntimeParameter` and the parameters\n        of the `generate` method that are not `input` and `num_generations`.\n\n        Returns:\n            A dictionary with the name of the runtime parameters as keys and a boolean\n            indicating if the parameter is optional or not.\n        \"\"\"\n        runtime_parameters = super().runtime_parameters_names\n        runtime_parameters[\"generation_kwargs\"] = {}\n\n        # runtime parameters from the `generate` method\n        for param in self.generate_parameters:\n            if param.name in [\"input\", \"inputs\", \"num_generations\"]:\n                continue\n            is_optional = param.default != inspect.Parameter.empty\n            runtime_parameters[\"generation_kwargs\"][param.name] = is_optional\n\n        return runtime_parameters\n\n    def get_runtime_parameters_info(self) -> List[\"RuntimeParameterInfo\"]:\n        \"\"\"Gets the information of the runtime parameters of the `LLM` such as the name\n        and the description. 
This function is meant to include the information of the runtime\n        parameters in the serialized data of the `LLM`.\n\n        Returns:\n            A list containing the information for each runtime parameter of the `LLM`.\n        \"\"\"\n        runtime_parameters_info = super().get_runtime_parameters_info()\n\n        generation_kwargs_info = next(\n            (\n                runtime_parameter_info\n                for runtime_parameter_info in runtime_parameters_info\n                if runtime_parameter_info[\"name\"] == \"generation_kwargs\"\n            ),\n            None,\n        )\n\n        # If `generation_kwargs` attribute is present, we need to include the `generate`\n        # method arguments as the information for this attribute.\n        if generation_kwargs_info:\n            generate_docstring_args = self.generate_parsed_docstring[\"args\"]\n\n            generation_kwargs_info[\"keys\"] = []\n            for key, value in generation_kwargs_info[\"optional\"].items():\n                info = {\"name\": key, \"optional\": value}\n                if description := generate_docstring_args.get(key):\n                    info[\"description\"] = description\n                generation_kwargs_info[\"keys\"].append(info)\n\n            generation_kwargs_info.pop(\"optional\")\n\n        return runtime_parameters_info\n\n    @cached_property\n    def generate_parsed_docstring(self) -> \"Docstring\":\n        \"\"\"Returns the parsed docstring of the `generate` method.\n\n        Returns:\n            The parsed docstring of the `generate` method.\n        \"\"\"\n        return parse_google_docstring(self.generate)\n\n    def get_last_hidden_states(\n        self, inputs: List[\"StandardInput\"]\n    ) -> List[\"HiddenState\"]:\n        \"\"\"Method to get the last hidden states of the model for a list of inputs.\n\n        Args:\n            inputs: the list of inputs to get the last hidden states from.\n\n        Returns:\n            A 
list containing the last hidden state for each sequence using a NumPy array\n                with shape [num_tokens, hidden_size].\n        \"\"\"\n        # TODO: update to use `DistilabelNotImplementedError`\n        raise NotImplementedError(\n            f\"Method `get_last_hidden_states` is not implemented for `{self.__class__.__name__}`\"\n        )\n\n    def _prepare_structured_output(\n        self, structured_output: Optional[\"StructuredOutputType\"] = None\n    ) -> Union[Any, None]:\n        \"\"\"Method in charge of preparing the structured output generator.\n\n        By default will raise a `NotImplementedError`, subclasses that allow it must override this\n        method with the implementation.\n\n        Args:\n            structured_output: the config to prepare the guided generation.\n\n        Returns:\n            The structure to be used for the guided generation.\n        \"\"\"\n        # TODO: update to use `DistilabelNotImplementedError`\n        raise NotImplementedError(\n            f\"Guided generation is not implemented for `{type(self).__name__}`\"\n        )\n\n    def offline_batch_generate(\n        self,\n        inputs: Union[List[\"FormattedInput\"], None] = None,\n        num_generations: int = 1,\n        **kwargs: Any,\n    ) -> List[\"GenerateOutput\"]:\n        \"\"\"Method to generate a list of outputs for the given inputs using an offline batch\n        generation method to be implemented by each `LLM`.\n\n        This method should create jobs the first time is called and store the job ids, so\n        the second and subsequent calls can retrieve the results of the batch generation.\n        If subsequent calls are made before the batch generation is finished, then the method\n        should raise a `DistilabelOfflineBatchGenerationNotFinishedException`. 
This exception\n        will be handled automatically by the `Pipeline` which will store all the required\n        information for recovering the pipeline execution when the batch generation is finished.\n\n        Args:\n            inputs: the list of inputs to generate responses for.\n            num_generations: the number of generations to generate per input.\n            **kwargs: the additional kwargs to be used for the generation.\n\n        Returns:\n            A list containing the generations for each input.\n        \"\"\"\n        raise DistilabelNotImplementedError(\n            f\"`offline_batch_generate` is not implemented for `{self.__class__.__name__}`\",\n            page=\"sections/how_to_guides/advanced/offline-batch-generation/\",\n        )\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.model_name","title":"model_name: str abstractmethod property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.generate_parameters","title":"generate_parameters: List[inspect.Parameter] property","text":"

Returns the parameters of the generate method.

Returns:

Type Description List[Parameter]

A list containing the parameters of the generate method.

"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.runtime_parameters_names","title":"runtime_parameters_names: RuntimeParametersNames property","text":"

Returns the runtime parameters of the LLM, which are a combination of the attributes of the LLM type hinted with RuntimeParameter and the parameters of the generate method other than input, inputs and num_generations.

Returns:

Type Description RuntimeParametersNames

A dictionary with the name of the runtime parameters as keys and a boolean indicating if the parameter is optional or not.

"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.generate_parsed_docstring","title":"generate_parsed_docstring: Docstring cached property","text":"

Returns the parsed docstring of the generate method.

Returns:

Type Description Docstring

The parsed docstring of the generate method.

"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.load","title":"load()","text":"

Method to be called to initialize the LLM, its logger and optionally the structured output generator.

Source code in src/distilabel/models/llms/base.py
def load(self) -> None:\n    \"\"\"Method to be called to initialize the `LLM`, its logger and optionally the\n    structured output generator.\"\"\"\n    self._logger = logging.getLogger(f\"distilabel.llm.{self.model_name}\")\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.unload","title":"unload()","text":"

Method to be called to unload the LLM and release any resources.

Source code in src/distilabel/models/llms/base.py
def unload(self) -> None:\n    \"\"\"Method to be called to unload the `LLM` and release any resources.\"\"\"\n    pass\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.get_generation_kwargs","title":"get_generation_kwargs()","text":"

Returns the generation kwargs to be used for the generation. This method can be overridden to provide a more complex logic for the generation kwargs.

Returns:

Type Description Dict[str, Any]

The kwargs to be used for the generation.

Source code in src/distilabel/models/llms/base.py
def get_generation_kwargs(self) -> Dict[str, Any]:\n    \"\"\"Returns the generation kwargs to be used for the generation. This method can\n    be overridden to provide a more complex logic for the generation kwargs.\n\n    Returns:\n        The kwargs to be used for the generation.\n    \"\"\"\n    return self.generation_kwargs  # type: ignore\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.generate","title":"generate(inputs, num_generations=1, **kwargs) abstractmethod","text":"

Abstract method to be implemented by each LLM to generate num_generations per input in inputs.

Parameters:

Name Type Description Default inputs List[FormattedInput]

the list of inputs to generate responses for which follows OpenAI's API format:

[\n    {\"role\": \"system\", \"content\": \"You're a helpful assistant...\"},\n    {\"role\": \"user\", \"content\": \"Give a template email for B2B communications...\"},\n    {\"role\": \"assistant\", \"content\": \"Sure, here's a template you can use...\"},\n    {\"role\": \"user\", \"content\": \"Modify the second paragraph...\"}\n]\n
required num_generations int

the number of generations to generate per input.

1 **kwargs Any

the additional kwargs to be used for the generation.

{} Source code in src/distilabel/models/llms/base.py
@abstractmethod\ndef generate(\n    self,\n    inputs: List[\"FormattedInput\"],\n    num_generations: int = 1,\n    **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n    \"\"\"Abstract method to be implemented by each LLM to generate `num_generations`\n    per input in `inputs`.\n\n    Args:\n        inputs: the list of inputs to generate responses for which follows OpenAI's\n            API format:\n\n            ```python\n            [\n                {\"role\": \"system\", \"content\": \"You're a helpful assistant...\"},\n                {\"role\": \"user\", \"content\": \"Give a template email for B2B communications...\"},\n                {\"role\": \"assistant\", \"content\": \"Sure, here's a template you can use...\"},\n                {\"role\": \"user\", \"content\": \"Modify the second paragraph...\"}\n            ]\n            ```\n        num_generations: the number of generations to generate per input.\n        **kwargs: the additional kwargs to be used for the generation.\n    \"\"\"\n    pass\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.generate_outputs","title":"generate_outputs(inputs, num_generations=1, **kwargs)","text":"

Generates outputs for the given inputs using either the generate method or, if use_offline_batch_generation is enabled, the offline_batch_generate method.

Source code in src/distilabel/models/llms/base.py
def generate_outputs(\n    self,\n    inputs: List[\"FormattedInput\"],\n    num_generations: int = 1,\n    **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n    \"\"\"Generates outputs for the given inputs using either `generate` method or the\n    `offine_batch_generate` method if `use_offline_\n    \"\"\"\n    if self.use_offline_batch_generation:\n        if self.offline_batch_generation_block_until_done is not None:\n            return self._offline_batch_generate_polling(\n                inputs=inputs,\n                num_generations=num_generations,\n                **kwargs,\n            )\n\n        # This will raise `DistilabelOfflineBatchGenerationNotFinishedException` right away\n        # if the batch generation is not finished.\n        return self.offline_batch_generate(\n            inputs=inputs,\n            num_generations=num_generations,\n            **kwargs,\n        )\n\n    return self.generate(inputs=inputs, num_generations=num_generations, **kwargs)\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.get_runtime_parameters_info","title":"get_runtime_parameters_info()","text":"

Gets the information of the runtime parameters of the LLM such as the name and the description. This function is meant to include the information of the runtime parameters in the serialized data of the LLM.

Returns:

Type Description List[RuntimeParameterInfo]

A list containing the information for each runtime parameter of the LLM.

Source code in src/distilabel/models/llms/base.py
def get_runtime_parameters_info(self) -> List[\"RuntimeParameterInfo\"]:\n    \"\"\"Gets the information of the runtime parameters of the `LLM` such as the name\n    and the description. This function is meant to include the information of the runtime\n    parameters in the serialized data of the `LLM`.\n\n    Returns:\n        A list containing the information for each runtime parameter of the `LLM`.\n    \"\"\"\n    runtime_parameters_info = super().get_runtime_parameters_info()\n\n    generation_kwargs_info = next(\n        (\n            runtime_parameter_info\n            for runtime_parameter_info in runtime_parameters_info\n            if runtime_parameter_info[\"name\"] == \"generation_kwargs\"\n        ),\n        None,\n    )\n\n    # If `generation_kwargs` attribute is present, we need to include the `generate`\n    # method arguments as the information for this attribute.\n    if generation_kwargs_info:\n        generate_docstring_args = self.generate_parsed_docstring[\"args\"]\n\n        generation_kwargs_info[\"keys\"] = []\n        for key, value in generation_kwargs_info[\"optional\"].items():\n            info = {\"name\": key, \"optional\": value}\n            if description := generate_docstring_args.get(key):\n                info[\"description\"] = description\n            generation_kwargs_info[\"keys\"].append(info)\n\n        generation_kwargs_info.pop(\"optional\")\n\n    return runtime_parameters_info\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.get_last_hidden_states","title":"get_last_hidden_states(inputs)","text":"

Method to get the last hidden states of the model for a list of inputs.

Parameters:

Name Type Description Default inputs List[StandardInput]

the list of inputs to get the last hidden states from.

required

Returns:

Type Description List[HiddenState]

A list containing the last hidden state for each sequence using a NumPy array with shape [num_tokens, hidden_size].

Source code in src/distilabel/models/llms/base.py
def get_last_hidden_states(\n    self, inputs: List[\"StandardInput\"]\n) -> List[\"HiddenState\"]:\n    \"\"\"Method to get the last hidden states of the model for a list of inputs.\n\n    Args:\n        inputs: the list of inputs to get the last hidden states from.\n\n    Returns:\n        A list containing the last hidden state for each sequence using a NumPy array\n            with shape [num_tokens, hidden_size].\n    \"\"\"\n    # TODO: update to use `DistilabelNotImplementedError`\n    raise NotImplementedError(\n        f\"Method `get_last_hidden_states` is not implemented for `{self.__class__.__name__}`\"\n    )\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.offline_batch_generate","title":"offline_batch_generate(inputs=None, num_generations=1, **kwargs)","text":"

Method to generate a list of outputs for the given inputs using an offline batch generation method to be implemented by each LLM.

This method should create jobs the first time it is called and store the job ids, so the second and subsequent calls can retrieve the results of the batch generation. If subsequent calls are made before the batch generation is finished, then the method should raise a DistilabelOfflineBatchGenerationNotFinishedException. This exception will be handled automatically by the Pipeline, which will store all the required information for recovering the pipeline execution when the batch generation is finished.

Parameters:

Name Type Description Default inputs Union[List[FormattedInput], None]

the list of inputs to generate responses for.

None num_generations int

the number of generations to generate per input.

1 **kwargs Any

the additional kwargs to be used for the generation.

{}

Returns:

Type Description List[GenerateOutput]

A list containing the generations for each input.

Source code in src/distilabel/models/llms/base.py
def offline_batch_generate(\n    self,\n    inputs: Union[List[\"FormattedInput\"], None] = None,\n    num_generations: int = 1,\n    **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n    \"\"\"Method to generate a list of outputs for the given inputs using an offline batch\n    generation method to be implemented by each `LLM`.\n\n    This method should create jobs the first time is called and store the job ids, so\n    the second and subsequent calls can retrieve the results of the batch generation.\n    If subsequent calls are made before the batch generation is finished, then the method\n    should raise a `DistilabelOfflineBatchGenerationNotFinishedException`. This exception\n    will be handled automatically by the `Pipeline` which will store all the required\n    information for recovering the pipeline execution when the batch generation is finished.\n\n    Args:\n        inputs: the list of inputs to generate responses for.\n        num_generations: the number of generations to generate per input.\n        **kwargs: the additional kwargs to be used for the generation.\n\n    Returns:\n        A list containing the generations for each input.\n    \"\"\"\n    raise DistilabelNotImplementedError(\n        f\"`offline_batch_generate` is not implemented for `{self.__class__.__name__}`\",\n        page=\"sections/how_to_guides/advanced/offline-batch-generation/\",\n    )\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM","title":"AsyncLLM","text":"

Bases: LLM

Abstract class for asynchronous LLMs, so as to benefit from the async capabilities of each LLM implementation. This class is meant to be subclassed by each LLM, and the method agenerate needs to be implemented to provide the asynchronous generation of responses.

Attributes:

Name Type Description _event_loop AbstractEventLoop

the event loop to be used for the asynchronous generation of responses.

Source code in src/distilabel/models/llms/base.py
class AsyncLLM(LLM):\n    \"\"\"Abstract class for asynchronous LLMs, so as to benefit from the async capabilities\n    of each LLM implementation. This class is meant to be subclassed by each LLM, and the\n    method `agenerate` needs to be implemented to provide the asynchronous generation of\n    responses.\n\n    Attributes:\n        _event_loop: the event loop to be used for the asynchronous generation of responses.\n    \"\"\"\n\n    _num_generations_param_supported = True\n    _event_loop: \"asyncio.AbstractEventLoop\" = PrivateAttr(default=None)\n    _new_event_loop: bool = PrivateAttr(default=False)\n\n    @property\n    def generate_parameters(self) -> List[inspect.Parameter]:\n        \"\"\"Returns the parameters of the `agenerate` method.\n\n        Returns:\n            A list containing the parameters of the `agenerate` method.\n        \"\"\"\n        return list(inspect.signature(self.agenerate).parameters.values())\n\n    @cached_property\n    def generate_parsed_docstring(self) -> \"Docstring\":\n        \"\"\"Returns the parsed docstring of the `agenerate` method.\n\n        Returns:\n            The parsed docstring of the `agenerate` method.\n        \"\"\"\n        return parse_google_docstring(self.agenerate)\n\n    @property\n    def event_loop(self) -> \"asyncio.AbstractEventLoop\":\n        if self._event_loop is None:\n            try:\n                self._event_loop = asyncio.get_running_loop()\n                if self._event_loop.is_closed():\n                    self._event_loop = asyncio.new_event_loop()  # type: ignore\n                    self._new_event_loop = True\n            except RuntimeError:\n                self._event_loop = asyncio.new_event_loop()\n                self._new_event_loop = True\n        asyncio.set_event_loop(self._event_loop)\n        return self._event_loop\n\n    @abstractmethod\n    async def agenerate(\n        self, input: \"FormattedInput\", num_generations: int = 1, **kwargs: Any\n    ) -> 
List[Union[str, None]]:\n        \"\"\"Method to generate a `num_generations` responses for a given input asynchronously,\n        and executed concurrently in `generate` method.\n        \"\"\"\n        pass\n\n    async def _agenerate(\n        self, inputs: List[\"FormattedInput\"], num_generations: int = 1, **kwargs: Any\n    ) -> List[\"GenerateOutput\"]:\n        \"\"\"Internal function to concurrently generate responses for a list of inputs.\n\n        Args:\n            inputs: the list of inputs to generate responses for.\n            num_generations: the number of generations to generate per input.\n            **kwargs: the additional kwargs to be used for the generation.\n\n        Returns:\n            A list containing the generations for each input.\n        \"\"\"\n        if self._num_generations_param_supported:\n            tasks = [\n                asyncio.create_task(\n                    self.agenerate(\n                        input=input, num_generations=num_generations, **kwargs\n                    )\n                )\n                for input in inputs\n            ]\n            return await asyncio.gather(*tasks)\n\n        tasks = [\n            asyncio.create_task(self.agenerate(input=input, **kwargs))\n            for input in inputs\n            for _ in range(num_generations)\n        ]\n        outputs = [outputs[0] for outputs in await asyncio.gather(*tasks)]\n        return [\n            list(group)\n            for group in grouper(outputs, n=num_generations, incomplete=\"ignore\")\n        ]\n\n    def generate(\n        self,\n        inputs: List[\"FormattedInput\"],\n        num_generations: int = 1,\n        **kwargs: Any,\n    ) -> List[\"GenerateOutput\"]:\n        \"\"\"Method to generate a list of responses asynchronously, returning the output\n        synchronously awaiting for the response of each input sent to `agenerate`.\n\n        Args:\n            inputs: the list of inputs to generate responses for.\n      
      num_generations: the number of generations to generate per input.\n            **kwargs: the additional kwargs to be used for the generation.\n\n        Returns:\n            A list containing the generations for each input.\n        \"\"\"\n        return self.event_loop.run_until_complete(\n            self._agenerate(inputs=inputs, num_generations=num_generations, **kwargs)\n        )\n\n    def __del__(self) -> None:\n        \"\"\"Closes the event loop when the object is deleted.\"\"\"\n        if sys.meta_path is None:\n            return\n\n        if self._new_event_loop:\n            if self._event_loop.is_running():\n                self._event_loop.stop()\n            self._event_loop.close()\n\n    @staticmethod\n    def _prepare_structured_output(  # type: ignore\n        structured_output: \"InstructorStructuredOutputType\",\n        client: Any = None,\n        framework: Optional[str] = None,\n    ) -> Dict[str, Union[str, Any]]:\n        \"\"\"Wraps the client and updates the schema to work store it internally as a json schema.\n\n        Args:\n            structured_output: The configuration dict to prepare the structured output.\n            client: The client to wrap to generate structured output. 
Implemented to work\n                with `instructor`.\n            framework: The name of the framework.\n\n        Returns:\n            A dictionary containing the wrapped client and the schema to update the structured_output\n            variable in case it is a pydantic model.\n        \"\"\"\n        from distilabel.steps.tasks.structured_outputs.instructor import (\n            prepare_instructor,\n        )\n\n        result = {}\n        client = prepare_instructor(\n            client,\n            mode=structured_output.get(\"mode\"),\n            framework=framework,  # type: ignore\n        )\n        result[\"client\"] = client\n\n        schema = structured_output.get(\"schema\")\n        if not schema:\n            raise DistilabelUserError(\n                f\"The `structured_output` argument must contain a schema: {structured_output}\",\n                page=\"sections/how_to_guides/advanced/structured_generation/#instructor\",\n            )\n        if inspect.isclass(schema) and issubclass(schema, BaseModel):\n            # We want a json schema for the serialization, but instructor wants a pydantic BaseModel.\n            structured_output[\"schema\"] = schema.model_json_schema()  # type: ignore\n            result[\"structured_output\"] = structured_output\n\n        return result\n\n    @staticmethod\n    def _prepare_kwargs(\n        arguments: Dict[str, Any], structured_output: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"Helper method to update the kwargs with the structured output configuration,\n        used in case they are defined.\n\n        Args:\n            arguments: The arguments that would be passed to the LLM as **kwargs.\n                to update with the structured output configuration.\n            structured_outputs: The structured output configuration to update the arguments.\n\n        Returns:\n            kwargs updated with the special arguments used by `instructor`.\n        \"\"\"\n        # We can deal 
with json schema or BaseModel, but we need to convert it to a BaseModel\n        # for the Instructor client.\n        schema = structured_output.get(\"schema\", {})\n\n        # If there's already a pydantic model, we don't need to do anything,\n        # otherwise, try to obtain one.\n        if not (inspect.isclass(schema) and issubclass(schema, BaseModel)):\n            from distilabel.steps.tasks.structured_outputs.utils import (\n                json_schema_to_model,\n            )\n\n            if isinstance(schema, str):\n                # In case it was saved in the dataset as a string.\n                schema = json.loads(schema)\n\n            try:\n                schema = json_schema_to_model(schema)\n            except Exception as e:\n                raise ValueError(\n                    f\"Failed to convert the schema to a pydantic model, the model is too complex currently: {e}\"\n                ) from e\n\n        arguments.update(\n            **{\n                \"response_model\": schema,\n                \"max_retries\": structured_output.get(\"max_retries\", 1),\n            },\n        )\n        return arguments\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.generate_parameters","title":"generate_parameters: List[inspect.Parameter] property","text":"

Returns the parameters of the agenerate method.

Returns:

Type Description List[Parameter]

A list containing the parameters of the agenerate method.

"},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.generate_parsed_docstring","title":"generate_parsed_docstring: Docstring cached property","text":"

Returns the parsed docstring of the agenerate method.

Returns:

Type Description Docstring

The parsed docstring of the agenerate method.

"},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.agenerate","title":"agenerate(input, num_generations=1, **kwargs) abstractmethod async","text":"

Method to generate num_generations responses for a given input asynchronously. It is executed concurrently for each input by the generate method.

Source code in src/distilabel/models/llms/base.py
@abstractmethod\nasync def agenerate(\n    self, input: \"FormattedInput\", num_generations: int = 1, **kwargs: Any\n) -> List[Union[str, None]]:\n    \"\"\"Method to generate a `num_generations` responses for a given input asynchronously,\n    and executed concurrently in `generate` method.\n    \"\"\"\n    pass\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.generate","title":"generate(inputs, num_generations=1, **kwargs)","text":"

Method to generate a list of responses asynchronously, returning the output synchronously after awaiting the response of each input sent to agenerate.

Parameters:

Name Type Description Default inputs List[FormattedInput]

the list of inputs to generate responses for.

required num_generations int

the number of generations to generate per input.

1 **kwargs Any

the additional kwargs to be used for the generation.

{}

Returns:

Type Description List[GenerateOutput]

A list containing the generations for each input.

Source code in src/distilabel/models/llms/base.py
def generate(\n    self,\n    inputs: List[\"FormattedInput\"],\n    num_generations: int = 1,\n    **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n    \"\"\"Method to generate a list of responses asynchronously, returning the output\n    synchronously awaiting for the response of each input sent to `agenerate`.\n\n    Args:\n        inputs: the list of inputs to generate responses for.\n        num_generations: the number of generations to generate per input.\n        **kwargs: the additional kwargs to be used for the generation.\n\n    Returns:\n        A list containing the generations for each input.\n    \"\"\"\n    return self.event_loop.run_until_complete(\n        self._agenerate(inputs=inputs, num_generations=num_generations, **kwargs)\n    )\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.__del__","title":"__del__()","text":"

Closes the event loop when the object is deleted.

Source code in src/distilabel/models/llms/base.py
def __del__(self) -> None:\n    \"\"\"Closes the event loop when the object is deleted.\"\"\"\n    if sys.meta_path is None:\n        return\n\n    if self._new_event_loop:\n        if self._event_loop.is_running():\n            self._event_loop.stop()\n        self._event_loop.close()\n
"},{"location":"api/models/llm/llm_gallery/","title":"LLM Gallery","text":"

This section contains the existing LLM subclasses implemented in distilabel.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms","title":"llms","text":""},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM","title":"AnthropicLLM","text":"

Bases: AsyncLLM

Anthropic LLM implementation running the Async API client.

Attributes:

Name Type Description model str

the name of the model to use for the LLM e.g. \"claude-3-opus-20240229\", \"claude-3-sonnet-20240229\", etc. Available models can be checked here: Anthropic: Models overview.

api_key Optional[RuntimeParameter[SecretStr]]

the API key to authenticate the requests to the Anthropic API. If not provided, it will be read from ANTHROPIC_API_KEY environment variable.

base_url Optional[RuntimeParameter[str]]

the base URL to use for the Anthropic API. Defaults to the value of the ANTHROPIC_BASE_URL environment variable or, if it is not set, to https://api.anthropic.com.

timeout RuntimeParameter[float]

the maximum time in seconds to wait for a response. Defaults to 600.0.

max_retries RuntimeParameter[int]

The maximum number of times to retry the request before failing. Defaults to 6.

http_client Optional[AsyncClient]

if provided, an alternative HTTP client to use for calling Anthropic API. Defaults to None.

structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]]

a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

_api_key_env_var str

the name of the environment variable to use for the API key. It is meant to be used internally.

_aclient Optional[AsyncAnthropic]

the AsyncAnthropic client to use for the Anthropic API. It is meant to be used internally. Set in the load method.

Runtime parameters
  • api_key: the API key to authenticate the requests to the Anthropic API. If not provided, it will be read from ANTHROPIC_API_KEY environment variable.
  • base_url: the base URL to use for the Anthropic API. Defaults to \"https://api.anthropic.com\".
  • timeout: the maximum time in seconds to wait for a response. Defaults to 600.0.
  • max_retries: the maximum number of times to retry the request before failing. Defaults to 6.

Examples:

Generate text:

from distilabel.models.llms import AnthropicLLM\n\nllm = AnthropicLLM(model=\"claude-3-opus-20240229\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pydantic import BaseModel\nfrom distilabel.models.llms import AnthropicLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = AnthropicLLM(\n    model=\"claude-3-opus-20240229\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
Source code in src/distilabel/models/llms/anthropic.py
class AnthropicLLM(AsyncLLM):\n    \"\"\"Anthropic LLM implementation running the Async API client.\n\n    Attributes:\n        model: the name of the model to use for the LLM e.g. \"claude-3-opus-20240229\",\n            \"claude-3-sonnet-20240229\", etc. Available models can be checked here:\n            [Anthropic: Models overview](https://docs.anthropic.com/claude/docs/models-overview).\n        api_key: the API key to authenticate the requests to the Anthropic API. If not provided,\n            it will be read from `ANTHROPIC_API_KEY` environment variable.\n        base_url: the base URL to use for the Anthropic API. Defaults to `None` which means\n            that `https://api.anthropic.com` will be used internally.\n        timeout: the maximum time in seconds to wait for a response. Defaults to `600.0`.\n        max_retries: The maximum number of times to retry the request before failing. Defaults\n            to `6`.\n        http_client: if provided, an alternative HTTP client to use for calling Anthropic\n            API. Defaults to `None`.\n        structured_output: a dictionary containing the structured output configuration configuration\n            using `instructor`. You can take a look at the dictionary structure in\n            `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n        _api_key_env_var: the name of the environment variable to use for the API key. It\n            is meant to be used internally.\n        _aclient: the `AsyncAnthropic` client to use for the Anthropic API. It is meant\n            to be used internally. Set in the `load` method.\n\n    Runtime parameters:\n        - `api_key`: the API key to authenticate the requests to the Anthropic API. If not\n            provided, it will be read from `ANTHROPIC_API_KEY` environment variable.\n        - `base_url`: the base URL to use for the Anthropic API. 
Defaults to `\"https://api.anthropic.com\"`.\n        - `timeout`: the maximum time in seconds to wait for a response. Defaults to `600.0`.\n        - `max_retries`: the maximum number of times to retry the request before failing.\n            Defaults to `6`.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import AnthropicLLM\n\n        llm = AnthropicLLM(model=\"claude-3-opus-20240229\", api_key=\"api.key\")\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Generate structured data:\n\n        ```python\n        from pydantic import BaseModel\n        from distilabel.models.llms import AnthropicLLM\n\n        class User(BaseModel):\n            name: str\n            last_name: str\n            id: int\n\n        llm = AnthropicLLM(\n            model=\"claude-3-opus-20240229\",\n            api_key=\"api.key\",\n            structured_output={\"schema\": User}\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n        ```\n    \"\"\"\n\n    model: str\n    base_url: Optional[RuntimeParameter[str]] = Field(\n        default_factory=lambda: os.getenv(\n            \"ANTHROPIC_BASE_URL\", \"https://api.anthropic.com\"\n        ),\n        description=\"The base URL to use for the Anthropic API.\",\n    )\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(_ANTHROPIC_API_KEY_ENV_VAR_NAME),\n        description=\"The API key to authenticate the requests to the Anthropic API.\",\n    )\n    timeout: RuntimeParameter[float] = Field(\n        default=600.0,\n        description=\"The maximum time in seconds to wait for a response from the API.\",\n    )\n    max_retries: RuntimeParameter[int] = Field(\n        default=6,\n        description=\"The 
maximum number of times to retry the request to the API before\"\n        \" failing.\",\n    )\n    http_client: Optional[AsyncClient] = Field(default=None, exclude=True)\n    structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n        Field(\n            default=None,\n            description=\"The structured output format to use across all the generations.\",\n        )\n    )\n\n    _num_generations_param_supported = False\n\n    _api_key_env_var: str = PrivateAttr(default=_ANTHROPIC_API_KEY_ENV_VAR_NAME)\n    _aclient: Optional[\"AsyncAnthropic\"] = PrivateAttr(...)\n\n    def _check_model_exists(self) -> None:\n        \"\"\"Checks if the specified model exists in the available models.\"\"\"\n        from anthropic import AsyncAnthropic\n\n        annotation = get_type_hints(AsyncAnthropic().messages.create).get(\"model\", None)\n        models = [\n            value\n            for type_ in get_args(annotation)\n            if get_origin(type_) is Literal\n            for value in get_args(type_)\n        ]\n\n        if self.model not in models:\n            raise ValueError(\n                f\"Model {self.model} does not exist among available models. \"\n                f\"The available models are {', '.join(models)}\"\n            )\n\n    def load(self) -> None:\n        \"\"\"Loads the `AsyncAnthropic` client to use the Anthropic async API.\"\"\"\n        super().load()\n\n        try:\n            from anthropic import AsyncAnthropic\n        except ImportError as ie:\n            raise ImportError(\n                \"Anthropic Python client is not installed. 
Please install it using\"\n                \" `pip install anthropic`.\"\n            ) from ie\n\n        if self.api_key is None:\n            raise ValueError(\n                f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n                f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n            )\n\n        self._check_model_exists()\n\n        self._aclient = AsyncAnthropic(\n            api_key=self.api_key.get_secret_value(),\n            base_url=self.base_url,\n            timeout=self.timeout,\n            http_client=self.http_client,\n            max_retries=self.max_retries,\n        )\n        if self.structured_output:\n            result = self._prepare_structured_output(\n                structured_output=self.structured_output,\n                client=self._aclient,\n                framework=\"anthropic\",\n            )\n            self._aclient = result.get(\"client\")\n            if structured_output := result.get(\"structured_output\"):\n                self.structured_output = structured_output\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    @validate_call\n    async def agenerate(  # type: ignore\n        self,\n        input: FormattedInput,\n        max_tokens: int = 128,\n        stop_sequences: Union[List[str], None] = None,\n        temperature: float = 1.0,\n        top_p: Union[float, None] = None,\n        top_k: Union[int, None] = None,\n    ) -> GenerateOutput:\n        \"\"\"Generates a response asynchronously, using the [Anthropic Async API definition](https://github.com/anthropics/anthropic-sdk-python).\n\n        Args:\n            input: a single input in chat format to generate responses for.\n            max_tokens: the maximum number of new tokens that the model will generate. 
Defaults to `128`.\n            stop_sequences: custom text sequences that will cause the model to stop generating. Defaults to `NOT_GIVEN`.\n            temperature: the temperature to use for the generation. Set only if top_p is None. Defaults to `1.0`.\n            top_p: the top-p value to use for the generation. Defaults to `NOT_GIVEN`.\n            top_k: the top-k value to use for the generation. Defaults to `NOT_GIVEN`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n        from anthropic._types import NOT_GIVEN\n\n        structured_output = None\n        if isinstance(input, tuple):\n            input, structured_output = input\n            result = self._prepare_structured_output(\n                structured_output=structured_output,\n                client=self._aclient,\n                framework=\"anthropic\",\n            )\n            self._aclient = result.get(\"client\")\n\n        if structured_output is None and self.structured_output is not None:\n            structured_output = self.structured_output\n\n        kwargs = {\n            \"messages\": input,  # type: ignore\n            \"model\": self.model,\n            \"system\": (\n                input.pop(0)[\"content\"]\n                if input and input[0][\"role\"] == \"system\"\n                else NOT_GIVEN\n            ),\n            \"max_tokens\": max_tokens,\n            \"stream\": False,\n            \"stop_sequences\": NOT_GIVEN if stop_sequences is None else stop_sequences,\n            \"temperature\": temperature,\n            \"top_p\": NOT_GIVEN if top_p is None else top_p,\n            \"top_k\": NOT_GIVEN if top_k is None else top_k,\n        }\n\n        if structured_output:\n            kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n        generations = []\n\n        completion = await self._aclient.messages.create(**kwargs)  # type: ignore\n        if 
structured_output:\n            generations.append(completion.model_dump_json())\n            return generations\n\n        if (content := completion.content[0].text) is None:\n            self._logger.warning(\n                f\"Received no response using Anthropic client (model: '{self.model}').\"\n                f\" Finish reason was: {completion.stop_reason}\"\n            )\n        generations.append(content)\n        return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM._check_model_exists","title":"_check_model_exists()","text":"

Checks if the specified model exists in the available models.

Source code in src/distilabel/models/llms/anthropic.py
def _check_model_exists(self) -> None:\n    \"\"\"Checks if the specified model exists in the available models.\"\"\"\n    from anthropic import AsyncAnthropic\n\n    annotation = get_type_hints(AsyncAnthropic().messages.create).get(\"model\", None)\n    models = [\n        value\n        for type_ in get_args(annotation)\n        if get_origin(type_) is Literal\n        for value in get_args(type_)\n    ]\n\n    if self.model not in models:\n        raise ValueError(\n            f\"Model {self.model} does not exist among available models. \"\n            f\"The available models are {', '.join(models)}\"\n        )\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM.load","title":"load()","text":"

Loads the AsyncAnthropic client to use the Anthropic async API.

Source code in src/distilabel/models/llms/anthropic.py
def load(self) -> None:\n    \"\"\"Loads the `AsyncAnthropic` client to use the Anthropic async API.\"\"\"\n    super().load()\n\n    try:\n        from anthropic import AsyncAnthropic\n    except ImportError as ie:\n        raise ImportError(\n            \"Anthropic Python client is not installed. Please install it using\"\n            \" `pip install anthropic`.\"\n        ) from ie\n\n    if self.api_key is None:\n        raise ValueError(\n            f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n            f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n        )\n\n    self._check_model_exists()\n\n    self._aclient = AsyncAnthropic(\n        api_key=self.api_key.get_secret_value(),\n        base_url=self.base_url,\n        timeout=self.timeout,\n        http_client=self.http_client,\n        max_retries=self.max_retries,\n    )\n    if self.structured_output:\n        result = self._prepare_structured_output(\n            structured_output=self.structured_output,\n            client=self._aclient,\n            framework=\"anthropic\",\n        )\n        self._aclient = result.get(\"client\")\n        if structured_output := result.get(\"structured_output\"):\n            self.structured_output = structured_output\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM.agenerate","title":"agenerate(input, max_tokens=128, stop_sequences=None, temperature=1.0, top_p=None, top_k=None) async","text":"

Generates a response asynchronously, using the Anthropic Async API definition.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate responses for.

required max_tokens int

the maximum number of new tokens that the model will generate. Defaults to 128.

128 stop_sequences Union[List[str], None]

custom text sequences that will cause the model to stop generating. Defaults to None, which is passed to the Anthropic client as NOT_GIVEN.

None temperature float

the temperature to use for the generation. Set only if top_p is None. Defaults to 1.0.

1.0 top_p Union[float, None]

the top-p value to use for the generation. Defaults to None, which is passed to the Anthropic client as NOT_GIVEN.

None top_k Union[int, None]

the top-k value to use for the generation. Defaults to None, which is passed to the Anthropic client as NOT_GIVEN.

None

Returns:

Type Description GenerateOutput

A list containing the generated response for the given input.

Source code in src/distilabel/models/llms/anthropic.py
@validate_call\nasync def agenerate(  # type: ignore\n    self,\n    input: FormattedInput,\n    max_tokens: int = 128,\n    stop_sequences: Union[List[str], None] = None,\n    temperature: float = 1.0,\n    top_p: Union[float, None] = None,\n    top_k: Union[int, None] = None,\n) -> GenerateOutput:\n    \"\"\"Generates a response asynchronously, using the [Anthropic Async API definition](https://github.com/anthropics/anthropic-sdk-python).\n\n    Args:\n        input: a single input in chat format to generate responses for.\n        max_tokens: the maximum number of new tokens that the model will generate. Defaults to `128`.\n        stop_sequences: custom text sequences that will cause the model to stop generating. Defaults to `NOT_GIVEN`.\n        temperature: the temperature to use for the generation. Set only if top_p is None. Defaults to `1.0`.\n        top_p: the top-p value to use for the generation. Defaults to `NOT_GIVEN`.\n        top_k: the top-k value to use for the generation. 
Defaults to `NOT_GIVEN`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n    from anthropic._types import NOT_GIVEN\n\n    structured_output = None\n    if isinstance(input, tuple):\n        input, structured_output = input\n        result = self._prepare_structured_output(\n            structured_output=structured_output,\n            client=self._aclient,\n            framework=\"anthropic\",\n        )\n        self._aclient = result.get(\"client\")\n\n    if structured_output is None and self.structured_output is not None:\n        structured_output = self.structured_output\n\n    kwargs = {\n        \"messages\": input,  # type: ignore\n        \"model\": self.model,\n        \"system\": (\n            input.pop(0)[\"content\"]\n            if input and input[0][\"role\"] == \"system\"\n            else NOT_GIVEN\n        ),\n        \"max_tokens\": max_tokens,\n        \"stream\": False,\n        \"stop_sequences\": NOT_GIVEN if stop_sequences is None else stop_sequences,\n        \"temperature\": temperature,\n        \"top_p\": NOT_GIVEN if top_p is None else top_p,\n        \"top_k\": NOT_GIVEN if top_k is None else top_k,\n    }\n\n    if structured_output:\n        kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n    generations = []\n\n    completion = await self._aclient.messages.create(**kwargs)  # type: ignore\n    if structured_output:\n        generations.append(completion.model_dump_json())\n        return generations\n\n    if (content := completion.content[0].text) is None:\n        self._logger.warning(\n            f\"Received no response using Anthropic client (model: '{self.model}').\"\n            f\" Finish reason was: {completion.stop_reason}\"\n        )\n    generations.append(content)\n    return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnyscaleLLM","title":"AnyscaleLLM","text":"

Bases: OpenAILLM

Anyscale LLM implementation running the async API client of OpenAI.

Attributes:

Name Type Description model

the model name to use for the LLM, e.g., google/gemma-7b-it. See the supported models under the \"Text Generation -> Supported Models\" section here.

base_url Optional[RuntimeParameter[str]]

the base URL to use for the Anyscale API requests. Defaults to None, which means that the value set for the environment variable ANYSCALE_BASE_URL will be used, or \"https://api.endpoints.anyscale.com/v1\" if not set.

api_key Optional[RuntimeParameter[SecretStr]]

the API key to authenticate the requests to the Anyscale API. Defaults to None which means that the value set for the environment variable ANYSCALE_API_KEY will be used, or None if not set.

_api_key_env_var str

the name of the environment variable to use for the API key. It is meant to be used internally.

Examples:

Generate text:

from distilabel.models.llms import AnyscaleLLM\n\nllm = AnyscaleLLM(model=\"google/gemma-7b-it\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
Source code in src/distilabel/models/llms/anyscale.py
class AnyscaleLLM(OpenAILLM):\n    \"\"\"Anyscale LLM implementation running the async API client of OpenAI.\n\n    Attributes:\n        model: the model name to use for the LLM, e.g., `google/gemma-7b-it`. See the\n            supported models under the \"Text Generation -> Supported Models\" section\n            [here](https://docs.endpoints.anyscale.com/).\n        base_url: the base URL to use for the Anyscale API requests. Defaults to `None`, which\n            means that the value set for the environment variable `ANYSCALE_BASE_URL` will be used, or\n            \"https://api.endpoints.anyscale.com/v1\" if not set.\n        api_key: the API key to authenticate the requests to the Anyscale API. Defaults to `None` which\n            means that the value set for the environment variable `ANYSCALE_API_KEY` will be used, or\n            `None` if not set.\n        _api_key_env_var: the name of the environment variable to use for the API key.\n            It is meant to be used internally.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import AnyscaleLLM\n\n        llm = AnyscaleLLM(model=\"google/gemma-7b-it\", api_key=\"api.key\")\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n    \"\"\"\n\n    base_url: Optional[RuntimeParameter[str]] = Field(\n        default_factory=lambda: os.getenv(\n            \"ANYSCALE_BASE_URL\", \"https://api.endpoints.anyscale.com/v1\"\n        ),\n        description=\"The base URL to use for the Anyscale API requests.\",\n    )\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(_ANYSCALE_API_KEY_ENV_VAR_NAME),\n        description=\"The API key to authenticate the requests to the Anyscale API.\",\n    )\n\n    _api_key_env_var: str = PrivateAttr(_ANYSCALE_API_KEY_ENV_VAR_NAME)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AzureOpenAILLM","title":"AzureOpenAILLM","text":"

Bases: OpenAILLM

Azure OpenAI LLM implementation running the async API client.

Attributes:

Name Type Description model

the model name to use for the LLM i.e. the name of the Azure deployment.

base_url Optional[RuntimeParameter[str]]

the base URL to use for the Azure OpenAI API; it can be set with AZURE_OPENAI_ENDPOINT. Defaults to None which means that the value set for the environment variable AZURE_OPENAI_ENDPOINT will be used, or None if not set.

api_key Optional[RuntimeParameter[SecretStr]]

the API key to authenticate the requests to the Azure OpenAI API. Defaults to None which means that the value set for the environment variable AZURE_OPENAI_API_KEY will be used, or None if not set.

api_version Optional[RuntimeParameter[str]]

the API version to use for the Azure OpenAI API. Defaults to None which means that the value set for the environment variable OPENAI_API_VERSION will be used, or None if not set.

Icon

:material-microsoft-azure:

Examples:

Generate text:

from distilabel.models.llms import AzureOpenAILLM\n\nllm = AzureOpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate text from a custom endpoint following the OpenAI API:

from distilabel.models.llms import AzureOpenAILLM\n\nllm = AzureOpenAILLM(\n    model=\"prometheus-eval/prometheus-7b-v2.0\",\n    base_url=r\"http://localhost:8080/v1\"\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pydantic import BaseModel\nfrom distilabel.models.llms import AzureOpenAILLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = AzureOpenAILLM(\n    model=\"gpt-4-turbo\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
Source code in src/distilabel/models/llms/azure.py
class AzureOpenAILLM(OpenAILLM):\n    \"\"\"Azure OpenAI LLM implementation running the async API client.\n\n    Attributes:\n        model: the model name to use for the LLM i.e. the name of the Azure deployment.\n        base_url: the base URL to use for the Azure OpenAI API can be set with `AZURE_OPENAI_ENDPOINT`.\n            Defaults to `None` which means that the value set for the environment variable\n            `AZURE_OPENAI_ENDPOINT` will be used, or `None` if not set.\n        api_key: the API key to authenticate the requests to the Azure OpenAI API. Defaults to `None`\n            which means that the value set for the environment variable `AZURE_OPENAI_API_KEY` will be\n            used, or `None` if not set.\n        api_version: the API version to use for the Azure OpenAI API. Defaults to `None` which means\n            that the value set for the environment variable `OPENAI_API_VERSION` will be used, or\n            `None` if not set.\n\n    Icon:\n        `:material-microsoft-azure:`\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import AzureOpenAILLM\n\n        llm = AzureOpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Generate text from a custom endpoint following the OpenAI API:\n\n        ```python\n        from distilabel.models.llms import AzureOpenAILLM\n\n        llm = AzureOpenAILLM(\n            model=\"prometheus-eval/prometheus-7b-v2.0\",\n            base_url=r\"http://localhost:8080/v1\"\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Generate structured data:\n\n        ```python\n        from pydantic import BaseModel\n        from distilabel.models.llms import AzureOpenAILLM\n\n        class User(BaseModel):\n      
      name: str\n            last_name: str\n            id: int\n\n        llm = AzureOpenAILLM(\n            model=\"gpt-4-turbo\",\n            api_key=\"api.key\",\n            structured_output={\"schema\": User}\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n        ```\n    \"\"\"\n\n    base_url: Optional[RuntimeParameter[str]] = Field(\n        default_factory=lambda: os.getenv(_AZURE_OPENAI_ENDPOINT_ENV_VAR_NAME),\n        description=\"The base URL to use for the Azure OpenAI API requests i.e. the Azure OpenAI endpoint.\",\n    )\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(_AZURE_OPENAI_API_KEY_ENV_VAR_NAME),\n        description=\"The API key to authenticate the requests to the Azure OpenAI API.\",\n    )\n\n    api_version: Optional[RuntimeParameter[str]] = Field(\n        default_factory=lambda: os.getenv(\"OPENAI_API_VERSION\"),\n        description=\"The API version to use for the Azure OpenAI API.\",\n    )\n\n    _base_url_env_var: str = PrivateAttr(_AZURE_OPENAI_ENDPOINT_ENV_VAR_NAME)\n    _api_key_env_var: str = PrivateAttr(_AZURE_OPENAI_API_KEY_ENV_VAR_NAME)\n    _aclient: Optional[\"AsyncAzureOpenAI\"] = PrivateAttr(...)  
# type: ignore\n\n    @override\n    def load(self) -> None:\n        \"\"\"Loads the `AsyncAzureOpenAI` client to benefit from async requests.\"\"\"\n        # This is a workaround to avoid the `OpenAILLM` calling the _prepare_structured_output\n        # in the load method before we have the proper client.\n        with patch(\n            \"distilabel.models.openai.OpenAILLM._prepare_structured_output\", lambda x: x\n        ):\n            super().load()\n\n        try:\n            from openai import AsyncAzureOpenAI\n        except ImportError as ie:\n            raise ImportError(\n                \"OpenAI Python client is not installed. Please install it using\"\n                \" `pip install openai`.\"\n            ) from ie\n\n        if self.api_key is None:\n            raise ValueError(\n                f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n                f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n            )\n\n        # TODO: May be worth adding the AD auth too? Also the `organization`?\n        self._aclient = AsyncAzureOpenAI(  # type: ignore\n            azure_endpoint=self.base_url,  # type: ignore\n            azure_deployment=self.model,\n            api_version=self.api_version,\n            api_key=self.api_key.get_secret_value(),\n            max_retries=self.max_retries,  # type: ignore\n            timeout=self.timeout,\n        )\n\n        if self.structured_output:\n            self._prepare_structured_output(self.structured_output)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AzureOpenAILLM.load","title":"load()","text":"

Loads the AsyncAzureOpenAI client to benefit from async requests.

Source code in src/distilabel/models/llms/azure.py
@override\ndef load(self) -> None:\n    \"\"\"Loads the `AsyncAzureOpenAI` client to benefit from async requests.\"\"\"\n    # This is a workaround to avoid the `OpenAILLM` calling the _prepare_structured_output\n    # in the load method before we have the proper client.\n    with patch(\n        \"distilabel.models.openai.OpenAILLM._prepare_structured_output\", lambda x: x\n    ):\n        super().load()\n\n    try:\n        from openai import AsyncAzureOpenAI\n    except ImportError as ie:\n        raise ImportError(\n            \"OpenAI Python client is not installed. Please install it using\"\n            \" `pip install openai`.\"\n        ) from ie\n\n    if self.api_key is None:\n        raise ValueError(\n            f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n            f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n        )\n\n    # TODO: May be worth adding the AD auth too? Also the `organization`?\n    self._aclient = AsyncAzureOpenAI(  # type: ignore\n        azure_endpoint=self.base_url,  # type: ignore\n        azure_deployment=self.model,\n        api_version=self.api_version,\n        api_key=self.api_key.get_secret_value(),\n        max_retries=self.max_retries,  # type: ignore\n        timeout=self.timeout,\n    )\n\n    if self.structured_output:\n        self._prepare_structured_output(self.structured_output)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM","title":"CohereLLM","text":"

Bases: AsyncLLM

Cohere API implementation using the async client for concurrent text generation.

Attributes:

Name Type Description model str

the name of the model from the Cohere API to use for the generation.

base_url Optional[RuntimeParameter[str]]

the base URL to use for the Cohere API requests. Defaults to \"https://api.cohere.ai/v1\".

api_key Optional[RuntimeParameter[SecretStr]]

the API key to authenticate the requests to the Cohere API. Defaults to the value of the COHERE_API_KEY environment variable.

timeout RuntimeParameter[int]

the maximum time in seconds to wait for a response from the API. Defaults to 120.

client_name RuntimeParameter[str]

the name of the client to use for the API requests. Defaults to \"distilabel\".

structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]]

a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

_ChatMessage Type[ChatMessage]

the ChatMessage class from the cohere package.

_aclient AsyncClient

the AsyncClient client from the cohere package.

Runtime parameters
  • base_url: the base URL to use for the Cohere API requests. Defaults to \"https://api.cohere.ai/v1\".
  • api_key: the API key to authenticate the requests to the Cohere API. Defaults to the value of the COHERE_API_KEY environment variable.
  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.
  • client_name: the name of the client to use for the API requests. Defaults to \"distilabel\".

Examples:

Generate text:

from distilabel.models.llms import CohereLLM\n\nllm = CohereLLM(model=\"CohereForAI/c4ai-command-r-plus\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pydantic import BaseModel\nfrom distilabel.models.llms import CohereLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = CohereLLM(\n    model=\"CohereForAI/c4ai-command-r-plus\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
Source code in src/distilabel/models/llms/cohere.py
class CohereLLM(AsyncLLM):\n    \"\"\"Cohere API implementation using the async client for concurrent text generation.\n\n    Attributes:\n        model: the name of the model from the Cohere API to use for the generation.\n        base_url: the base URL to use for the Cohere API requests. Defaults to\n            `\"https://api.cohere.ai/v1\"`.\n        api_key: the API key to authenticate the requests to the Cohere API. Defaults to\n            the value of the `COHERE_API_KEY` environment variable.\n        timeout: the maximum time in seconds to wait for a response from the API. Defaults\n            to `120`.\n        client_name: the name of the client to use for the API requests. Defaults to\n            `\"distilabel\"`.\n        structured_output: a dictionary containing the structured output configuration configuration\n            using `instructor`. You can take a look at the dictionary structure in\n            `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n        _ChatMessage: the `ChatMessage` class from the `cohere` package.\n        _aclient: the `AsyncClient` client from the `cohere` package.\n\n    Runtime parameters:\n        - `base_url`: the base URL to use for the Cohere API requests. Defaults to\n            `\"https://api.cohere.ai/v1\"`.\n        - `api_key`: the API key to authenticate the requests to the Cohere API. Defaults\n            to the value of the `COHERE_API_KEY` environment variable.\n        - `timeout`: the maximum time in seconds to wait for a response from the API. Defaults\n            to `120`.\n        - `client_name`: the name of the client to use for the API requests. 
Defaults to\n            `\"distilabel\"`.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import CohereLLM\n\n        llm = CohereLLM(model=\"CohereForAI/c4ai-command-r-plus\")\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\n        Generate structured data:\n\n        ```python\n        from pydantic import BaseModel\n        from distilabel.models.llms import CohereLLM\n\n        class User(BaseModel):\n            name: str\n            last_name: str\n            id: int\n\n        llm = CohereLLM(\n            model=\"CohereForAI/c4ai-command-r-plus\",\n            api_key=\"api.key\",\n            structured_output={\"schema\": User}\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n        ```\n    \"\"\"\n\n    model: str\n    base_url: Optional[RuntimeParameter[str]] = Field(\n        default_factory=lambda: os.getenv(\n            \"COHERE_BASE_URL\", \"https://api.cohere.ai/v1\"\n        ),\n        description=\"The base URL to use for the Cohere API requests.\",\n    )\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(_COHERE_API_KEY_ENV_VAR_NAME),\n        description=\"The API key to authenticate the requests to the Cohere API.\",\n    )\n    timeout: RuntimeParameter[int] = Field(\n        default=120,\n        description=\"The maximum time in seconds to wait for a response from the API.\",\n    )\n    client_name: RuntimeParameter[str] = Field(\n        default=\"distilabel\",\n        description=\"The name of the client to use for the API requests.\",\n    )\n    structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n        Field(\n            default=None,\n            
description=\"The structured output format to use across all the generations.\",\n        )\n    )\n\n    _num_generations_param_supported = False\n\n    _ChatMessage: Type[\"ChatMessage\"] = PrivateAttr(...)\n    _aclient: \"AsyncClient\" = PrivateAttr(...)\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    def load(self) -> None:\n        \"\"\"Loads the `AsyncClient` client from the `cohere` package.\"\"\"\n\n        super().load()\n\n        try:\n            from cohere import AsyncClient, ChatMessage\n        except ImportError as ie:\n            raise ImportError(\n                \"The `cohere` package is required to use the `CohereLLM` class.\"\n            ) from ie\n\n        self._ChatMessage = ChatMessage\n\n        self._aclient = AsyncClient(\n            api_key=self.api_key.get_secret_value(),  # type: ignore\n            client_name=self.client_name,\n            base_url=self.base_url,\n            timeout=self.timeout,\n        )\n\n        if self.structured_output:\n            result = self._prepare_structured_output(\n                structured_output=self.structured_output,\n                client=self._aclient,\n                framework=\"cohere\",\n            )\n            self._aclient = result.get(\"client\")  # type: ignore\n            if structured_output := result.get(\"structured_output\"):\n                self.structured_output = structured_output\n\n    def _format_chat_to_cohere(\n        self, input: \"FormattedInput\"\n    ) -> Tuple[Union[str, None], List[\"ChatMessage\"], str]:\n        \"\"\"Formats the chat input to the Cohere Chat API conversational format.\n\n        Args:\n            input: The chat input to format.\n\n        Returns:\n            A tuple containing the system, chat history, and message.\n        \"\"\"\n        system = None\n        message = None\n        chat_history = []\n        for item in 
input:\n            role = item[\"role\"]\n            content = item[\"content\"]\n            if role == \"system\":\n                system = content\n            elif role == \"user\":\n                message = content\n            elif role == \"assistant\":\n                if message is None:\n                    raise ValueError(\n                        \"An assistant message but be preceded by a user message.\"\n                    )\n                chat_history.append(self._ChatMessage(role=\"USER\", message=message))  # type: ignore\n                chat_history.append(self._ChatMessage(role=\"CHATBOT\", message=content))  # type: ignore\n                message = None\n\n        if message is None:\n            raise ValueError(\"The chat input must end with a user message.\")\n\n        return system, chat_history, message\n\n    @validate_call\n    async def agenerate(  # type: ignore\n        self,\n        input: FormattedInput,\n        temperature: Optional[float] = None,\n        max_tokens: Optional[int] = None,\n        k: Optional[int] = None,\n        p: Optional[float] = None,\n        seed: Optional[float] = None,\n        stop_sequences: Optional[Sequence[str]] = None,\n        frequency_penalty: Optional[float] = None,\n        presence_penalty: Optional[float] = None,\n        raw_prompting: Optional[bool] = None,\n    ) -> GenerateOutput:\n        \"\"\"Generates a response from the LLM given an input.\n\n        Args:\n            input: a single input in chat format to generate responses for.\n            temperature: the temperature to use for the generation. Defaults to `None`.\n            max_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `None`.\n            k: the number of highest probability vocabulary tokens to keep for the generation.\n                Defaults to `None`.\n            p: the nucleus sampling probability to use for the generation. 
Defaults to\n                `None`.\n            seed: the seed to use for the generation. Defaults to `None`.\n            stop_sequences: a list of sequences to use as stopping criteria for the generation.\n                Defaults to `None`.\n            frequency_penalty: the frequency penalty to use for the generation. Defaults\n                to `None`.\n            presence_penalty: the presence penalty to use for the generation. Defaults to\n                `None`.\n            raw_prompting: a flag to use raw prompting for the generation. Defaults to\n                `None`.\n\n        Returns:\n            The generated response from the Cohere API model.\n        \"\"\"\n        structured_output = None\n        if isinstance(input, tuple):\n            input, structured_output = input\n            result = self._prepare_structured_output(\n                structured_output=structured_output,  # type: ignore\n                client=self._aclient,\n                framework=\"cohere\",\n            )\n            self._aclient = result.get(\"client\")  # type: ignore\n\n        if structured_output is None and self.structured_output is not None:\n            structured_output = self.structured_output\n\n        system, chat_history, message = self._format_chat_to_cohere(input)\n\n        kwargs = {\n            \"message\": message,\n            \"model\": self.model,\n            \"preamble\": system,\n            \"chat_history\": chat_history,\n            \"temperature\": temperature,\n            \"max_tokens\": max_tokens,\n            \"k\": k,\n            \"p\": p,\n            \"seed\": seed,\n            \"stop_sequences\": stop_sequences,\n            \"frequency_penalty\": frequency_penalty,\n            \"presence_penalty\": presence_penalty,\n            \"raw_prompting\": raw_prompting,\n        }\n        if structured_output:\n            kwargs = self._prepare_kwargs(kwargs, structured_output)  # type: ignore\n\n        response = 
await self._aclient.chat(**kwargs)  # type: ignore\n\n        if structured_output:\n            return [response.model_dump_json()]\n\n        if (text := response.text) == \"\":\n            self._logger.warning(  # type: ignore\n                f\"Received no response using Cohere client (model: '{self.model}').\"\n                f\" Finish reason was: {response.finish_reason}\"\n            )\n            return [None]\n\n        return [text]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM.load","title":"load()","text":"

Loads the AsyncClient client from the cohere package.

Source code in src/distilabel/models/llms/cohere.py
def load(self) -> None:\n    \"\"\"Loads the `AsyncClient` client from the `cohere` package.\"\"\"\n\n    super().load()\n\n    try:\n        from cohere import AsyncClient, ChatMessage\n    except ImportError as ie:\n        raise ImportError(\n            \"The `cohere` package is required to use the `CohereLLM` class.\"\n        ) from ie\n\n    self._ChatMessage = ChatMessage\n\n    self._aclient = AsyncClient(\n        api_key=self.api_key.get_secret_value(),  # type: ignore\n        client_name=self.client_name,\n        base_url=self.base_url,\n        timeout=self.timeout,\n    )\n\n    if self.structured_output:\n        result = self._prepare_structured_output(\n            structured_output=self.structured_output,\n            client=self._aclient,\n            framework=\"cohere\",\n        )\n        self._aclient = result.get(\"client\")  # type: ignore\n        if structured_output := result.get(\"structured_output\"):\n            self.structured_output = structured_output\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM._format_chat_to_cohere","title":"_format_chat_to_cohere(input)","text":"

Formats the chat input to the Cohere Chat API conversational format.

Parameters:

Name Type Description Default input FormattedInput

The chat input to format.

required

Returns:

Type Description Tuple[Union[str, None], List[ChatMessage], str]

A tuple containing the system, chat history, and message.

Source code in src/distilabel/models/llms/cohere.py
def _format_chat_to_cohere(\n    self, input: \"FormattedInput\"\n) -> Tuple[Union[str, None], List[\"ChatMessage\"], str]:\n    \"\"\"Formats the chat input to the Cohere Chat API conversational format.\n\n    Args:\n        input: The chat input to format.\n\n    Returns:\n        A tuple containing the system, chat history, and message.\n    \"\"\"\n    system = None\n    message = None\n    chat_history = []\n    for item in input:\n        role = item[\"role\"]\n        content = item[\"content\"]\n        if role == \"system\":\n            system = content\n        elif role == \"user\":\n            message = content\n        elif role == \"assistant\":\n            if message is None:\n                raise ValueError(\n                    \"An assistant message but be preceded by a user message.\"\n                )\n            chat_history.append(self._ChatMessage(role=\"USER\", message=message))  # type: ignore\n            chat_history.append(self._ChatMessage(role=\"CHATBOT\", message=content))  # type: ignore\n            message = None\n\n    if message is None:\n        raise ValueError(\"The chat input must end with a user message.\")\n\n    return system, chat_history, message\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM.agenerate","title":"agenerate(input, temperature=None, max_tokens=None, k=None, p=None, seed=None, stop_sequences=None, frequency_penalty=None, presence_penalty=None, raw_prompting=None) async","text":"

Generates a response from the LLM given an input.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate responses for.

required temperature Optional[float]

the temperature to use for the generation. Defaults to None.

None max_tokens Optional[int]

the maximum number of new tokens that the model will generate. Defaults to None.

None k Optional[int]

the number of highest probability vocabulary tokens to keep for the generation. Defaults to None.

None p Optional[float]

the nucleus sampling probability to use for the generation. Defaults to None.

None seed Optional[float]

the seed to use for the generation. Defaults to None.

None stop_sequences Optional[Sequence[str]]

a list of sequences to use as stopping criteria for the generation. Defaults to None.

None frequency_penalty Optional[float]

the frequency penalty to use for the generation. Defaults to None.

None presence_penalty Optional[float]

the presence penalty to use for the generation. Defaults to None.

None raw_prompting Optional[bool]

a flag to use raw prompting for the generation. Defaults to None.

None

Returns:

Type Description GenerateOutput

The generated response from the Cohere API model.

Source code in src/distilabel/models/llms/cohere.py
@validate_call\nasync def agenerate(  # type: ignore\n    self,\n    input: FormattedInput,\n    temperature: Optional[float] = None,\n    max_tokens: Optional[int] = None,\n    k: Optional[int] = None,\n    p: Optional[float] = None,\n    seed: Optional[float] = None,\n    stop_sequences: Optional[Sequence[str]] = None,\n    frequency_penalty: Optional[float] = None,\n    presence_penalty: Optional[float] = None,\n    raw_prompting: Optional[bool] = None,\n) -> GenerateOutput:\n    \"\"\"Generates a response from the LLM given an input.\n\n    Args:\n        input: a single input in chat format to generate responses for.\n        temperature: the temperature to use for the generation. Defaults to `None`.\n        max_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `None`.\n        k: the number of highest probability vocabulary tokens to keep for the generation.\n            Defaults to `None`.\n        p: the nucleus sampling probability to use for the generation. Defaults to\n            `None`.\n        seed: the seed to use for the generation. Defaults to `None`.\n        stop_sequences: a list of sequences to use as stopping criteria for the generation.\n            Defaults to `None`.\n        frequency_penalty: the frequency penalty to use for the generation. Defaults\n            to `None`.\n        presence_penalty: the presence penalty to use for the generation. Defaults to\n            `None`.\n        raw_prompting: a flag to use raw prompting for the generation. 
Defaults to\n            `None`.\n\n    Returns:\n        The generated response from the Cohere API model.\n    \"\"\"\n    structured_output = None\n    if isinstance(input, tuple):\n        input, structured_output = input\n        result = self._prepare_structured_output(\n            structured_output=structured_output,  # type: ignore\n            client=self._aclient,\n            framework=\"cohere\",\n        )\n        self._aclient = result.get(\"client\")  # type: ignore\n\n    if structured_output is None and self.structured_output is not None:\n        structured_output = self.structured_output\n\n    system, chat_history, message = self._format_chat_to_cohere(input)\n\n    kwargs = {\n        \"message\": message,\n        \"model\": self.model,\n        \"preamble\": system,\n        \"chat_history\": chat_history,\n        \"temperature\": temperature,\n        \"max_tokens\": max_tokens,\n        \"k\": k,\n        \"p\": p,\n        \"seed\": seed,\n        \"stop_sequences\": stop_sequences,\n        \"frequency_penalty\": frequency_penalty,\n        \"presence_penalty\": presence_penalty,\n        \"raw_prompting\": raw_prompting,\n    }\n    if structured_output:\n        kwargs = self._prepare_kwargs(kwargs, structured_output)  # type: ignore\n\n    response = await self._aclient.chat(**kwargs)  # type: ignore\n\n    if structured_output:\n        return [response.model_dump_json()]\n\n    if (text := response.text) == \"\":\n        self._logger.warning(  # type: ignore\n            f\"Received no response using Cohere client (model: '{self.model}').\"\n            f\" Finish reason was: {response.finish_reason}\"\n        )\n        return [None]\n\n    return [text]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.GroqLLM","title":"GroqLLM","text":"

Bases: AsyncLLM

Groq API implementation using the async client for concurrent text generation.

Attributes:

Name Type Description model str

the name of the model from the Groq API to use for the generation.

base_url Optional[RuntimeParameter[str]]

the base URL to use for the Groq API requests. Defaults to \"https://api.groq.com\".

api_key Optional[RuntimeParameter[SecretStr]]

the API key to authenticate the requests to the Groq API. Defaults to the value of the GROQ_API_KEY environment variable.

max_retries RuntimeParameter[int]

the maximum number of times to retry the request to the API before failing. Defaults to 2.

timeout RuntimeParameter[int]

the maximum time in seconds to wait for a response from the API. Defaults to 120.

structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]]

a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

_api_key_env_var str

the name of the environment variable to use for the API key.

_aclient Optional[AsyncGroq]

the AsyncGroq client from the groq package.

Runtime parameters
  • base_url: the base URL to use for the Groq API requests. Defaults to \"https://api.groq.com\".
  • api_key: the API key to authenticate the requests to the Groq API. Defaults to the value of the GROQ_API_KEY environment variable.
  • max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 2.
  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.

Examples:

Generate text:

from distilabel.models.llms import GroqLLM\n\nllm = GroqLLM(model=\"llama3-70b-8192\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pydantic import BaseModel\nfrom distilabel.models.llms import GroqLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = GroqLLM(\n    model=\"llama3-70b-8192\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
Source code in src/distilabel/models/llms/groq.py
class GroqLLM(AsyncLLM):\n    \"\"\"Groq API implementation using the async client for concurrent text generation.\n\n    Attributes:\n        model: the name of the model from the Groq API to use for the generation.\n        base_url: the base URL to use for the Groq API requests. Defaults to\n            `\"https://api.groq.com\"`.\n        api_key: the API key to authenticate the requests to the Groq API. Defaults to\n            the value of the `GROQ_API_KEY` environment variable.\n        max_retries: the maximum number of times to retry the request to the API before\n            failing. Defaults to `2`.\n        timeout: the maximum time in seconds to wait for a response from the API. Defaults\n            to `120`.\n        structured_output: a dictionary containing the structured output configuration configuration\n            using `instructor`. You can take a look at the dictionary structure in\n            `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n        _api_key_env_var: the name of the environment variable to use for the API key.\n        _aclient: the `AsyncGroq` client from the `groq` package.\n\n    Runtime parameters:\n        - `base_url`: the base URL to use for the Groq API requests. Defaults to\n            `\"https://api.groq.com\"`.\n        - `api_key`: the API key to authenticate the requests to the Groq API. Defaults to\n            the value of the `GROQ_API_KEY` environment variable.\n        - `max_retries`: the maximum number of times to retry the request to the API before\n            failing. Defaults to `2`.\n        - `timeout`: the maximum time in seconds to wait for a response from the API. 
Defaults\n            to `120`.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import GroqLLM\n\n        llm = GroqLLM(model=\"llama3-70b-8192\")\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\n        Generate structured data:\n\n        ```python\n        from pydantic import BaseModel\n        from distilabel.models.llms import GroqLLM\n\n        class User(BaseModel):\n            name: str\n            last_name: str\n            id: int\n\n        llm = GroqLLM(\n            model=\"llama3-70b-8192\",\n            api_key=\"api.key\",\n            structured_output={\"schema\": User}\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n        ```\n    \"\"\"\n\n    model: str\n\n    base_url: Optional[RuntimeParameter[str]] = Field(\n        default_factory=lambda: os.getenv(\n            _GROQ_API_BASE_URL_ENV_VAR_NAME, \"https://api.groq.com\"\n        ),\n        description=\"The base URL to use for the Groq API requests.\",\n    )\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(_GROQ_API_KEY_ENV_VAR_NAME),\n        description=\"The API key to authenticate the requests to the Groq API.\",\n    )\n    max_retries: RuntimeParameter[int] = Field(\n        default=2,\n        description=\"The maximum number of times to retry the request to the API before\"\n        \" failing.\",\n    )\n    timeout: RuntimeParameter[int] = Field(\n        default=120,\n        description=\"The maximum time in seconds to wait for a response from the API.\",\n    )\n    structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n        Field(\n            default=None,\n            description=\"The structured output 
format to use across all the generations.\",\n        )\n    )\n\n    _num_generations_param_supported = False\n\n    _api_key_env_var: str = PrivateAttr(_GROQ_API_KEY_ENV_VAR_NAME)\n    _aclient: Optional[\"AsyncGroq\"] = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"Loads the `AsyncGroq` client to benefit from async requests.\"\"\"\n        super().load()\n\n        try:\n            from groq import AsyncGroq\n        except ImportError as ie:\n            raise ImportError(\n                \"Groq Python client is not installed. Please install it using\"\n                ' `pip install groq` or from the extras as `pip install \"distilabel[groq]\"`.'\n            ) from ie\n\n        if self.api_key is None:\n            raise ValueError(\n                f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n                f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n            )\n\n        self._aclient = AsyncGroq(\n            base_url=self.base_url,\n            api_key=self.api_key.get_secret_value(),\n            max_retries=self.max_retries,  # type: ignore\n            timeout=self.timeout,\n        )\n\n        if self.structured_output:\n            result = self._prepare_structured_output(\n                structured_output=self.structured_output,\n                client=self._aclient,\n                framework=\"groq\",\n            )\n            self._aclient = result.get(\"client\")  # type: ignore\n            if structured_output := result.get(\"structured_output\"):\n                self.structured_output = structured_output\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    @validate_call\n    async def agenerate(  # type: ignore\n        self,\n        input: FormattedInput,\n        seed: Optional[int] = None,\n        max_new_tokens: int = 128,\n 
       temperature: float = 1.0,\n        top_p: float = 1.0,\n        stop: Optional[str] = None,\n    ) -> \"GenerateOutput\":\n        \"\"\"Generates `num_generations` responses for the given input using the Groq async\n        client.\n\n        Args:\n            input: a single input in chat format to generate responses for.\n            seed: the seed to use for the generation. Defaults to `None`.\n            max_new_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `128`.\n            temperature: the temperature to use for the generation. Defaults to `0.1`.\n            top_p: the top-p value to use for the generation. Defaults to `1.0`.\n            stop: the stop sequence to use for the generation. Defaults to `None`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n\n        References:\n            - https://console.groq.com/docs/text-chat\n        \"\"\"\n        structured_output = None\n        if isinstance(input, tuple):\n            input, structured_output = input\n            result = self._prepare_structured_output(\n                structured_output=structured_output,\n                client=self._aclient,\n                framework=\"groq\",\n            )\n            self._aclient = result.get(\"client\")\n\n        if structured_output is None and self.structured_output is not None:\n            structured_output = self.structured_output\n\n        kwargs = {\n            \"messages\": input,  # type: ignore\n            \"model\": self.model,\n            \"seed\": seed,\n            \"temperature\": temperature,\n            \"max_tokens\": max_new_tokens,\n            \"top_p\": top_p,\n            \"stream\": False,\n            \"stop\": stop,\n        }\n        if structured_output:\n            kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n        generations = []\n        completion = await 
self._aclient.chat.completions.create(**kwargs)  # type: ignore\n        if structured_output:\n            generations.append(completion.model_dump_json())\n            return generations\n\n        for choice in completion.choices:\n            if (content := choice.message.content) is None:\n                self._logger.warning(  # type: ignore\n                    f\"Received no response using the Groq client (model: '{self.model}').\"\n                    f\" Finish reason was: {choice.finish_reason}\"\n                )\n            generations.append(content)\n        return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.GroqLLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.GroqLLM.load","title":"load()","text":"

Loads the AsyncGroq client to benefit from async requests.

Source code in src/distilabel/models/llms/groq.py
def load(self) -> None:\n    \"\"\"Loads the `AsyncGroq` client to benefit from async requests.\"\"\"\n    super().load()\n\n    try:\n        from groq import AsyncGroq\n    except ImportError as ie:\n        raise ImportError(\n            \"Groq Python client is not installed. Please install it using\"\n            ' `pip install groq` or from the extras as `pip install \"distilabel[groq]\"`.'\n        ) from ie\n\n    if self.api_key is None:\n        raise ValueError(\n            f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n            f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n        )\n\n    self._aclient = AsyncGroq(\n        base_url=self.base_url,\n        api_key=self.api_key.get_secret_value(),\n        max_retries=self.max_retries,  # type: ignore\n        timeout=self.timeout,\n    )\n\n    if self.structured_output:\n        result = self._prepare_structured_output(\n            structured_output=self.structured_output,\n            client=self._aclient,\n            framework=\"groq\",\n        )\n        self._aclient = result.get(\"client\")  # type: ignore\n        if structured_output := result.get(\"structured_output\"):\n            self.structured_output = structured_output\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.GroqLLM.agenerate","title":"agenerate(input, seed=None, max_new_tokens=128, temperature=1.0, top_p=1.0, stop=None) async","text":"

Generates num_generations responses for the given input using the Groq async client.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate responses for.

required seed Optional[int]

the seed to use for the generation. Defaults to None.

None max_new_tokens int

the maximum number of new tokens that the model will generate. Defaults to 128.

128 temperature float

the temperature to use for the generation. Defaults to 1.0.

1.0 top_p float

the top-p value to use for the generation. Defaults to 1.0.

1.0 stop Optional[str]

the stop sequence to use for the generation. Defaults to None.

None

Returns:

Type Description GenerateOutput

A list containing the generated responses for the given input.

References
  • https://console.groq.com/docs/text-chat
Source code in src/distilabel/models/llms/groq.py
@validate_call\nasync def agenerate(  # type: ignore\n    self,\n    input: FormattedInput,\n    seed: Optional[int] = None,\n    max_new_tokens: int = 128,\n    temperature: float = 1.0,\n    top_p: float = 1.0,\n    stop: Optional[str] = None,\n) -> \"GenerateOutput\":\n    \"\"\"Generates `num_generations` responses for the given input using the Groq async\n    client.\n\n    Args:\n        input: a single input in chat format to generate responses for.\n        seed: the seed to use for the generation. Defaults to `None`.\n        max_new_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `128`.\n        temperature: the temperature to use for the generation. Defaults to `0.1`.\n        top_p: the top-p value to use for the generation. Defaults to `1.0`.\n        stop: the stop sequence to use for the generation. Defaults to `None`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n\n    References:\n        - https://console.groq.com/docs/text-chat\n    \"\"\"\n    structured_output = None\n    if isinstance(input, tuple):\n        input, structured_output = input\n        result = self._prepare_structured_output(\n            structured_output=structured_output,\n            client=self._aclient,\n            framework=\"groq\",\n        )\n        self._aclient = result.get(\"client\")\n\n    if structured_output is None and self.structured_output is not None:\n        structured_output = self.structured_output\n\n    kwargs = {\n        \"messages\": input,  # type: ignore\n        \"model\": self.model,\n        \"seed\": seed,\n        \"temperature\": temperature,\n        \"max_tokens\": max_new_tokens,\n        \"top_p\": top_p,\n        \"stream\": False,\n        \"stop\": stop,\n    }\n    if structured_output:\n        kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n    generations = []\n    completion = await 
self._aclient.chat.completions.create(**kwargs)  # type: ignore\n    if structured_output:\n        generations.append(completion.model_dump_json())\n        return generations\n\n    for choice in completion.choices:\n        if (content := choice.message.content) is None:\n            self._logger.warning(  # type: ignore\n                f\"Received no response using the Groq client (model: '{self.model}').\"\n                f\" Finish reason was: {choice.finish_reason}\"\n            )\n        generations.append(content)\n    return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM","title":"InferenceEndpointsLLM","text":"

Bases: AsyncLLM, MagpieChatTemplateMixin

InferenceEndpoints LLM implementation running the async API client.

This LLM will internally use huggingface_hub.AsyncInferenceClient.

Attributes:

Name Type Description model_id Optional[str]

the model ID to use for the LLM as available in the Hugging Face Hub, which will be used to resolve the base URL for the serverless Inference Endpoints API requests. Defaults to None.

endpoint_name Optional[RuntimeParameter[str]]

the name of the Inference Endpoint to use for the LLM. Defaults to None.

endpoint_namespace Optional[RuntimeParameter[str]]

the namespace of the Inference Endpoint to use for the LLM. Defaults to None.

base_url Optional[RuntimeParameter[str]]

the base URL to use for the Inference Endpoints API requests.

api_key Optional[RuntimeParameter[SecretStr]]

the API key to authenticate the requests to the Inference Endpoints API.

tokenizer_id Optional[str]

the tokenizer ID to use for the LLM as available in the Hugging Face Hub. Defaults to None, but defining one is recommended to properly format the prompt.

model_display_name Optional[str]

the model display name to use for the LLM. Defaults to None.

use_magpie_template Optional[str]

a flag used to enable/disable applying the Magpie pre-query template. Defaults to False.

magpie_pre_query_template Optional[str]

the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None.

structured_output Optional[RuntimeParameter[StructuredOutputType]]

a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput. Defaults to None.

Icon

:hugging:

Examples:

Free serverless Inference API (set the input_batch_size of the Task that uses this LLM to avoid the Model is overloaded error):

from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Dedicated Inference Endpoints:

from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    endpoint_name=\"<ENDPOINT_NAME>\",\n    api_key=\"<HF_API_KEY>\",\n    endpoint_namespace=\"<USER|ORG>\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Dedicated Inference Endpoints or TGI:

from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    api_key=\"<HF_API_KEY>\",\n    base_url=\"<BASE_URL>\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pydantic import BaseModel\nfrom distilabel.models.llms import InferenceEndpointsLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    api_key=\"api.key\",\n    structured_output={\"format\": \"json\", \"schema\": User.model_json_schema()}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the Tour De France\"}]])\n
Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py
class InferenceEndpointsLLM(AsyncLLM, MagpieChatTemplateMixin):\n    \"\"\"InferenceEndpoints LLM implementation running the async API client.\n\n    This LLM will internally use `huggingface_hub.AsyncInferenceClient`.\n\n    Attributes:\n        model_id: the model ID to use for the LLM as available in the Hugging Face Hub, which\n            will be used to resolve the base URL for the serverless Inference Endpoints API requests.\n            Defaults to `None`.\n        endpoint_name: the name of the Inference Endpoint to use for the LLM. Defaults to `None`.\n        endpoint_namespace: the namespace of the Inference Endpoint to use for the LLM. Defaults to `None`.\n        base_url: the base URL to use for the Inference Endpoints API requests.\n        api_key: the API key to authenticate the requests to the Inference Endpoints API.\n        tokenizer_id: the tokenizer ID to use for the LLM as available in the Hugging Face Hub.\n            Defaults to `None`, but defining one is recommended to properly format the prompt.\n        model_display_name: the model display name to use for the LLM. Defaults to `None`.\n        use_magpie_template: a flag used to enable/disable applying the Magpie pre-query\n            template. Defaults to `False`.\n        magpie_pre_query_template: the pre-query template to be applied to the prompt or\n            sent to the LLM to generate an instruction or a follow up user message. Valid\n            values are \"llama3\", \"qwen2\" or another pre-query template provided. 
Defaults\n            to `None`.\n        structured_output: a dictionary containing the structured output configuration or\n            if more fine-grained control is needed, an instance of `OutlinesStructuredOutput`.\n            Defaults to None.\n\n    Icon:\n        `:hugging:`\n\n    Examples:\n        Free serverless Inference API, set the input_batch_size of the Task that uses this to avoid Model is overloaded:\n\n        ```python\n        from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n        llm = InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Dedicated Inference Endpoints:\n\n        ```python\n        from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n        llm = InferenceEndpointsLLM(\n            endpoint_name=\"<ENDPOINT_NAME>\",\n            api_key=\"<HF_API_KEY>\",\n            endpoint_namespace=\"<USER|ORG>\",\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Dedicated Inference Endpoints or TGI:\n\n        ```python\n        from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n        llm = InferenceEndpointsLLM(\n            api_key=\"<HF_API_KEY>\",\n            base_url=\"<BASE_URL>\",\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Generate structured data:\n\n        ```python\n        from pydantic import BaseModel\n        from distilabel.models.llms import InferenceEndpointsLLM\n\n        class User(BaseModel):\n            name: str\n            last_name: str\n            id: int\n\n        llm = InferenceEndpointsLLM(\n            
model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n            api_key=\"api.key\",\n            structured_output={\"format\": \"json\", \"schema\": User.model_json_schema()}\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the Tour De France\"}]])\n        ```\n    \"\"\"\n\n    model_id: Optional[str] = None\n\n    endpoint_name: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The name of the Inference Endpoint to use for the LLM.\",\n    )\n    endpoint_namespace: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The namespace of the Inference Endpoint to use for the LLM.\",\n    )\n    base_url: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The base URL to use for the Inference Endpoints API requests.\",\n    )\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(HF_TOKEN_ENV_VAR),\n        description=\"The API key to authenticate the requests to the Inference Endpoints API.\",\n    )\n\n    tokenizer_id: Optional[str] = None\n    model_display_name: Optional[str] = None\n\n    structured_output: Optional[RuntimeParameter[StructuredOutputType]] = Field(\n        default=None,\n        description=\"The structured output format to use across all the generations.\",\n    )\n\n    _num_generations_param_supported = False\n\n    _model_name: Optional[str] = PrivateAttr(default=None)\n    _tokenizer: Optional[\"PreTrainedTokenizer\"] = PrivateAttr(default=None)\n    _api_key_env_var: str = PrivateAttr(HF_TOKEN_ENV_VAR)\n    _aclient: Optional[\"AsyncInferenceClient\"] = PrivateAttr(...)\n\n    @model_validator(mode=\"after\")  # type: ignore\n    def only_one_of_model_id_endpoint_name_or_base_url_provided(\n        self,\n    ) -> 
\"InferenceEndpointsLLM\":\n        \"\"\"Validates that only one of `model_id` or `endpoint_name` is provided; and if `base_url` is also\n        provided, a warning will be shown informing the user that the provided `base_url` will be ignored in\n        favour of the dynamically calculated one..\"\"\"\n\n        if self.base_url and (self.model_id or self.endpoint_name):\n            self._logger.warning(  # type: ignore\n                f\"Since the `base_url={self.base_url}` is available and either one of `model_id`\"\n                \" or `endpoint_name` is also provided, the `base_url` will either be ignored\"\n                \" or overwritten with the one generated from either of those args, for serverless\"\n                \" or dedicated inference endpoints, respectively.\"\n            )\n\n        if self.use_magpie_template and self.tokenizer_id is None:\n            raise ValueError(\n                \"`use_magpie_template` cannot be `True` if `tokenizer_id` is `None`. Please,\"\n                \" set a `tokenizer_id` and try again.\"\n            )\n\n        if (\n            self.model_id\n            and self.tokenizer_id is None\n            and self.structured_output is not None\n        ):\n            self.tokenizer_id = self.model_id\n\n        if self.base_url and not (self.model_id or self.endpoint_name):\n            return self\n\n        if self.model_id and not self.endpoint_name:\n            return self\n\n        if self.endpoint_name and not self.model_id:\n            return self\n\n        raise ValidationError(\n            f\"Only one of `model_id` or `endpoint_name` must be provided. If `base_url` is\"\n            f\" provided too, it will be overwritten instead. 
Found `model_id`={self.model_id},\"\n            f\" `endpoint_name`={self.endpoint_name}, and `base_url`={self.base_url}.\"\n        )\n\n    def load(self) -> None:  # noqa: C901\n        \"\"\"Loads the `AsyncInferenceClient` client to connect to the Hugging Face Inference\n        Endpoint.\n\n        Raises:\n            ImportError: if the `huggingface-hub` Python client is not installed.\n            ValueError: if the model is not currently deployed or is not running the TGI framework.\n            ImportError: if the `transformers` Python client is not installed.\n        \"\"\"\n        super().load()\n\n        try:\n            from huggingface_hub import (\n                AsyncInferenceClient,\n                InferenceClient,\n                get_inference_endpoint,\n            )\n        except ImportError as ie:\n            raise ImportError(\n                \"Hugging Face Hub Python client is not installed. Please install it using\"\n                \" `pip install huggingface-hub`.\"\n            ) from ie\n\n        if self.api_key is None:\n            self.api_key = SecretStr(get_hf_token(self.__class__.__name__, \"api_key\"))\n\n        if self.model_id is not None:\n            client = InferenceClient(\n                model=self.model_id, token=self.api_key.get_secret_value()\n            )\n            status = client.get_model_status()\n\n            if (\n                status.state not in {\"Loadable\", \"Loaded\"}\n                and status.framework != \"text-generation-inference\"\n            ):\n                raise ValueError(\n                    f\"Model {self.model_id} is not currently deployed or is not running the TGI framework\"\n                )\n\n            self.base_url = client._resolve_url(\n                model=self.model_id, task=\"text-generation\"\n            )\n\n        if self.endpoint_name is not None:\n            client = get_inference_endpoint(\n                name=self.endpoint_name,\n           
     namespace=self.endpoint_namespace,\n                token=self.api_key.get_secret_value(),\n            )\n            if client.status in [\"paused\", \"scaledToZero\"]:\n                client.resume().wait(timeout=300)\n            elif client.status == \"initializing\":\n                client.wait(timeout=300)\n\n            self.base_url = client.url\n            self._model_name = client.repository\n\n        self._aclient = AsyncInferenceClient(\n            base_url=self.base_url,\n            token=self.api_key.get_secret_value(),\n        )\n\n        if self.tokenizer_id:\n            try:\n                from transformers import AutoTokenizer\n            except ImportError as ie:\n                raise ImportError(\n                    \"Transformers Python client is not installed. Please install it using\"\n                    \" `pip install transformers`.\"\n                ) from ie\n\n            self._tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_id)\n\n    @property\n    @override\n    def model_name(self) -> Union[str, None]:  # type: ignore\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return (\n            self.model_display_name\n            or self._model_name\n            or self.model_id\n            or self.endpoint_name\n            or self.base_url\n        )\n\n    def prepare_input(self, input: \"StandardInput\") -> str:\n        \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n        input.\n\n        Args:\n            input: the input list containing chat items.\n\n        Returns:\n            The prompt to send to the LLM.\n        \"\"\"\n        prompt: str = (\n            self._tokenizer.apply_chat_template(  # type: ignore\n                conversation=input,  # type: ignore\n                tokenize=False,\n                add_generation_prompt=True,\n            )\n            if input\n            else \"\"\n        )\n        return 
super().apply_magpie_pre_query_template(prompt, input)\n\n    def _get_structured_output(\n        self, input: FormattedInput\n    ) -> Union[Dict[str, Any], None]:\n        \"\"\"Gets the structured output (if any) for the given input.\n\n        Args:\n            input: a single input in chat format to generate responses for.\n\n        Returns:\n            The structured output that will be passed as `grammer` to the inference endpoint\n            or `None` if not required.\n        \"\"\"\n        structured_output = None\n\n        # Specific structured output per input\n        if isinstance(input, tuple):\n            input, structured_output = input\n            structured_output = {\n                \"type\": structured_output[\"format\"],  # type: ignore\n                \"value\": structured_output[\"schema\"],  # type: ignore\n            }\n\n        # Same structured output for all the inputs\n        if structured_output is None and self.structured_output is not None:\n            try:\n                structured_output = {\n                    \"type\": self.structured_output[\"format\"],  # type: ignore\n                    \"value\": self.structured_output[\"schema\"],  # type: ignore\n                }\n            except KeyError as e:\n                raise ValueError(\n                    \"To use the structured output you have to inform the `format` and `schema` in \"\n                    \"the `structured_output` attribute.\"\n                ) from e\n\n        if structured_output:\n            if isinstance(structured_output[\"value\"], ModelMetaclass):\n                structured_output[\"value\"] = structured_output[\n                    \"value\"\n                ].model_json_schema()\n\n        return structured_output\n\n    async def _generate_with_text_generation(\n        self,\n        input: FormattedInput,\n        max_new_tokens: int = 128,\n        repetition_penalty: Optional[float] = None,\n        frequency_penalty: 
Optional[float] = None,\n        temperature: float = 1.0,\n        do_sample: bool = False,\n        top_k: Optional[int] = None,\n        top_p: Optional[float] = None,\n        typical_p: Optional[float] = None,\n        stop_sequences: Union[List[str], None] = None,\n        return_full_text: bool = False,\n        seed: Optional[int] = None,\n        watermark: bool = False,\n    ) -> Union[str, None]:\n        structured_output = self._get_structured_output(input)\n\n        completion = None\n        try:\n            completion = await self._aclient.text_generation(  # type: ignore\n                prompt=self.prepare_input(input),  # type: ignore\n                max_new_tokens=max_new_tokens,\n                do_sample=do_sample,\n                typical_p=typical_p,\n                repetition_penalty=repetition_penalty,\n                frequency_penalty=frequency_penalty,\n                temperature=temperature,\n                top_p=top_p,\n                top_k=top_k,\n                stop_sequences=stop_sequences,\n                return_full_text=return_full_text,\n                # NOTE: here to ensure that the cache is not used and a different response is\n                # generated every time\n                seed=seed or random.randint(0, sys.maxsize),\n                watermark=watermark,\n                grammar=structured_output,  # type: ignore\n            )\n        except Exception as e:\n            self._logger.warning(  # type: ignore\n                f\"\u26a0\ufe0f Received no response using Inference Client (model: '{self.model_name}').\"\n                f\" Finish reason was: {e}\"\n            )\n        return completion\n\n    async def _generate_with_chat_completion(\n        self,\n        input: \"StandardInput\",\n        max_new_tokens: int = 128,\n        frequency_penalty: Optional[float] = None,\n        logit_bias: Optional[List[float]] = None,\n        presence_penalty: Optional[float] = None,\n        seed: 
Optional[int] = None,\n        stop_sequences: Optional[List[str]] = None,\n        temperature: float = 1.0,\n        tool_choice: Optional[Union[Dict[str, str], Literal[\"auto\"]]] = None,\n        tool_prompt: Optional[str] = None,\n        tools: Optional[List[Dict[str, Any]]] = None,\n        top_p: Optional[float] = None,\n    ) -> Union[str, None]:\n        message = None\n        try:\n            completion = await self._aclient.chat_completion(  # type: ignore\n                messages=input,  # type: ignore\n                max_tokens=max_new_tokens,\n                frequency_penalty=frequency_penalty,\n                logit_bias=logit_bias,\n                presence_penalty=presence_penalty,\n                # NOTE: here to ensure that the cache is not used and a different response is\n                # generated every time\n                seed=seed or random.randint(0, sys.maxsize),\n                stop=stop_sequences,\n                temperature=temperature,\n                tool_choice=tool_choice,  # type: ignore\n                tool_prompt=tool_prompt,\n                tools=tools,  # type: ignore\n                top_p=top_p,\n            )\n            choice = completion.choices[0]\n            if (message := choice.message.content) is None:\n                self._logger.warning(  # type: ignore\n                    f\"\u26a0\ufe0f Received no response using Inference Client (model: '{self.model_name}').\"\n                    f\" Finish reason was: {choice.finish_reason}\"\n                )\n        except Exception as e:\n            self._logger.warning(  # type: ignore\n                f\"\u26a0\ufe0f Received no response using Inference Client (model: '{self.model_name}').\"\n                f\" Finish reason was: {e}\"\n            )\n        return message\n\n    def _check_stop_sequences(\n        self,\n        stop_sequences: Optional[Union[str, List[str]]] = None,\n    ) -> Union[List[str], None]:\n        \"\"\"Checks that no 
more than 4 stop sequences are provided.\n\n        Args:\n            stop_sequences: the stop sequences to be checked.\n\n        Returns:\n            The stop sequences.\n        \"\"\"\n        if stop_sequences is not None:\n            if isinstance(stop_sequences, str):\n                stop_sequences = [stop_sequences]\n            if len(stop_sequences) > 4:\n                warnings.warn(\n                    \"Only up to 4 stop sequences are allowed, so keeping the first 4 items only.\",\n                    UserWarning,\n                    stacklevel=2,\n                )\n                stop_sequences = stop_sequences[:4]\n        return stop_sequences\n\n    @validate_call\n    async def agenerate(  # type: ignore\n        self,\n        input: FormattedInput,\n        max_new_tokens: int = 128,\n        frequency_penalty: Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] = None,\n        logit_bias: Optional[List[float]] = None,\n        presence_penalty: Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] = None,\n        seed: Optional[int] = None,\n        stop_sequences: Optional[List[str]] = None,\n        temperature: float = 1.0,\n        tool_choice: Optional[Union[Dict[str, str], Literal[\"auto\"]]] = None,\n        tool_prompt: Optional[str] = None,\n        tools: Optional[List[Dict[str, Any]]] = None,\n        top_p: Optional[float] = None,\n        do_sample: bool = False,\n        repetition_penalty: Optional[float] = None,\n        return_full_text: bool = False,\n        top_k: Optional[int] = None,\n        typical_p: Optional[float] = None,\n        watermark: bool = False,\n    ) -> GenerateOutput:\n        \"\"\"Generates completions for the given input using the async client. 
This method\n        uses two methods of the `huggingface_hub.AsyncClient`: `chat_completion` and `text_generation`.\n        `chat_completion` method will be used only if no `tokenizer_id` has been specified.\n        Some arguments of this function are specific to the `text_generation` method, while\n        some others are specific to the `chat_completion` method.\n\n        Args:\n            input: a single input in chat format to generate responses for.\n            max_new_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `128`.\n            frequency_penalty: a value between `-2.0` and `2.0`. Positive values penalize\n                new tokens based on their existing frequency in the text so far, decreasing\n                model's likelihood to repeat the same line verbatim. Defauls to `None`.\n            logit_bias: modify the likelihood of specified tokens appearing in the completion.\n                This argument is exclusive to the `chat_completion` method and will be used\n                only if `tokenizer_id` is `None`.\n                Defaults to `None`.\n            presence_penalty: a value between `-2.0` and `2.0`. Positive values penalize\n                new tokens based on whether they appear in the text so far, increasing the\n                model likelihood to talk about new topics. This argument is exclusive to\n                the `chat_completion` method and will be used only if `tokenizer_id` is\n                `None`. Defauls to `None`.\n            seed: the seed to use for the generation. Defaults to `None`.\n            stop_sequences: either a single string or a list of strings containing the sequences\n                to stop the generation at. Defaults to `None`, but will be set to the\n                `tokenizer.eos_token` if available.\n            temperature: the temperature to use for the generation. 
Defaults to `1.0`.\n            tool_choice: the name of the tool the model should call. It can be a dictionary\n                like `{\"function_name\": \"my_tool\"}` or \"auto\". If not provided, then the\n                model won't use any tool. This argument is exclusive to the `chat_completion`\n                method and will be used only if `tokenizer_id` is `None`. Defaults to `None`.\n            tool_prompt: A prompt to be appended before the tools. This argument is exclusive\n                to the `chat_completion` method and will be used only if `tokenizer_id`\n                is `None`. Defauls to `None`.\n            tools: a list of tools definitions that the LLM can use.\n                This argument is exclusive to the `chat_completion` method and will be used\n                only if `tokenizer_id` is `None`. Defaults to `None`.\n            top_p: the top-p value to use for the generation. Defaults to `1.0`.\n            do_sample: whether to use sampling for the generation. This argument is exclusive\n                of the `text_generation` method and will be only used if `tokenizer_id` is not\n                `None`. Defaults to `False`.\n            repetition_penalty: the repetition penalty to use for the generation. This argument\n                is exclusive of the `text_generation` method and will be only used if `tokenizer_id`\n                is not `None`. Defaults to `None`.\n            return_full_text: whether to return the full text of the completion or just\n                the generated text. Defaults to `False`, meaning that only the generated\n                text will be returned. This argument is exclusive of the `text_generation`\n                method and will be only used if `tokenizer_id` is not `None`.\n            top_k: the top-k value to use for the generation. This argument is exclusive\n                of the `text_generation` method and will be only used if `tokenizer_id`\n                is not `None`. 
Defaults to `0.8`, since neither `0.0` nor `1.0` are valid\n                values in TGI.\n            typical_p: the typical-p value to use for the generation. This argument is exclusive\n                of the `text_generation` method and will be only used if `tokenizer_id`\n                is not `None`. Defaults to `None`.\n            watermark: whether to add the watermark to the generated text. This argument\n                is exclusive of the `text_generation` method and will be only used if `tokenizer_id`\n                is not `None`. Defaults to `None`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n        stop_sequences = self._check_stop_sequences(stop_sequences)\n\n        if self.tokenizer_id is None:\n            return [\n                await self._generate_with_chat_completion(\n                    input=input,  # type: ignore\n                    max_new_tokens=max_new_tokens,\n                    frequency_penalty=frequency_penalty,\n                    logit_bias=logit_bias,\n                    presence_penalty=presence_penalty,\n                    seed=seed,\n                    stop_sequences=stop_sequences,\n                    temperature=temperature,\n                    tool_choice=tool_choice,\n                    tool_prompt=tool_prompt,\n                    tools=tools,\n                    top_p=top_p,\n                )\n            ]\n\n        return [\n            await self._generate_with_text_generation(\n                input=input,\n                max_new_tokens=max_new_tokens,\n                do_sample=do_sample,\n                typical_p=typical_p,\n                repetition_penalty=repetition_penalty,\n                frequency_penalty=frequency_penalty,\n                temperature=temperature,\n                top_p=top_p,\n                top_k=top_k,\n                stop_sequences=stop_sequences,\n                
return_full_text=return_full_text,\n                seed=seed,\n                watermark=watermark,\n            )\n        ]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.model_name","title":"model_name: Union[str, None] property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.only_one_of_model_id_endpoint_name_or_base_url_provided","title":"only_one_of_model_id_endpoint_name_or_base_url_provided()","text":"

Validates that only one of model_id or endpoint_name is provided. If base_url is also provided, a warning will be shown informing the user that the provided base_url will be ignored in favour of the dynamically calculated one.

Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py
@model_validator(mode=\"after\")  # type: ignore\ndef only_one_of_model_id_endpoint_name_or_base_url_provided(\n    self,\n) -> \"InferenceEndpointsLLM\":\n    \"\"\"Validates that only one of `model_id` or `endpoint_name` is provided; and if `base_url` is also\n    provided, a warning will be shown informing the user that the provided `base_url` will be ignored in\n    favour of the dynamically calculated one..\"\"\"\n\n    if self.base_url and (self.model_id or self.endpoint_name):\n        self._logger.warning(  # type: ignore\n            f\"Since the `base_url={self.base_url}` is available and either one of `model_id`\"\n            \" or `endpoint_name` is also provided, the `base_url` will either be ignored\"\n            \" or overwritten with the one generated from either of those args, for serverless\"\n            \" or dedicated inference endpoints, respectively.\"\n        )\n\n    if self.use_magpie_template and self.tokenizer_id is None:\n        raise ValueError(\n            \"`use_magpie_template` cannot be `True` if `tokenizer_id` is `None`. Please,\"\n            \" set a `tokenizer_id` and try again.\"\n        )\n\n    if (\n        self.model_id\n        and self.tokenizer_id is None\n        and self.structured_output is not None\n    ):\n        self.tokenizer_id = self.model_id\n\n    if self.base_url and not (self.model_id or self.endpoint_name):\n        return self\n\n    if self.model_id and not self.endpoint_name:\n        return self\n\n    if self.endpoint_name and not self.model_id:\n        return self\n\n    raise ValidationError(\n        f\"Only one of `model_id` or `endpoint_name` must be provided. If `base_url` is\"\n        f\" provided too, it will be overwritten instead. Found `model_id`={self.model_id},\"\n        f\" `endpoint_name`={self.endpoint_name}, and `base_url`={self.base_url}.\"\n    )\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.load","title":"load()","text":"

Loads the AsyncInferenceClient client to connect to the Hugging Face Inference Endpoint.

Raises:

Type Description ImportError

if the huggingface-hub Python client is not installed.

ValueError

if the model is not currently deployed or is not running the TGI framework.

ImportError

if the transformers Python client is not installed.

Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py
def load(self) -> None:  # noqa: C901\n    \"\"\"Loads the `AsyncInferenceClient` client to connect to the Hugging Face Inference\n    Endpoint.\n\n    Raises:\n        ImportError: if the `huggingface-hub` Python client is not installed.\n        ValueError: if the model is not currently deployed or is not running the TGI framework.\n        ImportError: if the `transformers` Python client is not installed.\n    \"\"\"\n    super().load()\n\n    try:\n        from huggingface_hub import (\n            AsyncInferenceClient,\n            InferenceClient,\n            get_inference_endpoint,\n        )\n    except ImportError as ie:\n        raise ImportError(\n            \"Hugging Face Hub Python client is not installed. Please install it using\"\n            \" `pip install huggingface-hub`.\"\n        ) from ie\n\n    if self.api_key is None:\n        self.api_key = SecretStr(get_hf_token(self.__class__.__name__, \"api_key\"))\n\n    if self.model_id is not None:\n        client = InferenceClient(\n            model=self.model_id, token=self.api_key.get_secret_value()\n        )\n        status = client.get_model_status()\n\n        if (\n            status.state not in {\"Loadable\", \"Loaded\"}\n            and status.framework != \"text-generation-inference\"\n        ):\n            raise ValueError(\n                f\"Model {self.model_id} is not currently deployed or is not running the TGI framework\"\n            )\n\n        self.base_url = client._resolve_url(\n            model=self.model_id, task=\"text-generation\"\n        )\n\n    if self.endpoint_name is not None:\n        client = get_inference_endpoint(\n            name=self.endpoint_name,\n            namespace=self.endpoint_namespace,\n            token=self.api_key.get_secret_value(),\n        )\n        if client.status in [\"paused\", \"scaledToZero\"]:\n            client.resume().wait(timeout=300)\n        elif client.status == \"initializing\":\n            client.wait(timeout=300)\n\n  
      self.base_url = client.url\n        self._model_name = client.repository\n\n    self._aclient = AsyncInferenceClient(\n        base_url=self.base_url,\n        token=self.api_key.get_secret_value(),\n    )\n\n    if self.tokenizer_id:\n        try:\n            from transformers import AutoTokenizer\n        except ImportError as ie:\n            raise ImportError(\n                \"Transformers Python client is not installed. Please install it using\"\n                \" `pip install transformers`.\"\n            ) from ie\n\n        self._tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_id)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.prepare_input","title":"prepare_input(input)","text":"

Prepares the input (applying the chat template and tokenization) for the provided input.

Parameters:

Name Type Description Default input StandardInput

the input list containing chat items.

required

Returns:

Type Description str

The prompt to send to the LLM.

Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py
def prepare_input(self, input: \"StandardInput\") -> str:\n    \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n    input.\n\n    Args:\n        input: the input list containing chat items.\n\n    Returns:\n        The prompt to send to the LLM.\n    \"\"\"\n    prompt: str = (\n        self._tokenizer.apply_chat_template(  # type: ignore\n            conversation=input,  # type: ignore\n            tokenize=False,\n            add_generation_prompt=True,\n        )\n        if input\n        else \"\"\n    )\n    return super().apply_magpie_pre_query_template(prompt, input)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM._get_structured_output","title":"_get_structured_output(input)","text":"

Gets the structured output (if any) for the given input.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate responses for.

required

Returns:

Type Description Union[Dict[str, Any], None]

The structured output that will be passed as grammar to the inference endpoint

Union[Dict[str, Any], None]

or None if not required.

Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py
def _get_structured_output(\n    self, input: FormattedInput\n) -> Union[Dict[str, Any], None]:\n    \"\"\"Gets the structured output (if any) for the given input.\n\n    Args:\n        input: a single input in chat format to generate responses for.\n\n    Returns:\n        The structured output that will be passed as `grammer` to the inference endpoint\n        or `None` if not required.\n    \"\"\"\n    structured_output = None\n\n    # Specific structured output per input\n    if isinstance(input, tuple):\n        input, structured_output = input\n        structured_output = {\n            \"type\": structured_output[\"format\"],  # type: ignore\n            \"value\": structured_output[\"schema\"],  # type: ignore\n        }\n\n    # Same structured output for all the inputs\n    if structured_output is None and self.structured_output is not None:\n        try:\n            structured_output = {\n                \"type\": self.structured_output[\"format\"],  # type: ignore\n                \"value\": self.structured_output[\"schema\"],  # type: ignore\n            }\n        except KeyError as e:\n            raise ValueError(\n                \"To use the structured output you have to inform the `format` and `schema` in \"\n                \"the `structured_output` attribute.\"\n            ) from e\n\n    if structured_output:\n        if isinstance(structured_output[\"value\"], ModelMetaclass):\n            structured_output[\"value\"] = structured_output[\n                \"value\"\n            ].model_json_schema()\n\n    return structured_output\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM._check_stop_sequences","title":"_check_stop_sequences(stop_sequences=None)","text":"

Checks that no more than 4 stop sequences are provided.

Parameters:

Name Type Description Default stop_sequences Optional[Union[str, List[str]]]

the stop sequences to be checked.

None

Returns:

Type Description Union[List[str], None]

The stop sequences.

Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py
def _check_stop_sequences(\n    self,\n    stop_sequences: Optional[Union[str, List[str]]] = None,\n) -> Union[List[str], None]:\n    \"\"\"Checks that no more than 4 stop sequences are provided.\n\n    Args:\n        stop_sequences: the stop sequences to be checked.\n\n    Returns:\n        The stop sequences.\n    \"\"\"\n    if stop_sequences is not None:\n        if isinstance(stop_sequences, str):\n            stop_sequences = [stop_sequences]\n        if len(stop_sequences) > 4:\n            warnings.warn(\n                \"Only up to 4 stop sequences are allowed, so keeping the first 4 items only.\",\n                UserWarning,\n                stacklevel=2,\n            )\n            stop_sequences = stop_sequences[:4]\n    return stop_sequences\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.agenerate","title":"agenerate(input, max_new_tokens=128, frequency_penalty=None, logit_bias=None, presence_penalty=None, seed=None, stop_sequences=None, temperature=1.0, tool_choice=None, tool_prompt=None, tools=None, top_p=None, do_sample=False, repetition_penalty=None, return_full_text=False, top_k=None, typical_p=None, watermark=False) async","text":"

Generates completions for the given input using the async client. This method uses two methods of the huggingface_hub.AsyncInferenceClient: chat_completion and text_generation. The chat_completion method will be used only if no tokenizer_id has been specified. Some arguments of this function are specific to the text_generation method, while some others are specific to the chat_completion method.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate responses for.

required max_new_tokens int

the maximum number of new tokens that the model will generate. Defaults to 128.

128 frequency_penalty Optional[Annotated[float, Field(ge=-2.0, le=2.0)]]

a value between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. Defaults to None.

None logit_bias Optional[List[float]]

modify the likelihood of specified tokens appearing in the completion. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None. Defaults to None.

None presence_penalty Optional[Annotated[float, Field(ge=-2.0, le=2.0)]]

a value between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None. Defaults to None.

None seed Optional[int]

the seed to use for the generation. Defaults to None.

None stop_sequences Optional[List[str]]

either a single string or a list of strings containing the sequences to stop the generation at. Defaults to None, but will be set to the tokenizer.eos_token if available.

None temperature float

the temperature to use for the generation. Defaults to 1.0.

1.0 tool_choice Optional[Union[Dict[str, str], Literal['auto']]]

the name of the tool the model should call. It can be a dictionary like {\"function_name\": \"my_tool\"} or \"auto\". If not provided, then the model won't use any tool. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None. Defaults to None.

None tool_prompt Optional[str]

A prompt to be appended before the tools. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None. Defaults to None.

None tools Optional[List[Dict[str, Any]]]

a list of tools definitions that the LLM can use. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None. Defaults to None.

None top_p Optional[float]

the top-p value to use for the generation. Defaults to None.

None do_sample bool

whether to use sampling for the generation. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None. Defaults to False.

False repetition_penalty Optional[float]

the repetition penalty to use for the generation. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None. Defaults to None.

None return_full_text bool

whether to return the full text of the completion or just the generated text. Defaults to False, meaning that only the generated text will be returned. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None.

False top_k Optional[int]

the top-k value to use for the generation. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None. Defaults to None.

None typical_p Optional[float]

the typical-p value to use for the generation. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None. Defaults to None.

None watermark bool

whether to add the watermark to the generated text. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None. Defaults to False.

False

Returns:

Type Description GenerateOutput

A list of lists of strings containing the generated responses for each input.

Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py
@validate_call\nasync def agenerate(  # type: ignore\n    self,\n    input: FormattedInput,\n    max_new_tokens: int = 128,\n    frequency_penalty: Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] = None,\n    logit_bias: Optional[List[float]] = None,\n    presence_penalty: Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] = None,\n    seed: Optional[int] = None,\n    stop_sequences: Optional[List[str]] = None,\n    temperature: float = 1.0,\n    tool_choice: Optional[Union[Dict[str, str], Literal[\"auto\"]]] = None,\n    tool_prompt: Optional[str] = None,\n    tools: Optional[List[Dict[str, Any]]] = None,\n    top_p: Optional[float] = None,\n    do_sample: bool = False,\n    repetition_penalty: Optional[float] = None,\n    return_full_text: bool = False,\n    top_k: Optional[int] = None,\n    typical_p: Optional[float] = None,\n    watermark: bool = False,\n) -> GenerateOutput:\n    \"\"\"Generates completions for the given input using the async client. This method\n    uses two methods of the `huggingface_hub.AsyncClient`: `chat_completion` and `text_generation`.\n    `chat_completion` method will be used only if no `tokenizer_id` has been specified.\n    Some arguments of this function are specific to the `text_generation` method, while\n    some others are specific to the `chat_completion` method.\n\n    Args:\n        input: a single input in chat format to generate responses for.\n        max_new_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `128`.\n        frequency_penalty: a value between `-2.0` and `2.0`. Positive values penalize\n            new tokens based on their existing frequency in the text so far, decreasing\n            model's likelihood to repeat the same line verbatim. 
Defauls to `None`.\n        logit_bias: modify the likelihood of specified tokens appearing in the completion.\n            This argument is exclusive to the `chat_completion` method and will be used\n            only if `tokenizer_id` is `None`.\n            Defaults to `None`.\n        presence_penalty: a value between `-2.0` and `2.0`. Positive values penalize\n            new tokens based on whether they appear in the text so far, increasing the\n            model likelihood to talk about new topics. This argument is exclusive to\n            the `chat_completion` method and will be used only if `tokenizer_id` is\n            `None`. Defauls to `None`.\n        seed: the seed to use for the generation. Defaults to `None`.\n        stop_sequences: either a single string or a list of strings containing the sequences\n            to stop the generation at. Defaults to `None`, but will be set to the\n            `tokenizer.eos_token` if available.\n        temperature: the temperature to use for the generation. Defaults to `1.0`.\n        tool_choice: the name of the tool the model should call. It can be a dictionary\n            like `{\"function_name\": \"my_tool\"}` or \"auto\". If not provided, then the\n            model won't use any tool. This argument is exclusive to the `chat_completion`\n            method and will be used only if `tokenizer_id` is `None`. Defaults to `None`.\n        tool_prompt: A prompt to be appended before the tools. This argument is exclusive\n            to the `chat_completion` method and will be used only if `tokenizer_id`\n            is `None`. Defauls to `None`.\n        tools: a list of tools definitions that the LLM can use.\n            This argument is exclusive to the `chat_completion` method and will be used\n            only if `tokenizer_id` is `None`. Defaults to `None`.\n        top_p: the top-p value to use for the generation. Defaults to `1.0`.\n        do_sample: whether to use sampling for the generation. 
This argument is exclusive\n            of the `text_generation` method and will be only used if `tokenizer_id` is not\n            `None`. Defaults to `False`.\n        repetition_penalty: the repetition penalty to use for the generation. This argument\n            is exclusive of the `text_generation` method and will be only used if `tokenizer_id`\n            is not `None`. Defaults to `None`.\n        return_full_text: whether to return the full text of the completion or just\n            the generated text. Defaults to `False`, meaning that only the generated\n            text will be returned. This argument is exclusive of the `text_generation`\n            method and will be only used if `tokenizer_id` is not `None`.\n        top_k: the top-k value to use for the generation. This argument is exclusive\n            of the `text_generation` method and will be only used if `tokenizer_id`\n            is not `None`. Defaults to `0.8`, since neither `0.0` nor `1.0` are valid\n            values in TGI.\n        typical_p: the typical-p value to use for the generation. This argument is exclusive\n            of the `text_generation` method and will be only used if `tokenizer_id`\n            is not `None`. Defaults to `None`.\n        watermark: whether to add the watermark to the generated text. This argument\n            is exclusive of the `text_generation` method and will be only used if `tokenizer_id`\n            is not `None`. 
Defaults to `None`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n    stop_sequences = self._check_stop_sequences(stop_sequences)\n\n    if self.tokenizer_id is None:\n        return [\n            await self._generate_with_chat_completion(\n                input=input,  # type: ignore\n                max_new_tokens=max_new_tokens,\n                frequency_penalty=frequency_penalty,\n                logit_bias=logit_bias,\n                presence_penalty=presence_penalty,\n                seed=seed,\n                stop_sequences=stop_sequences,\n                temperature=temperature,\n                tool_choice=tool_choice,\n                tool_prompt=tool_prompt,\n                tools=tools,\n                top_p=top_p,\n            )\n        ]\n\n    return [\n        await self._generate_with_text_generation(\n            input=input,\n            max_new_tokens=max_new_tokens,\n            do_sample=do_sample,\n            typical_p=typical_p,\n            repetition_penalty=repetition_penalty,\n            frequency_penalty=frequency_penalty,\n            temperature=temperature,\n            top_p=top_p,\n            top_k=top_k,\n            stop_sequences=stop_sequences,\n            return_full_text=return_full_text,\n            seed=seed,\n            watermark=watermark,\n        )\n    ]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM","title":"TransformersLLM","text":"

Bases: LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin

Hugging Face transformers library LLM implementation using the text generation pipeline.

Attributes:

Name Type Description model str

the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

revision str

if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\".

torch_dtype str

the torch dtype to use for the model e.g. \"float16\", \"float32\", etc. Defaults to \"auto\".

trust_remote_code bool

whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False.

model_kwargs Optional[Dict[str, Any]]

additional dictionary of keyword arguments that will be passed to the from_pretrained method of the model.

tokenizer Optional[str]

the tokenizer Hugging Face Hub repo id or a path to a directory containing the tokenizer config files. If not provided, the one associated to the model will be used. Defaults to None.

use_fast bool

whether to use a fast tokenizer or not. Defaults to True.

chat_template Optional[str]

a chat template that will be used to build the prompts before sending them to the model. If not provided, the chat template defined in the tokenizer config will be used. If not provided and the tokenizer doesn't have a chat template, then ChatML template will be used. Defaults to None.

device Optional[Union[str, int]]

the name or index of the device where the model will be loaded. Defaults to None.

device_map Optional[Union[str, Dict[str, Any]]]

a dictionary mapping each layer of the model to a device, or a mode like \"sequential\" or \"auto\". Defaults to None.

token Optional[SecretStr]

the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment variable or huggingface_hub package local configuration will be used. Defaults to None.

structured_output Optional[RuntimeParameter[OutlinesStructuredOutputType]]

a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput. Defaults to None.

use_magpie_template Optional[RuntimeParameter[OutlinesStructuredOutputType]]

a flag used to enable/disable applying the Magpie pre-query template. Defaults to False.

magpie_pre_query_template Optional[RuntimeParameter[OutlinesStructuredOutputType]]

the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None.

Icon

:hugging:

Examples:

Generate text:

from distilabel.models.llms import TransformersLLM\n\nllm = TransformersLLM(model=\"microsoft/Phi-3-mini-4k-instruct\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
Source code in src/distilabel/models/llms/huggingface/transformers.py
class TransformersLLM(LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin):\n    \"\"\"Hugging Face `transformers` library LLM implementation using the text generation\n    pipeline.\n\n    Attributes:\n        model: the model Hugging Face Hub repo id or a path to a directory containing the\n            model weights and configuration files.\n        revision: if `model` refers to a Hugging Face Hub repository, then the revision\n            (e.g. a branch name or a commit id) to use. Defaults to `\"main\"`.\n        torch_dtype: the torch dtype to use for the model e.g. \"float16\", \"float32\", etc.\n            Defaults to `\"auto\"`.\n        trust_remote_code: whether to allow fetching and executing remote code fetched\n            from the repository in the Hub. Defaults to `False`.\n        model_kwargs: additional dictionary of keyword arguments that will be passed to\n            the `from_pretrained` method of the model.\n        tokenizer: the tokenizer Hugging Face Hub repo id or a path to a directory containing\n            the tokenizer config files. If not provided, the one associated to the `model`\n            will be used. Defaults to `None`.\n        use_fast: whether to use a fast tokenizer or not. Defaults to `True`.\n        chat_template: a chat template that will be used to build the prompts before\n            sending them to the model. If not provided, the chat template defined in the\n            tokenizer config will be used. If not provided and the tokenizer doesn't have\n            a chat template, then ChatML template will be used. Defaults to `None`.\n        device: the name or index of the device where the model will be loaded. Defaults\n            to `None`.\n        device_map: a dictionary mapping each layer of the model to a device, or a mode\n            like `\"sequential\"` or `\"auto\"`. 
Defaults to `None`.\n        token: the Hugging Face Hub token that will be used to authenticate to the Hugging\n            Face Hub. If not provided, the `HF_TOKEN` environment or `huggingface_hub` package\n            local configuration will be used. Defaults to `None`.\n        structured_output: a dictionary containing the structured output configuration or if more\n            fine-grained control is needed, an instance of `OutlinesStructuredOutput`. Defaults to None.\n        use_magpie_template: a flag used to enable/disable applying the Magpie pre-query\n            template. Defaults to `False`.\n        magpie_pre_query_template: the pre-query template to be applied to the prompt or\n            sent to the LLM to generate an instruction or a follow up user message. Valid\n            values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults\n            to `None`.\n\n    Icon:\n        `:hugging:`\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import TransformersLLM\n\n        llm = TransformersLLM(model=\"microsoft/Phi-3-mini-4k-instruct\")\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n    \"\"\"\n\n    model: str\n    revision: str = \"main\"\n    torch_dtype: str = \"auto\"\n    trust_remote_code: bool = False\n    model_kwargs: Optional[Dict[str, Any]] = None\n    tokenizer: Optional[str] = None\n    use_fast: bool = True\n    chat_template: Optional[str] = None\n    device: Optional[Union[str, int]] = None\n    device_map: Optional[Union[str, Dict[str, Any]]] = None\n    token: Optional[SecretStr] = Field(\n        default_factory=lambda: os.getenv(HF_TOKEN_ENV_VAR)\n    )\n    structured_output: Optional[RuntimeParameter[OutlinesStructuredOutputType]] = Field(\n        default=None,\n        description=\"The structured output format to use across all 
the generations.\",\n    )\n\n    _pipeline: Optional[\"Pipeline\"] = PrivateAttr(...)\n    _prefix_allowed_tokens_fn: Union[Callable, None] = PrivateAttr(default=None)\n\n    def load(self) -> None:\n        \"\"\"Loads the model and tokenizer and creates the text generation pipeline. In addition,\n        it will configure the tokenizer chat template.\"\"\"\n        if self.device == \"cuda\":\n            CudaDevicePlacementMixin.load(self)\n\n        try:\n            from transformers import pipeline\n        except ImportError as ie:\n            raise ImportError(\n                \"Transformers is not installed. Please install it using `pip install transformers`.\"\n            ) from ie\n\n        token = self.token.get_secret_value() if self.token is not None else self.token\n\n        self._pipeline = pipeline(\n            \"text-generation\",\n            model=self.model,\n            revision=self.revision,\n            torch_dtype=self.torch_dtype,\n            trust_remote_code=self.trust_remote_code,\n            model_kwargs=self.model_kwargs or {},\n            tokenizer=self.tokenizer or self.model,\n            use_fast=self.use_fast,\n            device=self.device,\n            device_map=self.device_map,\n            token=token,\n            return_full_text=False,\n        )\n\n        if self.chat_template is not None:\n            self._pipeline.tokenizer.chat_template = self.chat_template  # type: ignore\n\n        if self._pipeline.tokenizer.pad_token is None:  # type: ignore\n            self._pipeline.tokenizer.pad_token = self._pipeline.tokenizer.eos_token  # type: ignore\n\n        if self.structured_output:\n            self._prefix_allowed_tokens_fn = self._prepare_structured_output(\n                self.structured_output\n            )\n\n        super().load()\n\n    def unload(self) -> None:\n        \"\"\"Unloads the `vLLM` model.\"\"\"\n        CudaDevicePlacementMixin.unload(self)\n        super().unload()\n\n    
@property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    def prepare_input(self, input: \"StandardInput\") -> str:\n        \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n        input.\n\n        Args:\n            input: the input list containing chat items.\n\n        Returns:\n            The prompt to send to the LLM.\n        \"\"\"\n        if self._pipeline.tokenizer.chat_template:  # type: ignore\n            return input[0][\"content\"]\n\n        prompt: str = (\n            self._pipeline.tokenizer.apply_chat_template(  # type: ignore\n                input,  # type: ignore\n                tokenize=False,\n                add_generation_prompt=True,\n            )\n            if input\n            else \"\"\n        )\n        return super().apply_magpie_pre_query_template(prompt, input)\n\n    @validate_call\n    def generate(  # type: ignore\n        self,\n        inputs: List[StandardInput],\n        num_generations: int = 1,\n        max_new_tokens: int = 128,\n        temperature: float = 0.1,\n        repetition_penalty: float = 1.1,\n        top_p: float = 1.0,\n        top_k: int = 0,\n        do_sample: bool = True,\n    ) -> List[GenerateOutput]:\n        \"\"\"Generates `num_generations` responses for each input using the text generation\n        pipeline.\n\n        Args:\n            inputs: a list of inputs in chat format to generate responses for.\n            num_generations: the number of generations to create per input. Defaults to\n                `1`.\n            max_new_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `128`.\n            temperature: the temperature to use for the generation. Defaults to `0.1`.\n            repetition_penalty: the repetition penalty to use for the generation. 
Defaults\n                to `1.1`.\n            top_p: the top-p value to use for the generation. Defaults to `1.0`.\n            top_k: the top-k value to use for the generation. Defaults to `0`.\n            do_sample: whether to use sampling or not. Defaults to `True`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n        prepared_inputs = [self.prepare_input(input=input) for input in inputs]\n\n        outputs: List[List[Dict[str, str]]] = self._pipeline(  # type: ignore\n            prepared_inputs,\n            max_new_tokens=max_new_tokens,\n            temperature=temperature,\n            repetition_penalty=repetition_penalty,\n            top_p=top_p,\n            top_k=top_k,\n            do_sample=do_sample,\n            num_return_sequences=num_generations,\n            prefix_allowed_tokens_fn=self._prefix_allowed_tokens_fn,\n            pad_token_id=self._pipeline.tokenizer.eos_token_id,  # type: ignore\n        )\n        return [\n            [generation[\"generated_text\"] for generation in output]\n            for output in outputs\n        ]\n\n    def get_last_hidden_states(\n        self, inputs: List[\"StandardInput\"]\n    ) -> List[\"HiddenState\"]:\n        \"\"\"Gets the last `hidden_states` of the model for the given inputs. 
It doesn't\n        execute the task head.\n\n        Args:\n            inputs: a list of inputs in chat format to generate the embeddings for.\n\n        Returns:\n            A list containing the last hidden state for each sequence using a NumPy array\n            with shape [num_tokens, hidden_size].\n        \"\"\"\n        model: \"PreTrainedModel\" = (\n            self._pipeline.model.model  # type: ignore\n            if hasattr(self._pipeline.model, \"model\")  # type: ignore\n            else next(self._pipeline.model.children())  # type: ignore\n        )\n        tokenizer: \"PreTrainedTokenizer\" = self._pipeline.tokenizer  # type: ignore\n        input_ids = tokenizer(\n            [self.prepare_input(input) for input in inputs],  # type: ignore\n            return_tensors=\"pt\",\n            padding=True,\n        ).to(model.device)\n        last_hidden_states = model(**input_ids)[\"last_hidden_state\"]\n\n        return [\n            seq_last_hidden_state[attention_mask.bool(), :].detach().cpu().numpy()\n            for seq_last_hidden_state, attention_mask in zip(\n                last_hidden_states,\n                input_ids[\"attention_mask\"],  # type: ignore\n            )\n        ]\n\n    def _prepare_structured_output(\n        self, structured_output: Optional[OutlinesStructuredOutputType] = None\n    ) -> Union[Callable, None]:\n        \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n        Args:\n            structured_output: the configuration dict to prepare the structured output.\n\n        Returns:\n            The callable that will be used to guide the generation of the model.\n        \"\"\"\n        from distilabel.steps.tasks.structured_outputs.outlines import (\n            prepare_guided_output,\n        )\n\n        result = prepare_guided_output(\n            structured_output, \"transformers\", self._pipeline\n        )\n        if schema := result.get(\"schema\"):\n           
 self.structured_output[\"schema\"] = schema\n        return result[\"processor\"]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.load","title":"load()","text":"

Loads the model and tokenizer and creates the text generation pipeline. In addition, it will configure the tokenizer chat template.

Source code in src/distilabel/models/llms/huggingface/transformers.py
def load(self) -> None:\n    \"\"\"Loads the model and tokenizer and creates the text generation pipeline. In addition,\n    it will configure the tokenizer chat template.\"\"\"\n    if self.device == \"cuda\":\n        CudaDevicePlacementMixin.load(self)\n\n    try:\n        from transformers import pipeline\n    except ImportError as ie:\n        raise ImportError(\n            \"Transformers is not installed. Please install it using `pip install transformers`.\"\n        ) from ie\n\n    token = self.token.get_secret_value() if self.token is not None else self.token\n\n    self._pipeline = pipeline(\n        \"text-generation\",\n        model=self.model,\n        revision=self.revision,\n        torch_dtype=self.torch_dtype,\n        trust_remote_code=self.trust_remote_code,\n        model_kwargs=self.model_kwargs or {},\n        tokenizer=self.tokenizer or self.model,\n        use_fast=self.use_fast,\n        device=self.device,\n        device_map=self.device_map,\n        token=token,\n        return_full_text=False,\n    )\n\n    if self.chat_template is not None:\n        self._pipeline.tokenizer.chat_template = self.chat_template  # type: ignore\n\n    if self._pipeline.tokenizer.pad_token is None:  # type: ignore\n        self._pipeline.tokenizer.pad_token = self._pipeline.tokenizer.eos_token  # type: ignore\n\n    if self.structured_output:\n        self._prefix_allowed_tokens_fn = self._prepare_structured_output(\n            self.structured_output\n        )\n\n    super().load()\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.unload","title":"unload()","text":"

Unloads the Transformers model.

Source code in src/distilabel/models/llms/huggingface/transformers.py
def unload(self) -> None:\n    \"\"\"Unloads the `vLLM` model.\"\"\"\n    CudaDevicePlacementMixin.unload(self)\n    super().unload()\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.prepare_input","title":"prepare_input(input)","text":"

Prepares the input (applying the chat template and tokenization) for the provided input.

Parameters:

Name Type Description Default input StandardInput

the input list containing chat items.

required

Returns:

Type Description str

The prompt to send to the LLM.

Source code in src/distilabel/models/llms/huggingface/transformers.py
def prepare_input(self, input: \"StandardInput\") -> str:\n    \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n    input.\n\n    Args:\n        input: the input list containing chat items.\n\n    Returns:\n        The prompt to send to the LLM.\n    \"\"\"\n    if self._pipeline.tokenizer.chat_template:  # type: ignore\n        return input[0][\"content\"]\n\n    prompt: str = (\n        self._pipeline.tokenizer.apply_chat_template(  # type: ignore\n            input,  # type: ignore\n            tokenize=False,\n            add_generation_prompt=True,\n        )\n        if input\n        else \"\"\n    )\n    return super().apply_magpie_pre_query_template(prompt, input)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.generate","title":"generate(inputs, num_generations=1, max_new_tokens=128, temperature=0.1, repetition_penalty=1.1, top_p=1.0, top_k=0, do_sample=True)","text":"

Generates num_generations responses for each input using the text generation pipeline.

Parameters:

Name Type Description Default inputs List[StandardInput]

a list of inputs in chat format to generate responses for.

required num_generations int

the number of generations to create per input. Defaults to 1.

1 max_new_tokens int

the maximum number of new tokens that the model will generate. Defaults to 128.

128 temperature float

the temperature to use for the generation. Defaults to 0.1.

0.1 repetition_penalty float

the repetition penalty to use for the generation. Defaults to 1.1.

1.1 top_p float

the top-p value to use for the generation. Defaults to 1.0.

1.0 top_k int

the top-k value to use for the generation. Defaults to 0.

0 do_sample bool

whether to use sampling or not. Defaults to True.

True

Returns:

Type Description List[GenerateOutput]

A list of lists of strings containing the generated responses for each input.

Source code in src/distilabel/models/llms/huggingface/transformers.py
@validate_call\ndef generate(  # type: ignore\n    self,\n    inputs: List[StandardInput],\n    num_generations: int = 1,\n    max_new_tokens: int = 128,\n    temperature: float = 0.1,\n    repetition_penalty: float = 1.1,\n    top_p: float = 1.0,\n    top_k: int = 0,\n    do_sample: bool = True,\n) -> List[GenerateOutput]:\n    \"\"\"Generates `num_generations` responses for each input using the text generation\n    pipeline.\n\n    Args:\n        inputs: a list of inputs in chat format to generate responses for.\n        num_generations: the number of generations to create per input. Defaults to\n            `1`.\n        max_new_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `128`.\n        temperature: the temperature to use for the generation. Defaults to `0.1`.\n        repetition_penalty: the repetition penalty to use for the generation. Defaults\n            to `1.1`.\n        top_p: the top-p value to use for the generation. Defaults to `1.0`.\n        top_k: the top-k value to use for the generation. Defaults to `0`.\n        do_sample: whether to use sampling or not. Defaults to `True`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n    prepared_inputs = [self.prepare_input(input=input) for input in inputs]\n\n    outputs: List[List[Dict[str, str]]] = self._pipeline(  # type: ignore\n        prepared_inputs,\n        max_new_tokens=max_new_tokens,\n        temperature=temperature,\n        repetition_penalty=repetition_penalty,\n        top_p=top_p,\n        top_k=top_k,\n        do_sample=do_sample,\n        num_return_sequences=num_generations,\n        prefix_allowed_tokens_fn=self._prefix_allowed_tokens_fn,\n        pad_token_id=self._pipeline.tokenizer.eos_token_id,  # type: ignore\n    )\n    return [\n        [generation[\"generated_text\"] for generation in output]\n        for output in outputs\n    ]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.get_last_hidden_states","title":"get_last_hidden_states(inputs)","text":"

Gets the last hidden_states of the model for the given inputs. It doesn't execute the task head.

Parameters:

Name Type Description Default inputs List[StandardInput]

a list of inputs in chat format to generate the embeddings for.

required

Returns:

Type Description List[HiddenState]

A list containing the last hidden state for each sequence using a NumPy array with shape [num_tokens, hidden_size].

Source code in src/distilabel/models/llms/huggingface/transformers.py
def get_last_hidden_states(\n    self, inputs: List[\"StandardInput\"]\n) -> List[\"HiddenState\"]:\n    \"\"\"Gets the last `hidden_states` of the model for the given inputs. It doesn't\n    execute the task head.\n\n    Args:\n        inputs: a list of inputs in chat format to generate the embeddings for.\n\n    Returns:\n        A list containing the last hidden state for each sequence using a NumPy array\n        with shape [num_tokens, hidden_size].\n    \"\"\"\n    model: \"PreTrainedModel\" = (\n        self._pipeline.model.model  # type: ignore\n        if hasattr(self._pipeline.model, \"model\")  # type: ignore\n        else next(self._pipeline.model.children())  # type: ignore\n    )\n    tokenizer: \"PreTrainedTokenizer\" = self._pipeline.tokenizer  # type: ignore\n    input_ids = tokenizer(\n        [self.prepare_input(input) for input in inputs],  # type: ignore\n        return_tensors=\"pt\",\n        padding=True,\n    ).to(model.device)\n    last_hidden_states = model(**input_ids)[\"last_hidden_state\"]\n\n    return [\n        seq_last_hidden_state[attention_mask.bool(), :].detach().cpu().numpy()\n        for seq_last_hidden_state, attention_mask in zip(\n            last_hidden_states,\n            input_ids[\"attention_mask\"],  # type: ignore\n        )\n    ]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM._prepare_structured_output","title":"_prepare_structured_output(structured_output=None)","text":"

Creates the appropriate function to filter tokens to generate structured outputs.

Parameters:

Name Type Description Default structured_output Optional[OutlinesStructuredOutputType]

the configuration dict to prepare the structured output.

None

Returns:

Type Description Union[Callable, None]

The callable that will be used to guide the generation of the model.

Source code in src/distilabel/models/llms/huggingface/transformers.py
def _prepare_structured_output(\n    self, structured_output: Optional[OutlinesStructuredOutputType] = None\n) -> Union[Callable, None]:\n    \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n    Args:\n        structured_output: the configuration dict to prepare the structured output.\n\n    Returns:\n        The callable that will be used to guide the generation of the model.\n    \"\"\"\n    from distilabel.steps.tasks.structured_outputs.outlines import (\n        prepare_guided_output,\n    )\n\n    result = prepare_guided_output(\n        structured_output, \"transformers\", self._pipeline\n    )\n    if schema := result.get(\"schema\"):\n        self.structured_output[\"schema\"] = schema\n    return result[\"processor\"]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LiteLLM","title":"LiteLLM","text":"

Bases: AsyncLLM

LiteLLM implementation running the async API client.

Attributes:

Name Type Description model str

the model name to use for the LLM e.g. \"gpt-3.5-turbo\" or \"mistral/mistral-large\", etc.

verbose RuntimeParameter[bool]

whether to log the LiteLLM client's logs. Defaults to False.

structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]]

a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

Runtime parameters
  • verbose: whether to log the LiteLLM client's logs. Defaults to False.

Examples:

Generate text:

from distilabel.models.llms import LiteLLM\n\nllm = LiteLLM(model=\"gpt-3.5-turbo\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pydantic import BaseModel\nfrom distilabel.models.llms import LiteLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = LiteLLM(\n    model=\"gpt-3.5-turbo\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
Source code in src/distilabel/models/llms/litellm.py
class LiteLLM(AsyncLLM):\n    \"\"\"LiteLLM implementation running the async API client.\n\n    Attributes:\n        model: the model name to use for the LLM e.g. \"gpt-3.5-turbo\" or \"mistral/mistral-large\",\n            etc.\n        verbose: whether to log the LiteLLM client's logs. Defaults to `False`.\n        structured_output: a dictionary containing the structured output configuration configuration\n            using `instructor`. You can take a look at the dictionary structure in\n            `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n\n    Runtime parameters:\n        - `verbose`: whether to log the LiteLLM client's logs. Defaults to `False`.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import LiteLLM\n\n        llm = LiteLLM(model=\"gpt-3.5-turbo\")\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\n        Generate structured data:\n\n        ```python\n        from pydantic import BaseModel\n        from distilabel.models.llms import LiteLLM\n\n        class User(BaseModel):\n            name: str\n            last_name: str\n            id: int\n\n        llm = LiteLLM(\n            model=\"gpt-3.5-turbo\",\n            api_key=\"api.key\",\n            structured_output={\"schema\": User}\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n        ```\n    \"\"\"\n\n    model: str\n    verbose: RuntimeParameter[bool] = Field(\n        default=False, description=\"Whether to log the LiteLLM client's logs.\"\n    )\n    structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n        Field(\n            default=None,\n            description=\"The structured output format to use across all the 
generations.\",\n        )\n    )\n\n    _aclient: Optional[Callable] = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"\n        Loads the `acompletion` LiteLLM client to benefit from async requests.\n        \"\"\"\n        super().load()\n\n        try:\n            import litellm\n\n            litellm.telemetry = False\n        except ImportError as e:\n            raise ImportError(\n                \"LiteLLM Python client is not installed. Please install it using\"\n                \" `pip install litellm`.\"\n            ) from e\n        self._aclient = litellm.acompletion\n\n        if not self.verbose:\n            litellm.suppress_debug_info = True\n            for key in logging.Logger.manager.loggerDict.keys():\n                if \"litellm\" not in key.lower():\n                    continue\n                logging.getLogger(key).setLevel(logging.CRITICAL)\n\n        if self.structured_output:\n            result = self._prepare_structured_output(\n                structured_output=self.structured_output,\n                client=self._aclient,\n                framework=\"litellm\",\n            )\n            self._aclient = result.get(\"client\")\n            if structured_output := result.get(\"structured_output\"):\n                self.structured_output = structured_output\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    @validate_call\n    async def agenerate(  # type: ignore # noqa: C901\n        self,\n        input: FormattedInput,\n        num_generations: int = 1,\n        functions: Optional[List] = None,\n        function_call: Optional[str] = None,\n        temperature: Optional[float] = 1.0,\n        top_p: Optional[float] = 1.0,\n        stop: Optional[Union[str, list]] = None,\n        max_tokens: Optional[int] = None,\n        presence_penalty: Optional[float] = None,\n        frequency_penalty: Optional[float] = None,\n  
      logit_bias: Optional[dict] = None,\n        user: Optional[str] = None,\n        metadata: Optional[dict] = None,\n        api_base: Optional[str] = None,\n        api_version: Optional[str] = None,\n        api_key: Optional[str] = None,\n        model_list: Optional[list] = None,\n        mock_response: Optional[str] = None,\n        force_timeout: Optional[int] = 600,\n        custom_llm_provider: Optional[str] = None,\n    ) -> GenerateOutput:\n        \"\"\"Generates `num_generations` responses for the given input using the [LiteLLM async client](https://github.com/BerriAI/litellm).\n\n        Args:\n            input: a single input in chat format to generate responses for.\n            num_generations: the number of generations to create per input. Defaults to\n                `1`.\n            functions: a list of functions to apply to the conversation messages. Defaults to\n                `None`.\n            function_call: the name of the function to call within the conversation. Defaults\n                to `None`.\n            temperature: the temperature to use for the generation. Defaults to `1.0`.\n            top_p: the top-p value to use for the generation. Defaults to `1.0`.\n            stop: Up to 4 sequences where the LLM API will stop generating further tokens.\n                Defaults to `None`.\n            max_tokens: The maximum number of tokens in the generated completion. Defaults to\n                `None`.\n            presence_penalty: It is used to penalize new tokens based on their existence in the\n                text so far. Defaults to `None`.\n            frequency_penalty: It is used to penalize new tokens based on their frequency in the\n                text so far. Defaults to `None`.\n            logit_bias: Used to modify the probability of specific tokens appearing in the\n                completion. Defaults to `None`.\n            user: A unique identifier representing your end-user. 
This can help the LLM provider\n                to monitor and detect abuse. Defaults to `None`.\n            metadata: Pass in additional metadata to tag your completion calls - eg. prompt\n                version, details, etc. Defaults to `None`.\n            api_base: Base URL for the API. Defaults to `None`.\n            api_version: API version. Defaults to `None`.\n            api_key: API key. Defaults to `None`.\n            model_list: List of api base, version, keys. Defaults to `None`.\n            mock_response: If provided, return a mock completion response for testing or debugging\n                purposes. Defaults to `None`.\n            force_timeout: The maximum execution time in seconds for the completion request.\n                Defaults to `600`.\n            custom_llm_provider: Used for Non-OpenAI LLMs, Example usage for bedrock, set(iterable)\n                model=\"amazon.titan-tg1-large\" and custom_llm_provider=\"bedrock\". Defaults to\n                `None`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n        import litellm\n\n        structured_output = None\n        if isinstance(input, tuple):\n            input, structured_output = input\n            result = self._prepare_structured_output(\n                structured_output=structured_output,\n                client=self._aclient,\n                framework=\"litellm\",\n            )\n            self._aclient = result.get(\"client\")\n\n        if structured_output is None and self.structured_output is not None:\n            structured_output = self.structured_output\n\n        kwargs = {\n            \"model\": self.model,\n            \"messages\": input,\n            \"n\": num_generations,\n            \"functions\": functions,\n            \"function_call\": function_call,\n            \"temperature\": temperature,\n            \"top_p\": top_p,\n            \"stream\": False,\n           
 \"stop\": stop,\n            \"max_tokens\": max_tokens,\n            \"presence_penalty\": presence_penalty,\n            \"frequency_penalty\": frequency_penalty,\n            \"logit_bias\": logit_bias,\n            \"user\": user,\n            \"metadata\": metadata,\n            \"api_base\": api_base,\n            \"api_version\": api_version,\n            \"api_key\": api_key,\n            \"model_list\": model_list,\n            \"mock_response\": mock_response,\n            \"force_timeout\": force_timeout,\n            \"custom_llm_provider\": custom_llm_provider,\n        }\n        if structured_output:\n            kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n        async def _call_aclient_until_n_choices() -> List[\"Choices\"]:\n            choices = []\n            while len(choices) < num_generations:\n                completion = await self._aclient(**kwargs)  # type: ignore\n                if not self.structured_output:\n                    completion = completion.choices\n                choices.extend(completion)\n            return choices\n\n        # litellm.drop_params is used to en/disable sending **kwargs parameters to the API if they cannot be used\n        try:\n            litellm.drop_params = False\n            choices = await _call_aclient_until_n_choices()\n        except litellm.exceptions.APIError as e:\n            if \"does not support parameters\" in str(e):\n                litellm.drop_params = True\n                choices = await _call_aclient_until_n_choices()\n            else:\n                raise e\n\n        generations = []\n\n        if self.structured_output:\n            generations.append([choice.model_dump_json() for choice in choices])\n            return generations\n\n        for choice in choices:\n            if (content := choice.message.content) is None:\n                self._logger.warning(  # type: ignore\n                    f\"Received no response using LiteLLM client (model: 
'{self.model}').\"\n                    f\" Finish reason was: {choice.finish_reason}\"\n                )\n            generations.append(content)\n        return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LiteLLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LiteLLM.load","title":"load()","text":"

Loads the acompletion LiteLLM client to benefit from async requests.

Source code in src/distilabel/models/llms/litellm.py
def load(self) -> None:\n    \"\"\"\n    Loads the `acompletion` LiteLLM client to benefit from async requests.\n    \"\"\"\n    super().load()\n\n    try:\n        import litellm\n\n        litellm.telemetry = False\n    except ImportError as e:\n        raise ImportError(\n            \"LiteLLM Python client is not installed. Please install it using\"\n            \" `pip install litellm`.\"\n        ) from e\n    self._aclient = litellm.acompletion\n\n    if not self.verbose:\n        litellm.suppress_debug_info = True\n        for key in logging.Logger.manager.loggerDict.keys():\n            if \"litellm\" not in key.lower():\n                continue\n            logging.getLogger(key).setLevel(logging.CRITICAL)\n\n    if self.structured_output:\n        result = self._prepare_structured_output(\n            structured_output=self.structured_output,\n            client=self._aclient,\n            framework=\"litellm\",\n        )\n        self._aclient = result.get(\"client\")\n        if structured_output := result.get(\"structured_output\"):\n            self.structured_output = structured_output\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LiteLLM.agenerate","title":"agenerate(input, num_generations=1, functions=None, function_call=None, temperature=1.0, top_p=1.0, stop=None, max_tokens=None, presence_penalty=None, frequency_penalty=None, logit_bias=None, user=None, metadata=None, api_base=None, api_version=None, api_key=None, model_list=None, mock_response=None, force_timeout=600, custom_llm_provider=None) async","text":"

Generates num_generations responses for the given input using the LiteLLM async client.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate responses for.

required num_generations int

the number of generations to create per input. Defaults to 1.

1 functions Optional[List]

a list of functions to apply to the conversation messages. Defaults to None.

None function_call Optional[str]

the name of the function to call within the conversation. Defaults to None.

None temperature Optional[float]

the temperature to use for the generation. Defaults to 1.0.

1.0 top_p Optional[float]

the top-p value to use for the generation. Defaults to 1.0.

1.0 stop Optional[Union[str, list]]

Up to 4 sequences where the LLM API will stop generating further tokens. Defaults to None.

None max_tokens Optional[int]

The maximum number of tokens in the generated completion. Defaults to None.

None presence_penalty Optional[float]

It is used to penalize new tokens based on their existence in the text so far. Defaults to None.

None frequency_penalty Optional[float]

It is used to penalize new tokens based on their frequency in the text so far. Defaults to None.

None logit_bias Optional[dict]

Used to modify the probability of specific tokens appearing in the completion. Defaults to None.

None user Optional[str]

A unique identifier representing your end-user. This can help the LLM provider to monitor and detect abuse. Defaults to None.

None metadata Optional[dict]

Pass in additional metadata to tag your completion calls - eg. prompt version, details, etc. Defaults to None.

None api_base Optional[str]

Base URL for the API. Defaults to None.

None api_version Optional[str]

API version. Defaults to None.

None api_key Optional[str]

API key. Defaults to None.

None model_list Optional[list]

List of api base, version, keys. Defaults to None.

None mock_response Optional[str]

If provided, return a mock completion response for testing or debugging purposes. Defaults to None.

None force_timeout Optional[int]

The maximum execution time in seconds for the completion request. Defaults to 600.

600 custom_llm_provider Optional[str]

Used for non-OpenAI LLMs. Example usage for Bedrock: set model=\"amazon.titan-tg1-large\" and custom_llm_provider=\"bedrock\". Defaults to None.

None

Returns:

Type Description GenerateOutput

A list of lists of strings containing the generated responses for each input.

Source code in src/distilabel/models/llms/litellm.py
@validate_call\nasync def agenerate(  # type: ignore # noqa: C901\n    self,\n    input: FormattedInput,\n    num_generations: int = 1,\n    functions: Optional[List] = None,\n    function_call: Optional[str] = None,\n    temperature: Optional[float] = 1.0,\n    top_p: Optional[float] = 1.0,\n    stop: Optional[Union[str, list]] = None,\n    max_tokens: Optional[int] = None,\n    presence_penalty: Optional[float] = None,\n    frequency_penalty: Optional[float] = None,\n    logit_bias: Optional[dict] = None,\n    user: Optional[str] = None,\n    metadata: Optional[dict] = None,\n    api_base: Optional[str] = None,\n    api_version: Optional[str] = None,\n    api_key: Optional[str] = None,\n    model_list: Optional[list] = None,\n    mock_response: Optional[str] = None,\n    force_timeout: Optional[int] = 600,\n    custom_llm_provider: Optional[str] = None,\n) -> GenerateOutput:\n    \"\"\"Generates `num_generations` responses for the given input using the [LiteLLM async client](https://github.com/BerriAI/litellm).\n\n    Args:\n        input: a single input in chat format to generate responses for.\n        num_generations: the number of generations to create per input. Defaults to\n            `1`.\n        functions: a list of functions to apply to the conversation messages. Defaults to\n            `None`.\n        function_call: the name of the function to call within the conversation. Defaults\n            to `None`.\n        temperature: the temperature to use for the generation. Defaults to `1.0`.\n        top_p: the top-p value to use for the generation. Defaults to `1.0`.\n        stop: Up to 4 sequences where the LLM API will stop generating further tokens.\n            Defaults to `None`.\n        max_tokens: The maximum number of tokens in the generated completion. Defaults to\n            `None`.\n        presence_penalty: It is used to penalize new tokens based on their existence in the\n            text so far. 
Defaults to `None`.\n        frequency_penalty: It is used to penalize new tokens based on their frequency in the\n            text so far. Defaults to `None`.\n        logit_bias: Used to modify the probability of specific tokens appearing in the\n            completion. Defaults to `None`.\n        user: A unique identifier representing your end-user. This can help the LLM provider\n            to monitor and detect abuse. Defaults to `None`.\n        metadata: Pass in additional metadata to tag your completion calls - eg. prompt\n            version, details, etc. Defaults to `None`.\n        api_base: Base URL for the API. Defaults to `None`.\n        api_version: API version. Defaults to `None`.\n        api_key: API key. Defaults to `None`.\n        model_list: List of api base, version, keys. Defaults to `None`.\n        mock_response: If provided, return a mock completion response for testing or debugging\n            purposes. Defaults to `None`.\n        force_timeout: The maximum execution time in seconds for the completion request.\n            Defaults to `600`.\n        custom_llm_provider: Used for Non-OpenAI LLMs, Example usage for bedrock, set(iterable)\n            model=\"amazon.titan-tg1-large\" and custom_llm_provider=\"bedrock\". 
Defaults to\n            `None`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n    import litellm\n\n    structured_output = None\n    if isinstance(input, tuple):\n        input, structured_output = input\n        result = self._prepare_structured_output(\n            structured_output=structured_output,\n            client=self._aclient,\n            framework=\"litellm\",\n        )\n        self._aclient = result.get(\"client\")\n\n    if structured_output is None and self.structured_output is not None:\n        structured_output = self.structured_output\n\n    kwargs = {\n        \"model\": self.model,\n        \"messages\": input,\n        \"n\": num_generations,\n        \"functions\": functions,\n        \"function_call\": function_call,\n        \"temperature\": temperature,\n        \"top_p\": top_p,\n        \"stream\": False,\n        \"stop\": stop,\n        \"max_tokens\": max_tokens,\n        \"presence_penalty\": presence_penalty,\n        \"frequency_penalty\": frequency_penalty,\n        \"logit_bias\": logit_bias,\n        \"user\": user,\n        \"metadata\": metadata,\n        \"api_base\": api_base,\n        \"api_version\": api_version,\n        \"api_key\": api_key,\n        \"model_list\": model_list,\n        \"mock_response\": mock_response,\n        \"force_timeout\": force_timeout,\n        \"custom_llm_provider\": custom_llm_provider,\n    }\n    if structured_output:\n        kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n    async def _call_aclient_until_n_choices() -> List[\"Choices\"]:\n        choices = []\n        while len(choices) < num_generations:\n            completion = await self._aclient(**kwargs)  # type: ignore\n            if not self.structured_output:\n                completion = completion.choices\n            choices.extend(completion)\n        return choices\n\n    # litellm.drop_params is used to en/disable sending **kwargs 
parameters to the API if they cannot be used\n    try:\n        litellm.drop_params = False\n        choices = await _call_aclient_until_n_choices()\n    except litellm.exceptions.APIError as e:\n        if \"does not support parameters\" in str(e):\n            litellm.drop_params = True\n            choices = await _call_aclient_until_n_choices()\n        else:\n            raise e\n\n    generations = []\n\n    if self.structured_output:\n        generations.append([choice.model_dump_json() for choice in choices])\n        return generations\n\n    for choice in choices:\n        if (content := choice.message.content) is None:\n            self._logger.warning(  # type: ignore\n                f\"Received no response using LiteLLM client (model: '{self.model}').\"\n                f\" Finish reason was: {choice.finish_reason}\"\n            )\n        generations.append(content)\n    return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM","title":"LlamaCppLLM","text":"

Bases: LLM

llama.cpp LLM implementation running the Python bindings for the C++ code.

Attributes:

Name Type Description model_path RuntimeParameter[FilePath]

contains the path to the GGUF quantized model, compatible with the installed version of the llama.cpp Python bindings.

n_gpu_layers RuntimeParameter[int]

the number of layers to use for the GPU. Defaults to -1, meaning that the available GPU device will be used.

chat_format Optional[RuntimeParameter[str]]

the chat format to use for the model. Defaults to None, which means the Llama format will be used.

n_ctx int

the context size to use for the model. Defaults to 512.

n_batch int

the prompt processing maximum batch size to use for the model. Defaults to 512.

seed int

random seed to use for the generation. Defaults to 4294967295.

verbose RuntimeParameter[bool]

whether to print verbose output. Defaults to False.

structured_output Optional[RuntimeParameter[OutlinesStructuredOutputType]]

a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput. Defaults to None.

extra_kwargs Optional[RuntimeParameter[Dict[str, Any]]]

additional dictionary of keyword arguments that will be passed to the Llama class of llama_cpp library. Defaults to {}.

_model Optional[Llama]

the Llama model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method.

Runtime parameters
  • model_path: the path to the GGUF quantized model.
  • n_gpu_layers: the number of layers to use for the GPU. Defaults to -1.
  • chat_format: the chat format to use for the model. Defaults to None.
  • verbose: whether to print verbose output. Defaults to False.
  • extra_kwargs: additional dictionary of keyword arguments that will be passed to the Llama class of llama_cpp library. Defaults to {}.
References
  • llama.cpp
  • llama-cpp-python

Examples:

Generate text:

from pathlib import Path\nfrom distilabel.models.llms import LlamaCppLLM\n\n# You can follow along this example downloading the following model running the following\n# command in the terminal, that will download the model to the `Downloads` folder:\n# curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nllm = LlamaCppLLM(\n    model_path=str(Path.home() / model_path),\n    n_gpu_layers=-1,  # To use the GPU if available\n    n_ctx=1024,       # Set the context size\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pathlib import Path\nfrom pydantic import BaseModel\nfrom distilabel.models.llms import LlamaCppLLM\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = LlamaCppLLM(\n    model_path=str(Path.home() / model_path),  # type: ignore\n    n_gpu_layers=-1,\n    n_ctx=1024,\n    structured_output={\"format\": \"json\", \"schema\": User},\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
Source code in src/distilabel/models/llms/llamacpp.py
class LlamaCppLLM(LLM):\n    \"\"\"llama.cpp LLM implementation running the Python bindings for the C++ code.\n\n    Attributes:\n        model_path: contains the path to the GGUF quantized model, compatible with the\n            installed version of the `llama.cpp` Python bindings.\n        n_gpu_layers: the number of layers to use for the GPU. Defaults to `-1`, meaning that\n            the available GPU device will be used.\n        chat_format: the chat format to use for the model. Defaults to `None`, which means the\n            Llama format will be used.\n        n_ctx: the context size to use for the model. Defaults to `512`.\n        n_batch: the prompt processing maximum batch size to use for the model. Defaults to `512`.\n        seed: random seed to use for the generation. Defaults to `4294967295`.\n        verbose: whether to print verbose output. Defaults to `False`.\n        structured_output: a dictionary containing the structured output configuration or if more\n            fine-grained control is needed, an instance of `OutlinesStructuredOutput`. Defaults to None.\n        extra_kwargs: additional dictionary of keyword arguments that will be passed to the\n            `Llama` class of `llama_cpp` library. Defaults to `{}`.\n        _model: the Llama model instance. This attribute is meant to be used internally and\n            should not be accessed directly. It will be set in the `load` method.\n\n    Runtime parameters:\n        - `model_path`: the path to the GGUF quantized model.\n        - `n_gpu_layers`: the number of layers to use for the GPU. Defaults to `-1`.\n        - `chat_format`: the chat format to use for the model. Defaults to `None`.\n        - `verbose`: whether to print verbose output. Defaults to `False`.\n        - `extra_kwargs`: additional dictionary of keyword arguments that will be passed to the\n            `Llama` class of `llama_cpp` library. 
Defaults to `{}`.\n\n    References:\n        - [`llama.cpp`](https://github.com/ggerganov/llama.cpp)\n        - [`llama-cpp-python`](https://github.com/abetlen/llama-cpp-python)\n\n    Examples:\n        Generate text:\n\n        ```python\n        from pathlib import Path\n        from distilabel.models.llms import LlamaCppLLM\n\n        # You can follow along this example downloading the following model running the following\n        # command in the terminal, that will download the model to the `Downloads` folder:\n        # curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n\n        model_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\n        llm = LlamaCppLLM(\n            model_path=str(Path.home() / model_path),\n            n_gpu_layers=-1,  # To use the GPU if available\n            n_ctx=1024,       # Set the context size\n        )\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Generate structured data:\n\n        ```python\n        from pathlib import Path\n        from distilabel.models.llms import LlamaCppLLM\n\n        model_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\n        class User(BaseModel):\n            name: str\n            last_name: str\n            id: int\n\n        llm = LlamaCppLLM(\n            model_path=str(Path.home() / model_path),  # type: ignore\n            n_gpu_layers=-1,\n            n_ctx=1024,\n            structured_output={\"format\": \"json\", \"schema\": Character},\n        )\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n        ```\n    \"\"\"\n\n    model_path: RuntimeParameter[FilePath] = 
Field(\n        default=None, description=\"The path to the GGUF quantized model.\", exclude=True\n    )\n    n_gpu_layers: RuntimeParameter[int] = Field(\n        default=-1,\n        description=\"The number of layers that will be loaded in the GPU.\",\n    )\n    chat_format: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The chat format to use for the model. Defaults to `None`, which means the Llama format will be used.\",\n    )\n\n    n_ctx: int = 512\n    n_batch: int = 512\n    seed: int = 4294967295\n    verbose: RuntimeParameter[bool] = Field(\n        default=False,\n        description=\"Whether to print verbose output from llama.cpp library.\",\n    )\n    extra_kwargs: Optional[RuntimeParameter[Dict[str, Any]]] = Field(\n        default_factory=dict,\n        description=\"Additional dictionary of keyword arguments that will be passed to the\"\n        \" `Llama` class of `llama_cpp` library. See all the supported arguments at: \"\n        \"https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__\",\n    )\n    structured_output: Optional[RuntimeParameter[OutlinesStructuredOutputType]] = Field(\n        default=None,\n        description=\"The structured output format to use across all the generations.\",\n    )\n\n    _logits_processor: Optional[\"LogitsProcessorList\"] = PrivateAttr(default=None)\n    _model: Optional[\"Llama\"] = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"Loads the `Llama` model from the `model_path`.\"\"\"\n        try:\n            from llama_cpp import Llama\n        except ImportError as ie:\n            raise ImportError(\n                \"The `llama_cpp` package is required to use the `LlamaCppLLM` class.\"\n            ) from ie\n\n        self._model = Llama(\n            model_path=self.model_path.as_posix(),  # type: ignore\n            seed=self.seed,\n            n_ctx=self.n_ctx,\n            n_batch=self.n_batch,\n           
 chat_format=self.chat_format,\n            n_gpu_layers=self.n_gpu_layers,\n            verbose=self.verbose,\n            **self.extra_kwargs,\n        )\n\n        if self.structured_output:\n            self._logits_processor = self._prepare_structured_output(\n                self.structured_output\n            )\n\n        # NOTE: Here because of the custom `logging` interface used, since it will create the logging name\n        # out of the model name, which won't be available until the `Llama` instance is created.\n        super().load()\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self._model.model_path  # type: ignore\n\n    @validate_call\n    def generate(  # type: ignore\n        self,\n        inputs: List[FormattedInput],\n        num_generations: int = 1,\n        max_new_tokens: int = 128,\n        frequency_penalty: float = 0.0,\n        presence_penalty: float = 0.0,\n        temperature: float = 1.0,\n        top_p: float = 1.0,\n        extra_generation_kwargs: Optional[Dict[str, Any]] = None,\n    ) -> List[GenerateOutput]:\n        \"\"\"Generates `num_generations` responses for the given input using the Llama model.\n\n        Args:\n            inputs: a list of inputs in chat format to generate responses for.\n            num_generations: the number of generations to create per input. Defaults to\n                `1`.\n            max_new_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `128`.\n            frequency_penalty: the repetition penalty to use for the generation. Defaults\n                to `0.0`.\n            presence_penalty: the presence penalty to use for the generation. Defaults to\n                `0.0`.\n            temperature: the temperature to use for the generation. Defaults to `0.1`.\n            top_p: the top-p value to use for the generation. 
Defaults to `1.0`.\n            extra_generation_kwargs: dictionary with additional arguments to be passed to\n                the `create_chat_completion` method. Reference at\n                https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n        structured_output = None\n        batch_outputs = []\n        for input in inputs:\n            if isinstance(input, tuple):\n                input, structured_output = input\n            elif self.structured_output:\n                structured_output = self.structured_output\n\n            outputs = []\n            for _ in range(num_generations):\n                # NOTE(plaguss): There seems to be a bug in how the logits processor\n                # is used. Basically it consumes the FSM internally, and it isn't reinitialized\n                # after each generation, so subsequent calls yield nothing. 
This is a workaround\n                # until is fixed in the `llama_cpp` or `outlines` libraries.\n                if structured_output:\n                    self._logits_processor = self._prepare_structured_output(\n                        structured_output\n                    )\n                chat_completions: \"CreateChatCompletionResponse\" = (\n                    self._model.create_chat_completion(  # type: ignore\n                        messages=input,  # type: ignore\n                        max_tokens=max_new_tokens,\n                        frequency_penalty=frequency_penalty,\n                        presence_penalty=presence_penalty,\n                        temperature=temperature,\n                        top_p=top_p,\n                        logits_processor=self._logits_processor,\n                        **(extra_generation_kwargs or {}),\n                    )\n                )\n                outputs.append(chat_completions[\"choices\"][0][\"message\"][\"content\"])\n            batch_outputs.append(outputs)\n        return batch_outputs\n\n    def _prepare_structured_output(\n        self, structured_output: Optional[OutlinesStructuredOutputType] = None\n    ) -> Union[\"LogitsProcessorList\", None]:\n        \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n        Args:\n            structured_output: the configuration dict to prepare the structured output.\n\n        Returns:\n            The callable that will be used to guide the generation of the model.\n        \"\"\"\n        from distilabel.steps.tasks.structured_outputs.outlines import (\n            prepare_guided_output,\n        )\n\n        result = prepare_guided_output(structured_output, \"llamacpp\", self._model)\n        if (schema := result.get(\"schema\")) and self.structured_output:\n            self.structured_output[\"schema\"] = schema\n        return result[\"processor\"]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM.load","title":"load()","text":"

Loads the Llama model from the model_path.

Source code in src/distilabel/models/llms/llamacpp.py
def load(self) -> None:\n    \"\"\"Loads the `Llama` model from the `model_path`.\"\"\"\n    try:\n        from llama_cpp import Llama\n    except ImportError as ie:\n        raise ImportError(\n            \"The `llama_cpp` package is required to use the `LlamaCppLLM` class.\"\n        ) from ie\n\n    self._model = Llama(\n        model_path=self.model_path.as_posix(),  # type: ignore\n        seed=self.seed,\n        n_ctx=self.n_ctx,\n        n_batch=self.n_batch,\n        chat_format=self.chat_format,\n        n_gpu_layers=self.n_gpu_layers,\n        verbose=self.verbose,\n        **self.extra_kwargs,\n    )\n\n    if self.structured_output:\n        self._logits_processor = self._prepare_structured_output(\n            self.structured_output\n        )\n\n    # NOTE: Here because of the custom `logging` interface used, since it will create the logging name\n    # out of the model name, which won't be available until the `Llama` instance is created.\n    super().load()\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM.generate","title":"generate(inputs, num_generations=1, max_new_tokens=128, frequency_penalty=0.0, presence_penalty=0.0, temperature=1.0, top_p=1.0, extra_generation_kwargs=None)","text":"

Generates num_generations responses for the given input using the Llama model.

Parameters:

Name Type Description Default inputs List[FormattedInput]

a list of inputs in chat format to generate responses for.

required num_generations int

the number of generations to create per input. Defaults to 1.

1 max_new_tokens int

the maximum number of new tokens that the model will generate. Defaults to 128.

128 frequency_penalty float

the repetition penalty to use for the generation. Defaults to 0.0.

0.0 presence_penalty float

the presence penalty to use for the generation. Defaults to 0.0.

0.0 temperature float

the temperature to use for the generation. Defaults to 1.0.

1.0 top_p float

the top-p value to use for the generation. Defaults to 1.0.

1.0 extra_generation_kwargs Optional[Dict[str, Any]]

dictionary with additional arguments to be passed to the create_chat_completion method. Reference at https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion

None

Returns:

Type Description List[GenerateOutput]

A list of lists of strings containing the generated responses for each input.

Source code in src/distilabel/models/llms/llamacpp.py
@validate_call\ndef generate(  # type: ignore\n    self,\n    inputs: List[FormattedInput],\n    num_generations: int = 1,\n    max_new_tokens: int = 128,\n    frequency_penalty: float = 0.0,\n    presence_penalty: float = 0.0,\n    temperature: float = 1.0,\n    top_p: float = 1.0,\n    extra_generation_kwargs: Optional[Dict[str, Any]] = None,\n) -> List[GenerateOutput]:\n    \"\"\"Generates `num_generations` responses for the given input using the Llama model.\n\n    Args:\n        inputs: a list of inputs in chat format to generate responses for.\n        num_generations: the number of generations to create per input. Defaults to\n            `1`.\n        max_new_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `128`.\n        frequency_penalty: the repetition penalty to use for the generation. Defaults\n            to `0.0`.\n        presence_penalty: the presence penalty to use for the generation. Defaults to\n            `0.0`.\n        temperature: the temperature to use for the generation. Defaults to `0.1`.\n        top_p: the top-p value to use for the generation. Defaults to `1.0`.\n        extra_generation_kwargs: dictionary with additional arguments to be passed to\n            the `create_chat_completion` method. Reference at\n            https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n    structured_output = None\n    batch_outputs = []\n    for input in inputs:\n        if isinstance(input, tuple):\n            input, structured_output = input\n        elif self.structured_output:\n            structured_output = self.structured_output\n\n        outputs = []\n        for _ in range(num_generations):\n            # NOTE(plaguss): There seems to be a bug in how the logits processor\n            # is used. 
Basically it consumes the FSM internally, and it isn't reinitialized\n            # after each generation, so subsequent calls yield nothing. This is a workaround\n            # until is fixed in the `llama_cpp` or `outlines` libraries.\n            if structured_output:\n                self._logits_processor = self._prepare_structured_output(\n                    structured_output\n                )\n            chat_completions: \"CreateChatCompletionResponse\" = (\n                self._model.create_chat_completion(  # type: ignore\n                    messages=input,  # type: ignore\n                    max_tokens=max_new_tokens,\n                    frequency_penalty=frequency_penalty,\n                    presence_penalty=presence_penalty,\n                    temperature=temperature,\n                    top_p=top_p,\n                    logits_processor=self._logits_processor,\n                    **(extra_generation_kwargs or {}),\n                )\n            )\n            outputs.append(chat_completions[\"choices\"][0][\"message\"][\"content\"])\n        batch_outputs.append(outputs)\n    return batch_outputs\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM._prepare_structured_output","title":"_prepare_structured_output(structured_output=None)","text":"

Creates the appropriate function to filter tokens to generate structured outputs.

Parameters:

Name Type Description Default structured_output Optional[OutlinesStructuredOutputType]

the configuration dict to prepare the structured output.

None

Returns:

Type Description Union[LogitsProcessorList, None]

The callable that will be used to guide the generation of the model.

Source code in src/distilabel/models/llms/llamacpp.py
def _prepare_structured_output(\n    self, structured_output: Optional[OutlinesStructuredOutputType] = None\n) -> Union[\"LogitsProcessorList\", None]:\n    \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n    Args:\n        structured_output: the configuration dict to prepare the structured output.\n\n    Returns:\n        The callable that will be used to guide the generation of the model.\n    \"\"\"\n    from distilabel.steps.tasks.structured_outputs.outlines import (\n        prepare_guided_output,\n    )\n\n    result = prepare_guided_output(structured_output, \"llamacpp\", self._model)\n    if (schema := result.get(\"schema\")) and self.structured_output:\n        self.structured_output[\"schema\"] = schema\n    return result[\"processor\"]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MistralLLM","title":"MistralLLM","text":"

Bases: AsyncLLM

Mistral LLM implementation running the async API client.

Attributes:

Name Type Description model str

the model name to use for the LLM e.g. \"mistral-tiny\", \"mistral-large\", etc.

endpoint str

the endpoint to use for the Mistral API. Defaults to \"https://api.mistral.ai\".

api_key Optional[RuntimeParameter[SecretStr]]

the API key to authenticate the requests to the Mistral API. Defaults to None which means that the value set for the environment variable MISTRAL_API_KEY will be used, or None if not set.

max_retries RuntimeParameter[int]

the maximum number of retries to attempt when a request fails. Defaults to 6.

timeout RuntimeParameter[int]

the maximum time in seconds to wait for a response. Defaults to 120.

max_concurrent_requests RuntimeParameter[int]

the maximum number of concurrent requests to send. Defaults to 64.

structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]]

a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

_api_key_env_var str

the name of the environment variable to use for the API key. It is meant to be used internally.

_aclient Optional[Mistral]

the Mistral client to use for the Mistral API. It is meant to be used internally. Set in the load method.

Runtime parameters
  • api_key: the API key to authenticate the requests to the Mistral API.
  • max_retries: the maximum number of retries to attempt when a request fails. Defaults to 6.
  • timeout: the maximum time in seconds to wait for a response. Defaults to 120.
  • max_concurrent_requests: the maximum number of concurrent requests to send. Defaults to 64.

Examples:

Generate text:

from distilabel.models.llms import MistralLLM\n\nllm = MistralLLM(model=\"open-mixtral-8x22b\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pydantic import BaseModel\nfrom distilabel.models.llms import MistralLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = MistralLLM(\n    model=\"open-mixtral-8x22b\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
Source code in src/distilabel/models/llms/mistral.py
class MistralLLM(AsyncLLM):\n    \"\"\"Mistral LLM implementation running the async API client.\n\n    Attributes:\n        model: the model name to use for the LLM e.g. \"mistral-tiny\", \"mistral-large\", etc.\n        endpoint: the endpoint to use for the Mistral API. Defaults to \"https://api.mistral.ai\".\n        api_key: the API key to authenticate the requests to the Mistral API. Defaults to `None` which\n            means that the value set for the environment variable `OPENAI_API_KEY` will be used, or\n            `None` if not set.\n        max_retries: the maximum number of retries to attempt when a request fails. Defaults to `5`.\n        timeout: the maximum time in seconds to wait for a response. Defaults to `120`.\n        max_concurrent_requests: the maximum number of concurrent requests to send. Defaults\n            to `64`.\n        structured_output: a dictionary containing the structured output configuration configuration\n            using `instructor`. You can take a look at the dictionary structure in\n            `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n        _api_key_env_var: the name of the environment variable to use for the API key. It is meant to\n            be used internally.\n        _aclient: the `Mistral` to use for the Mistral API. It is meant to be used internally.\n            Set in the `load` method.\n\n    Runtime parameters:\n        - `api_key`: the API key to authenticate the requests to the Mistral API.\n        - `max_retries`: the maximum number of retries to attempt when a request fails.\n            Defaults to `5`.\n        - `timeout`: the maximum time in seconds to wait for a response. 
Defaults to `120`.\n        - `max_concurrent_requests`: the maximum number of concurrent requests to send.\n            Defaults to `64`.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import MistralLLM\n\n        llm = MistralLLM(model=\"open-mixtral-8x22b\")\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\n        Generate structured data:\n\n        ```python\n        from pydantic import BaseModel\n        from distilabel.models.llms import MistralLLM\n\n        class User(BaseModel):\n            name: str\n            last_name: str\n            id: int\n\n        llm = MistralLLM(\n            model=\"open-mixtral-8x22b\",\n            api_key=\"api.key\",\n            structured_output={\"schema\": User}\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n        ```\n    \"\"\"\n\n    model: str\n    endpoint: str = \"https://api.mistral.ai\"\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(_MISTRALAI_API_KEY_ENV_VAR_NAME),\n        description=\"The API key to authenticate the requests to the Mistral API.\",\n    )\n    max_retries: RuntimeParameter[int] = Field(\n        default=6,\n        description=\"The maximum number of times to retry the request to the API before\"\n        \" failing.\",\n    )\n    timeout: RuntimeParameter[int] = Field(\n        default=120,\n        description=\"The maximum time in seconds to wait for a response from the API.\",\n    )\n    max_concurrent_requests: RuntimeParameter[int] = Field(\n        default=64, description=\"The maximum number of concurrent requests to send.\"\n    )\n    structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n        Field(\n            
default=None,\n            description=\"The structured output format to use across all the generations.\",\n        )\n    )\n\n    _num_generations_param_supported = False\n\n    _api_key_env_var: str = PrivateAttr(_MISTRALAI_API_KEY_ENV_VAR_NAME)\n    _aclient: Optional[\"Mistral\"] = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"Loads the `Mistral` client to benefit from async requests.\"\"\"\n        super().load()\n\n        try:\n            from mistralai import Mistral\n        except ImportError as ie:\n            raise ImportError(\n                \"MistralAI Python client is not installed. Please install it using\"\n                \" `pip install mistralai`.\"\n            ) from ie\n\n        if self.api_key is None:\n            raise ValueError(\n                f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n                f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n            )\n\n        self._aclient = Mistral(\n            api_key=self.api_key.get_secret_value(),\n            endpoint=self.endpoint,\n            max_retries=self.max_retries,  # type: ignore\n            timeout=self.timeout,  # type: ignore\n            max_concurrent_requests=self.max_concurrent_requests,  # type: ignore\n        )\n\n        if self.structured_output:\n            result = self._prepare_structured_output(\n                structured_output=self.structured_output,\n                client=self._aclient,\n                framework=\"mistral\",\n            )\n            self._aclient = result.get(\"client\")  # type: ignore\n            if structured_output := result.get(\"structured_output\"):\n                self.structured_output = structured_output\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    # TODO: add `num_generations` parameter once Mistral 
client allows `n` parameter\n    @validate_call\n    async def agenerate(  # type: ignore\n        self,\n        input: FormattedInput,\n        max_new_tokens: Optional[int] = None,\n        temperature: Optional[float] = None,\n        top_p: Optional[float] = None,\n    ) -> GenerateOutput:\n        \"\"\"Generates `num_generations` responses for the given input using the MistralAI async\n        client.\n\n        Args:\n            input: a single input in chat format to generate responses for.\n            max_new_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `128`.\n            temperature: the temperature to use for the generation. Defaults to `0.1`.\n            top_p: the top-p value to use for the generation. Defaults to `1.0`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n        structured_output = None\n        if isinstance(input, tuple):\n            input, structured_output = input\n            result = self._prepare_structured_output(\n                structured_output=structured_output,\n                client=self._aclient,\n                framework=\"mistral\",\n            )\n            self._aclient = result.get(\"client\")\n\n        if structured_output is None and self.structured_output is not None:\n            structured_output = self.structured_output\n\n        kwargs = {\n            \"messages\": input,  # type: ignore\n            \"model\": self.model,\n            \"max_tokens\": max_new_tokens,\n            \"temperature\": temperature,\n            \"top_p\": top_p,\n        }\n        generations = []\n        if structured_output:\n            kwargs = self._prepare_kwargs(kwargs, structured_output)\n            # TODO:\u00a0This should work just with the _aclient.chat method, but it's not working.\n            # We need to check instructor and see if we can create a PR.\n            
completion = await self._aclient.chat.completions.create(**kwargs)  # type: ignore\n        else:\n            # completion = await self._aclient.chat(**kwargs)  # type: ignore\n            completion = await self._aclient.chat.complete_async(**kwargs)  # type: ignore\n\n        if structured_output:\n            generations.append(completion.model_dump_json())\n            return generations\n\n        for choice in completion.choices:\n            if (content := choice.message.content) is None:\n                self._logger.warning(  # type: ignore\n                    f\"Received no response using MistralAI client (model: '{self.model}').\"\n                    f\" Finish reason was: {choice.finish_reason}\"\n                )\n            generations.append(content)\n        return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MistralLLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MistralLLM.load","title":"load()","text":"

Loads the Mistral client to benefit from async requests.

Source code in src/distilabel/models/llms/mistral.py
def load(self) -> None:\n    \"\"\"Loads the `Mistral` client to benefit from async requests.\"\"\"\n    super().load()\n\n    try:\n        from mistralai import Mistral\n    except ImportError as ie:\n        raise ImportError(\n            \"MistralAI Python client is not installed. Please install it using\"\n            \" `pip install mistralai`.\"\n        ) from ie\n\n    if self.api_key is None:\n        raise ValueError(\n            f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n            f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n        )\n\n    self._aclient = Mistral(\n        api_key=self.api_key.get_secret_value(),\n        endpoint=self.endpoint,\n        max_retries=self.max_retries,  # type: ignore\n        timeout=self.timeout,  # type: ignore\n        max_concurrent_requests=self.max_concurrent_requests,  # type: ignore\n    )\n\n    if self.structured_output:\n        result = self._prepare_structured_output(\n            structured_output=self.structured_output,\n            client=self._aclient,\n            framework=\"mistral\",\n        )\n        self._aclient = result.get(\"client\")  # type: ignore\n        if structured_output := result.get(\"structured_output\"):\n            self.structured_output = structured_output\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MistralLLM.agenerate","title":"agenerate(input, max_new_tokens=None, temperature=None, top_p=None) async","text":"

Generates a response for the given input using the MistralAI async client. A num_generations parameter is not supported for this client yet.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate responses for.

required max_new_tokens Optional[int]

the maximum number of new tokens that the model will generate. Defaults to 128.

None temperature Optional[float]

the temperature to use for the generation. Defaults to 0.1.

None top_p Optional[float]

the top-p value to use for the generation. Defaults to 1.0.

None

Returns:

Type Description GenerateOutput

A list containing the generated responses for the given input.

Source code in src/distilabel/models/llms/mistral.py
@validate_call\nasync def agenerate(  # type: ignore\n    self,\n    input: FormattedInput,\n    max_new_tokens: Optional[int] = None,\n    temperature: Optional[float] = None,\n    top_p: Optional[float] = None,\n) -> GenerateOutput:\n    \"\"\"Generates `num_generations` responses for the given input using the MistralAI async\n    client.\n\n    Args:\n        input: a single input in chat format to generate responses for.\n        max_new_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `128`.\n        temperature: the temperature to use for the generation. Defaults to `0.1`.\n        top_p: the top-p value to use for the generation. Defaults to `1.0`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n    structured_output = None\n    if isinstance(input, tuple):\n        input, structured_output = input\n        result = self._prepare_structured_output(\n            structured_output=structured_output,\n            client=self._aclient,\n            framework=\"mistral\",\n        )\n        self._aclient = result.get(\"client\")\n\n    if structured_output is None and self.structured_output is not None:\n        structured_output = self.structured_output\n\n    kwargs = {\n        \"messages\": input,  # type: ignore\n        \"model\": self.model,\n        \"max_tokens\": max_new_tokens,\n        \"temperature\": temperature,\n        \"top_p\": top_p,\n    }\n    generations = []\n    if structured_output:\n        kwargs = self._prepare_kwargs(kwargs, structured_output)\n        # TODO:\u00a0This should work just with the _aclient.chat method, but it's not working.\n        # We need to check instructor and see if we can create a PR.\n        completion = await self._aclient.chat.completions.create(**kwargs)  # type: ignore\n    else:\n        # completion = await self._aclient.chat(**kwargs)  # type: ignore\n        completion = await 
self._aclient.chat.complete_async(**kwargs)  # type: ignore\n\n    if structured_output:\n        generations.append(completion.model_dump_json())\n        return generations\n\n    for choice in completion.choices:\n        if (content := choice.message.content) is None:\n            self._logger.warning(  # type: ignore\n                f\"Received no response using MistralAI client (model: '{self.model}').\"\n                f\" Finish reason was: {choice.finish_reason}\"\n            )\n        generations.append(content)\n    return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM","title":"MixtureOfAgentsLLM","text":"

Bases: AsyncLLM

Mixture-of-Agents implementation.

An LLM class that leverages the collective strengths of LLMs to generate a response, as described in the \"Mixture-of-Agents Enhances Large Language Model Capabilities\" paper. There is a list of LLMs proposing/generating outputs that LLMs from the next round/layer can use as auxiliary information. Finally, there is an LLM that aggregates the outputs to generate the final response.

Attributes:

Name Type Description aggregator_llm LLM

The LLM that aggregates the outputs of the proposer LLMs.

proposers_llms List[AsyncLLM]

The list of LLMs that propose outputs to be aggregated.

rounds int

The number of layers or rounds for which the proposers_llms will generate outputs. Defaults to 1.

References
  • Mixture-of-Agents Enhances Large Language Model Capabilities

Examples:

Generate text:

from distilabel.models.llms import MixtureOfAgentsLLM, InferenceEndpointsLLM\n\nllm = MixtureOfAgentsLLM(\n    aggregator_llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    ),\n    proposers_llms=[\n        InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        ),\n        InferenceEndpointsLLM(\n            model_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n            tokenizer_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n        ),\n        InferenceEndpointsLLM(\n            model_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n            tokenizer_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n        ),\n    ],\n    rounds=2,\n)\n\nllm.load()\n\noutput = llm.generate_outputs(\n    inputs=[\n        [\n            {\n                \"role\": \"user\",\n                \"content\": \"My favorite witty review of The Rings of Power series is this: Input:\",\n            }\n        ]\n    ]\n)\n
Source code in src/distilabel/models/llms/moa.py
class MixtureOfAgentsLLM(AsyncLLM):\n    \"\"\"`Mixture-of-Agents` implementation.\n\n    An `LLM` class that leverages `LLM`s collective strenghts to generate a response,\n    as described in the \"Mixture-of-Agents Enhances Large Language model Capabilities\"\n    paper. There is a list of `LLM`s proposing/generating outputs that `LLM`s from the next\n    round/layer can use as auxiliary information. Finally, there is an `LLM` that aggregates\n    the outputs to generate the final response.\n\n    Attributes:\n        aggregator_llm: The `LLM` that aggregates the outputs of the proposer `LLM`s.\n        proposers_llms: The list of `LLM`s that propose outputs to be aggregated.\n        rounds: The number of layers or rounds that the `proposers_llms` will generate\n            outputs. Defaults to `1`.\n\n    References:\n        - [Mixture-of-Agents Enhances Large Language Model Capabilities](https://arxiv.org/abs/2406.04692)\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import MixtureOfAgentsLLM, InferenceEndpointsLLM\n\n        llm = MixtureOfAgentsLLM(\n            aggregator_llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n            ),\n            proposers_llms=[\n                InferenceEndpointsLLM(\n                    model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n                    tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n                ),\n                InferenceEndpointsLLM(\n                    model_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n                    tokenizer_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n                ),\n                InferenceEndpointsLLM(\n                    model_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n                    tokenizer_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n          
      ),\n            ],\n            rounds=2,\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(\n            inputs=[\n                [\n                    {\n                        \"role\": \"user\",\n                        \"content\": \"My favorite witty review of The Rings of Power series is this: Input:\",\n                    }\n                ]\n            ]\n        )\n        ```\n    \"\"\"\n\n    aggregator_llm: LLM\n    proposers_llms: List[AsyncLLM] = Field(default_factory=list)\n    rounds: int = 1\n\n    @property\n    def runtime_parameters_names(self) -> \"RuntimeParametersNames\":\n        \"\"\"Returns the runtime parameters of the `LLM`, which are a combination of the\n        `RuntimeParameter`s of the `LLM`, the `aggregator_llm` and the `proposers_llms`.\n\n        Returns:\n            The runtime parameters of the `LLM`.\n        \"\"\"\n        runtime_parameters_names = super().runtime_parameters_names\n        del runtime_parameters_names[\"generation_kwargs\"]\n        return runtime_parameters_names\n\n    def load(self) -> None:\n        \"\"\"Loads all the `LLM`s in the `MixtureOfAgents`.\"\"\"\n        super().load()\n\n        for llm in self.proposers_llms:\n            self._logger.debug(f\"Loading proposer LLM in MoA: {llm}\")  # type: ignore\n            llm.load()\n\n        self._logger.debug(f\"Loading aggregator LLM in MoA: {self.aggregator_llm}\")  # type: ignore\n        self.aggregator_llm.load()\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the aggregated model name.\"\"\"\n        return f\"moa-{self.aggregator_llm.model_name}-{'-'.join([llm.model_name for llm in self.proposers_llms])}\"\n\n    def get_generation_kwargs(self) -> Dict[str, Any]:\n        \"\"\"Returns the generation kwargs of the `MixtureOfAgents` as a dictionary.\n\n        Returns:\n            The generation kwargs of the `MixtureOfAgents`.\n        \"\"\"\n        return {\n            
\"aggregator_llm\": self.aggregator_llm.get_generation_kwargs(),\n            \"proposers_llms\": [\n                llm.get_generation_kwargs() for llm in self.proposers_llms\n            ],\n        }\n\n    # `abstractmethod`, had to be implemented but not used\n    async def agenerate(\n        self, input: \"FormattedInput\", num_generations: int = 1, **kwargs: Any\n    ) -> List[Union[str, None]]:\n        raise NotImplementedError(\n            \"`agenerate` method is not implemented for `MixtureOfAgents`\"\n        )\n\n    def _build_moa_system_prompt(self, prev_outputs: List[str]) -> str:\n        \"\"\"Builds the Mixture-of-Agents system prompt.\n\n        Args:\n            prev_outputs: The list of previous outputs to use as references.\n\n        Returns:\n            The Mixture-of-Agents system prompt.\n        \"\"\"\n        moa_system_prompt = MOA_SYSTEM_PROMPT\n        for i, prev_output in enumerate(prev_outputs):\n            if prev_output is not None:\n                moa_system_prompt += f\"\\n{i + 1}. 
{prev_output}\"\n        return moa_system_prompt\n\n    def _inject_moa_system_prompt(\n        self, input: \"StandardInput\", prev_outputs: List[str]\n    ) -> \"StandardInput\":\n        \"\"\"Injects the Mixture-of-Agents system prompt into the input.\n\n        Args:\n            input: The input to inject the system prompt into.\n            prev_outputs: The list of previous outputs to use as references.\n\n        Returns:\n            The input with the Mixture-of-Agents system prompt injected.\n        \"\"\"\n        if len(prev_outputs) == 0:\n            return input\n\n        moa_system_prompt = self._build_moa_system_prompt(prev_outputs)\n\n        system = next((item for item in input if item[\"role\"] == \"system\"), None)\n        if system:\n            original_system_prompt = system[\"content\"]\n            system[\"content\"] = f\"{moa_system_prompt}\\n\\n{original_system_prompt}\"\n        else:\n            input.insert(0, {\"role\": \"system\", \"content\": moa_system_prompt})\n\n        return input\n\n    async def _agenerate(\n        self,\n        inputs: List[\"FormattedInput\"],\n        num_generations: int = 1,\n        **kwargs: Any,\n    ) -> List[\"GenerateOutput\"]:\n        \"\"\"Internal function to concurrently generate responses for a list of inputs.\n\n        Args:\n            inputs: the list of inputs to generate responses for.\n            num_generations: the number of generations to generate per input.\n            **kwargs: the additional kwargs to be used for the generation.\n\n        Returns:\n            A list containing the generations for each input.\n        \"\"\"\n        aggregator_llm_kwargs: Dict[str, Any] = kwargs.get(\"aggregator_llm\", {})\n        proposers_llms_kwargs: List[Dict[str, Any]] = kwargs.get(\n            \"proposers_llms\", [{}] * len(self.proposers_llms)\n        )\n\n        prev_outputs = []\n        for round in range(self.rounds):\n            self._logger.debug(f\"Generating 
round {round + 1}/{self.rounds} in MoA\")  # type: ignore\n\n            # Generate `num_generations` with each proposer LLM for each input\n            tasks = [\n                asyncio.create_task(\n                    llm._agenerate(\n                        inputs=[\n                            self._inject_moa_system_prompt(\n                                cast(\"StandardInput\", input), prev_input_outputs\n                            )\n                            for input, prev_input_outputs in itertools.zip_longest(\n                                inputs, prev_outputs, fillvalue=[]\n                            )\n                        ],\n                        num_generations=1,\n                        **generation_kwargs,\n                    )\n                )\n                for llm, generation_kwargs in zip(\n                    self.proposers_llms, proposers_llms_kwargs\n                )\n            ]\n\n            # Group generations per input\n            outputs: List[List[\"GenerateOutput\"]] = await asyncio.gather(*tasks)\n            prev_outputs = [\n                list(itertools.chain(*input_outputs)) for input_outputs in zip(*outputs)\n            ]\n\n        self._logger.debug(\"Aggregating outputs in MoA\")  # type: ignore\n        if isinstance(self.aggregator_llm, AsyncLLM):\n            return await self.aggregator_llm._agenerate(\n                inputs=[\n                    self._inject_moa_system_prompt(\n                        cast(\"StandardInput\", input), prev_input_outputs\n                    )\n                    for input, prev_input_outputs in zip(inputs, prev_outputs)\n                ],\n                num_generations=num_generations,\n                **aggregator_llm_kwargs,\n            )\n\n        return self.aggregator_llm.generate(\n            inputs=[\n                self._inject_moa_system_prompt(\n                    cast(\"StandardInput\", input), prev_input_outputs\n                )\n       
         for input, prev_input_outputs in zip(inputs, prev_outputs)\n            ],\n            num_generations=num_generations,\n            **aggregator_llm_kwargs,\n        )\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM.runtime_parameters_names","title":"runtime_parameters_names: RuntimeParametersNames property","text":"

Returns the runtime parameters of the LLM, which are a combination of the RuntimeParameters of the LLM, the aggregator_llm and the proposers_llms.

Returns:

Type Description RuntimeParametersNames

The runtime parameters of the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM.model_name","title":"model_name: str property","text":"

Returns the aggregated model name.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM.load","title":"load()","text":"

Loads all the LLMs in the MixtureOfAgents.

Source code in src/distilabel/models/llms/moa.py
def load(self) -> None:\n    \"\"\"Loads all the `LLM`s in the `MixtureOfAgents`.\"\"\"\n    super().load()\n\n    for llm in self.proposers_llms:\n        self._logger.debug(f\"Loading proposer LLM in MoA: {llm}\")  # type: ignore\n        llm.load()\n\n    self._logger.debug(f\"Loading aggregator LLM in MoA: {self.aggregator_llm}\")  # type: ignore\n    self.aggregator_llm.load()\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM.get_generation_kwargs","title":"get_generation_kwargs()","text":"

Returns the generation kwargs of the MixtureOfAgents as a dictionary.

Returns:

Type Description Dict[str, Any]

The generation kwargs of the MixtureOfAgents.

Source code in src/distilabel/models/llms/moa.py
def get_generation_kwargs(self) -> Dict[str, Any]:\n    \"\"\"Returns the generation kwargs of the `MixtureOfAgents` as a dictionary.\n\n    Returns:\n        The generation kwargs of the `MixtureOfAgents`.\n    \"\"\"\n    return {\n        \"aggregator_llm\": self.aggregator_llm.get_generation_kwargs(),\n        \"proposers_llms\": [\n            llm.get_generation_kwargs() for llm in self.proposers_llms\n        ],\n    }\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM._build_moa_system_prompt","title":"_build_moa_system_prompt(prev_outputs)","text":"

Builds the Mixture-of-Agents system prompt.

Parameters:

Name Type Description Default prev_outputs List[str]

The list of previous outputs to use as references.

required

Returns:

Type Description str

The Mixture-of-Agents system prompt.

Source code in src/distilabel/models/llms/moa.py
def _build_moa_system_prompt(self, prev_outputs: List[str]) -> str:\n    \"\"\"Builds the Mixture-of-Agents system prompt.\n\n    Args:\n        prev_outputs: The list of previous outputs to use as references.\n\n    Returns:\n        The Mixture-of-Agents system prompt.\n    \"\"\"\n    moa_system_prompt = MOA_SYSTEM_PROMPT\n    for i, prev_output in enumerate(prev_outputs):\n        if prev_output is not None:\n            moa_system_prompt += f\"\\n{i + 1}. {prev_output}\"\n    return moa_system_prompt\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM._inject_moa_system_prompt","title":"_inject_moa_system_prompt(input, prev_outputs)","text":"

Injects the Mixture-of-Agents system prompt into the input.

Parameters:

Name Type Description Default input StandardInput

The input to inject the system prompt into.

required prev_outputs List[str]

The list of previous outputs to use as references.

required

Returns:

Type Description StandardInput

The input with the Mixture-of-Agents system prompt injected.

Source code in src/distilabel/models/llms/moa.py
def _inject_moa_system_prompt(\n    self, input: \"StandardInput\", prev_outputs: List[str]\n) -> \"StandardInput\":\n    \"\"\"Injects the Mixture-of-Agents system prompt into the input.\n\n    Args:\n        input: The input to inject the system prompt into.\n        prev_outputs: The list of previous outputs to use as references.\n\n    Returns:\n        The input with the Mixture-of-Agents system prompt injected.\n    \"\"\"\n    if len(prev_outputs) == 0:\n        return input\n\n    moa_system_prompt = self._build_moa_system_prompt(prev_outputs)\n\n    system = next((item for item in input if item[\"role\"] == \"system\"), None)\n    if system:\n        original_system_prompt = system[\"content\"]\n        system[\"content\"] = f\"{moa_system_prompt}\\n\\n{original_system_prompt}\"\n    else:\n        input.insert(0, {\"role\": \"system\", \"content\": moa_system_prompt})\n\n    return input\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM._agenerate","title":"_agenerate(inputs, num_generations=1, **kwargs) async","text":"

Internal function to concurrently generate responses for a list of inputs.

Parameters:

Name Type Description Default inputs List[FormattedInput]

the list of inputs to generate responses for.

required num_generations int

the number of generations to generate per input.

1 **kwargs Any

the additional kwargs to be used for the generation.

{}

Returns:

Type Description List[GenerateOutput]

A list containing the generations for each input.

Source code in src/distilabel/models/llms/moa.py
async def _agenerate(\n    self,\n    inputs: List[\"FormattedInput\"],\n    num_generations: int = 1,\n    **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n    \"\"\"Internal function to concurrently generate responses for a list of inputs.\n\n    Args:\n        inputs: the list of inputs to generate responses for.\n        num_generations: the number of generations to generate per input.\n        **kwargs: the additional kwargs to be used for the generation.\n\n    Returns:\n        A list containing the generations for each input.\n    \"\"\"\n    aggregator_llm_kwargs: Dict[str, Any] = kwargs.get(\"aggregator_llm\", {})\n    proposers_llms_kwargs: List[Dict[str, Any]] = kwargs.get(\n        \"proposers_llms\", [{}] * len(self.proposers_llms)\n    )\n\n    prev_outputs = []\n    for round in range(self.rounds):\n        self._logger.debug(f\"Generating round {round + 1}/{self.rounds} in MoA\")  # type: ignore\n\n        # Generate `num_generations` with each proposer LLM for each input\n        tasks = [\n            asyncio.create_task(\n                llm._agenerate(\n                    inputs=[\n                        self._inject_moa_system_prompt(\n                            cast(\"StandardInput\", input), prev_input_outputs\n                        )\n                        for input, prev_input_outputs in itertools.zip_longest(\n                            inputs, prev_outputs, fillvalue=[]\n                        )\n                    ],\n                    num_generations=1,\n                    **generation_kwargs,\n                )\n            )\n            for llm, generation_kwargs in zip(\n                self.proposers_llms, proposers_llms_kwargs\n            )\n        ]\n\n        # Group generations per input\n        outputs: List[List[\"GenerateOutput\"]] = await asyncio.gather(*tasks)\n        prev_outputs = [\n            list(itertools.chain(*input_outputs)) for input_outputs in zip(*outputs)\n        ]\n\n    
self._logger.debug(\"Aggregating outputs in MoA\")  # type: ignore\n    if isinstance(self.aggregator_llm, AsyncLLM):\n        return await self.aggregator_llm._agenerate(\n            inputs=[\n                self._inject_moa_system_prompt(\n                    cast(\"StandardInput\", input), prev_input_outputs\n                )\n                for input, prev_input_outputs in zip(inputs, prev_outputs)\n            ],\n            num_generations=num_generations,\n            **aggregator_llm_kwargs,\n        )\n\n    return self.aggregator_llm.generate(\n        inputs=[\n            self._inject_moa_system_prompt(\n                cast(\"StandardInput\", input), prev_input_outputs\n            )\n            for input, prev_input_outputs in zip(inputs, prev_outputs)\n        ],\n        num_generations=num_generations,\n        **aggregator_llm_kwargs,\n    )\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OllamaLLM","title":"OllamaLLM","text":"

Bases: AsyncLLM

Ollama LLM implementation running the Async API client.

Attributes:

Name Type Description model str

the model name to use for the LLM e.g. \"notus\".

host Optional[RuntimeParameter[str]]

the Ollama server host.

timeout RuntimeParameter[int]

the timeout for the LLM. Defaults to 120.

_aclient Optional[AsyncClient]

the AsyncClient to use for the Ollama API. It is meant to be used internally. Set in the load method.

Runtime parameters
  • host: the Ollama server host.
  • timeout: the client timeout for the Ollama API. Defaults to 120.

Examples:

Generate text:

from distilabel.models.llms import OllamaLLM\n\nllm = OllamaLLM(model=\"llama3\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
Source code in src/distilabel/models/llms/ollama.py
class OllamaLLM(AsyncLLM):\n    \"\"\"Ollama LLM implementation running the Async API client.\n\n    Attributes:\n        model: the model name to use for the LLM e.g. \"notus\".\n        host: the Ollama server host.\n        timeout: the timeout for the LLM. Defaults to `120`.\n        _aclient: the `AsyncClient` to use for the Ollama API. It is meant to be used internally.\n            Set in the `load` method.\n\n    Runtime parameters:\n        - `host`: the Ollama server host.\n        - `timeout`: the client timeout for the Ollama API. Defaults to `120`.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import OllamaLLM\n\n        llm = OllamaLLM(model=\"llama3\")\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n    \"\"\"\n\n    model: str\n    host: Optional[RuntimeParameter[str]] = Field(\n        default=None, description=\"The host of the Ollama API.\"\n    )\n    timeout: RuntimeParameter[int] = Field(\n        default=120, description=\"The timeout for the Ollama API.\"\n    )\n    follow_redirects: bool = True\n    structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n        Field(\n            default=None,\n            description=\"The structured output format to use across all the generations.\",\n        )\n    )\n\n    _num_generations_param_supported = False\n\n    _aclient: Optional[\"AsyncClient\"] = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"Loads the `AsyncClient` to use Ollama async API.\"\"\"\n        super().load()\n\n        try:\n            from ollama import AsyncClient\n\n            self._aclient = AsyncClient(\n                host=self.host,\n                timeout=self.timeout,\n                follow_redirects=self.follow_redirects,\n            )\n        except ImportError as e:\n            raise ImportError(\n        
        \"Ollama Python client is not installed. Please install it using\"\n                \" `pip install ollama`.\"\n            ) from e\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    @validate_call\n    async def agenerate(  # type: ignore\n        self,\n        input: StandardInput,\n        format: Literal[\"\", \"json\"] = \"\",\n        # TODO: include relevant options from `Options` in `agenerate` method.\n        options: Union[Options, None] = None,\n        keep_alive: Union[bool, None] = None,\n    ) -> GenerateOutput:\n        \"\"\"\n        Generates a response asynchronously, using the [Ollama Async API definition](https://github.com/ollama/ollama-python).\n\n        Args:\n            input: the input to use for the generation.\n            format: the format to use for the generation. Defaults to `\"\"`.\n            options: the options to use for the generation. Defaults to `None`.\n            keep_alive: whether to keep the connection alive. Defaults to `None`.\n\n        Returns:\n            A list of strings as completion for the given input.\n        \"\"\"\n        text = None\n        try:\n            completion: Dict[str, Any] = await self._aclient.chat(  # type: ignore\n                model=self.model,\n                messages=input,  # type: ignore\n                stream=False,\n                format=format,\n                options=options,\n                keep_alive=keep_alive,\n            )\n            text = completion[\"message\"][\"content\"]\n        except Exception as e:\n            self._logger.warning(  # type: ignore\n                f\"\u26a0\ufe0f Received no response using Ollama client (model: '{self.model_name}').\"\n                f\" Finish reason was: {e}\"\n            )\n\n        return [text]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OllamaLLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OllamaLLM.load","title":"load()","text":"

Loads the AsyncClient to use Ollama async API.

Source code in src/distilabel/models/llms/ollama.py
def load(self) -> None:\n    \"\"\"Loads the `AsyncClient` to use Ollama async API.\"\"\"\n    super().load()\n\n    try:\n        from ollama import AsyncClient\n\n        self._aclient = AsyncClient(\n            host=self.host,\n            timeout=self.timeout,\n            follow_redirects=self.follow_redirects,\n        )\n    except ImportError as e:\n        raise ImportError(\n            \"Ollama Python client is not installed. Please install it using\"\n            \" `pip install ollama`.\"\n        ) from e\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OllamaLLM.agenerate","title":"agenerate(input, format='', options=None, keep_alive=None) async","text":"

Generates a response asynchronously, using the Ollama Async API definition.

Parameters:

Name Type Description Default input StandardInput

the input to use for the generation.

required format Literal['', 'json']

the format to use for the generation. Defaults to \"\".

'' options Union[Options, None]

the options to use for the generation. Defaults to None.

None keep_alive Union[bool, None]

whether to keep the connection alive. Defaults to None.

None

Returns:

Type Description GenerateOutput

A list of strings as completion for the given input.

Source code in src/distilabel/models/llms/ollama.py
@validate_call\nasync def agenerate(  # type: ignore\n    self,\n    input: StandardInput,\n    format: Literal[\"\", \"json\"] = \"\",\n    # TODO: include relevant options from `Options` in `agenerate` method.\n    options: Union[Options, None] = None,\n    keep_alive: Union[bool, None] = None,\n) -> GenerateOutput:\n    \"\"\"\n    Generates a response asynchronously, using the [Ollama Async API definition](https://github.com/ollama/ollama-python).\n\n    Args:\n        input: the input to use for the generation.\n        format: the format to use for the generation. Defaults to `\"\"`.\n        options: the options to use for the generation. Defaults to `None`.\n        keep_alive: whether to keep the connection alive. Defaults to `None`.\n\n    Returns:\n        A list of strings as completion for the given input.\n    \"\"\"\n    text = None\n    try:\n        completion: Dict[str, Any] = await self._aclient.chat(  # type: ignore\n            model=self.model,\n            messages=input,  # type: ignore\n            stream=False,\n            format=format,\n            options=options,\n            keep_alive=keep_alive,\n        )\n        text = completion[\"message\"][\"content\"]\n    except Exception as e:\n        self._logger.warning(  # type: ignore\n            f\"\u26a0\ufe0f Received no response using Ollama client (model: '{self.model_name}').\"\n            f\" Finish reason was: {e}\"\n        )\n\n    return [text]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM","title":"OpenAILLM","text":"

Bases: AsyncLLM

OpenAI LLM implementation running the async API client.

Attributes:

Name Type Description model str

the model name to use for the LLM e.g. \"gpt-3.5-turbo\", \"gpt-4\", etc. Supported models can be found here.

base_url Optional[RuntimeParameter[str]]

the base URL to use for the OpenAI API requests. Defaults to None, which means that the value set for the environment variable OPENAI_BASE_URL will be used, or \"https://api.openai.com/v1\" if not set.

api_key Optional[RuntimeParameter[SecretStr]]

the API key to authenticate the requests to the OpenAI API. Defaults to None which means that the value set for the environment variable OPENAI_API_KEY will be used, or None if not set.

max_retries RuntimeParameter[int]

the maximum number of times to retry the request to the API before failing. Defaults to 6.

timeout RuntimeParameter[int]

the maximum time in seconds to wait for a response from the API. Defaults to 120.

structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]]

a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

Runtime parameters
  • base_url: the base URL to use for the OpenAI API requests. Defaults to None.
  • api_key: the API key to authenticate the requests to the OpenAI API. Defaults to None.
  • max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6.
  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.
Icon

:simple-openai:

Examples:

Generate text:

from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate text from a custom endpoint following the OpenAI API:

from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(\n    model=\"prometheus-eval/prometheus-7b-v2.0\",\n    base_url=r\"http://localhost:8080/v1\"\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pydantic import BaseModel\nfrom distilabel.models.llms import OpenAILLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = OpenAILLM(\n    model=\"gpt-4-turbo\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n

Generate with Batch API (offline batch generation):

from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(\n    model=\"gpt-3.5-turbo\",\n    use_offline_batch_generation=True,\n    offline_batch_generation_block_until_done=5,  # poll for results every 5 seconds\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n# [['Hello! How can I assist you today?']]\n
Source code in src/distilabel/models/llms/openai.py
class OpenAILLM(AsyncLLM):\n    \"\"\"OpenAI LLM implementation running the async API client.\n\n    Attributes:\n        model: the model name to use for the LLM e.g. \"gpt-3.5-turbo\", \"gpt-4\", etc.\n            Supported models can be found [here](https://platform.openai.com/docs/guides/text-generation).\n        base_url: the base URL to use for the OpenAI API requests. Defaults to `None`, which\n            means that the value set for the environment variable `OPENAI_BASE_URL` will\n            be used, or \"https://api.openai.com/v1\" if not set.\n        api_key: the API key to authenticate the requests to the OpenAI API. Defaults to\n            `None` which means that the value set for the environment variable `OPENAI_API_KEY`\n            will be used, or `None` if not set.\n        max_retries: the maximum number of times to retry the request to the API before\n            failing. Defaults to `6`.\n        timeout: the maximum time in seconds to wait for a response from the API. Defaults\n            to `120`.\n        structured_output: a dictionary containing the structured output configuration configuration\n            using `instructor`. You can take a look at the dictionary structure in\n            `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n\n    Runtime parameters:\n        - `base_url`: the base URL to use for the OpenAI API requests. Defaults to `None`.\n        - `api_key`: the API key to authenticate the requests to the OpenAI API. Defaults\n            to `None`.\n        - `max_retries`: the maximum number of times to retry the request to the API before\n            failing. Defaults to `6`.\n        - `timeout`: the maximum time in seconds to wait for a response from the API. 
Defaults\n            to `120`.\n\n    Icon:\n        `:simple-openai:`\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import OpenAILLM\n\n        llm = OpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Generate text from a custom endpoint following the OpenAI API:\n\n        ```python\n        from distilabel.models.llms import OpenAILLM\n\n        llm = OpenAILLM(\n            model=\"prometheus-eval/prometheus-7b-v2.0\",\n            base_url=r\"http://localhost:8080/v1\"\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Generate structured data:\n\n        ```python\n        from pydantic import BaseModel\n        from distilabel.models.llms import OpenAILLM\n\n        class User(BaseModel):\n            name: str\n            last_name: str\n            id: int\n\n        llm = OpenAILLM(\n            model=\"gpt-4-turbo\",\n            api_key=\"api.key\",\n            structured_output={\"schema\": User}\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n        ```\n\n        Generate with Batch API (offline batch generation):\n\n        ```python\n        from distilabel.models.llms import OpenAILLM\n\n        load = llm = OpenAILLM(\n            model=\"gpt-3.5-turbo\",\n            use_offline_batch_generation=True,\n            offline_batch_generation_block_until_done=5,  # poll for results every 5 seconds\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        # [['Hello! 
How can I assist you today?']]\n        ```\n    \"\"\"\n\n    model: str\n    base_url: Optional[RuntimeParameter[str]] = Field(\n        default_factory=lambda: os.getenv(\n            \"OPENAI_BASE_URL\", \"https://api.openai.com/v1\"\n        ),\n        description=\"The base URL to use for the OpenAI API requests.\",\n    )\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(_OPENAI_API_KEY_ENV_VAR_NAME),\n        description=\"The API key to authenticate the requests to the OpenAI API.\",\n    )\n    max_retries: RuntimeParameter[int] = Field(\n        default=6,\n        description=\"The maximum number of times to retry the request to the API before\"\n        \" failing.\",\n    )\n    timeout: RuntimeParameter[int] = Field(\n        default=120,\n        description=\"The maximum time in seconds to wait for a response from the API.\",\n    )\n    structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n        Field(\n            default=None,\n            description=\"The structured output format to use across all the generations.\",\n        )\n    )\n\n    _api_key_env_var: str = PrivateAttr(_OPENAI_API_KEY_ENV_VAR_NAME)\n    _client: \"OpenAI\" = PrivateAttr(None)\n    _aclient: \"AsyncOpenAI\" = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Loads the `AsyncOpenAI` client to benefit from async requests.\"\"\"\n        super().load()\n\n        try:\n            from openai import AsyncOpenAI, OpenAI\n        except ImportError as ie:\n            raise ImportError(\n                \"OpenAI Python client is not installed. 
Please install it using\"\n                \" `pip install openai`.\"\n            ) from ie\n\n        if self.api_key is None:\n            raise ValueError(\n                f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n                f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n            )\n\n        self._client = OpenAI(\n            base_url=self.base_url,\n            api_key=self.api_key.get_secret_value(),\n            max_retries=self.max_retries,  # type: ignore\n            timeout=self.timeout,\n        )\n\n        self._aclient = AsyncOpenAI(\n            base_url=self.base_url,\n            api_key=self.api_key.get_secret_value(),\n            max_retries=self.max_retries,  # type: ignore\n            timeout=self.timeout,\n        )\n\n        if self.structured_output:\n            result = self._prepare_structured_output(\n                structured_output=self.structured_output,\n                client=self._aclient,\n                framework=\"openai\",\n            )\n            self._aclient = result.get(\"client\")  # type: ignore\n            if structured_output := result.get(\"structured_output\"):\n                self.structured_output = structured_output\n\n    def unload(self) -> None:\n        \"\"\"Set clients to `None` as they both contain `thread._RLock` which cannot be pickled\n        in case an exception is raised and has to be handled in the main process\"\"\"\n\n        self._client = None  # type: ignore\n        self._aclient = None  # type: ignore\n        self.structured_output = None\n        super().unload()\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    @validate_call\n    async def agenerate(  # type: ignore\n        self,\n        input: FormattedInput,\n        num_generations: int = 1,\n        max_new_tokens: int = 128,\n 
       frequency_penalty: float = 0.0,\n        presence_penalty: float = 0.0,\n        temperature: float = 1.0,\n        top_p: float = 1.0,\n        stop: Optional[Union[str, List[str]]] = None,\n        response_format: Optional[Dict[str, str]] = None,\n    ) -> GenerateOutput:\n        \"\"\"Generates `num_generations` responses for the given input using the OpenAI async\n                client.\n\n                Args:\n                    input: a single input in chat format to generate responses for.\n                    num_generations: the number of generations to create per input. Defaults to\n                        `1`.\n                    max_new_tokens: the maximum number of new tokens that the model will generate.\n                        Defaults to `128`.\n                    frequency_penalty: the repetition penalty to use for the generation. Defaults\n                        to `0.0`.\n                    presence_penalty: the presence penalty to use for the generation. Defaults to\n                        `0.0`.\n                    temperature: the temperature to use for the generation. Defaults to `0.1`.\n                    top_p: the top-p value to use for the generation. Defaults to `1.0`.\n                    stop: a string or a list of strings to use as a stop sequence for the generation.\n                        Defaults to `None`.\n                    response_format: the format of the response to return. Must be one of\n                        \"text\" or \"json\". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)\n                        for more information on how to use the JSON model from OpenAI. Defaults to None\n                        which returns text. 
To return JSON, use {\"type\": \"json_object\"}.\n        )\n\n                Note:\n                    If response_format\n\n                Returns:\n                    A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n\n        structured_output = None\n        if isinstance(input, tuple):\n            input, structured_output = input\n            result = self._prepare_structured_output(\n                structured_output=structured_output,  # type: ignore\n                client=self._aclient,\n                framework=\"openai\",\n            )\n            self._aclient = result.get(\"client\")  # type: ignore\n\n        if structured_output is None and self.structured_output is not None:\n            structured_output = self.structured_output\n\n        kwargs = {\n            \"messages\": input,  # type: ignore\n            \"model\": self.model,\n            \"max_tokens\": max_new_tokens,\n            \"n\": num_generations,\n            \"frequency_penalty\": frequency_penalty,\n            \"presence_penalty\": presence_penalty,\n            \"temperature\": temperature,\n            \"top_p\": top_p,\n            \"stop\": stop,\n        }\n\n        if response_format is not None:\n            kwargs[\"response_format\"] = response_format\n\n        if structured_output:\n            kwargs = self._prepare_kwargs(kwargs, structured_output)  # type: ignore\n\n        completion = await self._aclient.chat.completions.create(**kwargs)  # type: ignore\n\n        if structured_output:\n            return self._generations_from_structured_output(completion)\n\n        return self._generations_from_openai_completion(completion)\n\n    def _generations_from_structured_output(\n        self, completion: \"BaseModel\"\n    ) -> \"GenerateOutput\":\n        \"\"\"Get the generations from the structured output object.\n\n        Args:\n            completion: an instance of `pydantic.BaseModel` with the content 
of the structuted\n                output.\n\n        Returns:\n            A list with the content of the structured output.\n        \"\"\"\n        return [completion.model_dump_json()]\n\n    def _generations_from_openai_completion(\n        self, completion: \"OpenAIChatCompletion\"\n    ) -> \"GenerateOutput\":\n        \"\"\"Get the generations from the OpenAI Chat Completion object.\n\n        Args:\n            completion: the completion object to get the generations from.\n\n        Returns:\n            A list of strings containing the generated responses for the input.\n        \"\"\"\n        generations = []\n        for choice in completion.choices:\n            if (content := choice.message.content) is None:\n                self._logger.warning(  # type: ignore\n                    f\"Received no response using OpenAI client (model: '{self.model}').\"\n                    f\" Finish reason was: {choice.finish_reason}\"\n                )\n            generations.append(content)\n        return generations\n\n    def offline_batch_generate(\n        self,\n        inputs: Union[List[\"FormattedInput\"], None] = None,\n        num_generations: int = 1,\n        max_new_tokens: int = 128,\n        frequency_penalty: float = 0.0,\n        presence_penalty: float = 0.0,\n        temperature: float = 1.0,\n        top_p: float = 1.0,\n        stop: Optional[Union[str, List[str]]] = None,\n        response_format: Optional[str] = None,\n        **kwargs: Any,\n    ) -> List[\"GenerateOutput\"]:\n        \"\"\"Uses the OpenAI batch API to generate `num_generations` responses for the given\n        inputs.\n\n        Args:\n            inputs: a list of inputs in chat format to generate responses for.\n            num_generations: the number of generations to create per input. 
Defaults to\n                `1`.\n            max_new_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `128`.\n            frequency_penalty: the repetition penalty to use for the generation. Defaults\n                to `0.0`.\n            presence_penalty: the presence penalty to use for the generation. Defaults to\n                `0.0`.\n            temperature: the temperature to use for the generation. Defaults to `0.1`.\n            top_p: the top-p value to use for the generation. Defaults to `1.0`.\n            stop: a string or a list of strings to use as a stop sequence for the generation.\n                Defaults to `None`.\n            response_format: the format of the response to return. Must be one of\n                \"text\" or \"json\". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)\n                for more information on how to use the JSON model from OpenAI. Defaults to `text`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input\n            in `inputs`.\n\n        Raises:\n            DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation\n                is not finished yet.\n            ValueError: if no job IDs were found to retrieve the results from.\n        \"\"\"\n        if self.jobs_ids:\n            return self._check_and_get_batch_results()\n\n        if inputs:\n            self.jobs_ids = self._create_jobs(\n                inputs=inputs,\n                **{\n                    \"model\": self.model,\n                    \"max_tokens\": max_new_tokens,\n                    \"n\": num_generations,\n                    \"frequency_penalty\": frequency_penalty,\n                    \"presence_penalty\": presence_penalty,\n                    \"temperature\": temperature,\n                    \"top_p\": top_p,\n                    \"stop\": stop,\n    
                \"response_format\": response_format,\n                },\n            )\n            raise DistilabelOfflineBatchGenerationNotFinishedException(\n                jobs_ids=self.jobs_ids\n            )\n\n        raise ValueError(\"No `inputs` were provided and no `jobs_ids` were found.\")\n\n    def _check_and_get_batch_results(self) -> List[\"GenerateOutput\"]:\n        \"\"\"Checks the status of the batch jobs and retrieves the results from the OpenAI\n        Batch API.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n\n        Raises:\n            ValueError: if no job IDs were found to retrieve the results from.\n            DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation\n                is not finished yet.\n            RuntimeError: if the only batch job found failed.\n        \"\"\"\n        if not self.jobs_ids:\n            raise ValueError(\"No job IDs were found to retrieve the results from.\")\n\n        outputs = []\n        for batch_id in self.jobs_ids:\n            batch = self._get_openai_batch(batch_id)\n\n            if batch.status in (\"validating\", \"in_progress\", \"finalizing\"):\n                raise DistilabelOfflineBatchGenerationNotFinishedException(\n                    jobs_ids=self.jobs_ids\n                )\n\n            if batch.status in (\"failed\", \"expired\", \"cancelled\", \"cancelling\"):\n                self._logger.error(  # type: ignore\n                    f\"OpenAI API batch with ID '{batch_id}' failed with status '{batch.status}'.\"\n                )\n                if len(self.jobs_ids) == 1:\n                    self.jobs_ids = None\n                    raise RuntimeError(\n                        f\"The only OpenAI API Batch that was created with ID '{batch_id}'\"\n                        f\" failed with status '{batch.status}'.\"\n                    )\n\n                continue\n\n            
outputs.extend(self._retrieve_batch_results(batch))\n\n        # sort by `custom_id` to return the results in the same order as the inputs\n        outputs = sorted(outputs, key=lambda x: int(x[\"custom_id\"]))\n        return [self._parse_output(output) for output in outputs]\n\n    def _parse_output(self, output: Dict[str, Any]) -> \"GenerateOutput\":\n        \"\"\"Parses the output from the OpenAI Batch API into a list of strings.\n\n        Args:\n            output: the output to parse.\n\n        Returns:\n            A list of strings containing the generated responses for the input.\n        \"\"\"\n        from openai.types.chat import ChatCompletion as OpenAIChatCompletion\n\n        if \"response\" not in output:\n            return []\n\n        if output[\"response\"][\"status_code\"] != 200:\n            return []\n\n        return self._generations_from_openai_completion(\n            OpenAIChatCompletion(**output[\"response\"][\"body\"])\n        )\n\n    def _get_openai_batch(self, batch_id: str) -> \"OpenAIBatch\":\n        \"\"\"Gets a batch from the OpenAI Batch API.\n\n        Args:\n            batch_id: the ID of the batch to retrieve.\n\n        Returns:\n            The batch retrieved from the OpenAI Batch API.\n\n        Raises:\n            openai.OpenAIError: if there was an error while retrieving the batch from the\n                OpenAI Batch API.\n        \"\"\"\n        import openai\n\n        try:\n            return self._client.batches.retrieve(batch_id)\n        except openai.OpenAIError as e:\n            self._logger.error(  # type: ignore\n                f\"Error while retrieving batch '{batch_id}' from OpenAI: {e}\"\n            )\n            raise e\n\n    def _retrieve_batch_results(self, batch: \"OpenAIBatch\") -> List[Dict[str, Any]]:\n        \"\"\"Retrieves the results of a batch from its output file, parsing the JSONL content\n        into a list of dictionaries.\n\n        Args:\n            batch: the batch to 
retrieve the results from.\n\n        Returns:\n            A list of dictionaries containing the results of the batch.\n\n        Raises:\n            AssertionError: if no output file ID was found in the batch.\n        \"\"\"\n        import openai\n\n        assert batch.output_file_id, \"No output file ID was found in the batch.\"\n\n        try:\n            file_response = self._client.files.content(batch.output_file_id)\n            return [orjson.loads(line) for line in file_response.text.splitlines()]\n        except openai.OpenAIError as e:\n            self._logger.error(  # type: ignore\n                f\"Error while retrieving batch results from file '{batch.output_file_id}': {e}\"\n            )\n            return []\n\n    def _create_jobs(\n        self, inputs: List[\"FormattedInput\"], **kwargs: Any\n    ) -> Tuple[str, ...]:\n        \"\"\"Creates jobs in the OpenAI Batch API to generate responses for the given inputs.\n\n        Args:\n            inputs: a list of inputs in chat format to generate responses for.\n            kwargs: the keyword arguments to use for the generation.\n\n        Returns:\n            A list of job IDs created in the OpenAI Batch API.\n        \"\"\"\n        batch_input_files = self._create_batch_files(inputs=inputs, **kwargs)\n        jobs = []\n        for batch_input_file in batch_input_files:\n            if batch := self._create_batch_api_job(batch_input_file):\n                jobs.append(batch.id)\n        return tuple(jobs)\n\n    def _create_batch_api_job(\n        self, batch_input_file: \"OpenAIFileObject\"\n    ) -> Union[\"OpenAIBatch\", None]:\n        \"\"\"Creates a job in the OpenAI Batch API to generate responses for the given input\n        file.\n\n        Args:\n            batch_input_file: the input file to generate responses for.\n\n        Returns:\n            The batch job created in the OpenAI Batch API.\n        \"\"\"\n        import openai\n\n        metadata = {\"description\": 
\"distilabel\"}\n\n        if distilabel_pipeline_name := envs.DISTILABEL_PIPELINE_NAME:\n            metadata[\"distilabel_pipeline_name\"] = distilabel_pipeline_name\n\n        if distilabel_pipeline_cache_id := envs.DISTILABEL_PIPELINE_CACHE_ID:\n            metadata[\"distilabel_pipeline_cache_id\"] = distilabel_pipeline_cache_id\n\n        batch = None\n        try:\n            batch = self._client.batches.create(\n                completion_window=\"24h\",\n                endpoint=\"/v1/chat/completions\",\n                input_file_id=batch_input_file.id,\n                metadata=metadata,\n            )\n        except openai.OpenAIError as e:\n            self._logger.error(  # type: ignore\n                f\"Error while creating OpenAI Batch API job for file with ID\"\n                f\" '{batch_input_file.id}': {e}.\"\n            )\n            raise e\n        return batch\n\n    def _create_batch_files(\n        self, inputs: List[\"FormattedInput\"], **kwargs: Any\n    ) -> List[\"OpenAIFileObject\"]:\n        \"\"\"Creates the necessary input files for the batch API to generate responses. 
The\n        maximum size of each file so the OpenAI Batch API can process it is 100MB, so we\n        need to split the inputs into multiple files if necessary.\n\n        More information: https://platform.openai.com/docs/api-reference/files/create\n\n        Args:\n            inputs: a list of inputs in chat format to generate responses for, optionally\n                including structured output.\n            kwargs: the keyword arguments to use for the generation.\n\n        Returns:\n            The list of file objects created for the OpenAI Batch API.\n\n        Raises:\n            openai.OpenAIError: if there was an error while creating the batch input file\n                in the OpenAI Batch API.\n        \"\"\"\n        import openai\n\n        files = []\n        for file_no, buffer in enumerate(\n            self._create_jsonl_buffers(inputs=inputs, **kwargs)\n        ):\n            try:\n                # TODO: add distilabel pipeline name and id\n                batch_input_file = self._client.files.create(\n                    file=(self._name_for_openai_files(file_no), buffer),\n                    purpose=\"batch\",\n                )\n                files.append(batch_input_file)\n            except openai.OpenAIError as e:\n                self._logger.error(  # type: ignore\n                    f\"Error while creating OpenAI batch input file: {e}\"\n                )\n                raise e\n        return files\n\n    def _create_jsonl_buffers(\n        self, inputs: List[\"FormattedInput\"], **kwargs: Any\n    ) -> Generator[io.BytesIO, None, None]:\n        \"\"\"Creates a generator of buffers containing the JSONL formatted inputs to be\n        used by the OpenAI Batch API. 
The buffers created are of size 100MB or less.\n\n        Args:\n            inputs: a list of inputs in chat format to generate responses for, optionally\n                including structured output.\n            kwargs: the keyword arguments to use for the generation.\n\n        Yields:\n            A buffer containing the JSONL formatted inputs to be used by the OpenAI Batch\n            API.\n        \"\"\"\n        buffer = io.BytesIO()\n        buffer_current_size = 0\n        for i, input in enumerate(inputs):\n            # We create the smallest `custom_id` so we don't  increase the size of the file\n            # to much, but we can still sort the results with the order of the inputs.\n            row = self._create_jsonl_row(input=input, custom_id=str(i), **kwargs)\n            row_size = len(row)\n            if row_size + buffer_current_size > _OPENAI_BATCH_API_MAX_FILE_SIZE:\n                buffer.seek(0)\n                yield buffer\n                buffer = io.BytesIO()\n                buffer_current_size = 0\n            buffer.write(row)\n            buffer_current_size += row_size\n\n        if buffer_current_size > 0:\n            buffer.seek(0)\n            yield buffer\n\n    def _create_jsonl_row(\n        self, input: \"FormattedInput\", custom_id: str, **kwargs: Any\n    ) -> bytes:\n        \"\"\"Creates a JSONL formatted row to be used by the OpenAI Batch API.\n\n        Args:\n            input: a list of inputs in chat format to generate responses for, optionally\n                including structured output.\n            custom_id: a custom ID to use for the row.\n            kwargs: the keyword arguments to use for the generation.\n\n        Returns:\n            A JSONL formatted row to be used by the OpenAI Batch API.\n        \"\"\"\n        # TODO: depending on the format of the input, add `response_format` to the kwargs\n        row = {\n            \"custom_id\": custom_id,\n            \"method\": \"POST\",\n            
\"url\": \"/v1/chat/completions\",\n            \"body\": {\"messages\": input, **kwargs},\n        }\n        json_row = orjson.dumps(row)\n        return json_row + b\"\\n\"\n\n    def _name_for_openai_files(self, file_no: int) -> str:\n        if (\n            envs.DISTILABEL_PIPELINE_NAME is None\n            or envs.DISTILABEL_PIPELINE_CACHE_ID is None\n        ):\n            return f\"distilabel-pipeline-fileno-{file_no}.jsonl\"\n\n        return f\"distilabel-pipeline-{envs.DISTILABEL_PIPELINE_NAME}-{envs.DISTILABEL_PIPELINE_CACHE_ID}-fileno-{file_no}.jsonl\"\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.load","title":"load()","text":"

Loads the AsyncOpenAI client to benefit from async requests.

Source code in src/distilabel/models/llms/openai.py
def load(self) -> None:\n    \"\"\"Loads the `AsyncOpenAI` client to benefit from async requests.\"\"\"\n    super().load()\n\n    try:\n        from openai import AsyncOpenAI, OpenAI\n    except ImportError as ie:\n        raise ImportError(\n            \"OpenAI Python client is not installed. Please install it using\"\n            \" `pip install openai`.\"\n        ) from ie\n\n    if self.api_key is None:\n        raise ValueError(\n            f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n            f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n        )\n\n    self._client = OpenAI(\n        base_url=self.base_url,\n        api_key=self.api_key.get_secret_value(),\n        max_retries=self.max_retries,  # type: ignore\n        timeout=self.timeout,\n    )\n\n    self._aclient = AsyncOpenAI(\n        base_url=self.base_url,\n        api_key=self.api_key.get_secret_value(),\n        max_retries=self.max_retries,  # type: ignore\n        timeout=self.timeout,\n    )\n\n    if self.structured_output:\n        result = self._prepare_structured_output(\n            structured_output=self.structured_output,\n            client=self._aclient,\n            framework=\"openai\",\n        )\n        self._aclient = result.get(\"client\")  # type: ignore\n        if structured_output := result.get(\"structured_output\"):\n            self.structured_output = structured_output\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.unload","title":"unload()","text":"

Sets both clients to None, as they both contain thread._RLock, which cannot be pickled in case an exception is raised and has to be handled in the main process.

Source code in src/distilabel/models/llms/openai.py
def unload(self) -> None:\n    \"\"\"Set clients to `None` as they both contain `thread._RLock` which cannot be pickled\n    in case an exception is raised and has to be handled in the main process\"\"\"\n\n    self._client = None  # type: ignore\n    self._aclient = None  # type: ignore\n    self.structured_output = None\n    super().unload()\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.agenerate","title":"agenerate(input, num_generations=1, max_new_tokens=128, frequency_penalty=0.0, presence_penalty=0.0, temperature=1.0, top_p=1.0, stop=None, response_format=None) async","text":"

Generates num_generations responses for the given input using the OpenAI async client.

    Args:\n        input: a single input in chat format to generate responses for.\n        num_generations: the number of generations to create per input. Defaults to\n            `1`.\n        max_new_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `128`.\n        frequency_penalty: the repetition penalty to use for the generation. Defaults\n            to `0.0`.\n        presence_penalty: the presence penalty to use for the generation. Defaults to\n            `0.0`.\n        temperature: the temperature to use for the generation. Defaults to `1.0`.\n        top_p: the top-p value to use for the generation. Defaults to `1.0`.\n        stop: a string or a list of strings to use as a stop sequence for the generation.\n            Defaults to `None`.\n        response_format: the format of the response to return. Must be one of\n            \"text\" or \"json\". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)\n            for more information on how to use the JSON mode from OpenAI. Defaults to None\n            which returns text. To return JSON, use {\"type\": \"json_object\"}.\n

)

    Note:\n        If response_format\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n
Source code in src/distilabel/models/llms/openai.py
@validate_call\nasync def agenerate(  # type: ignore\n    self,\n    input: FormattedInput,\n    num_generations: int = 1,\n    max_new_tokens: int = 128,\n    frequency_penalty: float = 0.0,\n    presence_penalty: float = 0.0,\n    temperature: float = 1.0,\n    top_p: float = 1.0,\n    stop: Optional[Union[str, List[str]]] = None,\n    response_format: Optional[Dict[str, str]] = None,\n) -> GenerateOutput:\n    \"\"\"Generates `num_generations` responses for the given input using the OpenAI async\n            client.\n\n            Args:\n                input: a single input in chat format to generate responses for.\n                num_generations: the number of generations to create per input. Defaults to\n                    `1`.\n                max_new_tokens: the maximum number of new tokens that the model will generate.\n                    Defaults to `128`.\n                frequency_penalty: the repetition penalty to use for the generation. Defaults\n                    to `0.0`.\n                presence_penalty: the presence penalty to use for the generation. Defaults to\n                    `0.0`.\n                temperature: the temperature to use for the generation. Defaults to `0.1`.\n                top_p: the top-p value to use for the generation. Defaults to `1.0`.\n                stop: a string or a list of strings to use as a stop sequence for the generation.\n                    Defaults to `None`.\n                response_format: the format of the response to return. Must be one of\n                    \"text\" or \"json\". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)\n                    for more information on how to use the JSON model from OpenAI. Defaults to None\n                    which returns text. 
To return JSON, use {\"type\": \"json_object\"}.\n    )\n\n            Note:\n                If response_format\n\n            Returns:\n                A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n\n    structured_output = None\n    if isinstance(input, tuple):\n        input, structured_output = input\n        result = self._prepare_structured_output(\n            structured_output=structured_output,  # type: ignore\n            client=self._aclient,\n            framework=\"openai\",\n        )\n        self._aclient = result.get(\"client\")  # type: ignore\n\n    if structured_output is None and self.structured_output is not None:\n        structured_output = self.structured_output\n\n    kwargs = {\n        \"messages\": input,  # type: ignore\n        \"model\": self.model,\n        \"max_tokens\": max_new_tokens,\n        \"n\": num_generations,\n        \"frequency_penalty\": frequency_penalty,\n        \"presence_penalty\": presence_penalty,\n        \"temperature\": temperature,\n        \"top_p\": top_p,\n        \"stop\": stop,\n    }\n\n    if response_format is not None:\n        kwargs[\"response_format\"] = response_format\n\n    if structured_output:\n        kwargs = self._prepare_kwargs(kwargs, structured_output)  # type: ignore\n\n    completion = await self._aclient.chat.completions.create(**kwargs)  # type: ignore\n\n    if structured_output:\n        return self._generations_from_structured_output(completion)\n\n    return self._generations_from_openai_completion(completion)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._generations_from_structured_output","title":"_generations_from_structured_output(completion)","text":"

Get the generations from the structured output object.

Parameters:

Name Type Description Default completion BaseModel

an instance of pydantic.BaseModel with the content of the structured output.

required

Returns:

Type Description GenerateOutput

A list with the content of the structured output.

Source code in src/distilabel/models/llms/openai.py
def _generations_from_structured_output(\n    self, completion: \"BaseModel\"\n) -> \"GenerateOutput\":\n    \"\"\"Get the generations from the structured output object.\n\n    Args:\n        completion: an instance of `pydantic.BaseModel` with the content of the structuted\n            output.\n\n    Returns:\n        A list with the content of the structured output.\n    \"\"\"\n    return [completion.model_dump_json()]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._generations_from_openai_completion","title":"_generations_from_openai_completion(completion)","text":"

Get the generations from the OpenAI Chat Completion object.

Parameters:

Name Type Description Default completion ChatCompletion

the completion object to get the generations from.

required

Returns:

Type Description GenerateOutput

A list of strings containing the generated responses for the input.

Source code in src/distilabel/models/llms/openai.py
def _generations_from_openai_completion(\n    self, completion: \"OpenAIChatCompletion\"\n) -> \"GenerateOutput\":\n    \"\"\"Get the generations from the OpenAI Chat Completion object.\n\n    Args:\n        completion: the completion object to get the generations from.\n\n    Returns:\n        A list of strings containing the generated responses for the input.\n    \"\"\"\n    generations = []\n    for choice in completion.choices:\n        if (content := choice.message.content) is None:\n            self._logger.warning(  # type: ignore\n                f\"Received no response using OpenAI client (model: '{self.model}').\"\n                f\" Finish reason was: {choice.finish_reason}\"\n            )\n        generations.append(content)\n    return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.offline_batch_generate","title":"offline_batch_generate(inputs=None, num_generations=1, max_new_tokens=128, frequency_penalty=0.0, presence_penalty=0.0, temperature=1.0, top_p=1.0, stop=None, response_format=None, **kwargs)","text":"

Uses the OpenAI batch API to generate num_generations responses for the given inputs.

Parameters:

Name Type Description Default inputs Union[List[FormattedInput], None]

a list of inputs in chat format to generate responses for.

None num_generations int

the number of generations to create per input. Defaults to 1.

1 max_new_tokens int

the maximum number of new tokens that the model will generate. Defaults to 128.

128 frequency_penalty float

the repetition penalty to use for the generation. Defaults to 0.0.

0.0 presence_penalty float

the presence penalty to use for the generation. Defaults to 0.0.

0.0 temperature float

the temperature to use for the generation. Defaults to 1.0.

1.0 top_p float

the top-p value to use for the generation. Defaults to 1.0.

1.0 stop Optional[Union[str, List[str]]]

a string or a list of strings to use as a stop sequence for the generation. Defaults to None.

None response_format Optional[str]

the format of the response to return. Must be one of \"text\" or \"json\". Read the documentation here for more information on how to use the JSON mode from OpenAI. Defaults to text.

None

Returns:

Type Description List[GenerateOutput]

A list of lists of strings containing the generated responses for each input

List[GenerateOutput]

in inputs.

Raises:

Type Description DistilabelOfflineBatchGenerationNotFinishedException

if the batch generation is not finished yet.

ValueError

if no job IDs were found to retrieve the results from.

Source code in src/distilabel/models/llms/openai.py
def offline_batch_generate(\n    self,\n    inputs: Union[List[\"FormattedInput\"], None] = None,\n    num_generations: int = 1,\n    max_new_tokens: int = 128,\n    frequency_penalty: float = 0.0,\n    presence_penalty: float = 0.0,\n    temperature: float = 1.0,\n    top_p: float = 1.0,\n    stop: Optional[Union[str, List[str]]] = None,\n    response_format: Optional[str] = None,\n    **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n    \"\"\"Uses the OpenAI batch API to generate `num_generations` responses for the given\n    inputs.\n\n    Args:\n        inputs: a list of inputs in chat format to generate responses for.\n        num_generations: the number of generations to create per input. Defaults to\n            `1`.\n        max_new_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `128`.\n        frequency_penalty: the repetition penalty to use for the generation. Defaults\n            to `0.0`.\n        presence_penalty: the presence penalty to use for the generation. Defaults to\n            `0.0`.\n        temperature: the temperature to use for the generation. Defaults to `0.1`.\n        top_p: the top-p value to use for the generation. Defaults to `1.0`.\n        stop: a string or a list of strings to use as a stop sequence for the generation.\n            Defaults to `None`.\n        response_format: the format of the response to return. Must be one of\n            \"text\" or \"json\". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)\n            for more information on how to use the JSON model from OpenAI. 
Defaults to `text`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input\n        in `inputs`.\n\n    Raises:\n        DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation\n            is not finished yet.\n        ValueError: if no job IDs were found to retrieve the results from.\n    \"\"\"\n    if self.jobs_ids:\n        return self._check_and_get_batch_results()\n\n    if inputs:\n        self.jobs_ids = self._create_jobs(\n            inputs=inputs,\n            **{\n                \"model\": self.model,\n                \"max_tokens\": max_new_tokens,\n                \"n\": num_generations,\n                \"frequency_penalty\": frequency_penalty,\n                \"presence_penalty\": presence_penalty,\n                \"temperature\": temperature,\n                \"top_p\": top_p,\n                \"stop\": stop,\n                \"response_format\": response_format,\n            },\n        )\n        raise DistilabelOfflineBatchGenerationNotFinishedException(\n            jobs_ids=self.jobs_ids\n        )\n\n    raise ValueError(\"No `inputs` were provided and no `jobs_ids` were found.\")\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._check_and_get_batch_results","title":"_check_and_get_batch_results()","text":"

Checks the status of the batch jobs and retrieves the results from the OpenAI Batch API.

Returns:

Type Description List[GenerateOutput]

A list of lists of strings containing the generated responses for each input.

Raises:

Type Description ValueError

if no job IDs were found to retrieve the results from.

DistilabelOfflineBatchGenerationNotFinishedException

if the batch generation is not finished yet.

RuntimeError

if the only batch job found failed.

Source code in src/distilabel/models/llms/openai.py
def _check_and_get_batch_results(self) -> List[\"GenerateOutput\"]:\n    \"\"\"Checks the status of the batch jobs and retrieves the results from the OpenAI\n    Batch API.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n\n    Raises:\n        ValueError: if no job IDs were found to retrieve the results from.\n        DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation\n            is not finished yet.\n        RuntimeError: if the only batch job found failed.\n    \"\"\"\n    if not self.jobs_ids:\n        raise ValueError(\"No job IDs were found to retrieve the results from.\")\n\n    outputs = []\n    for batch_id in self.jobs_ids:\n        batch = self._get_openai_batch(batch_id)\n\n        if batch.status in (\"validating\", \"in_progress\", \"finalizing\"):\n            raise DistilabelOfflineBatchGenerationNotFinishedException(\n                jobs_ids=self.jobs_ids\n            )\n\n        if batch.status in (\"failed\", \"expired\", \"cancelled\", \"cancelling\"):\n            self._logger.error(  # type: ignore\n                f\"OpenAI API batch with ID '{batch_id}' failed with status '{batch.status}'.\"\n            )\n            if len(self.jobs_ids) == 1:\n                self.jobs_ids = None\n                raise RuntimeError(\n                    f\"The only OpenAI API Batch that was created with ID '{batch_id}'\"\n                    f\" failed with status '{batch.status}'.\"\n                )\n\n            continue\n\n        outputs.extend(self._retrieve_batch_results(batch))\n\n    # sort by `custom_id` to return the results in the same order as the inputs\n    outputs = sorted(outputs, key=lambda x: int(x[\"custom_id\"]))\n    return [self._parse_output(output) for output in outputs]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._parse_output","title":"_parse_output(output)","text":"

Parses the output from the OpenAI Batch API into a list of strings.

Parameters:

Name Type Description Default output Dict[str, Any]

the output to parse.

required

Returns:

Type Description GenerateOutput

A list of strings containing the generated responses for the input.

Source code in src/distilabel/models/llms/openai.py
def _parse_output(self, output: Dict[str, Any]) -> \"GenerateOutput\":\n    \"\"\"Parses the output from the OpenAI Batch API into a list of strings.\n\n    Args:\n        output: the output to parse.\n\n    Returns:\n        A list of strings containing the generated responses for the input.\n    \"\"\"\n    from openai.types.chat import ChatCompletion as OpenAIChatCompletion\n\n    if \"response\" not in output:\n        return []\n\n    if output[\"response\"][\"status_code\"] != 200:\n        return []\n\n    return self._generations_from_openai_completion(\n        OpenAIChatCompletion(**output[\"response\"][\"body\"])\n    )\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._get_openai_batch","title":"_get_openai_batch(batch_id)","text":"

Gets a batch from the OpenAI Batch API.

Parameters:

Name Type Description Default batch_id str

the ID of the batch to retrieve.

required

Returns:

Type Description Batch

The batch retrieved from the OpenAI Batch API.

Raises:

Type Description OpenAIError

if there was an error while retrieving the batch from the OpenAI Batch API.

Source code in src/distilabel/models/llms/openai.py
def _get_openai_batch(self, batch_id: str) -> \"OpenAIBatch\":\n    \"\"\"Gets a batch from the OpenAI Batch API.\n\n    Args:\n        batch_id: the ID of the batch to retrieve.\n\n    Returns:\n        The batch retrieved from the OpenAI Batch API.\n\n    Raises:\n        openai.OpenAIError: if there was an error while retrieving the batch from the\n            OpenAI Batch API.\n    \"\"\"\n    import openai\n\n    try:\n        return self._client.batches.retrieve(batch_id)\n    except openai.OpenAIError as e:\n        self._logger.error(  # type: ignore\n            f\"Error while retrieving batch '{batch_id}' from OpenAI: {e}\"\n        )\n        raise e\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._retrieve_batch_results","title":"_retrieve_batch_results(batch)","text":"

Retrieves the results of a batch from its output file, parsing the JSONL content into a list of dictionaries.

Parameters:

Name Type Description Default batch Batch

the batch to retrieve the results from.

required

Returns:

Type Description List[Dict[str, Any]]

A list of dictionaries containing the results of the batch.

Raises:

Type Description AssertionError

if no output file ID was found in the batch.

Source code in src/distilabel/models/llms/openai.py
def _retrieve_batch_results(self, batch: \"OpenAIBatch\") -> List[Dict[str, Any]]:\n    \"\"\"Retrieves the results of a batch from its output file, parsing the JSONL content\n    into a list of dictionaries.\n\n    Args:\n        batch: the batch to retrieve the results from.\n\n    Returns:\n        A list of dictionaries containing the results of the batch.\n\n    Raises:\n        AssertionError: if no output file ID was found in the batch.\n    \"\"\"\n    import openai\n\n    assert batch.output_file_id, \"No output file ID was found in the batch.\"\n\n    try:\n        file_response = self._client.files.content(batch.output_file_id)\n        return [orjson.loads(line) for line in file_response.text.splitlines()]\n    except openai.OpenAIError as e:\n        self._logger.error(  # type: ignore\n            f\"Error while retrieving batch results from file '{batch.output_file_id}': {e}\"\n        )\n        return []\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_jobs","title":"_create_jobs(inputs, **kwargs)","text":"

Creates jobs in the OpenAI Batch API to generate responses for the given inputs.

Parameters:

Name Type Description Default inputs List[FormattedInput]

a list of inputs in chat format to generate responses for.

required kwargs Any

the keyword arguments to use for the generation.

{}

Returns:

Type Description Tuple[str, ...]

A tuple of job IDs created in the OpenAI Batch API.

Source code in src/distilabel/models/llms/openai.py
def _create_jobs(\n    self, inputs: List[\"FormattedInput\"], **kwargs: Any\n) -> Tuple[str, ...]:\n    \"\"\"Creates jobs in the OpenAI Batch API to generate responses for the given inputs.\n\n    Args:\n        inputs: a list of inputs in chat format to generate responses for.\n        kwargs: the keyword arguments to use for the generation.\n\n    Returns:\n        A list of job IDs created in the OpenAI Batch API.\n    \"\"\"\n    batch_input_files = self._create_batch_files(inputs=inputs, **kwargs)\n    jobs = []\n    for batch_input_file in batch_input_files:\n        if batch := self._create_batch_api_job(batch_input_file):\n            jobs.append(batch.id)\n    return tuple(jobs)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_batch_api_job","title":"_create_batch_api_job(batch_input_file)","text":"

Creates a job in the OpenAI Batch API to generate responses for the given input file.

Parameters:

Name Type Description Default batch_input_file FileObject

the input file to generate responses for.

required

Returns:

Type Description Union[Batch, None]

The batch job created in the OpenAI Batch API.

Source code in src/distilabel/models/llms/openai.py
def _create_batch_api_job(\n    self, batch_input_file: \"OpenAIFileObject\"\n) -> Union[\"OpenAIBatch\", None]:\n    \"\"\"Creates a job in the OpenAI Batch API to generate responses for the given input\n    file.\n\n    Args:\n        batch_input_file: the input file to generate responses for.\n\n    Returns:\n        The batch job created in the OpenAI Batch API.\n    \"\"\"\n    import openai\n\n    metadata = {\"description\": \"distilabel\"}\n\n    if distilabel_pipeline_name := envs.DISTILABEL_PIPELINE_NAME:\n        metadata[\"distilabel_pipeline_name\"] = distilabel_pipeline_name\n\n    if distilabel_pipeline_cache_id := envs.DISTILABEL_PIPELINE_CACHE_ID:\n        metadata[\"distilabel_pipeline_cache_id\"] = distilabel_pipeline_cache_id\n\n    batch = None\n    try:\n        batch = self._client.batches.create(\n            completion_window=\"24h\",\n            endpoint=\"/v1/chat/completions\",\n            input_file_id=batch_input_file.id,\n            metadata=metadata,\n        )\n    except openai.OpenAIError as e:\n        self._logger.error(  # type: ignore\n            f\"Error while creating OpenAI Batch API job for file with ID\"\n            f\" '{batch_input_file.id}': {e}.\"\n        )\n        raise e\n    return batch\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_batch_files","title":"_create_batch_files(inputs, **kwargs)","text":"

Creates the necessary input files for the batch API to generate responses. The maximum size of each file so the OpenAI Batch API can process it is 100MB, so we need to split the inputs into multiple files if necessary.

More information: https://platform.openai.com/docs/api-reference/files/create

Parameters:

Name Type Description Default inputs List[FormattedInput]

a list of inputs in chat format to generate responses for, optionally including structured output.

required kwargs Any

the keyword arguments to use for the generation.

{}

Returns:

Type Description List[FileObject]

The list of file objects created for the OpenAI Batch API.

Raises:

Type Description OpenAIError

if there was an error while creating the batch input file in the OpenAI Batch API.

Source code in src/distilabel/models/llms/openai.py
def _create_batch_files(\n    self, inputs: List[\"FormattedInput\"], **kwargs: Any\n) -> List[\"OpenAIFileObject\"]:\n    \"\"\"Creates the necessary input files for the batch API to generate responses. The\n    maximum size of each file so the OpenAI Batch API can process it is 100MB, so we\n    need to split the inputs into multiple files if necessary.\n\n    More information: https://platform.openai.com/docs/api-reference/files/create\n\n    Args:\n        inputs: a list of inputs in chat format to generate responses for, optionally\n            including structured output.\n        kwargs: the keyword arguments to use for the generation.\n\n    Returns:\n        The list of file objects created for the OpenAI Batch API.\n\n    Raises:\n        openai.OpenAIError: if there was an error while creating the batch input file\n            in the OpenAI Batch API.\n    \"\"\"\n    import openai\n\n    files = []\n    for file_no, buffer in enumerate(\n        self._create_jsonl_buffers(inputs=inputs, **kwargs)\n    ):\n        try:\n            # TODO: add distilabel pipeline name and id\n            batch_input_file = self._client.files.create(\n                file=(self._name_for_openai_files(file_no), buffer),\n                purpose=\"batch\",\n            )\n            files.append(batch_input_file)\n        except openai.OpenAIError as e:\n            self._logger.error(  # type: ignore\n                f\"Error while creating OpenAI batch input file: {e}\"\n            )\n            raise e\n    return files\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_jsonl_buffers","title":"_create_jsonl_buffers(inputs, **kwargs)","text":"

Creates a generator of buffers containing the JSONL formatted inputs to be used by the OpenAI Batch API. The buffers created are of size 100MB or less.

Parameters:

Name Type Description Default inputs List[FormattedInput]

a list of inputs in chat format to generate responses for, optionally including structured output.

required kwargs Any

the keyword arguments to use for the generation.

{}

Yields:

Type Description BytesIO

A buffer containing the JSONL formatted inputs to be used by the OpenAI Batch

BytesIO

API.

Source code in src/distilabel/models/llms/openai.py
def _create_jsonl_buffers(\n    self, inputs: List[\"FormattedInput\"], **kwargs: Any\n) -> Generator[io.BytesIO, None, None]:\n    \"\"\"Creates a generator of buffers containing the JSONL formatted inputs to be\n    used by the OpenAI Batch API. The buffers created are of size 100MB or less.\n\n    Args:\n        inputs: a list of inputs in chat format to generate responses for, optionally\n            including structured output.\n        kwargs: the keyword arguments to use for the generation.\n\n    Yields:\n        A buffer containing the JSONL formatted inputs to be used by the OpenAI Batch\n        API.\n    \"\"\"\n    buffer = io.BytesIO()\n    buffer_current_size = 0\n    for i, input in enumerate(inputs):\n        # We create the smallest `custom_id` so we don't  increase the size of the file\n        # to much, but we can still sort the results with the order of the inputs.\n        row = self._create_jsonl_row(input=input, custom_id=str(i), **kwargs)\n        row_size = len(row)\n        if row_size + buffer_current_size > _OPENAI_BATCH_API_MAX_FILE_SIZE:\n            buffer.seek(0)\n            yield buffer\n            buffer = io.BytesIO()\n            buffer_current_size = 0\n        buffer.write(row)\n        buffer_current_size += row_size\n\n    if buffer_current_size > 0:\n        buffer.seek(0)\n        yield buffer\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_jsonl_row","title":"_create_jsonl_row(input, custom_id, **kwargs)","text":"

Creates a JSONL formatted row to be used by the OpenAI Batch API.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate a response for, optionally including structured output.

required custom_id str

a custom ID to use for the row.

required kwargs Any

the keyword arguments to use for the generation.

{}

Returns:

Type Description bytes

A JSONL formatted row to be used by the OpenAI Batch API.

Source code in src/distilabel/models/llms/openai.py
def _create_jsonl_row(\n    self, input: \"FormattedInput\", custom_id: str, **kwargs: Any\n) -> bytes:\n    \"\"\"Creates a JSONL formatted row to be used by the OpenAI Batch API.\n\n    Args:\n        input: a list of inputs in chat format to generate responses for, optionally\n            including structured output.\n        custom_id: a custom ID to use for the row.\n        kwargs: the keyword arguments to use for the generation.\n\n    Returns:\n        A JSONL formatted row to be used by the OpenAI Batch API.\n    \"\"\"\n    # TODO: depending on the format of the input, add `response_format` to the kwargs\n    row = {\n        \"custom_id\": custom_id,\n        \"method\": \"POST\",\n        \"url\": \"/v1/chat/completions\",\n        \"body\": {\"messages\": input, **kwargs},\n    }\n    json_row = orjson.dumps(row)\n    return json_row + b\"\\n\"\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TogetherLLM","title":"TogetherLLM","text":"

Bases: OpenAILLM

TogetherLLM LLM implementation running the async API client of OpenAI.

Attributes:

Name Type Description model

the model name to use for the LLM e.g. \"mistralai/Mixtral-8x7B-Instruct-v0.1\". Supported models can be found here.

base_url Optional[RuntimeParameter[str]]

the base URL to use for the Together API requests; it can be set with TOGETHER_BASE_URL. Defaults to None, which means that the value set for the environment variable TOGETHER_BASE_URL will be used, or \"https://api.together.xyz/v1\" if not set.

api_key Optional[RuntimeParameter[SecretStr]]

the API key to authenticate the requests to the Together API. Defaults to None which means that the value set for the environment variable TOGETHER_API_KEY will be used, or None if not set.

_api_key_env_var str

the name of the environment variable to use for the API key. It is meant to be used internally.

Examples:

Generate text:

from distilabel.models.llms import TogetherLLM\n\nllm = TogetherLLM(model=\"mistralai/Mixtral-8x7B-Instruct-v0.1\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
Source code in src/distilabel/models/llms/together.py
class TogetherLLM(OpenAILLM):\n    \"\"\"TogetherLLM LLM implementation running the async API client of OpenAI.\n\n    Attributes:\n        model: the model name to use for the LLM e.g. \"mistralai/Mixtral-8x7B-Instruct-v0.1\".\n            Supported models can be found [here](https://api.together.xyz/models).\n        base_url: the base URL to use for the Together API can be set with `TOGETHER_BASE_URL`.\n            Defaults to `None` which means that the value set for the environment variable\n            `TOGETHER_BASE_URL` will be used, or \"https://api.together.xyz/v1\" if not set.\n        api_key: the API key to authenticate the requests to the Together API. Defaults to `None`\n            which means that the value set for the environment variable `TOGETHER_API_KEY` will be\n            used, or `None` if not set.\n        _api_key_env_var: the name of the environment variable to use for the API key. It\n            is meant to be used internally.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import AnyscaleLLM\n\n        llm = TogetherLLM(model=\"mistralai/Mixtral-8x7B-Instruct-v0.1\", api_key=\"api.key\")\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n    \"\"\"\n\n    base_url: Optional[RuntimeParameter[str]] = Field(\n        default_factory=lambda: os.getenv(\n            \"TOGETHER_BASE_URL\", \"https://api.together.xyz/v1\"\n        ),\n        description=\"The base URL to use for the Together API requests.\",\n    )\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(_TOGETHER_API_KEY_ENV_VAR_NAME),\n        description=\"The API key to authenticate the requests to the Together API.\",\n    )\n\n    _api_key_env_var: str = PrivateAttr(_TOGETHER_API_KEY_ENV_VAR_NAME)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM","title":"VertexAILLM","text":"

Bases: AsyncLLM

VertexAI LLM implementation running the async API clients for Gemini.

  • Gemini API: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini

To use the VertexAILLM, it is necessary to have configured Google Cloud authentication using one of these methods:

  • Setting the GOOGLE_CLOUD_CREDENTIALS environment variable
  • Using gcloud auth application-default login command
  • Using vertexai.init function from the google-cloud-aiplatform library

Attributes:

Name Type Description model str

the model name to use for the LLM e.g. \"gemini-1.0-pro\". Supported models.

_aclient Optional[GenerativeModel]

the GenerativeModel to use for the Vertex AI Gemini API. It is meant to be used internally. Set in the load method.

Icon

:simple-googlecloud:

Examples:

Generate text:

from distilabel.models.llms import VertexAILLM\n\nllm = VertexAILLM(model=\"gemini-1.5-pro\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
Source code in src/distilabel/models/llms/vertexai.py
class VertexAILLM(AsyncLLM):\n    \"\"\"VertexAI LLM implementation running the async API clients for Gemini.\n\n    - Gemini API: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini\n\n    To use the `VertexAILLM` is necessary to have configured the Google Cloud authentication\n    using one of these methods:\n\n    - Setting `GOOGLE_CLOUD_CREDENTIALS` environment variable\n    - Using `gcloud auth application-default login` command\n    - Using `vertexai.init` function from the `google-cloud-aiplatform` library\n\n    Attributes:\n        model: the model name to use for the LLM e.g. \"gemini-1.0-pro\". [Supported models](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models).\n        _aclient: the `GenerativeModel` to use for the Vertex AI Gemini API. It is meant\n            to be used internally. Set in the `load` method.\n\n    Icon:\n        `:simple-googlecloud:`\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import VertexAILLM\n\n        llm = VertexAILLM(model=\"gemini-1.5-pro\")\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n    \"\"\"\n\n    model: str\n\n    _num_generations_param_supported = False\n\n    _aclient: Optional[\"GenerativeModel\"] = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"Loads the `GenerativeModel` class which has access to `generate_content_async` to benefit from async requests.\"\"\"\n        super().load()\n\n        try:\n            from vertexai.generative_models import GenerationConfig, GenerativeModel\n\n            self._generation_config_class = GenerationConfig\n        except ImportError as e:\n            raise ImportError(\n                \"vertexai is not installed. 
Please install it using\"\n                \" `pip install google-cloud-aiplatform`.\"\n            ) from e\n\n        if _is_gemini_model(self.model):\n            self._aclient = GenerativeModel(model_name=self.model)\n        else:\n            raise NotImplementedError(\n                \"`VertexAILLM` is only implemented for `gemini` models that allow for `ChatType` data.\"\n            )\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    def _chattype_to_content(self, input: \"StandardInput\") -> List[\"Content\"]:\n        \"\"\"Converts a chat type to a list of content items expected by the API.\n\n        Args:\n            input: the chat type to be converted.\n\n        Returns:\n            List[str]: a list of content items expected by the API.\n        \"\"\"\n        from vertexai.generative_models import Content, Part\n\n        contents = []\n        for message in input:\n            if message[\"role\"] not in [\"user\", \"model\"]:\n                raise ValueError(\n                    \"`VertexAILLM only supports the roles 'user' or 'model'.\"\n                )\n            contents.append(\n                Content(\n                    role=message[\"role\"], parts=[Part.from_text(message[\"content\"])]\n                )\n            )\n        return contents\n\n    @validate_call\n    async def agenerate(  # type: ignore\n        self,\n        input: StandardInput,\n        temperature: Optional[float] = None,\n        top_p: Optional[float] = None,\n        top_k: Optional[int] = None,\n        max_output_tokens: Optional[int] = None,\n        stop_sequences: Optional[List[str]] = None,\n        safety_settings: Optional[Dict[str, Any]] = None,\n        tools: Optional[List[Dict[str, Any]]] = None,\n    ) -> GenerateOutput:\n        \"\"\"Generates `num_generations` responses for the given input using the [VertexAI async client 
definition](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini).\n\n        Args:\n            input: a single input in chat format to generate responses for.\n            temperature: Controls the randomness of predictions. Range: [0.0, 1.0]. Defaults to `None`.\n            top_p: If specified, nucleus sampling will be used. Range: (0.0, 1.0]. Defaults to `None`.\n            top_k: If specified, top-k sampling will be used. Defaults to `None`.\n            max_output_tokens: The maximum number of output tokens to generate per message. Defaults to `None`.\n            stop_sequences: A list of stop sequences. Defaults to `None`.\n            safety_settings: Safety configuration for returned content from the API. Defaults to `None`.\n            tools: A potential list of tools that can be used by the API. Defaults to `None`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n        from vertexai.generative_models import GenerationConfig\n\n        content: \"GenerationResponse\" = await self._aclient.generate_content_async(  # type: ignore\n            contents=self._chattype_to_content(input),\n            generation_config=GenerationConfig(\n                candidate_count=1,  # only one candidate allowed per call\n                temperature=temperature,\n                top_k=top_k,\n                top_p=top_p,\n                max_output_tokens=max_output_tokens,\n                stop_sequences=stop_sequences,\n            ),\n            safety_settings=safety_settings,  # type: ignore\n            tools=tools,  # type: ignore\n            stream=False,\n        )\n\n        text = None\n        try:\n            text = content.candidates[0].text\n        except ValueError:\n            self._logger.warning(  # type: ignore\n                f\"Received no response using VertexAI client (model: '{self.model}').\"\n                f\" Finish reason was: 
'{content.candidates[0].finish_reason}'.\"\n            )\n\n        return [text]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM.load","title":"load()","text":"

Loads the GenerativeModel class which has access to generate_content_async to benefit from async requests.

Source code in src/distilabel/models/llms/vertexai.py
def load(self) -> None:\n    \"\"\"Loads the `GenerativeModel` class which has access to `generate_content_async` to benefit from async requests.\"\"\"\n    super().load()\n\n    try:\n        from vertexai.generative_models import GenerationConfig, GenerativeModel\n\n        self._generation_config_class = GenerationConfig\n    except ImportError as e:\n        raise ImportError(\n            \"vertexai is not installed. Please install it using\"\n            \" `pip install google-cloud-aiplatform`.\"\n        ) from e\n\n    if _is_gemini_model(self.model):\n        self._aclient = GenerativeModel(model_name=self.model)\n    else:\n        raise NotImplementedError(\n            \"`VertexAILLM` is only implemented for `gemini` models that allow for `ChatType` data.\"\n        )\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM._chattype_to_content","title":"_chattype_to_content(input)","text":"

Converts a chat type to a list of content items expected by the API.

Parameters:

Name Type Description Default input StandardInput

the chat type to be converted.

required

Returns:

Type Description List[Content]

List[Content]: a list of content items expected by the API.

Source code in src/distilabel/models/llms/vertexai.py
def _chattype_to_content(self, input: \"StandardInput\") -> List[\"Content\"]:\n    \"\"\"Converts a chat type to a list of content items expected by the API.\n\n    Args:\n        input: the chat type to be converted.\n\n    Returns:\n        List[str]: a list of content items expected by the API.\n    \"\"\"\n    from vertexai.generative_models import Content, Part\n\n    contents = []\n    for message in input:\n        if message[\"role\"] not in [\"user\", \"model\"]:\n            raise ValueError(\n                \"`VertexAILLM only supports the roles 'user' or 'model'.\"\n            )\n        contents.append(\n            Content(\n                role=message[\"role\"], parts=[Part.from_text(message[\"content\"])]\n            )\n        )\n    return contents\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM.agenerate","title":"agenerate(input, temperature=None, top_p=None, top_k=None, max_output_tokens=None, stop_sequences=None, safety_settings=None, tools=None) async","text":"

Generates a single response for the given input using the VertexAI async client definition.

Parameters:

Name Type Description Default input StandardInput

a single input in chat format to generate responses for.

required temperature Optional[float]

Controls the randomness of predictions. Range: [0.0, 1.0]. Defaults to None.

None top_p Optional[float]

If specified, nucleus sampling will be used. Range: (0.0, 1.0]. Defaults to None.

None top_k Optional[int]

If specified, top-k sampling will be used. Defaults to None.

None max_output_tokens Optional[int]

The maximum number of output tokens to generate per message. Defaults to None.

None stop_sequences Optional[List[str]]

A list of stop sequences. Defaults to None.

None safety_settings Optional[Dict[str, Any]]

Safety configuration for returned content from the API. Defaults to None.

None tools Optional[List[Dict[str, Any]]]

A potential list of tools that can be used by the API. Defaults to None.

None

Returns:

Type Description GenerateOutput

A list containing the generated response for the given input (None if the model returned no text).

Source code in src/distilabel/models/llms/vertexai.py
@validate_call\nasync def agenerate(  # type: ignore\n    self,\n    input: StandardInput,\n    temperature: Optional[float] = None,\n    top_p: Optional[float] = None,\n    top_k: Optional[int] = None,\n    max_output_tokens: Optional[int] = None,\n    stop_sequences: Optional[List[str]] = None,\n    safety_settings: Optional[Dict[str, Any]] = None,\n    tools: Optional[List[Dict[str, Any]]] = None,\n) -> GenerateOutput:\n    \"\"\"Generates `num_generations` responses for the given input using the [VertexAI async client definition](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini).\n\n    Args:\n        input: a single input in chat format to generate responses for.\n        temperature: Controls the randomness of predictions. Range: [0.0, 1.0]. Defaults to `None`.\n        top_p: If specified, nucleus sampling will be used. Range: (0.0, 1.0]. Defaults to `None`.\n        top_k: If specified, top-k sampling will be used. Defaults to `None`.\n        max_output_tokens: The maximum number of output tokens to generate per message. Defaults to `None`.\n        stop_sequences: A list of stop sequences. Defaults to `None`.\n        safety_settings: Safety configuration for returned content from the API. Defaults to `None`.\n        tools: A potential list of tools that can be used by the API. 
Defaults to `None`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n    from vertexai.generative_models import GenerationConfig\n\n    content: \"GenerationResponse\" = await self._aclient.generate_content_async(  # type: ignore\n        contents=self._chattype_to_content(input),\n        generation_config=GenerationConfig(\n            candidate_count=1,  # only one candidate allowed per call\n            temperature=temperature,\n            top_k=top_k,\n            top_p=top_p,\n            max_output_tokens=max_output_tokens,\n            stop_sequences=stop_sequences,\n        ),\n        safety_settings=safety_settings,  # type: ignore\n        tools=tools,  # type: ignore\n        stream=False,\n    )\n\n    text = None\n    try:\n        text = content.candidates[0].text\n    except ValueError:\n        self._logger.warning(  # type: ignore\n            f\"Received no response using VertexAI client (model: '{self.model}').\"\n            f\" Finish reason was: '{content.candidates[0].finish_reason}'.\"\n        )\n\n    return [text]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM","title":"ClientvLLM","text":"

Bases: OpenAILLM, MagpieChatTemplateMixin

A client for the vLLM server implementing the OpenAI API specification.

Attributes:

Name Type Description base_url

the base URL of the vLLM server. Defaults to \"http://localhost:8000\".

max_retries

the maximum number of times to retry the request to the API before failing. Defaults to 6.

timeout

the maximum time in seconds to wait for a response from the API. Defaults to 120.

httpx_client_kwargs

extra kwargs that will be passed to the httpx.AsyncClient created to communicate with the vLLM server. Defaults to None.

tokenizer Optional[str]

the Hugging Face Hub repo id or path of the tokenizer that will be used to apply the chat template and tokenize the inputs before sending it to the server. Defaults to None.

tokenizer_revision Optional[str]

the revision of the tokenizer to load. Defaults to None.

_aclient Optional[str]

the httpx.AsyncClient used to communicate with the vLLM server. Defaults to None.

Runtime parameters
  • base_url: the base url of the vLLM server. Defaults to \"http://localhost:8000\".
  • max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6.
  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.
  • httpx_client_kwargs: extra kwargs that will be passed to the httpx.AsyncClient created to communicate with the vLLM server. Defaults to None.

Examples:

Generate text:

from distilabel.models.llms import ClientvLLM\n\nllm = ClientvLLM(\n    base_url=\"http://localhost:8000/v1\",\n    tokenizer=\"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n)\n\nllm.load()\n\nresults = llm.generate_outputs(\n    inputs=[[{\"role\": \"user\", \"content\": \"Hello, how are you?\"}]],\n    temperature=0.7,\n    top_p=1.0,\n    max_new_tokens=256,\n)\n# [\n#     [\n#         \"I'm functioning properly, thank you for asking. How can I assist you today?\",\n#         \"I'm doing well, thank you for asking. I'm a large language model, so I don't have feelings or emotions like humans do, but I'm here to help answer any questions or provide information you might need. How can I assist you today?\",\n#         \"I'm just a computer program, so I don't have feelings like humans do, but I'm functioning properly and ready to help you with any questions or tasks you have. What's on your mind?\"\n#     ]\n# ]\n
Source code in src/distilabel/models/llms/vllm.py
class ClientvLLM(OpenAILLM, MagpieChatTemplateMixin):\n    \"\"\"A client for the `vLLM` server implementing the OpenAI API specification.\n\n    Attributes:\n        base_url: the base URL of the `vLLM` server. Defaults to `\"http://localhost:8000\"`.\n        max_retries: the maximum number of times to retry the request to the API before\n            failing. Defaults to `6`.\n        timeout: the maximum time in seconds to wait for a response from the API. Defaults\n            to `120`.\n        httpx_client_kwargs: extra kwargs that will be passed to the `httpx.AsyncClient`\n            created to comunicate with the `vLLM` server. Defaults to `None`.\n        tokenizer: the Hugging Face Hub repo id or path of the tokenizer that will be used\n            to apply the chat template and tokenize the inputs before sending it to the\n            server. Defaults to `None`.\n        tokenizer_revision: the revision of the tokenizer to load. Defaults to `None`.\n        _aclient: the `httpx.AsyncClient` used to comunicate with the `vLLM` server. Defaults\n            to `None`.\n\n    Runtime parameters:\n        - `base_url`: the base url of the `vLLM` server. Defaults to `\"http://localhost:8000\"`.\n        - `max_retries`: the maximum number of times to retry the request to the API before\n            failing. Defaults to `6`.\n        - `timeout`: the maximum time in seconds to wait for a response from the API. Defaults\n            to `120`.\n        - `httpx_client_kwargs`: extra kwargs that will be passed to the `httpx.AsyncClient`\n            created to comunicate with the `vLLM` server. 
Defaults to `None`.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import ClientvLLM\n\n        llm = ClientvLLM(\n            base_url=\"http://localhost:8000/v1\",\n            tokenizer=\"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n        )\n\n        llm.load()\n\n        results = llm.generate_outputs(\n            inputs=[[{\"role\": \"user\", \"content\": \"Hello, how are you?\"}]],\n            temperature=0.7,\n            top_p=1.0,\n            max_new_tokens=256,\n        )\n        # [\n        #     [\n        #         \"I'm functioning properly, thank you for asking. How can I assist you today?\",\n        #         \"I'm doing well, thank you for asking. I'm a large language model, so I don't have feelings or emotions like humans do, but I'm here to help answer any questions or provide information you might need. How can I assist you today?\",\n        #         \"I'm just a computer program, so I don't have feelings like humans do, but I'm functioning properly and ready to help you with any questions or tasks you have. 
What's on your mind?\"\n        #     ]\n        # ]\n        ```\n    \"\"\"\n\n    model: str = \"\"  # Default value so it's not needed to `ClientvLLM(model=\"...\")`\n    tokenizer: Optional[str] = None\n    tokenizer_revision: Optional[str] = None\n\n    # We need the sync client to get the list of models\n    _client: \"OpenAI\" = PrivateAttr(None)\n    _tokenizer: \"PreTrainedTokenizer\" = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Creates an `httpx.AsyncClient` to connect to the vLLM server and a tokenizer\n        optionally.\"\"\"\n\n        self.api_key = SecretStr(\"EMPTY\")\n\n        # We need to first create the sync client to get the model name that will be used\n        # in the `super().load()` when creating the logger.\n        try:\n            from openai import OpenAI\n        except ImportError as ie:\n            raise ImportError(\n                \"OpenAI Python client is not installed. Please install it using\"\n                \" `pip install openai`.\"\n            ) from ie\n\n        self._client = OpenAI(\n            base_url=self.base_url,\n            api_key=self.api_key.get_secret_value(),  # type: ignore\n            max_retries=self.max_retries,  # type: ignore\n            timeout=self.timeout,\n        )\n\n        super().load()\n\n        try:\n            from transformers import AutoTokenizer\n        except ImportError as ie:\n            raise ImportError(\n                \"To use `ClientvLLM` you need to install `transformers`.\"\n                \"Please install it using `pip install transformers`.\"\n            ) from ie\n\n        self._tokenizer = AutoTokenizer.from_pretrained(\n            self.tokenizer, revision=self.tokenizer_revision\n        )\n\n    @cached_property\n    def model_name(self) -> str:  # type: ignore\n        \"\"\"Returns the name of the model served with vLLM server.\"\"\"\n        models = self._client.models.list()\n        return models.data[0].id\n\n    def 
_prepare_input(self, input: \"StandardInput\") -> str:\n        \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n        input.\n\n        Args:\n            input: the input list containing chat items.\n\n        Returns:\n            The prompt to send to the LLM.\n        \"\"\"\n        prompt: str = (\n            self._tokenizer.apply_chat_template(  # type: ignore\n                input,  # type: ignore\n                tokenize=False,\n                add_generation_prompt=True,  # type: ignore\n            )\n            if input\n            else \"\"\n        )\n        return super().apply_magpie_pre_query_template(prompt, input)\n\n    @validate_call\n    async def agenerate(  # type: ignore\n        self,\n        input: FormattedInput,\n        num_generations: int = 1,\n        max_new_tokens: int = 128,\n        frequency_penalty: float = 0.0,\n        logit_bias: Optional[Dict[str, int]] = None,\n        presence_penalty: float = 0.0,\n        temperature: float = 1.0,\n        top_p: float = 1.0,\n    ) -> GenerateOutput:\n        \"\"\"Generates `num_generations` responses for each input.\n\n        Args:\n            input: a single input in chat format to generate responses for.\n            num_generations: the number of generations to create per input. Defaults to\n                `1`.\n            max_new_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `128`.\n            frequency_penalty: the repetition penalty to use for the generation. Defaults\n                to `0.0`.\n            logit_bias: modify the likelihood of specified tokens appearing in the completion.\n                Defaults to ``\n            presence_penalty: the presence penalty to use for the generation. Defaults to\n                `0.0`.\n            temperature: the temperature to use for the generation. Defaults to `0.1`.\n            top_p: nucleus sampling. 
The value refers to the top-p tokens that should be\n                considered for sampling. Defaults to `1.0`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n\n        completion = await self._aclient.completions.create(\n            model=self.model_name,\n            prompt=self._prepare_input(input),  # type: ignore\n            n=num_generations,\n            max_tokens=max_new_tokens,\n            frequency_penalty=frequency_penalty,\n            logit_bias=logit_bias,\n            presence_penalty=presence_penalty,\n            temperature=temperature,\n            top_p=top_p,\n        )\n\n        generations = []\n        for choice in completion.choices:\n            if (text := choice.text) == \"\":\n                self._logger.warning(  # type: ignore\n                    f\"Received no response from vLLM server (model: '{self.model_name}').\"\n                    f\" Finish reason was: {choice.finish_reason}\"\n                )\n            generations.append(text)\n        return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM.model_name","title":"model_name: str cached property","text":"

Returns the name of the model served with vLLM server.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM.load","title":"load()","text":"

Creates an httpx.AsyncClient to connect to the vLLM server and a tokenizer optionally.

Source code in src/distilabel/models/llms/vllm.py
def load(self) -> None:\n    \"\"\"Creates an `httpx.AsyncClient` to connect to the vLLM server and a tokenizer\n    optionally.\"\"\"\n\n    self.api_key = SecretStr(\"EMPTY\")\n\n    # We need to first create the sync client to get the model name that will be used\n    # in the `super().load()` when creating the logger.\n    try:\n        from openai import OpenAI\n    except ImportError as ie:\n        raise ImportError(\n            \"OpenAI Python client is not installed. Please install it using\"\n            \" `pip install openai`.\"\n        ) from ie\n\n    self._client = OpenAI(\n        base_url=self.base_url,\n        api_key=self.api_key.get_secret_value(),  # type: ignore\n        max_retries=self.max_retries,  # type: ignore\n        timeout=self.timeout,\n    )\n\n    super().load()\n\n    try:\n        from transformers import AutoTokenizer\n    except ImportError as ie:\n        raise ImportError(\n            \"To use `ClientvLLM` you need to install `transformers`.\"\n            \"Please install it using `pip install transformers`.\"\n        ) from ie\n\n    self._tokenizer = AutoTokenizer.from_pretrained(\n        self.tokenizer, revision=self.tokenizer_revision\n    )\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM._prepare_input","title":"_prepare_input(input)","text":"

Prepares the input (applying the chat template and tokenization) for the provided input.

Parameters:

Name Type Description Default input StandardInput

the input list containing chat items.

required

Returns:

Type Description str

The prompt to send to the LLM.

Source code in src/distilabel/models/llms/vllm.py
def _prepare_input(self, input: \"StandardInput\") -> str:\n    \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n    input.\n\n    Args:\n        input: the input list containing chat items.\n\n    Returns:\n        The prompt to send to the LLM.\n    \"\"\"\n    prompt: str = (\n        self._tokenizer.apply_chat_template(  # type: ignore\n            input,  # type: ignore\n            tokenize=False,\n            add_generation_prompt=True,  # type: ignore\n        )\n        if input\n        else \"\"\n    )\n    return super().apply_magpie_pre_query_template(prompt, input)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM.agenerate","title":"agenerate(input, num_generations=1, max_new_tokens=128, frequency_penalty=0.0, logit_bias=None, presence_penalty=0.0, temperature=1.0, top_p=1.0) async","text":"

Generates num_generations responses for each input.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate responses for.

required num_generations int

the number of generations to create per input. Defaults to 1.

1 max_new_tokens int

the maximum number of new tokens that the model will generate. Defaults to 128.

128 frequency_penalty float

the repetition penalty to use for the generation. Defaults to 0.0.

0.0 logit_bias Optional[Dict[str, int]]

modify the likelihood of specified tokens appearing in the completion. Defaults to None.

None presence_penalty float

the presence penalty to use for the generation. Defaults to 0.0.

0.0 temperature float

the temperature to use for the generation. Defaults to 1.0.

1.0 top_p float

nucleus sampling. The value refers to the top-p tokens that should be considered for sampling. Defaults to 1.0.

1.0

Returns:

Type Description GenerateOutput

A list of strings containing the generated responses for the given input.

Source code in src/distilabel/models/llms/vllm.py
@validate_call\nasync def agenerate(  # type: ignore\n    self,\n    input: FormattedInput,\n    num_generations: int = 1,\n    max_new_tokens: int = 128,\n    frequency_penalty: float = 0.0,\n    logit_bias: Optional[Dict[str, int]] = None,\n    presence_penalty: float = 0.0,\n    temperature: float = 1.0,\n    top_p: float = 1.0,\n) -> GenerateOutput:\n    \"\"\"Generates `num_generations` responses for each input.\n\n    Args:\n        input: a single input in chat format to generate responses for.\n        num_generations: the number of generations to create per input. Defaults to\n            `1`.\n        max_new_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `128`.\n        frequency_penalty: the repetition penalty to use for the generation. Defaults\n            to `0.0`.\n        logit_bias: modify the likelihood of specified tokens appearing in the completion.\n            Defaults to ``\n        presence_penalty: the presence penalty to use for the generation. Defaults to\n            `0.0`.\n        temperature: the temperature to use for the generation. Defaults to `0.1`.\n        top_p: nucleus sampling. The value refers to the top-p tokens that should be\n            considered for sampling. 
Defaults to `1.0`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n\n    completion = await self._aclient.completions.create(\n        model=self.model_name,\n        prompt=self._prepare_input(input),  # type: ignore\n        n=num_generations,\n        max_tokens=max_new_tokens,\n        frequency_penalty=frequency_penalty,\n        logit_bias=logit_bias,\n        presence_penalty=presence_penalty,\n        temperature=temperature,\n        top_p=top_p,\n    )\n\n    generations = []\n    for choice in completion.choices:\n        if (text := choice.text) == \"\":\n            self._logger.warning(  # type: ignore\n                f\"Received no response from vLLM server (model: '{self.model_name}').\"\n                f\" Finish reason was: {choice.finish_reason}\"\n            )\n        generations.append(text)\n    return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM","title":"vLLM","text":"

Bases: LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin

vLLM library LLM implementation.

Attributes:

Name Type Description model str

the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

dtype str

the data type to use for the model. Defaults to auto.

trust_remote_code bool

whether to trust the remote code when loading the model. Defaults to False.

quantization Optional[str]

the quantization mode to use for the model. Defaults to None.

revision Optional[str]

the revision of the model to load. Defaults to None.

tokenizer Optional[str]

the tokenizer Hugging Face Hub repo id or a path to a directory containing the tokenizer files. If not provided, the tokenizer will be loaded from the model directory. Defaults to None.

tokenizer_mode Literal['auto', 'slow']

the mode to use for the tokenizer. Defaults to auto.

tokenizer_revision Optional[str]

the revision of the tokenizer to load. Defaults to None.

skip_tokenizer_init bool

whether to skip the initialization of the tokenizer. Defaults to False.

chat_template Optional[str]

a chat template that will be used to build the prompts before sending them to the model. If not provided, the chat template defined in the tokenizer config will be used. If not provided and the tokenizer doesn't have a chat template, then ChatML template will be used. Defaults to None.

structured_output Optional[RuntimeParameter[OutlinesStructuredOutputType]]

a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput. Defaults to None.

seed int

the seed to use for the random number generator. Defaults to 0.

extra_kwargs Optional[RuntimeParameter[Dict[str, Any]]]

additional dictionary of keyword arguments that will be passed to the LLM class of vllm library. Defaults to {}.

_model LLM

the vLLM model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method.

_tokenizer PreTrainedTokenizer

the tokenizer instance used to format the prompt before passing it to the LLM. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method.

use_magpie_template PreTrainedTokenizer

a flag used to enable/disable applying the Magpie pre-query template. Defaults to False.

magpie_pre_query_template PreTrainedTokenizer

the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None.

References
  • https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py
Runtime parameters
  • extra_kwargs: additional dictionary of keyword arguments that will be passed to the LLM class of vllm library.

Examples:

Generate text:

from distilabel.models.llms import vLLM\n\n# You can pass a custom chat_template to the model\nllm = vLLM(\n    model=\"prometheus-eval/prometheus-7b-v2.0\",\n    chat_template=\"[INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST]\",\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pathlib import Path\nfrom pydantic import BaseModel\nfrom distilabel.models.llms import vLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = vLLM(\n    model=\"prometheus-eval/prometheus-7b-v2.0\",\n    structured_output={\"format\": \"json\", \"schema\": User},\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
Source code in src/distilabel/models/llms/vllm.py
class vLLM(LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin):\n    \"\"\"`vLLM` library LLM implementation.\n\n    Attributes:\n        model: the model Hugging Face Hub repo id or a path to a directory containing the\n            model weights and configuration files.\n        dtype: the data type to use for the model. Defaults to `auto`.\n        trust_remote_code: whether to trust the remote code when loading the model. Defaults\n            to `False`.\n        quantization: the quantization mode to use for the model. Defaults to `None`.\n        revision: the revision of the model to load. Defaults to `None`.\n        tokenizer: the tokenizer Hugging Face Hub repo id or a path to a directory containing\n            the tokenizer files. If not provided, the tokenizer will be loaded from the\n            model directory. Defaults to `None`.\n        tokenizer_mode: the mode to use for the tokenizer. Defaults to `auto`.\n        tokenizer_revision: the revision of the tokenizer to load. Defaults to `None`.\n        skip_tokenizer_init: whether to skip the initialization of the tokenizer. Defaults\n            to `False`.\n        chat_template: a chat template that will be used to build the prompts before\n            sending them to the model. If not provided, the chat template defined in the\n            tokenizer config will be used. If not provided and the tokenizer doesn't have\n            a chat template, then ChatML template will be used. Defaults to `None`.\n        structured_output: a dictionary containing the structured output configuration or if more\n            fine-grained control is needed, an instance of `OutlinesStructuredOutput`. Defaults to None.\n        seed: the seed to use for the random number generator. Defaults to `0`.\n        extra_kwargs: additional dictionary of keyword arguments that will be passed to the\n            `LLM` class of `vllm` library. Defaults to `{}`.\n        _model: the `vLLM` model instance. 
This attribute is meant to be used internally\n            and should not be accessed directly. It will be set in the `load` method.\n        _tokenizer: the tokenizer instance used to format the prompt before passing it to\n            the `LLM`. This attribute is meant to be used internally and should not be\n            accessed directly. It will be set in the `load` method.\n        use_magpie_template: a flag used to enable/disable applying the Magpie pre-query\n            template. Defaults to `False`.\n        magpie_pre_query_template: the pre-query template to be applied to the prompt or\n            sent to the LLM to generate an instruction or a follow up user message. Valid\n            values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults\n            to `None`.\n\n    References:\n        - https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py\n\n    Runtime parameters:\n        - `extra_kwargs`: additional dictionary of keyword arguments that will be passed to\n            the `LLM` class of `vllm` library.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import vLLM\n\n        # You can pass a custom chat_template to the model\n        llm = vLLM(\n            model=\"prometheus-eval/prometheus-7b-v2.0\",\n            chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n        )\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Generate structured data:\n\n        ```python\n        from pathlib import Path\n        from distilabel.models.llms import vLLM\n\n        class User(BaseModel):\n            name: str\n            last_name: str\n            id: int\n\n        llm = vLLM(\n            model=\"prometheus-eval/prometheus-7b-v2.0\"\n            
structured_output={\"format\": \"json\", \"schema\": Character},\n        )\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n        ```\n    \"\"\"\n\n    model: str\n    dtype: str = \"auto\"\n    trust_remote_code: bool = False\n    quantization: Optional[str] = None\n    revision: Optional[str] = None\n\n    tokenizer: Optional[str] = None\n    tokenizer_mode: Literal[\"auto\", \"slow\"] = \"auto\"\n    tokenizer_revision: Optional[str] = None\n    skip_tokenizer_init: bool = False\n    chat_template: Optional[str] = None\n\n    seed: int = 0\n\n    extra_kwargs: Optional[RuntimeParameter[Dict[str, Any]]] = Field(\n        default_factory=dict,\n        description=\"Additional dictionary of keyword arguments that will be passed to the\"\n        \" `vLLM` class of `vllm` library. See all the supported arguments at: \"\n        \"https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py\",\n    )\n    structured_output: Optional[RuntimeParameter[OutlinesStructuredOutputType]] = Field(\n        default=None,\n        description=\"The structured output format to use across all the generations.\",\n    )\n\n    _model: \"_vLLM\" = PrivateAttr(None)\n    _tokenizer: \"PreTrainedTokenizer\" = PrivateAttr(None)\n    _structured_output_logits_processor: Optional[Callable] = PrivateAttr(default=None)\n\n    def load(self) -> None:\n        \"\"\"Loads the `vLLM` model using either the path or the Hugging Face Hub repository id.\n        Additionally, this method also sets the `chat_template` for the tokenizer, so as to properly\n        parse the list of OpenAI formatted inputs using the expected format by the model, otherwise, the\n        default value is ChatML format, unless explicitly provided.\n        \"\"\"\n        super().load()\n\n        CudaDevicePlacementMixin.load(self)\n\n        try:\n            
from vllm import LLM as _vLLM\n        except ImportError as ie:\n            raise ImportError(\n                \"vLLM is not installed. Please install it using `pip install vllm`.\"\n            ) from ie\n\n        self._model = _vLLM(\n            self.model,\n            dtype=self.dtype,\n            trust_remote_code=self.trust_remote_code,\n            quantization=self.quantization,\n            revision=self.revision,\n            tokenizer=self.tokenizer,\n            tokenizer_mode=self.tokenizer_mode,\n            tokenizer_revision=self.tokenizer_revision,\n            skip_tokenizer_init=self.skip_tokenizer_init,\n            seed=self.seed,\n            **self.extra_kwargs,  # type: ignore\n        )\n\n        self._tokenizer = self._model.get_tokenizer()  # type: ignore\n        if self.chat_template is not None:\n            self._tokenizer.chat_template = self.chat_template  # type: ignore\n\n        if self.structured_output:\n            self._structured_output_logits_processor = self._prepare_structured_output(\n                self.structured_output\n            )\n\n    def unload(self) -> None:\n        \"\"\"Unloads the `vLLM` model.\"\"\"\n        self._model = None  # type: ignore\n        self._tokenizer = None  # type: ignore\n        CudaDevicePlacementMixin.unload(self)\n        super().unload()\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    def prepare_input(self, input: \"StandardInput\") -> str:\n        \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n        input.\n\n        Args:\n            input: the input list containing chat items.\n\n        Returns:\n            The prompt to send to the LLM.\n        \"\"\"\n        if self._tokenizer.chat_template is None:\n            return input[0][\"content\"]\n\n        prompt: str = (\n            self._tokenizer.apply_chat_template(\n     
           input,  # type: ignore\n                tokenize=False,\n                add_generation_prompt=True,  # type: ignore\n            )\n            if input\n            else \"\"\n        )\n        return super().apply_magpie_pre_query_template(prompt, input)\n\n    def _prepare_batches(\n        self, inputs: List[FormattedInput]\n    ) -> Tuple[List[List[FormattedInput]], List[int]]:\n        \"\"\"Prepares the inputs by grouping them by the structured output.\n\n        When we generate structured outputs with schemas obtained from a dataset, we need to\n        prepare the data to try to send batches of inputs instead of single inputs to the model\n        to take advante of the engine. So we group the inputs by the structured output to be\n        passed in the `generate` method.\n\n        Args:\n            inputs: The batch of inputs passed to the generate method. As we expect to be generating\n                structured outputs, each element will be a tuple containing the instruction and the\n                structured output.\n\n        Returns:\n            The prepared batches (sub-batches let's say) to be passed to the `generate` method.\n            Each new tuple will contain instead of the single instruction, a list of instructions\n        \"\"\"\n        instruction_order = {}\n        batches = {}\n        for i, (instruction, structured_output) in enumerate(inputs):\n            instruction = self.prepare_input(instruction)\n            instruction_order[instruction] = i\n            structured_output = json.dumps(structured_output)\n            if structured_output not in batches:\n                batches[structured_output] = [instruction]\n            else:\n                batches[structured_output].append(instruction)\n\n        # Flatten the instructions in prepared_data\n        flat_instructions = [\n            instruction for _, group in batches.items() for instruction in group\n        ]\n        # Generate the list of 
indices based on the original order\n        sorted_indices = [\n            instruction_order[instruction] for instruction in flat_instructions\n        ]\n        return [\n            (batch, json.loads(schema)) for schema, batch in batches.items()\n        ], sorted_indices\n\n    @validate_call\n    def generate(  # type: ignore\n        self,\n        inputs: List[FormattedInput],\n        num_generations: int = 1,\n        max_new_tokens: int = 128,\n        presence_penalty: float = 0.0,\n        frequency_penalty: float = 0.0,\n        repetition_penalty: float = 1.0,\n        temperature: float = 1.0,\n        top_p: float = 1.0,\n        top_k: int = -1,\n        min_p: float = 0.0,\n        stop: Optional[List[str]] = None,\n        stop_token_ids: Optional[List[int]] = None,\n        include_stop_str_in_output: bool = False,\n        logits_processors: Optional[LogitsProcessors] = None,\n        extra_sampling_params: Optional[Dict[str, Any]] = None,\n    ) -> List[GenerateOutput]:\n        \"\"\"Generates `num_generations` responses for each input.\n\n        Args:\n            inputs: a list of inputs in chat format to generate responses for.\n            num_generations: the number of generations to create per input. Defaults to\n                `1`.\n            max_new_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `128`.\n            presence_penalty: the presence penalty to use for the generation. Defaults to\n                `0.0`.\n            frequency_penalty: the repetition penalty to use for the generation. Defaults\n                to `0.0`.\n            repetition_penalty: the repetition penalty to use for the generation Defaults to\n                `1.0`.\n            temperature: the temperature to use for the generation. Defaults to `0.1`.\n            top_p: the top-p value to use for the generation. Defaults to `1.0`.\n            top_k: the top-k value to use for the generation. 
Defaults to `0`.\n            min_p: the minimum probability to use for the generation. Defaults to `0.0`.\n            stop: a list of strings that will be used to stop the generation when found.\n                Defaults to `None`.\n            stop_token_ids: a list of token ids that will be used to stop the generation\n                when found. Defaults to `None`.\n            include_stop_str_in_output: whether to include the stop string in the output.\n                Defaults to `False`.\n            logits_processors: a list of functions to process the logits before sampling.\n                Defaults to `None`.\n            extra_sampling_params: dictionary with additional arguments to be passed to\n                the `SamplingParams` class from `vllm`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n        from vllm import SamplingParams\n\n        if not logits_processors:\n            logits_processors = []\n\n        if extra_sampling_params is None:\n            extra_sampling_params = {}\n\n        structured_output = None\n\n        if isinstance(inputs[0], tuple):\n            prepared_batches, sorted_indices = self._prepare_batches(inputs)\n        else:\n            # Simulate a batch without the structured output content\n            prepared_batches = [([self.prepare_input(input) for input in inputs], None)]\n            sorted_indices = None\n\n        # Case in which we have a single structured output for the dataset\n        if self._structured_output_logits_processor:\n            logits_processors.append(self._structured_output_logits_processor)\n\n        batched_outputs = []\n\n        for prepared_inputs, structured_output in prepared_batches:\n            if structured_output:\n                logits_processors.append(\n                    self._prepare_structured_output(structured_output)\n                )\n\n            sampling_params = 
SamplingParams(  # type: ignore\n                n=num_generations,\n                presence_penalty=presence_penalty,\n                frequency_penalty=frequency_penalty,\n                repetition_penalty=repetition_penalty,\n                temperature=temperature,\n                top_p=top_p,\n                top_k=top_k,\n                min_p=min_p,\n                max_tokens=max_new_tokens,\n                stop=stop,\n                stop_token_ids=stop_token_ids,\n                include_stop_str_in_output=include_stop_str_in_output,\n                logits_processors=logits_processors,\n                **extra_sampling_params,\n            )\n\n            batch_outputs = self._model.generate(\n                prepared_inputs,\n                sampling_params,\n                use_tqdm=False,  # type: ignore\n            )\n\n            batched_outputs += [\n                [output.text for output in outputs.outputs] for outputs in batch_outputs\n            ]\n\n        # If logits_processor is set, we need to sort the outputs back to the original order\n        # (would be needed only if we have multiple structured outputs in the dataset)\n        if sorted_indices is not None:\n            batched_outputs = _sort_batches(\n                batched_outputs, sorted_indices, num_generations=num_generations\n            )\n        return batched_outputs\n\n    def _prepare_structured_output(\n        self, structured_output: Optional[OutlinesStructuredOutputType] = None\n    ) -> Union[Callable, None]:\n        \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n        Args:\n            structured_output: the configuration dict to prepare the structured output.\n\n        Returns:\n            The callable that will be used to guide the generation of the model.\n        \"\"\"\n        from distilabel.steps.tasks.structured_outputs.outlines import (\n            prepare_guided_output,\n        )\n\n        
result = prepare_guided_output(structured_output, \"vllm\", self._model)\n        if (schema := result.get(\"schema\")) and self.structured_output:\n            self.structured_output[\"schema\"] = schema\n        return result[\"processor\"]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.load","title":"load()","text":"

Loads the vLLM model using either the path or the Hugging Face Hub repository id. Additionally, this method sets the chat_template for the tokenizer, so that the list of OpenAI-formatted inputs is parsed using the format expected by the model; unless a chat template is explicitly provided, the default value is the ChatML format.

Source code in src/distilabel/models/llms/vllm.py
def load(self) -> None:\n    \"\"\"Loads the `vLLM` model using either the path or the Hugging Face Hub repository id.\n    Additionally, this method also sets the `chat_template` for the tokenizer, so as to properly\n    parse the list of OpenAI formatted inputs using the expected format by the model, otherwise, the\n    default value is ChatML format, unless explicitly provided.\n    \"\"\"\n    super().load()\n\n    CudaDevicePlacementMixin.load(self)\n\n    try:\n        from vllm import LLM as _vLLM\n    except ImportError as ie:\n        raise ImportError(\n            \"vLLM is not installed. Please install it using `pip install vllm`.\"\n        ) from ie\n\n    self._model = _vLLM(\n        self.model,\n        dtype=self.dtype,\n        trust_remote_code=self.trust_remote_code,\n        quantization=self.quantization,\n        revision=self.revision,\n        tokenizer=self.tokenizer,\n        tokenizer_mode=self.tokenizer_mode,\n        tokenizer_revision=self.tokenizer_revision,\n        skip_tokenizer_init=self.skip_tokenizer_init,\n        seed=self.seed,\n        **self.extra_kwargs,  # type: ignore\n    )\n\n    self._tokenizer = self._model.get_tokenizer()  # type: ignore\n    if self.chat_template is not None:\n        self._tokenizer.chat_template = self.chat_template  # type: ignore\n\n    if self.structured_output:\n        self._structured_output_logits_processor = self._prepare_structured_output(\n            self.structured_output\n        )\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.unload","title":"unload()","text":"

Unloads the vLLM model.

Source code in src/distilabel/models/llms/vllm.py
def unload(self) -> None:\n    \"\"\"Unloads the `vLLM` model.\"\"\"\n    self._model = None  # type: ignore\n    self._tokenizer = None  # type: ignore\n    CudaDevicePlacementMixin.unload(self)\n    super().unload()\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.prepare_input","title":"prepare_input(input)","text":"

Prepares the input (applying the chat template and tokenization) for the provided input.

Parameters:

Name Type Description Default input StandardInput

the input list containing chat items.

required

Returns:

Type Description str

The prompt to send to the LLM.

Source code in src/distilabel/models/llms/vllm.py
def prepare_input(self, input: \"StandardInput\") -> str:\n    \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n    input.\n\n    Args:\n        input: the input list containing chat items.\n\n    Returns:\n        The prompt to send to the LLM.\n    \"\"\"\n    if self._tokenizer.chat_template is None:\n        return input[0][\"content\"]\n\n    prompt: str = (\n        self._tokenizer.apply_chat_template(\n            input,  # type: ignore\n            tokenize=False,\n            add_generation_prompt=True,  # type: ignore\n        )\n        if input\n        else \"\"\n    )\n    return super().apply_magpie_pre_query_template(prompt, input)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM._prepare_batches","title":"_prepare_batches(inputs)","text":"

Prepares the inputs by grouping them by the structured output.

When we generate structured outputs with schemas obtained from a dataset, we need to prepare the data to try to send batches of inputs instead of single inputs to the model to take advantage of the engine. So we group the inputs by the structured output to be passed in the generate method.

Parameters:

Name Type Description Default inputs List[FormattedInput]

The batch of inputs passed to the generate method. As we expect to be generating structured outputs, each element will be a tuple containing the instruction and the structured output.

required

Returns:

Type Description List[List[FormattedInput]]

The prepared batches (sub-batches let's say) to be passed to the generate method.

List[int]

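The indices of the flattened, grouped instructions in the original input order, used to sort the outputs back. Each new tuple in the prepared batches will contain, instead of a single instruction, a list of instructions.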

Source code in src/distilabel/models/llms/vllm.py
def _prepare_batches(\n    self, inputs: List[FormattedInput]\n) -> Tuple[List[List[FormattedInput]], List[int]]:\n    \"\"\"Prepares the inputs by grouping them by the structured output.\n\n    When we generate structured outputs with schemas obtained from a dataset, we need to\n    prepare the data to try to send batches of inputs instead of single inputs to the model\n    to take advante of the engine. So we group the inputs by the structured output to be\n    passed in the `generate` method.\n\n    Args:\n        inputs: The batch of inputs passed to the generate method. As we expect to be generating\n            structured outputs, each element will be a tuple containing the instruction and the\n            structured output.\n\n    Returns:\n        The prepared batches (sub-batches let's say) to be passed to the `generate` method.\n        Each new tuple will contain instead of the single instruction, a list of instructions\n    \"\"\"\n    instruction_order = {}\n    batches = {}\n    for i, (instruction, structured_output) in enumerate(inputs):\n        instruction = self.prepare_input(instruction)\n        instruction_order[instruction] = i\n        structured_output = json.dumps(structured_output)\n        if structured_output not in batches:\n            batches[structured_output] = [instruction]\n        else:\n            batches[structured_output].append(instruction)\n\n    # Flatten the instructions in prepared_data\n    flat_instructions = [\n        instruction for _, group in batches.items() for instruction in group\n    ]\n    # Generate the list of indices based on the original order\n    sorted_indices = [\n        instruction_order[instruction] for instruction in flat_instructions\n    ]\n    return [\n        (batch, json.loads(schema)) for schema, batch in batches.items()\n    ], sorted_indices\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.generate","title":"generate(inputs, num_generations=1, max_new_tokens=128, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=-1, min_p=0.0, stop=None, stop_token_ids=None, include_stop_str_in_output=False, logits_processors=None, extra_sampling_params=None)","text":"

Generates num_generations responses for each input.

Parameters:

Name Type Description Default inputs List[FormattedInput]

a list of inputs in chat format to generate responses for.

required num_generations int

the number of generations to create per input. Defaults to 1.

1 max_new_tokens int

the maximum number of new tokens that the model will generate. Defaults to 128.

128 presence_penalty float

the presence penalty to use for the generation. Defaults to 0.0.

0.0 frequency_penalty float

the frequency penalty to use for the generation. Defaults to 0.0.

0.0 repetition_penalty float

the repetition penalty to use for the generation. Defaults to 1.0.

1.0 temperature float

the temperature to use for the generation. Defaults to 1.0.

1.0 top_p float

the top-p value to use for the generation. Defaults to 1.0.

1.0 top_k int

the top-k value to use for the generation. Defaults to -1.

-1 min_p float

the minimum probability to use for the generation. Defaults to 0.0.

0.0 stop Optional[List[str]]

a list of strings that will be used to stop the generation when found. Defaults to None.

None stop_token_ids Optional[List[int]]

a list of token ids that will be used to stop the generation when found. Defaults to None.

None include_stop_str_in_output bool

whether to include the stop string in the output. Defaults to False.

False logits_processors Optional[LogitsProcessors]

a list of functions to process the logits before sampling. Defaults to None.

None extra_sampling_params Optional[Dict[str, Any]]

dictionary with additional arguments to be passed to the SamplingParams class from vllm.

None

Returns:

Type Description List[GenerateOutput]

A list of lists of strings containing the generated responses for each input.

Source code in src/distilabel/models/llms/vllm.py
@validate_call\ndef generate(  # type: ignore\n    self,\n    inputs: List[FormattedInput],\n    num_generations: int = 1,\n    max_new_tokens: int = 128,\n    presence_penalty: float = 0.0,\n    frequency_penalty: float = 0.0,\n    repetition_penalty: float = 1.0,\n    temperature: float = 1.0,\n    top_p: float = 1.0,\n    top_k: int = -1,\n    min_p: float = 0.0,\n    stop: Optional[List[str]] = None,\n    stop_token_ids: Optional[List[int]] = None,\n    include_stop_str_in_output: bool = False,\n    logits_processors: Optional[LogitsProcessors] = None,\n    extra_sampling_params: Optional[Dict[str, Any]] = None,\n) -> List[GenerateOutput]:\n    \"\"\"Generates `num_generations` responses for each input.\n\n    Args:\n        inputs: a list of inputs in chat format to generate responses for.\n        num_generations: the number of generations to create per input. Defaults to\n            `1`.\n        max_new_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `128`.\n        presence_penalty: the presence penalty to use for the generation. Defaults to\n            `0.0`.\n        frequency_penalty: the repetition penalty to use for the generation. Defaults\n            to `0.0`.\n        repetition_penalty: the repetition penalty to use for the generation Defaults to\n            `1.0`.\n        temperature: the temperature to use for the generation. Defaults to `0.1`.\n        top_p: the top-p value to use for the generation. Defaults to `1.0`.\n        top_k: the top-k value to use for the generation. Defaults to `0`.\n        min_p: the minimum probability to use for the generation. Defaults to `0.0`.\n        stop: a list of strings that will be used to stop the generation when found.\n            Defaults to `None`.\n        stop_token_ids: a list of token ids that will be used to stop the generation\n            when found. 
Defaults to `None`.\n        include_stop_str_in_output: whether to include the stop string in the output.\n            Defaults to `False`.\n        logits_processors: a list of functions to process the logits before sampling.\n            Defaults to `None`.\n        extra_sampling_params: dictionary with additional arguments to be passed to\n            the `SamplingParams` class from `vllm`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n    from vllm import SamplingParams\n\n    if not logits_processors:\n        logits_processors = []\n\n    if extra_sampling_params is None:\n        extra_sampling_params = {}\n\n    structured_output = None\n\n    if isinstance(inputs[0], tuple):\n        prepared_batches, sorted_indices = self._prepare_batches(inputs)\n    else:\n        # Simulate a batch without the structured output content\n        prepared_batches = [([self.prepare_input(input) for input in inputs], None)]\n        sorted_indices = None\n\n    # Case in which we have a single structured output for the dataset\n    if self._structured_output_logits_processor:\n        logits_processors.append(self._structured_output_logits_processor)\n\n    batched_outputs = []\n\n    for prepared_inputs, structured_output in prepared_batches:\n        if structured_output:\n            logits_processors.append(\n                self._prepare_structured_output(structured_output)\n            )\n\n        sampling_params = SamplingParams(  # type: ignore\n            n=num_generations,\n            presence_penalty=presence_penalty,\n            frequency_penalty=frequency_penalty,\n            repetition_penalty=repetition_penalty,\n            temperature=temperature,\n            top_p=top_p,\n            top_k=top_k,\n            min_p=min_p,\n            max_tokens=max_new_tokens,\n            stop=stop,\n            stop_token_ids=stop_token_ids,\n            
include_stop_str_in_output=include_stop_str_in_output,\n            logits_processors=logits_processors,\n            **extra_sampling_params,\n        )\n\n        batch_outputs = self._model.generate(\n            prepared_inputs,\n            sampling_params,\n            use_tqdm=False,  # type: ignore\n        )\n\n        batched_outputs += [\n            [output.text for output in outputs.outputs] for outputs in batch_outputs\n        ]\n\n    # If logits_processor is set, we need to sort the outputs back to the original order\n    # (would be needed only if we have multiple structured outputs in the dataset)\n    if sorted_indices is not None:\n        batched_outputs = _sort_batches(\n            batched_outputs, sorted_indices, num_generations=num_generations\n        )\n    return batched_outputs\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM._prepare_structured_output","title":"_prepare_structured_output(structured_output=None)","text":"

Creates the appropriate function to filter tokens to generate structured outputs.

Parameters:

Name Type Description Default structured_output Optional[OutlinesStructuredOutputType]

the configuration dict to prepare the structured output.

None

Returns:

Type Description Union[Callable, None]

The callable that will be used to guide the generation of the model.

Source code in src/distilabel/models/llms/vllm.py
def _prepare_structured_output(\n    self, structured_output: Optional[OutlinesStructuredOutputType] = None\n) -> Union[Callable, None]:\n    \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n    Args:\n        structured_output: the configuration dict to prepare the structured output.\n\n    Returns:\n        The callable that will be used to guide the generation of the model.\n    \"\"\"\n    from distilabel.steps.tasks.structured_outputs.outlines import (\n        prepare_guided_output,\n    )\n\n    result = prepare_guided_output(structured_output, \"vllm\", self._model)\n    if (schema := result.get(\"schema\")) and self.structured_output:\n        self.structured_output[\"schema\"] = schema\n    return result[\"processor\"]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin","title":"CudaDevicePlacementMixin","text":"

Bases: BaseModel

Mixin class to assign CUDA devices to the LLM based on the cuda_devices attribute and the device placement information provided in _device_llm_placement_map. Providing the device placement information is optional, but if it is provided, it will be used to assign CUDA devices to the LLMs, trying to avoid using the same device for different LLMs.

Attributes:

Name Type Description cuda_devices RuntimeParameter[Union[List[int], Literal['auto']]]

a list with the ID of the CUDA devices to be used by the LLM. If set to \"auto\", the devices will be automatically assigned based on the device placement information provided in _device_llm_placement_map. If set to a list of devices, it will be checked if the devices are available to be used by the LLM. If not, a warning will be logged.

disable_cuda_device_placement RuntimeParameter[bool]

Whether to disable the CUDA device placement logic or not. Defaults to False.

_llm_identifier Union[str, None]

the identifier of the LLM to be used as key in _device_llm_placement_map.

_device_llm_placement_map Generator[Dict[str, List[int]], None, None]

a dictionary with the device placement information for each LLM.

Source code in src/distilabel/models/mixins/cuda_device_placement.py
class CudaDevicePlacementMixin(BaseModel):\n    \"\"\"Mixin class to assign CUDA devices to the `LLM` based on the `cuda_devices` attribute\n    and the device placement information provided in `_device_llm_placement_map`. Providing\n    the device placement information is optional, but if it is provided, it will be used to\n    assign CUDA devices to the `LLM`s, trying to avoid using the same device for different\n    `LLM`s.\n\n    Attributes:\n        cuda_devices: a list with the ID of the CUDA devices to be used by the `LLM`. If set\n            to \"auto\", the devices will be automatically assigned based on the device\n            placement information provided in `_device_llm_placement_map`. If set to a list\n            of devices, it will be checked if the devices are available to be used by the\n            `LLM`. If not, a warning will be logged.\n        disable_cuda_device_placement: Whether to disable the CUDA device placement logic\n            or not. Defaults to `False`.\n        _llm_identifier: the identifier of the `LLM` to be used as key in `_device_llm_placement_map`.\n        _device_llm_placement_map: a dictionary with the device placement information for each\n            `LLM`.\n    \"\"\"\n\n    cuda_devices: RuntimeParameter[Union[List[int], Literal[\"auto\"]]] = Field(\n        default=\"auto\", description=\"A list with the ID of the CUDA devices to be used.\"\n    )\n    disable_cuda_device_placement: RuntimeParameter[bool] = Field(\n        default=False,\n        description=\"Whether to disable the CUDA device placement logic or not.\",\n    )\n\n    _llm_identifier: Union[str, None] = PrivateAttr(default=None)\n    _desired_num_gpus: PositiveInt = PrivateAttr(default=1)\n    _available_cuda_devices: List[int] = PrivateAttr(default_factory=list)\n    _can_check_cuda_devices: bool = PrivateAttr(default=False)\n\n    _logger: \"Logger\" = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Assign CUDA devices to the LLM 
based on the device placement information provided\n        in `_device_llm_placement_map`.\"\"\"\n\n        if self.disable_cuda_device_placement:\n            return\n\n        try:\n            import pynvml\n\n            pynvml.nvmlInit()\n            device_count = pynvml.nvmlDeviceGetCount()\n            self._available_cuda_devices = list(range(device_count))\n            self._can_check_cuda_devices = True\n        except ImportError as ie:\n            if self.cuda_devices == \"auto\":\n                raise ImportError(\n                    \"The 'pynvml' library is not installed. It is required to automatically\"\n                    \" assign CUDA devices to the `LLM`s. Please, install it and try again.\"\n                ) from ie\n\n            if self.cuda_devices:\n                self._logger.warning(  # type: ignore\n                    \"The 'pynvml' library is not installed. It is recommended to install it\"\n                    \" to check if the CUDA devices assigned to the LLM are available.\"\n                )\n\n        self._assign_cuda_devices()\n\n    def unload(self) -> None:\n        \"\"\"Unloads the LLM and removes the CUDA devices assigned to it from the device\n        placement information provided in `_device_llm_placement_map`.\"\"\"\n        if self.disable_cuda_device_placement:\n            return\n\n        with self._device_llm_placement_map() as device_map:\n            if self._llm_identifier in device_map:\n                self._logger.debug(  # type: ignore\n                    f\"Removing '{self._llm_identifier}' from the CUDA device map file\"\n                    f\" '{_CUDA_DEVICE_PLACEMENT_MIXIN_FILE}'.\"\n                )\n                del device_map[self._llm_identifier]\n\n    @contextmanager\n    def _device_llm_placement_map(self) -> Generator[Dict[str, List[int]], None, None]:\n        \"\"\"Reads the content of the device placement file of the node with a lock, yields\n        the content, and writes 
the content back to the file after the context manager is\n        closed. If the file doesn't exist, an empty dictionary will be yielded.\n\n        Yields:\n            The content of the device placement file.\n        \"\"\"\n        _CUDA_DEVICE_PLACEMENT_MIXIN_FILE.parent.mkdir(parents=True, exist_ok=True)\n        _CUDA_DEVICE_PLACEMENT_MIXIN_FILE.touch()\n        with portalocker.Lock(\n            _CUDA_DEVICE_PLACEMENT_MIXIN_FILE,\n            \"r+\",\n            flags=portalocker.LockFlags.EXCLUSIVE,\n        ) as f:\n            try:\n                content = json.load(f)\n            except json.JSONDecodeError:\n                content = {}\n            yield content\n            f.seek(0)\n            f.truncate()\n            f.write(json.dumps(content))\n\n    def _assign_cuda_devices(self) -> None:\n        \"\"\"Assigns CUDA devices to the LLM based on the device placement information provided\n        in `_device_llm_placement_map`. If the `cuda_devices` attribute is set to \"auto\", it\n        will be set to the first available CUDA device that is not going to be used by any\n        other LLM. If the `cuda_devices` attribute is set to a list of devices, it will be\n        checked if the devices are available to be used by the LLM. 
If not, a warning will be\n        logged.\"\"\"\n\n        # Take the lock and read the device placement information for each LLM.\n        with self._device_llm_placement_map() as device_map:\n            if self.cuda_devices == \"auto\":\n                self.cuda_devices = []\n                for _ in range(self._desired_num_gpus):\n                    if (device_id := self._get_cuda_device(device_map)) is not None:\n                        self.cuda_devices.append(device_id)\n                        device_map[self._llm_identifier] = self.cuda_devices  # type: ignore\n                if len(self.cuda_devices) != self._desired_num_gpus:\n                    self._logger.warning(  # type: ignore\n                        f\"Could not assign the desired number of GPUs {self._desired_num_gpus}\"\n                        f\" for LLM with identifier '{self._llm_identifier}'.\"\n                    )\n            else:\n                self._check_cuda_devices(device_map)\n\n            device_map[self._llm_identifier] = self.cuda_devices  # type: ignore\n\n        # `_device_llm_placement_map` was not provided and user didn't set the `cuda_devices`\n        # attribute. 
In this case, the `cuda_devices` attribute will be set to an empty list.\n        if self.cuda_devices == \"auto\":\n            self.cuda_devices = []\n\n        self._set_cuda_visible_devices()\n\n    def _check_cuda_devices(self, device_map: Dict[str, List[int]]) -> None:\n        \"\"\"Checks if the CUDA devices assigned to the LLM are also assigned to other LLMs.\n\n        Args:\n            device_map: a dictionary with the device placement information for each LLM.\n        \"\"\"\n        for device in self.cuda_devices:  # type: ignore\n            for llm, devices in device_map.items():\n                if device in devices:\n                    self._logger.warning(  # type: ignore\n                        f\"LLM with identifier '{llm}' is also going to use CUDA device \"\n                        f\"'{device}'. This may lead to performance issues or running out\"\n                        \" of memory depending on the device capabilities and the loaded\"\n                        \" models.\"\n                    )\n\n    def _get_cuda_device(self, device_map: Dict[str, List[int]]) -> Union[int, None]:\n        \"\"\"Returns the first available CUDA device to be used by the LLM that is not going\n        to be used by any other LLM.\n\n        Args:\n            device_map: a dictionary with the device placement information for each LLM.\n\n        Returns:\n            The first available CUDA device to be used by the LLM.\n\n        Raises:\n            RuntimeError: if there is no available CUDA device to be used by the LLM.\n        \"\"\"\n        for device in self._available_cuda_devices:\n            if all(device not in devices for devices in device_map.values()):\n                return device\n\n        return None\n\n    def _set_cuda_visible_devices(self) -> None:\n        \"\"\"Sets the `CUDA_VISIBLE_DEVICES` environment variable to the list of CUDA devices\n        to be used by the LLM.\n        \"\"\"\n        if not self.cuda_devices:\n  
          return\n\n        if self._can_check_cuda_devices and not all(\n            device in self._available_cuda_devices for device in self.cuda_devices\n        ):\n            raise RuntimeError(\n                f\"Invalid CUDA devices for LLM '{self._llm_identifier}': {self.cuda_devices}.\"\n                f\" The available devices are: {self._available_cuda_devices}. Please, review\"\n                \" the 'cuda_devices' attribute and try again.\"\n            )\n\n        cuda_devices = \",\".join([str(device) for device in self.cuda_devices])\n        self._logger.info(  # type: ignore\n            f\"\ud83c\udfae LLM '{self._llm_identifier}' is going to use the following CUDA devices:\"\n            f\" {self.cuda_devices}.\"\n        )\n        os.environ[\"CUDA_VISIBLE_DEVICES\"] = cuda_devices\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin.load","title":"load()","text":"

Assign CUDA devices to the LLM based on the device placement information provided in _device_llm_placement_map.

Source code in src/distilabel/models/mixins/cuda_device_placement.py
def load(self) -> None:\n    \"\"\"Assign CUDA devices to the LLM based on the device placement information provided\n    in `_device_llm_placement_map`.\"\"\"\n\n    if self.disable_cuda_device_placement:\n        return\n\n    try:\n        import pynvml\n\n        pynvml.nvmlInit()\n        device_count = pynvml.nvmlDeviceGetCount()\n        self._available_cuda_devices = list(range(device_count))\n        self._can_check_cuda_devices = True\n    except ImportError as ie:\n        if self.cuda_devices == \"auto\":\n            raise ImportError(\n                \"The 'pynvml' library is not installed. It is required to automatically\"\n                \" assign CUDA devices to the `LLM`s. Please, install it and try again.\"\n            ) from ie\n\n        if self.cuda_devices:\n            self._logger.warning(  # type: ignore\n                \"The 'pynvml' library is not installed. It is recommended to install it\"\n                \" to check if the CUDA devices assigned to the LLM are available.\"\n            )\n\n    self._assign_cuda_devices()\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin.unload","title":"unload()","text":"

Unloads the LLM and removes the CUDA devices assigned to it from the device placement information provided in _device_llm_placement_map.

Source code in src/distilabel/models/mixins/cuda_device_placement.py
def unload(self) -> None:\n    \"\"\"Unloads the LLM and removes the CUDA devices assigned to it from the device\n    placement information provided in `_device_llm_placement_map`.\"\"\"\n    if self.disable_cuda_device_placement:\n        return\n\n    with self._device_llm_placement_map() as device_map:\n        if self._llm_identifier in device_map:\n            self._logger.debug(  # type: ignore\n                f\"Removing '{self._llm_identifier}' from the CUDA device map file\"\n                f\" '{_CUDA_DEVICE_PLACEMENT_MIXIN_FILE}'.\"\n            )\n            del device_map[self._llm_identifier]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._device_llm_placement_map","title":"_device_llm_placement_map()","text":"

Reads the content of the device placement file of the node with a lock, yields the content, and writes the content back to the file after the context manager is closed. If the file doesn't exist, an empty dictionary will be yielded.

Yields:

Type Description Dict[str, List[int]]

The content of the device placement file.

Source code in src/distilabel/models/mixins/cuda_device_placement.py
@contextmanager\ndef _device_llm_placement_map(self) -> Generator[Dict[str, List[int]], None, None]:\n    \"\"\"Reads the content of the device placement file of the node with a lock, yields\n    the content, and writes the content back to the file after the context manager is\n    closed. If the file doesn't exist, an empty dictionary will be yielded.\n\n    Yields:\n        The content of the device placement file.\n    \"\"\"\n    _CUDA_DEVICE_PLACEMENT_MIXIN_FILE.parent.mkdir(parents=True, exist_ok=True)\n    _CUDA_DEVICE_PLACEMENT_MIXIN_FILE.touch()\n    with portalocker.Lock(\n        _CUDA_DEVICE_PLACEMENT_MIXIN_FILE,\n        \"r+\",\n        flags=portalocker.LockFlags.EXCLUSIVE,\n    ) as f:\n        try:\n            content = json.load(f)\n        except json.JSONDecodeError:\n            content = {}\n        yield content\n        f.seek(0)\n        f.truncate()\n        f.write(json.dumps(content))\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._assign_cuda_devices","title":"_assign_cuda_devices()","text":"

Assigns CUDA devices to the LLM based on the device placement information provided in _device_llm_placement_map. If the cuda_devices attribute is set to \"auto\", it will be set to the first available CUDA device that is not going to be used by any other LLM. If the cuda_devices attribute is set to a list of devices, it will be checked if the devices are available to be used by the LLM. If not, a warning will be logged.

Source code in src/distilabel/models/mixins/cuda_device_placement.py
def _assign_cuda_devices(self) -> None:\n    \"\"\"Assigns CUDA devices to the LLM based on the device placement information provided\n    in `_device_llm_placement_map`. If the `cuda_devices` attribute is set to \"auto\", it\n    will be set to the first available CUDA device that is not going to be used by any\n    other LLM. If the `cuda_devices` attribute is set to a list of devices, it will be\n    checked if the devices are available to be used by the LLM. If not, a warning will be\n    logged.\"\"\"\n\n    # Take the lock and read the device placement information for each LLM.\n    with self._device_llm_placement_map() as device_map:\n        if self.cuda_devices == \"auto\":\n            self.cuda_devices = []\n            for _ in range(self._desired_num_gpus):\n                if (device_id := self._get_cuda_device(device_map)) is not None:\n                    self.cuda_devices.append(device_id)\n                    device_map[self._llm_identifier] = self.cuda_devices  # type: ignore\n            if len(self.cuda_devices) != self._desired_num_gpus:\n                self._logger.warning(  # type: ignore\n                    f\"Could not assign the desired number of GPUs {self._desired_num_gpus}\"\n                    f\" for LLM with identifier '{self._llm_identifier}'.\"\n                )\n        else:\n            self._check_cuda_devices(device_map)\n\n        device_map[self._llm_identifier] = self.cuda_devices  # type: ignore\n\n    # `_device_llm_placement_map` was not provided and user didn't set the `cuda_devices`\n    # attribute. In this case, the `cuda_devices` attribute will be set to an empty list.\n    if self.cuda_devices == \"auto\":\n        self.cuda_devices = []\n\n    self._set_cuda_visible_devices()\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._check_cuda_devices","title":"_check_cuda_devices(device_map)","text":"

Checks if the CUDA devices assigned to the LLM are also assigned to other LLMs.

Parameters:

Name Type Description Default device_map Dict[str, List[int]]

a dictionary with the device placement information for each LLM.

required Source code in src/distilabel/models/mixins/cuda_device_placement.py
def _check_cuda_devices(self, device_map: Dict[str, List[int]]) -> None:\n    \"\"\"Checks if the CUDA devices assigned to the LLM are also assigned to other LLMs.\n\n    Args:\n        device_map: a dictionary with the device placement information for each LLM.\n    \"\"\"\n    for device in self.cuda_devices:  # type: ignore\n        for llm, devices in device_map.items():\n            if device in devices:\n                self._logger.warning(  # type: ignore\n                    f\"LLM with identifier '{llm}' is also going to use CUDA device \"\n                    f\"'{device}'. This may lead to performance issues or running out\"\n                    \" of memory depending on the device capabilities and the loaded\"\n                    \" models.\"\n                )\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._get_cuda_device","title":"_get_cuda_device(device_map)","text":"

Returns the first available CUDA device to be used by the LLM that is not going to be used by any other LLM.

Parameters:

Name Type Description Default device_map Dict[str, List[int]]

a dictionary with the device placement information for each LLM.

required

Returns:

Type Description Union[int, None]

The first available CUDA device to be used by the LLM, or None if no unassigned CUDA device is available.

Raises:

Type Description RuntimeError

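if there is no available CUDA device to be used by the LLM. Note: the implementation shown below does not actually raise this error; it returns None in that case.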

Source code in src/distilabel/models/mixins/cuda_device_placement.py
def _get_cuda_device(self, device_map: Dict[str, List[int]]) -> Union[int, None]:\n    \"\"\"Returns the first available CUDA device to be used by the LLM that is not going\n    to be used by any other LLM.\n\n    Args:\n        device_map: a dictionary with the device placement information for each LLM.\n\n    Returns:\n        The first available CUDA device to be used by the LLM.\n\n    Raises:\n        RuntimeError: if there is no available CUDA device to be used by the LLM.\n    \"\"\"\n    for device in self._available_cuda_devices:\n        if all(device not in devices for devices in device_map.values()):\n            return device\n\n    return None\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._set_cuda_visible_devices","title":"_set_cuda_visible_devices()","text":"

Sets the CUDA_VISIBLE_DEVICES environment variable to the list of CUDA devices to be used by the LLM.

Source code in src/distilabel/models/mixins/cuda_device_placement.py
def _set_cuda_visible_devices(self) -> None:\n    \"\"\"Sets the `CUDA_VISIBLE_DEVICES` environment variable to the list of CUDA devices\n    to be used by the LLM.\n    \"\"\"\n    if not self.cuda_devices:\n        return\n\n    if self._can_check_cuda_devices and not all(\n        device in self._available_cuda_devices for device in self.cuda_devices\n    ):\n        raise RuntimeError(\n            f\"Invalid CUDA devices for LLM '{self._llm_identifier}': {self.cuda_devices}.\"\n            f\" The available devices are: {self._available_cuda_devices}. Please, review\"\n            \" the 'cuda_devices' attribute and try again.\"\n        )\n\n    cuda_devices = \",\".join([str(device) for device in self.cuda_devices])\n    self._logger.info(  # type: ignore\n        f\"\ud83c\udfae LLM '{self._llm_identifier}' is going to use the following CUDA devices:\"\n        f\" {self.cuda_devices}.\"\n    )\n    os.environ[\"CUDA_VISIBLE_DEVICES\"] = cuda_devices\n
"},{"location":"api/pipeline/","title":"Pipeline","text":"

This section contains the API reference for the distilabel pipelines. For an example on how to use the pipelines, see the Tutorial - Pipeline.

"},{"location":"api/pipeline/#distilabel.pipeline.base","title":"base","text":""},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline","title":"BasePipeline","text":"

Bases: ABC, RequirementsMixin, _Serializable

Base class for a distilabel pipeline.

Attributes:

Name Type Description name

The name of the pipeline.

description

A description of the pipeline.

dag

The DAG instance that represents the pipeline.

_cache_dir

The directory where the pipeline will be cached.

_logger

The logger instance that will be used by the pipeline.

_batch_manager Optional[_BatchManager]

The batch manager that will manage the batches received from the steps while running the pipeline. It will be created when the pipeline is run, from scratch or from cache. Defaults to None.

_write_buffer Optional[_WriteBuffer]

The buffer that will store the data of the leaf steps of the pipeline while running, so the Distiset can be created at the end. It will be created when the pipeline is run. Defaults to None.

_fs Optional[AbstractFileSystem]

The fsspec filesystem to be used to store the data of the _Batches passed between the steps. It will be set when the pipeline is run. Defaults to None.

_storage_base_path Optional[str]

The base path where the data of the _Batches passed between the steps will be stored. It will be set when the pipeline is run. Defaults to None.

_use_fs_to_pass_data bool

Whether to use the file system to pass the data of the _Batches between the steps. Even if this parameter is False, the Batches received by GlobalSteps will always use the file system to pass the data. Defaults to False.

_dry_run

A flag to indicate if the pipeline is running in dry run mode. Defaults to False.

output_queue

A queue to store the output of the steps while running the pipeline.

load_queue

A queue used by each Step to notify the main process it has finished loading or if the step has been unloaded.

Source code in src/distilabel/pipeline/base.py
class BasePipeline(ABC, RequirementsMixin, _Serializable):\n    \"\"\"Base class for a `distilabel` pipeline.\n\n    Attributes:\n        name: The name of the pipeline.\n        description: A description of the pipeline.\n        dag: The `DAG` instance that represents the pipeline.\n        _cache_dir: The directory where the pipeline will be cached.\n        _logger: The logger instance that will be used by the pipeline.\n        _batch_manager: The batch manager that will manage the batches received from the\n            steps while running the pipeline. It will be created when the pipeline is run,\n            from scratch or from cache. Defaults to `None`.\n        _write_buffer: The buffer that will store the data of the leaf steps of the pipeline\n            while running, so the `Distiset` can be created at the end. It will be created\n            when the pipeline is run. Defaults to `None`.\n        _fs: The `fsspec` filesystem to be used to store the data of the `_Batch`es passed\n            between the steps. It will be set when the pipeline is run. Defaults to `None`.\n        _storage_base_path: The base path where the data of the `_Batch`es passed between\n            the steps will be stored. It will be set then the pipeline is run. Defaults\n            to `None`.\n        _use_fs_to_pass_data: Whether to use the file system to pass the data of the\n            `_Batch`es between the steps. Even if this parameter is `False`, the `Batch`es\n            received by `GlobalStep`s will always use the file system to pass the data.\n            Defaults to `False`.\n        _dry_run: A flag to indicate if the pipeline is running in dry run mode. 
Defaults\n            to `False`.\n        output_queue: A queue to store the output of the steps while running the pipeline.\n        load_queue: A queue used by each `Step` to notify the main process it has finished\n            loading or it the step has been unloaded.\n    \"\"\"\n\n    _output_queue: \"Queue[Any]\"\n    _load_queue: \"Queue[Union[StepLoadStatus, None]]\"\n\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        cache_dir: Optional[Union[str, \"PathLike\"]] = None,\n        enable_metadata: bool = False,\n        requirements: Optional[List[str]] = None,\n    ) -> None:\n        \"\"\"Initialize the `BasePipeline` instance.\n\n        Args:\n            name: The name of the pipeline. If not generated, a random one will be generated by default.\n            description: A description of the pipeline. Defaults to `None`.\n            cache_dir: A directory where the pipeline will be cached. Defaults to `None`.\n            enable_metadata: Whether to include the distilabel metadata column for the pipeline\n                in the final `Distiset`. It contains metadata used by distilabel, for example\n                the raw outputs of the `LLM` without processing would be here, inside `raw_output_...`\n                field. 
Defaults to `False`.\n            requirements: List of requirements that must be installed to run the pipeline.\n                Defaults to `None`, but can be helpful to inform in a pipeline to be shared\n                that this requirements must be installed.\n        \"\"\"\n        self.name = name or _PIPELINE_DEFAULT_NAME\n        self.description = description\n        self._enable_metadata = enable_metadata\n        self.dag = DAG()\n\n        if cache_dir:\n            self._cache_dir = Path(cache_dir)\n        elif env_cache_dir := envs.DISTILABEL_CACHE_DIR:\n            self._cache_dir = Path(env_cache_dir)\n        else:\n            self._cache_dir = constants.PIPELINES_CACHE_DIR\n\n        self._logger = logging.getLogger(\"distilabel.pipeline\")\n\n        self._batch_manager: Optional[\"_BatchManager\"] = None\n        self._write_buffer: Optional[\"_WriteBuffer\"] = None\n\n        self._steps_load_status: Dict[str, int] = {}\n        self._steps_load_status_lock = threading.Lock()\n\n        self._stop_called = False\n        self._stop_called_lock = threading.Lock()\n        self._stop_calls = 0\n\n        self._recover_offline_batch_generate_for_step: Union[\n            Tuple[str, List[List[Dict[str, Any]]]], None\n        ] = None\n\n        self._fs: Optional[fsspec.AbstractFileSystem] = None\n        self._storage_base_path: Optional[str] = None\n        self._use_fs_to_pass_data: bool = False\n        self._dry_run = False\n\n        self._current_stage = 0\n        self._stages_last_batch: List[List[str]] = []\n\n        self.requirements = requirements or []\n\n        self._exception: Union[Exception, None] = None\n\n        self._log_queue: Union[\"Queue[Any]\", None] = None\n\n    def __enter__(self) -> Self:\n        \"\"\"Set the global pipeline instance when entering a pipeline context.\"\"\"\n        _GlobalPipelineManager.set_pipeline(self)\n        return self\n\n    def __exit__(self, exc_type, exc_value, traceback) -> 
None:\n        \"\"\"Unset the global pipeline instance when exiting a pipeline context.\"\"\"\n        _GlobalPipelineManager.set_pipeline(None)\n        self._set_pipeline_name()\n\n    def _set_pipeline_name(self) -> None:\n        \"\"\"Creates a name for the pipeline if it's the default one (if hasn't been set).\"\"\"\n        if self.name == _PIPELINE_DEFAULT_NAME:\n            self.name = f\"pipeline_{'_'.join(self.dag)}\"\n\n    @property\n    def signature(self) -> str:\n        \"\"\"Makes a signature (hash) of a pipeline, using the step ids and the adjacency between them.\n\n        The main use is to find the pipeline in the cache folder.\n\n        Returns:\n            Signature of the pipeline.\n        \"\"\"\n\n        pipeline_dump = self.dump()[\"pipeline\"]\n        steps_names = list(self.dag)\n        connections_info = [\n            f\"{c['from']}-{'-'.join(c['to'])}\" for c in pipeline_dump[\"connections\"]\n        ]\n\n        routing_batch_functions_info = []\n        for function in pipeline_dump[\"routing_batch_functions\"]:\n            step = function[\"step\"]\n            routing_batch_function: \"RoutingBatchFunction\" = self.dag.get_step(step)[\n                constants.ROUTING_BATCH_FUNCTION_ATTR_NAME\n            ]\n            if type_info := routing_batch_function._get_type_info():\n                step += f\"-{type_info}\"\n            routing_batch_functions_info.append(step)\n\n        return hashlib.sha1(\n            \",\".join(\n                steps_names + connections_info + routing_batch_functions_info\n            ).encode()\n        ).hexdigest()\n\n    def run(\n        self,\n        parameters: Optional[Dict[str, Dict[str, Any]]] = None,\n        use_cache: bool = True,\n        storage_parameters: Optional[Dict[str, Any]] = None,\n        use_fs_to_pass_data: bool = False,\n        dataset: Optional[\"InputDataset\"] = None,\n        dataset_batch_size: int = 50,\n        logging_handlers: 
Optional[List[logging.Handler]] = None,\n    ) -> \"Distiset\":  # type: ignore\n        \"\"\"Run the pipeline. It will set the runtime parameters for the steps and validate\n        the pipeline.\n\n        This method should be extended by the specific pipeline implementation,\n        adding the logic to run the pipeline.\n\n        Args:\n            parameters: A dictionary with the step name as the key and a dictionary with\n                the runtime parameters for the step as the value. Defaults to `None`.\n            use_cache: Whether to use the cache from previous pipeline runs. Defaults to\n                `True`.\n            storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n                that will be used to store the data of the `_Batch`es passed between the\n                steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n                `GlobalStep` it will be always used). It must have at least the \"path\" key,\n                and it can contain additional keys depending on the protocol. By default,\n                it will use the local file system and a directory in the cache directory.\n                Defaults to `None`.\n            use_fs_to_pass_data: Whether to use the file system to pass the data of\n                the `_Batch`es between the steps. Even if this parameter is `False`, the\n                `Batch`es received by `GlobalStep`s will always use the file system to\n                pass the data. Defaults to `False`.\n            dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n                root step. Convenient method when you have already processed the dataset in\n                your script and just want to pass it already processed. Defaults to `None`.\n            dataset_batch_size: if `dataset` is given, this will be the size of the batches\n                yield by the `GeneratorStep` created using the `dataset`. 
Defaults to `50`.\n            logging_handlers: A list of logging handlers that will be used to log the\n                output of the pipeline. This argument can be useful so the logging messages\n                can be extracted and used in a different context. Defaults to `None`.\n\n        Returns:\n            The `Distiset` created by the pipeline.\n        \"\"\"\n\n        self._exception: Union[Exception, None] = None\n\n        # Set the runtime parameters that will be used during the pipeline execution.\n        # They are used to generate the signature of the pipeline that is used to hit the\n        # cache when the pipeline is run, so it's important to do it first.\n        self._set_runtime_parameters(parameters or {})\n\n        self._refresh_pipeline_from_cache()\n\n        if dataset is not None:\n            self._add_dataset_generator_step(dataset, dataset_batch_size)\n\n        setup_logging(\n            log_queue=self._log_queue,\n            filename=str(self._cache_location[\"log_file\"]),\n            logging_handlers=logging_handlers,\n        )\n\n        # Set the name of the pipeline if it's the default one. This should be called\n        # if the pipeline is defined within the context manager, and the run is called\n        # outside of it. 
Is here in the following case:\n        # with Pipeline() as pipeline:\n        #    pipeline.run()\n        self._set_pipeline_name()\n\n        # Validate the pipeline DAG to check that all the steps are chainable, there are\n        # no missing runtime parameters, batch sizes are correct, etc.\n        self.dag.validate()\n\n        self._set_pipeline_artifacts_path_in_steps()\n\n        # Set the initial load status for all the steps\n        self._init_steps_load_status()\n\n        # Load the stages status or initialize it\n        self._load_stages_status(use_cache)\n\n        # Load the `_BatchManager` from cache or create one from scratch\n        self._load_batch_manager(use_cache)\n\n        # Check pipeline requirements are installed\n        self._check_requirements()\n\n        # Setup the filesystem that will be used to pass the data of the `_Batch`es\n        self._setup_fsspec(storage_parameters)\n        self._use_fs_to_pass_data = use_fs_to_pass_data\n\n        if self._dry_run:\n            self._logger.info(\"\ud83c\udf35 Dry run mode\")\n\n        # If the batch manager is not able to generate batches, that means that the loaded\n        # `_BatchManager` from cache didn't have any remaining batches to process i.e.\n        # the previous pipeline execution was completed successfully.\n        if not self._batch_manager.can_generate():  # type: ignore\n            self._logger.info(\n                \"\ud83d\udcbe Loaded batch manager from cache doesn't contain any remaining data.\"\n                \" Returning `Distiset` from cache data...\"\n            )\n            distiset = create_distiset(\n                data_dir=self._cache_location[\"data\"],\n                pipeline_path=self._cache_location[\"pipeline\"],\n                log_filename_path=self._cache_location[\"log_file\"],\n                enable_metadata=self._enable_metadata,\n                dag=self.dag,\n            )\n            stop_logging()\n            return 
distiset\n\n        self._setup_write_buffer(use_cache)\n\n        self._print_load_stages_info()\n\n    def dry_run(\n        self,\n        parameters: Optional[Dict[str, Dict[str, Any]]] = None,\n        batch_size: int = 1,\n        dataset: Optional[\"InputDataset\"] = None,\n    ) -> \"Distiset\":\n        \"\"\"Do a dry run to test the pipeline runs as expected.\n\n        Running a `Pipeline` in dry run mode will set all the `batch_size` of generator steps\n        to the specified `batch_size`, and run just with a single batch, effectively\n        running the whole pipeline with a single example. The cache will be set to `False`.\n\n        Args:\n            parameters: A dictionary with the step name as the key and a dictionary with\n                the runtime parameters for the step as the value. Defaults to `None`.\n            batch_size: The batch size of the unique batch generated by the generators\n                steps of the pipeline. Defaults to `1`.\n            dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n                root step. Convenient method when you have already processed the dataset in\n                your script and just want to pass it already processed. 
Defaults to `None`.\n\n        Returns:\n            Will return the `Distiset` as the main run method would do.\n        \"\"\"\n        self._dry_run = True\n\n        for step_name in self.dag:\n            step = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n\n            if step.is_generator:\n                if not parameters:\n                    parameters = {}\n                parameters[step_name] = {\"batch_size\": batch_size}\n\n        distiset = self.run(parameters=parameters, use_cache=False, dataset=dataset)\n\n        self._dry_run = False\n        return distiset\n\n    def _add_dataset_generator_step(\n        self, dataset: \"InputDataset\", batch_size: int = 50\n    ) -> None:\n        \"\"\"Create a root step to work as the `GeneratorStep` for the pipeline using a\n        dataset.\n\n        Args:\n            dataset: A dataset that will be used to create a `GeneratorStep` and\n                placed in the DAG as the root step.\n            batch_size: The size of the batches generated by the `GeneratorStep`.\n\n        Raises:\n            ValueError: If there's already a `GeneratorStep` in the pipeline.\n        \"\"\"\n        for step_name in self.dag:\n            step = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n            if isinstance(step_name, GeneratorStep):\n                raise DistilabelUserError(\n                    \"There is already a `GeneratorStep` in the pipeline, you can either\"\n                    \" pass a `dataset` to the run method, or create a `GeneratorStep` explictly.\"\n                    f\" `GeneratorStep`: {step}\",\n                    page=\"sections/how_to_guides/basic/step/#types-of-steps\",\n                )\n        loader = make_generator_step(\n            dataset=dataset,\n            pipeline=self,\n            batch_size=batch_size,\n        )\n        self.dag.add_root_step(loader)\n\n    def get_runtime_parameters_info(self) -> \"PipelineRuntimeParametersInfo\":\n    
    \"\"\"Get the runtime parameters for the steps in the pipeline.\n\n        Returns:\n            A dictionary with the step name as the key and a list of dictionaries with\n            the parameter name and the parameter info as the value.\n        \"\"\"\n        runtime_parameters = {}\n        for step_name in self.dag:\n            step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n            runtime_parameters[step_name] = step.get_runtime_parameters_info()\n        return runtime_parameters\n\n    def _init_steps_load_status(self) -> None:\n        \"\"\"Initialize the `_steps_load_status` dictionary assigning 0 to every step of\n        the pipeline.\"\"\"\n        for step_name in self.dag:\n            self._steps_load_status[step_name] = _STEP_NOT_LOADED_CODE\n\n    def _set_pipeline_artifacts_path_in_steps(self) -> None:\n        \"\"\"Sets the attribute `_pipeline_artifacts_path` in all the `Step`s of the pipeline,\n        so steps can use it to get the path to save the generated artifacts.\"\"\"\n        artifacts_path = self._cache_location[\"data\"] / constants.STEPS_ARTIFACTS_PATH\n        for name in self.dag:\n            step: \"_Step\" = self.dag.get_step(name)[constants.STEP_ATTR_NAME]\n            step.set_pipeline_artifacts_path(path=artifacts_path)\n\n    def _check_requirements(self) -> None:\n        \"\"\"Checks if the dependencies required to run the pipeline are installed.\n\n        Raises:\n            ModuleNotFoundError: if one or more requirements are missing.\n        \"\"\"\n        if to_install := self.requirements_to_install():\n            # Print the list of requirements like they would appear in a requirements.txt\n            to_install_list = \"\\n\" + \"\\n\".join(to_install)\n            msg = f\"Please install the following requirements to run the pipeline: {to_install_list}\"\n            self._logger.error(msg)\n            raise ModuleNotFoundError(msg)\n\n    def _setup_fsspec(\n       
 self, storage_parameters: Optional[Dict[str, Any]] = None\n    ) -> None:\n        \"\"\"Setups the `fsspec` filesystem to be used to store the data of the `_Batch`es\n        passed between the steps.\n\n        Args:\n            storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n                that will be used to store the data of the `_Batch`es passed between the\n                steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n                `GlobalStep` it will be always used). It must have at least the \"path\" key,\n                and it can contain additional keys depending on the protocol. By default,\n                it will use the local file system and a directory in the cache directory.\n                Defaults to `None`.\n        \"\"\"\n        if not storage_parameters:\n            self._fs = fsspec.filesystem(\"file\")\n            self._storage_base_path = (\n                f\"file://{self._cache_location['batch_input_data']}\"\n            )\n            return\n\n        if \"path\" not in storage_parameters:\n            raise DistilabelUserError(\n                \"The 'path' key must be present in the `storage_parameters` dictionary\"\n                \" if it's not `None`.\",\n                page=\"sections/how_to_guides/advanced/fs_to_pass_data/\",\n            )\n\n        path = storage_parameters.pop(\"path\")\n        protocol = UPath(path).protocol\n\n        self._fs = fsspec.filesystem(protocol, **storage_parameters)\n        self._storage_base_path = path\n\n    def _add_step(self, step: \"_Step\") -> None:\n        \"\"\"Add a step to the pipeline.\n\n        Args:\n            step: The step to be added to the pipeline.\n        \"\"\"\n        self.dag.add_step(step)\n\n    def _add_edge(self, from_step: str, to_step: str) -> None:\n        \"\"\"Add an edge between two steps in the pipeline.\n\n        Args:\n            from_step: The name of the step that will 
generate the input for `to_step`.\n            to_step: The name of the step that will receive the input from `from_step`.\n        \"\"\"\n        self.dag.add_edge(from_step, to_step)\n\n        # Check if `from_step` has a `routing_batch_function`. If it does, then mark\n        # `to_step` as a step that will receive a routed batch.\n        node = self.dag.get_step(from_step)  # type: ignore\n        routing_batch_function = node.get(\n            constants.ROUTING_BATCH_FUNCTION_ATTR_NAME, None\n        )\n        self.dag.set_step_attr(\n            name=to_step,\n            attr=constants.RECEIVES_ROUTED_BATCHES_ATTR_NAME,\n            value=routing_batch_function is not None,\n        )\n\n    def _is_convergence_step(self, step_name: str) -> None:\n        \"\"\"Checks if a step is a convergence step.\n\n        Args:\n            step_name: The name of the step.\n        \"\"\"\n        return self.dag.get_step(step_name).get(constants.CONVERGENCE_STEP_ATTR_NAME)\n\n    def _add_routing_batch_function(\n        self, step_name: str, routing_batch_function: \"RoutingBatchFunction\"\n    ) -> None:\n        \"\"\"Add a routing batch function to a step.\n\n        Args:\n            step_name: The name of the step that will receive the routed batch.\n            routing_batch_function: The function that will route the batch to the step.\n        \"\"\"\n        self.dag.set_step_attr(\n            name=step_name,\n            attr=constants.ROUTING_BATCH_FUNCTION_ATTR_NAME,\n            value=routing_batch_function,\n        )\n\n    def _set_runtime_parameters(self, parameters: Dict[str, Dict[str, Any]]) -> None:\n        \"\"\"Set the runtime parameters for the steps in the pipeline.\n\n        Args:\n            parameters: A dictionary with the step name as the key and a dictionary with\n            the parameter name as the key and the parameter value as the value.\n        \"\"\"\n        step_names = set(self.dag.G)\n        for step_name, 
step_parameters in parameters.items():\n            if step_name not in step_names:\n                self._logger.warning(\n                    f\"\u2753 Step '{step_name}' provided in `Pipeline.run(parameters={{...}})` not found in the pipeline.\"\n                    f\" Available steps are: {step_names}.\"\n                )\n            else:\n                step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n                step.set_runtime_parameters(step_parameters)\n\n    def _model_dump(self, obj: Any, **kwargs: Any) -> Dict[str, Any]:\n        \"\"\"Dumps the DAG content to a dict.\n\n        Args:\n            obj (Any): Unused, just kept to match the signature of the parent method.\n            kwargs (Any): Unused, just kept to match the signature of the parent method.\n\n        Returns:\n            Dict[str, Any]: Internal representation of the DAG from networkx in a serializable format.\n        \"\"\"\n        return self.dag.dump()\n\n    def draw(\n        self,\n        path: Optional[Union[str, Path]] = \"pipeline.png\",\n        top_to_bottom: bool = False,\n        show_edge_labels: bool = True,\n    ) -> str:\n        \"\"\"\n        Draws the pipeline.\n\n        Parameters:\n            path: The path to save the image to.\n            top_to_bottom: Whether to draw the DAG top to bottom. Defaults to `False`.\n            show_edge_labels: Whether to show the edge labels. 
Defaults to `True`.\n\n        Returns:\n            The path to the saved image.\n        \"\"\"\n        png = self.dag.draw(\n            top_to_bottom=top_to_bottom, show_edge_labels=show_edge_labels\n        )\n        with open(path, \"wb\") as f:\n            f.write(png)\n        return path\n\n    def __repr__(self) -> str:\n        \"\"\"\n        If running in a Jupyter notebook, display an image representing this `Pipeline`.\n        \"\"\"\n        if in_notebook():\n            try:\n                from IPython.display import Image, display\n\n                image_data = self.dag.draw()\n\n                display(Image(image_data))\n            except Exception:\n                pass\n        return super().__repr__()\n\n    def dump(self, **kwargs: Any) -> Dict[str, Any]:\n        return {\n            \"distilabel\": {\"version\": __version__},\n            \"pipeline\": {\n                \"name\": self.name,\n                \"description\": self.description,\n                **super().dump(),\n            },\n            \"requirements\": self.requirements,\n        }\n\n    @classmethod\n    def from_dict(cls, data: Dict[str, Any]) -> Self:\n        \"\"\"Create a Pipeline from a dict containing the serialized data.\n\n        Note:\n            It's intended for internal use.\n\n        Args:\n            data (Dict[str, Any]): Dictionary containing the serialized data from a Pipeline.\n\n        Returns:\n            BasePipeline: Pipeline recreated from the dictionary info.\n        \"\"\"\n        name = data[\"pipeline\"][\"name\"]\n        description = data[\"pipeline\"].get(\"description\")\n        requirements = data.get(\"requirements\", [])\n        with cls(name=name, description=description, requirements=requirements) as pipe:\n            pipe.dag = DAG.from_dict(data[\"pipeline\"])\n        return pipe\n\n    @property\n    def _cache_location(self) -> \"_CacheLocation\":\n        \"\"\"Dictionary containing the object that 
will stored and the location,\n        whether it is a filename or a folder.\n\n        Returns:\n            Path: Filenames where the pipeline content will be serialized.\n        \"\"\"\n        folder = self._cache_dir / self.name / self.signature\n        pipeline_execution_dir = folder / \"executions\" / self.aggregated_steps_signature\n        return {\n            \"pipeline\": pipeline_execution_dir / \"pipeline.yaml\",\n            \"batch_manager\": pipeline_execution_dir / \"batch_manager.json\",\n            \"steps_data\": self._cache_dir / self.name / \"steps_data\",\n            \"data\": pipeline_execution_dir / \"data\",\n            \"batch_input_data\": pipeline_execution_dir / \"batch_input_data\",\n            \"log_file\": pipeline_execution_dir / \"pipeline.log\",\n            \"stages_file\": pipeline_execution_dir / \"stages.json\",\n        }\n\n    @property\n    def aggregated_steps_signature(self) -> str:\n        \"\"\"Creates an aggregated signature using `Step`s signature that will be used for\n        the `_BatchManager`.\n\n        Returns:\n            The aggregated signature.\n        \"\"\"\n        signatures = []\n        for step_name in self.dag:\n            step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n            signatures.append(step.signature)\n        return hashlib.sha1(\"\".join(signatures).encode()).hexdigest()\n\n    def _cache(self) -> None:\n        \"\"\"Saves the `BasePipeline` using the `_cache_filename`.\"\"\"\n        if self._dry_run:\n            return\n\n        self.save(\n            path=self._cache_location[\"pipeline\"],\n            format=self._cache_location[\"pipeline\"].suffix.replace(\".\", \"\"),  # type: ignore\n        )\n\n        if self._batch_manager is not None:\n            self._batch_manager.cache(\n                path=self._cache_location[\"batch_manager\"],\n                steps_data_path=self._cache_location[\"steps_data\"],\n            )\n\n     
   self._save_stages_status()\n\n        self._logger.debug(\"Pipeline and batch manager saved to cache.\")\n\n    def _save_stages_status(self) -> None:\n        \"\"\"Saves the stages status to cache.\"\"\"\n        self.save(\n            path=self._cache_location[\"stages_file\"],\n            format=\"json\",\n            dump={\n                \"current_stage\": self._current_stage,\n                \"stages_last_batch\": self._stages_last_batch,\n            },\n        )\n\n    def _load_stages_status(self, use_cache: bool = True) -> None:\n        \"\"\"Try to load the stages status from cache, or initialize it if cache file doesn't\n        exist or cache is not going to be used.\"\"\"\n        if use_cache and self._cache_location[\"stages_file\"].exists():\n            stages_status = read_json(self._cache_location[\"stages_file\"])\n            self._current_stage = stages_status[\"current_stage\"]\n            self._stages_last_batch = stages_status[\"stages_last_batch\"]\n        else:\n            self._current_stage = 0\n            self._stages_last_batch = [\n                [] for _ in range(len(self.dag.get_steps_load_stages()[0]))\n            ]\n\n    def _refresh_pipeline_from_cache(self) -> None:\n        \"\"\"Refresh the DAG (and its steps) from the cache file. 
This is useful as some\n        `Step`s can update and change their state during the pipeline execution, and this\n        method will make sure the pipeline is up-to-date with the latest changes when\n        the pipeline is reloaded from cache.\n        \"\"\"\n\n        def recursively_handle_secrets_and_excluded_attributes(\n            cached_model: \"BaseModel\", model: \"BaseModel\"\n        ) -> None:\n            \"\"\"Recursively handle the secrets and excluded attributes of a `BaseModel`,\n            setting the values of the cached model to the values of the model.\n\n            Args:\n                cached_model: The cached model that will be updated as it doesn't contain\n                    the secrets and excluded attributes (not serialized).\n                model: The model that contains the secrets and excluded attributes because\n                    it comes from pipeline instantiation.\n            \"\"\"\n            for field_name, field_info in cached_model.model_fields.items():\n                if field_name in (\"pipeline\"):\n                    continue\n\n                inner_type = extract_annotation_inner_type(field_info.annotation)\n                if is_type_pydantic_secret_field(inner_type) or field_info.exclude:\n                    setattr(cached_model, field_name, getattr(model, field_name))\n                elif isclass(inner_type) and issubclass(inner_type, BaseModel):\n                    recursively_handle_secrets_and_excluded_attributes(\n                        getattr(cached_model, field_name),\n                        getattr(model, field_name),\n                    )\n\n        if self._cache_location[\"pipeline\"].exists():\n            cached_dag = self.from_yaml(self._cache_location[\"pipeline\"]).dag\n\n            for step_name in cached_dag:\n                step_cached: \"_Step\" = cached_dag.get_step(step_name)[\n                    constants.STEP_ATTR_NAME\n                ]\n                step: \"_Step\" 
= self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n                recursively_handle_secrets_and_excluded_attributes(step_cached, step)\n\n            self.dag = cached_dag\n\n    def _load_batch_manager(self, use_cache: bool = True) -> None:\n        \"\"\"Will try to load the `_BatchManager` from the cache dir if found. Otherwise,\n        it will create one from scratch.\n\n        If the `_BatchManager` is loaded from cache, we check for invalid steps (those that\n        may have a different signature than the original in the pipeline folder), and\n        restart them, as well as their successors.\n\n        Args:\n            use_cache: whether the cache should be used or not.\n        \"\"\"\n        batch_manager_cache_loc = self._cache_location[\"batch_manager\"]\n\n        # This first condition handles the case in which the pipeline is exactly the same\n        # no steps have been added, removed or changed.\n        if use_cache and batch_manager_cache_loc.exists():\n            self._logger.info(\n                f\"\ud83d\udcbe Loading `_BatchManager` from cache: '{batch_manager_cache_loc}'\"\n            )\n            self._batch_manager = _BatchManager.load_from_cache(\n                dag=self.dag,\n                batch_manager_path=batch_manager_cache_loc,\n                steps_data_path=self._cache_location[\"steps_data\"],\n            )\n            self._invalidate_steps_cache_if_required()\n        # In this other case, the pipeline has been changed. 
We need to create a new batch\n        # manager and if `use_cache==True` then check which outputs have we computed and\n        # cached for steps that haven't changed but that were executed in another pipeline,\n        # and therefore we can reuse\n        else:\n            self._batch_manager = _BatchManager.from_dag(\n                dag=self.dag,\n                use_cache=use_cache,\n                steps_data_path=self._cache_location[\"steps_data\"],\n            )\n\n    def _invalidate_steps_cache_if_required(self) -> None:\n        \"\"\"Iterates over the steps of the pipeline and invalidates their cache if required.\"\"\"\n        for step_name in self.dag:\n            # `GeneratorStep`s doesn't receive input data so no need to check their\n            # `_BatchManagerStep`\n            if self.dag.get_step(step_name)[constants.STEP_ATTR_NAME].is_generator:\n                continue\n\n            step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n            if not step.use_cache:\n                self._batch_manager.invalidate_cache_for(\n                    step_name=step.name,\n                    dag=self.dag,\n                    steps_data_path=self._cache_location[\"steps_data\"],\n                )  # type: ignore\n                self._logger.info(\n                    f\"\u267b\ufe0f Step '{step.name}' won't use cache (`use_cache=False`). 
The cache of this step and their successors won't be\"\n                    \" reused and the results will have to be recomputed.\"\n                )\n                break\n\n    def _setup_write_buffer(self, use_cache: bool = True) -> None:\n        \"\"\"Setups the `_WriteBuffer` that will store the data of the leaf steps of the\n        pipeline while running, so the `Distiset` can be created at the end.\n        \"\"\"\n        if not use_cache and self._cache_location[\"data\"].exists():\n            shutil.rmtree(self._cache_location[\"data\"])\n        buffer_data_path = self._cache_location[\"data\"] / constants.STEPS_OUTPUTS_PATH\n        self._logger.info(f\"\ud83d\udcdd Pipeline data will be written to '{buffer_data_path}'\")\n        self._write_buffer = _WriteBuffer(\n            buffer_data_path,\n            self.dag.leaf_steps,\n            steps_cached={\n                step_name: self.dag.get_step(step_name)[\n                    constants.STEP_ATTR_NAME\n                ].use_cache\n                for step_name in self.dag\n            },\n        )\n\n    def _print_load_stages_info(self) -> None:\n        \"\"\"Prints the information about the load stages.\"\"\"\n        stages, _ = self.dag.get_steps_load_stages()\n        msg = \"\"\n        for stage, steps in enumerate(stages):\n            steps_to_be_loaded = self._steps_to_be_loaded_in_stage(stage)\n            msg += f\"\\n * Stage {stage}:\"\n            for step in steps:\n                msg += f\"\\n   - '{step}'\"\n                if step not in steps_to_be_loaded:\n                    msg += \" (results cached, won't be loaded and executed)\"\n        self._logger.info(\n            f\"\u231b The steps of the pipeline will be loaded in stages:{msg}\"\n        )\n\n    def _run_output_queue_loop_in_thread(self) -> threading.Thread:\n        \"\"\"Runs the output queue loop in a separate thread to receive the output batches\n        from the steps. 
This is done to avoid the signal handler to block the loop, which\n        would prevent the pipeline from stopping correctly.\"\"\"\n        thread = threading.Thread(target=self._output_queue_loop)\n        thread.start()\n        return thread\n\n    def _output_queue_loop(self) -> None:\n        \"\"\"Loop to receive the output batches from the steps and manage the flow of the\n        batches through the pipeline.\"\"\"\n        if not self._initialize_pipeline_execution():\n            return\n\n        while self._should_continue_processing():  # type: ignore\n            self._logger.debug(\"Waiting for output batch from step...\")\n            if (batch := self._output_queue.get()) is None:\n                self._logger.debug(\"Received `None` from output queue. Breaking loop.\")\n                break\n\n            self._logger.debug(\n                f\"Received batch with seq_no {batch.seq_no} from step '{batch.step_name}'\"\n                f\" from output queue: {batch}\"\n            )\n\n            self._process_batch(batch)\n\n            # If `_stop_called` was set to `True` while waiting for the output queue, then\n            # we need to handle the stop of the pipeline and break the loop to avoid\n            # propagating the batches through the pipeline and making the stop process\n            # slower.\n            with self._stop_called_lock:\n                if self._stop_called:\n                    self._handle_batch_on_stop(batch)\n                    break\n\n            # If there is another load stage and all the `last_batch`es from the stage\n            # have been received, then load the next stage.\n            if self._should_load_next_stage():\n                if not self._update_stage():\n                    break\n\n            self._manage_batch_flow(batch)\n\n        self._finalize_pipeline_execution()\n\n    def _initialize_pipeline_execution(self) -> bool:\n        \"\"\"Load the steps of the required stage to 
initialize the pipeline execution,\n        and requests the initial batches to trigger the batch flowing in the pipeline.\n\n        Returns:\n            `True` if initialization went OK, `False` otherwise.\n        \"\"\"\n        # Wait for all the steps to be loaded correctly\n        if not self._run_stage_steps_and_wait(stage=self._current_stage):\n            self._set_steps_not_loaded_exception()\n            return False\n\n        # Send the \"first\" batches to the steps so the batches starts flowing through\n        # the input queues and output queue\n        self._request_initial_batches()\n\n        return True\n\n    def _should_continue_processing(self) -> bool:\n        \"\"\"Condition for the consume batches from the `output_queue` loop.\n\n        Returns:\n            `True` if should continue consuming batches, `False` otherwise and the pipeline\n            should stop.\n        \"\"\"\n        with self._stop_called_lock:\n            return self._batch_manager.can_generate() and not self._stop_called  # type: ignore\n\n    def _process_batch(\n        self, batch: \"_Batch\", send_last_batch_flag: bool = True\n    ) -> None:\n        \"\"\"Process a batch consumed from the `output_queue`.\n\n        Args:\n            batch: the batch to be processed.\n        \"\"\"\n        if batch.data_path:\n            self._logger.debug(\n                f\"Reading {batch.seq_no} batch data from '{batch.step_name}': '{batch.data_path}'\"\n            )\n            batch.read_batch_data_from_fs()\n\n        if batch.step_name in self.dag.leaf_steps:\n            self._write_buffer.add_batch(batch)  # type: ignore\n\n        if batch.last_batch:\n            self._register_stages_last_batch(batch)\n\n            # Make sure to send the `LAST_BATCH_SENT_FLAG` to the predecessors of the step\n            # if the batch is the last one, so they stop their processing loop even if they\n            # haven't received the last batch because of the routing 
function.\n            if send_last_batch_flag:\n                for step_name in self.dag.get_step_predecessors(batch.step_name):\n                    if self._is_step_running(step_name):\n                        self._send_last_batch_flag_to_step(step_name)\n\n    def _set_step_for_recovering_offline_batch_generation(\n        self, step: \"_Step\", data: List[List[Dict[str, Any]]]\n    ) -> None:\n        \"\"\"Sets the required information to recover a pipeline execution from a `_Step`\n        that used an `LLM` with offline batch generation.\n\n        Args:\n            step: The `_Step` that used an `LLM` with offline batch generation.\n            data: The data that was used to generate the batches for the step.\n        \"\"\"\n        # Replace step so the attribute `jobs_ids` of the `LLM` is not lost, as it was\n        # updated in the child process but not in the main process.\n        step_name: str = step.name  # type: ignore\n        self.dag.set_step_attr(\n            name=step_name, attr=constants.STEP_ATTR_NAME, value=step\n        )\n        self._recover_offline_batch_generate_for_step = (step_name, data)\n\n    def _add_batch_for_recovering_offline_batch_generation(self) -> None:\n        \"\"\"Adds a dummy `_Batch` to the specified step name (it's a `Task` that used an\n        `LLM` with offline batch generation) to recover the pipeline state for offline\n        batch generation in next pipeline executions.\"\"\"\n        assert self._batch_manager, \"Batch manager is not set\"\n\n        if self._recover_offline_batch_generate_for_step is None:\n            return\n\n        step_name, data = self._recover_offline_batch_generate_for_step\n        self._logger.debug(\n            f\"Adding batch to '{step_name}' step to recover pipeline execution for offline\"\n            \" batch generation...\"\n        )\n        self._batch_manager.add_batch_to_recover_offline_batch_generation(\n            to_step=step_name,\n            
data=data,\n        )\n\n    def _register_stages_last_batch(self, batch: \"_Batch\") -> None:\n        \"\"\"Registers the last batch received from a step in the `_stages_last_batch`\n        dictionary.\n\n        Args:\n            batch: The last batch received from a step.\n        \"\"\"\n        _, stages_last_steps = self.dag.get_steps_load_stages()\n        stage_last_steps = stages_last_steps[self._current_stage]\n        if batch.step_name in stage_last_steps:\n            self._stages_last_batch[self._current_stage].append(batch.step_name)\n            self._stages_last_batch[self._current_stage].sort()\n\n    def _update_stage(self) -> bool:\n        \"\"\"Checks if the steps of next stage should be loaded and updates `_current_stage`\n        attribute.\n\n        Returns:\n            `True` if updating the stage went OK, `False` otherwise.\n        \"\"\"\n        self._current_stage += 1\n        if not self._run_stage_steps_and_wait(stage=self._current_stage):\n            self._set_steps_not_loaded_exception()\n            return False\n\n        return True\n\n    def _should_load_next_stage(self) -> bool:\n        \"\"\"Returns if the next stage should be loaded.\n\n        Returns:\n            `True` if the next stage should be loaded, `False` otherwise.\n        \"\"\"\n        _, stage_last_steps = self.dag.get_steps_load_stages()\n        there_is_next_stage = self._current_stage + 1 < len(stage_last_steps)\n        stage_last_batches_received = (\n            self._stages_last_batch[self._current_stage]\n            == stage_last_steps[self._current_stage]\n        )\n        return there_is_next_stage and stage_last_batches_received\n\n    def _finalize_pipeline_execution(self) -> None:\n        \"\"\"Finalizes the pipeline execution handling the prematurely stop of the pipeline\n        if required, caching the data and ensuring that all the steps finish its execution.\"\"\"\n\n        # Send `None` to steps `input_queue`s just in case 
some step is still waiting\n        self._notify_steps_to_stop()\n\n        for step_name in self.dag:\n            while self._is_step_running(step_name):\n                self._logger.debug(f\"Waiting for step '{step_name}' to finish...\")\n                time.sleep(0.5)\n\n        with self._stop_called_lock:\n            if self._stop_called:\n                self._handle_stop()\n\n            # Reset flag state\n            self._stop_called = False\n\n        self._add_batch_for_recovering_offline_batch_generation()\n\n        self._cache()\n\n    def _run_load_queue_loop_in_thread(self) -> threading.Thread:\n        \"\"\"Runs a background thread that reads from the `load_queue` to update the status\n        of the number of replicas loaded for each step.\n\n        Returns:\n            The thread that was started.\n        \"\"\"\n        thread = threading.Thread(target=self._run_load_queue_loop)\n        thread.start()\n        return thread\n\n    def _run_load_queue_loop(self) -> None:\n        \"\"\"Runs a loop that reads from the `load_queue` to update the status of the number\n        of replicas loaded for each step.\"\"\"\n\n        while True:\n            if (load_info := self._load_queue.get()) is None:\n                self._logger.debug(\"Received `None` from load queue. 
Breaking loop.\")\n                break\n\n            with self._steps_load_status_lock:\n                step_name, status = load_info[\"name\"], load_info[\"status\"]\n                if status == \"loaded\":\n                    if self._steps_load_status[step_name] == _STEP_NOT_LOADED_CODE:\n                        self._steps_load_status[step_name] = 1\n                    else:\n                        self._steps_load_status[step_name] += 1\n                elif status == \"unloaded\":\n                    self._steps_load_status[step_name] -= 1\n                else:\n                    # load failed\n                    self._steps_load_status[step_name] = _STEP_LOAD_FAILED_CODE\n\n                self._logger.debug(\n                    f\"Step '{step_name}' loaded replicas: {self._steps_load_status[step_name]}\"\n                )\n\n    def _is_step_running(self, step_name: str) -> bool:\n        \"\"\"Checks if the step is running (at least one replica is running).\n\n        Args:\n            step_name: The step to be check if running.\n\n        Returns:\n            `True` if the step is running, `False` otherwise.\n        \"\"\"\n        with self._steps_load_status_lock:\n            return self._steps_load_status[step_name] >= 1\n\n    def _steps_to_be_loaded_in_stage(self, stage: int) -> List[str]:\n        \"\"\"Returns the list of steps of the provided stage that should be loaded taking\n        into account if they have finished.\n\n        Args:\n            stage: the stage number\n\n        Returns:\n            A list containing the name of the steps that should be loaded in this stage.\n        \"\"\"\n        assert self._batch_manager, \"Batch manager is not set\"\n\n        steps_stages, _ = self.dag.get_steps_load_stages()\n\n        return [\n            step\n            for step in steps_stages[stage]\n            if not self._batch_manager.step_has_finished(step)\n        ]\n\n    def _run_stage_steps_and_wait(self, stage: 
int) -> bool:\n        \"\"\"Runs the steps of the specified stage and waits for them to be ready.\n\n        Args:\n            stage: the stage from which the steps have to be loaded.\n\n        Returns:\n            `True` if all the steps have been loaded correctly, `False` otherwise.\n        \"\"\"\n        assert self._batch_manager, \"Batch manager is not set\"\n\n        steps = self._steps_to_be_loaded_in_stage(stage)\n        self._logger.debug(f\"Steps to be loaded in stage {stage}: {steps}\")\n\n        # Run the steps of the stage\n        self._run_steps(steps=steps)\n\n        # Wait for them to be ready\n        self._logger.info(f\"\u23f3 Waiting for all the steps of stage {stage} to load...\")\n        previous_message = None\n        with self._stop_called_lock:\n            while not self._stop_called:\n                with self._steps_load_status_lock:\n                    filtered_steps_load_status = {\n                        step_name: replicas\n                        for step_name, replicas in self._steps_load_status.items()\n                        if step_name in steps\n                    }\n                    self._logger.debug(\n                        f\"Steps from stage {stage} loaded: {filtered_steps_load_status}\"\n                    )\n\n                    if any(\n                        replicas_loaded == _STEP_LOAD_FAILED_CODE\n                        for replicas_loaded in filtered_steps_load_status.values()\n                    ):\n                        self._logger.error(\n                            f\"\u274c Failed to load all the steps of stage {stage}\"\n                        )\n                        return False\n\n                    num_steps_loaded = 0\n                    replicas_message = \"\"\n                    for step_name, replicas in filtered_steps_load_status.items():\n                        step_replica_count = self.dag.get_step_replica_count(step_name)\n                        if replicas == 
step_replica_count:\n                            num_steps_loaded += 1\n                        replicas_message += f\"\\n * '{step_name}' replicas: {max(0, replicas)}/{step_replica_count}\"\n\n                    message = f\"\u23f3 Steps from stage {stage} loaded: {num_steps_loaded}/{len(filtered_steps_load_status)}{replicas_message}\"\n                    if num_steps_loaded > 0 and message != previous_message:\n                        self._logger.info(message)\n                        previous_message = message\n\n                    if num_steps_loaded == len(filtered_steps_load_status):\n                        self._logger.info(\n                            f\"\u2705 All the steps from stage {stage} have been loaded!\"\n                        )\n                        return True\n\n                time.sleep(2.5)\n\n        return not self._stop_called\n\n    def _handle_stop(self) -> None:\n        \"\"\"Handles the stop of the pipeline execution, which will stop the steps from\n        processing more batches and wait for the output queue to be empty, to not lose\n        any data that was already processed by the steps before the stop was called.\"\"\"\n        self._logger.debug(\"Handling stop of the pipeline execution...\")\n\n        self._add_batches_back_to_batch_manager()\n\n        # Wait for the input queue to be empty, which means that all the steps finished\n        # processing the batches that were sent before the stop flag.\n        for step_name in self.dag:\n            self._wait_step_input_queue_empty(step_name)\n\n        self._consume_output_queue()\n\n        if self._should_load_next_stage():\n            self._current_stage += 1\n\n    def _wait_step_input_queue_empty(self, step_name: str) -> Union[\"Queue[Any]\", None]:\n        \"\"\"Waits for the input queue of a step to be empty.\n\n        Args:\n            step_name: The name of the step.\n\n        Returns:\n            The input queue of the step if it's not loaded or 
finished, `None` otherwise.\n        \"\"\"\n        if self._check_step_not_loaded_or_finished(step_name):\n            return None\n\n        if input_queue := self.dag.get_step(step_name).get(\n            constants.INPUT_QUEUE_ATTR_NAME\n        ):\n            while input_queue.qsize() != 0:\n                pass\n            return input_queue\n\n    def _check_step_not_loaded_or_finished(self, step_name: str) -> bool:\n        \"\"\"Checks if a step is not loaded or already finished.\n\n        Args:\n            step_name: The name of the step.\n\n        Returns:\n            `True` if the step is not loaded or already finished, `False` otherwise.\n        \"\"\"\n        with self._steps_load_status_lock:\n            num_replicas = self._steps_load_status[step_name]\n\n            # The step has finished (replicas = 0) or it has failed to load\n            if num_replicas in [0, _STEP_LOAD_FAILED_CODE]:\n                return True\n\n        return False\n\n    @property\n    @abstractmethod\n    def QueueClass(self) -> Callable:\n        \"\"\"The class of the queue to use in the pipeline.\"\"\"\n        pass\n\n    def _create_step_input_queue(self, step_name: str) -> \"Queue[Any]\":\n        \"\"\"Creates an input queue for a step.\n\n        Args:\n            step_name: The name of the step.\n\n        Returns:\n            The input queue created.\n        \"\"\"\n        input_queue = self.QueueClass()\n        self.dag.set_step_attr(step_name, constants.INPUT_QUEUE_ATTR_NAME, input_queue)\n        return input_queue\n\n    @abstractmethod\n    def _run_step(self, step: \"_Step\", input_queue: \"Queue[Any]\", replica: int) -> None:\n        \"\"\"Runs the `Step` instance.\n\n        Args:\n            step: The `Step` instance to run.\n            input_queue: The input queue where the step will receive the batches.\n            replica: The replica ID assigned.\n        \"\"\"\n        pass\n\n    def _run_steps(self, steps: Iterable[str]) -> 
None:\n        \"\"\"Runs the `Step`s of the pipeline, creating first an input queue for each step\n        that will be used to send the batches.\n\n        Args:\n            steps:\n        \"\"\"\n        for step_name in steps:\n            step: \"Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n            input_queue = self._create_step_input_queue(step_name=step_name)\n\n            # Set `pipeline` to `None` as in some Python environments the pipeline is not\n            # picklable and it will raise an error when trying to send the step to the process.\n            # `TypeError: cannot pickle 'code' object`\n            step.pipeline = None\n\n            if not step.is_normal and step.resources.replicas > 1:  # type: ignore\n                self._logger.warning(\n                    f\"Step '{step_name}' is a `GeneratorStep` or `GlobalStep` and has more\"\n                    \" than 1 replica. Only `Step` instances can have more than 1 replica.\"\n                    \" The number of replicas for the step will be set to 1.\"\n                )\n\n            step_num_replicas: int = step.resources.replicas if step.is_normal else 1  # type: ignore\n            for replica in range(step_num_replicas):\n                self._logger.debug(\n                    f\"Running 1 replica of step '{step.name}' with ID {replica}...\"\n                )\n                self._run_step(\n                    step=step.model_copy(deep=True),\n                    input_queue=input_queue,\n                    replica=replica,\n                )\n\n    def _add_batches_back_to_batch_manager(self) -> None:\n        \"\"\"Add the `Batch`es that were sent to a `Step` back to the `_BatchManager`. 
This\n        method should be used when the pipeline has been stopped prematurely.\"\"\"\n        for step_name in self.dag:\n            node = self.dag.get_step(step_name)\n            step: \"_Step\" = node[constants.STEP_ATTR_NAME]\n            if step.is_generator:\n                continue\n            if input_queue := node.get(constants.INPUT_QUEUE_ATTR_NAME):\n                while not input_queue.empty():\n                    batch = input_queue.get()\n                    if not isinstance(batch, _Batch):\n                        continue\n                    self._batch_manager.add_batch(  # type: ignore\n                        to_step=step_name,\n                        batch=batch,\n                        prepend=True,\n                    )\n                    self._logger.debug(\n                        f\"Adding batch back to the batch manager: {batch}\"\n                    )\n                input_queue.put(None)\n\n    def _consume_output_queue(self) -> None:\n        \"\"\"Consumes the `Batch`es from the output queue until it's empty. This method should\n        be used when the pipeline has been stopped prematurely to consume and to not lose\n        the `Batch`es that were processed by the leaf `Step`s before stopping the pipeline.\"\"\"\n        while not self._output_queue.empty():\n            batch = self._output_queue.get()\n            if batch is None:\n                continue\n            self._process_batch(batch, send_last_batch_flag=False)\n            self._handle_batch_on_stop(batch)\n\n    def _manage_batch_flow(self, batch: \"_Batch\") -> None:\n        \"\"\"Checks if the step that generated the batch has more data in its buffer to\n        generate a new batch. If there's data, then a new batch is sent to the step. 
If\n        the step has no data in its buffer, then the predecessors generator steps are\n        requested to send a new batch.\n\n        Args:\n            batch: The batch that was processed.\n        \"\"\"\n        assert self._batch_manager, \"Batch manager is not set\"\n\n        route_to, do_not_route_to, routed = self._get_successors(batch)\n\n        self._register_batch(batch)\n\n        # Keep track of the steps that the batch was routed to\n        if routed:\n            batch.batch_routed_to = route_to\n\n        self._set_next_expected_seq_no(\n            steps=do_not_route_to,\n            from_step=batch.step_name,\n            next_expected_seq_no=batch.seq_no + 1,\n        )\n\n        step = self._get_step_from_batch(batch)\n\n        # Add the batch to the successors input buffers\n        for successor in route_to:\n            # Copy batch to avoid modifying the same reference in the batch manager\n            batch_to_add = batch.copy() if len(route_to) > 1 else batch\n\n            self._batch_manager.add_batch(successor, batch_to_add)\n\n            # Check if the step is a generator and if there are successors that need data\n            # from this step. 
This usually happens when the generator `batch_size` is smaller\n            # than the `input_batch_size` of the successor steps.\n            if (\n                step.is_generator\n                and step.name in self._batch_manager.step_empty_buffers(successor)\n            ):\n                last_batch_sent = self._batch_manager.get_last_batch_sent(step.name)\n                self._send_batch_to_step(last_batch_sent.next_batch())  # type: ignore\n\n            # If successor step has enough data in its buffer to create a new batch, then\n            # send the batch to the step.\n            while new_batch := self._batch_manager.get_batch(successor):\n                self._send_batch_to_step(new_batch)\n\n        if not step.is_generator:\n            # Step (\"this\", the one from which the batch was received) has enough data on its\n            # buffers to create a new batch\n            while new_batch := self._batch_manager.get_batch(step.name):  # type: ignore\n                # if new_batch := self._batch_manager.get_batch(step.name):  # type: ignore\n                self._send_batch_to_step(new_batch)\n\n            else:\n                self._request_more_batches_if_needed(step)\n        else:\n            if len(self.dag) == 1:\n                self._request_batch_from_generator(step.name)  # type: ignore\n\n        self._cache()\n\n    def _send_to_step(self, step_name: str, to_send: Any) -> None:\n        \"\"\"Sends something to the input queue of a step.\n\n        Args:\n            step_name: The name of the step.\n            to_send: The object to send.\n        \"\"\"\n        input_queue = self.dag.get_step(step_name)[constants.INPUT_QUEUE_ATTR_NAME]\n        input_queue.put(to_send)\n\n    def _send_batch_to_step(self, batch: \"_Batch\") -> None:\n        \"\"\"Sends a batch to the input queue of a step, writing the data of the batch\n        to the filesystem and setting `batch.data_path` with the path where the data\n        was 
written (if requiered i.e. the step is a global step or `use_fs_to_pass_data`)\n\n        This method should be extended by the specific pipeline implementation, adding\n        the logic to send the batch to the step.\n\n        Args:\n            batch: The batch to send.\n        \"\"\"\n        self._logger.debug(\n            f\"Setting batch {batch.seq_no} as last batch sent to '{batch.step_name}': {batch}\"\n        )\n        self._batch_manager.set_last_batch_sent(batch)  # type: ignore\n\n        step: \"_Step\" = self.dag.get_step(batch.step_name)[constants.STEP_ATTR_NAME]\n        if not step.is_generator and (step.is_global or self._use_fs_to_pass_data):\n            base_path = UPath(self._storage_base_path) / step.name  # type: ignore\n            self._logger.debug(\n                f\"Writing {batch.seq_no} batch for '{batch.step_name}' step to filesystem: {base_path}\"\n            )\n            batch.write_batch_data_to_fs(self._fs, base_path)  # type: ignore\n\n        self._logger.debug(\n            f\"Sending batch {batch.seq_no} to step '{batch.step_name}': {batch}\"\n        )\n        self._send_to_step(batch.step_name, batch)\n\n    def _gather_requirements(self) -> List[str]:\n        \"\"\"Extracts the requirements from the steps to be used in the pipeline.\n\n        Returns:\n            List of requirements gathered from the steps.\n        \"\"\"\n        steps_requirements = []\n        for step in self.dag:\n            step_req = self.dag.get_step(step)[constants.STEP_ATTR_NAME].requirements\n            steps_requirements.extend(step_req)\n\n        return steps_requirements\n\n    def _register_batch(self, batch: \"_Batch\") -> None:\n        \"\"\"Registers a batch in the batch manager.\n\n        Args:\n            batch: The batch to register.\n        \"\"\"\n        assert self._batch_manager, \"Batch manager is not set\"\n        self._batch_manager.register_batch(\n            batch, 
steps_data_path=self._cache_location[\"steps_data\"]\n        )  # type: ignore\n        self._logger.debug(\n            f\"Batch {batch.seq_no} from step '{batch.step_name}' registered in batch\"\n            \" manager\"\n        )\n\n    def _send_last_batch_flag_to_step(self, step_name: str) -> None:\n        \"\"\"Sends the `LAST_BATCH_SENT_FLAG` to a step to stop processing batches.\n\n        Args:\n            step_name: The name of the step.\n        \"\"\"\n        self._logger.debug(\n            f\"Sending `LAST_BATCH_SENT_FLAG` to '{step_name}' step to stop processing\"\n            \" batches...\"\n        )\n\n        for _ in range(self.dag.get_step_replica_count(step_name)):\n            self._send_to_step(step_name, constants.LAST_BATCH_SENT_FLAG)\n        self._batch_manager.set_last_batch_flag_sent_to(step_name)  # type: ignore\n\n    def _request_initial_batches(self) -> None:\n        \"\"\"Requests the initial batches to the generator steps.\"\"\"\n        assert self._batch_manager, \"Batch manager is not set\"\n        for step in self._batch_manager._steps.values():\n            if not self._is_step_running(step.step_name):\n                continue\n            if batch := step.get_batch():\n                self._logger.debug(\n                    f\"Sending initial batch to '{step.step_name}' step: {batch}\"\n                )\n                self._send_batch_to_step(batch)\n\n        for step_name in self.dag.root_steps:\n            if not self._is_step_running(step_name):\n                continue\n            seq_no = 0\n            if last_batch := self._batch_manager.get_last_batch(step_name):\n                seq_no = last_batch.seq_no + 1\n            batch = _Batch(seq_no=seq_no, step_name=step_name, last_batch=self._dry_run)\n            self._logger.debug(\n                f\"Requesting initial batch to '{step_name}' generator step: {batch}\"\n            )\n            self._send_batch_to_step(batch)\n\n    def 
_request_batch_from_generator(self, step_name: str) -> None:\n        \"\"\"Request a new batch to a `GeneratorStep`.\n\n        Args:\n            step_name: the name of the `GeneratorStep` to which a batch has to be requested.\n        \"\"\"\n        # Get the last batch that the previous step sent to generate the next batch\n        # (next `seq_no`).\n        last_batch = self._batch_manager.get_last_batch_sent(step_name)  # type: ignore\n        if last_batch is None:\n            return\n        self._send_batch_to_step(last_batch.next_batch())\n\n    def _request_more_batches_if_needed(self, step: \"Step\") -> None:\n        \"\"\"Request more batches to the predecessors steps of `step` if needed.\n\n        Args:\n            step: The step of which it has to be checked if more batches are needed from\n                its predecessors.\n        \"\"\"\n        empty_buffers = self._batch_manager.step_empty_buffers(step.name)  # type: ignore\n        for previous_step_name in empty_buffers:\n            # Only more batches can be requested to the `GeneratorStep`s as they are the\n            # only kind of steps that lazily generate batches.\n            if previous_step_name not in self.dag.root_steps:\n                continue\n\n            self._request_batch_from_generator(previous_step_name)\n\n    def _handle_batch_on_stop(self, batch: \"_Batch\") -> None:\n        \"\"\"Handles a batch that was received from the output queue when the pipeline was\n        stopped. 
It will add and register the batch in the batch manager.\n\n        Args:\n            batch: The batch to handle.\n        \"\"\"\n        assert self._batch_manager, \"Batch manager is not set\"\n\n        self._batch_manager.register_batch(\n            batch, steps_data_path=self._cache_location[\"steps_data\"]\n        )\n        step: \"Step\" = self.dag.get_step(batch.step_name)[constants.STEP_ATTR_NAME]\n        for successor in self.dag.get_step_successors(step.name):  # type: ignore\n            self._batch_manager.add_batch(successor, batch)\n\n    def _get_step_from_batch(self, batch: \"_Batch\") -> \"Step\":\n        \"\"\"Gets the `Step` instance from a batch.\n\n        Args:\n            batch: The batch to get the step from.\n\n        Returns:\n            The `Step` instance.\n        \"\"\"\n        return self.dag.get_step(batch.step_name)[constants.STEP_ATTR_NAME]\n\n    def _notify_steps_to_stop(self) -> None:\n        \"\"\"Notifies the steps to stop their infinite running loop by sending `None` to\n        their input queues.\"\"\"\n        with self._steps_load_status_lock:\n            for step_name, replicas in self._steps_load_status.items():\n                if replicas > 0:\n                    for _ in range(replicas):\n                        self._send_to_step(step_name, None)\n\n    def _get_successors(self, batch: \"_Batch\") -> Tuple[List[str], List[str], bool]:\n        \"\"\"Gets the successors and the successors to which the batch has to be routed.\n\n        Args:\n            batch: The batch to which the successors will be determined.\n\n        Returns:\n            The successors to route the batch to and whether the batch was routed using\n            a routing function.\n        \"\"\"\n        node = self.dag.get_step(batch.step_name)\n        step: \"Step\" = node[constants.STEP_ATTR_NAME]\n        successors = list(self.dag.get_step_successors(step.name))  # type: ignore\n        route_to = successors\n\n        # 
Check if the step has a routing function to send the batch to specific steps\n        if routing_batch_function := node.get(\n            constants.ROUTING_BATCH_FUNCTION_ATTR_NAME\n        ):\n            route_to = routing_batch_function(batch, successors)\n            successors_str = \", \".join(f\"'{successor}'\" for successor in route_to)\n            self._logger.info(\n                f\"\ud83d\ude8f Using '{step.name}' routing function to send batch {batch.seq_no} to steps: {successors_str}\"\n            )\n\n        return route_to, list(set(successors) - set(route_to)), route_to != successors\n\n    def _set_next_expected_seq_no(\n        self, steps: List[str], from_step: str, next_expected_seq_no: int\n    ) -> None:\n        \"\"\"Sets the next expected sequence number of a `_Batch` received by `step` from\n        `from_step`. This is necessary as some `Step`s might not receive all the batches\n        comming from the previous steps because there is a routing batch function.\n\n        Args:\n            steps: list of steps to which the next expected sequence number of a `_Batch`\n                from `from_step` has to be updated in the `_BatchManager`.\n            from_step: the name of the step from which the next expected sequence number\n                of a `_Batch` has to be updated in `steps`.\n            next_expected_seq_no: the number of the next expected sequence number of a `Batch`\n                from `from_step`.\n        \"\"\"\n        assert self._batch_manager, \"Batch manager is not set\"\n\n        for step in steps:\n            self._batch_manager.set_next_expected_seq_no(\n                step_name=step,\n                from_step=from_step,\n                next_expected_seq_no=next_expected_seq_no,\n            )\n\n    @abstractmethod\n    def _teardown(self) -> None:\n        \"\"\"Clean/release/stop resources reserved to run the pipeline.\"\"\"\n        pass\n\n    @abstractmethod\n    def 
_set_steps_not_loaded_exception(self) -> None:\n        \"\"\"Used to raise `RuntimeError` when the load of the steps failed.\n\n        Raises:\n            RuntimeError: containing the information and why a step failed to be loaded.\n        \"\"\"\n        pass\n\n    @abstractmethod\n    def _stop(self) -> None:\n        \"\"\"Stops the pipeline in a controlled way.\"\"\"\n        pass\n\n    def _stop_load_queue_loop(self) -> None:\n        \"\"\"Stops the `_load_queue` loop sending a `None`.\"\"\"\n        self._logger.debug(\"Sending `None` to the load queue to notify stop...\")\n        self._load_queue.put(None)\n\n    def _stop_output_queue_loop(self) -> None:\n        \"\"\"Stops the `_output_queue` loop sending a `None`.\"\"\"\n        self._logger.debug(\"Sending `None` to the output queue to notify stop...\")\n        self._output_queue.put(None)\n\n    def _handle_keyboard_interrupt(self) -> Any:\n        \"\"\"Handles KeyboardInterrupt signal sent during the Pipeline.run method.\n\n        It will try to call self._stop (if the pipeline didn't started yet, it won't\n        have any effect), and if the pool is already started, will close it before exiting\n        the program.\n\n        Returns:\n            The original `signal.SIGINT` handler.\n        \"\"\"\n\n        def signal_handler(signumber: int, frame: Any) -> None:\n            self._stop()\n\n        return signal.signal(signal.SIGINT, signal_handler)\n
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.signature","title":"signature: str property","text":"

Makes a signature (hash) of a pipeline, using the step ids and the adjacency between them.

The main use is to find the pipeline in the cache folder.

Returns:

Type Description str

Signature of the pipeline.

"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.aggregated_steps_signature","title":"aggregated_steps_signature: str property","text":"

Creates an aggregated signature from the signatures of the Steps, which will be used for the _BatchManager.

Returns:

Type Description str

The aggregated signature.

"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.QueueClass","title":"QueueClass: Callable abstractmethod property","text":"

The class of the queue to use in the pipeline.

"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.__init__","title":"__init__(name=None, description=None, cache_dir=None, enable_metadata=False, requirements=None)","text":"

Initialize the BasePipeline instance.

Parameters:

Name Type Description Default name Optional[str]

The name of the pipeline. If not provided, a random one will be generated by default.

None description Optional[str]

A description of the pipeline. Defaults to None.

None cache_dir Optional[Union[str, PathLike]]

A directory where the pipeline will be cached. Defaults to None.

None enable_metadata bool

Whether to include the distilabel metadata column for the pipeline in the final Distiset. It contains metadata used by distilabel, for example the raw outputs of the LLM without processing would be here, inside raw_output_... field. Defaults to False.

False requirements Optional[List[str]]

List of requirements that must be installed to run the pipeline. Defaults to None, but can be helpful to inform the users of a shared pipeline that these requirements must be installed.

None Source code in src/distilabel/pipeline/base.py
def __init__(\n    self,\n    name: Optional[str] = None,\n    description: Optional[str] = None,\n    cache_dir: Optional[Union[str, \"PathLike\"]] = None,\n    enable_metadata: bool = False,\n    requirements: Optional[List[str]] = None,\n) -> None:\n    \"\"\"Initialize the `BasePipeline` instance.\n\n    Args:\n        name: The name of the pipeline. If not generated, a random one will be generated by default.\n        description: A description of the pipeline. Defaults to `None`.\n        cache_dir: A directory where the pipeline will be cached. Defaults to `None`.\n        enable_metadata: Whether to include the distilabel metadata column for the pipeline\n            in the final `Distiset`. It contains metadata used by distilabel, for example\n            the raw outputs of the `LLM` without processing would be here, inside `raw_output_...`\n            field. Defaults to `False`.\n        requirements: List of requirements that must be installed to run the pipeline.\n            Defaults to `None`, but can be helpful to inform in a pipeline to be shared\n            that this requirements must be installed.\n    \"\"\"\n    self.name = name or _PIPELINE_DEFAULT_NAME\n    self.description = description\n    self._enable_metadata = enable_metadata\n    self.dag = DAG()\n\n    if cache_dir:\n        self._cache_dir = Path(cache_dir)\n    elif env_cache_dir := envs.DISTILABEL_CACHE_DIR:\n        self._cache_dir = Path(env_cache_dir)\n    else:\n        self._cache_dir = constants.PIPELINES_CACHE_DIR\n\n    self._logger = logging.getLogger(\"distilabel.pipeline\")\n\n    self._batch_manager: Optional[\"_BatchManager\"] = None\n    self._write_buffer: Optional[\"_WriteBuffer\"] = None\n\n    self._steps_load_status: Dict[str, int] = {}\n    self._steps_load_status_lock = threading.Lock()\n\n    self._stop_called = False\n    self._stop_called_lock = threading.Lock()\n    self._stop_calls = 0\n\n    self._recover_offline_batch_generate_for_step: Union[\n        
Tuple[str, List[List[Dict[str, Any]]]], None\n    ] = None\n\n    self._fs: Optional[fsspec.AbstractFileSystem] = None\n    self._storage_base_path: Optional[str] = None\n    self._use_fs_to_pass_data: bool = False\n    self._dry_run = False\n\n    self._current_stage = 0\n    self._stages_last_batch: List[List[str]] = []\n\n    self.requirements = requirements or []\n\n    self._exception: Union[Exception, None] = None\n\n    self._log_queue: Union[\"Queue[Any]\", None] = None\n
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.__enter__","title":"__enter__()","text":"

Set the global pipeline instance when entering a pipeline context.

Source code in src/distilabel/pipeline/base.py
def __enter__(self) -> Self:\n    \"\"\"Set the global pipeline instance when entering a pipeline context.\"\"\"\n    _GlobalPipelineManager.set_pipeline(self)\n    return self\n
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.__exit__","title":"__exit__(exc_type, exc_value, traceback)","text":"

Unset the global pipeline instance when exiting a pipeline context.

Source code in src/distilabel/pipeline/base.py
def __exit__(self, exc_type, exc_value, traceback) -> None:\n    \"\"\"Unset the global pipeline instance when exiting a pipeline context.\"\"\"\n    _GlobalPipelineManager.set_pipeline(None)\n    self._set_pipeline_name()\n
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.run","title":"run(parameters=None, use_cache=True, storage_parameters=None, use_fs_to_pass_data=False, dataset=None, dataset_batch_size=50, logging_handlers=None)","text":"

Run the pipeline. It will set the runtime parameters for the steps and validate the pipeline.

This method should be extended by the specific pipeline implementation, adding the logic to run the pipeline.

Parameters:

Name Type Description Default parameters Optional[Dict[str, Dict[str, Any]]]

A dictionary with the step name as the key and a dictionary with the runtime parameters for the step as the value. Defaults to None.

None use_cache bool

Whether to use the cache from previous pipeline runs. Defaults to True.

True storage_parameters Optional[Dict[str, Any]]

A dictionary with the storage parameters (fsspec and path) that will be used to store the data of the _Batches passed between the steps if use_fs_to_pass_data is True (for the batches received by a GlobalStep it will always be used). It must have at least the \"path\" key, and it can contain additional keys depending on the protocol. By default, it will use the local file system and a directory in the cache directory. Defaults to None.

None use_fs_to_pass_data bool

Whether to use the file system to pass the data of the _Batches between the steps. Even if this parameter is False, the _Batches received by GlobalSteps will always use the file system to pass the data. Defaults to False.

False dataset Optional[InputDataset]

If given, it will be used to create a GeneratorStep and put it as the root step. Convenient method when you have already processed the dataset in your script and just want to pass it already processed. Defaults to None.

None dataset_batch_size int

If dataset is given, this will be the size of the batches yielded by the GeneratorStep created using the dataset. Defaults to 50.

50 logging_handlers Optional[List[Handler]]

A list of logging handlers that will be used to log the output of the pipeline. This argument can be useful so the logging messages can be extracted and used in a different context. Defaults to None.

None

Returns:

Type Description Distiset

The Distiset created by the pipeline.

Source code in src/distilabel/pipeline/base.py
def run(\n    self,\n    parameters: Optional[Dict[str, Dict[str, Any]]] = None,\n    use_cache: bool = True,\n    storage_parameters: Optional[Dict[str, Any]] = None,\n    use_fs_to_pass_data: bool = False,\n    dataset: Optional[\"InputDataset\"] = None,\n    dataset_batch_size: int = 50,\n    logging_handlers: Optional[List[logging.Handler]] = None,\n) -> \"Distiset\":  # type: ignore\n    \"\"\"Run the pipeline. It will set the runtime parameters for the steps and validate\n    the pipeline.\n\n    This method should be extended by the specific pipeline implementation,\n    adding the logic to run the pipeline.\n\n    Args:\n        parameters: A dictionary with the step name as the key and a dictionary with\n            the runtime parameters for the step as the value. Defaults to `None`.\n        use_cache: Whether to use the cache from previous pipeline runs. Defaults to\n            `True`.\n        storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n            that will be used to store the data of the `_Batch`es passed between the\n            steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n            `GlobalStep` it will be always used). It must have at least the \"path\" key,\n            and it can contain additional keys depending on the protocol. By default,\n            it will use the local file system and a directory in the cache directory.\n            Defaults to `None`.\n        use_fs_to_pass_data: Whether to use the file system to pass the data of\n            the `_Batch`es between the steps. Even if this parameter is `False`, the\n            `Batch`es received by `GlobalStep`s will always use the file system to\n            pass the data. Defaults to `False`.\n        dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n            root step. 
Convenient method when you have already processed the dataset in\n            your script and just want to pass it already processed. Defaults to `None`.\n        dataset_batch_size: if `dataset` is given, this will be the size of the batches\n            yield by the `GeneratorStep` created using the `dataset`. Defaults to `50`.\n        logging_handlers: A list of logging handlers that will be used to log the\n            output of the pipeline. This argument can be useful so the logging messages\n            can be extracted and used in a different context. Defaults to `None`.\n\n    Returns:\n        The `Distiset` created by the pipeline.\n    \"\"\"\n\n    self._exception: Union[Exception, None] = None\n\n    # Set the runtime parameters that will be used during the pipeline execution.\n    # They are used to generate the signature of the pipeline that is used to hit the\n    # cache when the pipeline is run, so it's important to do it first.\n    self._set_runtime_parameters(parameters or {})\n\n    self._refresh_pipeline_from_cache()\n\n    if dataset is not None:\n        self._add_dataset_generator_step(dataset, dataset_batch_size)\n\n    setup_logging(\n        log_queue=self._log_queue,\n        filename=str(self._cache_location[\"log_file\"]),\n        logging_handlers=logging_handlers,\n    )\n\n    # Set the name of the pipeline if it's the default one. This should be called\n    # if the pipeline is defined within the context manager, and the run is called\n    # outside of it. 
Is here in the following case:\n    # with Pipeline() as pipeline:\n    #    pipeline.run()\n    self._set_pipeline_name()\n\n    # Validate the pipeline DAG to check that all the steps are chainable, there are\n    # no missing runtime parameters, batch sizes are correct, etc.\n    self.dag.validate()\n\n    self._set_pipeline_artifacts_path_in_steps()\n\n    # Set the initial load status for all the steps\n    self._init_steps_load_status()\n\n    # Load the stages status or initialize it\n    self._load_stages_status(use_cache)\n\n    # Load the `_BatchManager` from cache or create one from scratch\n    self._load_batch_manager(use_cache)\n\n    # Check pipeline requirements are installed\n    self._check_requirements()\n\n    # Setup the filesystem that will be used to pass the data of the `_Batch`es\n    self._setup_fsspec(storage_parameters)\n    self._use_fs_to_pass_data = use_fs_to_pass_data\n\n    if self._dry_run:\n        self._logger.info(\"\ud83c\udf35 Dry run mode\")\n\n    # If the batch manager is not able to generate batches, that means that the loaded\n    # `_BatchManager` from cache didn't have any remaining batches to process i.e.\n    # the previous pipeline execution was completed successfully.\n    if not self._batch_manager.can_generate():  # type: ignore\n        self._logger.info(\n            \"\ud83d\udcbe Loaded batch manager from cache doesn't contain any remaining data.\"\n            \" Returning `Distiset` from cache data...\"\n        )\n        distiset = create_distiset(\n            data_dir=self._cache_location[\"data\"],\n            pipeline_path=self._cache_location[\"pipeline\"],\n            log_filename_path=self._cache_location[\"log_file\"],\n            enable_metadata=self._enable_metadata,\n            dag=self.dag,\n        )\n        stop_logging()\n        return distiset\n\n    self._setup_write_buffer(use_cache)\n\n    self._print_load_stages_info()\n
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.dry_run","title":"dry_run(parameters=None, batch_size=1, dataset=None)","text":"

Do a dry run to test the pipeline runs as expected.

Running a Pipeline in dry run mode will set all the batch_size of generator steps to the specified batch_size, and run just with a single batch, effectively running the whole pipeline with a single example. The cache will be set to False.

Parameters:

Name Type Description Default parameters Optional[Dict[str, Dict[str, Any]]]

A dictionary with the step name as the key and a dictionary with the runtime parameters for the step as the value. Defaults to None.

None batch_size int

The batch size of the unique batch generated by the generator steps of the pipeline. Defaults to 1.

1 dataset Optional[InputDataset]

If given, it will be used to create a GeneratorStep and put it as the root step. Convenient method when you have already processed the dataset in your script and just want to pass it already processed. Defaults to None.

None

Returns:

Type Description Distiset

Will return the Distiset as the main run method would do.

Source code in src/distilabel/pipeline/base.py
def dry_run(\n    self,\n    parameters: Optional[Dict[str, Dict[str, Any]]] = None,\n    batch_size: int = 1,\n    dataset: Optional[\"InputDataset\"] = None,\n) -> \"Distiset\":\n    \"\"\"Do a dry run to test the pipeline runs as expected.\n\n    Running a `Pipeline` in dry run mode will set all the `batch_size` of generator steps\n    to the specified `batch_size`, and run just with a single batch, effectively\n    running the whole pipeline with a single example. The cache will be set to `False`.\n\n    Args:\n        parameters: A dictionary with the step name as the key and a dictionary with\n            the runtime parameters for the step as the value. Defaults to `None`.\n        batch_size: The batch size of the unique batch generated by the generators\n            steps of the pipeline. Defaults to `1`.\n        dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n            root step. Convenient method when you have already processed the dataset in\n            your script and just want to pass it already processed. Defaults to `None`.\n\n    Returns:\n        Will return the `Distiset` as the main run method would do.\n    \"\"\"\n    self._dry_run = True\n\n    for step_name in self.dag:\n        step = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n\n        if step.is_generator:\n            if not parameters:\n                parameters = {}\n            parameters[step_name] = {\"batch_size\": batch_size}\n\n    distiset = self.run(parameters=parameters, use_cache=False, dataset=dataset)\n\n    self._dry_run = False\n    return distiset\n
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.get_runtime_parameters_info","title":"get_runtime_parameters_info()","text":"

Get the runtime parameters for the steps in the pipeline.

Returns:

Type Description PipelineRuntimeParametersInfo

A dictionary with the step name as the key and a list of dictionaries with the parameter name and the parameter info as the value.

Source code in src/distilabel/pipeline/base.py
def get_runtime_parameters_info(self) -> \"PipelineRuntimeParametersInfo\":\n    \"\"\"Get the runtime parameters for the steps in the pipeline.\n\n    Returns:\n        A dictionary with the step name as the key and a list of dictionaries with\n        the parameter name and the parameter info as the value.\n    \"\"\"\n    runtime_parameters = {}\n    for step_name in self.dag:\n        step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n        runtime_parameters[step_name] = step.get_runtime_parameters_info()\n    return runtime_parameters\n
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.draw","title":"draw(path='pipeline.png', top_to_bottom=False, show_edge_labels=True)","text":"

Draws the pipeline.

Parameters:

Name Type Description Default path Optional[Union[str, Path]]

The path to save the image to.

'pipeline.png' top_to_bottom bool

Whether to draw the DAG top to bottom. Defaults to False.

False show_edge_labels bool

Whether to show the edge labels. Defaults to True.

True

Returns:

Type Description str

The path to the saved image.

Source code in src/distilabel/pipeline/base.py
def draw(\n    self,\n    path: Optional[Union[str, Path]] = \"pipeline.png\",\n    top_to_bottom: bool = False,\n    show_edge_labels: bool = True,\n) -> str:\n    \"\"\"\n    Draws the pipeline.\n\n    Parameters:\n        path: The path to save the image to.\n        top_to_bottom: Whether to draw the DAG top to bottom. Defaults to `False`.\n        show_edge_labels: Whether to show the edge labels. Defaults to `True`.\n\n    Returns:\n        The path to the saved image.\n    \"\"\"\n    png = self.dag.draw(\n        top_to_bottom=top_to_bottom, show_edge_labels=show_edge_labels\n    )\n    with open(path, \"wb\") as f:\n        f.write(png)\n    return path\n
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.__repr__","title":"__repr__()","text":"

If running in a Jupyter notebook, display an image representing this Pipeline.

Source code in src/distilabel/pipeline/base.py
def __repr__(self) -> str:\n    \"\"\"\n    If running in a Jupyter notebook, display an image representing this `Pipeline`.\n    \"\"\"\n    if in_notebook():\n        try:\n            from IPython.display import Image, display\n\n            image_data = self.dag.draw()\n\n            display(Image(image_data))\n        except Exception:\n            pass\n    return super().__repr__()\n
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.from_dict","title":"from_dict(data) classmethod","text":"

Create a Pipeline from a dict containing the serialized data.

Note

It's intended for internal use.

Parameters:

Name Type Description Default data Dict[str, Any]

Dictionary containing the serialized data from a Pipeline.

required

Returns:

Name Type Description BasePipeline Self

Pipeline recreated from the dictionary info.

Source code in src/distilabel/pipeline/base.py
@classmethod\ndef from_dict(cls, data: Dict[str, Any]) -> Self:\n    \"\"\"Create a Pipeline from a dict containing the serialized data.\n\n    Note:\n        It's intended for internal use.\n\n    Args:\n        data (Dict[str, Any]): Dictionary containing the serialized data from a Pipeline.\n\n    Returns:\n        BasePipeline: Pipeline recreated from the dictionary info.\n    \"\"\"\n    name = data[\"pipeline\"][\"name\"]\n    description = data[\"pipeline\"].get(\"description\")\n    requirements = data.get(\"requirements\", [])\n    with cls(name=name, description=description, requirements=requirements) as pipe:\n        pipe.dag = DAG.from_dict(data[\"pipeline\"])\n    return pipe\n
"},{"location":"api/pipeline/#distilabel.pipeline.local","title":"local","text":""},{"location":"api/pipeline/#distilabel.pipeline.local.Pipeline","title":"Pipeline","text":"

Bases: BasePipeline

Local pipeline implementation using multiprocessing.

Source code in src/distilabel/pipeline/local.py
class Pipeline(BasePipeline):\n    \"\"\"Local pipeline implementation using `multiprocessing`.\"\"\"\n\n    def ray(\n        self,\n        ray_head_node_url: Optional[str] = None,\n        ray_init_kwargs: Optional[Dict[str, Any]] = None,\n    ) -> RayPipeline:\n        \"\"\"Creates a `RayPipeline` using the init parameters of this pipeline. This is a\n        convenient method that can be used to \"transform\" one common `Pipeline` to a `RayPipeline`\n        and it's mainly used by the CLI.\n\n        Args:\n            ray_head_node_url: The URL that can be used to connect to the head node of\n                the Ray cluster. Normally, you won't want to use this argument as the\n                recommended way to submit a job to a Ray cluster is using the [Ray Jobs\n                CLI](https://docs.ray.io/en/latest/cluster/running-applications/job-submission/index.html#ray-jobs-overview).\n                Defaults to `None`.\n            ray_init_kwargs: kwargs that will be passed to the `ray.init` method. 
Defaults\n                to `None`.\n\n        Returns:\n            A `RayPipeline` instance.\n        \"\"\"\n        pipeline = RayPipeline(\n            name=self.name,\n            description=self.description,\n            cache_dir=self._cache_dir,\n            enable_metadata=self._enable_metadata,\n            requirements=self.requirements,\n            ray_head_node_url=ray_head_node_url,\n            ray_init_kwargs=ray_init_kwargs,\n        )\n        pipeline.dag = self.dag\n        return pipeline\n\n    def run(\n        self,\n        parameters: Optional[Dict[Any, Dict[str, Any]]] = None,\n        use_cache: bool = True,\n        storage_parameters: Optional[Dict[str, Any]] = None,\n        use_fs_to_pass_data: bool = False,\n        dataset: Optional[\"InputDataset\"] = None,\n        dataset_batch_size: int = 50,\n        logging_handlers: Optional[List[\"logging.Handler\"]] = None,\n    ) -> \"Distiset\":\n        \"\"\"Runs the pipeline.\n\n        Args:\n            parameters: A dictionary with the step name as the key and a dictionary with\n                the runtime parameters for the step as the value. Defaults to `None`.\n            use_cache: Whether to use the cache from previous pipeline runs. Defaults to\n                `True`.\n            storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n                that will be used to store the data of the `_Batch`es passed between the\n                steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n                `GlobalStep` it will be always used). It must have at least the \"path\" key,\n                and it can contain additional keys depending on the protocol. 
By default,\n                it will use the local file system and a directory in the cache directory.\n                Defaults to `None`.\n            use_fs_to_pass_data: Whether to use the file system to pass the data of\n                the `_Batch`es between the steps. Even if this parameter is `False`, the\n                `Batch`es received by `GlobalStep`s will always use the file system to\n                pass the data. Defaults to `False`.\n            dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n                root step. Convenient method when you have already processed the dataset in\n                your script and just want to pass it already processed. Defaults to `None`.\n            dataset_batch_size: if `dataset` is given, this will be the size of the batches\n                yield by the `GeneratorStep` created using the `dataset`. Defaults to `50`.\n            logging_handlers: A list of logging handlers that will be used to log the\n                output of the pipeline. This argument can be useful so the logging messages\n                can be extracted and used in a different context. Defaults to `None`.\n\n        Returns:\n            The `Distiset` created by the pipeline.\n\n        Raises:\n            RuntimeError: If the pipeline fails to load all the steps.\n        \"\"\"\n        if script_executed_in_ray_cluster():\n            print(\"Script running in Ray cluster... 
Using `RayPipeline`...\")\n            return self.ray().run(\n                parameters=parameters,\n                use_cache=use_cache,\n                storage_parameters=storage_parameters,\n                use_fs_to_pass_data=use_fs_to_pass_data,\n                dataset=dataset,\n                dataset_batch_size=dataset_batch_size,\n            )\n\n        self._log_queue = cast(\"Queue[Any]\", mp.Queue())\n\n        if distiset := super().run(\n            parameters=parameters,\n            use_cache=use_cache,\n            storage_parameters=storage_parameters,\n            use_fs_to_pass_data=use_fs_to_pass_data,\n            dataset=dataset,\n            dataset_batch_size=dataset_batch_size,\n            logging_handlers=logging_handlers,\n        ):\n            return distiset\n\n        num_processes = self.dag.get_total_replica_count()\n        with (\n            mp.Manager() as manager,\n            _NoDaemonPool(\n                num_processes,\n                initializer=_init_worker,\n                initargs=(\n                    self._log_queue,\n                    self.name,\n                    self.signature,\n                ),\n            ) as pool,\n        ):\n            self._manager = manager\n            self._pool = pool\n            self._output_queue = self.QueueClass()\n            self._load_queue = self.QueueClass()\n            self._handle_keyboard_interrupt()\n\n            # Run the loop for receiving the load status of each step\n            self._load_steps_thread = self._run_load_queue_loop_in_thread()\n\n            # Start a loop to receive the output batches from the steps\n            self._output_queue_thread = self._run_output_queue_loop_in_thread()\n            self._output_queue_thread.join()\n\n            self._teardown()\n\n            if self._exception:\n                raise self._exception\n\n        distiset = create_distiset(\n            self._cache_location[\"data\"],\n            
pipeline_path=self._cache_location[\"pipeline\"],\n            log_filename_path=self._cache_location[\"log_file\"],\n            enable_metadata=self._enable_metadata,\n            dag=self.dag,\n        )\n\n        stop_logging()\n\n        return distiset\n\n    @property\n    def QueueClass(self) -> Callable:\n        \"\"\"The callable used to create the input and output queues.\n\n        Returns:\n            The callable to create a `Queue`.\n        \"\"\"\n        assert self._manager, \"Manager is not initialized\"\n        return self._manager.Queue\n\n    def _run_step(self, step: \"_Step\", input_queue: \"Queue[Any]\", replica: int) -> None:\n        \"\"\"Runs the `Step` wrapped in a `_ProcessWrapper` in a separate process of the\n        `Pool`.\n\n        Args:\n            step: The step to run.\n            input_queue: The input queue to send the data to the step.\n            replica: The replica ID assigned.\n        \"\"\"\n        assert self._pool, \"Pool is not initialized\"\n\n        step_wrapper = _StepWrapper(\n            step=step,  # type: ignore\n            replica=replica,\n            input_queue=input_queue,\n            output_queue=self._output_queue,\n            load_queue=self._load_queue,\n            dry_run=self._dry_run,\n            ray_pipeline=False,\n        )\n\n        self._pool.apply_async(step_wrapper.run, error_callback=self._error_callback)\n\n    def _error_callback(self, e: BaseException) -> None:\n        \"\"\"Error callback that will be called when an error occurs in a `Step` process.\n\n        Args:\n            e: The exception raised by the process.\n        \"\"\"\n        global _SUBPROCESS_EXCEPTION\n\n        # First we check that the exception is a `_StepWrapperException`, otherwise, we\n        # print it out and stop the pipeline, since some errors may be unhandled\n        if not isinstance(e, _StepWrapperException):\n            self._logger.error(f\"\u274c Failed with an unhandled 
exception: {e}\")\n            self._stop()\n            return\n\n        if e.is_load_error:\n            self._logger.error(f\"\u274c Failed to load step '{e.step.name}': {e.message}\")\n            _SUBPROCESS_EXCEPTION = e.subprocess_exception\n            _SUBPROCESS_EXCEPTION.__traceback__ = tblib.Traceback.from_string(  # type: ignore\n                e.formatted_traceback\n            ).as_traceback()\n            return\n\n        # If the step is global, is not in the last trophic level and has no successors,\n        # then we can ignore the error and continue executing the pipeline\n        step_name: str = e.step.name  # type: ignore\n        if (\n            e.step.is_global\n            and not self.dag.step_in_last_trophic_level(step_name)\n            and list(self.dag.get_step_successors(step_name)) == []\n        ):\n            self._logger.error(\n                f\"\u270b An error occurred when running global step '{step_name}' with no\"\n                \" successors and not in the last trophic level. Pipeline execution can\"\n                f\" continue. Error will be ignored.\"\n            )\n            self._logger.error(f\"Subprocess traceback:\\n\\n{e.formatted_traceback}\")\n            return\n\n        # Handle tasks using an `LLM` using offline batch generation\n        if isinstance(\n            e.subprocess_exception, DistilabelOfflineBatchGenerationNotFinishedException\n        ):\n            self._logger.info(\n                f\"\u23f9\ufe0f '{e.step.name}' task stopped pipeline execution: LLM offline batch\"\n                \" generation in progress. 
Rerun pipeline with cache to check results and\"\n                \" continue execution.\"\n            )\n            self._set_step_for_recovering_offline_batch_generation(e.step, e.data)  # type: ignore\n            with self._stop_called_lock:\n                if not self._stop_called:\n                    self._stop(acquire_lock=False)\n            return\n\n        # Global step with successors failed\n        self._logger.error(f\"An error occurred in global step '{step_name}'\")\n        self._logger.error(f\"Subprocess traceback:\\n\\n{e.formatted_traceback}\")\n\n        self._stop()\n\n    def _teardown(self) -> None:\n        \"\"\"Clean/release/stop resources reserved to run the pipeline.\"\"\"\n        if self._write_buffer:\n            self._write_buffer.close()\n\n        if self._batch_manager:\n            self._batch_manager = None\n\n        self._stop_load_queue_loop()\n        self._load_steps_thread.join()\n\n        if self._pool:\n            self._pool.terminate()\n            self._pool.join()\n\n        if self._manager:\n            self._manager.shutdown()\n            self._manager.join()\n\n    def _set_steps_not_loaded_exception(self) -> None:\n        \"\"\"Raises a `RuntimeError` notifying that the steps load has failed.\n\n        Raises:\n            RuntimeError: containing the information and why a step failed to be loaded.\n        \"\"\"\n        self._exception = RuntimeError(\n            \"Failed to load all the steps. Could not run pipeline.\"\n        )\n        self._exception.__cause__ = _SUBPROCESS_EXCEPTION\n\n    def _stop(self, acquire_lock: bool = True) -> None:\n        \"\"\"Stops the pipeline execution. It will first send `None` to the input queues\n        of all the steps and then wait until the output queue is empty i.e. all the steps\n        finished processing the batches that were sent before the stop flag. 
Then it will\n        send `None` to the output queue to notify the pipeline to stop.\n\n        Args:\n            acquire_lock: Whether to acquire the lock to access the `_stop_called` attribute.\n        \"\"\"\n\n        if acquire_lock:\n            self._stop_called_lock.acquire()\n\n        if self._stop_called:\n            self._stop_calls += 1\n            if self._stop_calls == 1:\n                self._logger.warning(\"\ud83d\uded1 Press again to force the pipeline to stop.\")\n            elif self._stop_calls > 1:\n                self._logger.warning(\"\ud83d\uded1 Forcing pipeline interruption.\")\n\n                if self._pool:\n                    self._pool.terminate()\n                    self._pool.join()\n                    self._pool = None\n\n                if self._manager:\n                    self._manager.shutdown()\n                    self._manager.join()\n                    self._manager = None\n\n                stop_logging()\n\n                sys.exit(1)\n\n            return\n        self._stop_called = True\n\n        if acquire_lock:\n            self._stop_called_lock.release()\n\n        self._logger.debug(\n            f\"Steps loaded before calling `stop`: {self._steps_load_status}\"\n        )\n        self._logger.info(\n            \"\ud83d\uded1 Stopping pipeline. Waiting for steps to finish processing batches...\"\n        )\n\n        self._stop_output_queue_loop()\n
"},{"location":"api/pipeline/#distilabel.pipeline.local.Pipeline.QueueClass","title":"QueueClass: Callable property","text":"

The callable used to create the input and output queues.

Returns:

Type Description Callable

The callable to create a Queue.

"},{"location":"api/pipeline/#distilabel.pipeline.local.Pipeline.ray","title":"ray(ray_head_node_url=None, ray_init_kwargs=None)","text":"

Creates a RayPipeline using the init parameters of this pipeline. This is a convenience method that can be used to \"transform\" a regular Pipeline into a RayPipeline, and it's mainly used by the CLI.

Parameters:

Name Type Description Default ray_head_node_url Optional[str]

The URL that can be used to connect to the head node of the Ray cluster. Normally, you won't want to use this argument as the recommended way to submit a job to a Ray cluster is using the Ray Jobs CLI. Defaults to None.

None ray_init_kwargs Optional[Dict[str, Any]]

kwargs that will be passed to the ray.init method. Defaults to None.

None

Returns:

Type Description RayPipeline

A RayPipeline instance.

Source code in src/distilabel/pipeline/local.py
def ray(\n    self,\n    ray_head_node_url: Optional[str] = None,\n    ray_init_kwargs: Optional[Dict[str, Any]] = None,\n) -> RayPipeline:\n    \"\"\"Creates a `RayPipeline` using the init parameters of this pipeline. This is a\n    convenient method that can be used to \"transform\" one common `Pipeline` to a `RayPipeline`\n    and it's mainly used by the CLI.\n\n    Args:\n        ray_head_node_url: The URL that can be used to connect to the head node of\n            the Ray cluster. Normally, you won't want to use this argument as the\n            recommended way to submit a job to a Ray cluster is using the [Ray Jobs\n            CLI](https://docs.ray.io/en/latest/cluster/running-applications/job-submission/index.html#ray-jobs-overview).\n            Defaults to `None`.\n        ray_init_kwargs: kwargs that will be passed to the `ray.init` method. Defaults\n            to `None`.\n\n    Returns:\n        A `RayPipeline` instance.\n    \"\"\"\n    pipeline = RayPipeline(\n        name=self.name,\n        description=self.description,\n        cache_dir=self._cache_dir,\n        enable_metadata=self._enable_metadata,\n        requirements=self.requirements,\n        ray_head_node_url=ray_head_node_url,\n        ray_init_kwargs=ray_init_kwargs,\n    )\n    pipeline.dag = self.dag\n    return pipeline\n
"},{"location":"api/pipeline/#distilabel.pipeline.local.Pipeline.run","title":"run(parameters=None, use_cache=True, storage_parameters=None, use_fs_to_pass_data=False, dataset=None, dataset_batch_size=50, logging_handlers=None)","text":"

Runs the pipeline.

Parameters:

Name Type Description Default parameters Optional[Dict[Any, Dict[str, Any]]]

A dictionary with the step name as the key and a dictionary with the runtime parameters for the step as the value. Defaults to None.

None use_cache bool

Whether to use the cache from previous pipeline runs. Defaults to True.

True storage_parameters Optional[Dict[str, Any]]

A dictionary with the storage parameters (fsspec and path) that will be used to store the data of the _Batches passed between the steps if use_fs_to_pass_data is True (for the batches received by a GlobalStep it will always be used). It must have at least the \"path\" key, and it can contain additional keys depending on the protocol. By default, it will use the local file system and a directory in the cache directory. Defaults to None.

None use_fs_to_pass_data bool

Whether to use the file system to pass the data of the _Batches between the steps. Even if this parameter is False, the _Batches received by GlobalSteps will always use the file system to pass the data. Defaults to False.

False dataset Optional[InputDataset]

If given, it will be used to create a GeneratorStep and put it as the root step. This is convenient when you have already processed the dataset in your script and just want to pass it in as is. Defaults to None.

None dataset_batch_size int

If dataset is given, this will be the size of the batches yielded by the GeneratorStep created using the dataset. Defaults to 50.

50 logging_handlers Optional[List[Handler]]

A list of logging handlers that will be used to log the output of the pipeline. This argument can be useful so the logging messages can be extracted and used in a different context. Defaults to None.

None

Returns:

Type Description Distiset

The Distiset created by the pipeline.

Raises:

Type Description RuntimeError

If the pipeline fails to load all the steps.

Source code in src/distilabel/pipeline/local.py
def run(\n    self,\n    parameters: Optional[Dict[Any, Dict[str, Any]]] = None,\n    use_cache: bool = True,\n    storage_parameters: Optional[Dict[str, Any]] = None,\n    use_fs_to_pass_data: bool = False,\n    dataset: Optional[\"InputDataset\"] = None,\n    dataset_batch_size: int = 50,\n    logging_handlers: Optional[List[\"logging.Handler\"]] = None,\n) -> \"Distiset\":\n    \"\"\"Runs the pipeline.\n\n    Args:\n        parameters: A dictionary with the step name as the key and a dictionary with\n            the runtime parameters for the step as the value. Defaults to `None`.\n        use_cache: Whether to use the cache from previous pipeline runs. Defaults to\n            `True`.\n        storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n            that will be used to store the data of the `_Batch`es passed between the\n            steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n            `GlobalStep` it will be always used). It must have at least the \"path\" key,\n            and it can contain additional keys depending on the protocol. By default,\n            it will use the local file system and a directory in the cache directory.\n            Defaults to `None`.\n        use_fs_to_pass_data: Whether to use the file system to pass the data of\n            the `_Batch`es between the steps. Even if this parameter is `False`, the\n            `Batch`es received by `GlobalStep`s will always use the file system to\n            pass the data. Defaults to `False`.\n        dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n            root step. Convenient method when you have already processed the dataset in\n            your script and just want to pass it already processed. Defaults to `None`.\n        dataset_batch_size: if `dataset` is given, this will be the size of the batches\n            yield by the `GeneratorStep` created using the `dataset`. 
Defaults to `50`.\n        logging_handlers: A list of logging handlers that will be used to log the\n            output of the pipeline. This argument can be useful so the logging messages\n            can be extracted and used in a different context. Defaults to `None`.\n\n    Returns:\n        The `Distiset` created by the pipeline.\n\n    Raises:\n        RuntimeError: If the pipeline fails to load all the steps.\n    \"\"\"\n    if script_executed_in_ray_cluster():\n        print(\"Script running in Ray cluster... Using `RayPipeline`...\")\n        return self.ray().run(\n            parameters=parameters,\n            use_cache=use_cache,\n            storage_parameters=storage_parameters,\n            use_fs_to_pass_data=use_fs_to_pass_data,\n            dataset=dataset,\n            dataset_batch_size=dataset_batch_size,\n        )\n\n    self._log_queue = cast(\"Queue[Any]\", mp.Queue())\n\n    if distiset := super().run(\n        parameters=parameters,\n        use_cache=use_cache,\n        storage_parameters=storage_parameters,\n        use_fs_to_pass_data=use_fs_to_pass_data,\n        dataset=dataset,\n        dataset_batch_size=dataset_batch_size,\n        logging_handlers=logging_handlers,\n    ):\n        return distiset\n\n    num_processes = self.dag.get_total_replica_count()\n    with (\n        mp.Manager() as manager,\n        _NoDaemonPool(\n            num_processes,\n            initializer=_init_worker,\n            initargs=(\n                self._log_queue,\n                self.name,\n                self.signature,\n            ),\n        ) as pool,\n    ):\n        self._manager = manager\n        self._pool = pool\n        self._output_queue = self.QueueClass()\n        self._load_queue = self.QueueClass()\n        self._handle_keyboard_interrupt()\n\n        # Run the loop for receiving the load status of each step\n        self._load_steps_thread = self._run_load_queue_loop_in_thread()\n\n        # Start a loop to receive the 
output batches from the steps\n        self._output_queue_thread = self._run_output_queue_loop_in_thread()\n        self._output_queue_thread.join()\n\n        self._teardown()\n\n        if self._exception:\n            raise self._exception\n\n    distiset = create_distiset(\n        self._cache_location[\"data\"],\n        pipeline_path=self._cache_location[\"pipeline\"],\n        log_filename_path=self._cache_location[\"log_file\"],\n        enable_metadata=self._enable_metadata,\n        dag=self.dag,\n    )\n\n    stop_logging()\n\n    return distiset\n
"},{"location":"api/pipeline/routing_batch_function/","title":"Routing batch function","text":""},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function","title":"routing_batch_function","text":""},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunc","title":"RoutingBatchFunc = Callable[[List[str]], List[str]] module-attribute","text":"

Type alias for a routing batch function. It takes a list of all the downstream steps and returns a list with the names of the steps that should receive the batch.

"},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction","title":"RoutingBatchFunction","text":"

Bases: BaseModel, _Serializable

A thin wrapper around a routing batch function that can be used to route batches from one upstream step to specific downstream steps.

Attributes:

Name Type Description routing_function RoutingBatchFunc

The routing function that takes a list of all the downstream steps and returns a list with the names of the steps that should receive the batch.

_step Union[_Step, None]

The upstream step that is connected to the routing batch function.

_routed_batch_registry Dict[str, Dict[int, List[str]]]

A dictionary that keeps track of the batches that have been routed to specific downstream steps.

Source code in src/distilabel/pipeline/routing_batch_function.py
class RoutingBatchFunction(BaseModel, _Serializable):\n    \"\"\"A thin wrapper around a routing batch function that can be used to route batches\n    from one upstream step to specific downstream steps.\n\n    Attributes:\n        routing_function: The routing function that takes a list of all the downstream steps\n            and returns a list with the names of the steps that should receive the batch.\n        _step: The upstream step that is connected to the routing batch function.\n        _routed_batch_registry: A dictionary that keeps track of the batches that have been\n            routed to specific downstream steps.\n    \"\"\"\n\n    routing_function: RoutingBatchFunc\n    description: Optional[str] = None\n\n    _step: Union[\"_Step\", None] = PrivateAttr(default=None)\n    _routed_batch_registry: Dict[str, Dict[int, List[str]]] = PrivateAttr(\n        default_factory=dict\n    )\n    _factory_function_module: Union[str, None] = PrivateAttr(default=None)\n    _factory_function_name: Union[str, None] = PrivateAttr(default=None)\n    _factory_function_kwargs: Union[Dict[str, Any], None] = PrivateAttr(default=None)\n\n    def route_batch(self, batch: \"_Batch\", steps: List[str]) -> List[str]:\n        \"\"\"Returns a list of selected downstream steps from `steps` to which the `batch`\n        should be routed.\n\n        Args:\n            batch: The batch that should be routed.\n            steps: A list of all the downstream steps that can receive the batch.\n\n        Returns:\n            A list with the names of the steps that should receive the batch.\n        \"\"\"\n        routed_steps = self.routing_function(steps)\n        self._register_routed_batch(batch, routed_steps)\n        return routed_steps\n\n    def set_factory_function(\n        self,\n        factory_function_module: str,\n        factory_function_name: str,\n        factory_function_kwargs: Dict[str, Any],\n    ) -> None:\n        \"\"\"Sets the factory function that was used to 
create the `routing_batch_function`.\n\n        Args:\n            factory_function_module: The module name where the factory function is defined.\n            factory_function_name: The name of the factory function that was used to create\n                the `routing_batch_function`.\n            factory_function_kwargs: The keyword arguments that were used when calling the\n                factory function.\n        \"\"\"\n        self._factory_function_module = factory_function_module\n        self._factory_function_name = factory_function_name\n        self._factory_function_kwargs = factory_function_kwargs\n\n    def __call__(self, batch: \"_Batch\", steps: List[str]) -> List[str]:\n        \"\"\"Returns a list of selected downstream steps from `steps` to which the `batch`\n        should be routed.\n\n        Args:\n            batch: The batch that should be routed.\n            steps: A list of all the downstream steps that can receive the batch.\n\n        Returns:\n            A list with the names of the steps that should receive the batch.\n        \"\"\"\n        return self.route_batch(batch, steps)\n\n    def _register_routed_batch(self, batch: \"_Batch\", routed_steps: List[str]) -> None:\n        \"\"\"Registers a batch that has been routed to specific downstream steps.\n\n        Args:\n            batch: The batch that has been routed.\n            routed_steps: The list of downstream steps that have been selected to receive\n                the batch.\n        \"\"\"\n        upstream_step = batch.step_name\n        batch_seq_no = batch.seq_no\n        self._routed_batch_registry.setdefault(upstream_step, {}).setdefault(\n            batch_seq_no, routed_steps\n        )\n\n    def __rshift__(\n        self, other: List[\"DownstreamConnectableSteps\"]\n    ) -> List[\"DownstreamConnectableSteps\"]:\n        \"\"\"Connects a list of dowstream steps to the upstream step of the routing batch\n        function.\n\n        Args:\n            other: 
A list of downstream steps that should be connected to the upstream step\n                of the routing batch function.\n\n        Returns:\n            The list of downstream steps that have been connected to the upstream step of the\n            routing batch function.\n        \"\"\"\n        if not isinstance(other, list):\n            raise DistilabelUserError(\n                f\"Can only set a `routing_batch_function` for a list of steps. Got: {other}.\"\n                \" Please, review the right-hand side of the `routing_batch_function >> other`\"\n                \" expression. It should be\"\n                \" `upstream_step >> routing_batch_function >> [downstream_step_1, dowstream_step_2, ...]`.\",\n                page=\"sections/how_to_guides/basic/pipeline/?h=routing#routing-batches-to-specific-downstream-steps\",\n            )\n\n        if not self._step:\n            raise DistilabelUserError(\n                \"Routing batch function doesn't have an upstream step. Cannot connect downstream\"\n                \" steps before connecting the upstream step. Connect this routing batch\"\n                \" function to an upstream step using the `>>` operator. 
For example:\"\n                \" `upstream_step >> routing_batch_function >> [downstream_step_1, downstream_step_2, ...]`.\",\n                page=\"sections/how_to_guides/basic/pipeline/?h=routing#routing-batches-to-specific-downstream-steps\",\n            )\n\n        for step in other:\n            self._step.connect(step)\n        return other\n\n    def dump(self, **kwargs: Any) -> Dict[str, Any]:\n        \"\"\"Dumps the routing batch function to a dictionary, and the information of the\n        factory function used to create this routing batch function.\n\n        Args:\n            **kwargs: Additional keyword arguments that should be included in the dump.\n\n        Returns:\n            A dictionary with the routing batch function information and the factory function\n            information.\n        \"\"\"\n        dump_info: Dict[str, Any] = {\"step\": self._step.name}  # type: ignore\n\n        if self.description:\n            dump_info[\"description\"] = self.description\n\n        if type_info := self._get_type_info():\n            dump_info[TYPE_INFO_KEY] = type_info\n\n        return dump_info\n\n    def _get_type_info(self) -> Dict[str, Any]:\n        \"\"\"Returns the information of the factory function used to create the routing batch\n        function.\n\n        Returns:\n            A dictionary with the factory function information.\n        \"\"\"\n\n        type_info = {}\n\n        if self._factory_function_module:\n            type_info[\"module\"] = self._factory_function_module\n\n        if self._factory_function_name:\n            type_info[\"name\"] = self._factory_function_name\n\n        if self._factory_function_kwargs:\n            type_info[\"kwargs\"] = self._factory_function_kwargs\n\n        return type_info\n\n    @classmethod\n    def from_dict(cls, data: Dict[str, Any]) -> Self:\n        \"\"\"Loads a routing batch function from a dictionary. 
It must contain the information\n        of the factory function used to create the routing batch function.\n\n        Args:\n            data: A dictionary with the routing batch function information and the factory\n                function information.\n        \"\"\"\n        type_info = data.get(TYPE_INFO_KEY)\n        if not type_info:\n            step = data.get(\"step\")\n            raise ValueError(\n                f\"The routing batch function for step '{step}' was created without a factory\"\n                \" function, and it cannot be reconstructed.\"\n            )\n\n        module = type_info.get(\"module\")\n        name = type_info.get(\"name\")\n        kwargs = type_info.get(\"kwargs\")\n\n        if not module or not name or not kwargs:\n            raise ValueError(\n                \"The routing batch function was created with a factory function, but the\"\n                \" information is incomplete. Cannot reconstruct the routing batch function.\"\n            )\n\n        routing_batch_function = _get_module_attr(module=module, name=name)(**kwargs)\n        routing_batch_function.description = data.get(\"description\")\n        routing_batch_function.set_factory_function(\n            factory_function_module=module,\n            factory_function_name=name,\n            factory_function_kwargs=kwargs,\n        )\n\n        return routing_batch_function\n
"},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.route_batch","title":"route_batch(batch, steps)","text":"

Returns a list of selected downstream steps from steps to which the batch should be routed.

Parameters:

Name Type Description Default batch _Batch

The batch that should be routed.

required steps List[str]

A list of all the downstream steps that can receive the batch.

required

Returns:

Type Description List[str]

A list with the names of the steps that should receive the batch.

Source code in src/distilabel/pipeline/routing_batch_function.py
def route_batch(self, batch: \"_Batch\", steps: List[str]) -> List[str]:\n    \"\"\"Returns a list of selected downstream steps from `steps` to which the `batch`\n    should be routed.\n\n    Args:\n        batch: The batch that should be routed.\n        steps: A list of all the downstream steps that can receive the batch.\n\n    Returns:\n        A list with the names of the steps that should receive the batch.\n    \"\"\"\n    routed_steps = self.routing_function(steps)\n    self._register_routed_batch(batch, routed_steps)\n    return routed_steps\n
"},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.set_factory_function","title":"set_factory_function(factory_function_module, factory_function_name, factory_function_kwargs)","text":"

Sets the factory function that was used to create the routing_batch_function.

Parameters:

Name Type Description Default factory_function_module str

The module name where the factory function is defined.

required factory_function_name str

The name of the factory function that was used to create the routing_batch_function.

required factory_function_kwargs Dict[str, Any]

The keyword arguments that were used when calling the factory function.

required Source code in src/distilabel/pipeline/routing_batch_function.py
def set_factory_function(\n    self,\n    factory_function_module: str,\n    factory_function_name: str,\n    factory_function_kwargs: Dict[str, Any],\n) -> None:\n    \"\"\"Sets the factory function that was used to create the `routing_batch_function`.\n\n    Args:\n        factory_function_module: The module name where the factory function is defined.\n        factory_function_name: The name of the factory function that was used to create\n            the `routing_batch_function`.\n        factory_function_kwargs: The keyword arguments that were used when calling the\n            factory function.\n    \"\"\"\n    self._factory_function_module = factory_function_module\n    self._factory_function_name = factory_function_name\n    self._factory_function_kwargs = factory_function_kwargs\n
"},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.__call__","title":"__call__(batch, steps)","text":"

Returns a list of selected downstream steps from steps to which the batch should be routed.

Parameters:

Name Type Description Default batch _Batch

The batch that should be routed.

required steps List[str]

A list of all the downstream steps that can receive the batch.

required

Returns:

Type Description List[str]

A list with the names of the steps that should receive the batch.

Source code in src/distilabel/pipeline/routing_batch_function.py
def __call__(self, batch: \"_Batch\", steps: List[str]) -> List[str]:\n    \"\"\"Returns a list of selected downstream steps from `steps` to which the `batch`\n    should be routed.\n\n    Args:\n        batch: The batch that should be routed.\n        steps: A list of all the downstream steps that can receive the batch.\n\n    Returns:\n        A list with the names of the steps that should receive the batch.\n    \"\"\"\n    return self.route_batch(batch, steps)\n
"},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.__rshift__","title":"__rshift__(other)","text":"

Connects a list of downstream steps to the upstream step of the routing batch function.

Parameters:

Name Type Description Default other List[DownstreamConnectableSteps]

A list of downstream steps that should be connected to the upstream step of the routing batch function.

required

Returns:

Type Description List[DownstreamConnectableSteps]

The list of downstream steps that have been connected to the upstream step of the

List[DownstreamConnectableSteps]

routing batch function.

Source code in src/distilabel/pipeline/routing_batch_function.py
def __rshift__(\n    self, other: List[\"DownstreamConnectableSteps\"]\n) -> List[\"DownstreamConnectableSteps\"]:\n    \"\"\"Connects a list of dowstream steps to the upstream step of the routing batch\n    function.\n\n    Args:\n        other: A list of downstream steps that should be connected to the upstream step\n            of the routing batch function.\n\n    Returns:\n        The list of downstream steps that have been connected to the upstream step of the\n        routing batch function.\n    \"\"\"\n    if not isinstance(other, list):\n        raise DistilabelUserError(\n            f\"Can only set a `routing_batch_function` for a list of steps. Got: {other}.\"\n            \" Please, review the right-hand side of the `routing_batch_function >> other`\"\n            \" expression. It should be\"\n            \" `upstream_step >> routing_batch_function >> [downstream_step_1, dowstream_step_2, ...]`.\",\n            page=\"sections/how_to_guides/basic/pipeline/?h=routing#routing-batches-to-specific-downstream-steps\",\n        )\n\n    if not self._step:\n        raise DistilabelUserError(\n            \"Routing batch function doesn't have an upstream step. Cannot connect downstream\"\n            \" steps before connecting the upstream step. Connect this routing batch\"\n            \" function to an upstream step using the `>>` operator. For example:\"\n            \" `upstream_step >> routing_batch_function >> [downstream_step_1, downstream_step_2, ...]`.\",\n            page=\"sections/how_to_guides/basic/pipeline/?h=routing#routing-batches-to-specific-downstream-steps\",\n        )\n\n    for step in other:\n        self._step.connect(step)\n    return other\n
"},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.dump","title":"dump(**kwargs)","text":"

Dumps the routing batch function to a dictionary, and the information of the factory function used to create this routing batch function.

Parameters:

Name Type Description Default **kwargs Any

Additional keyword arguments that should be included in the dump.

{}

Returns:

Type Description Dict[str, Any]

A dictionary with the routing batch function information and the factory function

Dict[str, Any]

information.

Source code in src/distilabel/pipeline/routing_batch_function.py
def dump(self, **kwargs: Any) -> Dict[str, Any]:\n    \"\"\"Dumps the routing batch function to a dictionary, and the information of the\n    factory function used to create this routing batch function.\n\n    Args:\n        **kwargs: Additional keyword arguments that should be included in the dump.\n\n    Returns:\n        A dictionary with the routing batch function information and the factory function\n        information.\n    \"\"\"\n    dump_info: Dict[str, Any] = {\"step\": self._step.name}  # type: ignore\n\n    if self.description:\n        dump_info[\"description\"] = self.description\n\n    if type_info := self._get_type_info():\n        dump_info[TYPE_INFO_KEY] = type_info\n\n    return dump_info\n
"},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.from_dict","title":"from_dict(data) classmethod","text":"

Loads a routing batch function from a dictionary. It must contain the information of the factory function used to create the routing batch function.

Parameters:

Name Type Description Default data Dict[str, Any]

A dictionary with the routing batch function information and the factory function information.

required Source code in src/distilabel/pipeline/routing_batch_function.py
@classmethod\ndef from_dict(cls, data: Dict[str, Any]) -> Self:\n    \"\"\"Loads a routing batch function from a dictionary. It must contain the information\n    of the factory function used to create the routing batch function.\n\n    Args:\n        data: A dictionary with the routing batch function information and the factory\n            function information.\n    \"\"\"\n    type_info = data.get(TYPE_INFO_KEY)\n    if not type_info:\n        step = data.get(\"step\")\n        raise ValueError(\n            f\"The routing batch function for step '{step}' was created without a factory\"\n            \" function, and it cannot be reconstructed.\"\n        )\n\n    module = type_info.get(\"module\")\n    name = type_info.get(\"name\")\n    kwargs = type_info.get(\"kwargs\")\n\n    if not module or not name or not kwargs:\n        raise ValueError(\n            \"The routing batch function was created with a factory function, but the\"\n            \" information is incomplete. Cannot reconstruct the routing batch function.\"\n        )\n\n    routing_batch_function = _get_module_attr(module=module, name=name)(**kwargs)\n    routing_batch_function.description = data.get(\"description\")\n    routing_batch_function.set_factory_function(\n        factory_function_module=module,\n        factory_function_name=name,\n        factory_function_kwargs=kwargs,\n    )\n\n    return routing_batch_function\n
"},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.routing_batch_function","title":"routing_batch_function(description=None)","text":"

Creates a routing batch function that can be used to route batches from one upstream step to specific downstream steps.

Parameters:

Name Type Description Default description Optional[str]

An optional description for the routing batch function.

None

Returns:

Type Description Callable[[RoutingBatchFunc], RoutingBatchFunction]

A RoutingBatchFunction instance that can be used with the >> operators and with

Callable[[RoutingBatchFunc], RoutingBatchFunction]

the Pipeline.connect method when defining the pipeline.

Example:

from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline, routing_batch_function\nfrom distilabel.steps import LoadDataFromHub, GroupColumns\n\n\n@routing_batch_function\ndef random_routing_batch(steps: List[str]) -> List[str]:\n    return random.sample(steps, 2)\n\n\nwith Pipeline(name=\"routing-batch-function\") as pipeline:\n    load_data = LoadDataFromHub()\n\n    generations = []\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.5-pro\"),\n    ):\n        task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n        generations.append(task)\n\n    combine_columns = GroupColumns(columns=[\"generation\", \"model_name\"])\n\n    load_data >> random_routing_batch >> generations >> combine_columns\n
Source code in src/distilabel/pipeline/routing_batch_function.py
def routing_batch_function(\n    description: Optional[str] = None,\n) -> Callable[[RoutingBatchFunc], RoutingBatchFunction]:\n    \"\"\"Creates a routing batch function that can be used to route batches from one upstream\n    step to specific downstream steps.\n\n    Args:\n        description: An optional description for the routing batch function.\n\n    Returns:\n        A `RoutingBatchFunction` instance that can be used with the `>>` operators and with\n        the `Pipeline.connect` method when defining the pipeline.\n\n    Example:\n\n    ```python\n    from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\n    from distilabel.pipeline import Pipeline, routing_batch_function\n    from distilabel.steps import LoadDataFromHub, GroupColumns\n\n\n    @routing_batch_function\n    def random_routing_batch(steps: List[str]) -> List[str]:\n        return random.sample(steps, 2)\n\n\n    with Pipeline(name=\"routing-batch-function\") as pipeline:\n        load_data = LoadDataFromHub()\n\n        generations = []\n        for llm in (\n            OpenAILLM(model=\"gpt-4-0125-preview\"),\n            MistralLLM(model=\"mistral-large-2402\"),\n            VertexAILLM(model=\"gemini-1.5-pro\"),\n        ):\n            task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n            generations.append(task)\n\n        combine_columns = GroupColumns(columns=[\"generation\", \"model_name\"])\n\n        load_data >> random_routing_batch >> generations >> combine_columns\n    ```\n    \"\"\"\n\n    def decorator(func: RoutingBatchFunc) -> RoutingBatchFunction:\n        factory_function_name, factory_function_module, factory_function_kwargs = (\n            None,\n            None,\n            None,\n        )\n\n        # Check if `routing_batch_function` was created using a factory function from an installed package\n        stack = inspect.stack()\n        if len(stack) > 2:\n            factory_function_frame_info = stack[1]\n\n    
        # Function factory path\n            if factory_function_frame_info.function != \"<module>\":\n                factory_function_name = factory_function_frame_info.function\n                factory_function_module = inspect.getmodule(\n                    factory_function_frame_info.frame\n                ).__name__  # type: ignore\n\n                # Function factory kwargs\n                factory_function_kwargs = factory_function_frame_info.frame.f_locals\n\n        routing_batch_function = RoutingBatchFunction(\n            routing_function=func,\n            description=description,\n        )\n\n        if (\n            factory_function_module\n            and factory_function_name\n            and factory_function_kwargs\n        ):\n            routing_batch_function.set_factory_function(\n                factory_function_module=factory_function_module,\n                factory_function_name=factory_function_name,\n                factory_function_kwargs=factory_function_kwargs,\n            )\n\n        return routing_batch_function\n\n    return decorator\n
"},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.sample_n_steps","title":"sample_n_steps(n)","text":"

A simple function that creates a routing batch function that samples n steps from the list of all the downstream steps.

Parameters:

Name Type Description Default n int

The number of steps to sample from the list of all the downstream steps.

required

Returns:

Type Description RoutingBatchFunction

A RoutingBatchFunction instance that can be used with the >> operators and with

RoutingBatchFunction

the Pipeline.connect method when defining the pipeline.

Example:

from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline, sample_n_steps\nfrom distilabel.steps import LoadDataFromHub, GroupColumns\n\nrandom_routing_batch = sample_n_steps(2)\n\n\nwith Pipeline(name=\"routing-batch-function\") as pipeline:\n    load_data = LoadDataFromHub()\n\n    generations = []\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.5-pro\"),\n    ):\n        task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n        generations.append(task)\n\n    combine_columns = GroupColumns(columns=[\"generation\", \"model_name\"])\n\n    load_data >> random_routing_batch >> generations >> combine_columns\n
Source code in src/distilabel/pipeline/routing_batch_function.py
def sample_n_steps(n: int) -> RoutingBatchFunction:\n    \"\"\"A simple function that creates a routing batch function that samples `n` steps from\n    the list of all the downstream steps.\n\n    Args:\n        n: The number of steps to sample from the list of all the downstream steps.\n\n    Returns:\n        A `RoutingBatchFunction` instance that can be used with the `>>` operators and with\n        the `Pipeline.connect` method when defining the pipeline.\n\n    Example:\n\n    ```python\n    from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\n    from distilabel.pipeline import Pipeline, sample_n_steps\n    from distilabel.steps import LoadDataFromHub, GroupColumns\n\n    random_routing_batch = sample_n_steps(2)\n\n\n    with Pipeline(name=\"routing-batch-function\") as pipeline:\n        load_data = LoadDataFromHub()\n\n        generations = []\n        for llm in (\n            OpenAILLM(model=\"gpt-4-0125-preview\"),\n            MistralLLM(model=\"mistral-large-2402\"),\n            VertexAILLM(model=\"gemini-1.5-pro\"),\n        ):\n            task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n            generations.append(task)\n\n        combine_columns = GroupColumns(columns=[\"generation\", \"model_name\"])\n\n        load_data >> random_routing_batch >> generations >> combine_columns\n    ```\n    \"\"\"\n\n    @routing_batch_function(\n        description=f\"Sample {n} steps from the list of downstream steps.\"\n    )\n    def sample_n(steps: List[str]) -> List[str]:\n        return random.sample(steps, n)\n\n    return sample_n\n
"},{"location":"api/pipeline/step_wrapper/","title":"Step Wrapper","text":""},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapper","title":"_StepWrapper","text":"

Wrapper to run the Step.

Attributes:

Name Type Description step

The step to run.

replica

The replica ID assigned.

input_queue

The queue to receive the input data.

output_queue

The queue to send the output data.

load_queue

The queue used to notify the main process that the step has been loaded, has been unloaded or has failed to load.

Source code in src/distilabel/pipeline/step_wrapper.py
class _StepWrapper:\n    \"\"\"Wrapper to run the `Step`.\n\n    Attributes:\n        step: The step to run.\n        replica: The replica ID assigned.\n        input_queue: The queue to receive the input data.\n        output_queue: The queue to send the output data.\n        load_queue: The queue used to notify the main process that the step has been loaded,\n            has been unloaded or has failed to load.\n    \"\"\"\n\n    def __init__(\n        self,\n        step: Union[\"Step\", \"GeneratorStep\"],\n        replica: int,\n        input_queue: \"Queue[_Batch]\",\n        output_queue: \"Queue[_Batch]\",\n        load_queue: \"Queue[Union[StepLoadStatus, None]]\",\n        dry_run: bool = False,\n        ray_pipeline: bool = False,\n    ) -> None:\n        \"\"\"Initializes the `_ProcessWrapper`.\n\n        Args:\n            step: The step to run.\n            input_queue: The queue to receive the input data.\n            output_queue: The queue to send the output data.\n            load_queue: The queue used to notify the main process that the step has been\n                loaded, has been unloaded or has failed to load.\n            dry_run: Flag to ensure we are forcing to run the last batch.\n            ray_pipeline: Whether the step is running a `RayPipeline` or not.\n        \"\"\"\n        self.step = step\n        self.replica = replica\n        self.input_queue = input_queue\n        self.output_queue = output_queue\n        self.load_queue = load_queue\n        self.dry_run = dry_run\n        self.ray_pipeline = ray_pipeline\n\n        self._init_cuda_device_placement()\n\n    def _init_cuda_device_placement(self) -> None:\n        \"\"\"Sets the LLM identifier and the number of desired GPUs of the `CudaDevicePlacementMixin`\"\"\"\n\n        def _init_cuda_device_placement_mixin(attr: CudaDevicePlacementMixin) -> None:\n            if self.ray_pipeline:\n                attr.disable_cuda_device_placement = True\n            else:\n            
    desired_num_gpus = self.step.resources.gpus or 1\n                attr._llm_identifier = f\"{self.step.name}-replica-{self.replica}\"\n                attr._desired_num_gpus = desired_num_gpus\n\n        for field_name in self.step.model_fields_set:\n            attr = getattr(self.step, field_name)\n            if isinstance(attr, CudaDevicePlacementMixin):\n                _init_cuda_device_placement_mixin(attr)\n\n        if isinstance(self.step, CudaDevicePlacementMixin):\n            _init_cuda_device_placement_mixin(self.step)\n\n    def run(self) -> str:\n        \"\"\"The target function executed by the process. This function will also handle\n        the step lifecycle, executing first the `load` function of the `Step` and then\n        waiting to receive a batch from the `input_queue` that will be handled by the\n        `process` method of the `Step`.\n\n        Returns:\n            The name of the step that was executed.\n        \"\"\"\n\n        try:\n            self.step.load()\n            self.step._logger.debug(f\"Step '{self.step.name}' loaded!\")\n        except Exception as e:\n            self.step.unload()\n            self._notify_load_failed()\n            raise _StepWrapperException.create_load_error(\n                message=f\"Step load failed: {e}\",\n                step=self.step,\n                subprocess_exception=e,\n            ) from e\n\n        self._notify_load()\n\n        if self.step.is_generator:\n            self._generator_step_process_loop()\n        else:\n            self._non_generator_process_loop()\n\n        # Just in case `None` sentinel was sent\n        try:\n            self.input_queue.get(block=False)\n        except Exception:\n            pass\n\n        self.step.unload()\n\n        self._notify_unload()\n\n        self.step._logger.info(\n            f\"\ud83c\udfc1 Finished running step '{self.step.name}' (replica ID: {self.replica})\"\n        )\n\n        return self.step.name  # type: 
ignore\n\n    def _notify_load(self) -> None:\n        \"\"\"Notifies that the step has finished executing its `load` function successfully.\"\"\"\n        self.step._logger.debug(\n            f\"Notifying load of step '{self.step.name}' (replica ID {self.replica})...\"\n        )\n        self.load_queue.put({\"name\": self.step.name, \"status\": \"loaded\"})  # type: ignore\n\n    def _notify_unload(self) -> None:\n        \"\"\"Notifies that the step has been unloaded.\"\"\"\n        self.step._logger.debug(\n            f\"Notifying unload of step '{self.step.name}' (replica ID {self.replica})...\"\n        )\n        self.load_queue.put({\"name\": self.step.name, \"status\": \"unloaded\"})  # type: ignore\n\n    def _notify_load_failed(self) -> None:\n        \"\"\"Notifies that the step failed to load.\"\"\"\n        self.step._logger.debug(\n            f\"Notifying load failed of step '{self.step.name}' (replica ID {self.replica})...\"\n        )\n        self.load_queue.put({\"name\": self.step.name, \"status\": \"load_failed\"})  # type: ignore\n\n    def _generator_step_process_loop(self) -> None:\n        \"\"\"Runs the process loop for a generator step. It will call the `process` method\n        of the step and send the output data to the `output_queue` and block until the next\n        batch request is received (i.e. 
receiving an empty batch from the `input_queue`).\n\n        If the `last_batch` attribute of the batch is `True`, the loop will stop and the\n        process will finish.\n\n        Raises:\n            _StepWrapperException: If an error occurs during the execution of the\n                `process` method.\n        \"\"\"\n        step = cast(\"GeneratorStep\", self.step)\n\n        try:\n            if (batch := self.input_queue.get()) is None:\n                self.step._logger.info(\n                    f\"\ud83d\uded1 Stopping yielding batches from step '{self.step.name}'\"\n                )\n                return\n\n            offset = batch.seq_no * step.batch_size  # type: ignore\n\n            self.step._logger.info(\n                f\"\ud83e\uddec Starting yielding batches from generator step '{self.step.name}'.\"\n                f\" Offset: {offset}\"\n            )\n\n            for data, last_batch in step.process_applying_mappings(offset=offset):\n                batch.set_data([data])\n                batch.last_batch = self.dry_run or last_batch\n                self._send_batch(batch)\n\n                if batch.last_batch:\n                    return\n\n                self.step._logger.debug(\n                    f\"Step '{self.step.name}' waiting for next batch request...\"\n                )\n                if (batch := self.input_queue.get()) is None:\n                    self.step._logger.info(\n                        f\"\ud83d\uded1 Stopping yielding batches from step '{self.step.name}'\"\n                    )\n                    return\n        except Exception as e:\n            raise _StepWrapperException(str(e), self.step, 2, e) from e\n\n    def _non_generator_process_loop(self) -> None:\n        \"\"\"Runs the process loop for a non-generator step. It will call the `process`\n        method of the step and send the output data to the `output_queue` and block until\n        the next batch is received from the `input_queue`. 
If the `last_batch` attribute\n        of the batch is `True`, the loop will stop and the process will finish.\n\n        If an error occurs during the execution of the `process` method and the step is\n        global, the process will raise a `_StepWrapperException`. If the step is not\n        global, the process will log the error and send an empty batch to the `output_queue`.\n\n        Raises:\n            _StepWrapperException: If an error occurs during the execution of the\n                `process` method and the step is global.\n        \"\"\"\n        step = cast(\"Step\", self.step)\n        while True:\n            if (batch := self.input_queue.get()) is None:\n                self.step._logger.info(\n                    f\"\ud83d\uded1 Stopping processing batches from step '{self.step.name}'\"\n                )\n                break\n\n            if batch == LAST_BATCH_SENT_FLAG:\n                self.step._logger.debug(\"Received `LAST_BATCH_SENT_FLAG`. Stopping...\")\n                break\n\n            self.step._logger.info(\n                f\"\ud83d\udce6 Processing batch {batch.seq_no} in '{batch.step_name}' (replica ID: {self.replica})\"\n            )\n\n            if batch.data_path is not None:\n                self.step._logger.debug(f\"Reading batch data from '{batch.data_path}'\")\n                batch.read_batch_data_from_fs()\n\n            result = []\n            try:\n                if self.step.has_multiple_inputs:\n                    result = next(step.process_applying_mappings(*batch.data))\n                else:\n                    result = next(step.process_applying_mappings(batch.data[0]))\n            except Exception as e:\n                if self.step.is_global:\n                    self.step.unload()\n                    self._notify_unload()\n                    data = (\n                        batch.data\n                        if isinstance(\n                            e, 
DistilabelOfflineBatchGenerationNotFinishedException\n                        )\n                        else None\n                    )\n                    raise _StepWrapperException(str(e), self.step, 2, e, data) from e\n\n                # Impute step outputs columns with `None`\n                result = self._impute_step_outputs(batch)\n\n                # if the step is not global then we can skip the batch which means sending\n                # an empty batch to the output queue\n                self.step._logger.warning(\n                    f\"\u26a0\ufe0f Processing batch {batch.seq_no} with step '{self.step.name}' failed.\"\n                    \" Sending empty batch filled with `None`s...\"\n                )\n                self.step._logger.warning(\n                    f\"Subprocess traceback:\\n\\n{traceback.format_exc()}\"\n                )\n            finally:\n                batch.set_data([result])\n                self._send_batch(batch)\n\n            if batch.last_batch:\n                break\n\n    def _impute_step_outputs(self, batch: \"_Batch\") -> List[Dict[str, Any]]:\n        \"\"\"Imputes the step outputs columns with `None` in the batch data.\n\n        Args:\n            batch: The batch to impute.\n        \"\"\"\n        return self.step.impute_step_outputs(batch.data[0])\n\n    def _send_batch(self, batch: _Batch) -> None:\n        \"\"\"Sends a batch to the `output_queue`.\"\"\"\n        if batch.data_path is not None:\n            self.step._logger.debug(f\"Writing batch data to '{batch.data_path}'\")\n            batch.write_batch_data_to_fs()\n\n        self.step._logger.info(\n            f\"\ud83d\udce8 Step '{batch.step_name}' sending batch {batch.seq_no} to output queue\"\n        )\n        self.output_queue.put(batch)\n
"},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapper.__init__","title":"__init__(step, replica, input_queue, output_queue, load_queue, dry_run=False, ray_pipeline=False)","text":"

Initializes the _StepWrapper.

Parameters:

Name Type Description Default step Union[Step, GeneratorStep]

The step to run.

required input_queue Queue[_Batch]

The queue to receive the input data.

required output_queue Queue[_Batch]

The queue to send the output data.

required load_queue Queue[Union[StepLoadStatus, None]]

The queue used to notify the main process that the step has been loaded, has been unloaded or has failed to load.

required dry_run bool

Flag that forces the batch being sent to be marked as the last batch, so the step stops after sending it.

False ray_pipeline bool

Whether the step is running a RayPipeline or not.

False Source code in src/distilabel/pipeline/step_wrapper.py
def __init__(\n    self,\n    step: Union[\"Step\", \"GeneratorStep\"],\n    replica: int,\n    input_queue: \"Queue[_Batch]\",\n    output_queue: \"Queue[_Batch]\",\n    load_queue: \"Queue[Union[StepLoadStatus, None]]\",\n    dry_run: bool = False,\n    ray_pipeline: bool = False,\n) -> None:\n    \"\"\"Initializes the `_ProcessWrapper`.\n\n    Args:\n        step: The step to run.\n        input_queue: The queue to receive the input data.\n        output_queue: The queue to send the output data.\n        load_queue: The queue used to notify the main process that the step has been\n            loaded, has been unloaded or has failed to load.\n        dry_run: Flag to ensure we are forcing to run the last batch.\n        ray_pipeline: Whether the step is running a `RayPipeline` or not.\n    \"\"\"\n    self.step = step\n    self.replica = replica\n    self.input_queue = input_queue\n    self.output_queue = output_queue\n    self.load_queue = load_queue\n    self.dry_run = dry_run\n    self.ray_pipeline = ray_pipeline\n\n    self._init_cuda_device_placement()\n
"},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapper.run","title":"run()","text":"

The target function executed by the process. This function will also handle the step lifecycle, executing first the load function of the Step and then waiting to receive a batch from the input_queue that will be handled by the process method of the Step.

Returns:

Type Description str

The name of the step that was executed.

Source code in src/distilabel/pipeline/step_wrapper.py
def run(self) -> str:\n    \"\"\"The target function executed by the process. This function will also handle\n    the step lifecycle, executing first the `load` function of the `Step` and then\n    waiting to receive a batch from the `input_queue` that will be handled by the\n    `process` method of the `Step`.\n\n    Returns:\n        The name of the step that was executed.\n    \"\"\"\n\n    try:\n        self.step.load()\n        self.step._logger.debug(f\"Step '{self.step.name}' loaded!\")\n    except Exception as e:\n        self.step.unload()\n        self._notify_load_failed()\n        raise _StepWrapperException.create_load_error(\n            message=f\"Step load failed: {e}\",\n            step=self.step,\n            subprocess_exception=e,\n        ) from e\n\n    self._notify_load()\n\n    if self.step.is_generator:\n        self._generator_step_process_loop()\n    else:\n        self._non_generator_process_loop()\n\n    # Just in case `None` sentinel was sent\n    try:\n        self.input_queue.get(block=False)\n    except Exception:\n        pass\n\n    self.step.unload()\n\n    self._notify_unload()\n\n    self.step._logger.info(\n        f\"\ud83c\udfc1 Finished running step '{self.step.name}' (replica ID: {self.replica})\"\n    )\n\n    return self.step.name  # type: ignore\n
"},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapperException","title":"_StepWrapperException","text":"

Bases: Exception

Exception to be raised when an error occurs in the _StepWrapper class.

Attributes:

Name Type Description message

The error message.

step

The Step that raised the error.

code

The error code.

subprocess_exception

The exception raised by the subprocess.

data

The data that caused the error. Defaults to None.

Source code in src/distilabel/pipeline/step_wrapper.py
class _StepWrapperException(Exception):\n    \"\"\"Exception to be raised when an error occurs in the `_StepWrapper` class.\n\n    Attributes:\n        message: The error message.\n        step: The `Step` that raised the error.\n        code: The error code.\n        subprocess_exception: The exception raised by the subprocess.\n        data: The data that caused the error. Defaults to `None`.\n    \"\"\"\n\n    def __init__(\n        self,\n        message: str,\n        step: \"_Step\",\n        code: int,\n        subprocess_exception: Exception,\n        data: Optional[List[List[Dict[str, Any]]]] = None,\n    ) -> None:\n        self.message = f\"{message}\\n\\nFor further information visit '{DISTILABEL_DOCS_URL}api/pipeline/step_wrapper'\"\n        self.step = step\n        self.code = code\n        self.subprocess_exception = subprocess_exception\n        self.formatted_traceback = \"\".join(\n            traceback.format_exception(\n                type(subprocess_exception),\n                subprocess_exception,\n                subprocess_exception.__traceback__,\n            )\n        )\n        self.data = data\n\n    @classmethod\n    def create_load_error(\n        cls,\n        message: str,\n        step: \"_Step\",\n        subprocess_exception: Optional[Exception] = None,\n    ) -> \"_StepWrapperException\":\n        \"\"\"Creates a `_StepWrapperException` for a load error.\n\n        Args:\n            message: The error message.\n            step: The `Step` that raised the error.\n            subprocess_exception: The exception raised by the subprocess. 
Defaults to `None`.\n\n        Returns:\n            The `_StepWrapperException` instance.\n        \"\"\"\n        return cls(message, step, 1, subprocess_exception, None)\n\n    @property\n    def is_load_error(self) -> bool:\n        \"\"\"Whether the error is a load error.\n\n        Returns:\n            `True` if the error is a load error, `False` otherwise.\n        \"\"\"\n        return self.code == 1\n
"},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapperException.is_load_error","title":"is_load_error: bool property","text":"

Whether the error is a load error.

Returns:

Type Description bool

True if the error is a load error, False otherwise.

"},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapperException.create_load_error","title":"create_load_error(message, step, subprocess_exception=None) classmethod","text":"

Creates a _StepWrapperException for a load error.

Parameters:

Name Type Description Default message str

The error message.

required step _Step

The Step that raised the error.

required subprocess_exception Optional[Exception]

The exception raised by the subprocess. Defaults to None.

None

Returns:

Type Description _StepWrapperException

The _StepWrapperException instance.

Source code in src/distilabel/pipeline/step_wrapper.py
@classmethod\ndef create_load_error(\n    cls,\n    message: str,\n    step: \"_Step\",\n    subprocess_exception: Optional[Exception] = None,\n) -> \"_StepWrapperException\":\n    \"\"\"Creates a `_StepWrapperException` for a load error.\n\n    Args:\n        message: The error message.\n        step: The `Step` that raised the error.\n        subprocess_exception: The exception raised by the subprocess. Defaults to `None`.\n\n    Returns:\n        The `_StepWrapperException` instance.\n    \"\"\"\n    return cls(message, step, 1, subprocess_exception, None)\n
"},{"location":"api/pipeline/typing/","title":"Pipeline Typing","text":""},{"location":"api/pipeline/typing/#distilabel.pipeline.typing","title":"typing","text":""},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.DownstreamConnectable","title":"DownstreamConnectable = Union['Step', 'GlobalStep'] module-attribute","text":"

Alias for the Step types that can be connected as downstream steps.

"},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.UpstreamConnectableSteps","title":"UpstreamConnectableSteps = TypeVar('UpstreamConnectableSteps', bound=Union['Step', 'GlobalStep', 'GeneratorStep']) module-attribute","text":"

Type for the Step types that can be connected as upstream steps.

"},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.DownstreamConnectableSteps","title":"DownstreamConnectableSteps = TypeVar('DownstreamConnectableSteps', bound=DownstreamConnectable, covariant=True) module-attribute","text":"

Type for the Step types that can be connected as downstream steps.

"},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.PipelineRuntimeParametersInfo","title":"PipelineRuntimeParametersInfo = Dict[str, Union[List['RuntimeParameterInfo'], Dict[str, 'RuntimeParameterInfo']]] module-attribute","text":"

Alias for the information of the runtime parameters of a Pipeline.

"},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.InputDataset","title":"InputDataset = Union['Dataset', 'pd.DataFrame', List[Dict[str, str]]] module-attribute","text":"

Alias for the types we can process as input dataset.

"},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.StepLoadStatus","title":"StepLoadStatus","text":"

Bases: TypedDict

Dict containing information about whether a step was loaded/unloaded or whether its load failed.

Source code in src/distilabel/pipeline/typing.py
class StepLoadStatus(TypedDict):\n    \"\"\"Dict containing information about if one step was loaded/unloaded or if it's load\n    failed\"\"\"\n\n    name: str\n    status: Literal[\"loaded\", \"unloaded\", \"load_failed\"]\n
"},{"location":"api/step/","title":"Step","text":"

This section contains the API reference for the distilabel step, both for the _Step base class and the Step class.

For more information and examples on how to use existing steps or create custom ones, please refer to Tutorial - Step.

"},{"location":"api/step/#distilabel.steps.base","title":"base","text":""},{"location":"api/step/#distilabel.steps.base.StepInput","title":"StepInput = Annotated[List[Dict[str, Any]], _STEP_INPUT_ANNOTATION] module-attribute","text":"

StepInput is just an Annotated alias of the typing List[Dict[str, Any]] with extra metadata that allows distilabel to perform validations over the process step method defined in each Step

"},{"location":"api/step/#distilabel.steps.base._Step","title":"_Step","text":"

Bases: RuntimeParametersMixin, RequirementsMixin, SignatureMixin, BaseModel, _Serializable, ABC

Base class for the steps that can be included in a Pipeline.

A Step is a class defining some processing logic. The input and outputs for this processing logic are lists of dictionaries with the same keys:

```python\n[\n    {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n    {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n    {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n]\n```\n

The processing logic is defined in the process method, which depending on the number of previous steps, can receive more than one list of dictionaries, each with the output of the previous steps. In order to make distilabel know where the outputs from the previous steps are, the process function from each Step must have an argument or positional argument annotated with StepInput.

```python\nclass StepWithOnePreviousStep(Step):\n    def process(self, inputs: StepInput) -> StepOutput:\n        yield [...]\n\nclass StepWithSeveralPreviousStep(Step):\n    # mind the * to indicate that the argument is a list of StepInput\n    def process(self, *inputs: StepInput) -> StepOutput:\n        yield [...]\n```\n

In order to perform static validations and to check that the chaining of the steps in the pipeline is valid, a Step must also define the inputs and outputs properties:

  • inputs: a list of strings with the names of the columns that the step needs as input. It can be an empty list if the step is a generator step.
  • outputs: a list of strings with the names of the columns that the step will produce as output.

Optionally, a Step can override the load method to perform any initialization logic before the process method is called. For example, to load an LLM, establish a connection to a database, etc.

Finally, the Step class inherits from pydantic.BaseModel, so attributes can be easily defined, validated, serialized and included in the __init__ method of the step.

Source code in src/distilabel/steps/base.py
class _Step(\n    RuntimeParametersMixin,\n    RequirementsMixin,\n    SignatureMixin,\n    BaseModel,\n    _Serializable,\n    ABC,\n):\n    \"\"\"Base class for the steps that can be included in a `Pipeline`.\n\n    A `Step` is a class defining some processing logic. The input and outputs for this\n    processing logic are lists of dictionaries with the same keys:\n\n        ```python\n        [\n            {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n            {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n            {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n        ]\n        ```\n\n    The processing logic is defined in the `process` method, which depending on the\n    number of previous steps, can receive more than one list of dictionaries, each with\n    the output of the previous steps. In order to make `distilabel` know where the outputs\n    from the previous steps are, the `process` function from each `Step` must have an argument\n    or positional argument annotated with `StepInput`.\n\n        ```python\n        class StepWithOnePreviousStep(Step):\n            def process(self, inputs: StepInput) -> StepOutput:\n                yield [...]\n\n        class StepWithSeveralPreviousStep(Step):\n            # mind the * to indicate that the argument is a list of StepInput\n            def process(self, *inputs: StepInput) -> StepOutput:\n                yield [...]\n        ```\n\n    In order to perform static validations and to check that the chaining of the steps\n    in the pipeline is valid, a `Step` must also define the `inputs` and `outputs`\n    properties:\n\n    - `inputs`: a list of strings with the names of the columns that the step needs as\n        input. 
It can be an empty list if the step is a generator step.\n    - `outputs`: a list of strings with the names of the columns that the step will\n        produce as output.\n\n    Optionally, a `Step` can override the `load` method to perform any initialization\n    logic before the `process` method is called. For example, to load an LLM, stablish a\n    connection to a database, etc.\n\n    Finally, the `Step` class inherits from `pydantic.BaseModel`, so attributes can be easily\n    defined, validated, serialized and included in the `__init__` method of the step.\n    \"\"\"\n\n    model_config = ConfigDict(\n        arbitrary_types_allowed=True,\n        validate_default=True,\n        validate_assignment=True,\n        extra=\"forbid\",\n    )\n\n    name: Optional[str] = Field(default=None, pattern=r\"^[a-zA-Z0-9_-]+$\")\n    resources: StepResources = StepResources()\n    pipeline: Any = Field(default=None, exclude=True, repr=False)\n    input_mappings: Dict[str, str] = {}\n    output_mappings: Dict[str, str] = {}\n    use_cache: bool = True\n\n    _pipeline_artifacts_path: Path = PrivateAttr(None)\n    _built_from_decorator: bool = PrivateAttr(default=False)\n    _logger: \"Logger\" = PrivateAttr(None)\n\n    def model_post_init(self, __context: Any) -> None:\n        from distilabel.pipeline.base import _GlobalPipelineManager\n\n        super().model_post_init(__context)\n\n        if self.pipeline is None:\n            self.pipeline = _GlobalPipelineManager.get_pipeline()\n\n        if self.pipeline is None:\n            _logger = logging.getLogger(f\"distilabel.step.{self.name}\")\n            _logger.warning(\n                f\"Step '{self.name}' hasn't received a pipeline, and it hasn't been\"\n                \" created within a `Pipeline` context. 
Please, use\"\n                \" `with Pipeline() as pipeline:` and create the step within the context.\"\n            )\n\n        if not self.name:\n            # This must be done before the check for repeated names, but assuming\n            # we are passing the pipeline from the _GlobalPipelineManager, should\n            # be done after that.\n            self.name = _infer_step_name(type(self).__name__, self.pipeline)\n\n        if self.pipeline is not None:\n            # If not set an error will be raised in `Pipeline.run` parent\n            self.pipeline._add_step(self)\n\n    def connect(\n        self,\n        *steps: \"_Step\",\n        routing_batch_function: Optional[\"RoutingBatchFunction\"] = None,\n    ) -> None:\n        \"\"\"Connects the current step to another step in the pipeline, which means that\n        the output of this step will be the input of the other step.\n\n        Args:\n            steps: The steps to connect to the current step.\n            routing_batch_function: A function that receives a list of steps and returns\n                a list of steps to which the output batch generated by this step should be\n                routed. It should be used to define the routing logic of the pipeline. 
If\n                not provided, the output batch will be routed to all the connected steps.\n                Defaults to `None`.\n        \"\"\"\n        assert self.pipeline is not None\n\n        if routing_batch_function:\n            self._set_routing_batch_function(routing_batch_function)\n\n        for step in steps:\n            self.pipeline._add_edge(from_step=self.name, to_step=step.name)  # type: ignore\n\n    def _set_routing_batch_function(\n        self, routing_batch_function: \"RoutingBatchFunction\"\n    ) -> None:\n        \"\"\"Sets a routing batch function for the batches generated by this step, so they\n        get routed to specific downstream steps.\n\n        Args:\n            routing_batch_function: The routing batch function that will be used to route\n                the batches generated by this step.\n        \"\"\"\n        self.pipeline._add_routing_batch_function(\n            step_name=self.name,  # type: ignore\n            routing_batch_function=routing_batch_function,\n        )\n        routing_batch_function._step = self\n\n    @overload\n    def __rshift__(self, other: \"RoutingBatchFunction\") -> \"RoutingBatchFunction\": ...\n\n    @overload\n    def __rshift__(\n        self, other: List[\"DownstreamConnectableSteps\"]\n    ) -> List[\"DownstreamConnectableSteps\"]: ...\n\n    @overload\n    def __rshift__(self, other: \"DownstreamConnectable\") -> \"DownstreamConnectable\": ...\n\n    def __rshift__(\n        self,\n        other: Union[\n            \"DownstreamConnectable\",\n            \"RoutingBatchFunction\",\n            List[\"DownstreamConnectableSteps\"],\n        ],\n    ) -> Union[\n        \"DownstreamConnectable\",\n        \"RoutingBatchFunction\",\n        List[\"DownstreamConnectableSteps\"],\n    ]:\n        \"\"\"Allows using the `>>` operator to connect steps in the pipeline.\n\n        Args:\n            other: The step to connect, a list of steps to connect to or a routing batch\n                
function to be set for the step.\n\n        Returns:\n            The connected step, the list of connected steps or the routing batch function.\n\n        Example:\n            ```python\n            step1 >> step2\n            # Would be equivalent to:\n            step1.connect(step2)\n\n            # It also allows to connect a list of steps\n            step1 >> [step2, step3]\n            ```\n        \"\"\"\n        # Here to avoid circular imports\n        from distilabel.pipeline.routing_batch_function import RoutingBatchFunction\n\n        if isinstance(other, list):\n            self.connect(*other)\n            return other\n\n        if isinstance(other, RoutingBatchFunction):\n            self._set_routing_batch_function(other)\n            return other\n\n        self.connect(other)\n        return other\n\n    def __rrshift__(self, other: List[\"UpstreamConnectableSteps\"]) -> Self:\n        \"\"\"Allows using the [step1, step2] >> step3 operator to connect a list of steps in the pipeline\n        to a single step, as the list doesn't have the __rshift__ operator.\n\n        Args:\n            other: The step to connect to.\n\n        Returns:\n            The connected step\n\n        Example:\n            ```python\n            [step2, step3] >> step1\n            # Would be equivalent to:\n            step2.connect(step1)\n            step3.connect(step1)\n            ```\n        \"\"\"\n        for o in other:\n            o.connect(self)\n        return self\n\n    def load(self) -> None:\n        \"\"\"Method to perform any initialization logic before the `process` method is\n        called. For example, to load an LLM, stablish a connection to a database, etc.\n        \"\"\"\n        self._logger = logging.getLogger(f\"distilabel.step.{self.name}\")\n\n    def unload(self) -> None:\n        \"\"\"Method to perform any cleanup logic after the `process` method is called. 
For\n        example, to close a connection to a database, etc.\n        \"\"\"\n        self._logger.debug(\"Executing step unload logic.\")\n\n    @property\n    def is_generator(self) -> bool:\n        \"\"\"Whether the step is a generator step or not.\n\n        Returns:\n            `True` if the step is a generator step, `False` otherwise.\n        \"\"\"\n        return isinstance(self, GeneratorStep)\n\n    @property\n    def is_global(self) -> bool:\n        \"\"\"Whether the step is a global step or not.\n\n        Returns:\n            `True` if the step is a global step, `False` otherwise.\n        \"\"\"\n        return isinstance(self, GlobalStep)\n\n    @property\n    def is_normal(self) -> bool:\n        \"\"\"Whether the step is a normal step or not.\n\n        Returns:\n            `True` if the step is a normal step, `False` otherwise.\n        \"\"\"\n        return not self.is_generator and not self.is_global\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"List of strings with the names of the mandatory columns that the step needs as\n        input or dictionary in which the keys are the input columns of the step and the\n        values are booleans indicating whether the column is optional or not.\n\n        Returns:\n            List of strings with the names of the columns that the step needs as input.\n        \"\"\"\n        return []\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"List of strings with the names of the columns that the step will produce as\n        output or dictionary in which the keys are the output columns of the step and the\n        values are booleans indicating whether the column is optional or not.\n\n        Returns:\n            List of strings with the names of the columns that the step will produce as\n            output.\n        \"\"\"\n        return []\n\n    @cached_property\n    def process_parameters(self) -> List[inspect.Parameter]:\n        
\"\"\"Returns the parameters of the `process` method of the step.\n\n        Returns:\n            The parameters of the `process` method of the step.\n        \"\"\"\n        return list(inspect.signature(self.process).parameters.values())  # type: ignore\n\n    def has_multiple_inputs(self) -> bool:\n        \"\"\"Whether the `process` method of the step receives more than one input or not\n        i.e. has a `*` argument annotated with `StepInput`.\n\n        Returns:\n            `True` if the `process` method of the step receives more than one input,\n            `False` otherwise.\n        \"\"\"\n        return any(\n            param.kind == param.VAR_POSITIONAL for param in self.process_parameters\n        )\n\n    def get_process_step_input(self) -> Union[inspect.Parameter, None]:\n        \"\"\"Returns the parameter of the `process` method of the step annotated with\n        `StepInput`.\n\n        Returns:\n            The parameter of the `process` method of the step annotated with `StepInput`,\n            or `None` if there is no parameter annotated with `StepInput`.\n\n        Raises:\n            TypeError: If the step has more than one parameter annotated with `StepInput`.\n        \"\"\"\n        step_input_parameter = None\n        for parameter in self.process_parameters:\n            if is_parameter_annotated_with(parameter, _STEP_INPUT_ANNOTATION):\n                if step_input_parameter is not None:\n                    raise DistilabelTypeError(\n                        f\"Step '{self.name}' should have only one parameter with type\"\n                        \" hint `StepInput`.\",\n                        page=\"sections/how_to_guides/basic/step/#defining-custom-steps\",\n                    )\n                step_input_parameter = parameter\n        return step_input_parameter\n\n    def verify_inputs_mappings(self) -> None:\n        \"\"\"Verifies that the `inputs_mappings` of the step are valid i.e. 
the input\n        columns exist in the inputs of the step.\n\n        Raises:\n            ValueError: If the `inputs_mappings` of the step are not valid.\n        \"\"\"\n        if not self.input_mappings:\n            return\n\n        for input in self.input_mappings:\n            if input not in self.inputs:\n                raise DistilabelUserError(\n                    f\"The input column '{input}' doesn't exist in the inputs of the\"\n                    f\" step '{self.name}'. Inputs of the step are: {self.inputs}.\"\n                    \" Please, review the `inputs_mappings` argument of the step.\",\n                    page=\"sections/how_to_guides/basic/step/#arguments\",\n                )\n\n    def verify_outputs_mappings(self) -> None:\n        \"\"\"Verifies that the `outputs_mappings` of the step are valid i.e. the output\n        columns exist in the outputs of the step.\n\n        Raises:\n            ValueError: If the `outputs_mappings` of the step are not valid.\n        \"\"\"\n        if not self.output_mappings:\n            return\n\n        for output in self.output_mappings:\n            if output not in self.outputs:\n                raise DistilabelUserError(\n                    f\"The output column '{output}' doesn't exist in the outputs of the\"\n                    f\" step '{self.name}'. Outputs of the step are: {self.outputs}.\"\n                    \" Please, review the `outputs_mappings` argument of the step.\",\n                    page=\"sections/how_to_guides/basic/step/#arguments\",\n                )\n\n    def get_inputs(self) -> Dict[str, bool]:\n        \"\"\"Gets the inputs of the step after the `input_mappings`. 
This method is meant\n        to be used to run validations on the inputs of the step.\n\n        Returns:\n            The inputs of the step after the `input_mappings` and if they are required or\n            not.\n        \"\"\"\n        if isinstance(self.inputs, list):\n            return {\n                self.input_mappings.get(input, input): True for input in self.inputs\n            }\n\n        return {\n            self.input_mappings.get(input, input): required\n            for input, required in self.inputs.items()\n        }\n\n    def get_outputs(self) -> Dict[str, bool]:\n        \"\"\"Gets the outputs of the step after the `outputs_mappings`. This method is\n        meant to be used to run validations on the outputs of the step.\n\n        Returns:\n            The outputs of the step after the `outputs_mappings` and if they are required\n            or not.\n        \"\"\"\n        if isinstance(self.outputs, list):\n            return {\n                self.output_mappings.get(output, output): True\n                for output in self.outputs\n            }\n\n        return {\n            self.output_mappings.get(output, output): required\n            for output, required in self.outputs.items()\n        }\n\n    def set_pipeline_artifacts_path(self, path: Path) -> None:\n        \"\"\"Sets the `_pipeline_artifacts_path` attribute. 
This method is meant to be used\n        by the `Pipeline` once the cache location is known.\n\n        Args:\n            path: the path where the artifacts generated by the pipeline steps should be\n                saved.\n        \"\"\"\n        self._pipeline_artifacts_path = path\n\n    @property\n    def artifacts_directory(self) -> Union[Path, None]:\n        \"\"\"Gets the path of the directory where the step should save its generated artifacts.\n\n        Returns:\n            The path of the directory where the step should save the generated artifacts,\n                or `None` if `_pipeline_artifacts_path` is not set.\n        \"\"\"\n        if self._pipeline_artifacts_path is None:\n            return None\n        return self._pipeline_artifacts_path / self.name  # type: ignore\n\n    def save_artifact(\n        self,\n        name: str,\n        write_function: Callable[[Path], None],\n        metadata: Optional[Dict[str, Any]] = None,\n    ) -> None:\n        \"\"\"Saves an artifact generated by the `Step`.\n\n        Args:\n            name: the name of the artifact.\n            write_function: a function that will receive the path where the artifact should\n                be saved.\n            metadata: the artifact metadata. Defaults to `None`.\n        \"\"\"\n        if self.artifacts_directory is None:\n            self._logger.warning(\n                f\"Cannot save artifact with '{name}' as `_pipeline_artifacts_path` is not\"\n                \" set. 
This is normal if the `Step` is being executed as a standalone component.\"\n            )\n            return\n\n        artifact_directory_path = self.artifacts_directory / name\n        artifact_directory_path.mkdir(parents=True, exist_ok=True)\n\n        self._logger.info(f\"\ud83c\udffa Storing '{name}' generated artifact...\")\n\n        self._logger.debug(\n            f\"Calling `write_function` to write artifact in '{artifact_directory_path}'...\"\n        )\n        write_function(artifact_directory_path)\n\n        metadata_path = artifact_directory_path / \"metadata.json\"\n        self._logger.debug(\n            f\"Calling `write_json` to write artifact metadata in '{metadata_path}'...\"\n        )\n        write_json(filename=metadata_path, data=metadata or {})\n\n    def impute_step_outputs(\n        self, step_output: List[Dict[str, Any]]\n    ) -> List[Dict[str, Any]]:\n        \"\"\"\n        Imputes the output columns of the step that are not present in the step output.\n        \"\"\"\n        result = []\n        for row in step_output:\n            data = row.copy()\n            for output in self.get_outputs().keys():\n                data[output] = None\n            result.append(data)\n        return result\n\n    def _model_dump(self, obj: Any, **kwargs: Any) -> Dict[str, Any]:\n        dump = super()._model_dump(obj, **kwargs)\n        dump[\"runtime_parameters_info\"] = self.get_runtime_parameters_info()\n        return dump\n
"},{"location":"api/step/#distilabel.steps.base._Step.is_generator","title":"is_generator: bool property","text":"

Whether the step is a generator step or not.

Returns:

Type Description bool

True if the step is a generator step, False otherwise.

"},{"location":"api/step/#distilabel.steps.base._Step.is_global","title":"is_global: bool property","text":"

Whether the step is a global step or not.

Returns:

Type Description bool

True if the step is a global step, False otherwise.

"},{"location":"api/step/#distilabel.steps.base._Step.is_normal","title":"is_normal: bool property","text":"

Whether the step is a normal step or not.

Returns:

Type Description bool

True if the step is a normal step, False otherwise.

"},{"location":"api/step/#distilabel.steps.base._Step.inputs","title":"inputs: StepColumns property","text":"

List of strings with the names of the mandatory columns that the step needs as input or dictionary in which the keys are the input columns of the step and the values are booleans indicating whether the column is required or not.

Returns:

Type Description StepColumns

List of strings with the names of the columns that the step needs as input.

"},{"location":"api/step/#distilabel.steps.base._Step.outputs","title":"outputs: StepColumns property","text":"

List of strings with the names of the columns that the step will produce as output or dictionary in which the keys are the output columns of the step and the values are booleans indicating whether the column is required or not.

Returns:

Type Description StepColumns

List of strings with the names of the columns that the step will produce as output.

"},{"location":"api/step/#distilabel.steps.base._Step.process_parameters","title":"process_parameters: List[inspect.Parameter] cached property","text":"

Returns the parameters of the process method of the step.

Returns:

Type Description List[Parameter]

The parameters of the process method of the step.

"},{"location":"api/step/#distilabel.steps.base._Step.artifacts_directory","title":"artifacts_directory: Union[Path, None] property","text":"

Gets the path of the directory where the step should save its generated artifacts.

Returns:

Type Description Union[Path, None]

The path of the directory where the step should save the generated artifacts, or None if _pipeline_artifacts_path is not set.

"},{"location":"api/step/#distilabel.steps.base._Step.connect","title":"connect(*steps, routing_batch_function=None)","text":"

Connects the current step to another step in the pipeline, which means that the output of this step will be the input of the other step.

Parameters:

Name Type Description Default steps _Step

The steps to connect to the current step.

() routing_batch_function Optional[RoutingBatchFunction]

A function that receives a list of steps and returns a list of steps to which the output batch generated by this step should be routed. It should be used to define the routing logic of the pipeline. If not provided, the output batch will be routed to all the connected steps. Defaults to None.

None Source code in src/distilabel/steps/base.py
def connect(\n    self,\n    *steps: \"_Step\",\n    routing_batch_function: Optional[\"RoutingBatchFunction\"] = None,\n) -> None:\n    \"\"\"Connects the current step to another step in the pipeline, which means that\n    the output of this step will be the input of the other step.\n\n    Args:\n        steps: The steps to connect to the current step.\n        routing_batch_function: A function that receives a list of steps and returns\n            a list of steps to which the output batch generated by this step should be\n            routed. It should be used to define the routing logic of the pipeline. If\n            not provided, the output batch will be routed to all the connected steps.\n            Defaults to `None`.\n    \"\"\"\n    assert self.pipeline is not None\n\n    if routing_batch_function:\n        self._set_routing_batch_function(routing_batch_function)\n\n    for step in steps:\n        self.pipeline._add_edge(from_step=self.name, to_step=step.name)  # type: ignore\n
"},{"location":"api/step/#distilabel.steps.base._Step.__rshift__","title":"__rshift__(other)","text":"
__rshift__(other: RoutingBatchFunction) -> RoutingBatchFunction\n
__rshift__(other: List[DownstreamConnectableSteps]) -> List[DownstreamConnectableSteps]\n
__rshift__(other: DownstreamConnectable) -> DownstreamConnectable\n

Allows using the >> operator to connect steps in the pipeline.

Parameters:

Name Type Description Default other Union[DownstreamConnectable, RoutingBatchFunction, List[DownstreamConnectableSteps]]

The step to connect, a list of steps to connect to or a routing batch function to be set for the step.

required

Returns:

Type Description Union[DownstreamConnectable, RoutingBatchFunction, List[DownstreamConnectableSteps]]

The connected step, the list of connected steps or the routing batch function.

Example
step1 >> step2\n# Would be equivalent to:\nstep1.connect(step2)\n\n# It also allows to connect a list of steps\nstep1 >> [step2, step3]\n
Source code in src/distilabel/steps/base.py
def __rshift__(\n    self,\n    other: Union[\n        \"DownstreamConnectable\",\n        \"RoutingBatchFunction\",\n        List[\"DownstreamConnectableSteps\"],\n    ],\n) -> Union[\n    \"DownstreamConnectable\",\n    \"RoutingBatchFunction\",\n    List[\"DownstreamConnectableSteps\"],\n]:\n    \"\"\"Allows using the `>>` operator to connect steps in the pipeline.\n\n    Args:\n        other: The step to connect, a list of steps to connect to or a routing batch\n            function to be set for the step.\n\n    Returns:\n        The connected step, the list of connected steps or the routing batch function.\n\n    Example:\n        ```python\n        step1 >> step2\n        # Would be equivalent to:\n        step1.connect(step2)\n\n        # It also allows to connect a list of steps\n        step1 >> [step2, step3]\n        ```\n    \"\"\"\n    # Here to avoid circular imports\n    from distilabel.pipeline.routing_batch_function import RoutingBatchFunction\n\n    if isinstance(other, list):\n        self.connect(*other)\n        return other\n\n    if isinstance(other, RoutingBatchFunction):\n        self._set_routing_batch_function(other)\n        return other\n\n    self.connect(other)\n    return other\n
"},{"location":"api/step/#distilabel.steps.base._Step.__rrshift__","title":"__rrshift__(other)","text":"

Allows using the [step1, step2] >> step3 operator to connect a list of steps in the pipeline to a single step, as the list doesn't have the __rshift__ operator.

Parameters:

Name Type Description Default other List[UpstreamConnectableSteps]

The list of steps to connect to the current step.

required

Returns:

Type Description Self

The connected step

Example
[step2, step3] >> step1\n# Would be equivalent to:\nstep2.connect(step1)\nstep3.connect(step1)\n
Source code in src/distilabel/steps/base.py
def __rrshift__(self, other: List[\"UpstreamConnectableSteps\"]) -> Self:\n    \"\"\"Allows using the [step1, step2] >> step3 operator to connect a list of steps in the pipeline\n    to a single step, as the list doesn't have the __rshift__ operator.\n\n    Args:\n        other: The step to connect to.\n\n    Returns:\n        The connected step\n\n    Example:\n        ```python\n        [step2, step3] >> step1\n        # Would be equivalent to:\n        step2.connect(step1)\n        step3.connect(step1)\n        ```\n    \"\"\"\n    for o in other:\n        o.connect(self)\n    return self\n
"},{"location":"api/step/#distilabel.steps.base._Step.load","title":"load()","text":"

Method to perform any initialization logic before the process method is called. For example, to load an LLM, establish a connection to a database, etc.

Source code in src/distilabel/steps/base.py
def load(self) -> None:\n    \"\"\"Method to perform any initialization logic before the `process` method is\n    called. For example, to load an LLM, stablish a connection to a database, etc.\n    \"\"\"\n    self._logger = logging.getLogger(f\"distilabel.step.{self.name}\")\n
"},{"location":"api/step/#distilabel.steps.base._Step.unload","title":"unload()","text":"

Method to perform any cleanup logic after the process method is called. For example, to close a connection to a database, etc.

Source code in src/distilabel/steps/base.py
def unload(self) -> None:\n    \"\"\"Method to perform any cleanup logic after the `process` method is called. For\n    example, to close a connection to a database, etc.\n    \"\"\"\n    self._logger.debug(\"Executing step unload logic.\")\n
"},{"location":"api/step/#distilabel.steps.base._Step.has_multiple_inputs","title":"has_multiple_inputs()","text":"

Whether the process method of the step receives more than one input or not i.e. has a * argument annotated with StepInput.

Returns:

Type Description bool

True if the process method of the step receives more than one input, False otherwise.

Source code in src/distilabel/steps/base.py
def has_multiple_inputs(self) -> bool:\n    \"\"\"Whether the `process` method of the step receives more than one input or not\n    i.e. has a `*` argument annotated with `StepInput`.\n\n    Returns:\n        `True` if the `process` method of the step receives more than one input,\n        `False` otherwise.\n    \"\"\"\n    return any(\n        param.kind == param.VAR_POSITIONAL for param in self.process_parameters\n    )\n
"},{"location":"api/step/#distilabel.steps.base._Step.get_process_step_input","title":"get_process_step_input()","text":"

Returns the parameter of the process method of the step annotated with StepInput.

Returns:

Type Description Union[Parameter, None]

The parameter of the process method of the step annotated with StepInput, or None if there is no parameter annotated with StepInput.

Raises:

Type Description TypeError

If the step has more than one parameter annotated with StepInput.

Source code in src/distilabel/steps/base.py
def get_process_step_input(self) -> Union[inspect.Parameter, None]:\n    \"\"\"Returns the parameter of the `process` method of the step annotated with\n    `StepInput`.\n\n    Returns:\n        The parameter of the `process` method of the step annotated with `StepInput`,\n        or `None` if there is no parameter annotated with `StepInput`.\n\n    Raises:\n        TypeError: If the step has more than one parameter annotated with `StepInput`.\n    \"\"\"\n    step_input_parameter = None\n    for parameter in self.process_parameters:\n        if is_parameter_annotated_with(parameter, _STEP_INPUT_ANNOTATION):\n            if step_input_parameter is not None:\n                raise DistilabelTypeError(\n                    f\"Step '{self.name}' should have only one parameter with type\"\n                    \" hint `StepInput`.\",\n                    page=\"sections/how_to_guides/basic/step/#defining-custom-steps\",\n                )\n            step_input_parameter = parameter\n    return step_input_parameter\n
"},{"location":"api/step/#distilabel.steps.base._Step.verify_inputs_mappings","title":"verify_inputs_mappings()","text":"

Verifies that the input_mappings of the step are valid i.e. the input columns exist in the inputs of the step.

Raises:

Type Description ValueError

If the input_mappings of the step are not valid.

Source code in src/distilabel/steps/base.py
def verify_inputs_mappings(self) -> None:\n    \"\"\"Verifies that the `inputs_mappings` of the step are valid i.e. the input\n    columns exist in the inputs of the step.\n\n    Raises:\n        ValueError: If the `inputs_mappings` of the step are not valid.\n    \"\"\"\n    if not self.input_mappings:\n        return\n\n    for input in self.input_mappings:\n        if input not in self.inputs:\n            raise DistilabelUserError(\n                f\"The input column '{input}' doesn't exist in the inputs of the\"\n                f\" step '{self.name}'. Inputs of the step are: {self.inputs}.\"\n                \" Please, review the `inputs_mappings` argument of the step.\",\n                page=\"sections/how_to_guides/basic/step/#arguments\",\n            )\n
"},{"location":"api/step/#distilabel.steps.base._Step.verify_outputs_mappings","title":"verify_outputs_mappings()","text":"

Verifies that the output_mappings of the step are valid i.e. the output columns exist in the outputs of the step.

Raises:

Type Description ValueError

If the output_mappings of the step are not valid.

Source code in src/distilabel/steps/base.py
def verify_outputs_mappings(self) -> None:\n    \"\"\"Verifies that the `outputs_mappings` of the step are valid i.e. the output\n    columns exist in the outputs of the step.\n\n    Raises:\n        ValueError: If the `outputs_mappings` of the step are not valid.\n    \"\"\"\n    if not self.output_mappings:\n        return\n\n    for output in self.output_mappings:\n        if output not in self.outputs:\n            raise DistilabelUserError(\n                f\"The output column '{output}' doesn't exist in the outputs of the\"\n                f\" step '{self.name}'. Outputs of the step are: {self.outputs}.\"\n                \" Please, review the `outputs_mappings` argument of the step.\",\n                page=\"sections/how_to_guides/basic/step/#arguments\",\n            )\n
"},{"location":"api/step/#distilabel.steps.base._Step.get_inputs","title":"get_inputs()","text":"

Gets the inputs of the step after the input_mappings. This method is meant to be used to run validations on the inputs of the step.

Returns:

Type Description Dict[str, bool]

The inputs of the step after the input_mappings and whether they are required or not.

Source code in src/distilabel/steps/base.py
def get_inputs(self) -> Dict[str, bool]:\n    \"\"\"Gets the inputs of the step after the `input_mappings`. This method is meant\n    to be used to run validations on the inputs of the step.\n\n    Returns:\n        The inputs of the step after the `input_mappings` and if they are required or\n        not.\n    \"\"\"\n    if isinstance(self.inputs, list):\n        return {\n            self.input_mappings.get(input, input): True for input in self.inputs\n        }\n\n    return {\n        self.input_mappings.get(input, input): required\n        for input, required in self.inputs.items()\n    }\n
"},{"location":"api/step/#distilabel.steps.base._Step.get_outputs","title":"get_outputs()","text":"

Gets the outputs of the step after the output_mappings. This method is meant to be used to run validations on the outputs of the step.

Returns:

Type Description Dict[str, bool]

The outputs of the step after the output_mappings and whether they are required or not.

Source code in src/distilabel/steps/base.py
def get_outputs(self) -> Dict[str, bool]:\n    \"\"\"Gets the outputs of the step after the `outputs_mappings`. This method is\n    meant to be used to run validations on the outputs of the step.\n\n    Returns:\n        The outputs of the step after the `outputs_mappings` and if they are required\n        or not.\n    \"\"\"\n    if isinstance(self.outputs, list):\n        return {\n            self.output_mappings.get(output, output): True\n            for output in self.outputs\n        }\n\n    return {\n        self.output_mappings.get(output, output): required\n        for output, required in self.outputs.items()\n    }\n
"},{"location":"api/step/#distilabel.steps.base._Step.set_pipeline_artifacts_path","title":"set_pipeline_artifacts_path(path)","text":"

Sets the _pipeline_artifacts_path attribute. This method is meant to be used by the Pipeline once the cache location is known.

Parameters:

Name Type Description Default path Path

the path where the artifacts generated by the pipeline steps should be saved.

required Source code in src/distilabel/steps/base.py
def set_pipeline_artifacts_path(self, path: Path) -> None:\n    \"\"\"Sets the `_pipeline_artifacts_path` attribute. This method is meant to be used\n    by the `Pipeline` once the cache location is known.\n\n    Args:\n        path: the path where the artifacts generated by the pipeline steps should be\n            saved.\n    \"\"\"\n    self._pipeline_artifacts_path = path\n
"},{"location":"api/step/#distilabel.steps.base._Step.save_artifact","title":"save_artifact(name, write_function, metadata=None)","text":"

Saves an artifact generated by the Step.

Parameters:

Name Type Description Default name str

the name of the artifact.

required write_function Callable[[Path], None]

a function that will receive the path where the artifact should be saved.

required metadata Optional[Dict[str, Any]]

the artifact metadata. Defaults to None.

None Source code in src/distilabel/steps/base.py
def save_artifact(\n    self,\n    name: str,\n    write_function: Callable[[Path], None],\n    metadata: Optional[Dict[str, Any]] = None,\n) -> None:\n    \"\"\"Saves an artifact generated by the `Step`.\n\n    Args:\n        name: the name of the artifact.\n        write_function: a function that will receive the path where the artifact should\n            be saved.\n        metadata: the artifact metadata. Defaults to `None`.\n    \"\"\"\n    if self.artifacts_directory is None:\n        self._logger.warning(\n            f\"Cannot save artifact with '{name}' as `_pipeline_artifacts_path` is not\"\n            \" set. This is normal if the `Step` is being executed as a standalone component.\"\n        )\n        return\n\n    artifact_directory_path = self.artifacts_directory / name\n    artifact_directory_path.mkdir(parents=True, exist_ok=True)\n\n    self._logger.info(f\"\ud83c\udffa Storing '{name}' generated artifact...\")\n\n    self._logger.debug(\n        f\"Calling `write_function` to write artifact in '{artifact_directory_path}'...\"\n    )\n    write_function(artifact_directory_path)\n\n    metadata_path = artifact_directory_path / \"metadata.json\"\n    self._logger.debug(\n        f\"Calling `write_json` to write artifact metadata in '{metadata_path}'...\"\n    )\n    write_json(filename=metadata_path, data=metadata or {})\n
"},{"location":"api/step/#distilabel.steps.base._Step.impute_step_outputs","title":"impute_step_outputs(step_output)","text":"

Imputes the output columns of the step that are not present in the step output.

Source code in src/distilabel/steps/base.py
def impute_step_outputs(\n    self, step_output: List[Dict[str, Any]]\n) -> List[Dict[str, Any]]:\n    \"\"\"\n    Imputes the output columns of the step that are not present in the step output.\n    \"\"\"\n    result = []\n    for row in step_output:\n        data = row.copy()\n        for output in self.get_outputs().keys():\n            data[output] = None\n        result.append(data)\n    return result\n
"},{"location":"api/step/#distilabel.steps.base.Step","title":"Step","text":"

Bases: _Step, ABC

Base class for the steps that can be included in a Pipeline.

Attributes:

Name Type Description input_batch_size RuntimeParameter[PositiveInt]

The number of rows that each batch processed by the step will contain. Defaults to 50.

Runtime parameters
  • input_batch_size: The number of rows that each batch processed by the step will contain. Defaults to 50.
Source code in src/distilabel/steps/base.py
class Step(_Step, ABC):\n    \"\"\"Base class for the steps that can be included in a `Pipeline`.\n\n    Attributes:\n        input_batch_size: The number of rows that will contain the batches processed by\n            the step. Defaults to `50`.\n\n    Runtime parameters:\n        - `input_batch_size`: The number of rows that will contain the batches processed\n            by the step. Defaults to `50`.\n    \"\"\"\n\n    input_batch_size: RuntimeParameter[PositiveInt] = Field(\n        default=DEFAULT_INPUT_BATCH_SIZE,\n        description=\"The number of rows that will contain the batches processed by the\"\n        \" step.\",\n    )\n\n    @abstractmethod\n    def process(self, *inputs: StepInput) -> \"StepOutput\":\n        \"\"\"Method that defines the processing logic of the step. It should yield the\n        output rows.\n\n        Args:\n            *inputs: An argument used to receive the outputs of the previous steps. The\n                number of arguments depends on the number of previous steps. It doesn't\n                need to be an `*args` argument, it can be a regular argument annotated\n                with `StepInput` if the step has only one previous step.\n        \"\"\"\n        pass\n\n    def process_applying_mappings(self, *args: List[Dict[str, Any]]) -> \"StepOutput\":\n        \"\"\"Runs the `process` method of the step applying the `input_mappings` to the input\n        rows and the `outputs_mappings` to the output rows. 
This is the function that\n        should be used to run the processing logic of the step.\n\n        Yields:\n            The output rows.\n        \"\"\"\n\n        inputs, overriden_inputs = (\n            self._apply_input_mappings(args)\n            if self.input_mappings\n            else (args, [{} for _ in range(len(args[0]))])\n        )\n\n        # If the `Step` was built using the `@step` decorator, then we need to pass\n        # the runtime parameters as kwargs, so they can be used within the processing\n        # function\n        generator = (\n            self.process(*inputs)\n            if not self._built_from_decorator\n            else self.process(*inputs, **self._runtime_parameters)\n        )\n\n        for output_rows in generator:\n            restored = []\n            for i, row in enumerate(output_rows):\n                # Correct the index here because we don't know the num_generations from the llm\n                # ahead of time. For example, if we have `len(overriden_inputs)==5` and `len(row)==10`,\n                # from `num_generations==2` and `group_generations=False` in the LLM:\n                # The loop will use indices 0, 1, 2, 3, 4, 0, 1, 2, 3, 4\n                ntimes_i = i % len(overriden_inputs)\n                restored.append(\n                    self._apply_mappings_and_restore_overriden(\n                        row, overriden_inputs[ntimes_i]\n                    )\n                )\n            yield restored\n\n    def _apply_input_mappings(\n        self, inputs: Tuple[List[Dict[str, Any]], ...]\n    ) -> Tuple[Tuple[List[Dict[str, Any]], ...], List[Dict[str, Any]]]:\n        \"\"\"Applies the `input_mappings` to the input rows.\n\n        Args:\n            inputs: The input rows.\n\n        Returns:\n            The input rows with the `input_mappings` applied and the overriden values\n                that were replaced by the `input_mappings`.\n        \"\"\"\n        reverted_input_mappings = {v: k for 
k, v in self.input_mappings.items()}\n\n        renamed_inputs = []\n        overriden_inputs = []\n        for i, row_inputs in enumerate(inputs):\n            renamed_row_inputs = []\n            for row in row_inputs:\n                overriden_keys = {}\n                renamed_row = {}\n                for k, v in row.items():\n                    renamed_key = reverted_input_mappings.get(k, k)\n\n                    if renamed_key not in renamed_row or k != renamed_key:\n                        renamed_row[renamed_key] = v\n\n                        if k != renamed_key and renamed_key in row and len(inputs) == 1:\n                            overriden_keys[renamed_key] = row[renamed_key]\n\n                if i == 0:\n                    overriden_inputs.append(overriden_keys)\n                renamed_row_inputs.append(renamed_row)\n            renamed_inputs.append(renamed_row_inputs)\n        return tuple(renamed_inputs), overriden_inputs\n\n    def _apply_mappings_and_restore_overriden(\n        self, row: Dict[str, Any], overriden: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"Reverts the `input_mappings` applied to the input rows and applies the `output_mappings`\n        to the output rows. 
In addition, it restores the overriden values that were replaced\n        by the `input_mappings`.\n\n        Args:\n            row: The output row.\n            overriden: The overriden values that were replaced by the `input_mappings`.\n\n        Returns:\n            The output row with the `output_mappings` applied and the overriden values\n            restored.\n        \"\"\"\n        result = {}\n        for k, v in row.items():\n            mapped_key = (\n                self.output_mappings.get(k, None)\n                or self.input_mappings.get(k, None)\n                or k\n            )\n            result[mapped_key] = v\n\n        # Restore overriden values\n        for k, v in overriden.items():\n            if k not in result:\n                result[k] = v\n\n        return result\n
"},{"location":"api/step/#distilabel.steps.base.Step.process","title":"process(*inputs) abstractmethod","text":"

Method that defines the processing logic of the step. It should yield the output rows.

Parameters:

Name Type Description Default *inputs StepInput

An argument used to receive the outputs of the previous steps. The number of arguments depends on the number of previous steps. It doesn't need to be an *args argument, it can be a regular argument annotated with StepInput if the step has only one previous step.

() Source code in src/distilabel/steps/base.py
@abstractmethod\ndef process(self, *inputs: StepInput) -> \"StepOutput\":\n    \"\"\"Method that defines the processing logic of the step. It should yield the\n    output rows.\n\n    Args:\n        *inputs: An argument used to receive the outputs of the previous steps. The\n            number of arguments depends on the number of previous steps. It doesn't\n            need to be an `*args` argument, it can be a regular argument annotated\n            with `StepInput` if the step has only one previous step.\n    \"\"\"\n    pass\n
"},{"location":"api/step/#distilabel.steps.base.Step.process_applying_mappings","title":"process_applying_mappings(*args)","text":"

Runs the process method of the step applying the input_mappings to the input rows and the output_mappings to the output rows. This is the function that should be used to run the processing logic of the step.

Yields:

Type Description StepOutput

The output rows.

Source code in src/distilabel/steps/base.py
def process_applying_mappings(self, *args: List[Dict[str, Any]]) -> \"StepOutput\":\n    \"\"\"Runs the `process` method of the step applying the `input_mappings` to the input\n    rows and the `outputs_mappings` to the output rows. This is the function that\n    should be used to run the processing logic of the step.\n\n    Yields:\n        The output rows.\n    \"\"\"\n\n    inputs, overriden_inputs = (\n        self._apply_input_mappings(args)\n        if self.input_mappings\n        else (args, [{} for _ in range(len(args[0]))])\n    )\n\n    # If the `Step` was built using the `@step` decorator, then we need to pass\n    # the runtime parameters as kwargs, so they can be used within the processing\n    # function\n    generator = (\n        self.process(*inputs)\n        if not self._built_from_decorator\n        else self.process(*inputs, **self._runtime_parameters)\n    )\n\n    for output_rows in generator:\n        restored = []\n        for i, row in enumerate(output_rows):\n            # Correct the index here because we don't know the num_generations from the llm\n            # ahead of time. For example, if we have `len(overriden_inputs)==5` and `len(row)==10`,\n            # from `num_generations==2` and `group_generations=False` in the LLM:\n            # The loop will use indices 0, 1, 2, 3, 4, 0, 1, 2, 3, 4\n            ntimes_i = i % len(overriden_inputs)\n            restored.append(\n                self._apply_mappings_and_restore_overriden(\n                    row, overriden_inputs[ntimes_i]\n                )\n            )\n        yield restored\n
"},{"location":"api/step/decorator/","title":"@step","text":"

This section contains the reference for the @step decorator, used to create new Step subclasses without having to manually define the class.

For more information check the Tutorial - Step page.

"},{"location":"api/step/decorator/#distilabel.steps.decorator","title":"decorator","text":""},{"location":"api/step/decorator/#distilabel.steps.decorator.step","title":"step(inputs=None, outputs=None, step_type='normal')","text":"
step(inputs: Union[StepColumns, None] = None, outputs: Union[StepColumns, None] = None, step_type: Literal['normal'] = 'normal') -> Callable[..., Type[Step]]\n
step(inputs: Union[StepColumns, None] = None, outputs: Union[StepColumns, None] = None, step_type: Literal['global'] = 'global') -> Callable[..., Type[GlobalStep]]\n
step(inputs: None = None, outputs: Union[StepColumns, None] = None, step_type: Literal['generator'] = 'generator') -> Callable[..., Type[GeneratorStep]]\n

Creates a Step from a processing function.

Parameters:

Name Type Description Default inputs Union[StepColumns, None]

a list containing the names of the input columns/keys required by the step, or a dictionary where the keys are the columns and the values are booleans indicating whether the column is required or not. If not provided the default will be an empty list [] and it will be assumed that the step doesn't need any specific columns. Defaults to None.

None outputs Union[StepColumns, None]

a list containing the names of the output columns/keys generated by the step, or a dictionary where the keys are the columns and the values are booleans indicating whether the column will be generated or not. If not provided the default will be an empty list [] and it will be assumed that the step doesn't generate any specific columns. Defaults to None.

None step_type Literal['normal', 'global', 'generator']

the kind of step to create. Valid choices are: \"normal\" (Step), \"global\" (GlobalStep) or \"generator\" (GeneratorStep). Defaults to \"normal\".

'normal'

Returns:

Type Description Callable[..., Type[_Step]]

A callable that will generate the type given the processing function.

Example:

# Normal step\n@step(inputs=[\"instruction\"], outputs=[\"generation\"])\ndef GenerationStep(inputs: StepInput, dummy_generation: RuntimeParameter[str]) -> StepOutput:\n    for input in inputs:\n        input[\"generation\"] = dummy_generation\n    yield inputs\n\n# Global step\n@step(inputs=[\"instruction\"], step_type=\"global\")\ndef FilteringStep(inputs: StepInput, max_length: RuntimeParameter[int] = 256) -> StepOutput:\n    yield [\n        input\n        for input in inputs\n        if len(input[\"instruction\"]) <= max_length\n    ]\n\n# Generator step\n@step(outputs=[\"num\"], step_type=\"generator\")\ndef RowGenerator(num_rows: RuntimeParameter[int] = 500) -> GeneratorStepOutput:\n    data = list(range(num_rows))\n    for i in range(0, len(data), 100):\n        last_batch = i + 100 >= len(data)\n        yield [{\"num\": num} for num in data[i : i + 100]], last_batch\n
Source code in src/distilabel/steps/decorator.py
def step(\n    inputs: Union[\"StepColumns\", None] = None,\n    outputs: Union[\"StepColumns\", None] = None,\n    step_type: Literal[\"normal\", \"global\", \"generator\"] = \"normal\",\n) -> Callable[..., Type[\"_Step\"]]:\n    \"\"\"Creates an `Step` from a processing function.\n\n    Args:\n        inputs: a list containing the name of the inputs columns/keys or a dictionary\n            where the keys are the columns and the values are booleans indicating whether\n            the column is required or not, that are required by the step. If not provided\n            the default will be an empty list `[]` and it will be assumed that the step\n            doesn't need any specific columns. Defaults to `None`.\n        outputs: a list containing the name of the outputs columns/keys or a dictionary\n            where the keys are the columns and the values are booleans indicating whether\n            the column will be generated or not. If not provided the default will be an\n            empty list `[]` and it will be assumed that the step doesn't need any specific\n            columns. Defaults to `None`.\n        step_type: the kind of step to create. Valid choices are: \"normal\" (`Step`),\n            \"global\" (`GlobalStep`) or \"generator\" (`GeneratorStep`). 
Defaults to\n            `\"normal\"`.\n\n    Returns:\n        A callable that will generate the type given the processing function.\n\n    Example:\n\n    ```python\n    # Normal step\n    @step(inputs=[\"instruction\"], outputs=[\"generation\"])\n    def GenerationStep(inputs: StepInput, dummy_generation: RuntimeParameter[str]) -> StepOutput:\n        for input in inputs:\n            input[\"generation\"] = dummy_generation\n        yield inputs\n\n    # Global step\n    @step(inputs=[\"instruction\"], step_type=\"global\")\n    def FilteringStep(inputs: StepInput, max_length: RuntimeParameter[int] = 256) -> StepOutput:\n        yield [\n            input\n            for input in inputs\n            if len(input[\"instruction\"]) <= max_length\n        ]\n\n    # Generator step\n    @step(outputs=[\"num\"], step_type=\"generator\")\n    def RowGenerator(num_rows: RuntimeParameter[int] = 500) -> GeneratorStepOutput:\n        data = list(range(num_rows))\n        for i in range(0, len(data), 100):\n            last_batch = i + 100 >= len(data)\n            yield [{\"num\": num} for num in data[i : i + 100]], last_batch\n    ```\n    \"\"\"\n\n    inputs = inputs or []\n    outputs = outputs or []\n\n    def decorator(func: ProcessingFunc) -> Type[\"_Step\"]:\n        if step_type not in _STEP_MAPPING:\n            raise ValueError(\n                f\"Invalid step type '{step_type}'. Please, review the '{func.__name__}'\"\n                \" function decorated with the `@step` decorator and provide a valid\"\n                \" `step_type`. 
Valid choices are: 'normal', 'global' or 'generator'.\"\n            )\n\n        BaseClass = _STEP_MAPPING[step_type]\n\n        signature = inspect.signature(func)\n\n        runtime_parameters = {\n            name: (\n                param.annotation,\n                param.default if param.default != param.empty else None,\n            )\n            for name, param in signature.parameters.items()\n        }\n\n        runtime_parameters = {}\n        step_input_parameter = None\n        for name, param in signature.parameters.items():\n            if is_parameter_annotated_with(param, _RUNTIME_PARAMETER_ANNOTATION):\n                runtime_parameters[name] = (\n                    param.annotation,\n                    param.default if param.default != param.empty else None,\n                )\n\n            if not step_type == \"generator\" and is_parameter_annotated_with(\n                param, _STEP_INPUT_ANNOTATION\n            ):\n                if step_input_parameter is not None:\n                    raise ValueError(\n                        f\"Function '{func.__name__}' has more than one parameter annotated\"\n                        f\" with `StepInput`. 
Please, review the '{func.__name__}' function\"\n                        \" decorated with the `@step` decorator and provide only one\"\n                        \" argument annotated with `StepInput`.\"\n                    )\n                step_input_parameter = param\n\n        RuntimeParametersModel = create_model(  # type: ignore\n            \"RuntimeParametersModel\",\n            **runtime_parameters,  # type: ignore\n        )\n\n        def inputs_property(self) -> \"StepColumns\":\n            return inputs\n\n        def outputs_property(self) -> \"StepColumns\":\n            return outputs\n\n        def process(\n            self, *args: Any, **kwargs: Any\n        ) -> Union[\"StepOutput\", \"GeneratorStepOutput\"]:\n            return func(*args, **kwargs)\n\n        return type(  # type: ignore\n            func.__name__,\n            (\n                BaseClass,\n                RuntimeParametersModel,\n            ),\n            {\n                \"process\": process,\n                \"inputs\": property(inputs_property),\n                \"outputs\": property(outputs_property),\n                \"__module__\": func.__module__,\n                \"__doc__\": func.__doc__,\n                \"_built_from_decorator\": True,\n                # Override the `get_process_step_input` method to return the parameter\n                # of the original function annotated with `StepInput`.\n                \"get_process_step_input\": lambda self: step_input_parameter,\n            },\n        )\n\n    return decorator\n
"},{"location":"api/step/generator_step/","title":"GeneratorStep","text":"

This section contains the API reference for the GeneratorStep class.

For more information and examples on how to use existing generator steps or create custom ones, please refer to Tutorial - Step - GeneratorStep.

"},{"location":"api/step/generator_step/#distilabel.steps.base.GeneratorStep","title":"GeneratorStep","text":"

Bases: _Step, ABC

A special kind of Step that is able to generate data i.e. it doesn't receive any input from the previous steps.

Attributes:

Name Type Description batch_size RuntimeParameter[int]

The number of rows contained in each batch generated by the step. Defaults to 50.

Runtime parameters
  • batch_size: The number of rows contained in each batch generated by the step. Defaults to 50.
Source code in src/distilabel/steps/base.py
class GeneratorStep(_Step, ABC):\n    \"\"\"A special kind of `Step` that is able to generate data i.e. it doesn't receive\n    any input from the previous steps.\n\n    Attributes:\n        batch_size: The number of rows that will contain the batches generated by the\n            step. Defaults to `50`.\n\n    Runtime parameters:\n        - `batch_size`: The number of rows that will contain the batches generated by\n            the step. Defaults to `50`.\n    \"\"\"\n\n    batch_size: RuntimeParameter[int] = Field(\n        default=50,\n        description=\"The number of rows that will contain the batches generated by the\"\n        \" step.\",\n    )\n\n    @abstractmethod\n    def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n        \"\"\"Method that defines the generation logic of the step. It should yield the\n        output rows and a boolean indicating if it's the last batch or not.\n\n        Args:\n            offset: The offset to start the generation from. Defaults to 0.\n\n        Yields:\n            The output rows and a boolean indicating if it's the last batch or not.\n        \"\"\"\n        pass\n\n    def process_applying_mappings(self, offset: int = 0) -> \"GeneratorStepOutput\":\n        \"\"\"Runs the `process` method of the step applying the `outputs_mappings` to the\n        output rows. This is the function that should be used to run the generation logic\n        of the step.\n\n        Args:\n            offset: The offset to start the generation from. 
Defaults to 0.\n\n        Yields:\n            The output rows and a boolean indicating if it's the last batch or not.\n        \"\"\"\n\n        # If the `Step` was built using the `@step` decorator, then we need to pass\n        # the runtime parameters as `kwargs`, so they can be used within the processing\n        # function\n        generator = (\n            self.process(offset=offset)\n            if not self._built_from_decorator\n            else self.process(offset=offset, **self._runtime_parameters)\n        )\n\n        for output_rows, last_batch in generator:\n            yield (\n                [\n                    {self.output_mappings.get(k, k): v for k, v in row.items()}\n                    for row in output_rows\n                ],\n                last_batch,\n            )\n
"},{"location":"api/step/generator_step/#distilabel.steps.base.GeneratorStep.process","title":"process(offset=0) abstractmethod","text":"

Method that defines the generation logic of the step. It should yield the output rows and a boolean indicating if it's the last batch or not.

Parameters:

Name Type Description Default offset int

The offset to start the generation from. Defaults to 0.

0

Yields:

Type Description GeneratorStepOutput

The output rows and a boolean indicating if it's the last batch or not.

Source code in src/distilabel/steps/base.py
@abstractmethod\ndef process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n    \"\"\"Method that defines the generation logic of the step. It should yield the\n    output rows and a boolean indicating if it's the last batch or not.\n\n    Args:\n        offset: The offset to start the generation from. Defaults to 0.\n\n    Yields:\n        The output rows and a boolean indicating if it's the last batch or not.\n    \"\"\"\n    pass\n
"},{"location":"api/step/generator_step/#distilabel.steps.base.GeneratorStep.process_applying_mappings","title":"process_applying_mappings(offset=0)","text":"

Runs the process method of the step applying the output_mappings to the output rows. This is the function that should be used to run the generation logic of the step.

Parameters:

Name Type Description Default offset int

The offset to start the generation from. Defaults to 0.

0

Yields:

Type Description GeneratorStepOutput

The output rows and a boolean indicating if it's the last batch or not.

Source code in src/distilabel/steps/base.py
def process_applying_mappings(self, offset: int = 0) -> \"GeneratorStepOutput\":\n    \"\"\"Runs the `process` method of the step applying the `outputs_mappings` to the\n    output rows. This is the function that should be used to run the generation logic\n    of the step.\n\n    Args:\n        offset: The offset to start the generation from. Defaults to 0.\n\n    Yields:\n        The output rows and a boolean indicating if it's the last batch or not.\n    \"\"\"\n\n    # If the `Step` was built using the `@step` decorator, then we need to pass\n    # the runtime parameters as `kwargs`, so they can be used within the processing\n    # function\n    generator = (\n        self.process(offset=offset)\n        if not self._built_from_decorator\n        else self.process(offset=offset, **self._runtime_parameters)\n    )\n\n    for output_rows, last_batch in generator:\n        yield (\n            [\n                {self.output_mappings.get(k, k): v for k, v in row.items()}\n                for row in output_rows\n            ],\n            last_batch,\n        )\n
"},{"location":"api/step/generator_step/#distilabel.steps.generators.utils.make_generator_step","title":"make_generator_step(dataset, pipeline=None, batch_size=50, input_mappings=None, output_mappings=None, resources=StepResources(), repo_id='default_name')","text":"

Helper method to create a GeneratorStep from a dataset, to simplify using that dataset as the input of a Pipeline.

Parameters:

Name Type Description Default dataset Union[Dataset, DataFrame, List[Dict[str, str]]]

The dataset to use in the Pipeline.

required batch_size int

The batch size. It uses the same default value as the GeneratorSteps. Defaults to 50.

50 input_mappings Optional[Dict[str, str]]

Applies the same as any other step. Defaults to None.

None output_mappings Optional[Dict[str, str]]

Applies the same as any other step. Defaults to None.

None resources StepResources

Applies the same as any other step. Defaults to StepResources().

StepResources() repo_id Optional[str]

The repository ID to use in the LoadDataFromHub step. This shouldn't be necessary, but in case of error, the step will try to load the dataset using load_dataset internally. In that case, the repo_id will be used.

'default_name'

Raises:

Type Description ValueError

If the format is different from the ones supported.

Returns:

Type Description GeneratorStep

A LoadDataFromDicts if the input is a list of dicts, or LoadDataFromHub instance

GeneratorStep

if the input is a pd.DataFrame or a Dataset.

Source code in src/distilabel/steps/generators/utils.py
def make_generator_step(\n    dataset: Union[Dataset, pd.DataFrame, List[Dict[str, str]]],\n    pipeline: Union[\"BasePipeline\", None] = None,\n    batch_size: int = 50,\n    input_mappings: Optional[Dict[str, str]] = None,\n    output_mappings: Optional[Dict[str, str]] = None,\n    resources: StepResources = StepResources(),\n    repo_id: Optional[str] = \"default_name\",\n) -> \"GeneratorStep\":\n    \"\"\"Helper method to create a `GeneratorStep` from a dataset, to simplify\n\n    Args:\n        dataset: The dataset to use in the `Pipeline`.\n        batch_size: The batch_size, will default to the same used by the `GeneratorStep`s.\n            Defaults to `50`.\n        input_mappings: Applies the same as any other step. Defaults to `None`.\n        output_mappings: Applies the same as any other step. Defaults to `None`.\n        resources: Applies the same as any other step. Defaults to `StepResources()`.\n        repo_id: The repository ID to use in the `LoadDataFromHub` step.\n            This shouldn't be necessary, but in case of error, the dataset will try to be loaded\n            using `load_dataset` internally. 
If that case happens, the `repo_id` will be used.\n\n    Raises:\n        ValueError: If the format is different from the ones supported.\n\n    Returns:\n        A `LoadDataFromDicts` if the input is a list of dicts, or `LoadDataFromHub` instance\n        if the input is a `pd.DataFrame` or a `Dataset`.\n    \"\"\"\n    from distilabel.steps import LoadDataFromDicts, LoadDataFromHub\n\n    if isinstance(dataset, list):\n        return LoadDataFromDicts(\n            pipeline=pipeline,\n            data=dataset,\n            batch_size=batch_size,\n            input_mappings=input_mappings or {},\n            output_mappings=output_mappings or {},\n            resources=resources,\n        )\n\n    if isinstance(dataset, pd.DataFrame):\n        dataset = Dataset.from_pandas(dataset, preserve_index=False)\n\n    if not isinstance(dataset, Dataset):\n        raise DistilabelUserError(\n            f\"Dataset type not allowed: {type(dataset)}, must be one of: \"\n            \"`datasets.Dataset`, `pd.DataFrame`, `List[Dict[str, str]]`\",\n            page=\"sections/how_to_guides/basic/pipeline/?h=make_#__tabbed_1_2\",\n        )\n\n    loader = LoadDataFromHub(\n        pipeline=pipeline,\n        repo_id=repo_id,\n        batch_size=batch_size,\n        input_mappings=input_mappings or {},\n        output_mappings=output_mappings or {},\n        resources=resources,\n    )\n    super(loader.__class__, loader).load()  # Ensure the logger is loaded\n    loader._dataset = dataset\n    loader.num_examples = len(dataset)\n    loader._dataset_info = {\"default\": dataset.info}\n    return loader\n
"},{"location":"api/step/global_step/","title":"GlobalStep","text":"

This section contains the API reference for the GlobalStep class.

For more information and examples on how to use existing global steps or create custom ones, please refer to Tutorial - Step - GlobalStep.

"},{"location":"api/step/global_step/#distilabel.steps.base.GlobalStep","title":"GlobalStep","text":"

Bases: Step, ABC

A special kind of Step whose process method receives all the data processed by its previous steps at once, instead of receiving it in batches. This kind of step is useful when the processing logic requires having all the data at once, for example to train a model, to perform a global aggregation, etc.

Source code in src/distilabel/steps/base.py
class GlobalStep(Step, ABC):\n    \"\"\"A special kind of `Step` which it's `process` method receives all the data processed\n    by their previous steps at once, instead of receiving it in batches. This kind of steps\n    are useful when the processing logic requires to have all the data at once, for example\n    to train a model, to perform a global aggregation, etc.\n    \"\"\"\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        return []\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        return []\n
"},{"location":"api/step/resources/","title":"StepResources","text":""},{"location":"api/step/resources/#distilabel.steps.base.StepResources","title":"StepResources","text":"

Bases: RuntimeParametersMixin, BaseModel

A class to define the resources assigned to a _Step.

Attributes:

Name Type Description replicas RuntimeParameter[PositiveInt]

The number of replicas for the step.

cpus Optional[RuntimeParameter[PositiveInt]]

The number of CPUs assigned to each step replica.

gpus Optional[RuntimeParameter[PositiveInt]]

The number of GPUs assigned to each step replica.

memory Optional[RuntimeParameter[PositiveInt]]

The memory in bytes required for each step replica.

resources Optional[RuntimeParameter[Dict[str, int]]]

A dictionary containing the names of custom resources and the number of those resources required for each step replica.

Source code in src/distilabel/steps/base.py
class StepResources(RuntimeParametersMixin, BaseModel):\n    \"\"\"A class to define the resources assigned to a `_Step`.\n\n    Attributes:\n        replicas: The number of replicas for the step.\n        cpus: The number of CPUs assigned to each step replica.\n        gpus: The number of GPUs assigned to each step replica.\n        memory: The memory in bytes required for each step replica.\n        resources: A dictionary containing the number of custom resources required for\n            each step replica.\n    \"\"\"\n\n    replicas: RuntimeParameter[PositiveInt] = Field(\n        default=1, description=\"The number of replicas for the step.\"\n    )\n    cpus: Optional[RuntimeParameter[PositiveInt]] = Field(\n        default=None, description=\"The number of CPUs assigned to each step replica.\"\n    )\n    gpus: Optional[RuntimeParameter[PositiveInt]] = Field(\n        default=None, description=\"The number of GPUs assigned to each step replica.\"\n    )\n    memory: Optional[RuntimeParameter[PositiveInt]] = Field(\n        default=None, description=\"The memory in bytes required for each step replica.\"\n    )\n    resources: Optional[RuntimeParameter[Dict[str, int]]] = Field(\n        default=None,\n        description=\"A dictionary containing names of custom resources and the\"\n        \" number of those resources required for each step replica.\",\n    )\n
"},{"location":"api/step/typing/","title":"Step Typing","text":""},{"location":"api/step/typing/#distilabel.steps.typing","title":"typing","text":""},{"location":"api/step/typing/#distilabel.steps.typing.StepOutput","title":"StepOutput = Iterator[List[Dict[str, Any]]] module-attribute","text":"

StepOutput is an alias of the typing Iterator[List[Dict[str, Any]]]

"},{"location":"api/step/typing/#distilabel.steps.typing.GeneratorStepOutput","title":"GeneratorStepOutput = Iterator[Tuple[List[Dict[str, Any]], bool]] module-attribute","text":"

GeneratorStepOutput is an alias of the typing Iterator[Tuple[List[Dict[str, Any]], bool]]

"},{"location":"api/step/typing/#distilabel.steps.typing.StepColumns","title":"StepColumns = Union[List[str], Dict[str, bool]] module-attribute","text":"

StepColumns is an alias of the typing Union[List[str], Dict[str, bool]] used by the inputs and outputs properties of a Step. In the case of a List[str], it is a list with the required columns. In the case of a Dict[str, bool], it is a dictionary where the keys are the columns and the values are booleans indicating whether the column is required or not.

"},{"location":"api/step_gallery/argilla/","title":"Argilla","text":"

This section contains the existing steps integrated with Argilla so as to easily push the generated datasets to Argilla.

"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base","title":"base","text":""},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base.ArgillaBase","title":"ArgillaBase","text":"

Bases: Step, ABC

Abstract step that provides a class to subclass from, that contains the boilerplate code required to interact with Argilla, as well as some extra validations on top of it. It also defines the abstract methods that need to be implemented in order to add a new dataset type as a step.

Note

This class is not intended to be instantiated directly, but via a subclass.

Attributes:

Name Type Description dataset_name RuntimeParameter[str]

The name of the dataset in Argilla where the records will be added.

dataset_workspace Optional[RuntimeParameter[str]]

The workspace where the dataset will be created in Argilla. Defaults to None, which means it will be created in the default workspace.

api_url Optional[RuntimeParameter[str]]

The URL of the Argilla API. Defaults to None, which means it will be read from the ARGILLA_API_URL environment variable.

api_key Optional[RuntimeParameter[SecretStr]]

The API key to authenticate with Argilla. Defaults to None, which means it will be read from the ARGILLA_API_KEY environment variable.

Runtime parameters
  • dataset_name: The name of the dataset in Argilla where the records will be added.
  • dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to None, which means it will be created in the default workspace.
  • api_url: The base URL to use for the Argilla API requests.
  • api_key: The API key to authenticate the requests to the Argilla API.
Input columns
  • dynamic, based on the inputs value provided
Source code in src/distilabel/steps/argilla/base.py
class ArgillaBase(Step, ABC):\n    \"\"\"Abstract step that provides a class to subclass from, that contains the boilerplate code\n    required to interact with Argilla, as well as some extra validations on top of it. It also defines\n    the abstract methods that need to be implemented in order to add a new dataset type as a step.\n\n    Note:\n        This class is not intended to be instanced directly, but via subclass.\n\n    Attributes:\n        dataset_name: The name of the dataset in Argilla where the records will be added.\n        dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to\n            `None`, which means it will be created in the default workspace.\n        api_url: The URL of the Argilla API. Defaults to `None`, which means it will be read from\n            the `ARGILLA_API_URL` environment variable.\n        api_key: The API key to authenticate with Argilla. Defaults to `None`, which means it will\n            be read from the `ARGILLA_API_KEY` environment variable.\n\n    Runtime parameters:\n        - `dataset_name`: The name of the dataset in Argilla where the records will be\n            added.\n        - `dataset_workspace`: The workspace where the dataset will be created in Argilla.\n            Defaults to `None`, which means it will be created in the default workspace.\n        - `api_url`: The base URL to use for the Argilla API requests.\n        - `api_key`: The API key to authenticate the requests to the Argilla API.\n\n    Input columns:\n        - dynamic, based on the `inputs` value provided\n    \"\"\"\n\n    dataset_name: RuntimeParameter[str] = Field(\n        default=None, description=\"The name of the dataset in Argilla.\"\n    )\n    dataset_workspace: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The workspace where the dataset will be created in Argilla. 
Defaults \"\n        \"to `None` which means it will be created in the default workspace.\",\n    )\n\n    api_url: Optional[RuntimeParameter[str]] = Field(\n        default_factory=lambda: os.getenv(_ARGILLA_API_URL_ENV_VAR_NAME),\n        description=\"The base URL to use for the Argilla API requests.\",\n    )\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(_ARGILLA_API_KEY_ENV_VAR_NAME),\n        description=\"The API key to authenticate the requests to the Argilla API.\",\n    )\n\n    _client: Optional[\"Argilla\"] = PrivateAttr(...)\n    _dataset: Optional[\"Dataset\"] = PrivateAttr(...)\n\n    def model_post_init(self, __context: Any) -> None:\n        \"\"\"Checks that the Argilla Python SDK is installed, and then filters the Argilla warnings.\"\"\"\n        super().model_post_init(__context)\n\n        if importlib.util.find_spec(\"argilla\") is None:\n            raise ImportError(\n                \"Argilla is not installed. 
Please install it using `pip install argilla\"\n                \" --upgrade`.\"\n            )\n\n    def _client_init(self) -> None:\n        \"\"\"Initializes the Argilla API client with the provided `api_url` and `api_key`.\"\"\"\n        try:\n            self._client = rg.Argilla(  # type: ignore\n                api_url=self.api_url,\n                api_key=self.api_key.get_secret_value(),  # type: ignore\n                headers={\"Authorization\": f\"Bearer {os.environ['HF_TOKEN']}\"}\n                if isinstance(self.api_url, str)\n                and \"hf.space\" in self.api_url\n                and \"HF_TOKEN\" in os.environ\n                else {},\n            )\n        except Exception as e:\n            raise DistilabelUserError(\n                f\"Failed to initialize the Argilla API: {e}\",\n                page=\"sections/how_to_guides/advanced/argilla/\",\n            ) from e\n\n    @property\n    def _dataset_exists_in_workspace(self) -> bool:\n        \"\"\"Checks if the dataset already exists in Argilla in the provided workspace if any.\n\n        Returns:\n            `True` if the dataset exists, `False` otherwise.\n        \"\"\"\n        return (\n            self._client.datasets(  # type: ignore\n                name=self.dataset_name,  # type: ignore\n                workspace=self.dataset_workspace,\n            )\n            is not None\n        )\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The outputs of the step is an empty list, since the steps subclassing from this one, will\n        always be leaf nodes and won't propagate the inputs neither generate any outputs.\n        \"\"\"\n        return []\n\n    def load(self) -> None:\n        \"\"\"Method to perform any initialization logic before the `process` method is\n        called. 
For example, to load an LLM, stablish a connection to a database, etc.\n        \"\"\"\n        super().load()\n\n        if self.api_url is None or self.api_key is None:\n            raise DistilabelUserError(\n                \"`Argilla` step requires the `api_url` and `api_key` to be provided. Please,\"\n                \" provide those at step instantiation, via environment variables `ARGILLA_API_URL`\"\n                \" and `ARGILLA_API_KEY`, or as `Step` runtime parameters via `pipeline.run(parameters={...})`.\",\n                page=\"sections/how_to_guides/advanced/argilla/\",\n            )\n\n        self._client_init()\n\n    @property\n    @abstractmethod\n    def inputs(self) -> \"StepColumns\": ...\n\n    @abstractmethod\n    def process(self, *inputs: StepInput) -> \"StepOutput\": ...\n
"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base.ArgillaBase.outputs","title":"outputs: StepColumns property","text":"

The outputs of the step are an empty list, since the steps subclassing this one will always be leaf nodes and will neither propagate the inputs nor generate any outputs.

"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base.ArgillaBase.model_post_init","title":"model_post_init(__context)","text":"

Checks that the Argilla Python SDK is installed, raising an ImportError if it is not.

Source code in src/distilabel/steps/argilla/base.py
def model_post_init(self, __context: Any) -> None:\n    \"\"\"Checks that the Argilla Python SDK is installed, and then filters the Argilla warnings.\"\"\"\n    super().model_post_init(__context)\n\n    if importlib.util.find_spec(\"argilla\") is None:\n        raise ImportError(\n            \"Argilla is not installed. Please install it using `pip install argilla\"\n            \" --upgrade`.\"\n        )\n
"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base.ArgillaBase.load","title":"load()","text":"

Method to perform any initialization logic before the process method is called. For example, to load an LLM, establish a connection to a database, etc.

Source code in src/distilabel/steps/argilla/base.py
def load(self) -> None:\n    \"\"\"Method to perform any initialization logic before the `process` method is\n    called. For example, to load an LLM, stablish a connection to a database, etc.\n    \"\"\"\n    super().load()\n\n    if self.api_url is None or self.api_key is None:\n        raise DistilabelUserError(\n            \"`Argilla` step requires the `api_url` and `api_key` to be provided. Please,\"\n            \" provide those at step instantiation, via environment variables `ARGILLA_API_URL`\"\n            \" and `ARGILLA_API_KEY`, or as `Step` runtime parameters via `pipeline.run(parameters={...})`.\",\n            page=\"sections/how_to_guides/advanced/argilla/\",\n        )\n\n    self._client_init()\n
"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference","title":"preference","text":""},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla","title":"PreferenceToArgilla","text":"

Bases: ArgillaBase

Creates a preference dataset in Argilla.

Step that creates a dataset in Argilla during the load phase, and then pushes the input batches into it as records. This dataset is a preference dataset, where there's one field for the instruction and one extra field per generation within the same record, and then a rating question for each of the generation fields. The rating question asks the annotator to set a rating from 1 to 5 for each of the provided generations.

Note

This step is meant to be used in conjunction with the UltraFeedback step, or any other step generating both ratings and responses for a given set of instruction and generations for the given instruction. But alternatively, it can also be used with any other task or step generating only the instruction and generations, as the ratings and rationales are optional.

Attributes:

Name Type Description num_generations int

The number of generations to include in the dataset.

dataset_name RuntimeParameter[str]

The name of the dataset in Argilla.

dataset_workspace Optional[RuntimeParameter[str]]

The workspace where the dataset will be created in Argilla. Defaults to None, which means it will be created in the default workspace.

api_url Optional[RuntimeParameter[str]]

The URL of the Argilla API. Defaults to None, which means it will be read from the ARGILLA_API_URL environment variable.

api_key Optional[RuntimeParameter[SecretStr]]

The API key to authenticate with Argilla. Defaults to None, which means it will be read from the ARGILLA_API_KEY environment variable.

Runtime parameters
  • api_url: The base URL to use for the Argilla API requests.
  • api_key: The API key to authenticate the requests to the Argilla API.
Input columns
  • instruction (str): The instruction that was used to generate the completion.
  • generations (List[str]): The completions that were generated based on the input instruction.
  • ratings (List[str], optional): The ratings for the generations. If not provided, the generated ratings won't be pushed to Argilla.
  • rationales (List[str], optional): The rationales for the ratings. If not provided, the generated rationales won't be pushed to Argilla.

Examples:

Push a preference dataset to an Argilla instance:

from distilabel.steps import PreferenceToArgilla\n\nto_argilla = PreferenceToArgilla(\n    num_generations=2,\n    api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n    api_key=\"api.key\",\n    dataset_name=\"argilla_dataset\",\n    dataset_workspace=\"my_workspace\",\n)\nto_argilla.load()\n\nresult = next(\n    to_argilla.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"generations\": [\"first_generation\", \"second_generation\"],\n            }\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'instruction', 'generations': ['first_generation', 'second_generation']}]\n

It can also include ratings and rationales:

result = next(\n    to_argilla.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"generations\": [\"first_generation\", \"second_generation\"],\n                \"ratings\": [\"4\", \"5\"],\n                \"rationales\": [\"rationale for 4\", \"rationale for 5\"],\n            }\n        ],\n    )\n)\n# >>> result\n# [\n#     {\n#         'instruction': 'instruction',\n#         'generations': ['first_generation', 'second_generation'],\n#         'ratings': ['4', '5'],\n#         'rationales': ['rationale for 4', 'rationale for 5']\n#     }\n# ]\n
Source code in src/distilabel/steps/argilla/preference.py
class PreferenceToArgilla(ArgillaBase):\n    \"\"\"Creates a preference dataset in Argilla.\n\n    Step that creates a dataset in Argilla during the load phase, and then pushes the input\n    batches into it as records. This dataset is a preference dataset, where there's one field\n    for the instruction and one extra field per each generation within the same record, and then\n    a rating question per each of the generation fields. The rating question asks the annotator to\n    set a rating from 1 to 5 for each of the provided generations.\n\n    Note:\n        This step is meant to be used in conjunction with the `UltraFeedback` step, or any other step\n        generating both ratings and responses for a given set of instruction and generations for the\n        given instruction. But alternatively, it can also be used with any other task or step generating\n        only the `instruction` and `generations`, as the `ratings` and `rationales` are optional.\n\n    Attributes:\n        num_generations: The number of generations to include in the dataset.\n        dataset_name: The name of the dataset in Argilla.\n        dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to\n            `None`, which means it will be created in the default workspace.\n        api_url: The URL of the Argilla API. Defaults to `None`, which means it will be read from\n            the `ARGILLA_API_URL` environment variable.\n        api_key: The API key to authenticate with Argilla. 
Defaults to `None`, which means it will\n            be read from the `ARGILLA_API_KEY` environment variable.\n\n    Runtime parameters:\n        - `api_url`: The base URL to use for the Argilla API requests.\n        - `api_key`: The API key to authenticate the requests to the Argilla API.\n\n    Input columns:\n        - instruction (`str`): The instruction that was used to generate the completion.\n        - generations (`List[str]`): The completion that was generated based on the input instruction.\n        - ratings (`List[str]`, optional): The ratings for the generations. If not provided, the\n            generated ratings won't be pushed to Argilla.\n        - rationales (`List[str]`, optional): The rationales for the ratings. If not provided, the\n            generated rationales won't be pushed to Argilla.\n\n    Examples:\n        Push a preference dataset to an Argilla instance:\n\n        ```python\n        from distilabel.steps import PreferenceToArgilla\n\n        to_argilla = PreferenceToArgilla(\n            num_generations=2,\n            api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n            api_key=\"api.key\",\n            dataset_name=\"argilla_dataset\",\n            dataset_workspace=\"my_workspace\",\n        )\n        to_argilla.load()\n\n        result = next(\n            to_argilla.process(\n                [\n                    {\n                        \"instruction\": \"instruction\",\n                        \"generations\": [\"first_generation\", \"second_generation\"],\n                    }\n                ],\n            )\n        )\n        # >>> result\n        # [{'instruction': 'instruction', 'generations': ['first_generation', 'second_generation']}]\n        ```\n\n        It can also include ratings and rationales:\n\n        ```python\n        result = next(\n            to_argilla.process(\n                [\n                    {\n                        \"instruction\": \"instruction\",\n               
         \"generations\": [\"first_generation\", \"second_generation\"],\n                        \"ratings\": [\"4\", \"5\"],\n                        \"rationales\": [\"rationale for 4\", \"rationale for 5\"],\n                    }\n                ],\n            )\n        )\n        # >>> result\n        # [\n        #     {\n        #         'instruction': 'instruction',\n        #         'generations': ['first_generation', 'second_generation'],\n        #         'ratings': ['4', '5'],\n        #         'rationales': ['rationale for 4', 'rationale for 5']\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    num_generations: int\n\n    _id: str = PrivateAttr(default=\"id\")\n    _instruction: str = PrivateAttr(...)\n    _generations: str = PrivateAttr(...)\n    _ratings: str = PrivateAttr(...)\n    _rationales: str = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"Sets the `_instruction` and `_generations` attributes based on the `inputs_mapping`, otherwise\n        uses the default values; and then uses those values to create a `FeedbackDataset` suited for\n        the text-generation scenario. 
And then it pushes it to Argilla.\n        \"\"\"\n        super().load()\n\n        # Both `instruction` and `generations` will be used as the fields of the dataset\n        self._instruction = self.input_mappings.get(\"instruction\", \"instruction\")\n        self._generations = self.input_mappings.get(\"generations\", \"generations\")\n        # Both `ratings` and `rationales` will be used as suggestions to the default questions of the dataset\n        self._ratings = self.input_mappings.get(\"ratings\", \"ratings\")\n        self._rationales = self.input_mappings.get(\"rationales\", \"rationales\")\n\n        if self._dataset_exists_in_workspace:\n            _dataset = self._client.datasets(  # type: ignore\n                name=self.dataset_name,  # type: ignore\n                workspace=self.dataset_workspace,  # type: ignore\n            )\n\n            for field in _dataset.fields:\n                if not isinstance(field, rg.TextField):\n                    continue\n                if (\n                    field.name\n                    not in [self._id, self._instruction]  # type: ignore\n                    + [\n                        f\"{self._generations}-{idx}\"\n                        for idx in range(self.num_generations)\n                    ]\n                    and field.required\n                ):\n                    raise DistilabelUserError(\n                        f\"The dataset '{self.dataset_name}' in the workspace '{self.dataset_workspace}'\"\n                        f\" already exists, but contains at least a required field that is\"\n                        f\" neither `{self._id}`, `{self._instruction}`, nor `{self._generations}`\"\n                        f\" (one per generation starting from 0 up to {self.num_generations - 1}).\",\n                        page=\"components-gallery/steps/preferencetoargilla/\",\n                    )\n\n            self._dataset = _dataset\n        else:\n            _settings = 
rg.Settings(  # type: ignore\n                fields=[\n                    rg.TextField(name=self._id, title=self._id),  # type: ignore\n                    rg.TextField(name=self._instruction, title=self._instruction),  # type: ignore\n                    *self._generation_fields(),  # type: ignore\n                ],\n                questions=self._rating_rationale_pairs(),  # type: ignore\n            )\n            _dataset = rg.Dataset(  # type: ignore\n                name=self.dataset_name,\n                workspace=self.dataset_workspace,\n                settings=_settings,\n                client=self._client,\n            )\n            self._dataset = _dataset.create()\n\n    def _generation_fields(self) -> List[\"TextField\"]:\n        \"\"\"Method to generate the fields for each of the generations.\n\n        Returns:\n            A list containing `TextField`s for each text generation.\n        \"\"\"\n        return [\n            rg.TextField(  # type: ignore\n                name=f\"{self._generations}-{idx}\",\n                title=f\"{self._generations}-{idx}\",\n                required=True if idx == 0 else False,\n            )\n            for idx in range(self.num_generations)\n        ]\n\n    def _rating_rationale_pairs(\n        self,\n    ) -> List[Union[\"RatingQuestion\", \"TextQuestion\"]]:\n        \"\"\"Method to generate the rating and rationale questions for each of the generations.\n\n        Returns:\n            A list of questions containing a `RatingQuestion` and `TextQuestion` pair for\n            each text generation.\n        \"\"\"\n        questions = []\n        for idx in range(self.num_generations):\n            questions.extend(\n                [\n                    rg.RatingQuestion(  # type: ignore\n                        name=f\"{self._generations}-{idx}-rating\",\n                        title=f\"Rate {self._generations}-{idx} given {self._instruction}.\",\n                        description=f\"Ignore 
this question if the corresponding `{self._generations}-{idx}` field is not available.\"\n                        if idx != 0\n                        else None,\n                        values=[1, 2, 3, 4, 5],\n                        required=True if idx == 0 else False,\n                    ),\n                    rg.TextQuestion(  # type: ignore\n                        name=f\"{self._generations}-{idx}-rationale\",\n                        title=f\"Specify the rationale for {self._generations}-{idx}'s rating.\",\n                        description=f\"Ignore this question if the corresponding `{self._generations}-{idx}` field is not available.\"\n                        if idx != 0\n                        else None,\n                        required=False,\n                    ),\n                ]\n            )\n        return questions\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The inputs for the step are the `instruction` and the `generations`. Optionally, one could also\n        provide the `ratings` and the `rationales` for the generations.\"\"\"\n        return [\"instruction\", \"generations\"]\n\n    @property\n    def optional_inputs(self) -> List[str]:\n        \"\"\"The optional inputs for the step are the `ratings` and the `rationales` for the generations.\"\"\"\n        return [\"ratings\", \"rationales\"]\n\n    def _add_suggestions_if_any(self, input: Dict[str, Any]) -> List[\"Suggestion\"]:\n        \"\"\"Method to generate the suggestions for the `rg.Record` based on the input.\n\n        Returns:\n            A list of `Suggestion`s for the rating and rationales questions.\n        \"\"\"\n        # Since the `suggestions` i.e. 
answers to the `questions` are optional, will default to {}\n        suggestions = []\n        # If `ratings` is in `input`, then add those as suggestions\n        if self._ratings in input:\n            suggestions.extend(\n                [\n                    rg.Suggestion(  # type: ignore\n                        value=rating,\n                        question_name=f\"{self._generations}-{idx}-rating\",\n                    )\n                    for idx, rating in enumerate(input[self._ratings])\n                    if rating is not None\n                    and isinstance(rating, int)\n                    and rating in [1, 2, 3, 4, 5]\n                ],\n            )\n        # If `rationales` is in `input`, then add those as suggestions\n        if self._rationales in input:\n            suggestions.extend(\n                [\n                    rg.Suggestion(  # type: ignore\n                        value=rationale,\n                        question_name=f\"{self._generations}-{idx}-rationale\",\n                    )\n                    for idx, rationale in enumerate(input[self._rationales])\n                    if rationale is not None and isinstance(rationale, str)\n                ],\n            )\n        return suggestions\n\n    @override\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Creates and pushes the records as `rg.Record`s to the Argilla dataset.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Returns:\n            A list of Python dictionaries with the outputs of the task.\n        \"\"\"\n        records = []\n        for input in inputs:\n            # Generate the SHA-256 hash of the instruction to use it as the metadata\n            instruction_id = hashlib.sha256(\n                input[\"instruction\"].encode(\"utf-8\")  # type: ignore\n            ).hexdigest()\n\n            generations = {\n                
f\"{self._generations}-{idx}\": generation\n                for idx, generation in enumerate(input[\"generations\"])  # type: ignore\n            }\n\n            records.append(  # type: ignore\n                rg.Record(  # type: ignore\n                    fields={\n                        \"id\": instruction_id,\n                        \"instruction\": input[\"instruction\"],  # type: ignore\n                        **generations,\n                    },\n                    suggestions=self._add_suggestions_if_any(input),  # type: ignore\n                )\n            )\n        self._dataset.records.log(records)  # type: ignore\n        yield inputs\n
"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla.inputs","title":"inputs: List[str] property","text":"

The inputs for the step are the instruction and the generations. Optionally, one could also provide the ratings and the rationales for the generations.

"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla.optional_inputs","title":"optional_inputs: List[str] property","text":"

The optional inputs for the step are the ratings and the rationales for the generations.

"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla.load","title":"load()","text":"

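Sets the _instruction, _generations, _ratings and _rationales attributes based on the input_mappings, falling back to the default column names otherwise; and then uses those values to either reuse the dataset if it already exists in the Argilla workspace (after validating its required fields), or create a new Argilla dataset suited for the preference scenario and push it to Argilla.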

Source code in src/distilabel/steps/argilla/preference.py
def load(self) -> None:\n    \"\"\"Sets the `_instruction` and `_generations` attributes based on the `inputs_mapping`, otherwise\n    uses the default values; and then uses those values to create a `FeedbackDataset` suited for\n    the text-generation scenario. And then it pushes it to Argilla.\n    \"\"\"\n    super().load()\n\n    # Both `instruction` and `generations` will be used as the fields of the dataset\n    self._instruction = self.input_mappings.get(\"instruction\", \"instruction\")\n    self._generations = self.input_mappings.get(\"generations\", \"generations\")\n    # Both `ratings` and `rationales` will be used as suggestions to the default questions of the dataset\n    self._ratings = self.input_mappings.get(\"ratings\", \"ratings\")\n    self._rationales = self.input_mappings.get(\"rationales\", \"rationales\")\n\n    if self._dataset_exists_in_workspace:\n        _dataset = self._client.datasets(  # type: ignore\n            name=self.dataset_name,  # type: ignore\n            workspace=self.dataset_workspace,  # type: ignore\n        )\n\n        for field in _dataset.fields:\n            if not isinstance(field, rg.TextField):\n                continue\n            if (\n                field.name\n                not in [self._id, self._instruction]  # type: ignore\n                + [\n                    f\"{self._generations}-{idx}\"\n                    for idx in range(self.num_generations)\n                ]\n                and field.required\n            ):\n                raise DistilabelUserError(\n                    f\"The dataset '{self.dataset_name}' in the workspace '{self.dataset_workspace}'\"\n                    f\" already exists, but contains at least a required field that is\"\n                    f\" neither `{self._id}`, `{self._instruction}`, nor `{self._generations}`\"\n                    f\" (one per generation starting from 0 up to {self.num_generations - 1}).\",\n                    
page=\"components-gallery/steps/preferencetoargilla/\",\n                )\n\n        self._dataset = _dataset\n    else:\n        _settings = rg.Settings(  # type: ignore\n            fields=[\n                rg.TextField(name=self._id, title=self._id),  # type: ignore\n                rg.TextField(name=self._instruction, title=self._instruction),  # type: ignore\n                *self._generation_fields(),  # type: ignore\n            ],\n            questions=self._rating_rationale_pairs(),  # type: ignore\n        )\n        _dataset = rg.Dataset(  # type: ignore\n            name=self.dataset_name,\n            workspace=self.dataset_workspace,\n            settings=_settings,\n            client=self._client,\n        )\n        self._dataset = _dataset.create()\n
"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla.process","title":"process(inputs)","text":"

Creates and pushes the records as rg.Records to the Argilla dataset.

Parameters:

Name Type Description Default inputs StepInput

A list of Python dictionaries with the inputs of the task.

required

Returns:

Type Description StepOutput

A list of Python dictionaries with the outputs of the task.

Source code in src/distilabel/steps/argilla/preference.py
@override\ndef process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Creates and pushes the records as `rg.Record`s to the Argilla dataset.\n\n    Args:\n        inputs: A list of Python dictionaries with the inputs of the task.\n\n    Returns:\n        A list of Python dictionaries with the outputs of the task.\n    \"\"\"\n    records = []\n    for input in inputs:\n        # Generate the SHA-256 hash of the instruction to use it as the metadata\n        instruction_id = hashlib.sha256(\n            input[\"instruction\"].encode(\"utf-8\")  # type: ignore\n        ).hexdigest()\n\n        generations = {\n            f\"{self._generations}-{idx}\": generation\n            for idx, generation in enumerate(input[\"generations\"])  # type: ignore\n        }\n\n        records.append(  # type: ignore\n            rg.Record(  # type: ignore\n                fields={\n                    \"id\": instruction_id,\n                    \"instruction\": input[\"instruction\"],  # type: ignore\n                    **generations,\n                },\n                suggestions=self._add_suggestions_if_any(input),  # type: ignore\n            )\n        )\n    self._dataset.records.log(records)  # type: ignore\n    yield inputs\n
"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation","title":"text_generation","text":""},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation.TextGenerationToArgilla","title":"TextGenerationToArgilla","text":"

Bases: ArgillaBase

Creates a text generation dataset in Argilla.

Step that creates a dataset in Argilla during the load phase, and then pushes the input batches into it as records. This dataset is a text-generation dataset, where there's one field per input, and then a label question to rate the quality of the completion as either bad (represented with \ud83d\udc4e) or good (represented with \ud83d\udc4d).

Note

This step is meant to be used in conjunction with a TextGeneration step and no column mapping is needed, as it will use the default values for the instruction and generation columns.

Attributes:

Name Type Description dataset_name

The name of the dataset in Argilla.

dataset_workspace

The workspace where the dataset will be created in Argilla. Defaults to None, which means it will be created in the default workspace.

api_url

The URL of the Argilla API. Defaults to None, which means it will be read from the ARGILLA_API_URL environment variable.

api_key

The API key to authenticate with Argilla. Defaults to None, which means it will be read from the ARGILLA_API_KEY environment variable.

Runtime parameters
  • api_url: The base URL to use for the Argilla API requests.
  • api_key: The API key to authenticate the requests to the Argilla API.
Input columns
  • instruction (str): The instruction that was used to generate the completion.
  • generation (str or List[str]): The completions that were generated based on the input instruction.

Examples:

Push a text generation dataset to an Argilla instance:

from distilabel.steps import TextGenerationToArgilla\n\nto_argilla = TextGenerationToArgilla(\n    num_generations=2,\n    api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n    api_key=\"api.key\",\n    dataset_name=\"argilla_dataset\",\n    dataset_workspace=\"my_workspace\",\n)\nto_argilla.load()\n\nresult = next(\n    to_argilla.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"generation\": \"generation\",\n            }\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'instruction', 'generation': 'generation'}]\n
Source code in src/distilabel/steps/argilla/text_generation.py
class TextGenerationToArgilla(ArgillaBase):\n    \"\"\"Creates a text generation dataset in Argilla.\n\n    `Step` that creates a dataset in Argilla during the load phase, and then pushes the input\n    batches into it as records. This dataset is a text-generation dataset, where there's one field\n    per each input, and then a label question to rate the quality of the completion in either bad\n    (represented with \ud83d\udc4e) or good (represented with \ud83d\udc4d).\n\n    Note:\n        This step is meant to be used in conjunction with a `TextGeneration` step and no column mapping\n        is needed, as it will use the default values for the `instruction` and `generation` columns.\n\n    Attributes:\n        dataset_name: The name of the dataset in Argilla.\n        dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to\n            `None`, which means it will be created in the default workspace.\n        api_url: The URL of the Argilla API. Defaults to `None`, which means it will be read from\n            the `ARGILLA_API_URL` environment variable.\n        api_key: The API key to authenticate with Argilla. 
Defaults to `None`, which means it will\n            be read from the `ARGILLA_API_KEY` environment variable.\n\n    Runtime parameters:\n        - `api_url`: The base URL to use for the Argilla API requests.\n        - `api_key`: The API key to authenticate the requests to the Argilla API.\n\n    Input columns:\n        - instruction (`str`): The instruction that was used to generate the completion.\n        - generation (`str` or `List[str]`): The completions that were generated based on the input instruction.\n\n    Examples:\n        Push a text generation dataset to an Argilla instance:\n\n        ```python\n        from distilabel.steps import PreferenceToArgilla\n\n        to_argilla = TextGenerationToArgilla(\n            num_generations=2,\n            api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n            api_key=\"api.key\",\n            dataset_name=\"argilla_dataset\",\n            dataset_workspace=\"my_workspace\",\n        )\n        to_argilla.load()\n\n        result = next(\n            to_argilla.process(\n                [\n                    {\n                        \"instruction\": \"instruction\",\n                        \"generation\": \"generation\",\n                    }\n                ],\n            )\n        )\n        # >>> result\n        # [{'instruction': 'instruction', 'generation': 'generation'}]\n        ```\n    \"\"\"\n\n    _id: str = PrivateAttr(default=\"id\")\n    _instruction: str = PrivateAttr(...)\n    _generation: str = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"Sets the `_instruction` and `_generation` attributes based on the `inputs_mapping`, otherwise\n        uses the default values; and then uses those values to create a `FeedbackDataset` suited for\n        the text-generation scenario. 
And then it pushes it to Argilla.\n        \"\"\"\n        super().load()\n\n        self._instruction = self.input_mappings.get(\"instruction\", \"instruction\")\n        self._generation = self.input_mappings.get(\"generation\", \"generation\")\n\n        if self._dataset_exists_in_workspace:\n            _dataset = self._client.datasets(  # type: ignore\n                name=self.dataset_name,  # type: ignore\n                workspace=self.dataset_workspace,  # type: ignore\n            )\n\n            for field in _dataset.fields:\n                if not isinstance(field, rg.TextField):  # type: ignore\n                    continue\n                if (\n                    field.name not in [self._id, self._instruction, self._generation]\n                    and field.required\n                ):\n                    raise DistilabelUserError(\n                        f\"The dataset '{self.dataset_name}' in the workspace '{self.dataset_workspace}'\"\n                        f\" already exists, but contains at least a required field that is\"\n                        f\" neither `{self._id}`, `{self._instruction}`, nor `{self._generation}`,\"\n                        \" so it cannot be reused for this dataset.\",\n                        page=\"components-gallery/steps/textgenerationtoargilla/\",\n                    )\n\n            self._dataset = _dataset\n        else:\n            _settings = rg.Settings(  # type: ignore\n                fields=[\n                    rg.TextField(name=self._id, title=self._id),  # type: ignore\n                    rg.TextField(name=self._instruction, title=self._instruction),  # type: ignore\n                    rg.TextField(name=self._generation, title=self._generation),  # type: ignore\n                ],\n                questions=[\n                    rg.LabelQuestion(  # type: ignore\n                        name=\"quality\",\n                        title=f\"What's the quality of the {self._generation} for the 
given {self._instruction}?\",\n                        labels={\"bad\": \"\ud83d\udc4e\", \"good\": \"\ud83d\udc4d\"},  # type: ignore\n                    )\n                ],\n            )\n            _dataset = rg.Dataset(  # type: ignore\n                name=self.dataset_name,\n                workspace=self.dataset_workspace,\n                settings=_settings,\n                client=self._client,\n            )\n            self._dataset = _dataset.create()\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The inputs for the step are the `instruction` and the `generation`.\"\"\"\n        return [\"instruction\", \"generation\"]\n\n    @override\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Creates and pushes the records as FeedbackRecords to the Argilla dataset.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Returns:\n            A list of Python dictionaries with the outputs of the task.\n        \"\"\"\n        records = []\n        for input in inputs:\n            # Generate the SHA-256 hash of the instruction to use it as the metadata\n            instruction_id = hashlib.sha256(\n                input[\"instruction\"].encode(\"utf-8\")\n            ).hexdigest()\n\n            generations = input[\"generation\"]\n\n            # If the `generation` is not a list, then convert it into a list\n            if not isinstance(generations, list):\n                generations = [generations]\n\n            # Create a `generations_set` to avoid adding duplicates\n            generations_set = set()\n\n            for generation in generations:\n                # If the generation is already in the set, then skip it\n                if generation in generations_set:\n                    continue\n                # Otherwise, add it to the set\n                generations_set.add(generation)\n\n                records.append(\n        
            rg.Record(  # type: ignore\n                        fields={\n                            self._id: instruction_id,\n                            self._instruction: input[\"instruction\"],\n                            self._generation: generation,\n                        },\n                    ),\n                )\n        self._dataset.records.log(records)  # type: ignore\n        yield inputs\n
"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation.TextGenerationToArgilla.inputs","title":"inputs: List[str] property","text":"

The inputs for the step are the instruction and the generation.

"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation.TextGenerationToArgilla.load","title":"load()","text":"

Sets the _instruction and _generation attributes based on the input_mappings, otherwise uses the default values; and then uses those values to either reuse the existing Argilla dataset or create a new one suited for the text-generation scenario, which is then pushed to Argilla.

Source code in src/distilabel/steps/argilla/text_generation.py
def load(self) -> None:\n    \"\"\"Sets the `_instruction` and `_generation` attributes based on the `inputs_mapping`, otherwise\n    uses the default values; and then uses those values to create a `FeedbackDataset` suited for\n    the text-generation scenario. And then it pushes it to Argilla.\n    \"\"\"\n    super().load()\n\n    self._instruction = self.input_mappings.get(\"instruction\", \"instruction\")\n    self._generation = self.input_mappings.get(\"generation\", \"generation\")\n\n    if self._dataset_exists_in_workspace:\n        _dataset = self._client.datasets(  # type: ignore\n            name=self.dataset_name,  # type: ignore\n            workspace=self.dataset_workspace,  # type: ignore\n        )\n\n        for field in _dataset.fields:\n            if not isinstance(field, rg.TextField):  # type: ignore\n                continue\n            if (\n                field.name not in [self._id, self._instruction, self._generation]\n                and field.required\n            ):\n                raise DistilabelUserError(\n                    f\"The dataset '{self.dataset_name}' in the workspace '{self.dataset_workspace}'\"\n                    f\" already exists, but contains at least a required field that is\"\n                    f\" neither `{self._id}`, `{self._instruction}`, nor `{self._generation}`,\"\n                    \" so it cannot be reused for this dataset.\",\n                    page=\"components-gallery/steps/textgenerationtoargilla/\",\n                )\n\n        self._dataset = _dataset\n    else:\n        _settings = rg.Settings(  # type: ignore\n            fields=[\n                rg.TextField(name=self._id, title=self._id),  # type: ignore\n                rg.TextField(name=self._instruction, title=self._instruction),  # type: ignore\n                rg.TextField(name=self._generation, title=self._generation),  # type: ignore\n            ],\n            questions=[\n                rg.LabelQuestion(  # type: ignore\n   
                 name=\"quality\",\n                    title=f\"What's the quality of the {self._generation} for the given {self._instruction}?\",\n                    labels={\"bad\": \"\ud83d\udc4e\", \"good\": \"\ud83d\udc4d\"},  # type: ignore\n                )\n            ],\n        )\n        _dataset = rg.Dataset(  # type: ignore\n            name=self.dataset_name,\n            workspace=self.dataset_workspace,\n            settings=_settings,\n            client=self._client,\n        )\n        self._dataset = _dataset.create()\n
"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation.TextGenerationToArgilla.process","title":"process(inputs)","text":"

Creates and pushes the records as rg.Records to the Argilla dataset.

Parameters:

Name Type Description Default inputs StepInput

A list of Python dictionaries with the inputs of the task.

required

Returns:

Type Description StepOutput

A list of Python dictionaries with the outputs of the task.

Source code in src/distilabel/steps/argilla/text_generation.py
@override\ndef process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Creates and pushes the records as FeedbackRecords to the Argilla dataset.\n\n    Args:\n        inputs: A list of Python dictionaries with the inputs of the task.\n\n    Returns:\n        A list of Python dictionaries with the outputs of the task.\n    \"\"\"\n    records = []\n    for input in inputs:\n        # Generate the SHA-256 hash of the instruction to use it as the metadata\n        instruction_id = hashlib.sha256(\n            input[\"instruction\"].encode(\"utf-8\")\n        ).hexdigest()\n\n        generations = input[\"generation\"]\n\n        # If the `generation` is not a list, then convert it into a list\n        if not isinstance(generations, list):\n            generations = [generations]\n\n        # Create a `generations_set` to avoid adding duplicates\n        generations_set = set()\n\n        for generation in generations:\n            # If the generation is already in the set, then skip it\n            if generation in generations_set:\n                continue\n            # Otherwise, add it to the set\n            generations_set.add(generation)\n\n            records.append(\n                rg.Record(  # type: ignore\n                    fields={\n                        self._id: instruction_id,\n                        self._instruction: input[\"instruction\"],\n                        self._generation: generation,\n                    },\n                ),\n            )\n    self._dataset.records.log(records)  # type: ignore\n    yield inputs\n
"},{"location":"api/step_gallery/columns/","title":"Columns","text":"

This section contains the existing steps intended to be used for common column operations to apply to the batches.

"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand","title":"expand","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns","title":"ExpandColumns","text":"

Bases: Step

Expand columns that contain lists into multiple rows.

ExpandColumns is a Step that takes a list of columns and expands them into multiple rows. The new rows will have the same data as the original row, except for the expanded column, which will contain a single item from the original list.

Attributes:

Name Type Description columns Union[Dict[str, str], List[str]]

A dictionary that maps the column to be expanded to the new column name or a list of columns to be expanded. If a list is provided, the new column name will be the same as the column name.

Input columns
  • dynamic (determined by columns attribute): The columns to be expanded into multiple rows.
Output columns
  • dynamic (determined by columns attribute): The expanded columns.
Categories
  • columns

Examples:

Expand the selected columns into multiple rows:

from distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(\n    columns=[\"generation\"],\n)\nexpand_columns.load()\n\nresult = next(\n    expand_columns.process(\n        [\n            {\n                \"instruction\": \"instruction 1\",\n                \"generation\": [\"generation 1\", \"generation 2\"]}\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]\n
Source code in src/distilabel/steps/columns/expand.py
class ExpandColumns(Step):\n    \"\"\"Expand columns that contain lists into multiple rows.\n\n    `ExpandColumns` is a `Step` that takes a list of columns and expands them into multiple\n    rows. The new rows will have the same data as the original row, except for the expanded\n    column, which will contain a single item from the original list.\n\n    Attributes:\n        columns: A dictionary that maps the column to be expanded to the new column name\n            or a list of columns to be expanded. If a list is provided, the new column name\n            will be the same as the column name.\n\n    Input columns:\n        - dynamic (determined by `columns` attribute): The columns to be expanded into\n            multiple rows.\n\n    Output columns:\n        - dynamic (determined by `columns` attribute):  The expanded columns.\n\n    Categories:\n        - columns\n\n    Examples:\n        Expand the selected columns into multiple rows:\n\n        ```python\n        from distilabel.steps import ExpandColumns\n\n        expand_columns = ExpandColumns(\n            columns=[\"generation\"],\n        )\n        expand_columns.load()\n\n        result = next(\n            expand_columns.process(\n                [\n                    {\n                        \"instruction\": \"instruction 1\",\n                        \"generation\": [\"generation 1\", \"generation 2\"]}\n                ],\n            )\n        )\n        # >>> result\n        # [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]\n        ```\n    \"\"\"\n\n    columns: Union[Dict[str, str], List[str]]\n\n    @field_validator(\"columns\")\n    @classmethod\n    def always_dict(cls, value: Union[Dict[str, str], List[str]]) -> Dict[str, str]:\n        \"\"\"Ensure that the columns are always a dictionary.\n\n        Args:\n            value: The columns to be expanded.\n\n        Returns:\n            The columns to be 
expanded as a dictionary.\n        \"\"\"\n        if isinstance(value, list):\n            return {col: col for col in value}\n\n        return value\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The columns to be expanded.\"\"\"\n        return list(self.columns.keys())\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The expanded columns.\"\"\"\n        return [\n            new_column if new_column else expand_column\n            for expand_column, new_column in self.columns.items()\n        ]\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Expand the columns in the input data.\n\n        Args:\n            inputs: The input data.\n\n        Yields:\n            The expanded rows.\n        \"\"\"\n        yield [row for input in inputs for row in self._expand_columns(input)]\n\n    def _expand_columns(self, input: Dict[str, Any]) -> List[Dict[str, Any]]:\n        \"\"\"Expand the columns in the input data.\n\n        Args:\n            input: The input data.\n\n        Returns:\n            The expanded rows.\n        \"\"\"\n        expanded_rows = []\n        for expand_column, new_column in self.columns.items():  # type: ignore\n            data = input.get(expand_column)\n            rows = []\n            for item, expanded in zip_longest(*[data, expanded_rows], fillvalue=input):\n                rows.append({**expanded, new_column: item})\n            expanded_rows = rows\n        return expanded_rows\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns.inputs","title":"inputs: StepColumns property","text":"

The columns to be expanded.

"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns.outputs","title":"outputs: StepColumns property","text":"

The expanded columns.

"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns.always_dict","title":"always_dict(value) classmethod","text":"

Ensure that the columns are always a dictionary.

Parameters:

Name Type Description Default value Union[Dict[str, str], List[str]]

The columns to be expanded.

required

Returns:

Type Description Dict[str, str]

The columns to be expanded as a dictionary.

Source code in src/distilabel/steps/columns/expand.py
@field_validator(\"columns\")\n@classmethod\ndef always_dict(cls, value: Union[Dict[str, str], List[str]]) -> Dict[str, str]:\n    \"\"\"Ensure that the columns are always a dictionary.\n\n    Args:\n        value: The columns to be expanded.\n\n    Returns:\n        The columns to be expanded as a dictionary.\n    \"\"\"\n    if isinstance(value, list):\n        return {col: col for col in value}\n\n    return value\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns.process","title":"process(inputs)","text":"

Expand the columns in the input data.

Parameters:

Name Type Description Default inputs StepInput

The input data.

required

Yields:

Type Description StepOutput

The expanded rows.

Source code in src/distilabel/steps/columns/expand.py
def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Expand the columns in the input data.\n\n    Args:\n        inputs: The input data.\n\n    Yields:\n        The expanded rows.\n    \"\"\"\n    yield [row for input in inputs for row in self._expand_columns(input)]\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep","title":"keep","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep.KeepColumns","title":"KeepColumns","text":"

Bases: Step

Keeps selected columns in the dataset.

KeepColumns is a Step that implements the process method that keeps only the columns specified in the columns attribute. Also KeepColumns provides an attribute columns to specify the columns to keep which will override the default value for the properties inputs and outputs.

Note

The order in which the columns are provided is important, as the output will be sorted using the provided order, which is useful before pushing either a datasets.Dataset via the PushToHub step or a distilabel.Distiset via the Pipeline.run output variable.

Attributes:

Name Type Description columns List[str]

List of strings with the names of the columns to keep.

Input columns
  • dynamic (determined by columns attribute): The columns to keep.
Output columns
  • dynamic (determined by columns attribute): The columns that were kept.
Categories
  • columns

Examples:

Select the columns to keep:

from distilabel.steps import KeepColumns\n\nkeep_columns = KeepColumns(\n    columns=[\"instruction\", \"generation\"],\n)\nkeep_columns.load()\n\nresult = next(\n    keep_columns.process(\n        [{\"instruction\": \"What's the brightest color?\", \"generation\": \"white\", \"model_name\": \"my_model\"}],\n    )\n)\n# >>> result\n# [{'instruction': \"What's the brightest color?\", 'generation': 'white'}]\n
Source code in src/distilabel/steps/columns/keep.py
class KeepColumns(Step):\n    \"\"\"Keeps selected columns in the dataset.\n\n    `KeepColumns` is a `Step` that implements the `process` method that keeps only the columns\n    specified in the `columns` attribute. Also `KeepColumns` provides an attribute `columns` to\n    specify the columns to keep which will override the default value for the properties `inputs`\n    and `outputs`.\n\n    Note:\n        The order in which the columns are provided is important, as the output will be sorted\n        using the provided order, which is useful before pushing either a `dataset.Dataset` via\n        the `PushToHub` step or a `distilabel.Distiset` via the `Pipeline.run` output variable.\n\n    Attributes:\n        columns: List of strings with the names of the columns to keep.\n\n    Input columns:\n        - dynamic (determined by `columns` attribute): The columns to keep.\n\n    Output columns:\n        - dynamic (determined by `columns` attribute): The columns that were kept.\n\n    Categories:\n        - columns\n\n    Examples:\n        Select the columns to keep:\n\n        ```python\n        from distilabel.steps import KeepColumns\n\n        keep_columns = KeepColumns(\n            columns=[\"instruction\", \"generation\"],\n        )\n        keep_columns.load()\n\n        result = next(\n            keep_columns.process(\n                [{\"instruction\": \"What's the brightest color?\", \"generation\": \"white\", \"model_name\": \"my_model\"}],\n            )\n        )\n        # >>> result\n        # [{'instruction': \"What's the brightest color?\", 'generation': 'white'}]\n        ```\n    \"\"\"\n\n    columns: List[str]\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The inputs for the task are the column names in `columns`.\"\"\"\n        return self.columns\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The outputs for the task are the column names in `columns`.\"\"\"\n        return 
self.columns\n\n    @override\n    def process(self, *inputs: StepInput) -> \"StepOutput\":\n        \"\"\"The `process` method keeps only the columns specified in the `columns` attribute.\n\n        Args:\n            *inputs: A list of dictionaries with the input data.\n\n        Yields:\n            A list of dictionaries with the output data.\n        \"\"\"\n        for input in inputs:\n            outputs = []\n            for item in input:\n                outputs.append({col: item[col] for col in self.columns})\n            yield outputs\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep.KeepColumns.inputs","title":"inputs: StepColumns property","text":"

The inputs for the task are the column names in columns.

"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep.KeepColumns.outputs","title":"outputs: StepColumns property","text":"

The outputs for the task are the column names in columns.

"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep.KeepColumns.process","title":"process(*inputs)","text":"

The process method keeps only the columns specified in the columns attribute.

Parameters:

Name Type Description Default *inputs StepInput

A list of dictionaries with the input data.

()

Yields:

Type Description StepOutput

A list of dictionaries with the output data.

Source code in src/distilabel/steps/columns/keep.py
@override\ndef process(self, *inputs: StepInput) -> \"StepOutput\":\n    \"\"\"The `process` method keeps only the columns specified in the `columns` attribute.\n\n    Args:\n        *inputs: A list of dictionaries with the input data.\n\n    Yields:\n        A list of dictionaries with the output data.\n    \"\"\"\n    for input in inputs:\n        outputs = []\n        for item in input:\n            outputs.append({col: item[col] for col in self.columns})\n        yield outputs\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.merge","title":"merge","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.merge.MergeColumns","title":"MergeColumns","text":"

Bases: Step

Merge columns from a row.

MergeColumns is a Step that implements the process method that calls the merge_columns function to handle and combine columns in a StepInput. MergeColumns provides two attributes columns and output_column to specify the columns to merge and the resulting output column.

This step can be useful if, for example, you have a Task that generates instructions and you want to have more examples of them. In such a case, you could use another Task to multiply your instructions synthetically, which would yield two separate columns. Using MergeColumns you can merge them and use them as a single column in your dataset for further processing.

Attributes:

Name Type Description columns List[str]

List of strings with the names of the columns to merge.

output_column Optional[str]

Name of the output column. Defaults to merged_column when not provided.

Input columns
  • dynamic (determined by columns attribute): The columns to merge.
Output columns
  • dynamic (determined by columns and output_column attributes): The columns that were merged.
Categories
  • columns

Examples:

Combine columns in rows of a dataset:

from distilabel.steps import MergeColumns\n\ncombiner = MergeColumns(\n    columns=[\"queries\", \"multiple_queries\"],\n    output_column=\"queries\",\n)\ncombiner.load()\n\nresult = next(\n    combiner.process(\n        [\n            {\n                \"queries\": \"How are you?\",\n                \"multiple_queries\": [\"What's up?\", \"Everything ok?\"]\n            }\n        ],\n    )\n)\n# >>> result\n# [{'queries': ['How are you?', \"What's up?\", 'Everything ok?']}]\n
Source code in src/distilabel/steps/columns/merge.py
class MergeColumns(Step):\n    \"\"\"Merge columns from a row.\n\n    `MergeColumns` is a `Step` that implements the `process` method that calls the `merge_columns`\n    function to handle and combine columns in a `StepInput`. `MergeColumns` provides two attributes\n    `columns` and `output_column` to specify the columns to merge and the resulting output column.\n\n    This step can be useful if you have a `Task` that generates instructions for example, and you\n    want to have more examples of those. In such a case, you could for example use another `Task`\n    to multiply your instructions synthetically, what would yield two different columns splitted.\n    Using `MergeColumns` you can merge them and use them as a single column in your dataset for\n    further processing.\n\n    Attributes:\n        columns: List of strings with the names of the columns to merge.\n        output_column: str name of the output column\n\n    Input columns:\n        - dynamic (determined by `columns` attribute): The columns to merge.\n\n    Output columns:\n        - dynamic (determined by `columns` and `output_column` attributes): The columns\n            that were merged.\n\n    Categories:\n        - columns\n\n    Examples:\n        Combine columns in rows of a dataset:\n\n        ```python\n        from distilabel.steps import MergeColumns\n\n        combiner = MergeColumns(\n            columns=[\"queries\", \"multiple_queries\"],\n            output_column=\"queries\",\n        )\n        combiner.load()\n\n        result = next(\n            combiner.process(\n                [\n                    {\n                        \"queries\": \"How are you?\",\n                        \"multiple_queries\": [\"What's up?\", \"Everything ok?\"]\n                    }\n                ],\n            )\n        )\n        # >>> result\n        # [{'queries': ['How are you?', \"What's up?\", 'Everything ok?']}]\n        ```\n    \"\"\"\n\n    columns: List[str]\n    output_column: 
Optional[str] = None\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        return self.columns\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        return [self.output_column] if self.output_column else [\"merged_column\"]\n\n    @override\n    def process(self, inputs: StepInput) -> \"StepOutput\":\n        combined = []\n        for input in inputs:\n            combined.append(\n                merge_columns(\n                    input,\n                    columns=self.columns,\n                    new_column=self.outputs[0],\n                )\n            )\n        yield combined\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group","title":"group","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.GroupColumns","title":"GroupColumns","text":"

Bases: Step

Combines columns from a list of StepInput.

GroupColumns is a Step that implements the process method that calls the group_columns function to handle and combine a list of StepInput. Also GroupColumns provides two attributes columns and output_columns to specify the columns to group and the output columns which will override the default value for the properties inputs and outputs, respectively.

Attributes:

Name Type Description columns List[str]

List of strings with the names of the columns to group.

output_columns Optional[List[str]]

Optional list of strings with the names of the output columns.

Input columns
  • dynamic (determined by columns attribute): The columns to group.
Output columns
  • dynamic (determined by columns and output_columns attributes): The columns that were grouped.
Categories
  • columns

Examples:

Group columns of a dataset:\n\n```python\nfrom distilabel.steps import GroupColumns\n\ngroup_columns = GroupColumns(\n    name=\"group_columns\",\n    columns=[\"generation\", \"model_name\"],\n)\ngroup_columns.load()\n\nresult = next(\n    group_columns.process(\n        [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n        [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n    )\n)\n# >>> result\n# [{'grouped_generation': ['AI generated text', 'Other generated text'], 'grouped_model_name': ['my_model']}]\n```\n\nSpecify the name of the output columns:\n\n```python\nfrom distilabel.steps import GroupColumns\n\ngroup_columns = GroupColumns(\n    name=\"group_columns\",\n    columns=[\"generation\", \"model_name\"],\n    output_columns=[\"generations\", \"generation_models\"]\n)\ngroup_columns.load()\n\nresult = next(\n    group_columns.process(\n        [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n        [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n    )\n)\n# >>> result\n# [{'generations': ['AI generated text', 'Other generated text'], 'generation_models': ['my_model']}]\n```\n
Source code in src/distilabel/steps/columns/group.py
class GroupColumns(Step):\n    \"\"\"Combines columns from a list of `StepInput`.\n\n    `GroupColumns` is a `Step` that implements the `process` method that calls the `group_dicts`\n    function to handle and combine a list of `StepInput`. Also `GroupColumns` provides two attributes\n    `columns` and `output_columns` to specify the columns to group and the output columns\n    which will override the default value for the properties `inputs` and `outputs`, respectively.\n\n    Attributes:\n        columns: List of strings with the names of the columns to group.\n        output_columns: Optional list of strings with the names of the output columns.\n\n    Input columns:\n        - dynamic (determined by `columns` attribute): The columns to group.\n\n    Output columns:\n        - dynamic (determined by `columns` and `output_columns` attributes): The columns\n            that were grouped.\n\n    Categories:\n        - columns\n\n    Examples:\n\n        Group columns of a dataset:\n\n        ```python\n        from distilabel.steps import GroupColumns\n\n        group_columns = GroupColumns(\n            name=\"group_columns\",\n            columns=[\"generation\", \"model_name\"],\n        )\n        group_columns.load()\n\n        result = next(\n            group_columns.process(\n                [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n                [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n            )\n        )\n        # >>> result\n        # [{'merged_generation': ['AI generated text', 'Other generated text'], 'merged_model_name': ['my_model']}]\n        ```\n\n        Specify the name of the output columns:\n\n        ```python\n        from distilabel.steps import GroupColumns\n\n        group_columns = GroupColumns(\n            name=\"group_columns\",\n            columns=[\"generation\", \"model_name\"],\n            output_columns=[\"generations\", \"generation_models\"]\n        
)\n        group_columns.load()\n\n        result = next(\n            group_columns.process(\n                [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n                [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n            )\n        )\n        # >>> result\n        #[{'generations': ['AI generated text', 'Other generated text'], 'generation_models': ['my_model']}]\n        ```\n    \"\"\"\n\n    columns: List[str]\n    output_columns: Optional[List[str]] = None\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The inputs for the task are the column names in `columns`.\"\"\"\n        return self.columns\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The outputs for the task are the column names in `output_columns` or\n        `grouped_{column}` for each column in `columns`.\"\"\"\n        return (\n            self.output_columns\n            if self.output_columns is not None\n            else [f\"grouped_{column}\" for column in self.columns]\n        )\n\n    @override\n    def process(self, *inputs: StepInput) -> \"StepOutput\":\n        \"\"\"The `process` method calls the `group_dicts` function to handle and combine a list of `StepInput`.\n\n        Args:\n            *inputs: A list of `StepInput` to be combined.\n\n        Yields:\n            A `StepOutput` with the combined `StepInput` using the `group_dicts` function.\n        \"\"\"\n        yield group_columns(\n            *inputs,\n            group_columns=self.inputs,\n            output_group_columns=self.outputs,\n        )\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.GroupColumns.inputs","title":"inputs: StepColumns property","text":"

The inputs for the task are the column names in columns.

"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.GroupColumns.outputs","title":"outputs: StepColumns property","text":"

The outputs for the task are the column names in output_columns or grouped_{column} for each column in columns.

"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.GroupColumns.process","title":"process(*inputs)","text":"

The process method calls the group_columns function to handle and combine a list of StepInput.

Parameters:

Name Type Description Default *inputs StepInput

A list of StepInput to be combined.

()

Yields:

Type Description StepOutput

A StepOutput with the combined StepInput using the group_columns function.

Source code in src/distilabel/steps/columns/group.py
@override\ndef process(self, *inputs: StepInput) -> \"StepOutput\":\n    \"\"\"The `process` method calls the `group_dicts` function to handle and combine a list of `StepInput`.\n\n    Args:\n        *inputs: A list of `StepInput` to be combined.\n\n    Yields:\n        A `StepOutput` with the combined `StepInput` using the `group_dicts` function.\n    \"\"\"\n    yield group_columns(\n        *inputs,\n        group_columns=self.inputs,\n        output_group_columns=self.outputs,\n    )\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.CombineColumns","title":"CombineColumns","text":"

Bases: GroupColumns

CombineColumns is deprecated and will be removed in version 1.5.0, use GroupColumns instead.

Source code in src/distilabel/steps/columns/group.py
class CombineColumns(GroupColumns):\n    \"\"\"`CombineColumns` is deprecated and will be removed in version 1.5.0, use `GroupColumns` instead.\"\"\"\n\n    def __init__(self, **data: Any) -> None:\n        warnings.warn(\n            \"`CombineColumns` is deprecated and will be removed in version 1.5.0, use `GroupColumns` instead.\",\n            DeprecationWarning,\n            stacklevel=2,\n        )\n        return super().__init__(**data)\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.utils","title":"utils","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.utils.merge_distilabel_metadata","title":"merge_distilabel_metadata(*output_dicts)","text":"

Merge the DISTILABEL_METADATA_KEY from multiple output dictionaries.

Parameters:

Name Type Description Default *output_dicts Dict[str, Any]

Variable number of dictionaries containing distilabel metadata.

()

Returns:

Type Description Dict[str, Any]

A merged dictionary containing all the distilabel metadata from the input dictionaries.

Source code in src/distilabel/steps/columns/utils.py
def merge_distilabel_metadata(*output_dicts: Dict[str, Any]) -> Dict[str, Any]:\n    \"\"\"\n    Merge the `DISTILABEL_METADATA_KEY` from multiple output dictionaries.\n\n    Args:\n        *output_dicts: Variable number of dictionaries containing distilabel metadata.\n\n    Returns:\n        A merged dictionary containing all the distilabel metadata from the input dictionaries.\n    \"\"\"\n    merged_metadata = defaultdict(list)\n\n    for output_dict in output_dicts:\n        metadata = output_dict.get(DISTILABEL_METADATA_KEY, {})\n        for key, value in metadata.items():\n            merged_metadata[key].append(value)\n\n    final_metadata = {}\n    for key, value_list in merged_metadata.items():\n        if len(value_list) == 1:\n            final_metadata[key] = value_list[0]\n        else:\n            final_metadata[key] = value_list\n\n    return final_metadata\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.utils.group_columns","title":"group_columns(*inputs, group_columns, output_group_columns=None)","text":"

Groups multiple lists of dictionaries into a single list of dictionaries on the specified group_columns. If output_group_columns are provided, then it will also rename group_columns to those names.

Parameters:

Name Type Description Default inputs StepInput

list of dictionaries to combine.

() group_columns List[str]

list of keys to merge on.

required output_group_columns Optional[List[str]]

list of keys to rename the merge keys to. Defaults to None.

None

Returns:

Type Description StepInput

A list of dictionaries where the values of the group_columns are combined into a list and renamed to output_group_columns.

Source code in src/distilabel/steps/columns/utils.py
def group_columns(\n    *inputs: \"StepInput\",\n    group_columns: List[str],\n    output_group_columns: Optional[List[str]] = None,\n) -> \"StepInput\":\n    \"\"\"Groups multiple list of dictionaries into a single list of dictionaries on the\n    specified `group_columns`. If `group_columns` are provided, then it will also rename\n    `group_columns`.\n\n    Args:\n        inputs: list of dictionaries to combine.\n        group_columns: list of keys to merge on.\n        output_group_columns: list of keys to rename the merge keys to. Defaults to `None`.\n\n    Returns:\n        A list of dictionaries where the values of the `group_columns` are combined into a\n        list and renamed to `output_group_columns`.\n    \"\"\"\n    if output_group_columns is not None and len(output_group_columns) != len(\n        group_columns\n    ):\n        raise ValueError(\n            \"The length of `output_group_columns` must be the same as the length of `group_columns`.\"\n        )\n    if output_group_columns is None:\n        output_group_columns = [f\"grouped_{key}\" for key in group_columns]\n    group_columns_dict = dict(zip(group_columns, output_group_columns))\n\n    result = []\n    # Use zip to iterate over lists based on their index\n    for dicts_at_index in zip(*inputs):\n        combined_dict = {}\n        metadata_dicts = []\n        # Iterate over dicts at the same index\n        for d in dicts_at_index:\n            # Extract metadata for merging\n            if DISTILABEL_METADATA_KEY in d:\n                metadata_dicts.append(\n                    {DISTILABEL_METADATA_KEY: d[DISTILABEL_METADATA_KEY]}\n                )\n            # Iterate over key-value pairs in each dict\n            for key, value in d.items():\n                if key == DISTILABEL_METADATA_KEY:\n                    continue\n                # If the key is in the merge_keys, append the value to the existing list\n                if key in group_columns_dict.keys():\n               
     combined_dict.setdefault(group_columns_dict[key], []).append(value)\n                # If the key is not in the merge_keys, create a new key-value pair\n                else:\n                    combined_dict[key] = value\n\n        if metadata_dicts:\n            combined_dict[DISTILABEL_METADATA_KEY] = merge_distilabel_metadata(\n                *metadata_dicts\n            )\n\n        result.append(combined_dict)\n    return result\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.utils.merge_columns","title":"merge_columns(row, columns, new_column='combined_key')","text":"

Merge columns in a dictionary into a single column on the specified new_column.

Parameters:

Name Type Description Default row Dict[str, Any]

Dictionary corresponding to a row in a dataset.

required columns List[str]

List of keys to merge.

required new_column str

Name of the new key created.

'combined_key'

Returns:

Type Description Dict[str, Any]

Dictionary with the new merged key.

Source code in src/distilabel/steps/columns/utils.py
def merge_columns(\n    row: Dict[str, Any], columns: List[str], new_column: str = \"combined_key\"\n) -> Dict[str, Any]:\n    \"\"\"Merge columns in a dictionary into a single column on the specified `new_column`.\n\n    Args:\n        row: Dictionary corresponding to a row in a dataset.\n        columns: List of keys to merge.\n        new_column: Name of the new key created.\n\n    Returns:\n        Dictionary with the new merged key.\n    \"\"\"\n    result = row.copy()  # preserve the original dictionary\n    combined = []\n    for key in columns:\n        to_combine = result.pop(key)\n        if not isinstance(to_combine, list):\n            to_combine = [to_combine]\n        combined += to_combine\n    result[new_column] = combined\n    return result\n
"},{"location":"api/step_gallery/extra/","title":"Extra","text":""},{"location":"api/step_gallery/extra/#distilabel.steps","title":"steps","text":""},{"location":"api/step_gallery/extra/#distilabel.steps.DBSCAN","title":"DBSCAN","text":"

Bases: GlobalStep

DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core samples in regions of high density and expands clusters from them. This algorithm is good for data which contains clusters of similar density.

This is a GlobalStep that clusters the embeddings using the DBSCAN algorithm from sklearn. Visit TextClustering step for an example of use. The trained model is saved as an artifact when creating a distiset and pushing it to the Hugging Face Hub.

Input columns
  • projection (List[float]): Vector representation of the text to cluster, normally the output from the UMAP step.
Output columns
  • cluster_label (int): Integer representing the label of a given cluster. -1 means it wasn't clustered.
Categories
  • clustering
  • text-classification
References
  • DBSCAN demo of sklearn
  • sklearn dbscan

Attributes:

Name Type Description eps Optional[RuntimeParameter[float]]

The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.

min_samples Optional[RuntimeParameter[int]]

The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. If min_samples is set to a higher value, DBSCAN will find denser clusters, whereas if it is set to a lower value, the found clusters will be more sparse.

metric Optional[RuntimeParameter[str]]

The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by sklearn.metrics.pairwise_distances for its metric parameter.

n_jobs Optional[RuntimeParameter[int]]

The number of parallel jobs to run.

Runtime parameters
  • eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.
  • min_samples: The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. If min_samples is set to a higher value, DBSCAN will find denser clusters, whereas if it is set to a lower value, the found clusters will be more sparse.
  • metric: The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by sklearn.metrics.pairwise_distances for its metric parameter.
  • n_jobs: The number of parallel jobs to run.
Source code in src/distilabel/steps/clustering/dbscan.py
class DBSCAN(GlobalStep):\n    r\"\"\"DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core\n    samples in regions of high density and expands clusters from them. This algorithm\n    is good for data which contains clusters of similar density.\n\n    This is a `GlobalStep` that clusters the embeddings using the DBSCAN algorithm\n    from `sklearn`. Visit `TextClustering` step for an example of use.\n    The trained model is saved as an artifact when creating a distiset\n    and pushing it to the Hugging Face Hub.\n\n    Input columns:\n        - projection (`List[float]`): Vector representation of the text to cluster,\n            normally the output from the `UMAP` step.\n\n    Output columns:\n        - cluster_label (`int`): Integer representing the label of a given cluster. -1\n            means it wasn't clustered.\n\n    Categories:\n        - clustering\n        - text-classification\n\n    References:\n        - [`DBSCAN demo of sklearn`](https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#demo-of-dbscan-clustering-algorithm)\n        - [`sklearn dbscan`](https://scikit-learn.org/stable/modules/clustering.html#dbscan)\n\n    Attributes:\n        - eps: The maximum distance between two samples for one to be considered as in the\n            neighborhood of the other. This is not a maximum bound on the distances of\n            points within a cluster. This is the most important DBSCAN parameter to\n            choose appropriately for your data set and distance function.\n        - min_samples: The number of samples (or total weight) in a neighborhood for a point\n            to be considered as a core point. This includes the point itself. 
If `min_samples`\n            is set to a higher value, DBSCAN will find denser clusters, whereas if it is set\n            to a lower value, the found clusters will be more sparse.\n        - metric: The metric to use when calculating distance between instances in a feature\n            array. If metric is a string or callable, it must be one of the options allowed\n            by `sklearn.metrics.pairwise_distances` for its metric parameter.\n        - n_jobs: The number of parallel jobs to run.\n\n    Runtime parameters:\n        - `eps`: The maximum distance between two samples for one to be considered as in the\n            neighborhood of the other. This is not a maximum bound on the distances of\n            points within a cluster. This is the most important DBSCAN parameter to\n            choose appropriately for your data set and distance function.\n        - `min_samples`: The number of samples (or total weight) in a neighborhood for a point\n            to be considered as a core point. This includes the point itself. If `min_samples`\n            is set to a higher value, DBSCAN will find denser clusters, whereas if it is set\n            to a lower value, the found clusters will be more sparse.\n        - `metric`: The metric to use when calculating distance between instances in a feature\n            array. If metric is a string or callable, it must be one of the options allowed\n            by `sklearn.metrics.pairwise_distances` for its metric parameter.\n        - `n_jobs`: The number of parallel jobs to run.\n    \"\"\"\n\n    eps: Optional[RuntimeParameter[float]] = Field(\n        default=0.3,\n        description=(\n            \"The maximum distance between two samples for one to be considered \"\n            \"as in the neighborhood of the other. This is not a maximum bound \"\n            \"on the distances of points within a cluster. 
This is the most \"\n            \"important DBSCAN parameter to choose appropriately for your data set \"\n            \"and distance function.\"\n        ),\n    )\n    min_samples: Optional[RuntimeParameter[int]] = Field(\n        default=30,\n        description=(\n            \"The number of samples (or total weight) in a neighborhood for a point to \"\n            \"be considered as a core point. This includes the point itself. If \"\n            \"`min_samples` is set to a higher value, DBSCAN will find denser clusters, \"\n            \"whereas if it is set to a lower value, the found clusters will be more \"\n            \"sparse.\"\n        ),\n    )\n    metric: Optional[RuntimeParameter[str]] = Field(\n        default=\"euclidean\",\n        description=(\n            \"The metric to use when calculating distance between instances in a \"\n            \"feature array. If metric is a string or callable, it must be one of \"\n            \"the options allowed by `sklearn.metrics.pairwise_distances` for \"\n            \"its metric parameter.\"\n        ),\n    )\n    n_jobs: Optional[RuntimeParameter[int]] = Field(\n        default=8, description=\"The number of parallel jobs to run.\"\n    )\n\n    _clusterer: Optional[\"_DBSCAN\"] = PrivateAttr(None)\n\n    def load(self) -> None:\n        super().load()\n        if importlib.util.find_spec(\"sklearn\") is None:\n            raise ImportError(\n                \"`sklearn` package is not installed. 
Please install it using `pip install scikit-learn`.\"\n            )\n        from sklearn.cluster import DBSCAN as _DBSCAN\n\n        self._clusterer = _DBSCAN(\n            eps=self.eps,\n            min_samples=self.min_samples,\n            metric=self.metric,\n            n_jobs=self.n_jobs,\n        )\n\n    def unload(self) -> None:\n        self._clusterer = None\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"projection\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"cluster_label\"]\n\n    def _save_model(self, model: Any) -> None:\n        import joblib\n\n        def save_model(path):\n            with open(str(path / \"DBSCAN.joblib\"), \"wb\") as f:\n                joblib.dump(model, f)\n\n        self.save_artifact(\n            name=\"DBSCAN_model\",\n            write_function=lambda path: save_model(path),\n            metadata={\n                \"eps\": self.eps,\n                \"min_samples\": self.min_samples,\n                \"metric\": self.metric,\n            },\n        )\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        projections = np.array([input[\"projection\"] for input in inputs])\n\n        self._logger.info(\"\ud83c\udfcb\ufe0f\u200d\u2640\ufe0f Start training DBSCAN...\")\n        fitted_clusterer = self._clusterer.fit(projections)\n        cluster_labels = fitted_clusterer.labels_\n        # Sets the cluster labels for each input, -1 means it wasn't clustered\n        for input, cluster_label in zip(inputs, cluster_labels):\n            input[\"cluster_label\"] = cluster_label\n        self._logger.info(f\"DBSCAN labels assigned: {len(set(cluster_labels))}\")\n        self._save_model(fitted_clusterer)\n        yield inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering","title":"TextClustering","text":"

Bases: TextClassification, GlobalTask

Task that clusters a set of texts and generates summary labels for each cluster.

This is a GlobalTask that inherits from TextClassification, which means that all the attributes from that class are available here. Also, in this case we deal with all the inputs at once instead of using batches; the input_batch_size is used here to send the examples to the LLM in batches (a subtle difference from the more common Task definitions). The task looks in each cluster for a given number of representative examples (the number is set by the samples_per_cluster attribute) and sends them to the LLM to get one or more labels that represent the cluster. The labels are then assigned to each text in the cluster. The clusters and projections used in the step are assumed to be obtained from the UMAP + DBSCAN steps, but could be generated by similar steps, as long as they represent the same concepts. This step runs a pipeline like the one in this repository: https://github.com/huggingface/text-clustering

Input columns
  • text (str): The reference text we want to obtain labels for.
  • projection (List[float]): Vector representation of the text to cluster, normally the output from the UMAP step.
  • cluster_label (int): Integer representing the label of a given cluster. -1 means it wasn't clustered.
Output columns
  • summary_label (str): The label or list of labels for the text.
  • model_name (str): The name of the model used to generate the label/s.
Categories
  • clustering
  • text-classification
References
  • text-clustering repository

Attributes:

Name Type Description savefig

Whether to generate and save a figure with the clustering of the texts.

samples_per_cluster

The number of examples to use in the LLM as a sample of the cluster.

Examples:

Generate labels for a set of texts using clustering:

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps import UMAP, DBSCAN, TextClustering\nfrom distilabel.pipeline import Pipeline\n\nds_name = \"argilla-warehouse/personahub-fineweb-edu-4-clustering-100k\"\n\nwith Pipeline(name=\"Text clustering dataset\") as pipeline:\n    batch_size = 500\n\n    ds = load_dataset(ds_name, split=\"train\").select(range(10000))\n    loader = make_generator_step(ds, batch_size=batch_size, repo_id=ds_name)\n\n    umap = UMAP(n_components=2, metric=\"cosine\")\n    dbscan = DBSCAN(eps=0.3, min_samples=30)\n\n    text_clustering = TextClustering(\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        ),\n        n=3,  # 3 labels per example\n        query_title=\"Examples of Personas\",\n        samples_per_cluster=10,\n        context=(\n            \"Describe the main themes, topics, or categories that could describe the \"\n            \"following types of personas. All the examples of personas must share \"\n            \"the same set of labels.\"\n        ),\n        default_label=\"None\",\n        savefig=True,\n        input_batch_size=8,\n        input_mappings={\"text\": \"persona\"},\n        use_default_structured_output=True,\n    )\n\n    loader >> umap >> dbscan >> text_clustering\n
Source code in src/distilabel/steps/clustering/text_clustering.py
class TextClustering(TextClassification, GlobalTask):\n    \"\"\"Task that clusters a set of texts and generates summary labels for each cluster.\n\n    This is a `GlobalTask` that inherits from `TextClassification`, this means that all\n    the attributes from that class are available here. Also, in this case we deal\n    with all the inputs at once, instead of using batches. The `input_batch_size` is\n    used here to send the examples to the LLM in batches (a subtle difference with the\n    more common `Task` definitions).\n    The task looks in each cluster for a given number of representative examples (the number\n    is set by the `samples_per_cluster` attribute), and sends them to the LLM to get a label/s\n    that represent the cluster. The labels are then assigned to each text in the cluster.\n    The clusters and projections used in the step, are assumed to be obtained from the `UMAP`\n    + `DBSCAN` steps, but could be generated for similar steps, as long as they represent the\n    same concepts.\n    This step runs a pipeline like the one in this repository:\n    https://github.com/huggingface/text-clustering\n\n    Input columns:\n        - text (`str`): The reference text we want to obtain labels for.\n        - projection (`List[float]`): Vector representation of the text to cluster,\n            normally the output from the `UMAP` step.\n        - cluster_label (`int`): Integer representing the label of a given cluster. 
-1\n            means it wasn't clustered.\n\n    Output columns:\n        - summary_label (`str`): The label or list of labels for the text.\n        - model_name (`str`): The name of the model used to generate the label/s.\n\n    Categories:\n        - clustering\n        - text-classification\n\n    References:\n        - [`text-clustering repository`](https://github.com/huggingface/text-clustering)\n\n    Attributes:\n        - savefig: Whether to generate and save a figure with the clustering of the texts.\n        - samples_per_cluster: The number of examples to use in the LLM as a sample of the cluster.\n\n    Examples:\n        Generate labels for a set of texts using clustering:\n\n        ```python\n        from distilabel.models import InferenceEndpointsLLM\n        from distilabel.steps import UMAP, DBSCAN, TextClustering\n        from distilabel.pipeline import Pipeline\n\n        ds_name = \"argilla-warehouse/personahub-fineweb-edu-4-clustering-100k\"\n\n        with Pipeline(name=\"Text clustering dataset\") as pipeline:\n            batch_size = 500\n\n            ds = load_dataset(ds_name, split=\"train\").select(range(10000))\n            loader = make_generator_step(ds, batch_size=batch_size, repo_id=ds_name)\n\n            umap = UMAP(n_components=2, metric=\"cosine\")\n            dbscan = DBSCAN(eps=0.3, min_samples=30)\n\n            text_clustering = TextClustering(\n                llm=InferenceEndpointsLLM(\n                    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                    tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                ),\n                n=3,  # 3 labels per example\n                query_title=\"Examples of Personas\",\n                samples_per_cluster=10,\n                context=(\n                    \"Describe the main themes, topics, or categories that could describe the \"\n                    \"following types of personas. 
All the examples of personas must share \"\n                    \"the same set of labels.\"\n                ),\n                default_label=\"None\",\n                savefig=True,\n                input_batch_size=8,\n                input_mappings={\"text\": \"persona\"},\n                use_default_structured_output=True,\n            )\n\n            loader >> umap >> dbscan >> text_clustering\n        ```\n    \"\"\"\n\n    savefig: Optional[RuntimeParameter[bool]] = Field(\n        default=True,\n        description=\"Whether to generate and save a figure with the clustering of the texts.\",\n    )\n    samples_per_cluster: int = Field(\n        default=10,\n        description=\"The number of examples to use in the LLM as a sample of the cluster.\",\n    )\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task are the same as those for `TextClassification` plus\n        the `projection` and `cluster_label` columns (which can be obtained from\n        UMAP + DBSCAN steps).\n        \"\"\"\n        return super().inputs + [\"projection\", \"cluster_label\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task is the `summary_label` and the `model_name`.\"\"\"\n        return [\"summary_label\", \"model_name\"]\n\n    def load(self) -> None:\n        super().load()\n        if self.savefig and (importlib.util.find_spec(\"matplotlib\") is None):\n            raise ImportError(\n                \"`matplotlib` package is not installed. 
Please install it using `pip install matplotlib`.\"\n            )\n\n    def _save_figure(\n        self,\n        data: pd.DataFrame,\n        cluster_centers: Dict[str, Tuple[float, float]],\n        cluster_summaries: Dict[int, str],\n    ) -> None:\n        \"\"\"Saves the figure starting from the dataframe, using matplotlib.\n\n        Args:\n            data: pd.DataFrame with the columns 'X', 'Y' and 'labels' representing\n                the projections and the label of each text respectively.\n            cluster_centers: Dictionary mapping from each label the center of a cluster,\n                to help with the placement of the annotations.\n            cluster_summaries: The summaries of the clusters, obtained from the LLM.\n        \"\"\"\n        import matplotlib.pyplot as plt\n\n        fig, ax = plt.subplots(figsize=(12, 8), dpi=300)\n        unique_labels = data[\"labels\"].unique()\n        # Map of colors for each label (-1 is black)\n        colormap = dict(\n            zip(unique_labels, plt.cm.Spectral(np.linspace(0, 1, len(unique_labels))))\n        )\n        colormap[-1] = np.array([0, 0, 0, 0])\n        data[\"color\"] = data[\"labels\"].map(colormap)\n\n        data.plot(\n            kind=\"scatter\",\n            x=\"X\",\n            y=\"Y\",\n            c=\"color\",\n            s=0.75,\n            alpha=0.8,\n            linewidth=0.4,\n            ax=ax,\n            colorbar=False,\n        )\n\n        for label in cluster_summaries.keys():\n            if label == -1:\n                continue\n            summary = str(cluster_summaries[label])  # These are obtained from the LLM\n            position = cluster_centers[label]\n            t = ax.text(\n                position[0],\n                position[1],\n                summary,\n                horizontalalignment=\"center\",\n                verticalalignment=\"center\",\n                fontsize=4,\n            )\n            t.set_bbox(\n                {\n       
             \"facecolor\": \"white\",\n                    \"alpha\": 0.9,\n                    \"linewidth\": 0,\n                    \"boxstyle\": \"square,pad=0.1\",\n                }\n            )\n\n        ax.set_axis_off()\n        # Save the plot as an artifact of the step\n        self.save_artifact(\n            name=\"Text clusters\",\n            write_function=lambda path: fig.savefig(path / \"figure_clustering.png\"),\n            metadata={\"type\": \"image\", \"library\": \"matplotlib\"},\n        )\n        plt.close()\n\n    def _create_figure(\n        self,\n        inputs: StepInput,\n        label2docs: Dict[int, List[str]],\n        cluster_summaries: Dict[int, str],\n    ) -> None:\n        \"\"\"Creates a figure of the clustered texts and save it as an artifact.\n\n        Args:\n            inputs: The inputs of the step, as we will extract information from them again.\n            label2docs: Map from each label to the list of documents (texts) that belong to that cluster.\n            cluster_summaries: The summaries of the clusters, obtained from the LLM.\n        \"\"\"\n        self._logger.info(\"\ud83d\uddbc\ufe0f Creating figure for the clusters...\")\n\n        labels = []\n        projections = []\n        id2cluster = {}\n        for i, input in enumerate(inputs):\n            label = input[\"cluster_label\"]\n            id2cluster[i] = label\n            labels.append(label)\n            projections.append(input[\"projection\"])\n\n        projections = np.array(projections)\n\n        # Contains the placement of the cluster centers in the figure\n        cluster_centers: Dict[str, Tuple[float, float]] = {}\n        for label in label2docs.keys():\n            x = np.mean([projections[doc, 0] for doc in label2docs[label]])\n            y = np.mean([projections[doc, 1] for doc in label2docs[label]])\n            cluster_centers[label] = (x, y)\n\n        df = pd.DataFrame(\n            data={\n                \"X\": 
projections[:, 0],\n                \"Y\": projections[:, 1],\n                \"labels\": labels,\n            }\n        )\n\n        self._save_figure(\n            df, cluster_centers=cluster_centers, cluster_summaries=cluster_summaries\n        )\n\n    def _prepare_input_texts(\n        self,\n        inputs: StepInput,\n        label2docs: Dict[int, List[int]],\n        unique_labels: List[int],\n    ) -> List[Dict[str, Union[str, int]]]:\n        \"\"\"Prepares a batch of inputs to send to the LLM, with the examples of each cluster.\n\n        Args:\n            inputs: Inputs from the step.\n            label2docs: Map from each label to the list of documents (texts) that\n                belong to that cluster.\n            unique_labels: The unique labels of the clusters.\n\n        Returns:\n            The input texts to send to the LLM, with the examples of each cluster\n            prepared to be used in the prompt, and an additional key to store the\n            labels (that will be needed to find the data after the batches are\n            returned from the LLM).\n        \"\"\"\n        input_texts = []\n        for label in range(unique_labels):  # The label -1 is implicitly excluded\n            # Get the ids but remove possible duplicates, which could happen with bigger probability\n            # the bigger the number of examples requested, and the smaller the subset of examples\n            ids = set(\n                np.random.choice(label2docs[label], size=self.samples_per_cluster)\n            )  # Grab the number of examples\n            examples = [inputs[i][\"text\"] for i in ids]\n            input_text = {\n                \"text\": \"\\n\\n\".join(\n                    [f\"Example {i}:\\n{t}\" for i, t in enumerate(examples, start=1)]\n                ),\n                \"__LABEL\": label,\n            }\n            input_texts.append(input_text)\n        return input_texts\n\n    def process(self, inputs: StepInput) -> 
\"StepOutput\":\n        labels = [input[\"cluster_label\"] for input in inputs]\n        # -1 because -1 is the label for the unclassified\n        unique_labels = len(set(labels)) - 1\n        # This will be the output of the LLM, the set of labels for each cluster\n        cluster_summaries: Dict[int, str] = {-1: self.default_label}\n\n        # Map from label to list of documents, will use them to select examples from each cluster\n        label2docs = defaultdict(list)\n        for i, label in enumerate(labels):\n            label2docs[label].append(i)\n\n        input_texts = self._prepare_input_texts(inputs, label2docs, unique_labels)\n\n        # Send the texts in batches to the LLM, and get the labels for each cluster\n        for i, batched_inputs in enumerate(batched(input_texts, self.input_batch_size)):\n            self._logger.info(f\"\ud83d\udce6 Processing internal batch of inputs {i}...\")\n            results = super().process(batched_inputs)\n            for result in next(results):  # Extract the elements from the generator\n                cluster_summaries[result[\"__LABEL\"]] = result[\"labels\"]\n\n        # Assign the labels to each text\n        for input in inputs:\n            input[\"summary_label\"] = json.dumps(\n                cluster_summaries[input[\"cluster_label\"]]\n            )\n\n        if self.savefig:\n            self._create_figure(inputs, label2docs, cluster_summaries)\n\n        yield inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering.inputs","title":"inputs: List[str] property","text":"

The inputs for the task are the same as those for TextClassification plus the projection and cluster_label columns (which can be obtained from UMAP + DBSCAN steps).

"},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering.outputs","title":"outputs: List[str] property","text":"

The output for the task is the summary_label and the model_name.

"},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering._save_figure","title":"_save_figure(data, cluster_centers, cluster_summaries)","text":"

Saves the figure starting from the dataframe, using matplotlib.

Parameters:

Name Type Description Default data DataFrame

pd.DataFrame with the columns 'X', 'Y' and 'labels' representing the projections and the label of each text respectively.

required cluster_centers Dict[str, Tuple[float, float]]

Dictionary mapping each label to the center of its cluster, to help with the placement of the annotations.

required cluster_summaries Dict[int, str]

The summaries of the clusters, obtained from the LLM.

required Source code in src/distilabel/steps/clustering/text_clustering.py
def _save_figure(\n    self,\n    data: pd.DataFrame,\n    cluster_centers: Dict[str, Tuple[float, float]],\n    cluster_summaries: Dict[int, str],\n) -> None:\n    \"\"\"Saves the figure starting from the dataframe, using matplotlib.\n\n    Args:\n        data: pd.DataFrame with the columns 'X', 'Y' and 'labels' representing\n            the projections and the label of each text respectively.\n        cluster_centers: Dictionary mapping from each label the center of a cluster,\n            to help with the placement of the annotations.\n        cluster_summaries: The summaries of the clusters, obtained from the LLM.\n    \"\"\"\n    import matplotlib.pyplot as plt\n\n    fig, ax = plt.subplots(figsize=(12, 8), dpi=300)\n    unique_labels = data[\"labels\"].unique()\n    # Map of colors for each label (-1 is black)\n    colormap = dict(\n        zip(unique_labels, plt.cm.Spectral(np.linspace(0, 1, len(unique_labels))))\n    )\n    colormap[-1] = np.array([0, 0, 0, 0])\n    data[\"color\"] = data[\"labels\"].map(colormap)\n\n    data.plot(\n        kind=\"scatter\",\n        x=\"X\",\n        y=\"Y\",\n        c=\"color\",\n        s=0.75,\n        alpha=0.8,\n        linewidth=0.4,\n        ax=ax,\n        colorbar=False,\n    )\n\n    for label in cluster_summaries.keys():\n        if label == -1:\n            continue\n        summary = str(cluster_summaries[label])  # These are obtained from the LLM\n        position = cluster_centers[label]\n        t = ax.text(\n            position[0],\n            position[1],\n            summary,\n            horizontalalignment=\"center\",\n            verticalalignment=\"center\",\n            fontsize=4,\n        )\n        t.set_bbox(\n            {\n                \"facecolor\": \"white\",\n                \"alpha\": 0.9,\n                \"linewidth\": 0,\n                \"boxstyle\": \"square,pad=0.1\",\n            }\n        )\n\n    ax.set_axis_off()\n    # Save the plot as an artifact of the step\n    
self.save_artifact(\n        name=\"Text clusters\",\n        write_function=lambda path: fig.savefig(path / \"figure_clustering.png\"),\n        metadata={\"type\": \"image\", \"library\": \"matplotlib\"},\n    )\n    plt.close()\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering._create_figure","title":"_create_figure(inputs, label2docs, cluster_summaries)","text":"

Creates a figure of the clustered texts and saves it as an artifact.

Parameters:

Name Type Description Default inputs StepInput

The inputs of the step, as we will extract information from them again.

required label2docs Dict[int, List[str]]

Map from each label to the list of documents (texts) that belong to that cluster.

required cluster_summaries Dict[int, str]

The summaries of the clusters, obtained from the LLM.

required Source code in src/distilabel/steps/clustering/text_clustering.py
def _create_figure(\n    self,\n    inputs: StepInput,\n    label2docs: Dict[int, List[str]],\n    cluster_summaries: Dict[int, str],\n) -> None:\n    \"\"\"Creates a figure of the clustered texts and save it as an artifact.\n\n    Args:\n        inputs: The inputs of the step, as we will extract information from them again.\n        label2docs: Map from each label to the list of documents (texts) that belong to that cluster.\n        cluster_summaries: The summaries of the clusters, obtained from the LLM.\n    \"\"\"\n    self._logger.info(\"\ud83d\uddbc\ufe0f Creating figure for the clusters...\")\n\n    labels = []\n    projections = []\n    id2cluster = {}\n    for i, input in enumerate(inputs):\n        label = input[\"cluster_label\"]\n        id2cluster[i] = label\n        labels.append(label)\n        projections.append(input[\"projection\"])\n\n    projections = np.array(projections)\n\n    # Contains the placement of the cluster centers in the figure\n    cluster_centers: Dict[str, Tuple[float, float]] = {}\n    for label in label2docs.keys():\n        x = np.mean([projections[doc, 0] for doc in label2docs[label]])\n        y = np.mean([projections[doc, 1] for doc in label2docs[label]])\n        cluster_centers[label] = (x, y)\n\n    df = pd.DataFrame(\n        data={\n            \"X\": projections[:, 0],\n            \"Y\": projections[:, 1],\n            \"labels\": labels,\n        }\n    )\n\n    self._save_figure(\n        df, cluster_centers=cluster_centers, cluster_summaries=cluster_summaries\n    )\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering._prepare_input_texts","title":"_prepare_input_texts(inputs, label2docs, unique_labels)","text":"

Prepares a batch of inputs to send to the LLM, with the examples of each cluster.

Parameters:

Name Type Description Default inputs StepInput

Inputs from the step.

required label2docs Dict[int, List[int]]

Map from each label to the list of documents (texts) that belong to that cluster.

required unique_labels List[int]

The unique labels of the clusters.

required

Returns:

Type Description List[Dict[str, Union[str, int]]]

The input texts to send to the LLM, with the examples of each cluster prepared to be used in the prompt, and an additional key to store the labels (that will be needed to find the data after the batches are returned from the LLM).

Source code in src/distilabel/steps/clustering/text_clustering.py
def _prepare_input_texts(\n    self,\n    inputs: StepInput,\n    label2docs: Dict[int, List[int]],\n    unique_labels: List[int],\n) -> List[Dict[str, Union[str, int]]]:\n    \"\"\"Prepares a batch of inputs to send to the LLM, with the examples of each cluster.\n\n    Args:\n        inputs: Inputs from the step.\n        label2docs: Map from each label to the list of documents (texts) that\n            belong to that cluster.\n        unique_labels: The unique labels of the clusters.\n\n    Returns:\n        The input texts to send to the LLM, with the examples of each cluster\n        prepared to be used in the prompt, and an additional key to store the\n        labels (that will be needed to find the data after the batches are\n        returned from the LLM).\n    \"\"\"\n    input_texts = []\n    for label in range(unique_labels):  # The label -1 is implicitly excluded\n        # Get the ids but remove possible duplicates, which could happen with bigger probability\n        # the bigger the number of examples requested, and the smaller the subset of examples\n        ids = set(\n            np.random.choice(label2docs[label], size=self.samples_per_cluster)\n        )  # Grab the number of examples\n        examples = [inputs[i][\"text\"] for i in ids]\n        input_text = {\n            \"text\": \"\\n\\n\".join(\n                [f\"Example {i}:\\n{t}\" for i, t in enumerate(examples, start=1)]\n            ),\n            \"__LABEL\": label,\n        }\n        input_texts.append(input_text)\n    return input_texts\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.UMAP","title":"UMAP","text":"

Bases: GlobalStep

UMAP is a general purpose manifold learning and dimension reduction algorithm.

This is a GlobalStep that reduces the dimensionality of the embeddings using UMAP. Visit the TextClustering step for an example of use. The trained model is saved as an artifact when creating a distiset and pushing it to the Hugging Face Hub.

Input columns
  • embedding (List[float]): The original embeddings whose dimension we want to reduce.
Output columns
  • projection (List[float]): Embedding reduced to the number of components specified, the size of the new embeddings will be determined by the n_components.
Categories
  • clustering
  • text-classification
References
  • UMAP repository
  • UMAP documentation

Attributes:

Name Type Description - n_components

The dimension of the space to embed into. This defaults to 2 to provide easy visualization (that's probably what you want), but can reasonably be set to any integer value in the range 2 to 100.

- metric

The metric to use to compute distances in high dimensional space. Visit UMAP's documentation for more information. Defaults to euclidean.

- n_jobs

The number of parallel jobs to run. Defaults to 8.

- random_state

The random state to use for the UMAP algorithm.

Runtime parameters
  • n_components: The dimension of the space to embed into. This defaults to 2 to provide easy visualization (that's probably what you want), but can reasonably be set to any integer value in the range 2 to 100.
  • metric: The metric to use to compute distances in high dimensional space. Visit UMAP's documentation for more information. Defaults to euclidean.
  • n_jobs: The number of parallel jobs to run. Defaults to 8.
  • random_state: The random state to use for the UMAP algorithm.
Citations
@misc{mcinnes2020umapuniformmanifoldapproximation,\n    title={UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction},\n    author={Leland McInnes and John Healy and James Melville},\n    year={2020},\n    eprint={1802.03426},\n    archivePrefix={arXiv},\n    primaryClass={stat.ML},\n    url={https://arxiv.org/abs/1802.03426},\n}\n
Source code in src/distilabel/steps/clustering/umap.py
class UMAP(GlobalStep):\n    r\"\"\"UMAP is a general purpose manifold learning and dimension reduction algorithm.\n\n    This is a `GlobalStep` that reduces the dimensionality of the embeddings using. Visit\n    the `TextClustering` step for an example of use. The trained model is saved as an artifact\n    when creating a distiset and pushing it to the Hugging Face Hub.\n\n    Input columns:\n        - embedding (`List[float]`): The original embeddings we want to reduce the dimension.\n\n    Output columns:\n        - projection (`List[float]`): Embedding reduced to the number of components specified,\n            the size of the new embeddings will be determined by the `n_components`.\n\n    Categories:\n        - clustering\n        - text-classification\n\n    References:\n        - [`UMAP repository`](https://github.com/lmcinnes/umap/tree/master)\n        - [`UMAP documentation`](https://umap-learn.readthedocs.io/en/latest/)\n\n    Attributes:\n        - n_components: The dimension of the space to embed into. This defaults to 2 to\n            provide easy visualization (that's probably what you want), but can\n            reasonably be set to any integer value in the range 2 to 100.\n        - metric: The metric to use to compute distances in high dimensional space.\n            Visit UMAP's documentation for more information. Defaults to `euclidean`.\n        - n_jobs: The number of parallel jobs to run. Defaults to `8`.\n        - random_state: The random state to use for the UMAP algorithm.\n\n    Runtime parameters:\n        - `n_components`: The dimension of the space to embed into. This defaults to 2 to\n            provide easy visualization (that's probably what you want), but can\n            reasonably be set to any integer value in the range 2 to 100.\n        - `metric`: The metric to use to compute distances in high dimensional space.\n            Visit UMAP's documentation for more information. 
Defaults to `euclidean`.\n        - `n_jobs`: The number of parallel jobs to run. Defaults to `8`.\n        - `random_state`: The random state to use for the UMAP algorithm.\n\n    Citations:\n        ```\n        @misc{mcinnes2020umapuniformmanifoldapproximation,\n            title={UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction},\n            author={Leland McInnes and John Healy and James Melville},\n            year={2020},\n            eprint={1802.03426},\n            archivePrefix={arXiv},\n            primaryClass={stat.ML},\n            url={https://arxiv.org/abs/1802.03426},\n        }\n        ```\n    \"\"\"\n\n    n_components: Optional[RuntimeParameter[int]] = Field(\n        default=2,\n        description=(\n            \"The dimension of the space to embed into. This defaults to 2 to \"\n            \"provide easy visualization, but can reasonably be set to any \"\n            \"integer value in the range 2 to 100.\"\n        ),\n    )\n    metric: Optional[RuntimeParameter[str]] = Field(\n        default=\"euclidean\",\n        description=(\n            \"The metric to use to compute distances in high dimensional space. \"\n            \"Visit UMAP's documentation for more information.\"\n        ),\n    )\n    n_jobs: Optional[RuntimeParameter[int]] = Field(\n        default=8, description=\"The number of parallel jobs to run.\"\n    )\n    random_state: Optional[RuntimeParameter[int]] = Field(\n        default=None, description=\"The random state to use for the UMAP algorithm.\"\n    )\n\n    _umap: Optional[\"_UMAP\"] = PrivateAttr(None)\n\n    def load(self) -> None:\n        super().load()\n        if importlib.util.find_spec(\"umap\") is None:\n            raise ImportError(\n                \"`umap` package is not installed. 
Please install it using `pip install umap-learn`.\"\n            )\n        from umap import UMAP as _UMAP\n\n        self._umap = _UMAP(\n            n_components=self.n_components,\n            metric=self.metric,\n            n_jobs=self.n_jobs,\n            random_state=self.random_state,\n        )\n\n    def unload(self) -> None:\n        self._umap = None\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"embedding\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"projection\"]\n\n    def _save_model(self, model: Any) -> None:\n        import joblib\n\n        def save_model(path):\n            with open(str(path / \"UMAP.joblib\"), \"wb\") as f:\n                joblib.dump(model, f)\n\n        self.save_artifact(\n            name=\"UMAP_model\",\n            write_function=lambda path: save_model(path),\n            metadata={\n                \"n_components\": self.n_components,\n                \"metric\": self.metric,\n            },\n        )\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        # Shape of the embeddings is (n_samples, n_features)\n        embeddings = np.array([input[\"embedding\"] for input in inputs])\n\n        self._logger.info(\"\ud83c\udfcb\ufe0f\u200d\u2640\ufe0f Start UMAP training...\")\n        mapper = self._umap.fit(embeddings)\n        # Shape of the projection will be (n_samples, n_components)\n        for input, projection in zip(inputs, mapper.embedding_):\n            input[\"projection\"] = projection\n\n        self._save_model(mapper)\n        yield inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.CombineOutputs","title":"CombineOutputs","text":"

Bases: Step

Combine the outputs of several upstream steps.

CombineOutputs is a Step that takes the outputs of several upstream steps and combines them to generate a new dictionary with all keys/columns of the upstream steps outputs.

Input columns
  • dynamic (based on the upstream Steps): All the columns of the upstream steps outputs.
Output columns
  • dynamic (based on the upstream Steps): All the columns of the upstream steps outputs.
Categories
  • columns

Examples:

Combine dictionaries of a dataset:\n\n```python\nfrom distilabel.steps import CombineOutputs\n\ncombine_outputs = CombineOutputs()\ncombine_outputs.load()\n\nresult = next(\n    combine_outputs.process(\n        [{\"a\": 1, \"b\": 2}, {\"a\": 3, \"b\": 4}],\n        [{\"c\": 5, \"d\": 6}, {\"c\": 7, \"d\": 8}],\n    )\n)\n# [\n#   {\"a\": 1, \"b\": 2, \"c\": 5, \"d\": 6},\n#   {\"a\": 3, \"b\": 4, \"c\": 7, \"d\": 8},\n# ]\n```\n\nCombine upstream steps outputs in a pipeline:\n\n```python\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import CombineOutputs\n\nwith Pipeline() as pipeline:\n    step_1 = ...\n    step_2 = ...\n    step_3 = ...\n    combine = CombineOutputs()\n\n    [step_1, step_2, step_3] >> combine\n```\n
Source code in src/distilabel/steps/columns/combine.py
class CombineOutputs(Step):\n    \"\"\"Combine the outputs of several upstream steps.\n\n    `CombineOutputs` is a `Step` that takes the outputs of several upstream steps and combines\n    them to generate a new dictionary with all keys/columns of the upstream steps outputs.\n\n    Input columns:\n        - dynamic (based on the upstream `Step`s): All the columns of the upstream steps outputs.\n\n    Output columns:\n        - dynamic (based on the upstream `Step`s): All the columns of the upstream steps outputs.\n\n    Categories:\n        - columns\n\n    Examples:\n\n        Combine dictionaries of a dataset:\n\n        ```python\n        from distilabel.steps import CombineOutputs\n\n        combine_outputs = CombineOutputs()\n        combine_outputs.load()\n\n        result = next(\n            combine_outputs.process(\n                [{\"a\": 1, \"b\": 2}, {\"a\": 3, \"b\": 4}],\n                [{\"c\": 5, \"d\": 6}, {\"c\": 7, \"d\": 8}],\n            )\n        )\n        # [\n        #   {\"a\": 1, \"b\": 2, \"c\": 5, \"d\": 6},\n        #   {\"a\": 3, \"b\": 4, \"c\": 7, \"d\": 8},\n        # ]\n        ```\n\n        Combine upstream steps outputs in a pipeline:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps import CombineOutputs\n\n        with Pipeline() as pipeline:\n            step_1 = ...\n            step_2 = ...\n            step_3 = ...\n            combine = CombineOutputs()\n\n            [step_1, step_2, step_3] >> combine\n        ```\n    \"\"\"\n\n    def process(self, *inputs: StepInput) -> \"StepOutput\":\n        combined_outputs = []\n        for output_dicts in zip(*inputs):\n            combined_dict = {}\n            for output_dict in output_dicts:\n                combined_dict.update(\n                    {\n                        k: v\n                        for k, v in output_dict.items()\n                        if k != DISTILABEL_METADATA_KEY\n                    }\n  
              )\n\n            if any(\n                DISTILABEL_METADATA_KEY in output_dict for output_dict in output_dicts\n            ):\n                combined_dict[DISTILABEL_METADATA_KEY] = merge_distilabel_metadata(\n                    *output_dicts\n                )\n            combined_outputs.append(combined_dict)\n\n        yield combined_outputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering","title":"DeitaFiltering","text":"

Bases: GlobalStep

Filter dataset rows using DEITA filtering strategy.

Filter the dataset based on the DEITA score and the cosine distance between the embeddings. It's an implementation of the filtering step from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.

Attributes:

Name Type Description data_budget RuntimeParameter[int]

The desired size of the dataset after filtering.

diversity_threshold RuntimeParameter[float]

If a row has a cosine distance with respect to its nearest neighbor greater than this value, it will be included in the filtered dataset. Defaults to 0.9.

normalize_embeddings RuntimeParameter[bool]

Whether to normalize the embeddings before computing the cosine distance. Defaults to True.

Runtime parameters
  • data_budget: The desired size of the dataset after filtering.
  • diversity_threshold: If a row has a cosine distance with respect to its nearest neighbor greater than this value, it will be included in the filtered dataset.
Input columns
  • evol_instruction_score (float): The score of the instruction generated by ComplexityScorer step.
  • evol_response_score (float): The score of the response generated by QualityScorer step.
  • embedding (List[float]): The embedding generated for the conversation of the instruction-response pair using GenerateEmbeddings step.
Output columns
  • deita_score (float): The DEITA score for the instruction-response pair.
  • deita_score_computed_with (List[str]): The scores used to compute the DEITA score.
  • nearest_neighbor_distance (float): The distance between the embedding of the instruction-response pair and its nearest neighbor (cosine distance by default).
Categories
  • filtering
References
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning

Examples:

Filter the dataset based on the DEITA score and the cosine distance between the embeddings:

from distilabel.steps import DeitaFiltering\n\ndeita_filtering = DeitaFiltering(data_budget=1)\n\ndeita_filtering.load()\n\nresult = next(\n    deita_filtering.process(\n        [\n            {\n                \"evol_instruction_score\": 0.5,\n                \"evol_response_score\": 0.5,\n                \"embedding\": [-8.12729941, -5.24642847, -6.34003029],\n            },\n            {\n                \"evol_instruction_score\": 0.6,\n                \"evol_response_score\": 0.6,\n                \"embedding\": [2.99329242, 0.7800932, 0.7799726],\n            },\n            {\n                \"evol_instruction_score\": 0.7,\n                \"evol_response_score\": 0.7,\n                \"embedding\": [10.29041806, 14.33088073, 13.00557506],\n            },\n        ],\n    )\n)\n# >>> result\n# [{'evol_instruction_score': 0.5, 'evol_response_score': 0.5, 'embedding': [-8.12729941, -5.24642847, -6.34003029], 'deita_score': 0.25, 'deita_score_computed_with': ['evol_instruction_score', 'evol_response_score'], 'nearest_neighbor_distance': 1.9042812683723933}]\n
Citations
@misc{liu2024makesgooddataalignment,\n    title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n    author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n    year={2024},\n    eprint={2312.15685},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2312.15685},\n}\n
Source code in src/distilabel/steps/deita.py
class DeitaFiltering(GlobalStep):\n    \"\"\"Filter dataset rows using DEITA filtering strategy.\n\n    Filter the dataset based on the DEITA score and the cosine distance between the embeddings.\n    It's an implementation of the filtering step from the paper 'What Makes Good Data\n    for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.\n\n    Attributes:\n        data_budget: The desired size of the dataset after filtering.\n        diversity_threshold: If a row has a cosine distance with respect to it's nearest\n            neighbor greater than this value, it will be included in the filtered dataset.\n            Defaults to `0.9`.\n        normalize_embeddings: Whether to normalize the embeddings before computing the cosine\n            distance. Defaults to `True`.\n\n    Runtime parameters:\n        - `data_budget`: The desired size of the dataset after filtering.\n        - `diversity_threshold`: If a row has a cosine distance with respect to it's nearest\n            neighbor greater than this value, it will be included in the filtered dataset.\n\n    Input columns:\n        - evol_instruction_score (`float`): The score of the instruction generated by\n            `ComplexityScorer` step.\n        - evol_response_score (`float`): The score of the response generated by\n            `QualityScorer` step.\n        - embedding (`List[float]`): The embedding generated for the conversation of the\n            instruction-response pair using `GenerateEmbeddings` step.\n\n    Output columns:\n        - deita_score (`float`): The DEITA score for the instruction-response pair.\n        - deita_score_computed_with (`List[str]`): The scores used to compute the DEITA\n            score.\n        - nearest_neighbor_distance (`float`): The cosine distance between the embeddings\n            of the instruction-response pair.\n\n    Categories:\n        - filtering\n\n    References:\n        - [`What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685)\n\n    Examples:\n        Filter the dataset based on the DEITA score and the cosine distance between the embeddings:\n\n        ```python\n        from distilabel.steps import DeitaFiltering\n\n        deita_filtering = DeitaFiltering(data_budget=1)\n\n        deita_filtering.load()\n\n        result = next(\n            deita_filtering.process(\n                [\n                    {\n                        \"evol_instruction_score\": 0.5,\n                        \"evol_response_score\": 0.5,\n                        \"embedding\": [-8.12729941, -5.24642847, -6.34003029],\n                    },\n                    {\n                        \"evol_instruction_score\": 0.6,\n                        \"evol_response_score\": 0.6,\n                        \"embedding\": [2.99329242, 0.7800932, 0.7799726],\n                    },\n                    {\n                        \"evol_instruction_score\": 0.7,\n                        \"evol_response_score\": 0.7,\n                        \"embedding\": [10.29041806, 14.33088073, 13.00557506],\n                    },\n                ],\n            )\n        )\n        # >>> result\n        # [{'evol_instruction_score': 0.5, 'evol_response_score': 0.5, 'embedding': [-8.12729941, -5.24642847, -6.34003029], 'deita_score': 0.25, 'deita_score_computed_with': ['evol_instruction_score', 'evol_response_score'], 'nearest_neighbor_distance': 1.9042812683723933}]\n        ```\n\n    Citations:\n        ```\n        @misc{liu2024makesgooddataalignment,\n            title={What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n            author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n            year={2024},\n            eprint={2312.15685},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2312.15685},\n        }\n        ```\n    \"\"\"\n\n    data_budget: RuntimeParameter[int] = Field(\n        default=None, description=\"The desired size of the dataset after filtering.\"\n    )\n    diversity_threshold: RuntimeParameter[float] = Field(\n        default=0.9,\n        description=\"If a row has a cosine distance with respect to it's nearest neighbor\"\n        \" greater than this value, it will be included in the filtered dataset.\",\n    )\n    normalize_embeddings: RuntimeParameter[bool] = Field(\n        default=True,\n        description=\"Whether to normalize the embeddings before computing the cosine distance.\",\n    )\n    distance_metric: RuntimeParameter[Literal[\"cosine\", \"manhattan\"]] = Field(\n        default=\"cosine\",\n        description=\"The distance metric to use. 
Currently only 'cosine' is supported.\",\n    )\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"evol_instruction_score\", \"evol_response_score\", \"embedding\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"deita_score\", \"nearest_neighbor_distance\", \"deita_score_computed_with\"]\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Filter the dataset based on the DEITA score and the cosine distance between the\n        embeddings.\n\n        Args:\n            inputs: The input data.\n\n        Returns:\n            The filtered dataset.\n        \"\"\"\n        inputs = self._compute_deita_score(inputs)\n        inputs = self._compute_nearest_neighbor(inputs)\n        inputs.sort(key=lambda x: x[\"deita_score\"], reverse=True)\n\n        selected_rows = []\n        for input in inputs:\n            if len(selected_rows) >= self.data_budget:  # type: ignore\n                break\n            if input[\"nearest_neighbor_distance\"] >= self.diversity_threshold:\n                selected_rows.append(input)\n        yield selected_rows\n\n    def _compute_deita_score(self, inputs: StepInput) -> StepInput:\n        \"\"\"Computes the DEITA score for each instruction-response pair. 
The DEITA score is\n        the product of the instruction score and the response score.\n\n        Args:\n            inputs: The input data.\n\n        Returns:\n            The input data with the DEITA score computed.\n        \"\"\"\n        for input_ in inputs:\n            evol_instruction_score = input_.get(\"evol_instruction_score\")\n            evol_response_score = input_.get(\"evol_response_score\")\n\n            if evol_instruction_score and evol_response_score:\n                deita_score = evol_instruction_score * evol_response_score\n                score_computed_with = [\"evol_instruction_score\", \"evol_response_score\"]\n            elif evol_instruction_score:\n                self._logger.warning(\n                    \"Response score is missing for the instruction-response pair. Using\"\n                    \" instruction score as DEITA score.\"\n                )\n                deita_score = evol_instruction_score\n                score_computed_with = [\"evol_instruction_score\"]\n            elif evol_response_score:\n                self._logger.warning(\n                    \"Instruction score is missing for the instruction-response pair. Using\"\n                    \" response score as DEITA score.\"\n                )\n                deita_score = evol_response_score\n                score_computed_with = [\"evol_response_score\"]\n            else:\n                self._logger.warning(\n                    \"Instruction and response scores are missing for the instruction-response\"\n                    \" pair. 
Setting DEITA score to 0.\"\n                )\n                deita_score = 0\n                score_computed_with = []\n\n            input_.update(\n                {\n                    \"deita_score\": deita_score,\n                    \"deita_score_computed_with\": score_computed_with,\n                }\n            )\n        return inputs\n\n    def _compute_nearest_neighbor(self, inputs: StepInput) -> StepInput:\n        \"\"\"Computes the cosine distance between the embeddings of the instruction-response\n        pairs and the nearest neighbor.\n\n        Args:\n            inputs: The input data.\n\n        Returns:\n            The input data with the cosine distance computed.\n        \"\"\"\n        embeddings = np.array([input[\"embedding\"] for input in inputs])\n        if self.normalize_embeddings:\n            embeddings = self._normalize_embeddings(embeddings)\n        self._logger.info(\"\ud83d\udccf Computing nearest neighbor distance...\")\n\n        if self.distance_metric == \"cosine\":\n            self._logger.info(\"\ud83d\udccf Using cosine distance.\")\n            distances = self._cosine_distance(embeddings)\n        else:\n            self._logger.info(\"\ud83d\udccf Using manhattan distance.\")\n            distances = self._manhattan_distance(embeddings)\n\n        for distance, input in zip(distances, inputs):\n            input[\"nearest_neighbor_distance\"] = distance\n        return inputs\n\n    def _normalize_embeddings(self, embeddings: np.ndarray) -> np.ndarray:\n        \"\"\"Normalize the embeddings.\n\n        Args:\n            embeddings: The embeddings to normalize.\n\n        Returns:\n            The normalized embeddings.\n        \"\"\"\n        self._logger.info(\"\u2696\ufe0f Normalizing embeddings...\")\n        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)\n        return embeddings / norms\n\n    def _cosine_distance(self, embeddings: np.array) -> np.array:  # type: ignore\n        
\"\"\"Computes the cosine distance between the embeddings.\n\n        Args:\n            embeddings: The embeddings.\n\n        Returns:\n            The cosine distance between the embeddings.\n        \"\"\"\n        cosine_similarity = np.dot(embeddings, embeddings.T)\n        cosine_distance = 1 - cosine_similarity\n        # Ignore self-distance\n        np.fill_diagonal(cosine_distance, np.inf)\n        return np.min(cosine_distance, axis=1)\n\n    def _manhattan_distance(self, embeddings: np.array) -> np.array:  # type: ignore\n        \"\"\"Computes the manhattan distance between the embeddings.\n\n        Args:\n            embeddings: The embeddings.\n\n        Returns:\n            The manhattan distance between the embeddings.\n        \"\"\"\n        manhattan_distance = np.abs(embeddings[:, None] - embeddings).sum(-1)\n        # Ignore self-distance\n        np.fill_diagonal(manhattan_distance, np.inf)\n        return np.min(manhattan_distance, axis=1)\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering.process","title":"process(inputs)","text":"

Filter the dataset based on the DEITA score and the cosine distance between the embeddings.

Parameters:

Name Type Description Default inputs StepInput

The input data.

required

Returns:

Type Description StepOutput

The filtered dataset.

Source code in src/distilabel/steps/deita.py
def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Filter the dataset based on the DEITA score and the cosine distance between the\n    embeddings.\n\n    Args:\n        inputs: The input data.\n\n    Returns:\n        The filtered dataset.\n    \"\"\"\n    inputs = self._compute_deita_score(inputs)\n    inputs = self._compute_nearest_neighbor(inputs)\n    inputs.sort(key=lambda x: x[\"deita_score\"], reverse=True)\n\n    selected_rows = []\n    for input in inputs:\n        if len(selected_rows) >= self.data_budget:  # type: ignore\n            break\n        if input[\"nearest_neighbor_distance\"] >= self.diversity_threshold:\n            selected_rows.append(input)\n    yield selected_rows\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._compute_deita_score","title":"_compute_deita_score(inputs)","text":"

Computes the DEITA score for each instruction-response pair. The DEITA score is the product of the instruction score and the response score.

Parameters:

Name Type Description Default inputs StepInput

The input data.

required

Returns:

Type Description StepInput

The input data with the DEITA score computed.

Source code in src/distilabel/steps/deita.py
def _compute_deita_score(self, inputs: StepInput) -> StepInput:\n    \"\"\"Computes the DEITA score for each instruction-response pair. The DEITA score is\n    the product of the instruction score and the response score.\n\n    Args:\n        inputs: The input data.\n\n    Returns:\n        The input data with the DEITA score computed.\n    \"\"\"\n    for input_ in inputs:\n        evol_instruction_score = input_.get(\"evol_instruction_score\")\n        evol_response_score = input_.get(\"evol_response_score\")\n\n        if evol_instruction_score and evol_response_score:\n            deita_score = evol_instruction_score * evol_response_score\n            score_computed_with = [\"evol_instruction_score\", \"evol_response_score\"]\n        elif evol_instruction_score:\n            self._logger.warning(\n                \"Response score is missing for the instruction-response pair. Using\"\n                \" instruction score as DEITA score.\"\n            )\n            deita_score = evol_instruction_score\n            score_computed_with = [\"evol_instruction_score\"]\n        elif evol_response_score:\n            self._logger.warning(\n                \"Instruction score is missing for the instruction-response pair. Using\"\n                \" response score as DEITA score.\"\n            )\n            deita_score = evol_response_score\n            score_computed_with = [\"evol_response_score\"]\n        else:\n            self._logger.warning(\n                \"Instruction and response scores are missing for the instruction-response\"\n                \" pair. Setting DEITA score to 0.\"\n            )\n            deita_score = 0\n            score_computed_with = []\n\n        input_.update(\n            {\n                \"deita_score\": deita_score,\n                \"deita_score_computed_with\": score_computed_with,\n            }\n        )\n    return inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._compute_nearest_neighbor","title":"_compute_nearest_neighbor(inputs)","text":"

Computes the distance (cosine or manhattan, depending on distance_metric) between the embedding of each instruction-response pair and its nearest neighbor.

Parameters:

Name Type Description Default inputs StepInput

The input data.

required

Returns:

Type Description StepInput

The input data with the nearest neighbor distance computed.

Source code in src/distilabel/steps/deita.py
def _compute_nearest_neighbor(self, inputs: StepInput) -> StepInput:\n    \"\"\"Computes the cosine distance between the embeddings of the instruction-response\n    pairs and the nearest neighbor.\n\n    Args:\n        inputs: The input data.\n\n    Returns:\n        The input data with the cosine distance computed.\n    \"\"\"\n    embeddings = np.array([input[\"embedding\"] for input in inputs])\n    if self.normalize_embeddings:\n        embeddings = self._normalize_embeddings(embeddings)\n    self._logger.info(\"\ud83d\udccf Computing nearest neighbor distance...\")\n\n    if self.distance_metric == \"cosine\":\n        self._logger.info(\"\ud83d\udccf Using cosine distance.\")\n        distances = self._cosine_distance(embeddings)\n    else:\n        self._logger.info(\"\ud83d\udccf Using manhattan distance.\")\n        distances = self._manhattan_distance(embeddings)\n\n    for distance, input in zip(distances, inputs):\n        input[\"nearest_neighbor_distance\"] = distance\n    return inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._normalize_embeddings","title":"_normalize_embeddings(embeddings)","text":"

Normalize the embeddings.

Parameters:

Name Type Description Default embeddings ndarray

The embeddings to normalize.

required

Returns:

Type Description ndarray

The normalized embeddings.

Source code in src/distilabel/steps/deita.py
def _normalize_embeddings(self, embeddings: np.ndarray) -> np.ndarray:\n    \"\"\"Normalize the embeddings.\n\n    Args:\n        embeddings: The embeddings to normalize.\n\n    Returns:\n        The normalized embeddings.\n    \"\"\"\n    self._logger.info(\"\u2696\ufe0f Normalizing embeddings...\")\n    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)\n    return embeddings / norms\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._cosine_distance","title":"_cosine_distance(embeddings)","text":"

Computes the cosine distance between the embeddings.

Parameters:

Name Type Description Default embeddings array

The embeddings.

required

Returns:

Type Description array

The cosine distance between the embeddings.

Source code in src/distilabel/steps/deita.py
def _cosine_distance(self, embeddings: np.array) -> np.array:  # type: ignore\n    \"\"\"Computes the cosine distance between the embeddings.\n\n    Args:\n        embeddings: The embeddings.\n\n    Returns:\n        The cosine distance between the embeddings.\n    \"\"\"\n    cosine_similarity = np.dot(embeddings, embeddings.T)\n    cosine_distance = 1 - cosine_similarity\n    # Ignore self-distance\n    np.fill_diagonal(cosine_distance, np.inf)\n    return np.min(cosine_distance, axis=1)\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._manhattan_distance","title":"_manhattan_distance(embeddings)","text":"

Computes the manhattan distance between the embeddings.

Parameters:

Name Type Description Default embeddings array

The embeddings.

required

Returns:

Type Description array

The manhattan distance between the embeddings.

Source code in src/distilabel/steps/deita.py
def _manhattan_distance(self, embeddings: np.array) -> np.array:  # type: ignore\n    \"\"\"Computes the manhattan distance between the embeddings.\n\n    Args:\n        embeddings: The embeddings.\n\n    Returns:\n        The manhattan distance between the embeddings.\n    \"\"\"\n    manhattan_distance = np.abs(embeddings[:, None] - embeddings).sum(-1)\n    # Ignore self-distance\n    np.fill_diagonal(manhattan_distance, np.inf)\n    return np.min(manhattan_distance, axis=1)\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.EmbeddingGeneration","title":"EmbeddingGeneration","text":"

Bases: Step

Generate embeddings using an Embeddings model.

EmbeddingGeneration is a Step that uses an Embeddings model to generate sentence embeddings for the provided input texts.

Attributes:

Name Type Description embeddings Embeddings

the Embeddings model used to generate the sentence embeddings.

Input columns
  • text (str): The text for which the sentence embedding has to be generated.
Output columns
  • embedding (List[Union[float, int]]): the generated sentence embedding.
Categories
  • embedding

Examples:

Generate sentence embeddings with Sentence Transformers:

from distilabel.models import SentenceTransformerEmbeddings\nfrom distilabel.steps import EmbeddingGeneration\n\nembedding_generation = EmbeddingGeneration(\n    embeddings=SentenceTransformerEmbeddings(\n        model=\"mixedbread-ai/mxbai-embed-large-v1\",\n    )\n)\n\nembedding_generation.load()\n\nresult = next(embedding_generation.process([{\"text\": \"Hello, how are you?\"}]))\n# [{'text': 'Hello, how are you?', 'embedding': [0.06209656596183777, -0.015797119587659836, ...]}]\n
Source code in src/distilabel/steps/embeddings/embedding_generation.py
class EmbeddingGeneration(Step):\n    \"\"\"Generate embeddings using an `Embeddings` model.\n\n    `EmbeddingGeneration` is a `Step` that using an `Embeddings` model generates sentence\n    embeddings for the provided input texts.\n\n    Attributes:\n        embeddings: the `Embeddings` model used to generate the sentence embeddings.\n\n    Input columns:\n        - text (`str`): The text for which the sentence embedding has to be generated.\n\n    Output columns:\n        - embedding (`List[Union[float, int]]`): the generated sentence embedding.\n\n    Categories:\n        - embedding\n\n    Examples:\n        Generate sentence embeddings with Sentence Transformers:\n\n        ```python\n        from distilabel.models import SentenceTransformerEmbeddings\n        from distilabel.steps import EmbeddingGeneration\n\n        embedding_generation = EmbeddingGeneration(\n            embeddings=SentenceTransformerEmbeddings(\n                model=\"mixedbread-ai/mxbai-embed-large-v1\",\n            )\n        )\n\n        embedding_generation.load()\n\n        result = next(embedding_generation.process([{\"text\": \"Hello, how are you?\"}]))\n        # [{'text': 'Hello, how are you?', 'embedding': [0.06209656596183777, -0.015797119587659836, ...]}]\n        ```\n\n    \"\"\"\n\n    embeddings: Embeddings\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        return [\"text\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        return [\"embedding\", \"model_name\"]\n\n    def load(self) -> None:\n        \"\"\"Loads the `Embeddings` model.\"\"\"\n        super().load()\n\n        self.embeddings.load()\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        embeddings = self.embeddings.encode(inputs=[input[\"text\"] for input in inputs])\n        for input, embedding in zip(inputs, embeddings):\n            input[\"embedding\"] = embedding\n            input[\"model_name\"] = self.embeddings.model_name\n    
    yield inputs\n\n    def unload(self) -> None:\n        super().unload()\n        self.embeddings.unload()\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.EmbeddingGeneration.load","title":"load()","text":"

Loads the Embeddings model.

Source code in src/distilabel/steps/embeddings/embedding_generation.py
def load(self) -> None:\n    \"\"\"Loads the `Embeddings` model.\"\"\"\n    super().load()\n\n    self.embeddings.load()\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FaissNearestNeighbour","title":"FaissNearestNeighbour","text":"

Bases: GlobalStep

Create a faiss index to get the nearest neighbours.

FaissNearestNeighbour is a GlobalStep that creates a faiss index using the Hugging Face datasets library integration, and then gets the nearest neighbours and the scores or distance of the nearest neighbours for each input row.

Attributes:

Name Type Description device Optional[RuntimeParameter[Union[int, List[int]]]]

the CUDA device ID or a list of IDs to be used. If a negative integer, it will use all the available GPUs. Defaults to None.

string_factory Optional[RuntimeParameter[str]]

the name of the factory to be used to build the faiss index. Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes. Defaults to None.

metric_type Optional[RuntimeParameter[int]]

the metric to be used to measure the distance between the points. It's an integer, and the recommended way to pass it is to import faiss and then pass one of the faiss.METRIC_x variables. Defaults to None.

k Optional[RuntimeParameter[int]]

the number of nearest neighbours to search for each input row. Defaults to 1.

search_batch_size Optional[RuntimeParameter[int]]

the number of rows to include in a search batch. The value can be adjusted to maximize the resources usage or to avoid OOM issues. Defaults to 50.

train_size Optional[RuntimeParameter[int]]

If the index needs a training step, specifies how many vectors will be used to train the index.

Runtime parameters
  • device: the CUDA device ID or a list of IDs to be used. If a negative integer, it will use all the available GPUs. Defaults to None.
  • string_factory: the name of the factory to be used to build the faiss index. Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes. Defaults to None.
  • metric_type: the metric to be used to measure the distance between the points. It's an integer, and the recommended way to pass it is to import faiss and then pass one of the faiss.METRIC_x variables. Defaults to None.
  • k: the number of nearest neighbours to search for each input row. Defaults to 1.
  • search_batch_size: the number of rows to include in a search batch. The value can be adjusted to maximize the resources usage or to avoid OOM issues. Defaults to 50.
  • train_size: If the index needs a training step, specifies how many vectors will be used to train the index.
Input columns
  • embedding (List[Union[float, int]]): a sentence embedding.
Output columns
  • nn_indices (List[int]): a list containing the indices of the k nearest neighbours in the inputs for the row.
  • nn_scores (List[float]): a list containing the score or distance to each k nearest neighbour in the inputs.
Categories
  • embedding
References
  • The Faiss library

Examples:

Generating embeddings and getting the nearest neighbours:

from distilabel.models import SentenceTransformerEmbeddings\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import EmbeddingGeneration, FaissNearestNeighbour, LoadDataFromHub\n\nwith Pipeline(name=\"hello\") as pipeline:\n    load_data = LoadDataFromHub(output_mappings={\"prompt\": \"text\"})\n\n    embeddings = EmbeddingGeneration(\n        embeddings=SentenceTransformerEmbeddings(\n            model=\"mixedbread-ai/mxbai-embed-large-v1\"\n        )\n    )\n\n    nearest_neighbours = FaissNearestNeighbour()\n\n    load_data >> embeddings >> nearest_neighbours\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        parameters={\n            load_data.name: {\n                \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n                \"split\": \"test\",\n            },\n        },\n        use_cache=False,\n    )\n
Citations
@misc{douze2024faisslibrary,\n    title={The Faiss library},\n    author={Matthijs Douze and Alexandr Guzhva and Chengqi Deng and Jeff Johnson and Gergely Szilvasy and Pierre-Emmanuel Mazar\u00e9 and Maria Lomeli and Lucas Hosseini and Herv\u00e9 J\u00e9gou},\n    year={2024},\n    eprint={2401.08281},\n    archivePrefix={arXiv},\n    primaryClass={cs.LG},\n    url={https://arxiv.org/abs/2401.08281},\n}\n
Source code in src/distilabel/steps/embeddings/nearest_neighbour.py
class FaissNearestNeighbour(GlobalStep):\n    \"\"\"Create a `faiss` index to get the nearest neighbours.\n\n    `FaissNearestNeighbour` is a `GlobalStep` that creates a `faiss` index using the Hugging\n    Face `datasets` library integration, and then gets the nearest neighbours and the scores\n    or distance of the nearest neighbours for each input row.\n\n    Attributes:\n        device: the CUDA device ID or a list of IDs to be used. If negative integer, it\n            will use all the available GPUs. Defaults to `None`.\n        string_factory: the name of the factory to be used to build the `faiss` index.\n            Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes.\n            Defaults to `None`.\n        metric_type: the metric to be used to measure the distance between the points. It's\n            an integer and the recommend way to pass it is importing `faiss` and then passing\n            one of `faiss.METRIC_x` variables. Defaults to `None`.\n        k: the number of nearest neighbours to search for each input row. Defaults to `1`.\n        search_batch_size: the number of rows to include in a search batch. The value can\n            be adjusted to maximize the resources usage or to avoid OOM issues. Defaults\n            to `50`.\n        train_size: If the index needs a training step, specifies how many vectors will be\n            used to train the index.\n\n    Runtime parameters:\n        - `device`: the CUDA device ID or a list of IDs to be used. If negative integer,\n            it will use all the available GPUs. 
Defaults to `None`.\n        - `string_factory`: the name of the factory to be used to build the `faiss` index.\n            Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes.\n            Defaults to `None`.\n        - `metric_type`: the metric to be used to measure the distance between the points.\n            It's an integer and the recommend way to pass it is importing `faiss` and then\n            passing one of `faiss.METRIC_x` variables. Defaults to `None`.\n        - `k`: the number of nearest neighbours to search for each input row. Defaults to `1`.\n        - `search_batch_size`: the number of rows to include in a search batch. The value\n            can be adjusted to maximize the resources usage or to avoid OOM issues. Defaults\n            to `50`.\n        - `train_size`: If the index needs a training step, specifies how many vectors will\n            be used to train the index.\n\n    Input columns:\n        - embedding (`List[Union[float, int]]`): a sentence embedding.\n\n    Output columns:\n        - nn_indices (`List[int]`): a list containing the indices of the `k` nearest neighbours\n            in the inputs for the row.\n        - nn_scores (`List[float]`): a list containing the score or distance to each `k`\n            nearest neighbour in the inputs.\n\n    Categories:\n        - embedding\n\n    References:\n        - [`The Faiss library`](https://arxiv.org/abs/2401.08281)\n\n    Examples:\n        Generating embeddings and getting the nearest neighbours:\n\n        ```python\n        from distilabel.models import SentenceTransformerEmbeddings\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps import EmbeddingGeneration, FaissNearestNeighbour, LoadDataFromHub\n\n        with Pipeline(name=\"hello\") as pipeline:\n            load_data = LoadDataFromHub(output_mappings={\"prompt\": \"text\"})\n\n            embeddings = EmbeddingGeneration(\n            
    embeddings=SentenceTransformerEmbeddings(\n                    model=\"mixedbread-ai/mxbai-embed-large-v1\"\n                )\n            )\n\n            nearest_neighbours = FaissNearestNeighbour()\n\n            load_data >> embeddings >> nearest_neighbours\n\n        if __name__ == \"__main__\":\n            distiset = pipeline.run(\n                parameters={\n                    load_data.name: {\n                        \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n                        \"split\": \"test\",\n                    },\n                },\n                use_cache=False,\n            )\n        ```\n\n    Citations:\n        ```\n        @misc{douze2024faisslibrary,\n            title={The Faiss library},\n            author={Matthijs Douze and Alexandr Guzhva and Chengqi Deng and Jeff Johnson and Gergely Szilvasy and Pierre-Emmanuel Mazar\u00e9 and Maria Lomeli and Lucas Hosseini and Herv\u00e9 J\u00e9gou},\n            year={2024},\n            eprint={2401.08281},\n            archivePrefix={arXiv},\n            primaryClass={cs.LG},\n            url={https://arxiv.org/abs/2401.08281},\n        }\n        ```\n    \"\"\"\n\n    device: Optional[RuntimeParameter[Union[int, List[int]]]] = Field(\n        default=None,\n        description=\"The CUDA device ID or a list of IDs to be used. If negative integer,\"\n        \" it will use all the available GPUs.\",\n    )\n    string_factory: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The name of the factory to be used to build the `faiss` index.\"\n        \"Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes.\",\n    )\n    metric_type: Optional[RuntimeParameter[int]] = Field(\n        default=None,\n        description=\"The metric to be used to measure the distance between the points. 
It's\"\n        \" an integer and the recommend way to pass it is importing `faiss` and thenpassing\"\n        \" one of `faiss.METRIC_x` variables.\",\n    )\n    k: Optional[RuntimeParameter[int]] = Field(\n        default=1,\n        description=\"The number of nearest neighbours to search for each input row.\",\n    )\n    search_batch_size: Optional[RuntimeParameter[int]] = Field(\n        default=50,\n        description=\"The number of rows to include in a search batch. The value can be adjusted\"\n        \" to maximize the resources usage or to avoid OOM issues.\",\n    )\n    train_size: Optional[RuntimeParameter[int]] = Field(\n        default=None,\n        description=\"If the index needs a training step, specifies how many vectors will be used to train the index.\",\n    )\n\n    def load(self) -> None:\n        super().load()\n\n        if importlib.util.find_spec(\"faiss\") is None:\n            raise ImportError(\n                \"`faiss` package is not installed. Please install it using `pip install\"\n                \" faiss-cpu` or `pip install faiss-gpu`.\"\n            )\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"embedding\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"nn_indices\", \"nn_scores\"]\n\n    def _build_index(self, inputs: List[Dict[str, Any]]) -> Dataset:\n        \"\"\"Builds a `faiss` index using `datasets` integration.\n\n        Args:\n            inputs: a list of dictionaries.\n\n        Returns:\n            The build `datasets.Dataset` with its `faiss` index.\n        \"\"\"\n        dataset = Dataset.from_list(inputs)\n        if self.train_size is not None and self.string_factory:\n            self._logger.info(\"\ud83c\udfcb\ufe0f\u200d\u2640\ufe0f Starting Faiss index training...\")\n        dataset.add_faiss_index(\n            column=\"embedding\",\n            device=self.device,  # type: ignore\n            string_factory=self.string_factory,\n            
metric_type=self.metric_type,\n            train_size=self.train_size,\n        )\n        return dataset\n\n    def _save_index(self, dataset: Dataset) -> None:\n        \"\"\"Save the generated Faiss index as an artifact of the step.\n\n        Args:\n            dataset: the dataset with the `faiss` index built.\n        \"\"\"\n        self.save_artifact(\n            name=\"faiss_index\",\n            write_function=lambda path: dataset.save_faiss_index(\n                index_name=\"embedding\", file=path / \"index.faiss\"\n            ),\n            metadata={\n                \"num_rows\": len(dataset),\n                \"embedding_dim\": len(dataset[0][\"embedding\"]),\n            },\n        )\n\n    def _search(self, dataset: Dataset) -> Dataset:\n        \"\"\"Search the top `k` nearest neighbours for each row in the dataset.\n\n        Args:\n            dataset: the dataset with the `faiss` index built.\n\n        Returns:\n            The updated dataset containing the top `k` nearest neighbours for each row,\n            as well as the score or distance.\n        \"\"\"\n\n        def add_search_results(examples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:\n            queries = np.array(examples[\"embedding\"])\n            results = dataset.search_batch(\n                index_name=\"embedding\",\n                queries=queries,\n                k=self.k + 1,  # type: ignore\n            )\n            examples[\"nn_indices\"] = [indices[1:] for indices in results.total_indices]\n            examples[\"nn_scores\"] = [scores[1:] for scores in results.total_scores]\n            return examples\n\n        return dataset.map(\n            add_search_results, batched=True, batch_size=self.search_batch_size\n        )\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        dataset = self._build_index(inputs)\n        dataset_with_search_results = self._search(dataset)\n        self._save_index(dataset)\n        
yield dataset_with_search_results.to_list()\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FaissNearestNeighbour._build_index","title":"_build_index(inputs)","text":"

Builds a faiss index using datasets integration.

Parameters:

Name Type Description Default inputs List[Dict[str, Any]]

a list of dictionaries.

required

Returns:

Type Description Dataset

The built datasets.Dataset with its faiss index.

Source code in src/distilabel/steps/embeddings/nearest_neighbour.py
def _build_index(self, inputs: List[Dict[str, Any]]) -> Dataset:\n    \"\"\"Builds a `faiss` index using `datasets` integration.\n\n    Args:\n        inputs: a list of dictionaries.\n\n    Returns:\n        The build `datasets.Dataset` with its `faiss` index.\n    \"\"\"\n    dataset = Dataset.from_list(inputs)\n    if self.train_size is not None and self.string_factory:\n        self._logger.info(\"\ud83c\udfcb\ufe0f\u200d\u2640\ufe0f Starting Faiss index training...\")\n    dataset.add_faiss_index(\n        column=\"embedding\",\n        device=self.device,  # type: ignore\n        string_factory=self.string_factory,\n        metric_type=self.metric_type,\n        train_size=self.train_size,\n    )\n    return dataset\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FaissNearestNeighbour._save_index","title":"_save_index(dataset)","text":"

Save the generated Faiss index as an artifact of the step.

Parameters:

Name Type Description Default dataset Dataset

the dataset with the faiss index built.

required Source code in src/distilabel/steps/embeddings/nearest_neighbour.py
def _save_index(self, dataset: Dataset) -> None:\n    \"\"\"Save the generated Faiss index as an artifact of the step.\n\n    Args:\n        dataset: the dataset with the `faiss` index built.\n    \"\"\"\n    self.save_artifact(\n        name=\"faiss_index\",\n        write_function=lambda path: dataset.save_faiss_index(\n            index_name=\"embedding\", file=path / \"index.faiss\"\n        ),\n        metadata={\n            \"num_rows\": len(dataset),\n            \"embedding_dim\": len(dataset[0][\"embedding\"]),\n        },\n    )\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FaissNearestNeighbour._search","title":"_search(dataset)","text":"

Search the top k nearest neighbours for each row in the dataset.

Parameters:

Name Type Description Default dataset Dataset

the dataset with the faiss index built.

required

Returns:

Type Description Dataset

The updated dataset containing the top k nearest neighbours for each row,

Dataset

as well as the score or distance.

Source code in src/distilabel/steps/embeddings/nearest_neighbour.py
def _search(self, dataset: Dataset) -> Dataset:\n    \"\"\"Search the top `k` nearest neighbours for each row in the dataset.\n\n    Args:\n        dataset: the dataset with the `faiss` index built.\n\n    Returns:\n        The updated dataset containing the top `k` nearest neighbours for each row,\n        as well as the score or distance.\n    \"\"\"\n\n    def add_search_results(examples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:\n        queries = np.array(examples[\"embedding\"])\n        results = dataset.search_batch(\n            index_name=\"embedding\",\n            queries=queries,\n            k=self.k + 1,  # type: ignore\n        )\n        examples[\"nn_indices\"] = [indices[1:] for indices in results.total_indices]\n        examples[\"nn_scores\"] = [scores[1:] for scores in results.total_scores]\n        return examples\n\n    return dataset.map(\n        add_search_results, batched=True, batch_size=self.search_batch_size\n    )\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.EmbeddingDedup","title":"EmbeddingDedup","text":"

Bases: GlobalStep

Deduplicates text using embeddings.

EmbeddingDedup is a Step that detects near-duplicates in datasets, using embeddings to compare the similarity between the texts. The typical workflow with this step would include having a dataset with embeddings precomputed, and then (possibly using the FaissNearestNeighbour) using the nn_indices and nn_scores, determine the texts that are duplicate.

Attributes:

Name Type Description threshold Optional[RuntimeParameter[float]]

the threshold to consider 2 examples as duplicates. It's dependent on the type of index that was used to generate the embeddings. For example, if the embeddings were generated using cosine similarity, a threshold of 0.9 would make all the texts with a cosine similarity above the value duplicates. Higher values detect fewer duplicates in such an index, but that should be taken into account when building it. Defaults to 0.9.

Runtime Parameters
  • threshold: the threshold to consider 2 examples as duplicates.
Input columns
  • nn_indices (List[int]): a list containing the indices of the k nearest neighbours in the inputs for the row.
  • nn_scores (List[float]): a list containing the score or distance to each k nearest neighbour in the inputs.
Output columns
  • keep_row_after_embedding_filtering (bool): boolean indicating if the piece text is not a duplicate i.e. this text should be kept.
Categories
  • filtering

Examples:

Deduplicate a list of texts using embedding information:\n\n```python\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import EmbeddingDedup\nfrom distilabel.steps import LoadDataFromDicts\n\nwith Pipeline() as pipeline:\n    data = LoadDataFromDicts(\n        data=[\n            {\n                \"persona\": \"A chemistry student or academic researcher interested in inorganic or physical chemistry, likely at an advanced undergraduate or graduate level, studying acid-base interactions and chemical bonding.\",\n                \"embedding\": [\n                    0.018477669046149742,\n                    -0.03748236608841726,\n                    0.001919870620352492,\n                    0.024918478063770535,\n                    0.02348063521315178,\n                    0.0038251285566308375,\n                    -0.01723884983037716,\n                    0.02881971942372201,\n                ],\n                \"nn_indices\": [0, 1],\n                \"nn_scores\": [\n                    0.9164746999740601,\n                    0.782106876373291,\n                ],\n            },\n            {\n                \"persona\": \"A music teacher or instructor focused on theoretical and practical piano lessons.\",\n                \"embedding\": [\n                    -0.0023464179614082125,\n                    -0.07325472251663565,\n                    -0.06058678419516501,\n                    -0.02100326928586996,\n                    -0.013462744792362657,\n                    0.027368447064244242,\n                    -0.003916070100455717,\n                    0.01243614518480423,\n                ],\n                \"nn_indices\": [0, 2],\n                \"nn_scores\": [\n                    0.7552462220191956,\n                    0.7261884808540344,\n                ],\n            },\n            {\n                \"persona\": \"A classical guitar teacher or instructor, likely with experience teaching beginners, who focuses on 
breaking down complex music notation into understandable steps for their students.\",\n                \"embedding\": [\n                    -0.01630817942328242,\n                    -0.023760151552345232,\n                    -0.014249650090627883,\n                    -0.005713686451446624,\n                    -0.016033059279131567,\n                    0.0071440908501058786,\n                    -0.05691099643425161,\n                    0.01597412704817784,\n                ],\n                \"nn_indices\": [1, 2],\n                \"nn_scores\": [\n                    0.8107735514640808,\n                    0.7172299027442932,\n                ],\n            },\n        ],\n        batch_size=batch_size,\n    )\n    # In general you should do something like this before the deduplication step, to obtain the\n    # `nn_indices` and `nn_scores`. In this case the embeddings are already normalized, so there's\n    # no need for it.\n    # nn = FaissNearestNeighbour(\n    #     k=30,\n    #     metric_type=faiss.METRIC_INNER_PRODUCT,\n    #     search_batch_size=50,\n    #     train_size=len(dataset),              # The number of embeddings to use for training\n    #     string_factory=\"IVF300_HNSW32,Flat\"   # To use an index (optional, maybe required for big datasets)\n    # )\n    # Read more about the `string_factory` here:\n    # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index\n\n    embedding_dedup = EmbeddingDedup(\n        threshold=0.8,\n        input_batch_size=batch_size,\n    )\n\n    data >> embedding_dedup\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(use_cache=False)\n    ds = distiset[\"default\"][\"train\"]\n    # Filter out the duplicates\n    ds_dedup = ds.filter(lambda x: x[\"keep_row_after_embedding_filtering\"])\n```\n
Source code in src/distilabel/steps/filtering/embedding.py
class EmbeddingDedup(GlobalStep):\n    \"\"\"Deduplicates text using embeddings.\n\n    `EmbeddingDedup` is a Step that detects near-duplicates in datasets, using\n    embeddings to compare the similarity between the texts. The typical workflow with this step\n    would include having a dataset with embeddings precomputed, and then (possibly using the\n    `FaissNearestNeighbour`) using the `nn_indices` and `nn_scores`, determine the texts that\n    are duplicate.\n\n    Attributes:\n        threshold: the threshold to consider 2 examples as duplicates.\n            It's dependent on the type of index that was used to generate the embeddings.\n            For example, if the embeddings were generated using cosine similarity, a threshold\n            of `0.9` would make all the texts with a cosine similarity above the value\n            duplicates. Higher values detect less duplicates in such an index, but that should\n            be taken into account when building it. Defaults to `0.9`.\n\n    Runtime Parameters:\n        - `threshold`: the threshold to consider 2 examples as duplicates.\n\n    Input columns:\n        - nn_indices (`List[int]`): a list containing the indices of the `k` nearest neighbours\n            in the inputs for the row.\n        - nn_scores (`List[float]`): a list containing the score or distance to each `k`\n            nearest neighbour in the inputs.\n\n    Output columns:\n        - keep_row_after_embedding_filtering (`bool`): boolean indicating if the piece `text` is\n            not a duplicate i.e. 
this text should be kept.\n\n    Categories:\n        - filtering\n\n    Examples:\n\n        Deduplicate a list of texts using embedding information:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps import EmbeddingDedup\n        from distilabel.steps import LoadDataFromDicts\n\n        with Pipeline() as pipeline:\n            data = LoadDataFromDicts(\n                data=[\n                    {\n                        \"persona\": \"A chemistry student or academic researcher interested in inorganic or physical chemistry, likely at an advanced undergraduate or graduate level, studying acid-base interactions and chemical bonding.\",\n                        \"embedding\": [\n                            0.018477669046149742,\n                            -0.03748236608841726,\n                            0.001919870620352492,\n                            0.024918478063770535,\n                            0.02348063521315178,\n                            0.0038251285566308375,\n                            -0.01723884983037716,\n                            0.02881971942372201,\n                        ],\n                        \"nn_indices\": [0, 1],\n                        \"nn_scores\": [\n                            0.9164746999740601,\n                            0.782106876373291,\n                        ],\n                    },\n                    {\n                        \"persona\": \"A music teacher or instructor focused on theoretical and practical piano lessons.\",\n                        \"embedding\": [\n                            -0.0023464179614082125,\n                            -0.07325472251663565,\n                            -0.06058678419516501,\n                            -0.02100326928586996,\n                            -0.013462744792362657,\n                            0.027368447064244242,\n                            -0.003916070100455717,\n                            
0.01243614518480423,\n                        ],\n                        \"nn_indices\": [0, 2],\n                        \"nn_scores\": [\n                            0.7552462220191956,\n                            0.7261884808540344,\n                        ],\n                    },\n                    {\n                        \"persona\": \"A classical guitar teacher or instructor, likely with experience teaching beginners, who focuses on breaking down complex music notation into understandable steps for their students.\",\n                        \"embedding\": [\n                            -0.01630817942328242,\n                            -0.023760151552345232,\n                            -0.014249650090627883,\n                            -0.005713686451446624,\n                            -0.016033059279131567,\n                            0.0071440908501058786,\n                            -0.05691099643425161,\n                            0.01597412704817784,\n                        ],\n                        \"nn_indices\": [1, 2],\n                        \"nn_scores\": [\n                            0.8107735514640808,\n                            0.7172299027442932,\n                        ],\n                    },\n                ],\n                batch_size=batch_size,\n            )\n            # In general you should do something like this before the deduplication step, to obtain the\n            # `nn_indices` and `nn_scores`. 
In this case the embeddings are already normalized, so there's\n            # no need for it.\n            # nn = FaissNearestNeighbour(\n            #     k=30,\n            #     metric_type=faiss.METRIC_INNER_PRODUCT,\n            #     search_batch_size=50,\n            #     train_size=len(dataset),              # The number of embeddings to use for training\n            #     string_factory=\"IVF300_HNSW32,Flat\"   # To use an index (optional, maybe required for big datasets)\n            # )\n            # Read more about the `string_factory` here:\n            # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index\n\n            embedding_dedup = EmbeddingDedup(\n                threshold=0.8,\n                input_batch_size=batch_size,\n            )\n\n            data >> embedding_dedup\n\n        if __name__ == \"__main__\":\n            distiset = pipeline.run(use_cache=False)\n            ds = distiset[\"default\"][\"train\"]\n            # Filter out the duplicates\n            ds_dedup = ds.filter(lambda x: x[\"keep_row_after_embedding_filtering\"])\n        ```\n    \"\"\"\n\n    threshold: Optional[RuntimeParameter[float]] = Field(\n        default=0.9,\n        description=\"The threshold to consider 2 examples as duplicates. It's dependent \"\n        \"on the type of index that was used to generate the embeddings. For example, if \"\n        \"the embeddings were generated using cosine similarity, a threshold of `0.9` \"\n        \"would make all the texts with a cosine similarity above the value duplicates. 
\"\n        \"Higher values detect less duplicates in such an index, but that should be \"\n        \"taken into account when building it.\",\n    )\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"nn_scores\", \"nn_indices\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"keep_row_after_embedding_filtering\"]\n\n    @override\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        rows_to_remove = set()\n\n        for input in track(inputs, description=\"Running Embedding deduplication...\"):\n            input[\"keep_row_after_embedding_filtering\"] = True\n            indices_scores = np.array(input[\"nn_scores\"]) > self.threshold\n            indices = np.array(input[\"nn_indices\"])[indices_scores]\n            if len(indices) > 0:  # If there are any rows found over the threshold\n                rows_to_remove.update(list(indices))\n\n        # Remove duplicates and get the list of rows to remove\n        for idx in rows_to_remove:\n            inputs[idx][\"keep_row_after_embedding_filtering\"] = False\n\n        yield inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.MinHashDedup","title":"MinHashDedup","text":"

Bases: Step

Deduplicates text using MinHash and MinHashLSH.

MinHashDedup is a Step that detects near-duplicates in datasets. The idea roughly translates to the following steps: 1. Tokenize the text into words or ngrams. 2. Create a MinHash for each text. 3. Store the MinHashes in a MinHashLSH. 4. Check if the MinHash is already in the LSH, if so, it is a duplicate.

Attributes:

Name Type Description num_perm int

the number of permutations to use. Defaults to 128.

seed int

the seed to use for the MinHash. This seed must be the same as the one used for MinHash; keep this in mind when both steps are created. Defaults to 1.

tokenizer Literal['words', 'ngrams']

the tokenizer to use. Available ones are words or ngrams. If words is selected, it tokenizes the text into words using nltk's word tokenizer. ngrams splits the text into ngrams of size n. Defaults to words.

n Optional[int]

the size of the ngrams to use. Only relevant if tokenizer=\"ngrams\". Defaults to 5.

threshold float

the threshold to consider two MinHashes as duplicates. Values closer to 0 detect more duplicates. Defaults to 0.9.

storage Literal['dict', 'disk']

the storage to use for the LSH. Can be dict to store the index in memory, or disk. Keep in mind, disk is an experimental feature not defined in datasketch, that is based on DiskCache's Index class. It should work as a dict, but backed by disk, and depending on the system it can be slower. Note that disk is an experimental feature that may cause issues. Defaults to dict.

Input columns
  • text (str): the texts to be filtered.
Output columns
  • keep_row_after_minhash_filtering (bool): boolean indicating if the piece text is not a duplicate i.e. this text should be kept.
Categories
  • filtering
References
  • datasketch documentation
  • Identifying and Filtering Near-Duplicate Documents
  • Diskcache's Index

Examples:

Deduplicate a list of texts using MinHash and MinHashLSH:\n\n```python\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import MinHashDedup\nfrom distilabel.steps import LoadDataFromDicts\n\nwith Pipeline() as pipeline:\n    ds_size = 1000\n    batch_size = 500  # Bigger batch sizes work better for this step\n    data = LoadDataFromDicts(\n        data=[\n            {\"text\": \"This is a test document.\"},\n            {\"text\": \"This document is a test.\"},\n            {\"text\": \"Test document for duplication.\"},\n            {\"text\": \"Document for duplication test.\"},\n            {\"text\": \"This is another unique document.\"},\n        ]\n        * (ds_size // 5),\n        batch_size=batch_size,\n    )\n    minhash_dedup = MinHashDedup(\n        tokenizer=\"words\",\n        threshold=0.9,      # lower values will increase the number of duplicates\n        storage=\"dict\",     # or \"disk\" for bigger datasets\n    )\n\n    data >> minhash_dedup\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(use_cache=False)\n    ds = distiset[\"default\"][\"train\"]\n    # Filter out the duplicates\n    ds_dedup = ds.filter(lambda x: x[\"keep_row_after_minhash_filtering\"])\n```\n
Source code in src/distilabel/steps/filtering/minhash.py
class MinHashDedup(Step):\n    \"\"\"Deduplicates text using `MinHash` and `MinHashLSH`.\n\n    `MinHashDedup` is a Step that detects near-duplicates in datasets. The idea roughly translates\n    to the following steps:\n    1. Tokenize the text into words or ngrams.\n    2. Create a `MinHash` for each text.\n    3. Store the `MinHashes` in a `MinHashLSH`.\n    4. Check if the `MinHash` is already in the `LSH`, if so, it is a duplicate.\n\n    Attributes:\n        num_perm: the number of permutations to use. Defaults to `128`.\n        seed: the seed to use for the MinHash. This seed must be the same\n            used for `MinHash`, keep in mind when both steps are created. Defaults to `1`.\n        tokenizer: the tokenizer to use. Available ones are `words` or `ngrams`.\n            If `words` is selected, it tokenize the text into words using nltk's\n            word tokenizer. `ngram` estimates the ngrams (together with the size\n            `n`) using. Defaults to `words`.\n        n: the size of the ngrams to use. Only relevant if `tokenizer=\"ngrams\"`. Defaults to `5`.\n        threshold: the threshold to consider two MinHashes as duplicates.\n            Values closer to 0 detect more duplicates. Defaults to `0.9`.\n        storage: the storage to use for the LSH. Can be `dict` to store the index\n            in memory, or `disk`. Keep in mind, `disk` is an experimental feature\n            not defined in `datasketch`, that is based on DiskCache's `Index` class.\n            It should work as a `dict`, but backed by disk, but depending on the system\n            it can be slower. Defaults to `dict`.\n            which uses a custom `shelve` backend. Note the `disk`\n            is an experimetal feature that may cause issues. 
Defaults to `dict`.\n\n    Input columns:\n        - text (`str`): the texts to be filtered.\n\n    Output columns:\n        - keep_row_after_minhash_filtering (`bool`): boolean indicating if the piece `text` is\n            not a duplicate i.e. this text should be kept.\n\n    Categories:\n        - filtering\n\n    References:\n        - [`datasketch documentation`](https://ekzhu.github.io/datasketch/lsh.html)\n        - [Identifying and Filtering Near-Duplicate Documents](https://cs.brown.edu/courses/cs253/papers/nearduplicate.pdf)\n        - [Diskcache's Index](https://grantjenks.com/docs/diskcache/api.html#diskcache.Index)\n\n    Examples:\n\n        Deduplicate a list of texts using MinHash and MinHashLSH:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps import MinHashDedup\n        from distilabel.steps import LoadDataFromDicts\n\n        with Pipeline() as pipeline:\n            ds_size = 1000\n            batch_size = 500  # Bigger batch sizes work better for this step\n            data = LoadDataFromDicts(\n                data=[\n                    {\"text\": \"This is a test document.\"},\n                    {\"text\": \"This document is a test.\"},\n                    {\"text\": \"Test document for duplication.\"},\n                    {\"text\": \"Document for duplication test.\"},\n                    {\"text\": \"This is another unique document.\"},\n                ]\n                * (ds_size // 5),\n                batch_size=batch_size,\n            )\n            minhash_dedup = MinHashDedup(\n                tokenizer=\"words\",\n                threshold=0.9,      # lower values will increase the number of duplicates\n                storage=\"dict\",     # or \"disk\" for bigger datasets\n            )\n\n            data >> minhash_dedup\n\n        if __name__ == \"__main__\":\n            distiset = pipeline.run(use_cache=False)\n            ds = distiset[\"default\"][\"train\"]\n    
        # Filter out the duplicates\n            ds_dedup = ds.filter(lambda x: x[\"keep_row_after_minhash_filtering\"])\n        ```\n    \"\"\"\n\n    num_perm: int = 128\n    seed: int = 1\n    tokenizer: Literal[\"words\", \"ngrams\"] = \"words\"\n    n: Optional[int] = 5\n    threshold: float = 0.9\n    storage: Literal[\"dict\", \"disk\"] = \"dict\"\n\n    _hasher: Union[\"MinHash\", None] = PrivateAttr(None)\n    _tokenizer: Union[Callable, None] = PrivateAttr(None)\n    _lhs: Union[\"MinHashLSH\", None] = PrivateAttr(None)\n\n    def load(self) -> None:\n        super().load()\n        if not importlib.import_module(\"datasketch\"):\n            raise ImportError(\n                \"`datasketch` is needed to deduplicate with MinHash, but is not installed. \"\n                \"Please install it using `pip install datasketch`.\"\n            )\n        from datasketch import MinHash\n\n        from distilabel.steps.filtering._datasketch import MinHashLSH\n\n        self._hasher = MinHash.bulk\n        self._lsh = MinHashLSH(\n            num_perm=self.num_perm,\n            threshold=self.threshold,\n            storage_config={\"type\": self.storage},\n        )\n\n        if self.tokenizer == \"words\":\n            if not importlib.import_module(\"nltk\"):\n                raise ImportError(\n                    \"`nltk` is needed to tokenize based on words, but is not installed. \"\n                    \"Please install it using `pip install nltk`. 
Then run `nltk.download('punkt_tab')`.\"\n                )\n            self._tokenizer = tokenized_on_words\n        else:\n            self._tokenizer = partial(tokenize_on_ngrams, n=self.n)\n\n    def unload(self) -> None:\n        super().unload()\n        # In case of LSH being stored in disk, we need to close the file.\n        if self.storage == \"disk\":\n            self._lsh.close()\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"text\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"keep_row_after_minhash_filtering\"]\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":\n        tokenized_texts = []\n        for input in inputs:\n            tokenized_texts.append(self._tokenizer([input[self.inputs[0]]])[0])\n\n        minhashes = self._hasher(\n            tokenized_texts, num_perm=self.num_perm, seed=self.seed\n        )\n\n        for input, minhash in zip(inputs, minhashes):\n            # Check if the text is already in the LSH index\n            if self._lsh.query(minhash):\n                input[\"keep_row_after_minhash_filtering\"] = False\n            else:\n                self._lsh.insert(str(uuid.uuid4()), minhash)\n                input[\"keep_row_after_minhash_filtering\"] = True\n\n        yield inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.ConversationTemplate","title":"ConversationTemplate","text":"

Bases: Step

Generate a conversation template from an instruction and a response.

Input columns
  • instruction (str): The instruction to be used in the conversation.
  • response (str): The response to be used in the conversation.
Output columns
  • conversation (ChatType): The conversation template.
Categories
  • format
  • chat
  • template

Examples:

Create a conversation from an instruction and a response:

from distilabel.steps import ConversationTemplate\n\nconv_template = ConversationTemplate()\nconv_template.load()\n\nresult = next(\n    conv_template.process(\n        [\n            {\n                \"instruction\": \"Hello\",\n                \"response\": \"Hi\",\n            }\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'Hello', 'response': 'Hi', 'conversation': [{'role': 'user', 'content': 'Hello'}, {'role': 'assistant', 'content': 'Hi'}]}]\n
Source code in src/distilabel/steps/formatting/conversation.py
class ConversationTemplate(Step):\n    \"\"\"Generate a conversation template from an instruction and a response.\n\n    Input columns:\n        - instruction (`str`): The instruction to be used in the conversation.\n        - response (`str`): The response to be used in the conversation.\n\n    Output columns:\n        - conversation (`ChatType`): The conversation template.\n\n    Categories:\n        - format\n        - chat\n        - template\n\n    Examples:\n        Create a conversation from an instruction and a response:\n\n        ```python\n        from distilabel.steps import ConversationTemplate\n\n        conv_template = ConversationTemplate()\n        conv_template.load()\n\n        result = next(\n            conv_template.process(\n                [\n                    {\n                        \"instruction\": \"Hello\",\n                        \"response\": \"Hi\",\n                    }\n                ],\n            )\n        )\n        # >>> result\n        # [{'instruction': 'Hello', 'response': 'Hi', 'conversation': [{'role': 'user', 'content': 'Hello'}, {'role': 'assistant', 'content': 'Hi'}]}]\n        ```\n    \"\"\"\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The instruction and response.\"\"\"\n        return [\"instruction\", \"response\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The conversation template.\"\"\"\n        return [\"conversation\"]\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Generate a conversation template from an instruction and a response.\n\n        Args:\n            inputs: The input data.\n\n        Yields:\n            The input data with the conversation template.\n        \"\"\"\n        for input in inputs:\n            input[\"conversation\"] = [\n                {\"role\": \"user\", \"content\": input[\"instruction\"]},\n                {\"role\": \"assistant\", \"content\": 
input[\"response\"]},\n            ]\n        yield inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.ConversationTemplate.inputs","title":"inputs: StepColumns property","text":"

The instruction and response.

"},{"location":"api/step_gallery/extra/#distilabel.steps.ConversationTemplate.outputs","title":"outputs: StepColumns property","text":"

The conversation template.

"},{"location":"api/step_gallery/extra/#distilabel.steps.ConversationTemplate.process","title":"process(inputs)","text":"

Generate a conversation template from an instruction and a response.

Parameters:

Name Type Description Default inputs StepInput

The input data.

required

Yields:

Type Description StepOutput

The input data with the conversation template.

Source code in src/distilabel/steps/formatting/conversation.py
def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Generate a conversation template from an instruction and a response.\n\n    Args:\n        inputs: The input data.\n\n    Yields:\n        The input data with the conversation template.\n    \"\"\"\n    for input in inputs:\n        input[\"conversation\"] = [\n            {\"role\": \"user\", \"content\": input[\"instruction\"]},\n            {\"role\": \"assistant\", \"content\": input[\"response\"]},\n        ]\n    yield inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO","title":"FormatChatGenerationDPO","text":"

Bases: Step

Format the output of a combination of a ChatGeneration + a preference task for Direct Preference Optimization (DPO).

FormatChatGenerationDPO is a Step that formats the output of the combination of a ChatGeneration task with a preference Task i.e. a task generating ratings such as UltraFeedback following the standard formatting from frameworks such as axolotl or alignment-handbook, so that those are used to rank the existing generations and provide the chosen and rejected generations based on the ratings.

Note

The messages column should contain at least one message from the user, the generations column should contain at least two generations, the ratings column should contain the same number of ratings as generations.

Input columns
  • messages (List[Dict[str, str]]): The conversation messages.
  • generations (List[str]): The generations produced by the LLM.
  • generation_models (List[str], optional): The model names used to generate the generations, only available if the model_name from the ChatGeneration task/s is combined into a single column named this way, otherwise, it will be ignored.
  • ratings (List[float]): The ratings for each of the generations, produced by a preference task such as UltraFeedback.
Output columns
  • prompt (str): The user message used to generate the generations with the LLM.
  • prompt_id (str): The SHA256 hash of the prompt.
  • chosen (List[Dict[str, str]]): The chosen generation based on the ratings.
  • chosen_model (str, optional): The model name used to generate the chosen generation, if the generation_models are available.
  • chosen_rating (float): The rating of the chosen generation.
  • rejected (List[Dict[str, str]]): The rejected generation based on the ratings.
  • rejected_model (str, optional): The model name used to generate the rejected generation, if the generation_models are available.
  • rejected_rating (float): The rating of the rejected generation.
Categories
  • format
  • chat-generation
  • preference
  • messages
  • generations

Examples:

Format your dataset for DPO fine tuning:

from distilabel.steps import FormatChatGenerationDPO\n\nformat_dpo = FormatChatGenerationDPO()\nformat_dpo.load()\n\n# NOTE: \"generation_models\" can be added optionally.\nresult = next(\n    format_dpo.process(\n        [\n            {\n                \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n                \"generations\": [\"4\", \"5\", \"6\"],\n                \"ratings\": [1, 0, -1],\n            }\n        ]\n    )\n)\n# >>> result\n# [\n#     {\n#         'messages': [{'role': 'user', 'content': \"What's 2+2?\"}],\n#         'generations': ['4', '5', '6'],\n#         'ratings': [1, 0, -1],\n#         'prompt': \"What's 2+2?\",\n#         'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n#         'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n#         'chosen_rating': 1,\n#         'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n#         'rejected_rating': -1\n#     }\n# ]\n
Source code in src/distilabel/steps/formatting/dpo.py
class FormatChatGenerationDPO(Step):\n    \"\"\"Format the output of a combination of a `ChatGeneration` + a preference task for Direct Preference Optimization (DPO).\n\n    `FormatChatGenerationDPO` is a `Step` that formats the output of the combination of a `ChatGeneration`\n    task with a preference `Task` i.e. a task generating `ratings` such as `UltraFeedback` following the standard\n    formatting from frameworks such as `axolotl` or `alignment-handbook`., so that those are used to rank the\n    existing generations and provide the `chosen` and `rejected` generations based on the `ratings`.\n\n    Note:\n        The `messages` column should contain at least one message from the user, the `generations`\n        column should contain at least two generations, the `ratings` column should contain the same\n        number of ratings as generations.\n\n    Input columns:\n        - messages (`List[Dict[str, str]]`): The conversation messages.\n        - generations (`List[str]`): The generations produced by the `LLM`.\n        - generation_models (`List[str]`, optional): The model names used to generate the `generations`,\n            only available if the `model_name` from the `ChatGeneration` task/s is combined into a single\n            column named this way, otherwise, it will be ignored.\n        - ratings (`List[float]`): The ratings for each of the `generations`, produced by a preference\n            task such as `UltraFeedback`.\n\n    Output columns:\n        - prompt (`str`): The user message used to generate the `generations` with the `LLM`.\n        - prompt_id (`str`): The `SHA256` hash of the `prompt`.\n        - chosen (`List[Dict[str, str]]`): The `chosen` generation based on the `ratings`.\n        - chosen_model (`str`, optional): The model name used to generate the `chosen` generation,\n            if the `generation_models` are available.\n        - chosen_rating (`float`): The rating of the `chosen` generation.\n        - rejected 
(`List[Dict[str, str]]`): The `rejected` generation based on the `ratings`.\n        - rejected_model (`str`, optional): The model name used to generate the `rejected` generation,\n            if the `generation_models` are available.\n        - rejected_rating (`float`): The rating of the `rejected` generation.\n\n    Categories:\n        - format\n        - chat-generation\n        - preference\n        - messages\n        - generations\n\n    Examples:\n        Format your dataset for DPO fine tuning:\n\n        ```python\n        from distilabel.steps import FormatChatGenerationDPO\n\n        format_dpo = FormatChatGenerationDPO()\n        format_dpo.load()\n\n        # NOTE: \"generation_models\" can be added optionally.\n        result = next(\n            format_dpo.process(\n                [\n                    {\n                        \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n                        \"generations\": [\"4\", \"5\", \"6\"],\n                        \"ratings\": [1, 0, -1],\n                    }\n                ]\n            )\n        )\n        # >>> result\n        # [\n        #     {\n        #         'messages': [{'role': 'user', 'content': \"What's 2+2?\"}],\n        #         'generations': ['4', '5', '6'],\n        #         'ratings': [1, 0, -1],\n        #         'prompt': \"What's 2+2?\",\n        #         'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n        #         'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n        #         'chosen_rating': 1,\n        #         'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n        #         'rejected_rating': -1\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"List of inputs required by the `Step`, which in this case are: `messages`, 
`generations`,\n        and `ratings`.\"\"\"\n        return [\"messages\", \"generations\", \"ratings\"]\n\n    @property\n    def optional_inputs(self) -> List[str]:\n        \"\"\"List of optional inputs, which are not required by the `Step` but used if available,\n        which in this case is: `generation_models`.\"\"\"\n        return [\"generation_models\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"List of outputs generated by the `Step`, which are: `prompt`, `prompt_id`, `chosen`,\n        `chosen_model`, `chosen_rating`, `rejected`, `rejected_model`, `rejected_rating`. Both\n        the `chosen_model` and `rejected_model` being optional and only used if `generation_models`\n        is available.\n\n        Reference:\n            - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n        \"\"\"\n        return [\n            \"prompt\",\n            \"prompt_id\",\n            \"chosen\",\n            \"chosen_model\",\n            \"chosen_rating\",\n            \"rejected\",\n            \"rejected_model\",\n            \"rejected_rating\",\n        ]\n\n    def process(self, *inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n        according to the DPO formatting standard.\n\n        Args:\n            *inputs: A list of `StepInput` to be combined.\n\n        Yields:\n            A `StepOutput` with batches of formatted `StepInput` following the DPO standard.\n        \"\"\"\n        for input in inputs:\n            for item in input:\n                item[\"prompt\"] = next(\n                    (\n                        turn[\"content\"]\n                        for turn in item[\"messages\"]\n                        if turn[\"role\"] == \"user\"\n                    ),\n                    None,\n                )\n                item[\"prompt_id\"] = hashlib.sha256(\n                    
item[\"prompt\"].encode(\"utf-8\")  # type: ignore\n                ).hexdigest()\n\n                chosen_idx = max(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n                item[\"chosen\"] = item[\"messages\"] + [\n                    {\n                        \"role\": \"assistant\",\n                        \"content\": item[\"generations\"][chosen_idx],\n                    }\n                ]\n                if \"generation_models\" in item:\n                    item[\"chosen_model\"] = item[\"generation_models\"][chosen_idx]\n                item[\"chosen_rating\"] = item[\"ratings\"][chosen_idx]\n\n                rejected_idx = min(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n                item[\"rejected\"] = item[\"messages\"] + [\n                    {\n                        \"role\": \"assistant\",\n                        \"content\": item[\"generations\"][rejected_idx],\n                    }\n                ]\n                if \"generation_models\" in item:\n                    item[\"rejected_model\"] = item[\"generation_models\"][rejected_idx]\n                item[\"rejected_rating\"] = item[\"ratings\"][rejected_idx]\n\n            yield input\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO.inputs","title":"inputs: StepColumns property","text":"

List of inputs required by the Step, which in this case are: messages, generations, and ratings.

"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO.optional_inputs","title":"optional_inputs: List[str] property","text":"

List of optional inputs, which are not required by the Step but used if available, which in this case is: generation_models.

"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO.outputs","title":"outputs: StepColumns property","text":"

List of outputs generated by the Step, which are: prompt, prompt_id, chosen, chosen_model, chosen_rating, rejected, rejected_model, rejected_rating. Both the chosen_model and rejected_model being optional and only used if generation_models is available.

Reference
  • Format inspired by https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO.process","title":"process(*inputs)","text":"

The process method formats the received StepInput or list of StepInput according to the DPO formatting standard.

Parameters:

Name Type Description Default *inputs StepInput

A list of StepInput to be combined.

()

Yields:

Type Description StepOutput

A StepOutput with batches of formatted StepInput following the DPO standard.

Source code in src/distilabel/steps/formatting/dpo.py
def process(self, *inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n    according to the DPO formatting standard.\n\n    Args:\n        *inputs: A list of `StepInput` to be combined.\n\n    Yields:\n        A `StepOutput` with batches of formatted `StepInput` following the DPO standard.\n    \"\"\"\n    for input in inputs:\n        for item in input:\n            item[\"prompt\"] = next(\n                (\n                    turn[\"content\"]\n                    for turn in item[\"messages\"]\n                    if turn[\"role\"] == \"user\"\n                ),\n                None,\n            )\n            item[\"prompt_id\"] = hashlib.sha256(\n                item[\"prompt\"].encode(\"utf-8\")  # type: ignore\n            ).hexdigest()\n\n            chosen_idx = max(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n            item[\"chosen\"] = item[\"messages\"] + [\n                {\n                    \"role\": \"assistant\",\n                    \"content\": item[\"generations\"][chosen_idx],\n                }\n            ]\n            if \"generation_models\" in item:\n                item[\"chosen_model\"] = item[\"generation_models\"][chosen_idx]\n            item[\"chosen_rating\"] = item[\"ratings\"][chosen_idx]\n\n            rejected_idx = min(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n            item[\"rejected\"] = item[\"messages\"] + [\n                {\n                    \"role\": \"assistant\",\n                    \"content\": item[\"generations\"][rejected_idx],\n                }\n            ]\n            if \"generation_models\" in item:\n                item[\"rejected_model\"] = item[\"generation_models\"][rejected_idx]\n            item[\"rejected_rating\"] = item[\"ratings\"][rejected_idx]\n\n        yield input\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO","title":"FormatTextGenerationDPO","text":"

Bases: Step

Format the output of your LLMs for Direct Preference Optimization (DPO).

FormatTextGenerationDPO is a Step that formats the output of the combination of a TextGeneration task with a preference Task i.e. a task generating ratings, so that those are used to rank the existing generations and provide the chosen and rejected generations based on the ratings. Use this step to transform the output of a combination of a TextGeneration + a preference task such as UltraFeedback following the standard formatting from frameworks such as axolotl or alignment-handbook.

Note

The generations column should contain at least two generations, the ratings column should contain the same number of ratings as generations.

Input columns
  • system_prompt (str, optional): The system prompt used within the LLM to generate the generations, if available.
  • instruction (str): The instruction used to generate the generations with the LLM.
  • generations (List[str]): The generations produced by the LLM.
  • generation_models (List[str], optional): The model names used to generate the generations, only available if the model_name from the TextGeneration task/s is combined into a single column named this way, otherwise, it will be ignored.
  • ratings (List[float]): The ratings for each of the generations, produced by a preference task such as UltraFeedback.
Output columns
  • prompt (str): The instruction used to generate the generations with the LLM.
  • prompt_id (str): The SHA256 hash of the prompt.
  • chosen (List[Dict[str, str]]): The chosen generation based on the ratings.
  • chosen_model (str, optional): The model name used to generate the chosen generation, if the generation_models are available.
  • chosen_rating (float): The rating of the chosen generation.
  • rejected (List[Dict[str, str]]): The rejected generation based on the ratings.
  • rejected_model (str, optional): The model name used to generate the rejected generation, if the generation_models are available.
  • rejected_rating (float): The rating of the rejected generation.
Categories
  • format
  • text-generation
  • preference
  • instruction
  • generations

Examples:

Format your dataset for DPO fine tuning:

from distilabel.steps import FormatTextGenerationDPO\n\nformat_dpo = FormatTextGenerationDPO()\nformat_dpo.load()\n\n# NOTE: Both \"system_prompt\" and \"generation_models\" can be added optionally.\nresult = next(\n    format_dpo.process(\n        [\n            {\n                \"instruction\": \"What's 2+2?\",\n                \"generations\": [\"4\", \"5\", \"6\"],\n                \"ratings\": [1, 0, -1],\n            }\n        ]\n    )\n)\n# >>> result\n# [\n#    {   'instruction': \"What's 2+2?\",\n#        'generations': ['4', '5', '6'],\n#        'ratings': [1, 0, -1],\n#        'prompt': \"What's 2+2?\",\n#        'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n#        'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n#        'chosen_rating': 1,\n#        'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n#        'rejected_rating': -1\n#    }\n# ]\n
Source code in src/distilabel/steps/formatting/dpo.py
class FormatTextGenerationDPO(Step):\n    \"\"\"Format the output of your LLMs for Direct Preference Optimization (DPO).\n\n    `FormatTextGenerationDPO` is a `Step` that formats the output of the combination of a `TextGeneration`\n    task with a preference `Task` i.e. a task generating `ratings`, so that those are used to rank the\n    existing generations and provide the `chosen` and `rejected` generations based on the `ratings`.\n    Use this step to transform the output of a combination of a `TextGeneration` + a preference task such as\n    `UltraFeedback` following the standard formatting from frameworks such as `axolotl` or `alignment-handbook`.\n\n    Note:\n        The `generations` column should contain at least two generations, the `ratings` column should\n        contain the same number of ratings as generations.\n\n    Input columns:\n        - system_prompt (`str`, optional): The system prompt used within the `LLM` to generate the\n            `generations`, if available.\n        - instruction (`str`): The instruction used to generate the `generations` with the `LLM`.\n        - generations (`List[str]`): The generations produced by the `LLM`.\n        - generation_models (`List[str]`, optional): The model names used to generate the `generations`,\n            only available if the `model_name` from the `TextGeneration` task/s is combined into a single\n            column named this way, otherwise, it will be ignored.\n        - ratings (`List[float]`): The ratings for each of the `generations`, produced by a preference\n            task such as `UltraFeedback`.\n\n    Output columns:\n        - prompt (`str`): The instruction used to generate the `generations` with the `LLM`.\n        - prompt_id (`str`): The `SHA256` hash of the `prompt`.\n        - chosen (`List[Dict[str, str]]`): The `chosen` generation based on the `ratings`.\n        - chosen_model (`str`, optional): The model name used to generate the `chosen` generation,\n            if the 
`generation_models` are available.\n        - chosen_rating (`float`): The rating of the `chosen` generation.\n        - rejected (`List[Dict[str, str]]`): The `rejected` generation based on the `ratings`.\n        - rejected_model (`str`, optional): The model name used to generate the `rejected` generation,\n            if the `generation_models` are available.\n        - rejected_rating (`float`): The rating of the `rejected` generation.\n\n    Categories:\n        - format\n        - text-generation\n        - preference\n        - instruction\n        - generations\n\n    Examples:\n        Format your dataset for DPO fine tuning:\n\n        ```python\n        from distilabel.steps import FormatTextGenerationDPO\n\n        format_dpo = FormatTextGenerationDPO()\n        format_dpo.load()\n\n        # NOTE: Both \"system_prompt\" and \"generation_models\" can be added optionally.\n        result = next(\n            format_dpo.process(\n                [\n                    {\n                        \"instruction\": \"What's 2+2?\",\n                        \"generations\": [\"4\", \"5\", \"6\"],\n                        \"ratings\": [1, 0, -1],\n                    }\n                ]\n            )\n        )\n        # >>> result\n        # [\n        #    {   'instruction': \"What's 2+2?\",\n        #        'generations': ['4', '5', '6'],\n        #        'ratings': [1, 0, -1],\n        #        'prompt': \"What's 2+2?\",\n        #        'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n        #        'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n        #        'chosen_rating': 1,\n        #        'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n        #        'rejected_rating': -1\n        #    }\n        # ]\n        ```\n    \"\"\"\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"List 
of inputs required by the `Step`, which in this case are: `instruction`, `generations`,\n        and `ratings`.\"\"\"\n        return {\n            \"system_prompt\": False,\n            \"instruction\": True,\n            \"generations\": True,\n            \"generation_models\": False,\n            \"ratings\": True,\n        }\n\n    @property\n    def optional_inputs(self) -> List[str]:\n        \"\"\"List of optional inputs, which are not required by the `Step` but used if available,\n        which in this case are: `system_prompt`, and `generation_models`.\"\"\"\n        return [\"system_prompt\", \"generation_models\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"List of outputs generated by the `Step`, which are: `prompt`, `prompt_id`, `chosen`,\n        `chosen_model`, `chosen_rating`, `rejected`, `rejected_model`, `rejected_rating`. Both\n        the `chosen_model` and `rejected_model` being optional and only used if `generation_models`\n        is available.\n\n        Reference:\n            - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n        \"\"\"\n        return [\n            \"prompt\",\n            \"prompt_id\",\n            \"chosen\",\n            \"chosen_model\",\n            \"chosen_rating\",\n            \"rejected\",\n            \"rejected_model\",\n            \"rejected_rating\",\n        ]\n\n    def process(self, *inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n        according to the DPO formatting standard.\n\n        Args:\n            *inputs: A list of `StepInput` to be combined.\n\n        Yields:\n            A `StepOutput` with batches of formatted `StepInput` following the DPO standard.\n        \"\"\"\n        for input in inputs:\n            for item in input:\n                messages = [\n                    {\"role\": \"user\", \"content\": 
item[\"instruction\"]},  # type: ignore\n                ]\n                if (\n                    \"system_prompt\" in item\n                    and isinstance(item[\"system_prompt\"], str)  # type: ignore\n                    and len(item[\"system_prompt\"]) > 0  # type: ignore\n                ):\n                    messages.insert(\n                        0,\n                        {\"role\": \"system\", \"content\": item[\"system_prompt\"]},  # type: ignore\n                    )\n\n                item[\"prompt\"] = item[\"instruction\"]\n                item[\"prompt_id\"] = hashlib.sha256(\n                    item[\"prompt\"].encode(\"utf-8\")  # type: ignore\n                ).hexdigest()\n\n                chosen_idx = max(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n                item[\"chosen\"] = messages + [\n                    {\n                        \"role\": \"assistant\",\n                        \"content\": item[\"generations\"][chosen_idx],\n                    }\n                ]\n                if \"generation_models\" in item:\n                    item[\"chosen_model\"] = item[\"generation_models\"][chosen_idx]\n                item[\"chosen_rating\"] = item[\"ratings\"][chosen_idx]\n\n                rejected_idx = min(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n                item[\"rejected\"] = messages + [\n                    {\n                        \"role\": \"assistant\",\n                        \"content\": item[\"generations\"][rejected_idx],\n                    }\n                ]\n                if \"generation_models\" in item:\n                    item[\"rejected_model\"] = item[\"generation_models\"][rejected_idx]\n                item[\"rejected_rating\"] = item[\"ratings\"][rejected_idx]\n\n            yield input\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO.inputs","title":"inputs: StepColumns property","text":"

List of inputs required by the Step, which in this case are: instruction, generations, and ratings.

"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO.optional_inputs","title":"optional_inputs: List[str] property","text":"

List of optional inputs, which are not required by the Step but used if available, which in this case are: system_prompt, and generation_models.

"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO.outputs","title":"outputs: StepColumns property","text":"

List of outputs generated by the Step, which are: prompt, prompt_id, chosen, chosen_model, chosen_rating, rejected, rejected_model, rejected_rating. Both the chosen_model and rejected_model being optional and only used if generation_models is available.

Reference
  • Format inspired by https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO.process","title":"process(*inputs)","text":"

The process method formats the received StepInput or list of StepInput according to the DPO formatting standard.

Parameters:

Name Type Description Default *inputs StepInput

A list of StepInput to be formatted.

()

Yields:

Type Description StepOutput

A StepOutput with batches of formatted StepInput following the DPO standard.

Source code in src/distilabel/steps/formatting/dpo.py
def process(self, *inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n    according to the DPO formatting standard.\n\n    Args:\n        *inputs: A list of `StepInput` to be combined.\n\n    Yields:\n        A `StepOutput` with batches of formatted `StepInput` following the DPO standard.\n    \"\"\"\n    for input in inputs:\n        for item in input:\n            messages = [\n                {\"role\": \"user\", \"content\": item[\"instruction\"]},  # type: ignore\n            ]\n            if (\n                \"system_prompt\" in item\n                and isinstance(item[\"system_prompt\"], str)  # type: ignore\n                and len(item[\"system_prompt\"]) > 0  # type: ignore\n            ):\n                messages.insert(\n                    0,\n                    {\"role\": \"system\", \"content\": item[\"system_prompt\"]},  # type: ignore\n                )\n\n            item[\"prompt\"] = item[\"instruction\"]\n            item[\"prompt_id\"] = hashlib.sha256(\n                item[\"prompt\"].encode(\"utf-8\")  # type: ignore\n            ).hexdigest()\n\n            chosen_idx = max(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n            item[\"chosen\"] = messages + [\n                {\n                    \"role\": \"assistant\",\n                    \"content\": item[\"generations\"][chosen_idx],\n                }\n            ]\n            if \"generation_models\" in item:\n                item[\"chosen_model\"] = item[\"generation_models\"][chosen_idx]\n            item[\"chosen_rating\"] = item[\"ratings\"][chosen_idx]\n\n            rejected_idx = min(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n            item[\"rejected\"] = messages + [\n                {\n                    \"role\": \"assistant\",\n                    \"content\": item[\"generations\"][rejected_idx],\n                }\n            ]\n            if 
\"generation_models\" in item:\n                item[\"rejected_model\"] = item[\"generation_models\"][rejected_idx]\n            item[\"rejected_rating\"] = item[\"ratings\"][rejected_idx]\n\n        yield input\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationSFT","title":"FormatChatGenerationSFT","text":"

Bases: Step

Format the output of a ChatGeneration task for Supervised Fine-Tuning (SFT).

FormatChatGenerationSFT is a Step that formats the output of a ChatGeneration task for Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as axolotl or alignment-handbook. The output of the ChatGeneration task is formatted into a chat-like conversation with the instruction as the user message and the generation as the assistant message. Optionally, if the system_prompt is available, it is included as the first message in the conversation.

Input columns
  • system_prompt (str, optional): The system prompt used within the LLM to generate the generation, if available.
  • messages (List[Dict[str, str]]): The conversation, in chat format, used to generate the generation with the LLM.
  • generation (str): The generation produced by the LLM.
Output columns
  • prompt (str): The content of the first user message found in messages.
  • prompt_id (str): The SHA256 hash of the prompt.
  • messages (List[Dict[str, str]]): The chat-like conversation with the instruction as the user message and the generation as the assistant message.
Categories
  • format
  • chat-generation
  • instruction
  • generation

Examples:

Format your dataset for SFT:

from distilabel.steps import FormatChatGenerationSFT\n\nformat_sft = FormatChatGenerationSFT()\nformat_sft.load()\n\n# NOTE: \"system_prompt\" can be added optionally.\nresult = next(\n    format_sft.process(\n        [\n            {\n                \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n                \"generation\": \"4\"\n            }\n        ]\n    )\n)\n# >>> result\n# [\n#     {\n#         'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n#         'generation': '4',\n#         'prompt': 'What's 2+2?',\n#         'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n#     }\n# ]\n
Source code in src/distilabel/steps/formatting/sft.py
class FormatChatGenerationSFT(Step):\n    \"\"\"Format the output of a `ChatGeneration` task for Supervised Fine-Tuning (SFT).\n\n    `FormatChatGenerationSFT` is a `Step` that formats the output of a `ChatGeneration` task for\n    Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as `axolotl`\n    or `alignment-handbook`. The output of the `ChatGeneration` task is formatted into a chat-like\n    conversation with the `instruction` as the user message and the `generation` as the assistant\n    message. Optionally, if the `system_prompt` is available, it is included as the first message\n    in the conversation.\n\n    Input columns:\n        - system_prompt (`str`, optional): The system prompt used within the `LLM` to generate the\n            `generation`, if available.\n        - instruction (`str`): The instruction used to generate the `generation` with the `LLM`.\n        - generation (`str`): The generation produced by the `LLM`.\n\n    Output columns:\n        - prompt (`str`): The instruction used to generate the `generation` with the `LLM`.\n        - prompt_id (`str`): The `SHA256` hash of the `prompt`.\n        - messages (`List[Dict[str, str]]`): The chat-like conversation with the `instruction` as\n            the user message and the `generation` as the assistant message.\n\n    Categories:\n        - format\n        - chat-generation\n        - instruction\n        - generation\n\n    Examples:\n        Format your dataset for SFT:\n\n        ```python\n        from distilabel.steps import FormatChatGenerationSFT\n\n        format_sft = FormatChatGenerationSFT()\n        format_sft.load()\n\n        # NOTE: \"system_prompt\" can be added optionally.\n        result = next(\n            format_sft.process(\n                [\n                    {\n                        \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n                        \"generation\": \"4\"\n                    }\n           
     ]\n            )\n        )\n        # >>> result\n        # [\n        #     {\n        #         'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n        #         'generation': '4',\n        #         'prompt': 'What's 2+2?',\n        #         'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"List of inputs required by the `Step`, which in this case are: `instruction`, and `generation`.\"\"\"\n        return [\"messages\", \"generation\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"List of outputs generated by the `Step`, which are: `prompt`, `prompt_id`, `messages`.\n\n        Reference:\n            - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n        \"\"\"\n        return [\"prompt\", \"prompt_id\", \"messages\"]\n\n    def process(self, *inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n        according to the SFT formatting standard.\n\n        Args:\n            *inputs: A list of `StepInput` to be combined.\n\n        Yields:\n            A `StepOutput` with batches of formatted `StepInput` following the SFT standard.\n        \"\"\"\n        for input in inputs:\n            for item in input:\n                item[\"prompt\"] = next(\n                    (\n                        turn[\"content\"]\n                        for turn in item[\"messages\"]\n                        if turn[\"role\"] == \"user\"\n                    ),\n                    None,\n                )\n\n                item[\"prompt_id\"] = hashlib.sha256(\n                    item[\"prompt\"].encode(\"utf-8\")  # type: ignore\n                ).hexdigest()\n\n                item[\"messages\"] = 
item[\"messages\"] + [\n                    {\"role\": \"assistant\", \"content\": item[\"generation\"]},  # type: ignore\n                ]\n            yield input\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationSFT.inputs","title":"inputs: StepColumns property","text":"

List of inputs required by the Step, which in this case are: messages and generation.

"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationSFT.outputs","title":"outputs: StepColumns property","text":"

List of outputs generated by the Step, which are: prompt, prompt_id, messages.

Reference
  • Format inspired by https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationSFT.process","title":"process(*inputs)","text":"

The process method formats the received StepInput or list of StepInput according to the SFT formatting standard.

Parameters:

Name Type Description Default *inputs StepInput

A list of StepInput to be formatted.

()

Yields:

Type Description StepOutput

A StepOutput with batches of formatted StepInput following the SFT standard.

Source code in src/distilabel/steps/formatting/sft.py
def process(self, *inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n    according to the SFT formatting standard.\n\n    Args:\n        *inputs: A list of `StepInput` to be combined.\n\n    Yields:\n        A `StepOutput` with batches of formatted `StepInput` following the SFT standard.\n    \"\"\"\n    for input in inputs:\n        for item in input:\n            item[\"prompt\"] = next(\n                (\n                    turn[\"content\"]\n                    for turn in item[\"messages\"]\n                    if turn[\"role\"] == \"user\"\n                ),\n                None,\n            )\n\n            item[\"prompt_id\"] = hashlib.sha256(\n                item[\"prompt\"].encode(\"utf-8\")  # type: ignore\n            ).hexdigest()\n\n            item[\"messages\"] = item[\"messages\"] + [\n                {\"role\": \"assistant\", \"content\": item[\"generation\"]},  # type: ignore\n            ]\n        yield input\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT","title":"FormatTextGenerationSFT","text":"

Bases: Step

Format the output of a TextGeneration task for Supervised Fine-Tuning (SFT).

FormatTextGenerationSFT is a Step that formats the output of a TextGeneration task for Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as axolotl or alignment-handbook. The output of the TextGeneration task is formatted into a chat-like conversation with the instruction as the user message and the generation as the assistant message. Optionally, if the system_prompt is available, it is included as the first message in the conversation.

Input columns
  • system_prompt (str, optional): The system prompt used within the LLM to generate the generation, if available.
  • instruction (str): The instruction used to generate the generation with the LLM.
  • generation (str): The generation produced by the LLM.
Output columns
  • prompt (str): The instruction used to generate the generation with the LLM.
  • prompt_id (str): The SHA256 hash of the prompt.
  • messages (List[Dict[str, str]]): The chat-like conversation with the instruction as the user message and the generation as the assistant message.
Categories
  • format
  • text-generation
  • instruction
  • generation

Examples:

Format your dataset for SFT fine tuning:

from distilabel.steps import FormatTextGenerationSFT\n\nformat_sft = FormatTextGenerationSFT()\nformat_sft.load()\n\n# NOTE: \"system_prompt\" can be added optionally.\nresult = next(\n    format_sft.process(\n        [\n            {\n                \"instruction\": \"What's 2+2?\",\n                \"generation\": \"4\"\n            }\n        ]\n    )\n)\n# >>> result\n# [\n#     {\n#         'instruction': 'What's 2+2?',\n#         'generation': '4',\n#         'prompt': 'What's 2+2?',\n#         'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n#         'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}]\n#     }\n# ]\n
Source code in src/distilabel/steps/formatting/sft.py
class FormatTextGenerationSFT(Step):\n    \"\"\"Format the output of a `TextGeneration` task for Supervised Fine-Tuning (SFT).\n\n    `FormatTextGenerationSFT` is a `Step` that formats the output of a `TextGeneration` task for\n    Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as `axolotl`\n    or `alignment-handbook`. The output of the `TextGeneration` task is formatted into a chat-like\n    conversation with the `instruction` as the user message and the `generation` as the assistant\n    message. Optionally, if the `system_prompt` is available, it is included as the first message\n    in the conversation.\n\n    Input columns:\n        - system_prompt (`str`, optional): The system prompt used within the `LLM` to generate the\n            `generation`, if available.\n        - instruction (`str`): The instruction used to generate the `generation` with the `LLM`.\n        - generation (`str`): The generation produced by the `LLM`.\n\n    Output columns:\n        - prompt (`str`): The instruction used to generate the `generation` with the `LLM`.\n        - prompt_id (`str`): The `SHA256` hash of the `prompt`.\n        - messages (`List[Dict[str, str]]`): The chat-like conversation with the `instruction` as\n            the user message and the `generation` as the assistant message.\n\n    Categories:\n        - format\n        - text-generation\n        - instruction\n        - generation\n\n    Examples:\n        Format your dataset for SFT fine tuning:\n\n        ```python\n        from distilabel.steps import FormatTextGenerationSFT\n\n        format_sft = FormatTextGenerationSFT()\n        format_sft.load()\n\n        # NOTE: \"system_prompt\" can be added optionally.\n        result = next(\n            format_sft.process(\n                [\n                    {\n                        \"instruction\": \"What's 2+2?\",\n                        \"generation\": \"4\"\n                    }\n                ]\n            
)\n        )\n        # >>> result\n        # [\n        #     {\n        #         'instruction': 'What's 2+2?',\n        #         'generation': '4',\n        #         'prompt': 'What's 2+2?',\n        #         'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n        #         'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}]\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"List of inputs required by the `Step`, which in this case are: `instruction`, and `generation`.\"\"\"\n        return {\n            \"system_prompt\": False,\n            \"instruction\": True,\n            \"generation\": True,\n        }\n\n    @property\n    def optional_inputs(self) -> List[str]:\n        \"\"\"List of optional inputs, which are not required by the `Step` but used if available,\n        which in this case is: `system_prompt`.\"\"\"\n        return [\"system_prompt\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"List of outputs generated by the `Step`, which are: `prompt`, `prompt_id`, `messages`.\n\n        Reference:\n            - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n        \"\"\"\n        return [\"prompt\", \"prompt_id\", \"messages\"]\n\n    def process(self, *inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n        according to the SFT formatting standard.\n\n        Args:\n            *inputs: A list of `StepInput` to be combined.\n\n        Yields:\n            A `StepOutput` with batches of formatted `StepInput` following the SFT standard.\n        \"\"\"\n        for input in inputs:\n            for item in input:\n                item[\"prompt\"] = item[\"instruction\"]\n\n                item[\"prompt_id\"] = hashlib.sha256(\n    
                item[\"prompt\"].encode(\"utf-8\")  # type: ignore\n                ).hexdigest()\n\n                item[\"messages\"] = [\n                    {\"role\": \"user\", \"content\": item[\"instruction\"]},  # type: ignore\n                    {\"role\": \"assistant\", \"content\": item[\"generation\"]},  # type: ignore\n                ]\n                if (\n                    \"system_prompt\" in item\n                    and isinstance(item[\"system_prompt\"], str)  # type: ignore\n                    and len(item[\"system_prompt\"]) > 0  # type: ignore\n                ):\n                    item[\"messages\"].insert(\n                        0,\n                        {\"role\": \"system\", \"content\": item[\"system_prompt\"]},  # type: ignore\n                    )\n\n            yield input\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT.inputs","title":"inputs: StepColumns property","text":"

List of inputs required by the Step, which in this case are: instruction, and generation.

"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT.optional_inputs","title":"optional_inputs: List[str] property","text":"

List of optional inputs, which are not required by the Step but used if available, which in this case is: system_prompt.

"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT.outputs","title":"outputs: StepColumns property","text":"

List of outputs generated by the Step, which are: prompt, prompt_id, messages.

Reference
  • Format inspired by https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT.process","title":"process(*inputs)","text":"

The process method formats the received StepInput or list of StepInput according to the SFT formatting standard.

Parameters:

Name Type Description Default *inputs StepInput

A list of StepInput to be formatted.

()

Yields:

Type Description StepOutput

A StepOutput with batches of formatted StepInput following the SFT standard.

Source code in src/distilabel/steps/formatting/sft.py
def process(self, *inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n    according to the SFT formatting standard.\n\n    Args:\n        *inputs: A list of `StepInput` to be combined.\n\n    Yields:\n        A `StepOutput` with batches of formatted `StepInput` following the SFT standard.\n    \"\"\"\n    for input in inputs:\n        for item in input:\n            item[\"prompt\"] = item[\"instruction\"]\n\n            item[\"prompt_id\"] = hashlib.sha256(\n                item[\"prompt\"].encode(\"utf-8\")  # type: ignore\n            ).hexdigest()\n\n            item[\"messages\"] = [\n                {\"role\": \"user\", \"content\": item[\"instruction\"]},  # type: ignore\n                {\"role\": \"assistant\", \"content\": item[\"generation\"]},  # type: ignore\n            ]\n            if (\n                \"system_prompt\" in item\n                and isinstance(item[\"system_prompt\"], str)  # type: ignore\n                and len(item[\"system_prompt\"]) > 0  # type: ignore\n            ):\n                item[\"messages\"].insert(\n                    0,\n                    {\"role\": \"system\", \"content\": item[\"system_prompt\"]},  # type: ignore\n                )\n\n        yield input\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.LoadDataFromDicts","title":"LoadDataFromDicts","text":"

Bases: GeneratorStep

Loads a dataset from a list of dictionaries.

GeneratorStep that loads a dataset from a list of dictionaries and yields it in batches.

Attributes:

Name Type Description data List[Dict[str, Any]]

The list of dictionaries to load the data from.

Runtime parameters
  • batch_size: The batch size to use when processing the data.
Output columns
  • dynamic (based on the keys found on the first dictionary of the list): The columns of the dataset.
Categories
  • load

Examples:

Load data from a list of dictionaries:

from distilabel.steps import LoadDataFromDicts\n\nloader = LoadDataFromDicts(\n    data=[{\"instruction\": \"What are 2+2?\"}] * 5,\n    batch_size=2\n)\nloader.load()\n\nresult = next(loader.process())\n# >>> result\n# ([{'instruction': 'What are 2+2?'}, {'instruction': 'What are 2+2?'}], False)\n
Source code in src/distilabel/steps/generators/data.py
class LoadDataFromDicts(GeneratorStep):\n    \"\"\"Loads a dataset from a list of dictionaries.\n\n    `GeneratorStep` that loads a dataset from a list of dictionaries and yields it in\n    batches.\n\n    Attributes:\n        data: The list of dictionaries to load the data from.\n\n    Runtime parameters:\n        - `batch_size`: The batch size to use when processing the data.\n\n    Output columns:\n        - dynamic (based on the keys found on the first dictionary of the list): The columns\n            of the dataset.\n\n    Categories:\n        - load\n\n    Examples:\n        Load data from a list of dictionaries:\n\n        ```python\n        from distilabel.steps import LoadDataFromDicts\n\n        loader = LoadDataFromDicts(\n            data=[{\"instruction\": \"What are 2+2?\"}] * 5,\n            batch_size=2\n        )\n        loader.load()\n\n        result = next(loader.process())\n        # >>> result\n        # ([{'instruction': 'What are 2+2?'}, {'instruction': 'What are 2+2?'}], False)\n        ```\n    \"\"\"\n\n    data: List[Dict[str, Any]] = Field(default_factory=list, exclude=True)\n\n    @override\n    def process(self, offset: int = 0) -> \"GeneratorStepOutput\":  # type: ignore\n        \"\"\"Yields batches from a list of dictionaries.\n\n        Args:\n            offset: The offset to start the generation from. 
Defaults to `0`.\n\n        Yields:\n            A list of Python dictionaries as read from the inputs (propagated in batches)\n            and a flag indicating whether the yield batch is the last one.\n        \"\"\"\n        if offset:\n            self.data = self.data[offset:]\n\n        while self.data:\n            batch = self.data[: self.batch_size]\n            self.data = self.data[self.batch_size :]\n            yield (\n                batch,\n                True if len(self.data) == 0 else False,\n            )\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"Returns a list of strings with the names of the columns that the step will generate.\"\"\"\n        return list(self.data[0].keys())\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.LoadDataFromDicts.outputs","title":"outputs: List[str] property","text":"

Returns a list of strings with the names of the columns that the step will generate.

"},{"location":"api/step_gallery/extra/#distilabel.steps.LoadDataFromDicts.process","title":"process(offset=0)","text":"

Yields batches from a list of dictionaries.

Parameters:

Name Type Description Default offset int

The offset to start the generation from. Defaults to 0.

0

Yields:

Type Description GeneratorStepOutput

A list of Python dictionaries as read from the inputs (propagated in batches)

GeneratorStepOutput

and a flag indicating whether the yielded batch is the last one.

Source code in src/distilabel/steps/generators/data.py
@override\ndef process(self, offset: int = 0) -> \"GeneratorStepOutput\":  # type: ignore\n    \"\"\"Yields batches from a list of dictionaries.\n\n    Args:\n        offset: The offset to start the generation from. Defaults to `0`.\n\n    Yields:\n        A list of Python dictionaries as read from the inputs (propagated in batches)\n        and a flag indicating whether the yield batch is the last one.\n    \"\"\"\n    if offset:\n        self.data = self.data[offset:]\n\n    while self.data:\n        batch = self.data[: self.batch_size]\n        self.data = self.data[self.batch_size :]\n        yield (\n            batch,\n            True if len(self.data) == 0 else False,\n        )\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.DataSampler","title":"DataSampler","text":"

Bases: GeneratorStep

Step to sample from a dataset.

GeneratorStep that samples from a dataset and yields it in batches. This step is useful when a pipeline can benefit from including examples in the prompts (for example, for few-shot learning) that change on each row. For example, you can pass a list of dictionaries with N examples and generate M samples from it (assuming you have another step loading data, M should be the same size as the data being loaded in that step). The size argument S is the number of examples sampled per generated row, so each row contains S examples to be used as few-shot examples.

Attributes:

Name Type Description data List[Dict[str, Any]]

The list of dictionaries to sample from.

size int

Number of samples per example. For example in a few-shot learning scenario, the number of few-shot examples that will be generated per example. Defaults to 2.

samples int

Number of examples that will be generated by the step in total. If used with another loader step, this should be the same as the number of samples in the loader step. Defaults to 100.

Output columns
  • dynamic (based on the keys found on the first dictionary of the list): The columns of the dataset.
Categories
  • load

Examples:

Sample data from a list of dictionaries:

from distilabel.steps import DataSampler\n\nsampler = DataSampler(\n    data=[{\"sample\": f\"sample {i}\"} for i in range(30)],\n    samples=10,\n    size=2,\n    batch_size=4\n)\nsampler.load()\n\nresult = next(sampler.process())\n# >>> result\n# ([{'sample': ['sample 7', 'sample 0']}, {'sample': ['sample 2', 'sample 21']}, {'sample': ['sample 17', 'sample 12']}, {'sample': ['sample 2', 'sample 14']}], False)\n

Pipeline with a loader and a sampler combined in a single stream:

from datasets import load_dataset\n\nfrom distilabel.steps import LoadDataFromDicts, DataSampler\nfrom distilabel.steps.tasks.apigen.utils import PrepareExamples\nfrom distilabel.pipeline import Pipeline\n\nds = (\n    load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n    .shuffle(seed=42)\n    .select(range(500))\n    .to_list()\n)\ndata = [\n    {\n        \"func_name\": \"final_velocity\",\n        \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n    },\n    {\n        \"func_name\": \"permutation_count\",\n        \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n    },\n    {\n        \"func_name\": \"getdivision\",\n        \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n    },\n]\nwith Pipeline(name=\"APIGenPipeline\") as pipeline:\n    loader_seeds = LoadDataFromDicts(data=data)\n    sampler = DataSampler(\n        data=ds,\n        size=2,\n        samples=len(data),\n        batch_size=8,\n    )\n    prep_examples = PrepareExamples()\n\n    sampler >> prep_examples\n    (\n        [loader_seeds, prep_examples]\n        >> combine_steps\n    )\n# Now we have a single stream of data with the loader and the sampler data\n
Source code in src/distilabel/steps/generators/data_sampler.py
class DataSampler(GeneratorStep):\n    \"\"\"Step to sample from a dataset.\n\n    `GeneratorStep` that samples from a dataset and yields it in batches.\n    This step is useful when you have a pipeline that can benefit from using examples\n    in the prompts for example as few-shot learning, that can be changing on each row.\n    For example, you can pass a list of dictionaries with N examples and generate M samples\n    from it (assuming you have another step loading data, this M should have the same size\n    as the data being loaded in that step). The size S argument is the number of samples per\n    row generated, so each example would contain S examples to be used as examples.\n\n    Attributes:\n        data: The list of dictionaries to sample from.\n        size: Number of samples per example. For example in a few-shot learning scenario,\n            the number of few-shot examples that will be generated per example. Defaults to 2.\n        samples: Number of examples that will be generated by the step in total.\n            If used with another loader step, this should be the same as the number\n            of samples in the loader step. 
Defaults to 100.\n\n    Output columns:\n        - dynamic (based on the keys found on the first dictionary of the list): The columns\n            of the dataset.\n\n    Categories:\n        - load\n\n    Examples:\n        Sample data from a list of dictionaries:\n\n        ```python\n        from distilabel.steps import DataSampler\n\n        sampler = DataSampler(\n            data=[{\"sample\": f\"sample {i}\"} for i in range(30)],\n            samples=10,\n            size=2,\n            batch_size=4\n        )\n        sampler.load()\n\n        result = next(sampler.process())\n        # >>> result\n        # ([{'sample': ['sample 7', 'sample 0']}, {'sample': ['sample 2', 'sample 21']}, {'sample': ['sample 17', 'sample 12']}, {'sample': ['sample 2', 'sample 14']}], False)\n        ```\n\n        Pipeline with a loader and a sampler combined in a single stream:\n\n        ```python\n        from datasets import load_dataset\n\n        from distilabel.steps import LoadDataFromDicts, DataSampler\n        from distilabel.steps.tasks.apigen.utils import PrepareExamples\n        from distilabel.pipeline import Pipeline\n\n        ds = (\n            load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n            .shuffle(seed=42)\n            .select(range(500))\n            .to_list()\n        )\n        data = [\n            {\n                \"func_name\": \"final_velocity\",\n                \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n            },\n            {\n                \"func_name\": \"permutation_count\",\n                \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n            },\n            {\n                \"func_name\": \"getdivision\",\n                \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n            },\n        ]\n        with 
Pipeline(name=\"APIGenPipeline\") as pipeline:\n            loader_seeds = LoadDataFromDicts(data=data)\n            sampler = DataSampler(\n                data=ds,\n                size=2,\n                samples=len(data),\n                batch_size=8,\n            )\n            prep_examples = PrepareExamples()\n\n            sampler >> prep_examples\n            (\n                [loader_seeds, prep_examples]\n                >> combine_steps\n            )\n        # Now we have a single stream of data with the loader and the sampler data\n        ```\n    \"\"\"\n\n    data: List[Dict[str, Any]] = Field(default_factory=list, exclude=True)\n    size: int = Field(\n        default=2,\n        description=(\n            \"Number of samples per example. For example in a few-shot learning scenario, the number \"\n            \"of few-shot examples that will be generated per example.\"\n        ),\n    )\n    samples: int = Field(\n        default=100,\n        description=(\n            \"Number of examples that will be generated by the step in total. \"\n            \"If used with another loader step, this should be the same as the number of \"\n            \"samples in the loader step.\"\n        ),\n    )\n\n    @override\n    def process(self, offset: int = 0) -> \"GeneratorStepOutput\":  # type: ignore\n        \"\"\"Yields batches from a list of dictionaries.\n\n        Args:\n            offset: The offset to start the generation from. 
Defaults to `0`.\n\n        Yields:\n            A list of Python dictionaries as read from the inputs (propagated in batches)\n            and a flag indicating whether the yield batch is the last one.\n        \"\"\"\n\n        total_samples = 0\n\n        while total_samples < self.samples:\n            batch = []\n            bs = min(self.batch_size, self.samples - total_samples)\n            for _ in range(self.batch_size):\n                choices = random.choices(self.data, k=self.size)\n                choices = self._transform_data(choices)\n                batch.extend(choices)\n            total_samples += bs\n            batch = list(islice(batch, bs))\n            yield (batch, True if total_samples >= self.samples else False)\n            batch = []\n\n    @staticmethod\n    def _transform_data(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:\n        if not data:\n            return []\n\n        result = {key: [] for key in data[0].keys()}\n\n        for item in data:\n            for key, value in item.items():\n                result[key].append(value)\n\n        return [result]\n\n    @property\n    def outputs(self) -> List[str]:\n        return list(self.data[0].keys())\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.DataSampler.process","title":"process(offset=0)","text":"

Yields batches from a list of dictionaries.

Parameters:

Name Type Description Default offset int

The offset to start the generation from. Defaults to 0.

0

Yields:

Type Description GeneratorStepOutput

A list of Python dictionaries as read from the inputs (propagated in batches)

GeneratorStepOutput

and a flag indicating whether the yielded batch is the last one.

Source code in src/distilabel/steps/generators/data_sampler.py
@override\ndef process(self, offset: int = 0) -> \"GeneratorStepOutput\":  # type: ignore\n    \"\"\"Yields batches from a list of dictionaries.\n\n    Args:\n        offset: The offset to start the generation from. Defaults to `0`.\n\n    Yields:\n        A list of Python dictionaries as read from the inputs (propagated in batches)\n        and a flag indicating whether the yield batch is the last one.\n    \"\"\"\n\n    total_samples = 0\n\n    while total_samples < self.samples:\n        batch = []\n        bs = min(self.batch_size, self.samples - total_samples)\n        for _ in range(self.batch_size):\n            choices = random.choices(self.data, k=self.size)\n            choices = self._transform_data(choices)\n            batch.extend(choices)\n        total_samples += bs\n        batch = list(islice(batch, bs))\n        yield (batch, True if total_samples >= self.samples else False)\n        batch = []\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.RewardModelScore","title":"RewardModelScore","text":"

Bases: Step, CudaDevicePlacementMixin

Assign a score to a response using a Reward Model.

RewardModelScore is a Step that, using a Reward Model (RM) loaded using transformers, assigns a score to a response generated for an instruction, or a score to a multi-turn conversation.

Attributes:

Name Type Description model str

the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

revision str

if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\".

torch_dtype str

the torch dtype to use for the model e.g. \"float16\", \"float32\", etc. Defaults to \"auto\".

trust_remote_code bool

whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False.

device_map Union[str, Dict[str, Any], None]

a dictionary mapping each layer of the model to a device, or a mode like \"sequential\" or \"auto\". Defaults to None.

token Union[SecretStr, None]

the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment or huggingface_hub package local configuration will be used. Defaults to None.

truncation bool

whether to truncate sequences at the maximum length. Defaults to False.

max_length Union[int, None]

maximum length to use for padding or truncation. Defaults to None.

Input columns
  • instruction (str, optional): the instruction used to generate a response. If provided, then response must be provided too.
  • response (str, optional): the response generated for instruction. If provided, then instruction must be provided too.
  • conversation (ChatType, optional): a multi-turn conversation. If not provided, then instruction and response columns must be provided.
Output columns
  • score (float): the score given by the reward model for the instruction-response pair or the conversation.
Categories
  • scorer

Examples:

Assigning a score for an instruction-response pair:

from distilabel.steps import RewardModelScore\n\nstep = RewardModelScore(\n    model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n)\n\nstep.load()\n\nresult = next(\n    step.process(\n        inputs=[\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"response\": \"The output of 2+2 is 4\",\n            },\n            {\"instruction\": \"How much is 2+2?\", \"response\": \"4\"},\n        ]\n    )\n)\n# [\n#   {'instruction': 'How much is 2+2?', 'response': 'The output of 2+2 is 4', 'score': 0.11690367758274078},\n#   {'instruction': 'How much is 2+2?', 'response': '4', 'score': 0.10300665348768234}\n# ]\n

Assigning a score for a multi-turn conversation:

from distilabel.steps import RewardModelScore\n\nstep = RewardModelScore(\n    model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n)\n\nstep.load()\n\nresult = next(\n    step.process(\n        inputs=[\n            {\n                \"conversation\": [\n                    {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n                    {\"role\": \"assistant\", \"content\": \"The output of 2+2 is 4\"},\n                ],\n            },\n            {\n                \"conversation\": [\n                    {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n                    {\"role\": \"assistant\", \"content\": \"4\"},\n                ],\n            },\n        ]\n    )\n)\n# [\n#   {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': 'The output of 2+2 is 4'}], 'score': 0.11690367758274078},\n#   {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': '4'}], 'score': 0.10300665348768234}\n# ]\n
Source code in src/distilabel/steps/reward_model.py
class RewardModelScore(Step, CudaDevicePlacementMixin):\n    \"\"\"Assign a score to a response using a Reward Model.\n\n    `RewardModelScore` is a `Step` that using a Reward Model (RM) loaded using `transformers`,\n    assigns an score to a response generated for an instruction, or a score to a multi-turn\n    conversation.\n\n    Attributes:\n        model: the model Hugging Face Hub repo id or a path to a directory containing the\n            model weights and configuration files.\n        revision: if `model` refers to a Hugging Face Hub repository, then the revision\n            (e.g. a branch name or a commit id) to use. Defaults to `\"main\"`.\n        torch_dtype: the torch dtype to use for the model e.g. \"float16\", \"float32\", etc.\n            Defaults to `\"auto\"`.\n        trust_remote_code: whether to allow fetching and executing remote code fetched\n            from the repository in the Hub. Defaults to `False`.\n        device_map: a dictionary mapping each layer of the model to a device, or a mode like `\"sequential\"` or `\"auto\"`. Defaults to `None`.\n        token: the Hugging Face Hub token that will be used to authenticate to the Hugging\n            Face Hub. If not provided, the `HF_TOKEN` environment or `huggingface_hub` package\n            local configuration will be used. Defaults to `None`.\n        truncation: whether to truncate sequences at the maximum length. Defaults to `False`.\n        max_length: maximun length to use for padding or truncation. Defaults to `None`.\n\n    Input columns:\n        - instruction (`str`, optional): the instruction used to generate a `response`.\n            If provided, then `response` must be provided too.\n        - response (`str`, optional): the response generated for `instruction`. If provided,\n            then `instruction` must be provide too.\n        - conversation (`ChatType`, optional): a multi-turn conversation. 
If not provided,\n            then `instruction` and `response` columns must be provided.\n\n    Output columns:\n        - score (`float`): the score given by the reward model for the instruction-response\n            pair or the conversation.\n\n    Categories:\n        - scorer\n\n    Examples:\n        Assigning an score for an instruction-response pair:\n\n        ```python\n        from distilabel.steps import RewardModelScore\n\n        step = RewardModelScore(\n            model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n        )\n\n        step.load()\n\n        result = next(\n            step.process(\n                inputs=[\n                    {\n                        \"instruction\": \"How much is 2+2?\",\n                        \"response\": \"The output of 2+2 is 4\",\n                    },\n                    {\"instruction\": \"How much is 2+2?\", \"response\": \"4\"},\n                ]\n            )\n        )\n        # [\n        #   {'instruction': 'How much is 2+2?', 'response': 'The output of 2+2 is 4', 'score': 0.11690367758274078},\n        #   {'instruction': 'How much is 2+2?', 'response': '4', 'score': 0.10300665348768234}\n        # ]\n        ```\n\n        Assigning an score for a multi-turn conversation:\n\n        ```python\n        from distilabel.steps import RewardModelScore\n\n        step = RewardModelScore(\n            model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n        )\n\n        step.load()\n\n        result = next(\n            step.process(\n                inputs=[\n                    {\n                        \"conversation\": [\n                            {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n                            {\"role\": \"assistant\", \"content\": \"The output of 2+2 is 4\"},\n                        ],\n                    },\n                    {\n                        \"conversation\": [\n 
                           {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n                            {\"role\": \"assistant\", \"content\": \"4\"},\n                        ],\n                    },\n                ]\n            )\n        )\n        # [\n        #   {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': 'The output of 2+2 is 4'}], 'score': 0.11690367758274078},\n        #   {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': '4'}], 'score': 0.10300665348768234}\n        # ]\n        ```\n    \"\"\"\n\n    model: str\n    revision: str = \"main\"\n    torch_dtype: str = \"auto\"\n    trust_remote_code: bool = False\n    device_map: Union[str, Dict[str, Any], None] = None\n    token: Union[SecretStr, None] = Field(\n        default_factory=lambda: os.getenv(HF_TOKEN_ENV_VAR), description=\"\"\n    )\n    truncation: bool = False\n    max_length: Union[int, None] = None\n\n    _model: Union[\"PreTrainedModel\", None] = PrivateAttr(None)\n    _tokenizer: Union[\"PreTrainedTokenizer\", None] = PrivateAttr(None)\n\n    def load(self) -> None:\n        super().load()\n\n        if self.device_map in [\"cuda\", \"auto\"]:\n            CudaDevicePlacementMixin.load(self)\n\n        try:\n            from transformers import AutoModelForSequenceClassification, AutoTokenizer\n        except ImportError as e:\n            raise ImportError(\n                \"`transformers` is not installed. 
Please install it using `pip install transformers`.\"\n            ) from e\n\n        token = self.token.get_secret_value() if self.token is not None else self.token\n\n        self._model = AutoModelForSequenceClassification.from_pretrained(\n            self.model,\n            revision=self.revision,\n            torch_dtype=self.torch_dtype,\n            trust_remote_code=self.trust_remote_code,\n            device_map=self.device_map,\n            token=token,\n        )\n        self._tokenizer = AutoTokenizer.from_pretrained(\n            self.model,\n            revision=self.revision,\n            torch_dtype=self.torch_dtype,\n            trust_remote_code=self.trust_remote_code,\n            token=token,\n        )\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"Either `response` and `instruction`, or a `conversation` columns.\"\"\"\n        return {\n            \"response\": False,\n            \"instruction\": False,\n            \"conversation\": False,\n        }\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The `score` given by the reward model.\"\"\"\n        return [\"score\"]\n\n    def _prepare_conversation(self, input: Dict[str, Any]) -> \"ChatType\":\n        if \"instruction\" in input and \"response\" in input:\n            return [\n                {\"role\": \"user\", \"content\": input[\"instruction\"]},\n                {\"role\": \"assistant\", \"content\": input[\"response\"]},\n            ]\n\n        return input[\"conversation\"]\n\n    def _prepare_inputs(self, inputs: List[Dict[str, Any]]) -> \"torch.Tensor\":\n        return self._tokenizer.apply_chat_template(  # type: ignore\n            [self._prepare_conversation(input) for input in inputs],  # type: ignore\n            return_tensors=\"pt\",\n            padding=True,\n            truncation=self.truncation,\n            max_length=self.max_length,\n        ).to(self._model.device)  # type: ignore\n\n    def 
_inference(self, inputs: List[Dict[str, Any]]) -> List[float]:\n        import torch\n\n        input_ids = self._prepare_inputs(inputs)\n        with torch.no_grad():\n            output = self._model(input_ids)  # type: ignore\n            logits = output.logits\n            if logits.shape == (2, 1):\n                logits = logits.squeeze(-1)\n            return logits.tolist()\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        scores = self._inference(inputs)\n        for input, score in zip(inputs, scores):\n            input[\"score\"] = score\n        yield inputs\n\n    def unload(self) -> None:\n        if self.device_map in [\"cuda\", \"auto\"]:\n            CudaDevicePlacementMixin.unload(self)\n        super().unload()\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.RewardModelScore.inputs","title":"inputs: StepColumns property","text":"

Either the response and instruction columns, or a conversation column.

"},{"location":"api/step_gallery/extra/#distilabel.steps.RewardModelScore.outputs","title":"outputs: StepColumns property","text":"

The score given by the reward model.

"},{"location":"api/step_gallery/extra/#distilabel.steps.TruncateTextColumn","title":"TruncateTextColumn","text":"

Bases: Step

Truncate a row using a tokenizer or the number of characters.

TruncateTextColumn is a Step that truncates a row according to the max length. If the tokenizer is provided, then the row will be truncated using the tokenizer, and the max_length will be used as the maximum number of tokens, otherwise it will be used as the maximum number of characters. The TruncateTextColumn step is useful when one wants to truncate a row to a certain length, to avoid posterior errors in the model due to the length.

Attributes:

Name Type Description column str

the column to truncate. Defaults to \"text\".

max_length int

the maximum length to use for truncation. If a tokenizer is given, corresponds to the number of tokens, otherwise corresponds to the number of characters. Defaults to 8192.

tokenizer Optional[str]

the name of the tokenizer to use. If provided, the row will be truncated using the tokenizer. Defaults to None.

Input columns
  • dynamic (determined by column attribute): The columns to be truncated, defaults to \"text\".
Output columns
  • dynamic (determined by column attribute): The truncated column.
Categories
  • text-manipulation

Examples:

Truncating a row to a given number of tokens:

from distilabel.steps import TruncateTextColumn\n\ntrunc = TruncateTextColumn(\n    tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    max_length=4,\n    column=\"text\"\n)\n\ntrunc.load()\n\nresult = next(\n    trunc.process(\n        [\n            {\"text\": \"This is a sample text that is longer than 10 characters\"}\n        ]\n    )\n)\n# result\n# [{'text': 'This is a sample'}]\n

Truncating a row to a given number of characters:

from distilabel.steps import TruncateTextColumn\n\ntrunc = TruncateTextColumn(max_length=10)\n\ntrunc.load()\n\nresult = next(\n    trunc.process(\n        [\n            {\"text\": \"This is a sample text that is longer than 10 characters\"}\n        ]\n    )\n)\n# result\n# [{'text': 'This is a '}]\n
Source code in src/distilabel/steps/truncate.py
class TruncateTextColumn(Step):\n    \"\"\"Truncate a row using a tokenizer or the number of characters.\n\n    `TruncateTextColumn` is a `Step` that truncates a row according to the max length. If\n    the `tokenizer` is provided, then the row will be truncated using the tokenizer,\n    and the `max_length` will be used as the maximum number of tokens, otherwise it will\n    be used as the maximum number of characters. The `TruncateTextColumn` step is useful when one\n    wants to truncate a row to a certain length, to avoid posterior errors in the model due\n    to the length.\n\n    Attributes:\n        column: the column to truncate. Defaults to `\"text\"`.\n        max_length: the maximum length to use for truncation.\n            If a `tokenizer` is given, corresponds to the number of tokens,\n            otherwise corresponds to the number of characters. Defaults to `8192`.\n        tokenizer: the name of the tokenizer to use. If provided, the row will be\n            truncated using the tokenizer. 
Defaults to `None`.\n\n    Input columns:\n        - dynamic (determined by `column` attribute): The columns to be truncated, defaults to \"text\".\n\n    Output columns:\n        - dynamic (determined by `column` attribute): The truncated column.\n\n    Categories:\n        - text-manipulation\n\n    Examples:\n        Truncating a row to a given number of tokens:\n\n        ```python\n        from distilabel.steps import TruncateTextColumn\n\n        trunc = TruncateTextColumn(\n            tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            max_length=4,\n            column=\"text\"\n        )\n\n        trunc.load()\n\n        result = next(\n            trunc.process(\n                [\n                    {\"text\": \"This is a sample text that is longer than 10 characters\"}\n                ]\n            )\n        )\n        # result\n        # [{'text': 'This is a sample'}]\n        ```\n\n        Truncating a row to a given number of characters:\n\n        ```python\n        from distilabel.steps import TruncateTextColumn\n\n        trunc = TruncateTextColumn(max_length=10)\n\n        trunc.load()\n\n        result = next(\n            trunc.process(\n                [\n                    {\"text\": \"This is a sample text that is longer than 10 characters\"}\n                ]\n            )\n        )\n        # result\n        # [{'text': 'This is a '}]\n        ```\n    \"\"\"\n\n    column: str = \"text\"\n    max_length: int = 8192\n    tokenizer: Optional[str] = None\n    _truncator: Optional[Callable[[str], str]] = None\n    _tokenizer: Optional[Any] = None\n\n    def load(self):\n        super().load()\n        if self.tokenizer:\n            if not importlib.util.find_spec(\"transformers\"):\n                raise ImportError(\n                    \"`transformers` is needed to tokenize, but is not installed. 
\"\n                    \"Please install it using `pip install transformers`.\"\n                )\n\n            from transformers import AutoTokenizer\n\n            self._tokenizer = AutoTokenizer.from_pretrained(self.tokenizer)\n            self._truncator = self._truncate_with_tokenizer\n        else:\n            self._truncator = self._truncate_with_length\n\n    @property\n    def inputs(self) -> List[str]:\n        return [self.column]\n\n    @property\n    def outputs(self) -> List[str]:\n        return self.inputs\n\n    def _truncate_with_length(self, text: str) -> str:\n        \"\"\"Truncates the text according to the number of characters.\"\"\"\n        return text[: self.max_length]\n\n    def _truncate_with_tokenizer(self, text: str) -> str:\n        \"\"\"Truncates the text according to the number of characters using the tokenizer.\"\"\"\n        return self._tokenizer.decode(\n            self._tokenizer.encode(\n                text,\n                add_special_tokens=False,\n                max_length=self.max_length,\n                truncation=True,\n            )\n        )\n\n    @override\n    def process(self, inputs: StepInput) -> \"StepOutput\":\n        for input in inputs:\n            input[self.column] = self._truncator(input[self.column])\n        yield inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.TruncateTextColumn._truncate_with_length","title":"_truncate_with_length(text)","text":"

Truncates the text according to the number of characters.

Source code in src/distilabel/steps/truncate.py
def _truncate_with_length(self, text: str) -> str:\n    \"\"\"Truncates the text according to the number of characters.\"\"\"\n    return text[: self.max_length]\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.TruncateTextColumn._truncate_with_tokenizer","title":"_truncate_with_tokenizer(text)","text":"

Truncates the text according to the number of tokens using the tokenizer.

Source code in src/distilabel/steps/truncate.py
def _truncate_with_tokenizer(self, text: str) -> str:\n    \"\"\"Truncates the text according to the number of characters using the tokenizer.\"\"\"\n    return self._tokenizer.decode(\n        self._tokenizer.encode(\n            text,\n            add_special_tokens=False,\n            max_length=self.max_length,\n            truncation=True,\n        )\n    )\n
"},{"location":"api/step_gallery/hugging_face/","title":"Hugging Face","text":"

This section contains the existing steps integrated with Hugging Face so as to easily push the generated datasets to Hugging Face.

"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromDisk","title":"LoadDataFromDisk","text":"

Bases: LoadDataFromHub

Load a dataset that was previously saved to disk.

If you previously saved your dataset using the save_to_disk method, or Distiset.save_to_disk, you can load it again to build a new pipeline using this class.

Attributes:

Name Type Description dataset_path RuntimeParameter[Union[str, Path]]

The path to the dataset or distiset.

split Optional[RuntimeParameter[str]]

The split of the dataset to load (typically will be train, test or validation).

config Optional[RuntimeParameter[str]]

The configuration of the dataset to load. Defaults to default; if there are multiple configurations in the dataset, this must be supplied or an error is raised.

Runtime parameters
  • batch_size: The batch size to use when processing the data.
  • dataset_path: The path to the dataset or distiset.
  • is_distiset: Whether the dataset to load is a Distiset or not. Defaults to False.
  • split: The split of the dataset to load. Defaults to None, in which case the whole Dataset/Distiset is loaded.
  • config: The configuration of the dataset to load. Defaults to default; if there are multiple configurations in the dataset, this must be supplied or an error is raised.
  • num_examples: The number of examples to load from the dataset. By default will load all examples.
  • storage_options: Key/value pairs to be passed on to the file-system backend, if any. Defaults to None.
Output columns
  • dynamic (all): The columns that will be generated by this step, based on the dataset loaded from disk.
Categories
  • load

Examples:

Load data from a Hugging Face Dataset:

from distilabel.steps import LoadDataFromDisk\n\nloader = LoadDataFromDisk(dataset_path=\"path/to/dataset\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n

Load data from a distilabel Distiset:

from distilabel.steps import LoadDataFromDisk\n\n# Specify the configuration to load.\nloader = LoadDataFromDisk(\n    dataset_path=\"path/to/dataset\",\n    is_distiset=True,\n    config=\"leaf_step_1\"\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'a': 1}, {'a': 2}, {'a': 3}], True)\n

Load data from a Hugging Face Dataset or Distiset in your cloud provider:

from distilabel.steps import LoadDataFromDisk\n\nloader = LoadDataFromDisk(\n    dataset_path=\"gcs://path/to/dataset\",\n    storage_options={\"project\": \"experiments-0001\"}\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n
Source code in src/distilabel/steps/generators/huggingface.py
class LoadDataFromDisk(LoadDataFromHub):\n    \"\"\"Load a dataset that was previously saved to disk.\n\n    If you previously saved your dataset using the `save_to_disk` method, or\n    `Distiset.save_to_disk` you can load it again to build a new pipeline using this class.\n\n    Attributes:\n        dataset_path: The path to the dataset or distiset.\n        split: The split of the dataset to load (typically will be `train`, `test` or `validation`).\n        config: The configuration of the dataset to load. Defaults to `default`, if there are\n            multiple configurations in the dataset this must be suplied or an error is raised.\n\n    Runtime parameters:\n        - `batch_size`: The batch size to use when processing the data.\n        - `dataset_path`: The path to the dataset or distiset.\n        - `is_distiset`: Whether the dataset to load is a `Distiset` or not. Defaults to False.\n        - `split`: The split of the dataset to load. Defaults to 'train'.\n        - `config`: The configuration of the dataset to load. 
Defaults to `default`, if there are\n            multiple configurations in the dataset this must be suplied or an error is raised.\n        - `num_examples`: The number of examples to load from the dataset.\n            By default will load all examples.\n        - `storage_options`: Key/value pairs to be passed on to the file-system backend, if any.\n            Defaults to `None`.\n\n    Output columns:\n        - dynamic (`all`): The columns that will be generated by this step, based on the\n            datasets loaded from the Hugging Face Hub.\n\n    Categories:\n        - load\n\n    Examples:\n        Load data from a Hugging Face Dataset:\n\n        ```python\n        from distilabel.steps import LoadDataFromDisk\n\n        loader = LoadDataFromDisk(dataset_path=\"path/to/dataset\")\n        loader.load()\n\n        # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n        result = next(loader.process())\n        # >>> result\n        # ([{'type': 'function', 'function':...', False)\n        ```\n\n        Load data from a distilabel Distiset:\n\n        ```python\n        from distilabel.steps import LoadDataFromDisk\n\n        # Specify the configuration to load.\n        loader = LoadDataFromDisk(\n            dataset_path=\"path/to/dataset\",\n            is_distiset=True,\n            config=\"leaf_step_1\"\n        )\n        loader.load()\n\n        # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n        result = next(loader.process())\n        # >>> result\n        # ([{'a': 1}, {'a': 2}, {'a': 3}], True)\n        ```\n\n        Load data from a Hugging Face Dataset or Distiset in your cloud provider:\n\n        ```python\n        from distilabel.steps import LoadDataFromDisk\n\n        loader = LoadDataFromDisk(\n            dataset_path=\"gcs://path/to/dataset\",\n            storage_options={\"project\": \"experiments-0001\"}\n        )\n        loader.load()\n\n        # 
Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n        result = next(loader.process())\n        # >>> result\n        # ([{'type': 'function', 'function':...', False)\n        ```\n    \"\"\"\n\n    dataset_path: RuntimeParameter[Union[str, Path]] = Field(\n        default=None,\n        description=\"Path to the dataset or distiset.\",\n    )\n    config: Optional[RuntimeParameter[str]] = Field(\n        default=\"default\",\n        description=(\n            \"The configuration of the dataset to load. Will default to 'default'\",\n            \" which corresponds to a distiset with a single configuration.\",\n        ),\n    )\n    is_distiset: Optional[RuntimeParameter[bool]] = Field(\n        default=False,\n        description=\"Whether the dataset to load is a `Distiset` or not. Defaults to False.\",\n    )\n    keep_in_memory: Optional[RuntimeParameter[bool]] = Field(\n        default=None,\n        description=\"Whether to copy the dataset in-memory, see `datasets.Dataset.load_from_disk` \"\n        \" for more information. Defaults to `None`.\",\n    )\n    split: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The split of the dataset to load. By default will load the whole Dataset/Distiset.\",\n    )\n    repo_id: ExcludedField[Union[str, None]] = None\n\n    def load(self) -> None:\n        \"\"\"Load the dataset from the file/s in disk.\"\"\"\n        super(GeneratorStep, self).load()\n        if self.is_distiset:\n            ds = Distiset.load_from_disk(\n                self.dataset_path,\n                keep_in_memory=self.keep_in_memory,\n                storage_options=self.storage_options,\n            )\n            if self.config not in ds.keys():\n                raise DistilabelUserError(\n                    f\"Configuration '{self.config}' not found in the Distiset, available ones\"\n                    f\" are: {list(ds.keys())}. 
Please try changing the `config` parameter to one \"\n                    \"of the available configurations.\\n\\n\",\n                    page=\"sections/how_to_guides/advanced/distiset/#using-the-distiset-dataset-object\",\n                )\n            ds = ds[self.config]\n\n        else:\n            ds = load_from_disk(\n                self.dataset_path,\n                keep_in_memory=self.keep_in_memory,\n                storage_options=self.storage_options,\n            )\n\n        if self.split:\n            ds = ds[self.split]\n\n        self._dataset = ds\n\n        if self.num_examples:\n            self._dataset = self._dataset.select(range(self.num_examples))\n        else:\n            self.num_examples = len(self._dataset)\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The columns that will be generated by this step, based on the datasets from a file\n        in disk.\n\n        Returns:\n            The columns that will be generated by this step.\n        \"\"\"\n        # We assume there are Dataset/IterableDataset, not it's ...Dict counterparts\n        if self._dataset is None:\n            self.load()\n\n        return self._dataset.column_names\n
"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromDisk.outputs","title":"outputs: List[str] property","text":"

The columns that will be generated by this step, based on the datasets from a file on disk.

Returns:

Type Description List[str]

The columns that will be generated by this step.

"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromDisk.load","title":"load()","text":"

Load the dataset from the file/s on disk.

Source code in src/distilabel/steps/generators/huggingface.py
def load(self) -> None:\n    \"\"\"Load the dataset from the file/s in disk.\"\"\"\n    super(GeneratorStep, self).load()\n    if self.is_distiset:\n        ds = Distiset.load_from_disk(\n            self.dataset_path,\n            keep_in_memory=self.keep_in_memory,\n            storage_options=self.storage_options,\n        )\n        if self.config not in ds.keys():\n            raise DistilabelUserError(\n                f\"Configuration '{self.config}' not found in the Distiset, available ones\"\n                f\" are: {list(ds.keys())}. Please try changing the `config` parameter to one \"\n                \"of the available configurations.\\n\\n\",\n                page=\"sections/how_to_guides/advanced/distiset/#using-the-distiset-dataset-object\",\n            )\n        ds = ds[self.config]\n\n    else:\n        ds = load_from_disk(\n            self.dataset_path,\n            keep_in_memory=self.keep_in_memory,\n            storage_options=self.storage_options,\n        )\n\n    if self.split:\n        ds = ds[self.split]\n\n    self._dataset = ds\n\n    if self.num_examples:\n        self._dataset = self._dataset.select(range(self.num_examples))\n    else:\n        self.num_examples = len(self._dataset)\n
"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromFileSystem","title":"LoadDataFromFileSystem","text":"

Bases: LoadDataFromHub

Loads a dataset from a file in your filesystem.

GeneratorStep that creates a dataset from a file in the filesystem using the Hugging Face datasets library. Take a look at Hugging Face Datasets for more information on the supported file types.

Attributes:

Name Type Description data_files RuntimeParameter[Union[str, Path]]

The path to the file, or directory containing the files, that make up the dataset.

split RuntimeParameter[Union[str, Path]]

The split of the dataset to load (typically will be train, test or validation).

Runtime parameters
  • batch_size: The batch size to use when processing the data.
  • data_files: The path to the file, or directory containing the files, that make up the dataset.
  • split: The split of the dataset to load. Defaults to 'train'.
  • streaming: Whether to load the dataset in streaming mode or not. Defaults to False.
  • num_examples: The number of examples to load from the dataset. By default will load all examples.
  • storage_options: Key/value pairs to be passed on to the file-system backend, if any. Defaults to None.
  • filetype: The expected filetype. If not provided, it will be inferred from the file extension. For more than one file, it will be inferred from the first file.
Output columns
  • dynamic (all): The columns that will be generated by this step, based on the dataset loaded from the file system.
Categories
  • load

Examples:

Load data from a Hugging Face dataset in your file system:

from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(data_files=\"path/to/dataset.jsonl\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n

Specify a filetype if the file extension is not expected:

from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(filetype=\"csv\", data_files=\"path/to/dataset.txtr\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n

Load data from a file in your cloud provider:

from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(\n    data_files=\"gcs://path/to/dataset\",\n    storage_options={\"project\": \"experiments-0001\"}\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n

Load data passing a glob pattern:

from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(\n    data_files=\"path/to/dataset/*.jsonl\",\n    streaming=True\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n
Source code in src/distilabel/steps/generators/huggingface.py
class LoadDataFromFileSystem(LoadDataFromHub):\n    \"\"\"Loads a dataset from a file in your filesystem.\n\n    `GeneratorStep` that creates a dataset from a file in the filesystem, uses Hugging Face `datasets`\n    library. Take a look at [Hugging Face Datasets](https://huggingface.co/docs/datasets/loading)\n    for more information of the supported file types.\n\n    Attributes:\n        data_files: The path to the file, or directory containing the files that conform\n            the dataset.\n        split: The split of the dataset to load (typically will be `train`, `test` or `validation`).\n\n    Runtime parameters:\n        - `batch_size`: The batch size to use when processing the data.\n        - `data_files`: The path to the file, or directory containing the files that conform\n            the dataset.\n        - `split`: The split of the dataset to load. Defaults to 'train'.\n        - `streaming`: Whether to load the dataset in streaming mode or not. Defaults to\n            `False`.\n        - `num_examples`: The number of examples to load from the dataset.\n            By default will load all examples.\n        - `storage_options`: Key/value pairs to be passed on to the file-system backend, if any.\n            Defaults to `None`.\n        - `filetype`: The expected filetype. 
If not provided, it will be inferred from the file extension.\n            For more than one file, it will be inferred from the first file.\n\n    Output columns:\n        - dynamic (`all`): The columns that will be generated by this step, based on the\n            datasets loaded from the Hugging Face Hub.\n\n    Categories:\n        - load\n\n    Examples:\n        Load data from a Hugging Face dataset in your file system:\n\n        ```python\n        from distilabel.steps import LoadDataFromFileSystem\n\n        loader = LoadDataFromFileSystem(data_files=\"path/to/dataset.jsonl\")\n        loader.load()\n\n        # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n        result = next(loader.process())\n        # >>> result\n        # ([{'type': 'function', 'function':...', False)\n        ```\n\n        Specify a filetype if the file extension is not expected:\n\n        ```python\n        from distilabel.steps import LoadDataFromFileSystem\n\n        loader = LoadDataFromFileSystem(filetype=\"csv\", data_files=\"path/to/dataset.txtr\")\n        loader.load()\n\n        # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n        result = next(loader.process())\n        # >>> result\n        # ([{'type': 'function', 'function':...', False)\n        ```\n\n        Load data from a file in your cloud provider:\n\n        ```python\n        from distilabel.steps import LoadDataFromFileSystem\n\n        loader = LoadDataFromFileSystem(\n            data_files=\"gcs://path/to/dataset\",\n            storage_options={\"project\": \"experiments-0001\"}\n        )\n        loader.load()\n\n        # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n        result = next(loader.process())\n        # >>> result\n        # ([{'type': 'function', 'function':...', False)\n        ```\n\n        Load data passing a glob pattern:\n\n        ```python\n        from 
distilabel.steps import LoadDataFromFileSystem\n\n        loader = LoadDataFromFileSystem(\n            data_files=\"path/to/dataset/*.jsonl\",\n            streaming=True\n        )\n        loader.load()\n\n        # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n        result = next(loader.process())\n        # >>> result\n        # ([{'type': 'function', 'function':...', False)\n        ```\n    \"\"\"\n\n    data_files: RuntimeParameter[Union[str, Path]] = Field(\n        default=None,\n        description=\"The data files, or directory containing the data files, to generate the dataset from.\",\n    )\n    filetype: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The expected filetype. If not provided, it will be inferred from the file extension.\",\n    )\n    repo_id: ExcludedField[Union[str, None]] = None\n\n    def load(self) -> None:\n        \"\"\"Load the dataset from the file/s in disk.\"\"\"\n        GeneratorStep.load(self)\n\n        data_path = UPath(self.data_files, storage_options=self.storage_options)\n\n        (data_files, self.filetype) = self._prepare_data_files(data_path)\n\n        self._dataset = load_dataset(\n            self.filetype,\n            data_files=data_files,\n            split=self.split,\n            streaming=self.streaming,\n            storage_options=self.storage_options,\n        )\n\n        if not self.streaming and self.num_examples:\n            self._dataset = self._dataset.select(range(self.num_examples))\n        if not self.num_examples:\n            if self.streaming:\n                # There's no better way to get the number of examples in a streaming dataset,\n                # load it again for the moment.\n                self.num_examples = len(\n                    load_dataset(\n                        self.filetype, data_files=self.data_files, split=self.split\n                    )\n                )\n            else:\n     
           self.num_examples = len(self._dataset)\n\n    @staticmethod\n    def _prepare_data_files(  # noqa: C901\n        data_path: UPath,\n    ) -> Tuple[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]], str]:\n        \"\"\"Prepare the loading process by setting the `data_files` attribute.\n\n        Args:\n            data_path: The path to the data files, or directory containing the data files.\n\n        Returns:\n            Tuple with the data files and the filetype.\n        \"\"\"\n\n        def get_filetype(data_path: UPath) -> str:\n            filetype = data_path.suffix.lstrip(\".\")\n            if filetype == \"jsonl\":\n                filetype = \"json\"\n            return filetype\n\n        if data_path.is_file() or (\n            len(str(data_path.parent.glob(data_path.name))) >= 1\n        ):\n            filetype = get_filetype(data_path)\n            data_files = str(data_path)\n\n        elif data_path.is_dir():\n            file_sequence = []\n            file_map = defaultdict(list)\n            for file_or_folder in data_path.iterdir():\n                if file_or_folder.is_file():\n                    file_sequence.append(str(file_or_folder))\n                elif file_or_folder.is_dir():\n                    for file in file_or_folder.iterdir():\n                        file_sequence.append(str(file))\n                        file_map[str(file_or_folder)].append(str(file))\n\n            data_files = file_sequence or file_map\n            # Try to obtain the filetype from any of the files, assuming all files have the same type.\n            if file_sequence:\n                filetype = get_filetype(UPath(file_sequence[0]))\n            else:\n                filetype = get_filetype(UPath(file_map[list(file_map.keys())[0]][0]))\n        return data_files, filetype\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The columns that will be generated by this step, based on the datasets from a file\n  
      in disk.\n\n        Returns:\n            The columns that will be generated by this step.\n        \"\"\"\n        # We assume there are Dataset/IterableDataset, not it's ...Dict counterparts\n        if self._dataset is None:\n            self.load()\n\n        return self._dataset.column_names\n
"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromFileSystem.outputs","title":"outputs: List[str] property","text":"

The columns that will be generated by this step, based on the datasets from a file on disk.

Returns:

Type Description List[str]

The columns that will be generated by this step.

"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromFileSystem.load","title":"load()","text":"

Load the dataset from the file/s on disk.

Source code in src/distilabel/steps/generators/huggingface.py
def load(self) -> None:\n    \"\"\"Load the dataset from the file/s in disk.\"\"\"\n    GeneratorStep.load(self)\n\n    data_path = UPath(self.data_files, storage_options=self.storage_options)\n\n    (data_files, self.filetype) = self._prepare_data_files(data_path)\n\n    self._dataset = load_dataset(\n        self.filetype,\n        data_files=data_files,\n        split=self.split,\n        streaming=self.streaming,\n        storage_options=self.storage_options,\n    )\n\n    if not self.streaming and self.num_examples:\n        self._dataset = self._dataset.select(range(self.num_examples))\n    if not self.num_examples:\n        if self.streaming:\n            # There's no better way to get the number of examples in a streaming dataset,\n            # load it again for the moment.\n            self.num_examples = len(\n                load_dataset(\n                    self.filetype, data_files=self.data_files, split=self.split\n                )\n            )\n        else:\n            self.num_examples = len(self._dataset)\n
"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromHub","title":"LoadDataFromHub","text":"

Bases: GeneratorStep

Loads a dataset from the Hugging Face Hub.

GeneratorStep that loads a dataset from the Hugging Face Hub using the datasets library.

Attributes:

Name Type Description repo_id RuntimeParameter[str]

The Hugging Face Hub repository ID of the dataset to load.

split RuntimeParameter[str]

The split of the dataset to load.

config Optional[RuntimeParameter[str]]

The configuration of the dataset to load. This is optional and only needed if the dataset has multiple configurations.

Runtime parameters
  • batch_size: The batch size to use when processing the data.
  • repo_id: The Hugging Face Hub repository ID of the dataset to load.
  • split: The split of the dataset to load. Defaults to 'train'.
  • config: The configuration of the dataset to load. This is optional and only needed if the dataset has multiple configurations.
  • revision: The revision of the dataset to load. Defaults to the latest revision.
  • streaming: Whether to load the dataset in streaming mode or not. Defaults to False.
  • num_examples: The number of examples to load from the dataset. By default will load all examples.
  • storage_options: Key/value pairs to be passed on to the file-system backend, if any. Defaults to None.
Output columns
  • dynamic (all): The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub.
Categories
  • load

Examples:

Load data from a dataset in Hugging Face Hub:

from distilabel.steps import LoadDataFromHub\n\nloader = LoadDataFromHub(\n    repo_id=\"distilabel-internal-testing/instruction-dataset-mini\",\n    split=\"test\",\n    batch_size=2\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'prompt': 'Arianna has 12...', False)\n
Source code in src/distilabel/steps/generators/huggingface.py
class LoadDataFromHub(GeneratorStep):\n    \"\"\"Loads a dataset from the Hugging Face Hub.\n\n    `GeneratorStep` that loads a dataset from the Hugging Face Hub using the `datasets`\n    library.\n\n    Attributes:\n        repo_id: The Hugging Face Hub repository ID of the dataset to load.\n        split: The split of the dataset to load.\n        config: The configuration of the dataset to load. This is optional and only needed\n            if the dataset has multiple configurations.\n\n    Runtime parameters:\n        - `batch_size`: The batch size to use when processing the data.\n        - `repo_id`: The Hugging Face Hub repository ID of the dataset to load.\n        - `split`: The split of the dataset to load. Defaults to 'train'.\n        - `config`: The configuration of the dataset to load. This is optional and only\n            needed if the dataset has multiple configurations.\n        - `revision`: The revision of the dataset to load. Defaults to the latest revision.\n        - `streaming`: Whether to load the dataset in streaming mode or not. 
Defaults to\n            `False`.\n        - `num_examples`: The number of examples to load from the dataset.\n            By default will load all examples.\n        - `storage_options`: Key/value pairs to be passed on to the file-system backend, if any.\n            Defaults to `None`.\n\n    Output columns:\n        - dynamic (`all`): The columns that will be generated by this step, based on the\n            datasets loaded from the Hugging Face Hub.\n\n    Categories:\n        - load\n\n    Examples:\n        Load data from a dataset in Hugging Face Hub:\n\n        ```python\n        from distilabel.steps import LoadDataFromHub\n\n        loader = LoadDataFromHub(\n            repo_id=\"distilabel-internal-testing/instruction-dataset-mini\",\n            split=\"test\",\n            batch_size=2\n        )\n        loader.load()\n\n        # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n        result = next(loader.process())\n        # >>> result\n        # ([{'prompt': 'Arianna has 12...', False)\n        ```\n    \"\"\"\n\n    repo_id: RuntimeParameter[str] = Field(\n        default=None,\n        description=\"The Hugging Face Hub repository ID of the dataset to load.\",\n    )\n    split: RuntimeParameter[str] = Field(\n        default=\"train\",\n        description=\"The split of the dataset to load. Defaults to 'train'.\",\n    )\n    config: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The configuration of the dataset to load. This is optional and only\"\n        \" needed if the dataset has multiple configurations.\",\n    )\n    revision: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The revision of the dataset to load. Defaults to the latest revision.\",\n    )\n    streaming: RuntimeParameter[bool] = Field(\n        default=False,\n        description=\"Whether to load the dataset in streaming mode or not. 
Defaults to False.\",\n    )\n    num_examples: Optional[RuntimeParameter[int]] = Field(\n        default=None,\n        description=\"The number of examples to load from the dataset. By default will load all examples.\",\n    )\n    storage_options: Optional[Dict[str, Any]] = Field(\n        default=None,\n        description=\"The storage options to use when loading the dataset.\",\n    )\n\n    _dataset: Union[IterableDataset, Dataset, None] = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Load the dataset from the Hugging Face Hub\"\"\"\n        super().load()\n\n        if self._dataset is not None:\n            # Here to simplify the functionality of\u00a0distilabel.steps.generators.util.make_generator_step\n            return\n\n        self._dataset = load_dataset(\n            self.repo_id,  # type: ignore\n            self.config,\n            split=self.split,\n            revision=self.revision,\n            streaming=self.streaming,\n        )\n        num_examples = self._get_dataset_num_examples()\n        self.num_examples = (\n            min(self.num_examples, num_examples) if self.num_examples else num_examples\n        )\n\n        if not self.streaming:\n            self._dataset = self._dataset.select(range(self.num_examples))\n\n    def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n        \"\"\"Yields batches from the loaded dataset from the Hugging Face Hub.\n\n        Args:\n            offset: The offset to start yielding the data from. 
Will be used during the caching\n                process to help skipping already processed data.\n\n        Yields:\n            A tuple containing a batch of rows and a boolean indicating if the batch is\n            the last one.\n        \"\"\"\n        num_returned_rows = 0\n        for batch_num, batch in enumerate(\n            self._dataset.iter(batch_size=self.batch_size)  # type: ignore\n        ):\n            if batch_num * self.batch_size < offset:\n                continue\n            transformed_batch = self._transform_batch(batch)\n            batch_size = len(transformed_batch)\n            num_returned_rows += batch_size\n            yield transformed_batch, num_returned_rows >= self.num_examples\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The columns that will be generated by this step, based on the datasets loaded\n        from the Hugging Face Hub.\n\n        Returns:\n            The columns that will be generated by this step.\n        \"\"\"\n        return self._get_dataset_columns()\n\n    def _transform_batch(self, batch: Dict[str, Any]) -> List[Dict[str, Any]]:\n        \"\"\"Transform a batch of data from the Hugging Face Hub into a list of rows.\n\n        Args:\n            batch: The batch of data from the Hugging Face Hub.\n\n        Returns:\n            A list of rows, where each row is a dictionary of column names and values.\n        \"\"\"\n        length = len(next(iter(batch.values())))\n        rows = []\n        for i in range(length):\n            rows.append({col: values[i] for col, values in batch.items()})\n        return rows\n\n    def _get_dataset_num_examples(self) -> int:\n        \"\"\"Get the number of examples in the dataset, based on the `split` and `config`\n        runtime parameters provided.\n\n        Returns:\n            The number of examples in the dataset.\n        \"\"\"\n        default_config = self.config\n        if not default_config:\n            default_config = 
list(self._dataset_info.keys())[0]\n\n        return self._dataset_info[default_config].splits[self.split].num_examples\n\n    def _get_dataset_columns(self) -> List[str]:\n        \"\"\"Get the columns of the dataset, based on the `config` runtime parameter provided.\n\n        Returns:\n            The columns of the dataset.\n        \"\"\"\n        return list(\n            self._dataset_info[\n                self.config if self.config else \"default\"\n            ].features.keys()\n        )\n\n    @cached_property\n    def _dataset_info(self) -> Dict[str, DatasetInfo]:\n        \"\"\"Calls the Datasets Server API from Hugging Face to obtain the dataset information.\n\n        Returns:\n            The dataset information.\n        \"\"\"\n\n        try:\n            return get_dataset_infos(self.repo_id)\n        except Exception as e:\n            warnings.warn(\n                f\"Failed to get dataset info from Hugging Face Hub, trying to get it loading the dataset. Error: {e}\",\n                UserWarning,\n                stacklevel=2,\n            )\n            ds = load_dataset(self.repo_id, config=self.config, split=self.split)\n            if self.config:\n                return ds[self.config].info\n            return ds.info\n
"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromHub.outputs","title":"outputs: List[str] property","text":"

The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub.

Returns:

Type Description List[str]

The columns that will be generated by this step.

"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromHub.load","title":"load()","text":"

Load the dataset from the Hugging Face Hub

Source code in src/distilabel/steps/generators/huggingface.py
def load(self) -> None:\n    \"\"\"Load the dataset from the Hugging Face Hub\"\"\"\n    super().load()\n\n    if self._dataset is not None:\n        # Here to simplify the functionality of\u00a0distilabel.steps.generators.util.make_generator_step\n        return\n\n    self._dataset = load_dataset(\n        self.repo_id,  # type: ignore\n        self.config,\n        split=self.split,\n        revision=self.revision,\n        streaming=self.streaming,\n    )\n    num_examples = self._get_dataset_num_examples()\n    self.num_examples = (\n        min(self.num_examples, num_examples) if self.num_examples else num_examples\n    )\n\n    if not self.streaming:\n        self._dataset = self._dataset.select(range(self.num_examples))\n
"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromHub.process","title":"process(offset=0)","text":"

Yields batches from the loaded dataset from the Hugging Face Hub.

Parameters:

Name Type Description Default offset int

The offset to start yielding the data from. Will be used during the caching process to help skip already processed data.

0

Yields:

Type Description GeneratorStepOutput

A tuple containing a batch of rows and a boolean indicating if the batch is the last one.

Source code in src/distilabel/steps/generators/huggingface.py
def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n    \"\"\"Yields batches from the loaded dataset from the Hugging Face Hub.\n\n    Args:\n        offset: The offset to start yielding the data from. Will be used during the caching\n            process to help skipping already processed data.\n\n    Yields:\n        A tuple containing a batch of rows and a boolean indicating if the batch is\n        the last one.\n    \"\"\"\n    num_returned_rows = 0\n    for batch_num, batch in enumerate(\n        self._dataset.iter(batch_size=self.batch_size)  # type: ignore\n    ):\n        if batch_num * self.batch_size < offset:\n            continue\n        transformed_batch = self._transform_batch(batch)\n        batch_size = len(transformed_batch)\n        num_returned_rows += batch_size\n        yield transformed_batch, num_returned_rows >= self.num_examples\n
"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.PushToHub","title":"PushToHub","text":"

Bases: GlobalStep

Push data to a Hugging Face Hub dataset.

A GlobalStep which creates a datasets.Dataset with the input data and pushes it to the Hugging Face Hub.

Attributes:

Name Type Description repo_id RuntimeParameter[str]

The Hugging Face Hub repository ID where the dataset will be uploaded.

split RuntimeParameter[str]

The split of the dataset that will be pushed. Defaults to \"train\".

private RuntimeParameter[bool]

Whether the dataset to be pushed should be private or not. Defaults to False.

token Optional[RuntimeParameter[str]]

The token that will be used to authenticate in the Hub. If not provided, the step will try to obtain the token from the environment variable HF_TOKEN. If it is not provided using either of the previous methods, then the huggingface_hub library will try to use the token from the local Hugging Face CLI configuration. Defaults to None.

Runtime parameters
  • repo_id: The Hugging Face Hub repository ID where the dataset will be uploaded.
  • split: The split of the dataset that will be pushed.
  • private: Whether the dataset to be pushed should be private or not.
  • token: The token that will be used to authenticate in the Hub.
Input columns
  • dynamic (all): all columns from the input will be used to create the dataset.
Categories
  • save
  • dataset
  • huggingface

Examples:

Push batches of your dataset to the Hugging Face Hub repository:

from distilabel.steps import PushToHub\n\npush = PushToHub(repo_id=\"path_to/repo\")\npush.load()\n\nresult = next(\n    push.process(\n        [\n            {\n                \"instruction\": \"instruction \",\n                \"generation\": \"generation\"\n            }\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'instruction ', 'generation': 'generation'}]\n
Source code in src/distilabel/steps/globals/huggingface.py
class PushToHub(GlobalStep):\n    \"\"\"Push data to a Hugging Face Hub dataset.\n\n    A `GlobalStep` which creates a `datasets.Dataset` with the input data and pushes\n    it to the Hugging Face Hub.\n\n    Attributes:\n        repo_id: The Hugging Face Hub repository ID where the dataset will be uploaded.\n        split: The split of the dataset that will be pushed. Defaults to `\"train\"`.\n        private: Whether the dataset to be pushed should be private or not. Defaults to\n            `False`.\n        token: The token that will be used to authenticate in the Hub. If not provided, the\n            token will be tried to be obtained from the environment variable `HF_TOKEN`.\n            If not provided using one of the previous methods, then `huggingface_hub` library\n            will try to use the token from the local Hugging Face CLI configuration. Defaults\n            to `None`.\n\n    Runtime parameters:\n        - `repo_id`: The Hugging Face Hub repository ID where the dataset will be uploaded.\n        - `split`: The split of the dataset that will be pushed.\n        - `private`: Whether the dataset to be pushed should be private or not.\n        - `token`: The token that will be used to authenticate in the Hub.\n\n    Input columns:\n        - dynamic (`all`): all columns from the input will be used to create the dataset.\n\n    Categories:\n        - save\n        - dataset\n        - huggingface\n\n    Examples:\n        Push batches of your dataset to the Hugging Face Hub repository:\n\n        ```python\n        from distilabel.steps import PushToHub\n\n        push = PushToHub(repo_id=\"path_to/repo\")\n        push.load()\n\n        result = next(\n            push.process(\n                [\n                    {\n                        \"instruction\": \"instruction \",\n                        \"generation\": \"generation\"\n                    }\n                ],\n            )\n        )\n        # >>> result\n        # 
[{'instruction': 'instruction ', 'generation': 'generation'}]\n        ```\n    \"\"\"\n\n    repo_id: RuntimeParameter[str] = Field(\n        default=None,\n        description=\"The Hugging Face Hub repository ID where the dataset will be uploaded.\",\n    )\n    split: RuntimeParameter[str] = Field(\n        default=\"train\",\n        description=\"The split of the dataset that will be pushed. Defaults to 'train'.\",\n    )\n    private: RuntimeParameter[bool] = Field(\n        default=False,\n        description=\"Whether the dataset to be pushed should be private or not. Defaults\"\n        \" to `False`.\",\n    )\n    token: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The token that will be used to authenticate in the Hub. If not provided,\"\n        \" the token will be tried to be obtained from the environment variable `HF_TOKEN`.\"\n        \" If not provided using one of the previous methods, then `huggingface_hub` library\"\n        \" will try to use the token from the local Hugging Face CLI configuration. 
Defaults\"\n        \" to `None`\",\n    )\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Method that processes the input data, respecting the `datasets.Dataset` formatting,\n        and pushes it to the Hugging Face Hub based on the `RuntimeParameter`s attributes.\n\n        Args:\n            inputs: that input data within a single object (as it's a GlobalStep) that\n                will be transformed into a `datasets.Dataset`.\n\n        Yields:\n            Propagates the received inputs so that the `Distiset` can be generated if this is\n            the last step of the `Pipeline`, or if this is not a leaf step and has follow up\n            steps.\n        \"\"\"\n        dataset_dict = defaultdict(list)\n        for input in inputs:\n            for key, value in input.items():\n                dataset_dict[key].append(value)\n        dataset_dict = dict(dataset_dict)\n        dataset = Dataset.from_dict(dataset_dict)\n        dataset.push_to_hub(\n            self.repo_id,  # type: ignore\n            split=self.split,\n            private=self.private,\n            token=self.token or os.getenv(\"HF_TOKEN\"),\n        )\n        yield inputs\n
"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.PushToHub.process","title":"process(inputs)","text":"

Method that processes the input data, respecting the datasets.Dataset formatting, and pushes it to the Hugging Face Hub based on the RuntimeParameters attributes.

Parameters:

Name Type Description Default inputs StepInput

the input data within a single object (as it's a GlobalStep) that will be transformed into a datasets.Dataset.

required

Yields:

Type Description StepOutput

Propagates the received inputs so that the Distiset can be generated if this is the last step of the Pipeline, or if this is not a leaf step and has follow up steps.

Source code in src/distilabel/steps/globals/huggingface.py
def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Method that processes the input data, respecting the `datasets.Dataset` formatting,\n    and pushes it to the Hugging Face Hub based on the `RuntimeParameter`s attributes.\n\n    Args:\n        inputs: that input data within a single object (as it's a GlobalStep) that\n            will be transformed into a `datasets.Dataset`.\n\n    Yields:\n        Propagates the received inputs so that the `Distiset` can be generated if this is\n        the last step of the `Pipeline`, or if this is not a leaf step and has follow up\n        steps.\n    \"\"\"\n    dataset_dict = defaultdict(list)\n    for input in inputs:\n        for key, value in input.items():\n            dataset_dict[key].append(value)\n    dataset_dict = dict(dataset_dict)\n    dataset = Dataset.from_dict(dataset_dict)\n    dataset.push_to_hub(\n        self.repo_id,  # type: ignore\n        split=self.split,\n        private=self.private,\n        token=self.token or os.getenv(\"HF_TOKEN\"),\n    )\n    yield inputs\n
"},{"location":"api/task/","title":"Task","text":"

This section contains the API reference for the distilabel tasks.

For more information on how the Task works and to see some examples, check the Tutorial - Task page.

"},{"location":"api/task/#distilabel.steps.tasks.base","title":"base","text":""},{"location":"api/task/#distilabel.steps.tasks.base._Task","title":"_Task","text":"

Bases: _Step, ABC

_Task is an abstract class that implements the _Step interface and adds the format_input and format_output methods to format the inputs and outputs of the task. It also adds an llm attribute to be used as the LLM to generate the outputs.

Attributes:

Name Type Description llm LLM

the LLM to be used to generate the outputs of the task.

group_generations bool

whether to group the num_generations generated per input in a list or create a row per generation. Defaults to False.

add_raw_output RuntimeParameter[bool]

whether to include a field with the raw output of the LLM in the distilabel_metadata field of the output. Can be helpful to not lose data with Tasks that need to format the output of the LLM. Defaults to True.

num_generations RuntimeParameter[int]

The number of generations to be produced per input.

Source code in src/distilabel/steps/tasks/base.py
class _Task(_Step, ABC):\n    \"\"\"_Task is an abstract class that implements the `_Step` interface and adds the\n    `format_input` and `format_output` methods to format the inputs and outputs of the\n    task. It also adds a `llm` attribute to be used as the LLM to generate the outputs.\n\n    Attributes:\n        llm: the `LLM` to be used to generate the outputs of the task.\n        group_generations: whether to group the `num_generations` generated per input in\n            a list or create a row per generation. Defaults to `False`.\n        add_raw_output: whether to include a field with the raw output of the LLM in the\n            `distilabel_metadata` field of the output. Can be helpful to not loose data\n            with `Tasks` that need to format the output of the `LLM`. Defaults to `False`.\n        num_generations: The number of generations to be produced per input.\n    \"\"\"\n\n    llm: LLM\n\n    group_generations: bool = False\n    add_raw_output: RuntimeParameter[bool] = Field(\n        default=True,\n        description=(\n            \"Whether to include the raw output of the LLM in the key `raw_output_<TASK_NAME>`\"\n            \" of the `distilabel_metadata` dictionary output column\"\n        ),\n    )\n    add_raw_input: RuntimeParameter[bool] = Field(\n        default=True,\n        description=(\n            \"Whether to include the raw input of the LLM in the key `raw_input_<TASK_NAME>`\"\n            \" of the `distilabel_metadata` dictionary column\"\n        ),\n    )\n    num_generations: RuntimeParameter[int] = Field(\n        default=1, description=\"The number of generations to be produced per input.\"\n    )\n    use_default_structured_output: bool = False\n\n    _can_be_used_with_offline_batch_generation: bool = PrivateAttr(False)\n\n    def model_post_init(self, __context: Any) -> None:\n        if (\n            self.llm.use_offline_batch_generation\n            and not self._can_be_used_with_offline_batch_generation\n      
  ):\n            raise DistilabelUserError(\n                f\"`{self.__class__.__name__}` task cannot be used with offline batch generation\"\n                \" feature.\",\n                page=\"sections/how_to_guides/advanced/offline-batch-generation\",\n            )\n\n        super().model_post_init(__context)\n\n    @property\n    def is_global(self) -> bool:\n        \"\"\"Extends the `is_global` property to return `True` if the task is using the\n        offline batch generation feature, otherwise it returns the value of the parent\n        class property. `offline_batch_generation` requires to receive all the inputs\n        at once, so for the `_BatchManager` this is a global step.\n\n        Returns:\n            Whether the task is a global step or not.\n        \"\"\"\n        if self.llm.use_offline_batch_generation:\n            return True\n\n        return super().is_global\n\n    def load(self) -> None:\n        \"\"\"Loads the LLM via the `LLM.load()` method.\"\"\"\n        super().load()\n        self._set_default_structured_output()\n        self.llm.load()\n\n    @override\n    def unload(self) -> None:\n        \"\"\"Unloads the LLM.\"\"\"\n        self._logger.debug(\"Executing task unload logic.\")\n        self.llm.unload()\n\n    @override\n    def impute_step_outputs(\n        self, step_output: List[Dict[str, Any]]\n    ) -> List[Dict[str, Any]]:\n        \"\"\"\n        Imputes the outputs of the task in case the LLM failed to generate a response.\n        \"\"\"\n        result = []\n        for row in step_output:\n            data = row.copy()\n            for output in self.get_outputs().keys():\n                data[output] = None\n            data = self._maybe_add_raw_input_output(\n                data,\n                None,\n                None,\n                add_raw_output=self.add_raw_output,\n                add_raw_input=self.add_raw_input,\n            )\n            result.append(data)\n        return 
result\n\n    @abstractmethod\n    def format_output(\n        self,\n        output: Union[str, None],\n        input: Union[Dict[str, Any], None] = None,\n    ) -> Dict[str, Any]:\n        \"\"\"Abstract method to format the outputs of the task. It needs to receive an output\n        as a string, and generates a Python dictionary with the outputs of the task. In\n        addition the `input` used to generate the output is also received just in case it's\n        needed to be able to parse the output correctly.\n        \"\"\"\n        pass\n\n    def _format_outputs(\n        self,\n        outputs: \"GenerateOutput\",\n        input: Union[Dict[str, Any], None] = None,\n    ) -> List[Dict[str, Any]]:\n        \"\"\"Formats the outputs of the task using the `format_output` method. If the output\n        is `None` (i.e. the LLM failed to generate a response), then the outputs will be\n        set to `None` as well.\n\n        Args:\n            outputs: The outputs (`n` generations) for the provided `input`.\n            input: The input used to generate the output.\n\n        Returns:\n            A list containing a dictionary with the outputs of the task for each input.\n        \"\"\"\n        inputs = [None] if input is None else [input]\n\n        formatted_outputs = []\n        for output, input in zip(outputs, inputs * len(outputs)):  # type: ignore\n            try:\n                formatted_output = self.format_output(output, input)\n                formatted_output = self._maybe_add_raw_input_output(\n                    formatted_output,\n                    output,\n                    input,\n                    add_raw_output=self.add_raw_output,  # type: ignore\n                    add_raw_input=self.add_raw_input,  # type: ignore\n                )\n                formatted_outputs.append(formatted_output)\n            except Exception as e:\n                self._logger.warning(  # type: ignore\n                    f\"Task '{self.name}' failed 
to format output: {e}. Saving raw response.\"  # type: ignore\n                )\n                formatted_outputs.append(self._output_on_failure(output, input))\n        return formatted_outputs\n\n    def _output_on_failure(\n        self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n    ) -> Dict[str, Any]:\n        \"\"\"In case of failure to format the output, this method will return a dictionary including\n        a new field `distilabel_meta` with the raw output of the LLM.\n        \"\"\"\n        # Create a dictionary with the outputs of the task (every output set to None)\n        outputs = {output: None for output in self.outputs}\n        outputs[\"model_name\"] = self.llm.model_name  # type: ignore\n        outputs = self._maybe_add_raw_input_output(\n            outputs,\n            output,\n            input,\n            add_raw_output=self.add_raw_output,  # type: ignore\n            add_raw_input=self.add_raw_input,  # type: ignore\n        )\n        return outputs\n\n    def _maybe_add_raw_input_output(\n        self,\n        output: Dict[str, Any],\n        raw_output: Union[str, None],\n        input: Union[str, None],\n        add_raw_output: bool = True,\n        add_raw_input: bool = True,\n    ):\n        \"\"\"Adds the raw output and or the formatted input of the LLM to the output dictionary\n        if `add_raw_output` is True or `add_raw_input` is True.\n        \"\"\"\n        meta = output.get(DISTILABEL_METADATA_KEY, {})\n\n        if add_raw_output:\n            meta[f\"raw_output_{self.name}\"] = raw_output\n        if add_raw_input:\n            meta[f\"raw_input_{self.name}\"] = self.format_input(input) if input else None\n        if meta:\n            output[DISTILABEL_METADATA_KEY] = meta\n\n        return output\n\n    def _set_default_structured_output(self) -> None:\n        \"\"\"Prepares the structured output to be set in the selected `LLM`.\n\n        If the method `get_structured_output` 
returns None (the default), there's no need\n        to set anything, as it doesn't apply.\n        If the `use_default_structured_output` and there's no previous structured output\n        set by hand, then decide the type of structured output to select depending on the\n        `LLM` provider.\n        \"\"\"\n        schema = self.get_structured_output()\n        if not schema:\n            return\n\n        if self.use_default_structured_output and not self.llm.structured_output:\n            # In case the default structured output is required, we have to set it before\n            # the LLM is loaded\n            from distilabel.models.llms import InferenceEndpointsLLM\n            from distilabel.models.llms.base import AsyncLLM\n\n            def check_dependency(module_name: str) -> None:\n                if not importlib.util.find_spec(module_name):\n                    raise ImportError(\n                        f\"`{module_name}` is not installed and is needed for the structured generation with this LLM.\"\n                        f\" Please install it using `pip install {module_name}`.\"\n                    )\n\n            dependency = \"outlines\"\n            structured_output = {\"schema\": schema}\n            if isinstance(self.llm, InferenceEndpointsLLM):\n                structured_output.update({\"format\": \"json\"})\n            # To determine instructor or outlines format\n            elif isinstance(self.llm, AsyncLLM) and not isinstance(\n                self.llm, InferenceEndpointsLLM\n            ):\n                dependency = \"instructor\"\n                structured_output.update({\"format\": \"json\"})\n\n            check_dependency(dependency)\n            self.llm.structured_output = structured_output\n\n    def get_structured_output(self) -> Union[Dict[str, Any], None]:\n        \"\"\"Returns the structured output for a task that implements one by default,\n        must be overriden by subclasses of `Task`. 
When implemented, should be a json\n        schema that enforces the response from the LLM so that it's easier to parse.\n        \"\"\"\n        return None\n\n    def _sample_input(self) -> \"ChatType\":\n        \"\"\"Returns a sample input to be used in the `print` method.\n        Tasks that don't adhere to a format input that returns a map of the type\n        str -> str should override this method to return a sample input.\n        \"\"\"\n        return self.format_input(\n            {input: f\"<PLACEHOLDER_{input.upper()}>\" for input in self.inputs}\n        )\n\n    def print(self, sample_input: Optional[\"ChatType\"] = None) -> None:\n        \"\"\"Prints a sample input to the console using the `rich` library.\n        Helper method to visualize the prompt of the task.\n\n        Args:\n            sample_input: A sample input to be printed. If not provided, a default will be\n                generated using the `_sample_input` method, which can be overriden by\n                subclasses. 
This should correspond to the same example you could pass to\n                the `format_input` method.\n                The variables be named <PLACEHOLDER_VARIABLE_NAME> by default.\n\n        Examples:\n            Print the URIAL prompt:\n\n            ```python\n            from distilabel.steps.tasks import URIAL\n            from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n            # Consider this as a placeholder for your actual LLM.\n            urial = URIAL(\n                llm=InferenceEndpointsLLM(\n                    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                ),\n            )\n            urial.load()\n            urial.print()\n            \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 Prompt: URIAL  \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n            \u2502 \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 User Message \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e \u2502\n            \u2502 \u2502 # Instruction                                                                             \u2502 \u2502\n            \u2502 \u2502                                                                                           \u2502 
\u2502\n            \u2502 \u2502 Below is a list of conversations between a human and an AI assistant (you).               \u2502 \u2502\n            \u2502 \u2502 Users place their queries under \"# User:\", and your responses are under  \"# Assistant:\".  \u2502 \u2502\n            \u2502 \u2502 You are a helpful, respectful, and honest assistant.                                      \u2502 \u2502\n            \u2502 \u2502 You should always answer as helpfully as possible while ensuring safety.                  \u2502 \u2502\n            \u2502 \u2502 Your answers should be well-structured and provide detailed information. They should also \u2502 \u2502\n            \u2502 \u2502 have an engaging tone.                                                                    \u2502 \u2502\n            \u2502 \u2502 Your responses must not contain any fake, harmful, unethical, racist, sexist, toxic,      \u2502 \u2502\n            \u2502 \u2502 dangerous, or illegal content, even if it may be helpful.                                 \u2502 \u2502\n            \u2502 \u2502 Your response must be socially responsible, and thus you can refuse to answer some        \u2502 \u2502\n            \u2502 \u2502 controversial topics.                                                                     
\u2502 \u2502\n            \u2502 \u2502                                                                                           \u2502 \u2502\n            \u2502 \u2502                                                                                           \u2502 \u2502\n            \u2502 \u2502 # User:                                                                                   \u2502 \u2502\n            \u2502 \u2502                                                                                           \u2502 \u2502\n            \u2502 \u2502 <PLACEHOLDER_INSTRUCTION>                                                                 \u2502 \u2502\n            \u2502 \u2502                                                                                           \u2502 \u2502\n            \u2502 \u2502 # Assistant:                                                                              \u2502 \u2502\n            \u2502 \u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f \u2502\n            
\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n            ```\n        \"\"\"\n        from rich.console import Console, Group\n        from rich.panel import Panel\n        from rich.text import Text\n\n        console = Console()\n        sample_input = sample_input or self._sample_input()\n\n        panels = []\n        for item in sample_input:\n            content = Text.assemble((item.get(\"content\", \"\"),))\n            panel = Panel(\n                content,\n                title=f\"[bold][magenta]{item.get('role', '').capitalize()} Message[/magenta][/bold]\",\n                border_style=\"light_cyan3\",\n            )\n            panels.append(panel)\n\n        # Create a group of panels\n        # Wrap the group in an outer panel\n        outer_panel = Panel(\n            Group(*panels),\n            title=f\"[bold][magenta]Prompt: {type(self).__name__} [/magenta][/bold]\",\n            border_style=\"light_cyan3\",\n            expand=False,\n        )\n        console.print(outer_panel)\n
"},{"location":"api/task/#distilabel.steps.tasks.base._Task.is_global","title":"is_global: bool property","text":"

Extends the is_global property to return True if the task is using the offline batch generation feature, otherwise it returns the value of the parent class property. offline_batch_generation requires receiving all the inputs at once, so for the _BatchManager this is a global step.

Returns:

Type Description bool

Whether the task is a global step or not.

"},{"location":"api/task/#distilabel.steps.tasks.base._Task.load","title":"load()","text":"

Loads the LLM via the LLM.load() method.

Source code in src/distilabel/steps/tasks/base.py
def load(self) -> None:\n    \"\"\"Loads the LLM via the `LLM.load()` method.\"\"\"\n    super().load()\n    self._set_default_structured_output()\n    self.llm.load()\n
"},{"location":"api/task/#distilabel.steps.tasks.base._Task.unload","title":"unload()","text":"

Unloads the LLM.

Source code in src/distilabel/steps/tasks/base.py
@override\ndef unload(self) -> None:\n    \"\"\"Unloads the LLM.\"\"\"\n    self._logger.debug(\"Executing task unload logic.\")\n    self.llm.unload()\n
"},{"location":"api/task/#distilabel.steps.tasks.base._Task.impute_step_outputs","title":"impute_step_outputs(step_output)","text":"

Imputes the outputs of the task in case the LLM failed to generate a response.

Source code in src/distilabel/steps/tasks/base.py
@override\ndef impute_step_outputs(\n    self, step_output: List[Dict[str, Any]]\n) -> List[Dict[str, Any]]:\n    \"\"\"\n    Imputes the outputs of the task in case the LLM failed to generate a response.\n    \"\"\"\n    result = []\n    for row in step_output:\n        data = row.copy()\n        for output in self.get_outputs().keys():\n            data[output] = None\n        data = self._maybe_add_raw_input_output(\n            data,\n            None,\n            None,\n            add_raw_output=self.add_raw_output,\n            add_raw_input=self.add_raw_input,\n        )\n        result.append(data)\n    return result\n
"},{"location":"api/task/#distilabel.steps.tasks.base._Task.format_output","title":"format_output(output, input=None) abstractmethod","text":"

Abstract method to format the outputs of the task. It needs to receive an output as a string, and generates a Python dictionary with the outputs of the task. In addition the input used to generate the output is also received just in case it's needed to be able to parse the output correctly.

Source code in src/distilabel/steps/tasks/base.py
@abstractmethod\ndef format_output(\n    self,\n    output: Union[str, None],\n    input: Union[Dict[str, Any], None] = None,\n) -> Dict[str, Any]:\n    \"\"\"Abstract method to format the outputs of the task. It needs to receive an output\n    as a string, and generates a Python dictionary with the outputs of the task. In\n    addition the `input` used to generate the output is also received just in case it's\n    needed to be able to parse the output correctly.\n    \"\"\"\n    pass\n
"},{"location":"api/task/#distilabel.steps.tasks.base._Task.get_structured_output","title":"get_structured_output()","text":"

Returns the structured output for a task that implements one by default, must be overridden by subclasses of Task. When implemented, should be a JSON schema that enforces the response from the LLM so that it's easier to parse.

Source code in src/distilabel/steps/tasks/base.py
def get_structured_output(self) -> Union[Dict[str, Any], None]:\n    \"\"\"Returns the structured output for a task that implements one by default,\n    must be overriden by subclasses of `Task`. When implemented, should be a json\n    schema that enforces the response from the LLM so that it's easier to parse.\n    \"\"\"\n    return None\n
"},{"location":"api/task/#distilabel.steps.tasks.base._Task.print","title":"print(sample_input=None)","text":"

Prints a sample input to the console using the rich library. Helper method to visualize the prompt of the task.

Parameters:

Name Type Description Default sample_input Optional[ChatType]

A sample input to be printed. If not provided, a default will be generated using the _sample_input method, which can be overridden by subclasses. This should correspond to the same example you could pass to the format_input method. The variables will be named <PLACEHOLDER_VARIABLE_NAME> by default. None

Examples:

Print the URIAL prompt:

from distilabel.steps.tasks import URIAL\nfrom distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nurial = URIAL(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n)\nurial.load()\nurial.print()\n\u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 Prompt: URIAL  \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 User Message \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e \u2502\n\u2502 \u2502 # Instruction                                                                             \u2502 \u2502\n\u2502 \u2502                                                                                           \u2502 \u2502\n\u2502 \u2502 Below is a list of conversations between a human and an AI assistant (you).               \u2502 \u2502\n\u2502 \u2502 Users place their queries under \"# User:\", and your responses are under  \"# Assistant:\".  \u2502 \u2502\n\u2502 \u2502 You are a helpful, respectful, and honest assistant.                                      
\u2502 \u2502\n\u2502 \u2502 You should always answer as helpfully as possible while ensuring safety.                  \u2502 \u2502\n\u2502 \u2502 Your answers should be well-structured and provide detailed information. They should also \u2502 \u2502\n\u2502 \u2502 have an engaging tone.                                                                    \u2502 \u2502\n\u2502 \u2502 Your responses must not contain any fake, harmful, unethical, racist, sexist, toxic,      \u2502 \u2502\n\u2502 \u2502 dangerous, or illegal content, even if it may be helpful.                                 \u2502 \u2502\n\u2502 \u2502 Your response must be socially responsible, and thus you can refuse to answer some        \u2502 \u2502\n\u2502 \u2502 controversial topics.                                                                     \u2502 \u2502\n\u2502 \u2502                                                                                           \u2502 \u2502\n\u2502 \u2502                                                                                           \u2502 \u2502\n\u2502 \u2502 # User:                                                                                   \u2502 \u2502\n\u2502 \u2502                                                                                           \u2502 \u2502\n\u2502 \u2502 <PLACEHOLDER_INSTRUCTION>                                                                 \u2502 \u2502\n\u2502 \u2502                                                                                           \u2502 \u2502\n\u2502 \u2502 # Assistant:                                                                              \u2502 \u2502\n\u2502 
\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f \u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n
Source code in src/distilabel/steps/tasks/base.py
def print(self, sample_input: Optional[\"ChatType\"] = None) -> None:\n    \"\"\"Prints a sample input to the console using the `rich` library.\n    Helper method to visualize the prompt of the task.\n\n    Args:\n        sample_input: A sample input to be printed. If not provided, a default will be\n            generated using the `_sample_input` method, which can be overriden by\n            subclasses. This should correspond to the same example you could pass to\n            the `format_input` method.\n            The variables be named <PLACEHOLDER_VARIABLE_NAME> by default.\n\n    Examples:\n        Print the URIAL prompt:\n\n        ```python\n        from distilabel.steps.tasks import URIAL\n        from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        urial = URIAL(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n        )\n        urial.load()\n        urial.print()\n        \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 Prompt: URIAL  \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n        \u2502 \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 User Message 
\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e \u2502\n        \u2502 \u2502 # Instruction                                                                             \u2502 \u2502\n        \u2502 \u2502                                                                                           \u2502 \u2502\n        \u2502 \u2502 Below is a list of conversations between a human and an AI assistant (you).               \u2502 \u2502\n        \u2502 \u2502 Users place their queries under \"# User:\", and your responses are under  \"# Assistant:\".  \u2502 \u2502\n        \u2502 \u2502 You are a helpful, respectful, and honest assistant.                                      \u2502 \u2502\n        \u2502 \u2502 You should always answer as helpfully as possible while ensuring safety.                  \u2502 \u2502\n        \u2502 \u2502 Your answers should be well-structured and provide detailed information. They should also \u2502 \u2502\n        \u2502 \u2502 have an engaging tone.                                                                    \u2502 \u2502\n        \u2502 \u2502 Your responses must not contain any fake, harmful, unethical, racist, sexist, toxic,      \u2502 \u2502\n        \u2502 \u2502 dangerous, or illegal content, even if it may be helpful.                                 \u2502 \u2502\n        \u2502 \u2502 Your response must be socially responsible, and thus you can refuse to answer some        \u2502 \u2502\n        \u2502 \u2502 controversial topics.                                                                     
\u2502 \u2502\n        \u2502 \u2502                                                                                           \u2502 \u2502\n        \u2502 \u2502                                                                                           \u2502 \u2502\n        \u2502 \u2502 # User:                                                                                   \u2502 \u2502\n        \u2502 \u2502                                                                                           \u2502 \u2502\n        \u2502 \u2502 <PLACEHOLDER_INSTRUCTION>                                                                 \u2502 \u2502\n        \u2502 \u2502                                                                                           \u2502 \u2502\n        \u2502 \u2502 # Assistant:                                                                              \u2502 \u2502\n        \u2502 \u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f \u2502\n        
\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n        ```\n    \"\"\"\n    from rich.console import Console, Group\n    from rich.panel import Panel\n    from rich.text import Text\n\n    console = Console()\n    sample_input = sample_input or self._sample_input()\n\n    panels = []\n    for item in sample_input:\n        content = Text.assemble((item.get(\"content\", \"\"),))\n        panel = Panel(\n            content,\n            title=f\"[bold][magenta]{item.get('role', '').capitalize()} Message[/magenta][/bold]\",\n            border_style=\"light_cyan3\",\n        )\n        panels.append(panel)\n\n    # Create a group of panels\n    # Wrap the group in an outer panel\n    outer_panel = Panel(\n        Group(*panels),\n        title=f\"[bold][magenta]Prompt: {type(self).__name__} [/magenta][/bold]\",\n        border_style=\"light_cyan3\",\n        expand=False,\n    )\n    console.print(outer_panel)\n
"},{"location":"api/task/#distilabel.steps.tasks.base.Task","title":"Task","text":"

Bases: _Task, Step

Task is a class that implements the _Task abstract class and adds the Step interface to be used as a step in the pipeline.

Attributes:

Name Type Description llm

the LLM to be used to generate the outputs of the task.

group_generations

whether to group the num_generations generated per input in a list or create a row per generation. Defaults to False.

num_generations

The number of generations to be produced per input.

Source code in src/distilabel/steps/tasks/base.py
class Task(_Task, Step):\n    \"\"\"Task is a class that implements the `_Task` abstract class and adds the `Step`\n    interface to be used as a step in the pipeline.\n\n    Attributes:\n        llm: the `LLM` to be used to generate the outputs of the task.\n        group_generations: whether to group the `num_generations` generated per input in\n            a list or create a row per generation. Defaults to `False`.\n        num_generations: The number of generations to be produced per input.\n    \"\"\"\n\n    @abstractmethod\n    def format_input(self, input: Dict[str, Any]) -> \"FormattedInput\":\n        \"\"\"Abstract method to format the inputs of the task. It needs to receive an input\n        as a Python dictionary, and generates an OpenAI chat-like list of dicts.\"\"\"\n        pass\n\n    def _format_inputs(self, inputs: List[Dict[str, Any]]) -> List[\"FormattedInput\"]:\n        \"\"\"Formats the inputs of the task using the `format_input` method.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Returns:\n            A list containing the formatted inputs, which are `ChatType`-like following\n            the OpenAI formatting.\n        \"\"\"\n        return [self.format_input(input) for input in inputs]\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Yields:\n            A list of Python dictionaries with the outputs of the task.\n        \"\"\"\n\n        formatted_inputs = self._format_inputs(inputs)\n\n        # `outputs` is a list containing a list of generations per input\n        outputs = self.llm.generate_outputs(\n            inputs=formatted_inputs,\n            num_generations=self.num_generations,  # type: ignore\n            
**self.llm.get_generation_kwargs(),  # type: ignore\n        )\n\n        task_outputs = []\n        for input, input_outputs in zip(inputs, outputs):\n            formatted_outputs = self._format_outputs(input_outputs, input)\n\n            if self.group_generations:\n                combined = group_dicts(*formatted_outputs)\n                task_outputs.append(\n                    {**input, **combined, \"model_name\": self.llm.model_name}\n                )\n                continue\n\n            # Create a row per generation\n            for formatted_output in formatted_outputs:\n                task_outputs.append(\n                    {**input, **formatted_output, \"model_name\": self.llm.model_name}\n                )\n\n        yield task_outputs\n
"},{"location":"api/task/#distilabel.steps.tasks.base.Task.format_input","title":"format_input(input) abstractmethod","text":"

Abstract method to format the inputs of the task. It needs to receive an input as a Python dictionary, and generates an OpenAI chat-like list of dicts.

Source code in src/distilabel/steps/tasks/base.py
@abstractmethod\ndef format_input(self, input: Dict[str, Any]) -> \"FormattedInput\":\n    \"\"\"Abstract method to format the inputs of the task. It needs to receive an input\n    as a Python dictionary, and generates an OpenAI chat-like list of dicts.\"\"\"\n    pass\n
"},{"location":"api/task/#distilabel.steps.tasks.base.Task.process","title":"process(inputs)","text":"

Processes the inputs of the task and generates the outputs using the LLM.

Parameters:

Name Type Description Default inputs StepInput

A list of Python dictionaries with the inputs of the task.

required

Yields:

Type Description StepOutput

A list of Python dictionaries with the outputs of the task.

Source code in src/distilabel/steps/tasks/base.py
def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n    Args:\n        inputs: A list of Python dictionaries with the inputs of the task.\n\n    Yields:\n        A list of Python dictionaries with the outputs of the task.\n    \"\"\"\n\n    formatted_inputs = self._format_inputs(inputs)\n\n    # `outputs` is a list containing a list of generations per input\n    outputs = self.llm.generate_outputs(\n        inputs=formatted_inputs,\n        num_generations=self.num_generations,  # type: ignore\n        **self.llm.get_generation_kwargs(),  # type: ignore\n    )\n\n    task_outputs = []\n    for input, input_outputs in zip(inputs, outputs):\n        formatted_outputs = self._format_outputs(input_outputs, input)\n\n        if self.group_generations:\n            combined = group_dicts(*formatted_outputs)\n            task_outputs.append(\n                {**input, **combined, \"model_name\": self.llm.model_name}\n            )\n            continue\n\n        # Create a row per generation\n        for formatted_output in formatted_outputs:\n            task_outputs.append(\n                {**input, **formatted_output, \"model_name\": self.llm.model_name}\n            )\n\n    yield task_outputs\n
"},{"location":"api/task/generator_task/","title":"GeneratorTask","text":"

This section contains the API reference for the distilabel generator tasks.

For more information on how the GeneratorTask works and to see some examples, check the Tutorial - Task - GeneratorTask page.

"},{"location":"api/task/generator_task/#distilabel.steps.tasks.base.GeneratorTask","title":"GeneratorTask","text":"

Bases: _Task, GeneratorStep

GeneratorTask is a class that implements the _Task abstract class and adds the GeneratorStep interface to be used as a step in the pipeline.

Attributes:

Name Type Description llm

the LLM to be used to generate the outputs of the task.

group_generations

whether to group the num_generations generated per input in a list or create a row per generation. Defaults to False.

num_generations

The number of generations to be produced per input.

Source code in src/distilabel/steps/tasks/base.py
class GeneratorTask(_Task, GeneratorStep):\n    \"\"\"`GeneratorTask` is a class that implements the `_Task` abstract class and adds the\n    `GeneratorStep` interface to be used as a step in the pipeline.\n\n    Attributes:\n        llm: the `LLM` to be used to generate the outputs of the task.\n        group_generations: whether to group the `num_generations` generated per input in\n            a list or create a row per generation. Defaults to `False`.\n        num_generations: The number of generations to be produced per input.\n    \"\"\"\n\n    pass\n
"},{"location":"api/task/task_gallery/","title":"Task Gallery","text":"

This section contains the existing Task subclasses implemented in distilabel.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks","title":"tasks","text":""},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker","title":"APIGenExecutionChecker","text":"

Bases: Step

Executes the generated function calls.

This step checks if a given answer from a model as generated by APIGenGenerator can be executed against the given library (given by libpath, which is a string pointing to a python .py file with functions).

Attributes:

Name Type Description libpath str

The path to the library from which the functions will be retrieved. It can also point to a folder with the functions. In this case, the folder should contain .py files, each containing a single function whose name is the same as the filename.

check_is_dangerous bool

Whether to exclude some potentially dangerous functions, based on heuristics found while testing. These functions can run subprocesses, deal with the OS, or have other potentially dangerous operations. Defaults to True.

Input columns
  • answers (str): List with arguments to be passed to the function, dumped as a string from a list of dictionaries. Should be loaded using json.loads.
Output columns
  • keep_row_after_execution_check (bool): Whether the function should be kept or not.
  • execution_result (str): The result from executing the function.
Categories
  • filtering
  • execution
References
  • APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets
  • Salesforce/xlam-function-calling-60k

Examples:

Execute a function from a given library with the answer from an LLM:

from distilabel.steps.tasks import APIGenExecutionChecker\n\n# For the libpath you can use as an example the file at the tests folder:\n# ../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\ntask = APIGenExecutionChecker(\n    libpath=\"../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\",\n)\ntask.load()\n\nres = next(\n    task.process(\n        [\n            {\n                \"answers\": [\n                    {\n                        \"arguments\": {\n                            \"initial_velocity\": 0.2,\n                            \"acceleration\": 0.1,\n                            \"time\": 0.5,\n                        },\n                        \"name\": \"final_velocity\",\n                    }\n                ],\n            }\n        ]\n    )\n)\nres\n#[{'answers': [{'arguments': {'initial_velocity': 0.2, 'acceleration': 0.1, 'time': 0.5}, 'name': 'final_velocity'}], 'keep_row_after_execution_check': True, 'execution_result': ['0.25']}]\n
Source code in src/distilabel/steps/tasks/apigen/execution_checker.py
class APIGenExecutionChecker(Step):\n    \"\"\"Executes the generated function calls.\n\n    This step checks if a given answer from a model as generated by `APIGenGenerator`\n    can be executed against the given library (given by `libpath`, which is a string\n    pointing to a python .py file with functions).\n\n    Attributes:\n        libpath: The path to the library where we will retrieve the functions.\n            It can also point to a folder with the functions. In this case, the folder\n            layout should be a folder with .py files, each containing a single function,\n            the name of the function being the same as the filename.\n        check_is_dangerous: Bool to exclude some potentially dangerous functions, it contains\n            some heuristics found while testing. This functions can run subprocesses, deal with\n            the OS, or have other potentially dangerous operations. Defaults to True.\n\n    Input columns:\n        - answers (`str`): List with arguments to be passed to the function,\n            dumped as a string from a list of dictionaries. 
Should be loaded using\n            `json.loads`.\n\n    Output columns:\n        - keep_row_after_execution_check (`bool`): Whether the function should be kept or not.\n        - execution_result (`str`): The result from executing the function.\n\n    Categories:\n        - filtering\n        - execution\n\n    References:\n        - [APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets](https://arxiv.org/abs/2406.18518)\n        - [Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)\n\n    Examples:\n        Execute a function from a given library with the answer from an LLM:\n\n        ```python\n        from distilabel.steps.tasks import APIGenExecutionChecker\n\n        # For the libpath you can use as an example the file at the tests folder:\n        # ../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\n        task = APIGenExecutionChecker(\n            libpath=\"../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\",\n        )\n        task.load()\n\n        res = next(\n            task.process(\n                [\n                    {\n                        \"answers\": [\n                            {\n                                \"arguments\": {\n                                    \"initial_velocity\": 0.2,\n                                    \"acceleration\": 0.1,\n                                    \"time\": 0.5,\n                                },\n                                \"name\": \"final_velocity\",\n                            }\n                        ],\n                    }\n                ]\n            )\n        )\n        res\n        #[{'answers': [{'arguments': {'initial_velocity': 0.2, 'acceleration': 0.1, 'time': 0.5}, 'name': 'final_velocity'}], 'keep_row_after_execution_check': True, 'execution_result': ['0.25']}]\n        ```\n    \"\"\"\n\n    libpath: str = Field(\n        default=...,\n        
description=(\n            \"The path to the library where we will retrieve the functions, \"\n            \"or a folder with python files named the same as the functions they contain.\",\n        ),\n    )\n    check_is_dangerous: bool = Field(\n        default=True,\n        description=(\n            \"Bool to exclude some potentially dangerous functions, it contains \"\n            \"some heuristics found while testing. This functions can run subprocesses, \"\n            \"deal with the OS, or have other potentially dangerous operations.\",\n        ),\n    )\n\n    _toolbox: Union[\"ModuleType\", None] = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Loads the library where the functions will be extracted from.\"\"\"\n        super().load()\n        if Path(self.libpath).suffix == \".py\":\n            self._toolbox = load_module_from_path(self.libpath)\n\n    def unload(self) -> None:\n        self._toolbox = None\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The inputs for the task are those found in the original dataset.\"\"\"\n        return [\"answers\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The outputs are the columns required by `APIGenGenerator` task.\"\"\"\n        return [\"keep_row_after_execution_check\", \"execution_result\"]\n\n    def _get_function(self, function_name: str) -> Callable:\n        \"\"\"Retrieves the function from the toolbox.\n\n        Args:\n            function_name: The name of the function to retrieve.\n\n        Returns:\n            Callable: The function to be executed.\n        \"\"\"\n        if self._toolbox:\n            return getattr(self._toolbox, function_name, None)\n        try:\n            toolbox = load_module_from_path(\n                str(Path(self.libpath) / f\"{function_name}.py\")\n            )\n            return getattr(toolbox, function_name, None)\n        except FileNotFoundError:\n            return None\n        
except Exception as e:\n            self._logger.warning(f\"Error loading function '{function_name}': {e}\")\n            return None\n\n    def _is_dangerous(self, function: Callable) -> bool:\n        \"\"\"Checks if a function is dangerous to remove it.\n        Contains a list of heuristics to avoid executing possibly dangerous functions.\n        \"\"\"\n        source_code = inspect.getsource(function)\n        # We don't want to execute functions that use subprocess\n        if (\n            (\"subprocess.\" in source_code)\n            or (\"os.system(\" in source_code)\n            or (\"input(\" in source_code)\n            # Avoiding threading\n            or (\"threading.Thread(\" in source_code)\n            or (\"exec(\" in source_code)\n            # Avoiding argparse (not sure why)\n            or (\"argparse.ArgumentParser(\" in source_code)\n            # Avoiding logging changing the levels to not mess with the logs\n            or (\".setLevel(\" in source_code)\n            # Don't run a test battery\n            or (\"unittest.main(\" in source_code)\n            # Avoid exiting the program\n            or (\"sys.exit(\" in source_code)\n            or (\"exit(\" in source_code)\n            or (\"raise SystemExit(\" in source_code)\n            or (\"multiprocessing.Pool(\" in source_code)\n        ):\n            return True\n        return False\n\n    @override\n    def process(self, inputs: StepInput) -> \"StepOutput\":\n        \"\"\"Checks the answer to see if it can be executed.\n        Captures the possible errors and returns them.\n\n        If a single example is provided, it is copied to avoid raising an error.\n\n        Args:\n            inputs: A list of dictionaries with the input data.\n\n        Yields:\n            A list of dictionaries with the output data.\n        \"\"\"\n        for input in inputs:\n            output = []\n            if input[\"answers\"]:\n                answers = 
json.loads(input[\"answers\"])\n            else:\n                input.update(\n                    **{\n                        \"keep_row_after_execution_check\": False,\n                        \"execution_result\": [\"No answers were provided.\"],\n                    }\n                )\n                continue\n            for answer in answers:\n                if answer is None:\n                    output.append(\n                        {\n                            \"keep\": False,\n                            \"execution_result\": \"Nothing was generated for this answer.\",\n                        }\n                    )\n                    continue\n\n                function_name = answer.get(\"name\", None)\n                arguments = answer.get(\"arguments\", None)\n\n                self._logger.debug(\n                    f\"Executing function '{function_name}' with arguments: {arguments}\"\n                )\n                function = self._get_function(function_name)\n\n                if self.check_is_dangerous:\n                    if function and self._is_dangerous(function):\n                        function = None\n\n                if function is None:\n                    output.append(\n                        {\n                            \"keep\": False,\n                            \"execution_result\": f\"Function '{function_name}' not found.\",\n                        }\n                    )\n                else:\n                    execution = execute_from_response(function, arguments)\n                    output.append(\n                        {\n                            \"keep\": execution[\"keep\"],\n                            \"execution_result\": execution[\"execution_result\"],\n                        }\n                    )\n            # We only consider a good response if all the answers were executed successfully,\n            # but keep the reasons for further review if needed.\n            
input.update(\n                **{\n                    \"keep_row_after_execution_check\": all(\n                        o[\"keep\"] is True for o in output\n                    ),\n                    \"execution_result\": [o[\"execution_result\"] for o in output],\n                }\n            )\n\n        yield inputs\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker.inputs","title":"inputs: StepColumns property","text":"

The inputs for the task are those found in the original dataset.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker.outputs","title":"outputs: StepColumns property","text":"

The outputs are the columns required by the APIGenGenerator task.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker.load","title":"load()","text":"

Loads the library where the functions will be extracted from.

Source code in src/distilabel/steps/tasks/apigen/execution_checker.py
def load(self) -> None:\n    \"\"\"Loads the library where the functions will be extracted from.\"\"\"\n    super().load()\n    if Path(self.libpath).suffix == \".py\":\n        self._toolbox = load_module_from_path(self.libpath)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker._get_function","title":"_get_function(function_name)","text":"

Retrieves the function from the toolbox.

Parameters:

Name Type Description Default function_name str

The name of the function to retrieve.

required

Returns:

Name Type Description Callable Callable

The function to be executed.

Source code in src/distilabel/steps/tasks/apigen/execution_checker.py
def _get_function(self, function_name: str) -> Callable:\n    \"\"\"Retrieves the function from the toolbox.\n\n    Args:\n        function_name: The name of the function to retrieve.\n\n    Returns:\n        Callable: The function to be executed.\n    \"\"\"\n    if self._toolbox:\n        return getattr(self._toolbox, function_name, None)\n    try:\n        toolbox = load_module_from_path(\n            str(Path(self.libpath) / f\"{function_name}.py\")\n        )\n        return getattr(toolbox, function_name, None)\n    except FileNotFoundError:\n        return None\n    except Exception as e:\n        self._logger.warning(f\"Error loading function '{function_name}': {e}\")\n        return None\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker._is_dangerous","title":"_is_dangerous(function)","text":"

Checks whether a function is dangerous so that it can be excluded from execution. Contains a list of heuristics to avoid executing possibly dangerous functions.

Source code in src/distilabel/steps/tasks/apigen/execution_checker.py
def _is_dangerous(self, function: Callable) -> bool:\n    \"\"\"Checks if a function is dangerous to remove it.\n    Contains a list of heuristics to avoid executing possibly dangerous functions.\n    \"\"\"\n    source_code = inspect.getsource(function)\n    # We don't want to execute functions that use subprocess\n    if (\n        (\"subprocess.\" in source_code)\n        or (\"os.system(\" in source_code)\n        or (\"input(\" in source_code)\n        # Avoiding threading\n        or (\"threading.Thread(\" in source_code)\n        or (\"exec(\" in source_code)\n        # Avoiding argparse (not sure why)\n        or (\"argparse.ArgumentParser(\" in source_code)\n        # Avoiding logging changing the levels to not mess with the logs\n        or (\".setLevel(\" in source_code)\n        # Don't run a test battery\n        or (\"unittest.main(\" in source_code)\n        # Avoid exiting the program\n        or (\"sys.exit(\" in source_code)\n        or (\"exit(\" in source_code)\n        or (\"raise SystemExit(\" in source_code)\n        or (\"multiprocessing.Pool(\" in source_code)\n    ):\n        return True\n    return False\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker.process","title":"process(inputs)","text":"

Checks the answer to see if it can be executed. Captures the possible errors and returns them.

If a single example is provided, it is copied to avoid raising an error.

Parameters:

Name Type Description Default inputs StepInput

A list of dictionaries with the input data.

required

Yields:

Type Description StepOutput

A list of dictionaries with the output data.

Source code in src/distilabel/steps/tasks/apigen/execution_checker.py
@override\ndef process(self, inputs: StepInput) -> \"StepOutput\":\n    \"\"\"Checks the answer to see if it can be executed.\n    Captures the possible errors and returns them.\n\n    If a single example is provided, it is copied to avoid raising an error.\n\n    Args:\n        inputs: A list of dictionaries with the input data.\n\n    Yields:\n        A list of dictionaries with the output data.\n    \"\"\"\n    for input in inputs:\n        output = []\n        if input[\"answers\"]:\n            answers = json.loads(input[\"answers\"])\n        else:\n            input.update(\n                **{\n                    \"keep_row_after_execution_check\": False,\n                    \"execution_result\": [\"No answers were provided.\"],\n                }\n            )\n            continue\n        for answer in answers:\n            if answer is None:\n                output.append(\n                    {\n                        \"keep\": False,\n                        \"execution_result\": \"Nothing was generated for this answer.\",\n                    }\n                )\n                continue\n\n            function_name = answer.get(\"name\", None)\n            arguments = answer.get(\"arguments\", None)\n\n            self._logger.debug(\n                f\"Executing function '{function_name}' with arguments: {arguments}\"\n            )\n            function = self._get_function(function_name)\n\n            if self.check_is_dangerous:\n                if function and self._is_dangerous(function):\n                    function = None\n\n            if function is None:\n                output.append(\n                    {\n                        \"keep\": False,\n                        \"execution_result\": f\"Function '{function_name}' not found.\",\n                    }\n                )\n            else:\n                execution = execute_from_response(function, arguments)\n                output.append(\n                    {\n         
               \"keep\": execution[\"keep\"],\n                        \"execution_result\": execution[\"execution_result\"],\n                    }\n                )\n        # We only consider a good response if all the answers were executed successfully,\n        # but keep the reasons for further review if needed.\n        input.update(\n            **{\n                \"keep_row_after_execution_check\": all(\n                    o[\"keep\"] is True for o in output\n                ),\n                \"execution_result\": [o[\"execution_result\"] for o in output],\n            }\n        )\n\n    yield inputs\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator","title":"APIGenGenerator","text":"

Bases: Task

Generate queries and answers for the given functions in JSON format.

The `APIGenGenerator` is inspired by the APIGen pipeline, which was designed to generate\nverifiable and diverse function-calling datasets. The task generates a set of diverse queries\nand corresponding answers for the given functions in JSON format.\n\nAttributes:\n    system_prompt: The system prompt to guide the user in the generation of queries and answers.\n    use_tools: Whether to use the tools available in the prompt to generate the queries and answers.\n        In case the tools are given in the input, they will be added to the prompt.\n    number: The number of queries to generate. It can be a list, where each number will be\n        chosen randomly, or a dictionary with the number of queries and the probability of each.\n        E.g.: `number=1`, `number=[1, 2, 3]`, `number={1: 0.5, 2: 0.3, 3: 0.2}` are all valid inputs.\n        It corresponds to the number of parallel queries to generate.\n    use_default_structured_output: Whether to use the default structured output or not.\n\nInput columns:\n    - examples (`str`): Examples used as few shots to guide the model.\n    - func_name (`str`): Name for the function to generate.\n    - func_desc (`str`): Description of what the function should do.\n    - tools (`str`): JSON formatted string containing the tool representation of the function.\n\nOutput columns:\n    - query (`str`): The list of queries.\n    - answers (`str`): JSON formatted string with the list of answers, containing the info as\n        a dictionary to be passed to the functions.\n\nCategories:\n    - text-generation\n\nReferences:\n    - [APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets](https://arxiv.org/abs/2406.18518)\n    - [Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)\n\nExamples:\n    Generate without structured output (original implementation):\n\n    ```python\n    from distilabel.steps.tasks import APIGenGenerator\n    from 
distilabel.models import InferenceEndpointsLLM\n\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        generation_kwargs={\n            \"temperature\": 0.7,\n            \"max_new_tokens\": 1024,\n        },\n    )\n    apigen = APIGenGenerator(\n        use_default_structured_output=False,\n        llm=llm\n    )\n    apigen.load()\n\n    res = next(\n        apigen.process(\n            [\n                {\n                    \"examples\": 'QUERY:\n

What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]', \"func_name\": \"getrandommovie\", \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\" } ] ) ) res # [{'examples': 'QUERY: What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]', # 'number': 1, # 'func_name': 'getrandommovie', # 'func_desc': 'Returns a list of random movies from a database by calling an external API.', # 'queries': ['I want to watch a movie tonight, can you recommend a random one from your database?', # 'Give me 5 random movie suggestions from your database to plan my weekend.'], # 'answers': [[{'name': 'getrandommovie', 'arguments': {}}], # [{'name': 'getrandommovie', 'arguments': {}}, # {'name': 'getrandommovie', 'arguments': {}}, # {'name': 'getrandommovie', 'arguments': {}}, # {'name': 'getrandommovie', 'arguments': {}}, # {'name': 'getrandommovie', 'arguments': {}}]], # 'raw_input_api_gen_generator_0': [{'role': 'system', # 'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.

Construct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.

Ensure the query: - Is clear and concise - Demonstrates typical use cases - Includes all necessary parameters in a meaningful way. For numerical parameters, it could be either numbers or words - Across a variety level of difficulties, ranging from beginner and advanced use cases - The corresponding result's parameter types and ranges match with the function's descriptions

Ensure the answer: - Is a list of function calls in JSON format - The length of the answer list should be equal to the number of requests in the query - Can solve all the requests in the query effectively\"}, # {'role': 'user', # 'content': 'Here are examples of queries and the corresponding answers for similar functions: QUERY: What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]

Note that the query could be interpreted as a combination of several independent requests. Based on these examples, generate 2 diverse query and answer pairs for the function getrandommovie The detailed function description is the following: Returns a list of random movies from a database by calling an external API.

The output MUST strictly adhere to the following JSON format, and NO other text MUST be included:

[\n   {\n       \"query\": \"The generated query.\",\n       \"answers\": [\n           {\n               \"name\": \"api_name\",\n               \"arguments\": {\n                   \"arg_name\": \"value\"\n                   ... (more arguments as required)\n               }\n           },\n           ... (more API calls as required)\n       ]\n   }\n]\n

Now please generate 2 diverse query and answer pairs following the above format.'}]}, # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}] ```

    Generate with structured output:\n\n    ```python\n    from distilabel.steps.tasks import APIGenGenerator\n    from distilabel.models import InferenceEndpointsLLM\n\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        generation_kwargs={\n            \"temperature\": 0.7,\n            \"max_new_tokens\": 1024,\n        },\n    )\n    apigen = APIGenGenerator(\n        use_default_structured_output=True,\n        llm=llm\n    )\n    apigen.load()\n\n    res_struct = next(\n        apigen.process(\n            [\n                {\n                    \"examples\": 'QUERY:\n

What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]', \"func_name\": \"getrandommovie\", \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\" } ] ) ) res_struct # [{'examples': 'QUERY: What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]', # 'number': 1, # 'func_name': 'getrandommovie', # 'func_desc': 'Returns a list of random movies from a database by calling an external API.', # 'queries': [\"I'm bored and want to watch a movie. Can you suggest some movies?\", # \"My family and I are planning a movie night. We can't decide on what to watch. Can you suggest some random movie titles?\"], # 'answers': [[{'arguments': {}, 'name': 'getrandommovie'}], # [{'arguments': {}, 'name': 'getrandommovie'}]], # 'raw_input_api_gen_generator_0': [{'role': 'system', # 'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.

Construct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.

Ensure the query: - Is clear and concise - Demonstrates typical use cases - Includes all necessary parameters in a meaningful way. For numerical parameters, it could be either numbers or words - Across a variety level of difficulties, ranging from beginner and advanced use cases - The corresponding result's parameter types and ranges match with the function's descriptions

Ensure the answer: - Is a list of function calls in JSON format - The length of the answer list should be equal to the number of requests in the query - Can solve all the requests in the query effectively\"}, # {'role': 'user', # 'content': 'Here are examples of queries and the corresponding answers for similar functions: QUERY: What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]

Note that the query could be interpreted as a combination of several independent requests. Based on these examples, generate 2 diverse query and answer pairs for the function getrandommovie The detailed function description is the following: Returns a list of random movies from a database by calling an external API.

Now please generate 2 diverse query and answer pairs following the above format.'}]}, # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}] ```

Source code in src/distilabel/steps/tasks/apigen/generator.py
class APIGenGenerator(Task):\n    \"\"\"Generate queries and answers for the given functions in JSON format.\n\n    The `APIGenGenerator` is inspired by the APIGen pipeline, which was designed to generate\n    verifiable and diverse function-calling datasets. The task generates a set of diverse queries\n    and corresponding answers for the given functions in JSON format.\n\n    Attributes:\n        system_prompt: The system prompt to guide the user in the generation of queries and answers.\n        use_tools: Whether to use the tools available in the prompt to generate the queries and answers.\n            In case the tools are given in the input, they will be added to the prompt.\n        number: The number of queries to generate. It can be a list, where each number will be\n            chosen randomly, or a dictionary with the number of queries and the probability of each.\n            I.e: `number=1`, `number=[1, 2, 3]`, `number={1: 0.5, 2: 0.3, 3: 0.2}` are all valid inputs.\n            It corresponds to the number of parallel queries to generate.\n        use_default_structured_output: Whether to use the default structured output or not.\n\n    Input columns:\n        - examples (`str`): Examples used as few shots to guide the model.\n        - func_name (`str`): Name for the function to generate.\n        - func_desc (`str`): Description of what the function should do.\n        - tools (`str`): JSON formatted string containing the tool representation of the function.\n\n    Output columns:\n        - query (`str`): The list of queries.\n        - answers (`str`): JSON formatted string with the list of answers, containing the info as\n            a dictionary to be passed to the functions.\n\n    Categories:\n        - text-generation\n\n    References:\n        - [APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets](https://arxiv.org/abs/2406.18518)\n        - 
[Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)\n\n    Examples:\n        Generate without structured output (original implementation):\n\n        ```python\n        from distilabel.steps.tasks import ApiGenGenerator\n        from distilabel.models import InferenceEndpointsLLM\n\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            generation_kwargs={\n                \"temperature\": 0.7,\n                \"max_new_tokens\": 1024,\n            },\n        )\n        apigen = ApiGenGenerator(\n            use_default_structured_output=False,\n            llm=llm\n        )\n        apigen.load()\n\n        res = next(\n            apigen.process(\n                [\n                    {\n                        \"examples\": 'QUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n                        \"func_name\": \"getrandommovie\",\n                        \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\"\n                    }\n                ]\n            )\n        )\n        res\n        # [{'examples': 'QUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n        # 'number': 1,\n        # 'func_name': 'getrandommovie',\n        # 'func_desc': 'Returns a list of random movies from a database by calling an external API.',\n        # 'queries': ['I want to watch a movie tonight, can you recommend a random one from your database?',\n        # 'Give me 5 random movie suggestions from your database to plan my weekend.'],\n        # 'answers': [[{'name': 'getrandommovie', 'arguments': {}}],\n        # [{'name': 'getrandommovie', 'arguments': {}},\n        #     {'name': 'getrandommovie', 
'arguments': {}},\n        #     {'name': 'getrandommovie', 'arguments': {}},\n        #     {'name': 'getrandommovie', 'arguments': {}},\n        #     {'name': 'getrandommovie', 'arguments': {}}]],\n        # 'raw_input_api_gen_generator_0': [{'role': 'system',\n        #     'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\\n\\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.\\n\\nEnsure the query:\\n- Is clear and concise\\n- Demonstrates typical use cases\\n- Includes all necessary parameters in a meaningful way. For numerical parameters, it could be either numbers or words\\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\\n- The corresponding result's parameter types and ranges match with the function's descriptions\\n\\nEnsure the answer:\\n- Is a list of function calls in JSON format\\n- The length of the answer list should be equal to the number of requests in the query\\n- Can solve all the requests in the query effectively\"},\n        #     {'role': 'user',\n        #     'content': 'Here are examples of queries and the corresponding answers for similar functions:\\nQUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]\\n\\nNote that the query could be interpreted as a combination of several independent requests.\\nBased on these examples, generate 2 diverse query and answer pairs for the function `getrandommovie`\\nThe detailed function description is the following:\\nReturns a list of random movies from a database by calling an external API.\\n\\nThe output MUST strictly adhere to the following JSON format, 
and NO other text MUST be included:\\n```json\\n[\\n   {\\n       \"query\": \"The generated query.\",\\n       \"answers\": [\\n           {\\n               \"name\": \"api_name\",\\n               \"arguments\": {\\n                   \"arg_name\": \"value\"\\n                   ... (more arguments as required)\\n               }\\n           },\\n           ... (more API calls as required)\\n       ]\\n   }\\n]\\n```\\n\\nNow please generate 2 diverse query and answer pairs following the above format.'}]},\n        # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n\n        Generate with structured output:\n\n        ```python\n        from distilabel.steps.tasks import ApiGenGenerator\n        from distilabel.models import InferenceEndpointsLLM\n\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            generation_kwargs={\n                \"temperature\": 0.7,\n                \"max_new_tokens\": 1024,\n            },\n        )\n        apigen = ApiGenGenerator(\n            use_default_structured_output=True,\n            llm=llm\n        )\n        apigen.load()\n\n        res_struct = next(\n            apigen.process(\n                [\n                    {\n                        \"examples\": 'QUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n                        \"func_name\": \"getrandommovie\",\n                        \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\"\n                    }\n                ]\n            )\n        )\n        res_struct\n        # [{'examples': 'QUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n        # 'number': 
1,\n        # 'func_name': 'getrandommovie',\n        # 'func_desc': 'Returns a list of random movies from a database by calling an external API.',\n        # 'queries': [\"I'm bored and want to watch a movie. Can you suggest some movies?\",\n        # \"My family and I are planning a movie night. We can't decide on what to watch. Can you suggest some random movie titles?\"],\n        # 'answers': [[{'arguments': {}, 'name': 'getrandommovie'}],\n        # [{'arguments': {}, 'name': 'getrandommovie'}]],\n        # 'raw_input_api_gen_generator_0': [{'role': 'system',\n        #     'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\\n\\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.\\n\\nEnsure the query:\\n- Is clear and concise\\n- Demonstrates typical use cases\\n- Includes all necessary parameters in a meaningful way. 
For numerical parameters, it could be either numbers or words\\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\\n- The corresponding result's parameter types and ranges match with the function's descriptions\\n\\nEnsure the answer:\\n- Is a list of function calls in JSON format\\n- The length of the answer list should be equal to the number of requests in the query\\n- Can solve all the requests in the query effectively\"},\n        #     {'role': 'user',\n        #     'content': 'Here are examples of queries and the corresponding answers for similar functions:\\nQUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]\\n\\nNote that the query could be interpreted as a combination of several independent requests.\\nBased on these examples, generate 2 diverse query and answer pairs for the function `getrandommovie`\\nThe detailed function description is the following:\\nReturns a list of random movies from a database by calling an external API.\\n\\nNow please generate 2 diverse query and answer pairs following the above format.'}]},\n        # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n    \"\"\"\n\n    system_prompt: str = SYSTEM_PROMPT_API_GEN\n    use_default_structured_output: bool = False\n    number: Union[int, List[int], Dict[int, float]] = 1\n    use_tools: bool = True\n\n    _number: Union[int, None] = PrivateAttr(None)\n    _fn_parallel_queries: Union[Callable[[], str], None] = PrivateAttr(None)\n    _format_inst: Union[str, None] = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Loads the template for the generator prompt.\"\"\"\n        super().load()\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"apigen\"\n            / \"generator.jinja2\"\n        )\n 
       self._template = Template(open(_path).read())\n        self._format_inst = self._set_format_inst()\n\n    def _parallel_queries(self, number: int) -> Callable[[int], str]:\n        \"\"\"Prepares the function to update the parallel queries guide in the prompt.\n\n        Raises:\n            ValueError: if `is_parallel` is not a boolean or a list of floats.\n\n        Returns:\n            The function to generate the parallel queries guide.\n        \"\"\"\n        if number > 1:\n            return (\n                \"It can contain multiple parallel queries in natural language for the given functions. \"\n                \"They could use either the same function with different arguments or different functions.\\n\"\n            )\n        return \"\"\n\n    def _get_number(self) -> int:\n        \"\"\"Generates the number of queries to generate in a single call.\n        The number must be set to `_number` to avoid changing the original value\n        when calling `_default_error`.\n        \"\"\"\n        if isinstance(self.number, list):\n            self._number = random.choice(self.number)\n        elif isinstance(self.number, dict):\n            self._number = random.choices(\n                list(self.number.keys()), list(self.number.values())\n            )[0]\n        else:\n            self._number = self.number\n        return self._number\n\n    def _set_format_inst(self) -> str:\n        \"\"\"Prepares the function to generate the formatted instructions for the prompt.\n\n        If the default structured output is used, returns an empty string because nothing\n        else is needed, otherwise, returns the original addition to the prompt to guide the model\n        to generate a formatted JSON.\n        \"\"\"\n        return (\n            \"\\nThe output MUST strictly adhere to the following JSON format, and NO other text MUST be included:\\n\"\n            \"```\\n\"\n            \"[\\n\"\n            \"   {\\n\"\n            '       
\"query\": \"The generated query.\",\\n'\n            '       \"answers\": [\\n'\n            \"           {\\n\"\n            '               \"name\": \"api_name\",\\n'\n            '               \"arguments\": {\\n'\n            '                   \"arg_name\": \"value\"\\n'\n            \"                   ... (more arguments as required)\\n\"\n            \"               }\\n\"\n            \"           },\\n\"\n            \"           ... (more API calls as required)\\n\"\n            \"       ]\\n\"\n            \"   }\\n\"\n            \"]\\n\"\n            \"```\\n\"\n        )\n\n    def _get_func_desc(self, input: Dict[str, Any]) -> str:\n        \"\"\"If available and required, will use the info from the tools in the\n        prompt for extra information. Otherwise will use jut the function description.\n        \"\"\"\n        if not self.use_tools:\n            return input[\"func_desc\"]\n        extra = \"\"  # Extra information from the tools (if available will be added)\n        if \"tools\" in input:\n            extra = f\"\\n\\nThis is the available tool to guide you (respect the order of the parameters):\\n{input['tools']}\"\n        return input[\"func_desc\"] + extra\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The inputs for the task.\"\"\"\n        return {\n            \"examples\": True,\n            \"func_name\": True,\n            \"func_desc\": True,\n            \"tools\": False,\n        }\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType`.\"\"\"\n        number = self._get_number()\n        parallel_queries = self._parallel_queries(number)\n        return [\n            {\"role\": \"system\", \"content\": self.system_prompt},\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(\n                    examples=input[\"examples\"],\n                    
parallel_queries=parallel_queries,\n                    number=number,\n                    func_name=input[\"func_name\"],\n                    func_desc=self._get_func_desc(input),\n                    format_inst=self._format_inst,\n                ),\n            },\n        ]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The output for the task are the queries and corresponding answers.\"\"\"\n        return [\"query\", \"answers\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a list with the score of each instruction.\n\n        Args:\n            output: the raw output of the LLM.\n            input: the input to the task. Used for obtaining the number of responses.\n\n        Returns:\n            A dict with the queries and answers pairs.\n            The answers are an array of answers corresponding to the query.\n            Each answer is represented as an object with the following properties:\n                - name (string): The name of the tool used to generate the answer.\n                - arguments (object): An object representing the arguments passed to the tool to generate the answer.\n            Each argument is represented as a key-value pair, where the key is the parameter name and the\n            value is the corresponding value.\n        \"\"\"\n        if output is None:\n            return self._default_error(input)\n\n        if not self.use_default_structured_output:\n            output = remove_fences(output)\n\n        try:\n            pairs = orjson.loads(output)\n        except orjson.JSONDecodeError:\n            return self._default_error(input)\n\n        pairs = pairs[\"pairs\"] if self.use_default_structured_output else pairs\n\n        return self._format_output(pairs, input)\n\n    def _format_output(\n        self, pairs: Dict[str, Any], input: Dict[str, Any]\n    ) 
-> Dict[str, Any]:\n        \"\"\"Parses the response, returning a dictionary with queries and answers.\n\n        Args:\n            pairs: The parsed dictionary from the LLM's output.\n            input: The input from the `LLM`.\n\n        Returns:\n            Formatted output, where the `queries` are a list of strings, and the `answers`\n            are a list of objects.\n        \"\"\"\n        try:\n            input.update(\n                **{\n                    \"query\": pairs[0][\"query\"],\n                    \"answers\": json.dumps(pairs[0][\"answers\"]),\n                }\n            )\n            return input\n        except Exception as e:\n            self._logger.error(f\"Error formatting output: {e}, pairs: '{pairs}'\")\n            return self._default_error(input)\n\n    def _default_error(self, input: Dict[str, Any]) -> Dict[str, Any]:\n        \"\"\"Returns a default error output, to fill the responses in case of failure.\"\"\"\n        input.update(\n            **{\n                \"query\": None,\n                \"answers\": json.dumps([None] * self._number),\n            }\n        )\n        return input\n\n    @override\n    def get_structured_output(self) -> Dict[str, Any]:\n        \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n        a dictionary with the output which can be directly parsed as a python dictionary.\n\n        The schema corresponds to the following:\n\n        ```python\n        from typing import Dict, List\n        from pydantic import BaseModel\n\n\n        class Answer(BaseModel):\n            name: str\n            arguments: Dict[str, str]\n\n        class QueryAnswer(BaseModel):\n            query: str\n            answers: List[Answer]\n\n        class QueryAnswerPairs(BaseModel):\n            pairs: List[QueryAnswer]\n\n        json.dumps(QueryAnswerPairs.model_json_schema(), indent=4)\n        ```\n\n        Returns:\n            JSON Schema of the response to 
enforce.\n        \"\"\"\n        return {\n            \"$defs\": {\n                \"Answer\": {\n                    \"properties\": {\n                        \"name\": {\"title\": \"Name\", \"type\": \"string\"},\n                        \"arguments\": {\n                            \"additionalProperties\": {\"type\": \"string\"},\n                            \"title\": \"Arguments\",\n                            \"type\": \"object\",\n                        },\n                    },\n                    \"required\": [\"name\", \"arguments\"],\n                    \"title\": \"Answer\",\n                    \"type\": \"object\",\n                },\n                \"QueryAnswer\": {\n                    \"properties\": {\n                        \"query\": {\"title\": \"Query\", \"type\": \"string\"},\n                        \"answers\": {\n                            \"items\": {\"$ref\": \"#/$defs/Answer\"},\n                            \"title\": \"Answers\",\n                            \"type\": \"array\",\n                        },\n                    },\n                    \"required\": [\"query\", \"answers\"],\n                    \"title\": \"QueryAnswer\",\n                    \"type\": \"object\",\n                },\n            },\n            \"properties\": {\n                \"pairs\": {\n                    \"items\": {\"$ref\": \"#/$defs/QueryAnswer\"},\n                    \"title\": \"Pairs\",\n                    \"type\": \"array\",\n                }\n            },\n            \"required\": [\"pairs\"],\n            \"title\": \"QueryAnswerPairs\",\n            \"type\": \"object\",\n        }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.inputs","title":"inputs: StepColumns property","text":"

The inputs for the task.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.outputs","title":"outputs: StepColumns property","text":"

The outputs for the task are the queries and corresponding answers.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.load","title":"load()","text":"

Loads the template for the generator prompt.

Source code in src/distilabel/steps/tasks/apigen/generator.py
def load(self) -> None:\n    \"\"\"Loads the template for the generator prompt.\"\"\"\n    super().load()\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"apigen\"\n        / \"generator.jinja2\"\n    )\n    self._template = Template(open(_path).read())\n    self._format_inst = self._set_format_inst()\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._parallel_queries","title":"_parallel_queries(number)","text":"

Prepares the function to update the parallel queries guide in the prompt.

Raises:

Type Description ValueError

if is_parallel is not a boolean or a list of floats.

Returns:

Type Description Callable[[int], str]

The function to generate the parallel queries guide.

Source code in src/distilabel/steps/tasks/apigen/generator.py
def _parallel_queries(self, number: int) -> Callable[[int], str]:\n    \"\"\"Prepares the function to update the parallel queries guide in the prompt.\n\n    Raises:\n        ValueError: if `is_parallel` is not a boolean or a list of floats.\n\n    Returns:\n        The function to generate the parallel queries guide.\n    \"\"\"\n    if number > 1:\n        return (\n            \"It can contain multiple parallel queries in natural language for the given functions. \"\n            \"They could use either the same function with different arguments or different functions.\\n\"\n        )\n    return \"\"\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._get_number","title":"_get_number()","text":"

Generates the number of queries to generate in a single call. The number must be set to _number to avoid changing the original value when calling _default_error.

Source code in src/distilabel/steps/tasks/apigen/generator.py
def _get_number(self) -> int:\n    \"\"\"Generates the number of queries to generate in a single call.\n    The number must be set to `_number` to avoid changing the original value\n    when calling `_default_error`.\n    \"\"\"\n    if isinstance(self.number, list):\n        self._number = random.choice(self.number)\n    elif isinstance(self.number, dict):\n        self._number = random.choices(\n            list(self.number.keys()), list(self.number.values())\n        )[0]\n    else:\n        self._number = self.number\n    return self._number\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._set_format_inst","title":"_set_format_inst()","text":"

Prepares the function to generate the formatted instructions for the prompt.

If the default structured output is used, returns an empty string because nothing else is needed, otherwise, returns the original addition to the prompt to guide the model to generate a formatted JSON.

Source code in src/distilabel/steps/tasks/apigen/generator.py
def _set_format_inst(self) -> str:\n    \"\"\"Prepares the function to generate the formatted instructions for the prompt.\n\n    If the default structured output is used, returns an empty string because nothing\n    else is needed, otherwise, returns the original addition to the prompt to guide the model\n    to generate a formatted JSON.\n    \"\"\"\n    return (\n        \"\\nThe output MUST strictly adhere to the following JSON format, and NO other text MUST be included:\\n\"\n        \"```\\n\"\n        \"[\\n\"\n        \"   {\\n\"\n        '       \"query\": \"The generated query.\",\\n'\n        '       \"answers\": [\\n'\n        \"           {\\n\"\n        '               \"name\": \"api_name\",\\n'\n        '               \"arguments\": {\\n'\n        '                   \"arg_name\": \"value\"\\n'\n        \"                   ... (more arguments as required)\\n\"\n        \"               }\\n\"\n        \"           },\\n\"\n        \"           ... (more API calls as required)\\n\"\n        \"       ]\\n\"\n        \"   }\\n\"\n        \"]\\n\"\n        \"```\\n\"\n    )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._get_func_desc","title":"_get_func_desc(input)","text":"

If available and required, will use the info from the tools in the prompt for extra information. Otherwise will use just the function description.

Source code in src/distilabel/steps/tasks/apigen/generator.py
def _get_func_desc(self, input: Dict[str, Any]) -> str:\n    \"\"\"If available and required, will use the info from the tools in the\n    prompt for extra information. Otherwise will use jut the function description.\n    \"\"\"\n    if not self.use_tools:\n        return input[\"func_desc\"]\n    extra = \"\"  # Extra information from the tools (if available will be added)\n    if \"tools\" in input:\n        extra = f\"\\n\\nThis is the available tool to guide you (respect the order of the parameters):\\n{input['tools']}\"\n    return input[\"func_desc\"] + extra\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType.

Source code in src/distilabel/steps/tasks/apigen/generator.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType`.\"\"\"\n    number = self._get_number()\n    parallel_queries = self._parallel_queries(number)\n    return [\n        {\"role\": \"system\", \"content\": self.system_prompt},\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(\n                examples=input[\"examples\"],\n                parallel_queries=parallel_queries,\n                number=number,\n                func_name=input[\"func_name\"],\n                func_desc=self._get_func_desc(input),\n                format_inst=self._format_inst,\n            ),\n        },\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.format_output","title":"format_output(output, input)","text":"

The output is formatted as a dictionary with the generated query and its corresponding answers.

Parameters:

Name Type Description Default output Union[str, None]

the raw output of the LLM.

required input Dict[str, Any]

the input to the task. Used for obtaining the number of responses.

required

Returns:

Type Description Dict[str, Any]

A dict with the queries and answers pairs.

Dict[str, Any]

The answers are an array of answers corresponding to the query.

Dict[str, Any]

Each answer is represented as an object with the following properties: - name (string): The name of the tool used to generate the answer. - arguments (object): An object representing the arguments passed to the tool to generate the answer.

Dict[str, Any]

Each argument is represented as a key-value pair, where the key is the parameter name and the

Dict[str, Any]

value is the corresponding value.

Source code in src/distilabel/steps/tasks/apigen/generator.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a list with the score of each instruction.\n\n    Args:\n        output: the raw output of the LLM.\n        input: the input to the task. Used for obtaining the number of responses.\n\n    Returns:\n        A dict with the queries and answers pairs.\n        The answers are an array of answers corresponding to the query.\n        Each answer is represented as an object with the following properties:\n            - name (string): The name of the tool used to generate the answer.\n            - arguments (object): An object representing the arguments passed to the tool to generate the answer.\n        Each argument is represented as a key-value pair, where the key is the parameter name and the\n        value is the corresponding value.\n    \"\"\"\n    if output is None:\n        return self._default_error(input)\n\n    if not self.use_default_structured_output:\n        output = remove_fences(output)\n\n    try:\n        pairs = orjson.loads(output)\n    except orjson.JSONDecodeError:\n        return self._default_error(input)\n\n    pairs = pairs[\"pairs\"] if self.use_default_structured_output else pairs\n\n    return self._format_output(pairs, input)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._format_output","title":"_format_output(pairs, input)","text":"

Parses the response, returning a dictionary with queries and answers.

Parameters:

Name Type Description Default pairs Dict[str, Any]

The parsed dictionary from the LLM's output.

required input Dict[str, Any]

The input from the LLM.

required

Returns:

Type Description Dict[str, Any]

Formatted output, where the queries are a list of strings, and the answers

Dict[str, Any]

are a list of objects.

Source code in src/distilabel/steps/tasks/apigen/generator.py
def _format_output(\n    self, pairs: Dict[str, Any], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"Parses the response, returning a dictionary with queries and answers.\n\n    Args:\n        pairs: The parsed dictionary from the LLM's output.\n        input: The input from the `LLM`.\n\n    Returns:\n        Formatted output, where the `queries` are a list of strings, and the `answers`\n        are a list of objects.\n    \"\"\"\n    try:\n        input.update(\n            **{\n                \"query\": pairs[0][\"query\"],\n                \"answers\": json.dumps(pairs[0][\"answers\"]),\n            }\n        )\n        return input\n    except Exception as e:\n        self._logger.error(f\"Error formatting output: {e}, pairs: '{pairs}'\")\n        return self._default_error(input)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._default_error","title":"_default_error(input)","text":"

Returns a default error output, to fill the responses in case of failure.

Source code in src/distilabel/steps/tasks/apigen/generator.py
def _default_error(self, input: Dict[str, Any]) -> Dict[str, Any]:\n    \"\"\"Returns a default error output, to fill the responses in case of failure.\"\"\"\n    input.update(\n        **{\n            \"query\": None,\n            \"answers\": json.dumps([None] * self._number),\n        }\n    )\n    return input\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.get_structured_output","title":"get_structured_output()","text":"

Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary.

The schema corresponds to the following:

from typing import Dict, List\nfrom pydantic import BaseModel\n\n\nclass Answer(BaseModel):\n    name: str\n    arguments: Dict[str, str]\n\nclass QueryAnswer(BaseModel):\n    query: str\n    answers: List[Answer]\n\nclass QueryAnswerPairs(BaseModel):\n    pairs: List[QueryAnswer]\n\njson.dumps(QueryAnswerPairs.model_json_schema(), indent=4)\n

Returns:

Type Description Dict[str, Any]

JSON Schema of the response to enforce.

Source code in src/distilabel/steps/tasks/apigen/generator.py
@override\ndef get_structured_output(self) -> Dict[str, Any]:\n    \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n    a dictionary with the output which can be directly parsed as a python dictionary.\n\n    The schema corresponds to the following:\n\n    ```python\n    from typing import Dict, List\n    from pydantic import BaseModel\n\n\n    class Answer(BaseModel):\n        name: str\n        arguments: Dict[str, str]\n\n    class QueryAnswer(BaseModel):\n        query: str\n        answers: List[Answer]\n\n    class QueryAnswerPairs(BaseModel):\n        pairs: List[QueryAnswer]\n\n    json.dumps(QueryAnswerPairs.model_json_schema(), indent=4)\n    ```\n\n    Returns:\n        JSON Schema of the response to enforce.\n    \"\"\"\n    return {\n        \"$defs\": {\n            \"Answer\": {\n                \"properties\": {\n                    \"name\": {\"title\": \"Name\", \"type\": \"string\"},\n                    \"arguments\": {\n                        \"additionalProperties\": {\"type\": \"string\"},\n                        \"title\": \"Arguments\",\n                        \"type\": \"object\",\n                    },\n                },\n                \"required\": [\"name\", \"arguments\"],\n                \"title\": \"Answer\",\n                \"type\": \"object\",\n            },\n            \"QueryAnswer\": {\n                \"properties\": {\n                    \"query\": {\"title\": \"Query\", \"type\": \"string\"},\n                    \"answers\": {\n                        \"items\": {\"$ref\": \"#/$defs/Answer\"},\n                        \"title\": \"Answers\",\n                        \"type\": \"array\",\n                    },\n                },\n                \"required\": [\"query\", \"answers\"],\n                \"title\": \"QueryAnswer\",\n                \"type\": \"object\",\n            },\n        },\n        \"properties\": {\n            \"pairs\": {\n                \"items\": 
{\"$ref\": \"#/$defs/QueryAnswer\"},\n                \"title\": \"Pairs\",\n                \"type\": \"array\",\n            }\n        },\n        \"required\": [\"pairs\"],\n        \"title\": \"QueryAnswerPairs\",\n        \"type\": \"object\",\n    }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker","title":"APIGenSemanticChecker","text":"

Bases: Task

Check that the generated function calls and their execution results align with the user query.

The APIGenSemanticChecker is inspired by the APIGen pipeline, which was designed to generate verifiable and diverse function-calling datasets. The task asks an LLM to judge whether the generated function calls and their execution results reflect the user's query, returning its reasoning and a pass/fail decision.

Attributes:

Name Type Description system_prompt str

System prompt for the task. Has a default one.

exclude_failed_execution str

Whether to exclude failed executions (won't run on those rows that have a False in keep_row_after_execution_check column, which comes from running APIGenExecutionChecker). Defaults to True.

Input columns
  • func_desc (str): Description of what the function should do.
  • query (str): Instruction from the user.
  • answers (str): JSON encoded list with arguments to be passed to the function/API. Should be loaded using json.loads.
  • execution_result (str): Result of the function/API executed.
Output columns
  • thought (str): Reasoning for the output on whether to keep this output or not.
  • keep_row_after_semantic_check (bool): True or False, can be used to filter afterwards.
Categories
  • filtering
  • text-generation
References
  • APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets
  • Salesforce/xlam-function-calling-60k

Examples:

Semantic checker for generated function calls (original implementation):\n\n```python\nfrom distilabel.steps.tasks import APIGenSemanticChecker\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    generation_kwargs={\n        \"temperature\": 0.7,\n        \"max_new_tokens\": 1024,\n    },\n)\nsemantic_checker = APIGenSemanticChecker(\n    use_default_structured_output=False,\n    llm=llm\n)\nsemantic_checker.load()\n\nres = next(\n    semantic_checker.process(\n        [\n            {\n                \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n                \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n                \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n                \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n            }\n        ]\n    )\n)\nres\n# [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n# 'query': 'What information can be obtained about the Maine Coon cat breed?',\n# 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n# 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n# 'thought': '',\n# 'keep_row_after_semantic_check': True,\n# 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n#     'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. 
The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n#     {'role': 'user',\n#     'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n```\\n{\\n   \"thought\": \"Concisely describe your reasoning here\",\\n   \"pass\": \"yes\" or \"no\"\\n}\\n```\\n'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n```\n\nSemantic checker for generated function calls (structured output):\n\n```python\nfrom distilabel.steps.tasks import APIGenSemanticChecker\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    generation_kwargs={\n        \"temperature\": 0.7,\n        \"max_new_tokens\": 
1024,\n    },\n)\nsemantic_checker = APIGenSemanticChecker(\n    use_default_structured_output=True,\n    llm=llm\n)\nsemantic_checker.load()\n\nres = next(\n    semantic_checker.process(\n        [\n            {\n                \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n                \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n                \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n                \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n            }\n        ]\n    )\n)\nres\n# [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n# 'query': 'What information can be obtained about the Maine Coon cat breed?',\n# 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n# 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n# 'keep_row_after_semantic_check': True,\n# 'thought': '',\n# 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n#     'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. 
The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n#     {'role': 'user',\n#     'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n```\n
Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py
class APIGenSemanticChecker(Task):\n    r\"\"\"Generate queries and answers for the given functions in JSON format.\n\n    The `APIGenGenerator` is inspired by the APIGen pipeline, which was designed to generate\n    verifiable and diverse function-calling datasets. The task generates a set of diverse queries\n    and corresponding answers for the given functions in JSON format.\n\n    Attributes:\n        system_prompt: System prompt for the task. Has a default one.\n        exclude_failed_execution: Whether to exclude failed executions (won't run on those\n            rows that have a False in `keep_row_after_execution_check` column, which\n            comes from running `APIGenExecutionChecker`). Defaults to True.\n\n    Input columns:\n        - func_desc (`str`): Description of what the function should do.\n        - query (`str`): Instruction from the user.\n        - answers (`str`): JSON encoded list with arguments to be passed to the function/API.\n            Should be loaded using `json.loads`.\n        - execution_result (`str`): Result of the function/API executed.\n\n    Output columns:\n        - thought (`str`): Reasoning for the output on whether to keep this output or not.\n        - keep_row_after_semantic_check (`bool`): True or False, can be used to filter\n            afterwards.\n\n    Categories:\n        - filtering\n        - text-generation\n\n    References:\n        - [APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets](https://arxiv.org/abs/2406.18518)\n        - [Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)\n\n    Examples:\n\n        Semantic checker for generated function calls (original implementation):\n\n        ```python\n        from distilabel.steps.tasks import APIGenSemanticChecker\n        from distilabel.models import InferenceEndpointsLLM\n\n        llm=InferenceEndpointsLLM(\n            
model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            generation_kwargs={\n                \"temperature\": 0.7,\n                \"max_new_tokens\": 1024,\n            },\n        )\n        semantic_checker = APIGenSemanticChecker(\n            use_default_structured_output=False,\n            llm=llm\n        )\n        semantic_checker.load()\n\n        res = next(\n            semantic_checker.process(\n                [\n                    {\n                        \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n                        \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n                        \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n                        \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n                    }\n                ]\n            )\n        )\n        res\n        # [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n        # 'query': 'What information can be obtained about the Maine Coon cat breed?',\n        # 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n        # 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n        # 'thought': '',\n        # 'keep_row_after_semantic_check': True,\n        # 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n        #     'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. 
The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n        #     {'role': 'user',\n        #     'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n```\\n{\\n   \"thought\": \"Concisely describe your reasoning here\",\\n   \"pass\": \"yes\" or \"no\"\\n}\\n```\\n'}]},\n        # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n\n        Semantic checker for generated function calls (structured output):\n\n        ```python\n        from distilabel.steps.tasks import APIGenSemanticChecker\n        from distilabel.models import InferenceEndpointsLLM\n\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n         
   generation_kwargs={\n                \"temperature\": 0.7,\n                \"max_new_tokens\": 1024,\n            },\n        )\n        semantic_checker = APIGenSemanticChecker(\n            use_default_structured_output=True,\n            llm=llm\n        )\n        semantic_checker.load()\n\n        res = next(\n            semantic_checker.process(\n                [\n                    {\n                        \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n                        \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n                        \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n                        \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n                    }\n                ]\n            )\n        )\n        res\n        # [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n        # 'query': 'What information can be obtained about the Maine Coon cat breed?',\n        # 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n        # 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n        # 'keep_row_after_semantic_check': True,\n        # 'thought': '',\n        # 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n        #     'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. 
The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n        #     {'role': 'user',\n        #     'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n'}]},\n        # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n    \"\"\"\n\n    system_prompt: str = SYSTEM_PROMPT_SEMANTIC_CHECKER\n    use_default_structured_output: bool = False\n\n    _format_inst: Union[str, None] = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Loads the template for the generator prompt.\"\"\"\n        super().load()\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"apigen\"\n            / \"semantic_checker.jinja2\"\n        )\n\n        self._template 
= Template(open(_path).read())\n        self._format_inst = self._set_format_inst()\n\n    def _set_format_inst(self) -> str:\n        \"\"\"Prepares the function to generate the formatted instructions for the prompt.\n\n        If the default structured output is used, returns an empty string because nothing\n        else is needed, otherwise, returns the original addition to the prompt to guide the model\n        to generate a formatted JSON.\n        \"\"\"\n        return (\n            \"\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n\"\n            \"```\\n\"\n            \"{\\n\"\n            '   \"thought\": \"Concisely describe your reasoning here\",\\n'\n            '   \"passes\": \"yes\" or \"no\"\\n'\n            \"}\\n\"\n            \"```\\n\"\n        )\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The inputs for the task.\"\"\"\n        return {\n            \"func_desc\": True,\n            \"query\": True,\n            \"answers\": True,\n            \"execution_result\": True,\n            \"keep_row_after_execution_check\": True,\n        }\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType`.\"\"\"\n        return [\n            {\"role\": \"system\", \"content\": self.system_prompt},\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(\n                    func_desc=input[\"func_desc\"],\n                    query=input[\"query\"] or \"\",\n                    func_call=input[\"answers\"] or \"\",\n                    execution_result=input[\"execution_result\"],\n                    format_inst=self._format_inst,\n                ),\n            },\n        ]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The output for the task are the queries and corresponding answers.\"\"\"\n        return 
[\"keep_row_after_semantic_check\", \"thought\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a list with the score of each instruction.\n\n        Args:\n            output: the raw output of the LLM.\n            input: the input to the task. Used for obtaining the number of responses.\n\n        Returns:\n            A dict with the queries and answers pairs.\n            The answers are an array of answers corresponding to the query.\n            Each answer is represented as an object with the following properties:\n                - name (string): The name of the tool used to generate the answer.\n                - arguments (object): An object representing the arguments passed to the tool to generate the answer.\n            Each argument is represented as a key-value pair, where the key is the parameter name and the\n            value is the corresponding value.\n        \"\"\"\n        if output is None:\n            return self._default_error(input)\n\n        output = remove_fences(output)\n\n        try:\n            result = orjson.loads(output)\n            # Update the column name and change to bool\n            result[\"keep_row_after_semantic_check\"] = (\n                result.pop(\"passes\").lower() == \"yes\"\n            )\n            input.update(**result)\n            return input\n        except orjson.JSONDecodeError:\n            return self._default_error(input)\n\n    def _default_error(self, input: Dict[str, Any]) -> Dict[str, Any]:\n        \"\"\"Default error message for the task.\"\"\"\n        input.update({\"thought\": None, \"keep_row_after_semantic_check\": None})\n        return input\n\n    @override\n    def get_structured_output(self) -> Dict[str, Any]:\n        \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n        a dictionary with the output which can be directly parsed as a 
python dictionary.\n\n        The schema corresponds to the following:\n\n        ```python\n        from typing import Literal\n        from pydantic import BaseModel\n        import json\n\n        class Checker(BaseModel):\n            thought: str\n            passes: Literal[\"yes\", \"no\"]\n\n        json.dumps(Checker.model_json_schema(), indent=4)\n        ```\n\n        Returns:\n            JSON Schema of the response to enforce.\n        \"\"\"\n        return {\n            \"properties\": {\n                \"thought\": {\"title\": \"Thought\", \"type\": \"string\"},\n                \"passes\": {\"enum\": [\"yes\", \"no\"], \"title\": \"Passes\", \"type\": \"string\"},\n            },\n            \"required\": [\"thought\", \"passes\"],\n            \"title\": \"Checker\",\n            \"type\": \"object\",\n        }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.inputs","title":"inputs: StepColumns property","text":"

The inputs for the task.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.outputs","title":"outputs: StepColumns property","text":"

The outputs for the task are the keep_row_after_semantic_check flag and the thought that explains it.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.load","title":"load()","text":"

Loads the template for the semantic checker prompt.

Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py
def load(self) -> None:\n    \"\"\"Loads the template for the generator prompt.\"\"\"\n    super().load()\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"apigen\"\n        / \"semantic_checker.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n    self._format_inst = self._set_format_inst()\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker._set_format_inst","title":"_set_format_inst()","text":"

Prepares the function to generate the formatted instructions for the prompt.

If the default structured output is used, returns an empty string because nothing else is needed, otherwise, returns the original addition to the prompt to guide the model to generate a formatted JSON.

Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py
def _set_format_inst(self) -> str:\n    \"\"\"Prepares the function to generate the formatted instructions for the prompt.\n\n    If the default structured output is used, returns an empty string because nothing\n    else is needed, otherwise, returns the original addition to the prompt to guide the model\n    to generate a formatted JSON.\n    \"\"\"\n    return (\n        \"\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n\"\n        \"```\\n\"\n        \"{\\n\"\n        '   \"thought\": \"Concisely describe your reasoning here\",\\n'\n        '   \"passes\": \"yes\" or \"no\"\\n'\n        \"}\\n\"\n        \"```\\n\"\n    )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType.

Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType`.\"\"\"\n    return [\n        {\"role\": \"system\", \"content\": self.system_prompt},\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(\n                func_desc=input[\"func_desc\"],\n                query=input[\"query\"] or \"\",\n                func_call=input[\"answers\"] or \"\",\n                execution_result=input[\"execution_result\"],\n                format_inst=self._format_inst,\n            ),\n        },\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.format_output","title":"format_output(output, input)","text":"

The output is parsed as JSON and merged into the input, with the model's passes answer converted to the boolean keep_row_after_semantic_check column.

Parameters:

Name Type Description Default output Union[str, None]

the raw output of the LLM.

required input Dict[str, Any]

the input to the task. It is updated with the parsed result and returned.

required

Returns:

Type Description Dict[str, Any]

The input dict updated with thought and keep_row_after_semantic_check, which is True when the model answered yes.

Dict[str, Any]

Both columns are set to None when the output is None or cannot be decoded as JSON.

Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a list with the score of each instruction.\n\n    Args:\n        output: the raw output of the LLM.\n        input: the input to the task. Used for obtaining the number of responses.\n\n    Returns:\n        A dict with the queries and answers pairs.\n        The answers are an array of answers corresponding to the query.\n        Each answer is represented as an object with the following properties:\n            - name (string): The name of the tool used to generate the answer.\n            - arguments (object): An object representing the arguments passed to the tool to generate the answer.\n        Each argument is represented as a key-value pair, where the key is the parameter name and the\n        value is the corresponding value.\n    \"\"\"\n    if output is None:\n        return self._default_error(input)\n\n    output = remove_fences(output)\n\n    try:\n        result = orjson.loads(output)\n        # Update the column name and change to bool\n        result[\"keep_row_after_semantic_check\"] = (\n            result.pop(\"passes\").lower() == \"yes\"\n        )\n        input.update(**result)\n        return input\n    except orjson.JSONDecodeError:\n        return self._default_error(input)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker._default_error","title":"_default_error(input)","text":"

Default error output for the task: returns the input with thought and keep_row_after_semantic_check set to None.

Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py
def _default_error(self, input: Dict[str, Any]) -> Dict[str, Any]:\n    \"\"\"Default error message for the task.\"\"\"\n    input.update({\"thought\": None, \"keep_row_after_semantic_check\": None})\n    return input\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.get_structured_output","title":"get_structured_output()","text":"

Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary.

The schema corresponds to the following:

from typing import Literal\nfrom pydantic import BaseModel\nimport json\n\nclass Checker(BaseModel):\n    thought: str\n    passes: Literal[\"yes\", \"no\"]\n\njson.dumps(Checker.model_json_schema(), indent=4)\n

Returns:

Type Description Dict[str, Any]

JSON Schema of the response to enforce.

Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py
@override\ndef get_structured_output(self) -> Dict[str, Any]:\n    \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n    a dictionary with the output which can be directly parsed as a python dictionary.\n\n    The schema corresponds to the following:\n\n    ```python\n    from typing import Literal\n    from pydantic import BaseModel\n    import json\n\n    class Checker(BaseModel):\n        thought: str\n        passes: Literal[\"yes\", \"no\"]\n\n    json.dumps(Checker.model_json_schema(), indent=4)\n    ```\n\n    Returns:\n        JSON Schema of the response to enforce.\n    \"\"\"\n    return {\n        \"properties\": {\n            \"thought\": {\"title\": \"Thought\", \"type\": \"string\"},\n            \"passes\": {\"enum\": [\"yes\", \"no\"], \"title\": \"Passes\", \"type\": \"string\"},\n        },\n        \"required\": [\"thought\", \"passes\"],\n        \"title\": \"Checker\",\n        \"type\": \"object\",\n    }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller","title":"ArgillaLabeller","text":"

Bases: Task

Annotate Argilla records based on input fields, example records and question settings.

This task is designed to facilitate the annotation of Argilla records by leveraging a pre-trained LLM. It uses a system prompt that guides the LLM to understand the input fields, the question type, and the question settings. The task then formats the input data and generates a response based on the question. The response is validated against the question's value model, and the final suggestion is prepared for annotation.

Attributes:

Name Type Description _template Union[Template, None]

a Jinja2 template used to format the input for the LLM.

Input columns
  • record (argilla.Record): The record to be annotated.
  • fields (Optional[List[Dict[str, Any]]]): The list of field settings for the input fields.
  • question (Optional[Dict[str, Any]]): The question settings for the question to be answered.
  • example_records (Optional[List[Dict[str, Any]]]): The few shot example records with responses to be used to answer the question.
  • guidelines (Optional[str]): The guidelines for the annotation task.
Output columns
  • suggestion (Dict[str, Any]): The final suggestion for annotation.
Categories
  • text-classification
  • scorer
  • text-generation
References
  • Argilla: Argilla is a collaboration tool for AI engineers and domain experts to build high-quality datasets

Examples:

Annotate a record with the same dataset and question:

import argilla as rg\nfrom argilla import Suggestion\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Get information from Argilla dataset definition\ndataset = rg.Dataset(\"my_dataset\")\npending_records_filter = rg.Filter((\"status\", \"==\", \"pending\"))\ncompleted_records_filter = rg.Filter((\"status\", \"==\", \"completed\"))\npending_records = list(\n    dataset.records(\n        query=rg.Query(filter=pending_records_filter),\n        limit=5,\n    )\n)\nexample_records = list(\n    dataset.records(\n        query=rg.Query(filter=completed_records_filter),\n        limit=5,\n    )\n)\nfield = dataset.settings.fields[\"text\"]\nquestion = dataset.settings.questions[\"label\"]\n\n# Initialize the labeller with the model and fields\nlabeller = ArgillaLabeller(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    fields=[field],\n    question=question,\n    example_records=example_records,\n    guidelines=dataset.guidelines\n)\nlabeller.load()\n\n# Process the pending records\nresult = next(\n    labeller.process(\n        [\n            {\n                \"record\": record\n            } for record in pending_records\n        ]\n    )\n)\n\n# Add the suggestions to the records\nfor record, suggestion in zip(pending_records, result):\n    record.suggestions.add(Suggestion(**suggestion[\"suggestion\"]))\n\n# Log the updated records\ndataset.records.log(pending_records)\n

Annotate a record with alternating datasets and questions:

import argilla as rg\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Get information from Argilla dataset definition\ndataset = rg.Dataset(\"my_dataset\")\nfield = dataset.settings.fields[\"text\"]\nquestion = dataset.settings.questions[\"label\"]\nquestion2 = dataset.settings.questions[\"label2\"]\n\n# Initialize the labeller with the model and fields\nlabeller = ArgillaLabeller(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    )\n)\nlabeller.load()\n\n# Process the record\nrecord = next(dataset.records())\nresult = next(\n    labeller.process(\n        [\n            {\n                \"record\": record,\n                \"fields\": [field],\n                \"question\": question,\n            },\n            {\n                \"record\": record,\n                \"fields\": [field],\n                \"question\": question2,\n            }\n        ]\n    )\n)\n\n# Add the suggestions to the record\nfor suggestion in result:\n    record.suggestions.add(rg.Suggestion(**suggestion[\"suggestion\"]))\n\n# Log the updated record\ndataset.records.log([record])\n

Overwrite default prompts and instructions:

import argilla as rg\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Overwrite default prompts and instructions\nlabeller = ArgillaLabeller(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    system_prompt=\"You are an expert annotator and labelling assistant that understands complex domains and natural language processing.\",\n    question_to_label_instruction={\n        \"label_selection\": \"Select the appropriate label from the list of provided labels.\",\n        \"multi_label_selection\": \"Select none, one or multiple labels from the list of provided labels.\",\n        \"text\": \"Provide a text response to the question.\",\n        \"rating\": \"Provide a rating for the question.\",\n    },\n)\nlabeller.load()\n
Source code in src/distilabel/steps/tasks/argilla_labeller.py
class ArgillaLabeller(Task):\n    \"\"\"\n    Annotate Argilla records based on input fields, example records and question settings.\n\n    This task is designed to facilitate the annotation of Argilla records by leveraging a pre-trained LLM.\n    It uses a system prompt that guides the LLM to understand the input fields, the question type,\n    and the question settings. The task then formats the input data and generates a response based on the question.\n    The response is validated against the question's value model, and the final suggestion is prepared for annotation.\n\n    Attributes:\n        _template: a Jinja2 template used to format the input for the LLM.\n\n    Input columns:\n        - record (`argilla.Record`): The record to be annotated.\n        - fields (`Optional[List[Dict[str, Any]]]`): The list of field settings for the input fields.\n        - question (`Optional[Dict[str, Any]]`): The question settings for the question to be answered.\n        - example_records (`Optional[List[Dict[str, Any]]]`): The few shot example records with responses to be used to answer the question.\n        - guidelines (`Optional[str]`): The guidelines for the annotation task.\n\n    Output columns:\n        - suggestion (`Dict[str, Any]`): The final suggestion for annotation.\n\n    Categories:\n        - text-classification\n        - scorer\n        - text-generation\n\n    References:\n        - [`Argilla: Argilla is a collaboration tool for AI engineers and domain experts to build high-quality datasets`](https://github.com/argilla-io/argilla/)\n\n    Examples:\n        Annotate a record with the same dataset and question:\n\n        ```python\n        import argilla as rg\n        from argilla import Suggestion\n        from distilabel.steps.tasks import ArgillaLabeller\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Get information from Argilla dataset definition\n        dataset = rg.Dataset(\"my_dataset\")\n        
pending_records_filter = rg.Filter((\"status\", \"==\", \"pending\"))\n        completed_records_filter = rg.Filter((\"status\", \"==\", \"completed\"))\n        pending_records = list(\n            dataset.records(\n                query=rg.Query(filter=pending_records_filter),\n                limit=5,\n            )\n        )\n        example_records = list(\n            dataset.records(\n                query=rg.Query(filter=completed_records_filter),\n                limit=5,\n            )\n        )\n        field = dataset.settings.fields[\"text\"]\n        question = dataset.settings.questions[\"label\"]\n\n        # Initialize the labeller with the model and fields\n        labeller = ArgillaLabeller(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            fields=[field],\n            question=question,\n            example_records=example_records,\n            guidelines=dataset.guidelines\n        )\n        labeller.load()\n\n        # Process the pending records\n        result = next(\n            labeller.process(\n                [\n                    {\n                        \"record\": record\n                    } for record in pending_records\n                ]\n            )\n        )\n\n        # Add the suggestions to the records\n        for record, suggestion in zip(pending_records, result):\n            record.suggestions.add(Suggestion(**suggestion[\"suggestion\"]))\n\n        # Log the updated records\n        dataset.records.log(pending_records)\n        ```\n\n        Annotate a record with alternating datasets and questions:\n\n        ```python\n        import argilla as rg\n        from distilabel.steps.tasks import ArgillaLabeller\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Get information from Argilla dataset definition\n        dataset = rg.Dataset(\"my_dataset\")\n        field = dataset.settings.fields[\"text\"]\n  
      question = dataset.settings.questions[\"label\"]\n        question2 = dataset.settings.questions[\"label2\"]\n\n        # Initialize the labeller with the model and fields\n        labeller = ArgillaLabeller(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            )\n        )\n        labeller.load()\n\n        # Process the record\n        record = next(dataset.records())\n        result = next(\n            labeller.process(\n                [\n                    {\n                        \"record\": record,\n                        \"fields\": [field],\n                        \"question\": question,\n                    },\n                    {\n                        \"record\": record,\n                        \"fields\": [field],\n                        \"question\": question2,\n                    }\n                ]\n            )\n        )\n\n        # Add the suggestions to the record\n        for suggestion in result:\n            record.suggestions.add(rg.Suggestion(**suggestion[\"suggestion\"]))\n\n        # Log the updated record\n        dataset.records.log([record])\n        ```\n\n        Overwrite default prompts and instructions:\n\n        ```python\n        import argilla as rg\n        from distilabel.steps.tasks import ArgillaLabeller\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Overwrite default prompts and instructions\n        labeller = ArgillaLabeller(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            system_prompt=\"You are an expert annotator and labelling assistant that understands complex domains and natural language processing.\",\n            question_to_label_instruction={\n                \"label_selection\": \"Select the appropriate label from the list of provided labels.\",\n                \"multi_label_selection\": \"Select none, one or 
multiple labels from the list of provided labels.\",\n                \"text\": \"Provide a text response to the question.\",\n                \"rating\": \"Provide a rating for the question.\",\n            },\n        )\n        labeller.load()\n        ```\n    \"\"\"\n\n    system_prompt: str = (\n        \"You are an expert annotator and labelling assistant that understands complex domains and natural language processing. \"\n        \"You are given input fields and a question. \"\n        \"You should create a valid JSON object as an response to the question based on the input fields. \"\n    )\n    question_to_label_instruction: Dict[str, str] = {\n        \"label_selection\": \"Select the appropriate label for the fields from the list of optional labels.\",\n        \"multi_label_selection\": \"Select none, one or multiple labels for the fields from the list of optional labels.\",\n        \"text\": \"Provide a response to the question based on the fields.\",\n        \"rating\": \"Provide a rating for the question based on the fields.\",\n    }\n    example_records: Optional[\n        RuntimeParameter[Union[List[Union[Dict[str, Any], BaseModel]], None]]\n    ] = Field(\n        default=None,\n        description=\"The few shot serialized example records or `BaseModel`s with responses to be used to answer the question.\",\n    )\n    fields: Optional[\n        RuntimeParameter[Union[List[Union[BaseModel, Dict[str, Any]]], None]]\n    ] = Field(\n        default=None,\n        description=\"The field serialized field settings or `BaseModel` for the fields to be used to answer the question.\",\n    )\n    question: Optional[\n        RuntimeParameter[\n            Union[\n                Dict[str, Any],\n                BaseModel,\n                None,\n            ]\n        ]\n    ] = Field(\n        default=None,\n        description=\"The question serialized question settings or `BaseModel` for the question to be answered.\",\n    )\n    guidelines: 
Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The guidelines for the annotation task.\",\n    )\n\n    _template: Union[Template, None] = PrivateAttr(...)\n    _client: Optional[Any] = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"argillalabeller.jinja2\"\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> Dict[str, bool]:\n        return {\n            \"record\": True,\n            \"fields\": False,\n            \"question\": False,\n            \"example_records\": False,\n            \"guidelines\": False,\n        }\n\n    def _format_record(\n        self, record: Dict[str, Any], fields: List[Dict[str, Any]]\n    ) -> str:\n        \"\"\"Format the record fields into a string.\n\n        Args:\n            record (Dict[str, Any]): The record to format.\n            fields (List[Dict[str, Any]]): The fields to format.\n\n        Returns:\n            str: The formatted record fields.\n        \"\"\"\n        output = []\n        for field in fields:\n            output.append(record.get(\"fields\", {}).get(field.get(\"name\", \"\")))\n        return \"fields: \" + \"\\n\".join(output)\n\n    def _get_label_instruction(self, question: Dict[str, Any]) -> str:\n        \"\"\"Get the label instruction for the question.\n\n        Args:\n            question (Dict[str, Any]): The question to get the label instruction for.\n\n        Returns:\n            str: The label instruction for the question.\n        \"\"\"\n        question_type = question[\"settings\"][\"type\"]\n        return self.question_to_label_instruction[question_type]\n\n    def _format_question(self, question: Dict[str, Any]) -> str:\n        
\"\"\"Format the question settings into a string.\n\n        Args:\n            question (Dict[str, Any]): The question to format.\n\n        Returns:\n            str: The formatted question.\n        \"\"\"\n        output = []\n        output.append(f\"question: {self._get_label_instruction(question)}\")\n        if \"options\" in question.get(\"settings\", {}):\n            output.append(\n                f\"optional labels: {[option['value'] for option in question.get('settings', {}).get('options', [])]}\"\n            )\n        return \"\\n\".join(output)\n\n    def _format_example_records(\n        self,\n        records: List[Dict[str, Any]],\n        fields: List[Dict[str, Any]],\n        question: Dict[str, Any],\n    ) -> str:\n        \"\"\"Format the example records into a string.\n\n        Args:\n            records (List[Dict[str, Any]]): The records to format.\n            fields (List[Dict[str, Any]]): The fields to format.\n            question (Dict[str, Any]): The question to format.\n\n        Returns:\n            str: The formatted example records.\n        \"\"\"\n        base = []\n        for record in records:\n            responses = record.get(\"responses\", {})\n            if responses.get(question[\"name\"]):\n                base.append(self._format_record(record, fields))\n                value = responses[question[\"name\"]][0][\"value\"]\n                formatted_value = self._assign_value_to_question_value_model(\n                    value, question\n                )\n                base.append(f\"response: {formatted_value}\")\n                base.append(\"\")\n            else:\n                warnings.warn(\n                    f\"Record {record} has no response for question {question['name']}. 
Skipping example record.\",\n                    stacklevel=2,\n                )\n        return \"\\n\".join(base)\n\n    def format_input(\n        self,\n        input: Dict[\n            str,\n            Union[\n                Dict[str, Any],\n                \"Record\",\n                \"TextField\",\n                \"MultiLabelQuestion\",\n                \"LabelQuestion\",\n                \"RatingQuestion\",\n                \"TextQuestion\",\n            ],\n        ],\n    ) -> \"ChatType\":\n        \"\"\"Format the input into a chat message.\n\n        Args:\n            input: The input to format.\n\n        Returns:\n            The formatted chat message.\n\n        Raises:\n            ValueError: If question or fields are not provided.\n        \"\"\"\n        input_keys = list(self.inputs.keys())\n        record = input[input_keys[0]]\n        fields = input.get(input_keys[1], self.fields)\n        question = input.get(input_keys[2], self.question)\n        examples = input.get(input_keys[3], self.example_records)\n        guidelines = input.get(input_keys[4], self.guidelines)\n\n        if question is None:\n            raise ValueError(\"Question must be provided.\")\n        if fields is None or any(field is None for field in fields):\n            raise ValueError(\"Fields must be provided.\")\n\n        record = record.to_dict() if not isinstance(record, dict) else record\n        question = question.serialize() if not isinstance(question, dict) else question\n        fields = [\n            field.serialize() if not isinstance(field, dict) else field\n            for field in fields\n        ]\n        examples = (\n            [\n                example.to_dict() if not isinstance(example, dict) else example\n                for example in examples\n            ]\n            if examples\n            else None\n        )\n\n        formatted_fields = self._format_record(record, fields)\n        formatted_question = 
self._format_question(question)\n        formatted_examples = (\n            self._format_example_records(examples, fields, question)\n            if examples\n            else False\n        )\n\n        prompt = self._template.render(\n            fields=formatted_fields,\n            question=formatted_question,\n            examples=formatted_examples,\n            guidelines=guidelines,\n        )\n\n        messages = []\n        if self.system_prompt:\n            messages.append({\"role\": \"system\", \"content\": self.system_prompt})\n        messages.append({\"role\": \"user\", \"content\": prompt})\n        return messages\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"suggestion\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"Format the output into a dictionary.\n\n        Args:\n            output (Union[str, None]): The output to format.\n            input (Dict[str, Any]): The input to format.\n\n        Returns:\n            Dict[str, Any]: The formatted output.\n        \"\"\"\n        from argilla import Suggestion\n\n        question: Union[\n            Any,\n            Dict[str, Any],\n            LabelQuestion,\n            MultiLabelQuestion,\n            RatingQuestion,\n            TextQuestion,\n            None,\n        ] = input.get(list(self.inputs.keys())[2], self.question) or self.question\n        question = question.serialize() if not isinstance(question, dict) else question\n        model = self._get_pydantic_model_of_structured_output(question)\n        validated_output = model(**json.loads(output))\n        value = self._get_value_from_question_value_model(validated_output)\n        suggestion = Suggestion(\n            value=value,\n            question_name=question[\"name\"],\n            type=\"model\",\n            agent=self.llm.model_name,\n        ).serialize()\n        return {\n            self.outputs[0]: 
{\n                k: v\n                for k, v in suggestion.items()\n                if k in [\"value\", \"question_name\", \"type\", \"agent\"]\n            }\n        }\n\n    def _set_llm_structured_output_for_question(self, question: Dict[str, Any]) -> None:\n        runtime_parameters = self.llm._runtime_parameters\n        runtime_parameters.update(\n            {\n                \"structured_output\": {\n                    \"format\": \"json\",\n                    \"schema\": self._get_pydantic_model_of_structured_output(question),\n                },\n            }\n        )\n        self.llm.set_runtime_parameters(runtime_parameters)\n\n    @override\n    def process(self, inputs: StepInput) -> \"StepOutput\":\n        \"\"\"Process the input through the task.\n\n        Args:\n            inputs (StepInput): The input to process.\n\n        Returns:\n            StepOutput: The output of the task.\n        \"\"\"\n\n        question_list = [input.get(\"question\", self.question) for input in inputs]\n        fields_list = [input.get(\"fields\", self.fields) for input in inputs]\n        # check if any field for the field in fields is None\n        for fields in fields_list:\n            if any(field is None for field in fields):\n                raise ValueError(\n                    \"Fields must be provided during init or through `process` method.\"\n                )\n        # check if any question is None\n        if any(question is None for question in question_list):\n            raise ValueError(\n                \"Question must be provided during init or through `process` method.\"\n            )\n        question_list = [\n            question.serialize() if not isinstance(question, dict) else question\n            for question in question_list\n        ]\n        if not all(question == question_list[0] for question in question_list):\n            warnings.warn(\n                \"Not all questions are the same. 
Processing each question separately by setting the structured output for each question. This may impact performance.\",\n                stacklevel=2,\n            )\n            for input, question in zip(inputs, question_list):\n                self._set_llm_structured_output_for_question(question)\n                yield from super().process([input])\n        else:\n            question = question_list[0]\n            self._set_llm_structured_output_for_question(question)\n            yield from super().process(inputs)\n\n    def _get_value_from_question_value_model(\n        self, question_value_model: BaseModel\n    ) -> Any:\n        \"\"\"Get the value from the question value model.\n\n        Args:\n            question_value_model (BaseModel): The question value model to get the value from.\n\n        Returns:\n            Any: The value from the question value model.\n        \"\"\"\n        for attr in [\"label\", \"labels\", \"rating\", \"text\"]:\n            if hasattr(question_value_model, attr):\n                return getattr(question_value_model, attr)\n        raise ValueError(f\"Unsupported question type: {question_value_model}\")\n\n    def _assign_value_to_question_value_model(\n        self, value: Any, question: Dict[str, Any]\n    ) -> BaseModel:\n        \"\"\"Assign the value to the question value model.\n\n        Args:\n            value (Any): The value to assign.\n            question (Dict[str, Any]): The question to assign the value to.\n\n        Returns:\n            BaseModel: The question value model with the assigned value.\n        \"\"\"\n        question_value_model = self._get_pydantic_model_of_structured_output(question)\n        for attr in [\"label\", \"labels\", \"rating\", \"text\"]:\n            try:\n                model_dict = {attr: value}\n                question_value_model = question_value_model(**model_dict)\n                return question_value_model.model_dump_json()\n            except AttributeError:\n    
            pass\n        return value\n\n    def _get_pydantic_model_of_structured_output(\n        self,\n        question: Dict[str, Any],\n    ) -> BaseModel:\n        \"\"\"Get the Pydantic model of the structured output.\n\n        Args:\n            question (Dict[str, Any]): The question to get the Pydantic model of the structured output for.\n\n        Returns:\n            BaseModel: The Pydantic model of the structured output.\n        \"\"\"\n\n        question_type = question[\"settings\"][\"type\"]\n\n        if question_type == \"multi_label_selection\":\n\n            class QuestionValueModel(BaseModel):\n                labels: Optional[List[str]] = Field(default_factory=list)\n\n        elif question_type == \"label_selection\":\n\n            class QuestionValueModel(BaseModel):\n                label: str\n\n        elif question_type == \"text\":\n\n            class QuestionValueModel(BaseModel):\n                text: str\n\n        elif question_type == \"rating\":\n\n            class QuestionValueModel(BaseModel):\n                rating: int\n        else:\n            raise ValueError(f\"Unsupported question type: {question}\")\n\n        return QuestionValueModel\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller.load","title":"load()","text":"

Loads the Jinja2 template.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"argillalabeller.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._format_record","title":"_format_record(record, fields)","text":"

Format the record fields into a string.

Parameters:

Name Type Description Default record Dict[str, Any]

The record to format.

required fields List[Dict[str, Any]]

The fields to format.

required

Returns:

Name Type Description str str

The formatted record fields.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def _format_record(\n    self, record: Dict[str, Any], fields: List[Dict[str, Any]]\n) -> str:\n    \"\"\"Format the record fields into a string.\n\n    Args:\n        record (Dict[str, Any]): The record to format.\n        fields (List[Dict[str, Any]]): The fields to format.\n\n    Returns:\n        str: The formatted record fields.\n    \"\"\"\n    output = []\n    for field in fields:\n        output.append(record.get(\"fields\", {}).get(field.get(\"name\", \"\")))\n    return \"fields: \" + \"\\n\".join(output)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._get_label_instruction","title":"_get_label_instruction(question)","text":"

Get the label instruction for the question.

Parameters:

Name Type Description Default question Dict[str, Any]

The question to get the label instruction for.

required

Returns:

Name Type Description str str

The label instruction for the question.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def _get_label_instruction(self, question: Dict[str, Any]) -> str:\n    \"\"\"Get the label instruction for the question.\n\n    Args:\n        question (Dict[str, Any]): The question to get the label instruction for.\n\n    Returns:\n        str: The label instruction for the question.\n    \"\"\"\n    question_type = question[\"settings\"][\"type\"]\n    return self.question_to_label_instruction[question_type]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._format_question","title":"_format_question(question)","text":"

Format the question settings into a string.

Parameters:

Name Type Description Default question Dict[str, Any]

The question to format.

required

Returns:

Name Type Description str str

The formatted question.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def _format_question(self, question: Dict[str, Any]) -> str:\n    \"\"\"Format the question settings into a string.\n\n    Args:\n        question (Dict[str, Any]): The question to format.\n\n    Returns:\n        str: The formatted question.\n    \"\"\"\n    output = []\n    output.append(f\"question: {self._get_label_instruction(question)}\")\n    if \"options\" in question.get(\"settings\", {}):\n        output.append(\n            f\"optional labels: {[option['value'] for option in question.get('settings', {}).get('options', [])]}\"\n        )\n    return \"\\n\".join(output)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._format_example_records","title":"_format_example_records(records, fields, question)","text":"

Format the example records into a string.

Parameters:

Name Type Description Default records List[Dict[str, Any]]

The records to format.

required fields List[Dict[str, Any]]

The fields to format.

required question Dict[str, Any]

The question to format.

required

Returns:

Name Type Description str str

The formatted example records.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def _format_example_records(\n    self,\n    records: List[Dict[str, Any]],\n    fields: List[Dict[str, Any]],\n    question: Dict[str, Any],\n) -> str:\n    \"\"\"Format the example records into a string.\n\n    Args:\n        records (List[Dict[str, Any]]): The records to format.\n        fields (List[Dict[str, Any]]): The fields to format.\n        question (Dict[str, Any]): The question to format.\n\n    Returns:\n        str: The formatted example records.\n    \"\"\"\n    base = []\n    for record in records:\n        responses = record.get(\"responses\", {})\n        if responses.get(question[\"name\"]):\n            base.append(self._format_record(record, fields))\n            value = responses[question[\"name\"]][0][\"value\"]\n            formatted_value = self._assign_value_to_question_value_model(\n                value, question\n            )\n            base.append(f\"response: {formatted_value}\")\n            base.append(\"\")\n        else:\n            warnings.warn(\n                f\"Record {record} has no response for question {question['name']}. Skipping example record.\",\n                stacklevel=2,\n            )\n    return \"\\n\".join(base)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller.format_input","title":"format_input(input)","text":"

Format the input into a chat message.

Parameters:

Name Type Description Default input Dict[str, Union[Dict[str, Any], Record, TextField, MultiLabelQuestion, LabelQuestion, RatingQuestion, TextQuestion]]

The input to format.

required

Returns:

Type Description ChatType

The formatted chat message.

Raises:

Type Description ValueError

If question or fields are not provided.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def format_input(\n    self,\n    input: Dict[\n        str,\n        Union[\n            Dict[str, Any],\n            \"Record\",\n            \"TextField\",\n            \"MultiLabelQuestion\",\n            \"LabelQuestion\",\n            \"RatingQuestion\",\n            \"TextQuestion\",\n        ],\n    ],\n) -> \"ChatType\":\n    \"\"\"Format the input into a chat message.\n\n    Args:\n        input: The input to format.\n\n    Returns:\n        The formatted chat message.\n\n    Raises:\n        ValueError: If question or fields are not provided.\n    \"\"\"\n    input_keys = list(self.inputs.keys())\n    record = input[input_keys[0]]\n    fields = input.get(input_keys[1], self.fields)\n    question = input.get(input_keys[2], self.question)\n    examples = input.get(input_keys[3], self.example_records)\n    guidelines = input.get(input_keys[4], self.guidelines)\n\n    if question is None:\n        raise ValueError(\"Question must be provided.\")\n    if fields is None or any(field is None for field in fields):\n        raise ValueError(\"Fields must be provided.\")\n\n    record = record.to_dict() if not isinstance(record, dict) else record\n    question = question.serialize() if not isinstance(question, dict) else question\n    fields = [\n        field.serialize() if not isinstance(field, dict) else field\n        for field in fields\n    ]\n    examples = (\n        [\n            example.to_dict() if not isinstance(example, dict) else example\n            for example in examples\n        ]\n        if examples\n        else None\n    )\n\n    formatted_fields = self._format_record(record, fields)\n    formatted_question = self._format_question(question)\n    formatted_examples = (\n        self._format_example_records(examples, fields, question)\n        if examples\n        else False\n    )\n\n    prompt = self._template.render(\n        fields=formatted_fields,\n        question=formatted_question,\n        examples=formatted_examples,\n        
guidelines=guidelines,\n    )\n\n    messages = []\n    if self.system_prompt:\n        messages.append({\"role\": \"system\", \"content\": self.system_prompt})\n    messages.append({\"role\": \"user\", \"content\": prompt})\n    return messages\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller.format_output","title":"format_output(output, input)","text":"

Format the output into a dictionary.

Parameters:

Name Type Description Default output Union[str, None]

The output to format.

required input Dict[str, Any]

The input to format.

required

Returns:

Type Description Dict[str, Any]

The formatted output.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"Format the output into a dictionary.\n\n    Args:\n        output (Union[str, None]): The output to format.\n        input (Dict[str, Any]): The input to format.\n\n    Returns:\n        Dict[str, Any]: The formatted output.\n    \"\"\"\n    from argilla import Suggestion\n\n    question: Union[\n        Any,\n        Dict[str, Any],\n        LabelQuestion,\n        MultiLabelQuestion,\n        RatingQuestion,\n        TextQuestion,\n        None,\n    ] = input.get(list(self.inputs.keys())[2], self.question) or self.question\n    question = question.serialize() if not isinstance(question, dict) else question\n    model = self._get_pydantic_model_of_structured_output(question)\n    validated_output = model(**json.loads(output))\n    value = self._get_value_from_question_value_model(validated_output)\n    suggestion = Suggestion(\n        value=value,\n        question_name=question[\"name\"],\n        type=\"model\",\n        agent=self.llm.model_name,\n    ).serialize()\n    return {\n        self.outputs[0]: {\n            k: v\n            for k, v in suggestion.items()\n            if k in [\"value\", \"question_name\", \"type\", \"agent\"]\n        }\n    }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller.process","title":"process(inputs)","text":"

Process the input through the task.

Parameters:

Name Type Description Default inputs StepInput

The input to process.

required

Returns:

Name Type Description StepOutput StepOutput

The output of the task.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
@override\ndef process(self, inputs: StepInput) -> \"StepOutput\":\n    \"\"\"Process the input through the task.\n\n    Args:\n        inputs (StepInput): The input to process.\n\n    Returns:\n        StepOutput: The output of the task.\n    \"\"\"\n\n    question_list = [input.get(\"question\", self.question) for input in inputs]\n    fields_list = [input.get(\"fields\", self.fields) for input in inputs]\n    # check if any field for the field in fields is None\n    for fields in fields_list:\n        if any(field is None for field in fields):\n            raise ValueError(\n                \"Fields must be provided during init or through `process` method.\"\n            )\n    # check if any question is None\n    if any(question is None for question in question_list):\n        raise ValueError(\n            \"Question must be provided during init or through `process` method.\"\n        )\n    question_list = [\n        question.serialize() if not isinstance(question, dict) else question\n        for question in question_list\n    ]\n    if not all(question == question_list[0] for question in question_list):\n        warnings.warn(\n            \"Not all questions are the same. Processing each question separately by setting the structured output for each question. This may impact performance.\",\n            stacklevel=2,\n        )\n        for input, question in zip(inputs, question_list):\n            self._set_llm_structured_output_for_question(question)\n            yield from super().process([input])\n    else:\n        question = question_list[0]\n        self._set_llm_structured_output_for_question(question)\n        yield from super().process(inputs)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._get_value_from_question_value_model","title":"_get_value_from_question_value_model(question_value_model)","text":"

Get the value from the question value model.

Parameters:

Name Type Description Default question_value_model BaseModel

The question value model to get the value from.

required

Returns:

Name Type Description Any Any

The value from the question value model.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def _get_value_from_question_value_model(\n    self, question_value_model: BaseModel\n) -> Any:\n    \"\"\"Get the value from the question value model.\n\n    Args:\n        question_value_model (BaseModel): The question value model to get the value from.\n\n    Returns:\n        Any: The value from the question value model.\n    \"\"\"\n    for attr in [\"label\", \"labels\", \"rating\", \"text\"]:\n        if hasattr(question_value_model, attr):\n            return getattr(question_value_model, attr)\n    raise ValueError(f\"Unsupported question type: {question_value_model}\")\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._assign_value_to_question_value_model","title":"_assign_value_to_question_value_model(value, question)","text":"

Assign the value to the question value model.

Parameters:

Name Type Description Default value Any

The value to assign.

required question Dict[str, Any]

The question to assign the value to.

required

Returns:

Name Type Description BaseModel BaseModel

The question value model with the assigned value.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def _assign_value_to_question_value_model(\n    self, value: Any, question: Dict[str, Any]\n) -> BaseModel:\n    \"\"\"Assign the value to the question value model.\n\n    Args:\n        value (Any): The value to assign.\n        question (Dict[str, Any]): The question to assign the value to.\n\n    Returns:\n        BaseModel: The question value model with the assigned value.\n    \"\"\"\n    question_value_model = self._get_pydantic_model_of_structured_output(question)\n    for attr in [\"label\", \"labels\", \"rating\", \"text\"]:\n        try:\n            model_dict = {attr: value}\n            question_value_model = question_value_model(**model_dict)\n            return question_value_model.model_dump_json()\n        except AttributeError:\n            pass\n    return value\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._get_pydantic_model_of_structured_output","title":"_get_pydantic_model_of_structured_output(question)","text":"

Get the Pydantic model of the structured output.

Parameters:

Name Type Description Default question Dict[str, Any]

The question to get the Pydantic model of the structured output for.

required

Returns:

Name Type Description BaseModel BaseModel

The Pydantic model of the structured output.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def _get_pydantic_model_of_structured_output(\n    self,\n    question: Dict[str, Any],\n) -> BaseModel:\n    \"\"\"Get the Pydantic model of the structured output.\n\n    Args:\n        question (Dict[str, Any]): The question to get the Pydantic model of the structured output for.\n\n    Returns:\n        BaseModel: The Pydantic model of the structured output.\n    \"\"\"\n\n    question_type = question[\"settings\"][\"type\"]\n\n    if question_type == \"multi_label_selection\":\n\n        class QuestionValueModel(BaseModel):\n            labels: Optional[List[str]] = Field(default_factory=list)\n\n    elif question_type == \"label_selection\":\n\n        class QuestionValueModel(BaseModel):\n            label: str\n\n    elif question_type == \"text\":\n\n        class QuestionValueModel(BaseModel):\n            text: str\n\n    elif question_type == \"rating\":\n\n        class QuestionValueModel(BaseModel):\n            rating: int\n    else:\n        raise ValueError(f\"Unsupported question type: {question}\")\n\n    return QuestionValueModel\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.CLAIR","title":"CLAIR","text":"

Bases: Task

Contrastive Learning from AI Revisions (CLAIR).

CLAIR uses an AI system to minimally revise a solution A\u2192A\u00b4 such that the resulting preference (A\u00b4 preferred over A) is much more contrastive and precise.

Input columns
  • task (str): The task or instruction.
  • student_solution (str): An answer to the task that is to be revised.
Output columns
  • revision (str): The revised text.
  • rational (str): The rationale for the provided revision.
  • model_name (str): The name of the model used to generate the revision and rationale.
Categories
  • preference
  • text-generation
References
  • Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment
  • APO and CLAIR - GitHub Repository

Examples:

Create contrastive preference pairs:

from distilabel.steps.tasks import CLAIR\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    generation_kwargs={\n        \"temperature\": 0.7,\n        \"max_new_tokens\": 4096,\n    },\n)\nclair_task = CLAIR(llm=llm)\n\nclair_task.load()\n\nresult = next(\n    clair_task.process(\n        [\n            {\n                \"task\": \"How many gaps are there between the earth and the moon?\",\n                \"student_solution\": 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon's orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.'\n            }\n        ]\n    )\n)\n# result\n# [{'task': 'How many gaps are there between the earth and the moon?',\n# 'student_solution': 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. 
The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.',\n# 'revision': 'There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n# 'rational': 'The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. 
The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.',\n# 'distilabel_metadata': {'raw_output_c_l_a_i_r_0': '{teacher_reasoning}: The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.\\n\\n{corrected_student_solution}: There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n# 'raw_input_c_l_a_i_r_0': [{'role': 'system',\n#     'content': \"You are a teacher and your task is to minimally improve a student's answer. I will give you a {task} and a {student_solution}. 
Your job is to revise the {student_solution} such that it is clearer, more correct, and more engaging. Copy all non-corrected parts of the student's answer. Do not allude to the {corrected_student_solution} being a revision or a correction in your final solution.\"},\n#     {'role': 'user',\n#     'content': '{task}: How many gaps are there between the earth and the moon?\\n\\n{student_solution}: There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.\\n\\n-----------------\\n\\nLet\\'s first think step by step with a {teacher_reasoning} to decide how to improve the {student_solution}, then give the {corrected_student_solution}. Mention the {teacher_reasoning} and {corrected_student_solution} identifiers to structure your answer.'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n

Citations:

```\n@misc{doosterlinck2024anchoredpreferenceoptimizationcontrastive,\n    title={Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment},\n    author={Karel D'Oosterlinck and Winnie Xu and Chris Develder and Thomas Demeester and Amanpreet Singh and Christopher Potts and Douwe Kiela and Shikib Mehri},\n    year={2024},\n    eprint={2408.06266},\n    archivePrefix={arXiv},\n    primaryClass={cs.LG},\n    url={https://arxiv.org/abs/2408.06266},\n}\n```\n
Source code in src/distilabel/steps/tasks/clair.py
class CLAIR(Task):\n    r\"\"\"Contrastive Learning from AI Revisions (CLAIR).\n\n    CLAIR uses an AI system to minimally revise a solution A\u2192A\u00b4 such that the resulting\n    preference A `preferred` A\u2019 is much more contrastive and precise.\n\n    Input columns:\n        - task (`str`): The task or instruction.\n        - student_solution (`str`): An answer to the task that is to be revised.\n\n    Output columns:\n        - revision (`str`): The revised text.\n        - rational (`str`): The rational for the provided revision.\n        - model_name (`str`): The name of the model used to generate the revision and rational.\n\n    Categories:\n        - preference\n        - text-generation\n\n    References:\n        - [`Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment`](https://arxiv.org/abs/2408.06266v1)\n        - [`APO and CLAIR - GitHub Repository`](https://github.com/ContextualAI/CLAIR_and_APO)\n\n    Examples:\n        Create contrastive preference pairs:\n\n        ```python\n        from distilabel.steps.tasks import CLAIR\n        from distilabel.models import InferenceEndpointsLLM\n\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            generation_kwargs={\n                \"temperature\": 0.7,\n                \"max_new_tokens\": 4096,\n            },\n        )\n        clair_task = CLAIR(llm=llm)\n\n        clair_task.load()\n\n        result = next(\n            clair_task.process(\n                [\n                    {\n                        \"task\": \"How many gaps are there between the earth and the moon?\",\n                        \"student_solution\": 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. 
The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon's orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.'\n                    }\n                ]\n            )\n        )\n        # result\n        # [{'task': 'How many gaps are there between the earth and the moon?',\n        # 'student_solution': 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.',\n        # 'revision': 'There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. 
The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n        # 'rational': 'The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.',\n        # 'distilabel_metadata': {'raw_output_c_l_a_i_r_0': '{teacher_reasoning}: The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.\\n\\n{corrected_student_solution}: There are no physical gaps or empty spaces between the Earth and the Moon. 
The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n        # 'raw_input_c_l_a_i_r_0': [{'role': 'system',\n        #     'content': \"You are a teacher and your task is to minimally improve a student's answer. I will give you a {task} and a {student_solution}. Your job is to revise the {student_solution} such that it is clearer, more correct, and more engaging. Copy all non-corrected parts of the student's answer. Do not allude to the {corrected_student_solution} being a revision or a correction in your final solution.\"},\n        #     {'role': 'user',\n        #     'content': '{task}: How many gaps are there between the earth and the moon?\\n\\n{student_solution}: There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. 
The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.\\n\\n-----------------\\n\\nLet\\'s first think step by step with a {teacher_reasoning} to decide how to improve the {student_solution}, then give the {corrected_student_solution}. Mention the {teacher_reasoning} and {corrected_student_solution} identifiers to structure your answer.'}]},\n        # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n\n    Citations:\n\n        ```\n        @misc{doosterlinck2024anchoredpreferenceoptimizationcontrastive,\n            title={Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment},\n            author={Karel D'Oosterlinck and Winnie Xu and Chris Develder and Thomas Demeester and Amanpreet Singh and Christopher Potts and Douwe Kiela and Shikib Mehri},\n            year={2024},\n            eprint={2408.06266},\n            archivePrefix={arXiv},\n            primaryClass={cs.LG},\n            url={https://arxiv.org/abs/2408.06266},\n        }\n        ```\n    \"\"\"\n\n    system_prompt: str = SYSTEM_PROMPT\n    _template: Union[Template, None] = PrivateAttr(...)\n\n    def load(self) -> None:\n        super().load()\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"clair.jinja2\"\n        )\n        with open(_path, \"r\") as f:\n            self._template = Template(f.read())\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        return [\"task\", \"student_solution\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        
return [\"revision\", \"rational\", \"model_name\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        return [\n            {\"role\": \"system\", \"content\": self.system_prompt},\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(\n                    task=input[\"task\"], student_solution=input[\"student_solution\"]\n                ),\n            },\n        ]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a list with the score of each instruction-response pair.\n\n        Args:\n            output: the raw output of the LLM.\n            input: the input to the task. Used for obtaining the number of responses.\n\n        Returns:\n            A dict with the key `scores` containing the scores for each instruction-response pair.\n        \"\"\"\n        if output is None:\n            return self._default_error()\n\n        return self._format_output(output)\n\n    def _format_output(self, output: Union[str, None]) -> Dict[str, Any]:\n        if \"**Corrected Student Solution:**\" in output:\n            splits = output.split(\"**Corrected Student Solution:**\")\n        elif \"{corrected_student_solution}:\" in output:\n            splits = output.split(\"{corrected_student_solution}:\")\n        elif \"{corrected_student_solution}\" in output:\n            splits = output.split(\"{corrected_student_solution}\")\n        elif \"**Worsened Student Solution:**\" in output:\n            splits = output.split(\"**Worsened Student Solution:**\")\n        elif \"{worsened_student_solution}:\" in output:\n            splits = output.split(\"{worsened_student_solution}:\")\n        elif \"{worsened_student_solution}\" in 
output:\n            splits = output.split(\"{worsened_student_solution}\")\n        else:\n            splits = None\n\n        # Safety check when the output doesn't follow the expected format\n        if not splits:\n            return self._default_error()\n\n        if len(splits) >= 2:\n            revision = splits[1]\n            revision = revision.strip(\"\\n\\n\").strip()  # noqa: B005\n\n            rational = splits[0]\n            if \"{teacher_reasoning}\" in rational:\n                rational = rational.split(\"{teacher_reasoning}\")[1].strip(\":\").strip()\n            rational = rational.strip(\"\\n\\n\").strip()  # noqa: B005\n        else:\n            return self._default_error()\n        return {\"revision\": revision, \"rational\": rational}\n\n    def _default_error(self) -> Dict[str, None]:\n        return {\"revision\": None, \"rational\": None}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.CLAIR.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/clair.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    return [\n        {\"role\": \"system\", \"content\": self.system_prompt},\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(\n                task=input[\"task\"], student_solution=input[\"student_solution\"]\n            ),\n        },\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.CLAIR.format_output","title":"format_output(output, input)","text":"

The raw output is parsed into a dict with the revised text (revision) and the reasoning behind the revision (rational).

Parameters:

Name Type Description Default output Union[str, None]

the raw output of the LLM.

required input Dict[str, Any]

the input to the task. It is not used when parsing the output.

required

Returns:

Type Description Dict[str, Any]

A dict with the keys revision and rational. Both values are None when the output is None or does not follow the expected format.

Source code in src/distilabel/steps/tasks/clair.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a list with the score of each instruction-response pair.\n\n    Args:\n        output: the raw output of the LLM.\n        input: the input to the task. Used for obtaining the number of responses.\n\n    Returns:\n        A dict with the key `scores` containing the scores for each instruction-response pair.\n    \"\"\"\n    if output is None:\n        return self._default_error()\n\n    return self._format_output(output)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer","title":"ComplexityScorer","text":"

Bases: Task

Score instructions based on their complexity using an LLM.

ComplexityScorer is a pre-defined task used to rank a list of instructions based on their complexity. It's an implementation of the complexity score task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.

Attributes:

Name Type Description _template Union[Template, None]

a Jinja2 template used to format the input for the LLM.

Input columns
  • instructions (List[str]): The list of instructions to be scored.
Output columns
  • scores (List[float]): The score for each instruction.
  • model_name (str): The model name used to generate the scores.
Categories
  • scorer
  • complexity
  • instruction
References
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning

Examples:

Evaluate the complexity of your instructions:

from distilabel.steps.tasks import ComplexityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = ComplexityScorer(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    )\n)\n\nscorer.load()\n\nresult = next(\n    scorer.process(\n        [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n    )\n)\n# result\n# [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 5], 'distilabel_metadata': {'raw_output_complexity_scorer_0': 'output'}}]\n

Generate structured output with default schema:

from distilabel.steps.tasks import ComplexityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = ComplexityScorer(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    use_default_structured_output=use_default_structured_output\n)\n\nscorer.load()\n\nresult = next(\n    scorer.process(\n        [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n    )\n)\n# result\n# [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 2], 'distilabel_metadata': {'raw_output_complexity_scorer_0': '{ \\n  \"scores\": [\\n    1, \\n    2\\n  ]\\n}'}}]\n
Citations
@misc{liu2024makesgooddataalignment,\n    title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n    author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n    year={2024},\n    eprint={2312.15685},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2312.15685},\n}\n
Source code in src/distilabel/steps/tasks/complexity_scorer.py
class ComplexityScorer(Task):\n    \"\"\"Score instructions based on their complexity using an `LLM`.\n\n    `ComplexityScorer` is a pre-defined task used to rank a list of instructions based in\n    their complexity. It's an implementation of the complexity score task from the paper\n    'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection\n    in Instruction Tuning'.\n\n    Attributes:\n        _template: a Jinja2 template used to format the input for the LLM.\n\n    Input columns:\n        - instructions (`List[str]`): The list of instructions to be scored.\n\n    Output columns:\n        - scores (`List[float]`): The score for each instruction.\n        - model_name (`str`): The model name used to generate the scores.\n\n    Categories:\n        - scorer\n        - complexity\n        - instruction\n\n    References:\n        - [`What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685)\n\n    Examples:\n        Evaluate the complexity of your instructions:\n\n        ```python\n        from distilabel.steps.tasks import ComplexityScorer\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        scorer = ComplexityScorer(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            )\n        )\n\n        scorer.load()\n\n        result = next(\n            scorer.process(\n                [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n            )\n        )\n        # result\n        # [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 5], 'distilabel_metadata': {'raw_output_complexity_scorer_0': 'output'}}]\n        ```\n\n        Generate structured output with default schema:\n\n        ```python\n        from 
distilabel.steps.tasks import ComplexityScorer\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        scorer = ComplexityScorer(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            use_default_structured_output=use_default_structured_output\n        )\n\n        scorer.load()\n\n        result = next(\n            scorer.process(\n                [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n            )\n        )\n        # result\n        # [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 2], 'distilabel_metadata': {'raw_output_complexity_scorer_0': '{ \\\\n  \"scores\": [\\\\n    1, \\\\n    2\\\\n  ]\\\\n}'}}]\n        ```\n\n    Citations:\n        ```\n        @misc{liu2024makesgooddataalignment,\n            title={What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n            author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n            year={2024},\n            eprint={2312.15685},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2312.15685},\n        }\n        ```\n    \"\"\"\n\n    _template: Union[Template, None] = PrivateAttr(...)\n    _can_be_used_with_offline_batch_generation = True\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"complexity-scorer.jinja2\"\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The inputs for the task are the `instructions`.\"\"\"\n        return [\"instructions\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(instructions=input[\"instructions\"]),  # type: ignore\n            }\n        ]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task are: a list of `scores` containing the complexity score for each\n        instruction in `instructions`, and the `model_name`.\"\"\"\n        return [\"scores\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a list with the score of each instruction.\n\n        Args:\n            output: the raw output of the 
LLM.\n            input: the input to the task. Used for obtaining the number of responses.\n\n        Returns:\n            A dict with the key `scores` containing the scores for each instruction.\n        \"\"\"\n        if output is None:\n            return {\"scores\": [None] * len(input[\"instructions\"])}\n\n        if self.use_default_structured_output:\n            return self._format_structured_output(output, input)\n\n        scores = []\n        score_lines = output.split(\"\\n\")\n        for i, line in enumerate(score_lines):\n            match = _PARSE_SCORE_LINE_REGEX.match(line)\n            score = float(match.group(1)) if match else None\n            scores.append(score)\n            if i == len(input[\"instructions\"]) - 1:\n                break\n        return {\"scores\": scores}\n\n    @override\n    def get_structured_output(self) -> Dict[str, Any]:\n        \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n        a dictionary with the output which can be directly parsed as a python dictionary.\n\n        The schema corresponds to the following:\n\n        ```python\n        from pydantic import BaseModel\n        from typing import List\n\n        class SchemaComplexityScorer(BaseModel):\n            scores: List[int]\n        ```\n\n        Returns:\n            JSON Schema of the response to enforce.\n        \"\"\"\n        return {\n            \"properties\": {\n                \"scores\": {\n                    \"items\": {\"type\": \"integer\"},\n                    \"title\": \"Scores\",\n                    \"type\": \"array\",\n                }\n            },\n            \"required\": [\"scores\"],\n            \"title\": \"SchemaComplexityScorer\",\n            \"type\": \"object\",\n        }\n\n    def _format_structured_output(\n        self, output: str, input: Dict[str, Any]\n    ) -> Dict[str, str]:\n        \"\"\"Parses the structured response, which should correspond to a dictionary\n      
  with either `positive`, or `positive` and `negative` keys.\n\n        Args:\n            output: The output from the `LLM`.\n\n        Returns:\n            Formatted output.\n        \"\"\"\n        try:\n            return orjson.loads(output)\n        except orjson.JSONDecodeError:\n            return {\"scores\": [None] * len(input[\"instructions\"])}\n\n    @override\n    def _sample_input(self) -> \"ChatType\":\n        \"\"\"Returns a sample input to be used in the `print` method.\n        Tasks that don't adhere to a format input that returns a map of the type\n        str -> str should override this method to return a sample input.\n        \"\"\"\n        return self.format_input(\n            {\n                \"instructions\": [\n                    f\"<PLACEHOLDER_{f'GENERATION_{i}'.upper()}>\" for i in range(2)\n                ],\n            }\n        )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.inputs","title":"inputs: List[str] property","text":"

The inputs for the task are the instructions.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.outputs","title":"outputs: List[str] property","text":"

The outputs for the task are: a list of scores containing the complexity score for each instruction in instructions, and the model_name.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.load","title":"load()","text":"

Loads the Jinja2 template.

Source code in src/distilabel/steps/tasks/complexity_scorer.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"complexity-scorer.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/complexity_scorer.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    return [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(instructions=input[\"instructions\"]),  # type: ignore\n        }\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.format_output","title":"format_output(output, input)","text":"

The output is formatted as a list with the score of each instruction.

Parameters:

Name Type Description Default output Union[str, None]

the raw output of the LLM.

required input Dict[str, Any]

the input to the task. Used for obtaining the number of instructions.

required

Returns:

Type Description Dict[str, Any]

A dict with the key scores containing the scores for each instruction.

Source code in src/distilabel/steps/tasks/complexity_scorer.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a list with the score of each instruction.\n\n    Args:\n        output: the raw output of the LLM.\n        input: the input to the task. Used for obtaining the number of responses.\n\n    Returns:\n        A dict with the key `scores` containing the scores for each instruction.\n    \"\"\"\n    if output is None:\n        return {\"scores\": [None] * len(input[\"instructions\"])}\n\n    if self.use_default_structured_output:\n        return self._format_structured_output(output, input)\n\n    scores = []\n    score_lines = output.split(\"\\n\")\n    for i, line in enumerate(score_lines):\n        match = _PARSE_SCORE_LINE_REGEX.match(line)\n        score = float(match.group(1)) if match else None\n        scores.append(score)\n        if i == len(input[\"instructions\"]) - 1:\n            break\n    return {\"scores\": scores}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.get_structured_output","title":"get_structured_output()","text":"

Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary.

The schema corresponds to the following:

from pydantic import BaseModel\nfrom typing import List\n\nclass SchemaComplexityScorer(BaseModel):\n    scores: List[int]\n

Returns:

Type Description Dict[str, Any]

JSON Schema of the response to enforce.

Source code in src/distilabel/steps/tasks/complexity_scorer.py
@override\ndef get_structured_output(self) -> Dict[str, Any]:\n    \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n    a dictionary with the output which can be directly parsed as a python dictionary.\n\n    The schema corresponds to the following:\n\n    ```python\n    from pydantic import BaseModel\n    from typing import List\n\n    class SchemaComplexityScorer(BaseModel):\n        scores: List[int]\n    ```\n\n    Returns:\n        JSON Schema of the response to enforce.\n    \"\"\"\n    return {\n        \"properties\": {\n            \"scores\": {\n                \"items\": {\"type\": \"integer\"},\n                \"title\": \"Scores\",\n                \"type\": \"array\",\n            }\n        },\n        \"required\": [\"scores\"],\n        \"title\": \"SchemaComplexityScorer\",\n        \"type\": \"object\",\n    }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer._format_structured_output","title":"_format_structured_output(output, input)","text":"

Parses the structured response, which should correspond to a dictionary with a scores key. If the response cannot be decoded as JSON, a dictionary with one None score per instruction is returned instead.

Parameters:

Name Type Description Default output str

The output from the LLM.

required

Returns:

Type Description Dict[str, str]

Formatted output.

Source code in src/distilabel/steps/tasks/complexity_scorer.py
def _format_structured_output(\n    self, output: str, input: Dict[str, Any]\n) -> Dict[str, str]:\n    \"\"\"Parses the structured response, which should correspond to a dictionary\n    with either `positive`, or `positive` and `negative` keys.\n\n    Args:\n        output: The output from the `LLM`.\n\n    Returns:\n        Formatted output.\n    \"\"\"\n    try:\n        return orjson.loads(output)\n    except orjson.JSONDecodeError:\n        return {\"scores\": [None] * len(input[\"instructions\"])}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer._sample_input","title":"_sample_input()","text":"

Returns a sample input to be used in the print method. Tasks that don't adhere to a format input that returns a map of the type str -> str should override this method to return a sample input.

Source code in src/distilabel/steps/tasks/complexity_scorer.py
@override\ndef _sample_input(self) -> \"ChatType\":\n    \"\"\"Returns a sample input to be used in the `print` method.\n    Tasks that don't adhere to a format input that returns a map of the type\n    str -> str should override this method to return a sample input.\n    \"\"\"\n    return self.format_input(\n        {\n            \"instructions\": [\n                f\"<PLACEHOLDER_{f'GENERATION_{i}'.upper()}>\" for i in range(2)\n            ],\n        }\n    )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct","title":"EvolInstruct","text":"

Bases: Task

Evolve instructions using an LLM.

WizardLM: Empowering Large Language Models to Follow Complex Instructions

Attributes:

Name Type Description num_evolutions int

The number of evolutions to be performed.

store_evolutions bool

Whether to store all the evolutions or just the last one. Defaults to False.

generate_answers bool

Whether to generate answers for the evolved instructions. Defaults to False.

include_original_instruction bool

Whether to include the original instruction in the evolved_instructions output column. Defaults to False.

mutation_templates Dict[str, str]

The mutation templates to be used for evolving the instructions. Defaults to the ones provided in the utils.py file.

seed RuntimeParameter[int]

The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

Runtime parameters
  • seed: The seed to be set for numpy in order to randomly pick a mutation method.
Input columns
  • instruction (str): The instruction to evolve.
Output columns
  • evolved_instruction (str): The evolved instruction if store_evolutions=False.
  • evolved_instructions (List[str]): The evolved instructions if store_evolutions=True.
  • model_name (str): The name of the LLM used to evolve the instructions.
  • answer (str): The answer to the evolved instruction if generate_answers=True and store_evolutions=False.
  • answers (List[str]): The answers to the evolved instructions if generate_answers=True and store_evolutions=True.
Categories
  • evol
  • instruction
References
  • WizardLM: Empowering Large Language Models to Follow Complex Instructions
  • GitHub: h2oai/h2o-wizardlm

Examples:

Evolve an instruction using an LLM:

from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n

Keep the iterations of the evolutions:

from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n    store_evolutions=True,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [\n#     {\n#         'instruction': 'common instruction',\n#         'evolved_instructions': ['initial evolution', 'final evolution'],\n#         'model_name': 'model_name'\n#     }\n# ]\n

Generate answers for the instructions in a single step:

from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n    generate_answers=True,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [\n#     {\n#         'instruction': 'common instruction',\n#         'evolved_instruction': 'evolved instruction',\n#         'answer': 'answer to the instruction',\n#         'model_name': 'model_name'\n#     }\n# ]\n
Citations
@misc{xu2023wizardlmempoweringlargelanguage,\n    title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n    author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n    year={2023},\n    eprint={2304.12244},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2304.12244},\n}\n
Source code in src/distilabel/steps/tasks/evol_instruct/base.py
class EvolInstruct(Task):\n    \"\"\"Evolve instructions using an `LLM`.\n\n    WizardLM: Empowering Large Language Models to Follow Complex Instructions\n\n    Attributes:\n        num_evolutions: The number of evolutions to be performed.\n        store_evolutions: Whether to store all the evolutions or just the last one. Defaults\n            to `False`.\n        generate_answers: Whether to generate answers for the evolved instructions. Defaults\n            to `False`.\n        include_original_instruction: Whether to include the original instruction in the\n            `evolved_instructions` output column. Defaults to `False`.\n        mutation_templates: The mutation templates to be used for evolving the instructions.\n            Defaults to the ones provided in the `utils.py` file.\n        seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n            Defaults to `42`.\n\n    Runtime parameters:\n        - `seed`: The seed to be set for `numpy` in order to randomly pick a mutation method.\n\n    Input columns:\n        - instruction (`str`): The instruction to evolve.\n\n    Output columns:\n        - evolved_instruction (`str`): The evolved instruction if `store_evolutions=False`.\n        - evolved_instructions (`List[str]`): The evolved instructions if `store_evolutions=True`.\n        - model_name (`str`): The name of the LLM used to evolve the instructions.\n        - answer (`str`): The answer to the evolved instruction if `generate_answers=True`\n            and `store_evolutions=False`.\n        - answers (`List[str]`): The answers to the evolved instructions if `generate_answers=True`\n            and `store_evolutions=True`.\n\n    Categories:\n        - evol\n        - instruction\n\n    References:\n        - [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244)\n        - [GitHub: h2oai/h2o-wizardlm](https://github.com/h2oai/h2o-wizardlm)\n\n    Examples:\n 
       Evolve an instruction using an LLM:\n\n        ```python\n        from distilabel.steps.tasks import EvolInstruct\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        evol_instruct = EvolInstruct(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            num_evolutions=2,\n        )\n\n        evol_instruct.load()\n\n        result = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n        # result\n        # [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n        ```\n\n        Keep the iterations of the evolutions:\n\n        ```python\n        from distilabel.steps.tasks import EvolInstruct\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        evol_instruct = EvolInstruct(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            num_evolutions=2,\n            store_evolutions=True,\n        )\n\n        evol_instruct.load()\n\n        result = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n        # result\n        # [\n        #     {\n        #         'instruction': 'common instruction',\n        #         'evolved_instructions': ['initial evolution', 'final evolution'],\n        #         'model_name': 'model_name'\n        #     }\n        # ]\n        ```\n\n        Generate answers for the instructions in a single step:\n\n        ```python\n        from distilabel.steps.tasks import EvolInstruct\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        evol_instruct = EvolInstruct(\n            llm=InferenceEndpointsLLM(\n                
model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            num_evolutions=2,\n            generate_answers=True,\n        )\n\n        evol_instruct.load()\n\n        result = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n        # result\n        # [\n        #     {\n        #         'instruction': 'common instruction',\n        #         'evolved_instruction': 'evolved instruction',\n        #         'answer': 'answer to the instruction',\n        #         'model_name': 'model_name'\n        #     }\n        # ]\n        ```\n\n    Citations:\n        ```\n        @misc{xu2023wizardlmempoweringlargelanguage,\n            title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n            author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n            year={2023},\n            eprint={2304.12244},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2304.12244},\n        }\n        ```\n    \"\"\"\n\n    num_evolutions: int\n    store_evolutions: bool = False\n    generate_answers: bool = False\n    include_original_instruction: bool = False\n    mutation_templates: Dict[str, str] = MUTATION_TEMPLATES\n\n    seed: RuntimeParameter[int] = Field(\n        default=42,\n        description=\"As `numpy` is being used in order to randomly pick a mutation method, then is nice to seed a random seed.\",\n    )\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task is the `instruction`.\"\"\"\n        return [\"instruction\"]\n\n    def format_input(self, input: str) -> ChatType:  # type: ignore\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation. 
And the\n        `system_prompt` is added as the first message if it exists.\"\"\"\n        return [{\"role\": \"user\", \"content\": input}]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task are the `evolved_instruction/s`, the `answer` if `generate_answers=True`\n        and the `model_name`.\"\"\"\n        # TODO: having to define a `model_name` column every time as the `Task.outputs` is not ideal,\n        # this could be handled always and the value could be included within the DAG validation when\n        # a `Task` is used, since all the `Task` subclasses will have an `llm` with a `model_name` attr.\n        _outputs = [\n            (\n                \"evolved_instruction\"\n                if not self.store_evolutions\n                else \"evolved_instructions\"\n            ),\n            \"model_name\",\n        ]\n        if self.generate_answers:\n            _outputs.append(\"answer\" if not self.store_evolutions else \"answers\")\n        return _outputs\n\n    @override\n    def format_output(  # type: ignore\n        self, instructions: Union[str, List[str]], answers: Optional[List[str]] = None\n    ) -> Dict[str, Any]:  # type: ignore\n        \"\"\"The output for the task is a dict with: `evolved_instruction` or `evolved_instructions`,\n        depending whether the value is either `False` or `True` for `store_evolutions`, respectively;\n        `answer` if `generate_answers=True`; and, finally, the `model_name`.\n\n        Args:\n            instructions: The instructions to be included within the output.\n            answers: The answers to be included within the output if `generate_answers=True`.\n\n        Returns:\n            If `store_evolutions=False` and `generate_answers=True` return {\"evolved_instruction\": ..., \"model_name\": ..., \"answer\": ...};\n            if `store_evolutions=True` and `generate_answers=True` return {\"evolved_instructions\": ..., \"model_name\": ..., \"answer\": 
...};\n            if `store_evolutions=False` and `generate_answers=False` return {\"evolved_instruction\": ..., \"model_name\": ...};\n            if `store_evolutions=True` and `generate_answers=False` return {\"evolved_instructions\": ..., \"model_name\": ...}.\n        \"\"\"\n        _output = {}\n        if not self.store_evolutions:\n            _output[\"evolved_instruction\"] = instructions[-1]\n        else:\n            _output[\"evolved_instructions\"] = instructions\n\n        if self.generate_answers and answers:\n            if not self.store_evolutions:\n                _output[\"answer\"] = answers[-1]\n            else:\n                _output[\"answers\"] = answers\n\n        _output[\"model_name\"] = self.llm.model_name\n        return _output\n\n    @property\n    def mutation_templates_names(self) -> List[str]:\n        \"\"\"Returns the names i.e. keys of the provided `mutation_templates`.\"\"\"\n        return list(self.mutation_templates.keys())\n\n    def _apply_random_mutation(self, instruction: str) -> str:\n        \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n        enum, and returns the provided instruction within the mutation prompt.\n\n        Args:\n            instruction: The instruction to be included within the mutation prompt.\n\n        Returns:\n            A random mutation prompt with the provided instruction.\n        \"\"\"\n        mutation = np.random.choice(self.mutation_templates_names)\n        return self.mutation_templates[mutation].replace(\"<PROMPT>\", instruction)  # type: ignore\n\n    def _evolve_instructions(self, inputs: \"StepInput\") -> List[List[str]]:\n        \"\"\"Evolves the instructions provided as part of the inputs of the task.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Returns:\n            A list where each item is a list with either the last evolved instruction if\n            
`store_evolutions=False` or all the evolved instructions if `store_evolutions=True`.\n        \"\"\"\n\n        instructions: List[List[str]] = [[input[\"instruction\"]] for input in inputs]\n\n        for iter_no in range(self.num_evolutions):\n            formatted_prompts = []\n            for instruction in instructions:\n                formatted_prompts.append(self._apply_random_mutation(instruction[-1]))\n\n            formatted_prompts = [\n                self.format_input(prompt) for prompt in formatted_prompts\n            ]\n            generated_prompts = flatten_responses(\n                self.llm.generate(\n                    formatted_prompts,\n                    **self.llm.generation_kwargs,  # type: ignore\n                )\n            )\n\n            evolved_instructions = []\n            for generated_prompt in generated_prompts:\n                generated_prompt = generated_prompt.split(\"Prompt#:\")[-1].strip()\n                evolved_instructions.append(generated_prompt)\n\n            if self.store_evolutions:\n                instructions = [\n                    instruction + [evolved_instruction]\n                    for instruction, evolved_instruction in zip(\n                        instructions, evolved_instructions\n                    )\n                ]\n            else:\n                instructions = [\n                    [evolved_instruction]\n                    for evolved_instruction in evolved_instructions\n                ]\n\n            self._logger.info(\n                f\"\ud83d\udd04 Ran iteration {iter_no} evolving {len(instructions)} instructions!\"\n            )\n\n        return instructions\n\n    def _generate_answers(\n        self, evolved_instructions: List[List[str]]\n    ) -> List[List[str]]:\n        \"\"\"Generates the answer for the instructions in `instructions`.\n\n        Args:\n            evolved_instructions: A list of lists where each item is a list with either the last\n                
evolved instruction if `store_evolutions=False` or all the evolved instructions\n                if `store_evolutions=True`.\n\n        Returns:\n            A list of answers for each instruction.\n        \"\"\"\n        formatted_instructions = [\n            self.format_input(instruction)\n            for instructions in evolved_instructions\n            for instruction in instructions\n        ]\n\n        responses = self.llm.generate(\n            formatted_instructions,\n            num_generations=1,\n            **self.llm.generation_kwargs,  # type: ignore\n        )\n\n        step = (\n            self.num_evolutions\n            if not self.include_original_instruction\n            else self.num_evolutions + 1\n        )\n        return [\n            flatten_responses(responses[i : i + step])\n            for i in range(0, len(responses), step)\n        ]\n\n    @override\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Yields:\n            A list of Python dictionaries with the outputs of the task.\n        \"\"\"\n\n        evolved_instructions = self._evolve_instructions(inputs)\n\n        if self.store_evolutions:\n            # Remove the input instruction from the `evolved_instructions` list\n            from_ = 1 if not self.include_original_instruction else 0\n            evolved_instructions = [\n                instruction[from_:] for instruction in evolved_instructions\n            ]\n\n        if not self.generate_answers:\n            for input, instruction in zip(inputs, evolved_instructions):\n                input.update(self.format_output(instruction))\n            yield inputs\n\n        self._logger.info(\n            f\"\ud83c\udf89 Finished evolving {len(evolved_instructions)} instructions!\"\n        )\n\n   
     if self.generate_answers:\n            self._logger.info(\n                f\"\ud83e\udde0 Generating answers for the {len(evolved_instructions)} evolved instructions!\"\n            )\n\n            answers = self._generate_answers(evolved_instructions)\n\n            self._logger.info(\n                f\"\ud83c\udf89 Finished generating answers for the {len(evolved_instructions)} evolved\"\n                \" instructions!\"\n            )\n\n            for idx, (input, instruction) in enumerate(\n                zip(inputs, evolved_instructions)\n            ):\n                input.update(self.format_output(instruction, answers[idx]))\n            yield inputs\n\n    @override\n    def _sample_input(self) -> ChatType:\n        return self.format_input(\n            self._apply_random_mutation(\"<PLACEHOLDER_INSTRUCTION>\")\n        )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.inputs","title":"inputs: List[str] property","text":"

The input for the task is the instruction.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.outputs","title":"outputs: List[str] property","text":"

The outputs for the task are the evolved_instruction/s, the answer/s if generate_answers=True, and the model_name.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.mutation_templates_names","title":"mutation_templates_names: List[str] property","text":"

Returns the names i.e. keys of the provided mutation_templates.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType containing a single user message, assuming that the instruction is the first interaction from the user within a conversation. No system_prompt message is added by this method.

Source code in src/distilabel/steps/tasks/evol_instruct/base.py
def format_input(self, input: str) -> ChatType:  # type: ignore\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation. And the\n    `system_prompt` is added as the first message if it exists.\"\"\"\n    return [{\"role\": \"user\", \"content\": input}]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.format_output","title":"format_output(instructions, answers=None)","text":"

The output for the task is a dict with: evolved_instruction or evolved_instructions, depending on whether store_evolutions is False or True, respectively; answer (or answers when store_evolutions=True) if generate_answers=True; and, finally, the model_name.

Parameters:

Name Type Description Default instructions Union[str, List[str]]

The instructions to be included within the output.

required answers Optional[List[str]]

The answers to be included within the output if generate_answers=True.

None

Returns:

Type Description Dict[str, Any]

If store_evolutions=False and generate_answers=True return {\"evolved_instruction\": ..., \"model_name\": ..., \"answer\": ...};

Dict[str, Any]

if store_evolutions=True and generate_answers=True return {\"evolved_instructions\": ..., \"model_name\": ..., \"answers\": ...};

Dict[str, Any]

if store_evolutions=False and generate_answers=False return {\"evolved_instruction\": ..., \"model_name\": ...};

Dict[str, Any]

if store_evolutions=True and generate_answers=False return {\"evolved_instructions\": ..., \"model_name\": ...}.

Source code in src/distilabel/steps/tasks/evol_instruct/base.py
@override\ndef format_output(  # type: ignore\n    self, instructions: Union[str, List[str]], answers: Optional[List[str]] = None\n) -> Dict[str, Any]:  # type: ignore\n    \"\"\"The output for the task is a dict with: `evolved_instruction` or `evolved_instructions`,\n    depending whether the value is either `False` or `True` for `store_evolutions`, respectively;\n    `answer` if `generate_answers=True`; and, finally, the `model_name`.\n\n    Args:\n        instructions: The instructions to be included within the output.\n        answers: The answers to be included within the output if `generate_answers=True`.\n\n    Returns:\n        If `store_evolutions=False` and `generate_answers=True` return {\"evolved_instruction\": ..., \"model_name\": ..., \"answer\": ...};\n        if `store_evolutions=True` and `generate_answers=True` return {\"evolved_instructions\": ..., \"model_name\": ..., \"answer\": ...};\n        if `store_evolutions=False` and `generate_answers=False` return {\"evolved_instruction\": ..., \"model_name\": ...};\n        if `store_evolutions=True` and `generate_answers=False` return {\"evolved_instructions\": ..., \"model_name\": ...}.\n    \"\"\"\n    _output = {}\n    if not self.store_evolutions:\n        _output[\"evolved_instruction\"] = instructions[-1]\n    else:\n        _output[\"evolved_instructions\"] = instructions\n\n    if self.generate_answers and answers:\n        if not self.store_evolutions:\n            _output[\"answer\"] = answers[-1]\n        else:\n            _output[\"answers\"] = answers\n\n    _output[\"model_name\"] = self.llm.model_name\n    return _output\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct._apply_random_mutation","title":"_apply_random_mutation(instruction)","text":"

Applies a random mutation from the ones provided as part of the mutation_templates dictionary, and returns the provided instruction within the mutation prompt.

Parameters:

Name Type Description Default instruction str

The instruction to be included within the mutation prompt.

required

Returns:

Type Description str

A random mutation prompt with the provided instruction.

Source code in src/distilabel/steps/tasks/evol_instruct/base.py
def _apply_random_mutation(self, instruction: str) -> str:\n    \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n    enum, and returns the provided instruction within the mutation prompt.\n\n    Args:\n        instruction: The instruction to be included within the mutation prompt.\n\n    Returns:\n        A random mutation prompt with the provided instruction.\n    \"\"\"\n    mutation = np.random.choice(self.mutation_templates_names)\n    return self.mutation_templates[mutation].replace(\"<PROMPT>\", instruction)  # type: ignore\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct._evolve_instructions","title":"_evolve_instructions(inputs)","text":"

Evolves the instructions provided as part of the inputs of the task.

Parameters:

Name Type Description Default inputs StepInput

A list of Python dictionaries with the inputs of the task.

required

Returns:

Type Description List[List[str]]

A list where each item is a list with either the last evolved instruction if

List[List[str]]

store_evolutions=False or all the evolved instructions if store_evolutions=True.

Source code in src/distilabel/steps/tasks/evol_instruct/base.py
def _evolve_instructions(self, inputs: \"StepInput\") -> List[List[str]]:\n    \"\"\"Evolves the instructions provided as part of the inputs of the task.\n\n    Args:\n        inputs: A list of Python dictionaries with the inputs of the task.\n\n    Returns:\n        A list where each item is a list with either the last evolved instruction if\n        `store_evolutions=False` or all the evolved instructions if `store_evolutions=True`.\n    \"\"\"\n\n    instructions: List[List[str]] = [[input[\"instruction\"]] for input in inputs]\n\n    for iter_no in range(self.num_evolutions):\n        formatted_prompts = []\n        for instruction in instructions:\n            formatted_prompts.append(self._apply_random_mutation(instruction[-1]))\n\n        formatted_prompts = [\n            self.format_input(prompt) for prompt in formatted_prompts\n        ]\n        generated_prompts = flatten_responses(\n            self.llm.generate(\n                formatted_prompts,\n                **self.llm.generation_kwargs,  # type: ignore\n            )\n        )\n\n        evolved_instructions = []\n        for generated_prompt in generated_prompts:\n            generated_prompt = generated_prompt.split(\"Prompt#:\")[-1].strip()\n            evolved_instructions.append(generated_prompt)\n\n        if self.store_evolutions:\n            instructions = [\n                instruction + [evolved_instruction]\n                for instruction, evolved_instruction in zip(\n                    instructions, evolved_instructions\n                )\n            ]\n        else:\n            instructions = [\n                [evolved_instruction]\n                for evolved_instruction in evolved_instructions\n            ]\n\n        self._logger.info(\n            f\"\ud83d\udd04 Ran iteration {iter_no} evolving {len(instructions)} instructions!\"\n        )\n\n    return instructions\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct._generate_answers","title":"_generate_answers(evolved_instructions)","text":"

Generates the answers for the instructions in evolved_instructions.

Parameters:

Name Type Description Default evolved_instructions List[List[str]]

A list of lists where each item is a list with either the last evolved instruction if store_evolutions=False or all the evolved instructions if store_evolutions=True.

required

Returns:

Type Description List[List[str]]

A list of answers for each instruction.

Source code in src/distilabel/steps/tasks/evol_instruct/base.py
def _generate_answers(\n    self, evolved_instructions: List[List[str]]\n) -> List[List[str]]:\n    \"\"\"Generates the answer for the instructions in `instructions`.\n\n    Args:\n        evolved_instructions: A list of lists where each item is a list with either the last\n            evolved instruction if `store_evolutions=False` or all the evolved instructions\n            if `store_evolutions=True`.\n\n    Returns:\n        A list of answers for each instruction.\n    \"\"\"\n    formatted_instructions = [\n        self.format_input(instruction)\n        for instructions in evolved_instructions\n        for instruction in instructions\n    ]\n\n    responses = self.llm.generate(\n        formatted_instructions,\n        num_generations=1,\n        **self.llm.generation_kwargs,  # type: ignore\n    )\n\n    step = (\n        self.num_evolutions\n        if not self.include_original_instruction\n        else self.num_evolutions + 1\n    )\n    return [\n        flatten_responses(responses[i : i + step])\n        for i in range(0, len(responses), step)\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.process","title":"process(inputs)","text":"

Processes the inputs of the task and generates the outputs using the LLM.

Parameters:

Name Type Description Default inputs StepInput

A list of Python dictionaries with the inputs of the task.

required

Yields:

Type Description StepOutput

A list of Python dictionaries with the outputs of the task.

Source code in src/distilabel/steps/tasks/evol_instruct/base.py
@override\ndef process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n    Args:\n        inputs: A list of Python dictionaries with the inputs of the task.\n\n    Yields:\n        A list of Python dictionaries with the outputs of the task.\n    \"\"\"\n\n    evolved_instructions = self._evolve_instructions(inputs)\n\n    if self.store_evolutions:\n        # Remove the input instruction from the `evolved_instructions` list\n        from_ = 1 if not self.include_original_instruction else 0\n        evolved_instructions = [\n            instruction[from_:] for instruction in evolved_instructions\n        ]\n\n    if not self.generate_answers:\n        for input, instruction in zip(inputs, evolved_instructions):\n            input.update(self.format_output(instruction))\n        yield inputs\n\n    self._logger.info(\n        f\"\ud83c\udf89 Finished evolving {len(evolved_instructions)} instructions!\"\n    )\n\n    if self.generate_answers:\n        self._logger.info(\n            f\"\ud83e\udde0 Generating answers for the {len(evolved_instructions)} evolved instructions!\"\n        )\n\n        answers = self._generate_answers(evolved_instructions)\n\n        self._logger.info(\n            f\"\ud83c\udf89 Finished generating answers for the {len(evolved_instructions)} evolved\"\n            \" instructions!\"\n        )\n\n        for idx, (input, instruction) in enumerate(\n            zip(inputs, evolved_instructions)\n        ):\n            input.update(self.format_output(instruction, answers[idx]))\n        yield inputs\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolComplexity","title":"EvolComplexity","text":"

Bases: EvolInstruct

Evolve instructions to make them more complex using an LLM.

EvolComplexity is a task that evolves instructions to make them more complex, and it is based on the EvolInstruct task, using slightly different prompts, but the exact same evolutionary approach.

Attributes:

Name Type Description num_evolutions

The number of evolutions to be performed.

generate_answers

Whether to generate answers for the instructions or not. Defaults to False.

mutation_templates Dict[str, str]

The mutation templates to be used for the generation of the instructions.

min_length RuntimeParameter[int]

Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512.

max_length RuntimeParameter[int]

Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024.

seed RuntimeParameter[int]

The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

Runtime parameters
  • min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.
  • max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.
  • seed: The seed to be set for numpy in order to randomly pick a mutation method.
Input columns
  • instruction (str): The instruction to evolve.
Output columns
  • evolved_instruction (str): The evolved instruction.
  • answer (str, optional): The answer to the instruction if generate_answers=True.
  • model_name (str): The name of the LLM used to evolve the instructions.
Categories
  • evol
  • instruction
  • deita
References
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
  • WizardLM: Empowering Large Language Models to Follow Complex Instructions

Examples:

Evolve an instruction using an LLM:

from distilabel.steps.tasks import EvolComplexity\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_complexity = EvolComplexity(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n)\n\nevol_complexity.load()\n\nresult = next(evol_complexity.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n
Citations
@misc{liu2024makesgooddataalignment,\n    title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n    author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n    year={2024},\n    eprint={2312.15685},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2312.15685},\n}\n
@misc{xu2023wizardlmempoweringlargelanguage,\n    title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n    author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n    year={2023},\n    eprint={2304.12244},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2304.12244},\n}\n
Source code in src/distilabel/steps/tasks/evol_instruct/evol_complexity/base.py
class EvolComplexity(EvolInstruct):\n    \"\"\"Evolve instructions to make them more complex using an `LLM`.\n\n    `EvolComplexity` is a task that evolves instructions to make them more complex,\n    and it is based in the EvolInstruct task, using slight different prompts, but the\n    exact same evolutionary approach.\n\n    Attributes:\n        num_instructions: The number of instructions to be generated.\n        generate_answers: Whether to generate answers for the instructions or not. Defaults\n            to `False`.\n        mutation_templates: The mutation templates to be used for the generation of the\n            instructions.\n        min_length: Defines the length (in bytes) that the generated instruction needs to\n            be higher than, to be considered valid. Defaults to `512`.\n        max_length: Defines the length (in bytes) that the generated instruction needs to\n            be lower than, to be considered valid. Defaults to `1024`.\n        seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n            Defaults to `42`.\n\n    Runtime parameters:\n        - `min_length`: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.\n        - `max_length`: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.\n        - `seed`: The number of evolutions to be run.\n\n    Input columns:\n        - instruction (`str`): The instruction to evolve.\n\n    Output columns:\n        - evolved_instruction (`str`): The evolved instruction.\n        - answer (`str`, optional): The answer to the instruction if `generate_answers=True`.\n        - model_name (`str`): The name of the LLM used to evolve the instructions.\n\n    Categories:\n        - evol\n        - instruction\n        - deita\n\n    References:\n        - [What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning](https://arxiv.org/abs/2312.15685)\n        - [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244)\n\n    Examples:\n        Evolve an instruction using an LLM:\n\n        ```python\n        from distilabel.steps.tasks import EvolComplexity\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        evol_complexity = EvolComplexity(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            num_evolutions=2,\n        )\n\n        evol_complexity.load()\n\n        result = next(evol_complexity.process([{\"instruction\": \"common instruction\"}]))\n        # result\n        # [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n        ```\n\n    Citations:\n        ```\n        @misc{liu2024makesgooddataalignment,\n            title={What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n            author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n            year={2024},\n            eprint={2312.15685},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2312.15685},\n        }\n        ```\n\n        ```\n        @misc{xu2023wizardlmempoweringlargelanguage,\n            title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n            author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n            year={2023},\n            eprint={2304.12244},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2304.12244},\n        }\n        ```\n    \"\"\"\n\n    mutation_templates: Dict[str, str] = MUTATION_TEMPLATES\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolComplexityGenerator","title":"EvolComplexityGenerator","text":"

Bases: EvolInstructGenerator

Generate evolved instructions with increased complexity using an LLM.

EvolComplexityGenerator is a generation task that evolves instructions to make them more complex, and it is based on the EvolInstruct task, using slightly different prompts but the exact same evolutionary approach.

Attributes:

Name Type Description num_instructions

The number of instructions to be generated.

generate_answers

Whether to generate answers for the instructions or not. Defaults to False.

mutation_templates Dict[str, str]

The mutation templates to be used for the generation of the instructions.

min_length RuntimeParameter[int]

Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512.

max_length RuntimeParameter[int]

Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024.

seed RuntimeParameter[int]

The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

Runtime parameters
  • min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.
  • max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.
  • seed: The seed to be set for numpy in order to randomly pick a mutation method.
Output columns
  • instruction (str): The evolved instruction.
  • answer (str, optional): The answer to the instruction if generate_answers=True.
  • model_name (str): The name of the LLM used to evolve the instructions.
Categories
  • evol
  • instruction
  • generation
  • deita
References
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
  • WizardLM: Empowering Large Language Models to Follow Complex Instructions

Examples:

Generate evolved instructions without initial instructions:

from distilabel.steps.tasks import EvolComplexityGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_complexity_generator = EvolComplexityGenerator(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_instructions=2,\n)\n\nevol_complexity_generator.load()\n\nresult = next(evol_complexity_generator.process())\n# result\n# [{'instruction': 'generated instruction', 'model_name': 'test'}]\n
Citations
@misc{liu2024makesgooddataalignment,\n    title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n    author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n    year={2024},\n    eprint={2312.15685},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2312.15685},\n}\n
@misc{xu2023wizardlmempoweringlargelanguage,\n    title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n    author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n    year={2023},\n    eprint={2304.12244},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2304.12244},\n}\n
Source code in src/distilabel/steps/tasks/evol_instruct/evol_complexity/generator.py
class EvolComplexityGenerator(EvolInstructGenerator):\n    \"\"\"Generate evolved instructions with increased complexity using an `LLM`.\n\n    `EvolComplexityGenerator` is a generation task that evolves instructions to make\n    them more complex, and it is based in the EvolInstruct task, but using slight different\n    prompts, but the exact same evolutionary approach.\n\n    Attributes:\n        num_instructions: The number of instructions to be generated.\n        generate_answers: Whether to generate answers for the instructions or not. Defaults\n            to `False`.\n        mutation_templates: The mutation templates to be used for the generation of the\n            instructions.\n        min_length: Defines the length (in bytes) that the generated instruction needs to\n            be higher than, to be considered valid. Defaults to `512`.\n        max_length: Defines the length (in bytes) that the generated instruction needs to\n            be lower than, to be considered valid. Defaults to `1024`.\n        seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n            Defaults to `42`.\n\n    Runtime parameters:\n        - `min_length`: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.\n        - `max_length`: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.\n        - `seed`: The number of evolutions to be run.\n\n    Output columns:\n        - instruction (`str`): The evolved instruction.\n        - answer (`str`, optional): The answer to the instruction if `generate_answers=True`.\n        - model_name (`str`): The name of the LLM used to evolve the instructions.\n\n    Categories:\n        - evol\n        - instruction\n        - generation\n        - deita\n\n    References:\n        - [What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning](https://arxiv.org/abs/2312.15685)\n        - [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244)\n\n    Examples:\n        Generate evolved instructions without initial instructions:\n\n        ```python\n        from distilabel.steps.tasks import EvolComplexityGenerator\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        evol_complexity_generator = EvolComplexityGenerator(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            num_instructions=2,\n        )\n\n        evol_complexity_generator.load()\n\n        result = next(scorer.process())\n        # result\n        # [{'instruction': 'generated instruction', 'model_name': 'test'}]\n        ```\n\n    Citations:\n        ```\n        @misc{liu2024makesgooddataalignment,\n            title={What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n            author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n            year={2024},\n            eprint={2312.15685},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2312.15685},\n        }\n        ```\n\n        ```\n        @misc{xu2023wizardlmempoweringlargelanguage,\n            title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n            author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n            year={2023},\n            eprint={2304.12244},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2304.12244},\n        }\n        ```\n    \"\"\"\n\n    mutation_templates: Dict[str, str] = GENERATION_MUTATION_TEMPLATES\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator","title":"EvolInstructGenerator","text":"

Bases: GeneratorTask

Generate evolved instructions using an LLM.

WizardLM: Empowering Large Language Models to Follow Complex Instructions

Attributes:

Name Type Description num_instructions int

The number of instructions to be generated.

generate_answers bool

Whether to generate answers for the instructions or not. Defaults to False.

mutation_templates Dict[str, str]

The mutation templates to be used for the generation of the instructions.

min_length RuntimeParameter[int]

Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512.

max_length RuntimeParameter[int]

Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024.

seed RuntimeParameter[int]

The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

Runtime parameters
  • min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.
  • max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.
  • seed: The seed to be set for numpy in order to randomly pick a mutation method.
Output columns
  • instruction (str): The generated instruction if generate_answers=False.
  • answer (str): The generated answer if generate_answers=True.
  • instructions (List[str]): The generated instructions if generate_answers=True.
  • model_name (str): The name of the LLM used to generate and evolve the instructions.
Categories
  • evol
  • instruction
  • generation
References
  • WizardLM: Empowering Large Language Models to Follow Complex Instructions
  • GitHub: h2oai/h2o-wizardlm

Examples:

Generate evolved instructions without initial instructions:

from distilabel.steps.tasks import EvolInstructGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct_generator = EvolInstructGenerator(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_instructions=2,\n)\n\nevol_instruct_generator.load()\n\nresult = next(evol_instruct_generator.process())\n# result\n# [{'instruction': 'generated instruction', 'model_name': 'test'}]\n
Citations
@misc{xu2023wizardlmempoweringlargelanguage,\n    title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n    author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n    year={2023},\n    eprint={2304.12244},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2304.12244},\n}\n
Source code in src/distilabel/steps/tasks/evol_instruct/generator.py
class EvolInstructGenerator(GeneratorTask):\n    \"\"\"Generate evolved instructions using an `LLM`.\n\n    WizardLM: Empowering Large Language Models to Follow Complex Instructions\n\n    Attributes:\n        num_instructions: The number of instructions to be generated.\n        generate_answers: Whether to generate answers for the instructions or not. Defaults\n            to `False`.\n        mutation_templates: The mutation templates to be used for the generation of the\n            instructions.\n        min_length: Defines the length (in bytes) that the generated instruction needs to\n            be higher than, to be considered valid. Defaults to `512`.\n        max_length: Defines the length (in bytes) that the generated instruction needs to\n            be lower than, to be considered valid. Defaults to `1024`.\n        seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n            Defaults to `42`.\n\n    Runtime parameters:\n        - `min_length`: Defines the length (in bytes) that the generated instruction needs\n            to be higher than, to be considered valid.\n        - `max_length`: Defines the length (in bytes) that the generated instruction needs\n            to be lower than, to be considered valid.\n        - `seed`: The seed to be set for `numpy` in order to randomly pick a mutation method.\n\n    Output columns:\n        - instruction (`str`): The generated instruction if `generate_answers=False`.\n        - answer (`str`): The generated answer if `generate_answers=True`.\n        - instructions (`List[str]`): The generated instructions if `generate_answers=True`.\n        - model_name (`str`): The name of the LLM used to generate and evolve the instructions.\n\n    Categories:\n        - evol\n        - instruction\n        - generation\n\n    References:\n        - [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244)\n        - [GitHub: 
h2oai/h2o-wizardlm](https://github.com/h2oai/h2o-wizardlm)\n\n    Examples:\n        Generate evolved instructions without initial instructions:\n\n        ```python\n        from distilabel.steps.tasks import EvolInstructGenerator\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        evol_instruct_generator = EvolInstructGenerator(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            num_instructions=2,\n        )\n\n        evol_instruct_generator.load()\n\n        result = next(scorer.process())\n        # result\n        # [{'instruction': 'generated instruction', 'model_name': 'test'}]\n        ```\n\n    Citations:\n        ```\n        @misc{xu2023wizardlmempoweringlargelanguage,\n            title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n            author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n            year={2023},\n            eprint={2304.12244},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2304.12244},\n        }\n        ```\n    \"\"\"\n\n    num_instructions: int\n    generate_answers: bool = False\n    mutation_templates: Dict[str, str] = GENERATION_MUTATION_TEMPLATES\n\n    min_length: RuntimeParameter[int] = Field(\n        default=512,\n        description=\"Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.\",\n    )\n    max_length: RuntimeParameter[int] = Field(\n        default=1024,\n        description=\"Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.\",\n    )\n\n    seed: RuntimeParameter[int] = Field(\n        default=42,\n        description=\"As `numpy` is being used in 
order to randomly pick a mutation method, then is nice to seed a random seed.\",\n    )\n    _seed_texts: Optional[List[str]] = PrivateAttr(default_factory=list)\n    _prompts: Optional[List[str]] = PrivateAttr(default_factory=list)\n\n    def _generate_seed_texts(self) -> List[str]:\n        \"\"\"Generates a list of seed texts to be used as part of the starting prompts for the task.\n\n        It will use the `FRESH_START` mutation template, as it needs to generate text from scratch; and\n        a list of English words will be used to generate the seed texts that will be provided to the\n        mutation method and included within the prompt.\n\n        Returns:\n            A list of seed texts to be used as part of the starting prompts for the task.\n        \"\"\"\n        seed_texts = []\n        for _ in range(self.num_instructions * 10):\n            num_words = np.random.choice([1, 2, 3, 4])\n            seed_texts.append(\n                self.mutation_templates[\"FRESH_START\"].replace(  # type: ignore\n                    \"<PROMPT>\",\n                    \", \".join(\n                        [\n                            np.random.choice(self._english_nouns).strip()\n                            for _ in range(num_words)\n                        ]\n                    ),\n                )\n            )\n        return seed_texts\n\n    @override\n    def model_post_init(self, __context: Any) -> None:\n        \"\"\"Override this method to perform additional initialization after `__init__` and `model_construct`.\n        This is useful if you want to do some validation that requires the entire model to be initialized.\n        \"\"\"\n        super().model_post_init(__context)\n\n        np.random.seed(self.seed)\n\n        self._seed_texts = self._generate_seed_texts()\n        self._prompts = [\n            np.random.choice(self._seed_texts) for _ in range(self.num_instructions)\n        ]\n\n    @cached_property\n    def _english_nouns(self) -> 
List[str]:\n        \"\"\"A list of English nouns to be used as part of the starting prompts for the task.\n\n        References:\n            - https://github.com/h2oai/h2o-wizardlm\n        \"\"\"\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps/tasks/evol_instruct/english_nouns.txt\"\n        )\n        with open(_path, mode=\"r\") as f:\n            return [line.strip() for line in f.readlines()]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task are the `instruction`, the `answer` if `generate_answers=True`\n        and the `model_name`.\"\"\"\n        _outputs = [\"instruction\", \"model_name\"]\n        if self.generate_answers:\n            _outputs.append(\"answer\")\n        return _outputs\n\n    def format_output(  # type: ignore\n        self, instruction: str, answer: Optional[str] = None\n    ) -> Dict[str, Any]:\n        \"\"\"The output for the task is a dict with: `instruction`; `answer` if `generate_answers=True`;\n        and, finally, the `model_name`.\n\n        Args:\n            instruction: The instruction to be included within the output.\n            answer: The answer to be included within the output if `generate_answers=True`.\n\n        Returns:\n            If `generate_answers=True` return {\"instruction\": ..., \"answer\": ..., \"model_name\": ...};\n            if `generate_answers=False` return {\"instruction\": ..., \"model_name\": ...};\n        \"\"\"\n        _output = {\n            \"instruction\": instruction,\n            \"model_name\": self.llm.model_name,\n        }\n        if self.generate_answers and answer is not None:\n            _output[\"answer\"] = answer\n        return _output\n\n    @property\n    def mutation_templates_names(self) -> List[str]:\n        \"\"\"Returns the names i.e. 
keys of the provided `mutation_templates`.\"\"\"\n        return list(self.mutation_templates.keys())\n\n    def _apply_random_mutation(self, iter_no: int) -> List[\"ChatType\"]:\n        \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n        enum, and returns the provided instruction within the mutation prompt.\n\n        Args:\n            iter_no: The iteration number to be used to check whether the iteration is the\n                first one i.e. FRESH_START, or not.\n\n        Returns:\n            A random mutation prompt with the provided instruction formatted as an OpenAI conversation.\n        \"\"\"\n        prompts = []\n        for idx in range(self.num_instructions):\n            if (\n                iter_no == 0\n                or \"Write one question or request containing\" in self._prompts[idx]  # type: ignore\n            ):\n                mutation = \"FRESH_START\"\n            else:\n                mutation = np.random.choice(self.mutation_templates_names)\n                if mutation == \"FRESH_START\":\n                    self._prompts[idx] = np.random.choice(self._seed_texts)  # type: ignore\n\n            prompt_with_template = (\n                self.mutation_templates[mutation].replace(  # type: ignore\n                    \"<PROMPT>\",\n                    self._prompts[idx],  # type: ignore\n                )  # type: ignore\n                if iter_no != 0\n                else self._prompts[idx]  # type: ignore\n            )\n            prompts.append([{\"role\": \"user\", \"content\": prompt_with_template}])\n        return prompts\n\n    def _generate_answers(self, instructions: List[List[str]]) -> List[str]:\n        \"\"\"Generates the answer for the last instruction in `instructions`.\n\n        Args:\n            instructions: A list of lists where each item is a list with either the last\n                evolved instruction if `store_evolutions=False` or all the evolved 
instructions\n                if `store_evolutions=True`.\n\n        Returns:\n            A list of answers for the last instruction in `instructions`.\n        \"\"\"\n        # TODO: update to generate answers for all the instructions\n        _formatted_instructions = [\n            [{\"role\": \"user\", \"content\": instruction[-1]}]\n            for instruction in instructions\n        ]\n        responses = self.llm.generate(\n            _formatted_instructions,\n            **self.llm.generation_kwargs,  # type: ignore\n        )\n        return flatten_responses(responses)\n\n    @override\n    def process(self, offset: int = 0) -> \"GeneratorStepOutput\":  # type: ignore\n        \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n        Args:\n            offset: The offset to start the generation from. Defaults to 0.\n\n        Yields:\n            A list of Python dictionaries with the outputs of the task, and a boolean\n            flag indicating whether the task has finished or not i.e. 
is the last batch.\n        \"\"\"\n        instructions = []\n        mutation_no = 0\n\n        # TODO: update to take into account `offset`\n        iter_no = 0\n        while len(instructions) < self.num_instructions:\n            prompts = self._apply_random_mutation(iter_no=iter_no)\n\n            generated_prompts = flatten_responses(\n                self.llm.generate(prompts, **self.llm.generation_kwargs)  # type: ignore\n            )\n            for idx, generated_prompt in enumerate(generated_prompts):\n                generated_prompt = generated_prompt.split(\"Prompt#:\")[-1].strip()\n                if self.max_length >= len(generated_prompt) >= self.min_length:  # type: ignore\n                    instructions.append(generated_prompt)\n                    self._prompts[idx] = np.random.choice(self._seed_texts)  # type: ignore\n                else:\n                    self._prompts[idx] = generated_prompt  # type: ignore\n\n            self._logger.info(\n                f\"\ud83d\udd04 Ran iteration {iter_no} with {len(instructions)} instructions already evolved!\"\n            )\n            iter_no += 1\n\n            if len(instructions) > self.num_instructions:\n                instructions = instructions[: self.num_instructions]\n            if len(instructions) > mutation_no:\n                mutation_no = len(instructions) - mutation_no\n\n            if not self.generate_answers and len(instructions[-mutation_no:]) > 0:\n                yield (\n                    [\n                        self.format_output(mutated_instruction)\n                        for mutated_instruction in instructions[-mutation_no:]\n                    ],\n                    len(instructions) >= self.num_instructions,\n                )\n\n        self._logger.info(f\"\ud83c\udf89 Finished evolving {len(instructions)} instructions!\")\n\n        if self.generate_answers:\n            self._logger.info(\n                f\"\ud83e\udde0 Generating answers for 
the {len(instructions)} evolved instructions!\"\n            )\n\n            answers = self._generate_answers(instructions)\n\n            self._logger.info(\n                f\"\ud83c\udf89 Finished generating answers for the {len(instructions)} evolved instructions!\"\n            )\n\n            yield (\n                [\n                    self.format_output(instruction, answer)\n                    for instruction, answer in zip(instructions, answers)\n                ],\n                True,\n            )\n\n    @override\n    def _sample_input(self) -> \"ChatType\":\n        return self._apply_random_mutation(iter_no=0)[0]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator._english_nouns","title":"_english_nouns: List[str] cached property","text":"

A list of English nouns to be used as part of the starting prompts for the task.

References
  • https://github.com/h2oai/h2o-wizardlm
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.outputs","title":"outputs: List[str] property","text":"

The outputs for the task are the instruction, the model_name, and the answer if generate_answers=True.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.mutation_templates_names","title":"mutation_templates_names: List[str] property","text":"

Returns the names i.e. keys of the provided mutation_templates.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator._generate_seed_texts","title":"_generate_seed_texts()","text":"

Generates a list of seed texts to be used as part of the starting prompts for the task.

It will use the FRESH_START mutation template, as it needs to generate text from scratch; and a list of English words will be used to generate the seed texts that will be provided to the mutation method and included within the prompt.

Returns:

Type Description List[str]

A list of seed texts to be used as part of the starting prompts for the task.

Source code in src/distilabel/steps/tasks/evol_instruct/generator.py
def _generate_seed_texts(self) -> List[str]:\n    \"\"\"Generates a list of seed texts to be used as part of the starting prompts for the task.\n\n    It will use the `FRESH_START` mutation template, as it needs to generate text from scratch; and\n    a list of English words will be used to generate the seed texts that will be provided to the\n    mutation method and included within the prompt.\n\n    Returns:\n        A list of seed texts to be used as part of the starting prompts for the task.\n    \"\"\"\n    seed_texts = []\n    for _ in range(self.num_instructions * 10):\n        num_words = np.random.choice([1, 2, 3, 4])\n        seed_texts.append(\n            self.mutation_templates[\"FRESH_START\"].replace(  # type: ignore\n                \"<PROMPT>\",\n                \", \".join(\n                    [\n                        np.random.choice(self._english_nouns).strip()\n                        for _ in range(num_words)\n                    ]\n                ),\n            )\n        )\n    return seed_texts\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.model_post_init","title":"model_post_init(__context)","text":"

Performs additional initialization after __init__ and model_construct, once the entire model has been initialized: it calls the parent model_post_init, seeds numpy's global random state with seed, generates the seed texts, and randomly draws num_instructions initial prompts from them.

Source code in src/distilabel/steps/tasks/evol_instruct/generator.py
@override\ndef model_post_init(self, __context: Any) -> None:\n    \"\"\"Override this method to perform additional initialization after `__init__` and `model_construct`.\n    This is useful if you want to do some validation that requires the entire model to be initialized.\n    \"\"\"\n    super().model_post_init(__context)\n\n    np.random.seed(self.seed)\n\n    self._seed_texts = self._generate_seed_texts()\n    self._prompts = [\n        np.random.choice(self._seed_texts) for _ in range(self.num_instructions)\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.format_output","title":"format_output(instruction, answer=None)","text":"

The output for the task is a dict with: instruction; answer if generate_answers=True; and, finally, the model_name.

Parameters:

Name Type Description Default instruction str

The instruction to be included within the output.

required answer Optional[str]

The answer to be included within the output if generate_answers=True.

None

Returns:

Type Description Dict[str, Any]

If generate_answers=True return {\"instruction\": ..., \"answer\": ..., \"model_name\": ...};

Dict[str, Any]

if generate_answers=False return {\"instruction\": ..., \"model_name\": ...};

Source code in src/distilabel/steps/tasks/evol_instruct/generator.py
def format_output(  # type: ignore\n    self, instruction: str, answer: Optional[str] = None\n) -> Dict[str, Any]:\n    \"\"\"The output for the task is a dict with: `instruction`; `answer` if `generate_answers=True`;\n    and, finally, the `model_name`.\n\n    Args:\n        instruction: The instruction to be included within the output.\n        answer: The answer to be included within the output if `generate_answers=True`.\n\n    Returns:\n        If `generate_answers=True` return {\"instruction\": ..., \"answer\": ..., \"model_name\": ...};\n        if `generate_answers=False` return {\"instruction\": ..., \"model_name\": ...};\n    \"\"\"\n    _output = {\n        \"instruction\": instruction,\n        \"model_name\": self.llm.model_name,\n    }\n    if self.generate_answers and answer is not None:\n        _output[\"answer\"] = answer\n    return _output\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator._apply_random_mutation","title":"_apply_random_mutation(iter_no)","text":"

Applies a random mutation, picked from the ones provided in the mutation_templates dictionary, to each of the stored prompts, and returns the resulting mutation prompts.

Parameters:

Name Type Description Default iter_no int

The iteration number to be used to check whether the iteration is the first one i.e. FRESH_START, or not.

required

Returns:

Type Description List[ChatType]

A list with one random mutation prompt per instruction, each formatted as an OpenAI conversation.

Source code in src/distilabel/steps/tasks/evol_instruct/generator.py
def _apply_random_mutation(self, iter_no: int) -> List[\"ChatType\"]:\n    \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n    enum, and returns the provided instruction within the mutation prompt.\n\n    Args:\n        iter_no: The iteration number to be used to check whether the iteration is the\n            first one i.e. FRESH_START, or not.\n\n    Returns:\n        A random mutation prompt with the provided instruction formatted as an OpenAI conversation.\n    \"\"\"\n    prompts = []\n    for idx in range(self.num_instructions):\n        if (\n            iter_no == 0\n            or \"Write one question or request containing\" in self._prompts[idx]  # type: ignore\n        ):\n            mutation = \"FRESH_START\"\n        else:\n            mutation = np.random.choice(self.mutation_templates_names)\n            if mutation == \"FRESH_START\":\n                self._prompts[idx] = np.random.choice(self._seed_texts)  # type: ignore\n\n        prompt_with_template = (\n            self.mutation_templates[mutation].replace(  # type: ignore\n                \"<PROMPT>\",\n                self._prompts[idx],  # type: ignore\n            )  # type: ignore\n            if iter_no != 0\n            else self._prompts[idx]  # type: ignore\n        )\n        prompts.append([{\"role\": \"user\", \"content\": prompt_with_template}])\n    return prompts\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator._generate_answers","title":"_generate_answers(instructions)","text":"

Generates the answer for the last instruction in instructions.

Parameters:

Name Type Description Default instructions List[List[str]]

A list of lists where each item contains the evolved instructions for one entry; only the last element of each item (instruction[-1]) is used to generate the answer.

required

Returns:

Type Description List[str]

A list of answers for the last instruction in instructions.

Source code in src/distilabel/steps/tasks/evol_instruct/generator.py
def _generate_answers(self, instructions: List[List[str]]) -> List[str]:\n    \"\"\"Generates the answer for the last instruction in `instructions`.\n\n    Args:\n        instructions: A list of lists where each item is a list with either the last\n            evolved instruction if `store_evolutions=False` or all the evolved instructions\n            if `store_evolutions=True`.\n\n    Returns:\n        A list of answers for the last instruction in `instructions`.\n    \"\"\"\n    # TODO: update to generate answers for all the instructions\n    _formatted_instructions = [\n        [{\"role\": \"user\", \"content\": instruction[-1]}]\n        for instruction in instructions\n    ]\n    responses = self.llm.generate(\n        _formatted_instructions,\n        **self.llm.generation_kwargs,  # type: ignore\n    )\n    return flatten_responses(responses)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.process","title":"process(offset=0)","text":"

Processes the inputs of the task and generates the outputs using the LLM.

Parameters:

Name Type Description Default offset int

The offset to start the generation from. Defaults to 0. Note: the current implementation does not take offset into account yet.

0

Yields:

Type Description GeneratorStepOutput

A list of Python dictionaries with the outputs of the task, and a boolean

GeneratorStepOutput

flag indicating whether the task has finished or not i.e. is the last batch.

Source code in src/distilabel/steps/tasks/evol_instruct/generator.py
@override\ndef process(self, offset: int = 0) -> \"GeneratorStepOutput\":  # type: ignore\n    \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n    Args:\n        offset: The offset to start the generation from. Defaults to 0.\n\n    Yields:\n        A list of Python dictionaries with the outputs of the task, and a boolean\n        flag indicating whether the task has finished or not i.e. is the last batch.\n    \"\"\"\n    instructions = []\n    mutation_no = 0\n\n    # TODO: update to take into account `offset`\n    iter_no = 0\n    while len(instructions) < self.num_instructions:\n        prompts = self._apply_random_mutation(iter_no=iter_no)\n\n        generated_prompts = flatten_responses(\n            self.llm.generate(prompts, **self.llm.generation_kwargs)  # type: ignore\n        )\n        for idx, generated_prompt in enumerate(generated_prompts):\n            generated_prompt = generated_prompt.split(\"Prompt#:\")[-1].strip()\n            if self.max_length >= len(generated_prompt) >= self.min_length:  # type: ignore\n                instructions.append(generated_prompt)\n                self._prompts[idx] = np.random.choice(self._seed_texts)  # type: ignore\n            else:\n                self._prompts[idx] = generated_prompt  # type: ignore\n\n        self._logger.info(\n            f\"\ud83d\udd04 Ran iteration {iter_no} with {len(instructions)} instructions already evolved!\"\n        )\n        iter_no += 1\n\n        if len(instructions) > self.num_instructions:\n            instructions = instructions[: self.num_instructions]\n        if len(instructions) > mutation_no:\n            mutation_no = len(instructions) - mutation_no\n\n        if not self.generate_answers and len(instructions[-mutation_no:]) > 0:\n            yield (\n                [\n                    self.format_output(mutated_instruction)\n                    for mutated_instruction in instructions[-mutation_no:]\n                ],\n         
       len(instructions) >= self.num_instructions,\n            )\n\n    self._logger.info(f\"\ud83c\udf89 Finished evolving {len(instructions)} instructions!\")\n\n    if self.generate_answers:\n        self._logger.info(\n            f\"\ud83e\udde0 Generating answers for the {len(instructions)} evolved instructions!\"\n        )\n\n        answers = self._generate_answers(instructions)\n\n        self._logger.info(\n            f\"\ud83c\udf89 Finished generating answers for the {len(instructions)} evolved instructions!\"\n        )\n\n        yield (\n            [\n                self.format_output(instruction, answer)\n                for instruction, answer in zip(instructions, answers)\n            ],\n            True,\n        )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality","title":"EvolQuality","text":"

Bases: Task

Evolve the quality of the responses using an LLM.

EvolQuality task is used to evolve the quality of the responses given a prompt, by generating a new response with a language model. This step implements the evolution quality task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.

Attributes:

Name Type Description num_evolutions int

The number of evolutions to be performed on the responses.

store_evolutions bool

Whether to store all the evolved responses or just the last one. Defaults to False.

include_original_response bool

Whether to include the original response within the evolved responses. Defaults to False.

mutation_templates Dict[str, str]

The mutation templates to be used to evolve the responses.

seed RuntimeParameter[int]

The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

Runtime parameters
  • seed: The seed to be set for numpy in order to randomly pick a mutation method.
Input columns
  • instruction (str): The instruction that was used to generate the responses.
  • response (str): The response to be rewritten.
Output columns
  • evolved_response (str): The evolved response if store_evolutions=False.
  • evolved_responses (List[str]): The evolved responses if store_evolutions=True.
  • model_name (str): The name of the LLM used to evolve the responses.
Categories
  • evol
  • response
  • deita
References
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning

Examples:

Evolve the quality of the responses given a prompt:

from distilabel.steps.tasks import EvolQuality\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_quality = EvolQuality(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n)\n\nevol_quality.load()\n\nresult = next(\n    evol_quality.process(\n        [\n            {\"instruction\": \"common instruction\", \"response\": \"a response\"},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'common instruction',\n#         'response': 'a response',\n#         'evolved_response': 'evolved response',\n#         'model_name': '\"mistralai/Mistral-7B-Instruct-v0.2\"'\n#     }\n# ]\n
Citations
@misc{liu2024makesgooddataalignment,\n    title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n    author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n    year={2024},\n    eprint={2312.15685},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2312.15685},\n}\n
Source code in src/distilabel/steps/tasks/evol_quality/base.py
class EvolQuality(Task):\n    \"\"\"Evolve the quality of the responses using an `LLM`.\n\n    `EvolQuality` task is used to evolve the quality of the responses given a prompt,\n    by generating a new response with a language model. This step implements the evolution\n    quality task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of\n    Automatic Data Selection in Instruction Tuning'.\n\n    Attributes:\n        num_evolutions: The number of evolutions to be performed on the responses.\n        store_evolutions: Whether to store all the evolved responses or just the last one.\n            Defaults to `False`.\n        include_original_response: Whether to include the original response within the evolved\n            responses. Defaults to `False`.\n        mutation_templates: The mutation templates to be used to evolve the responses.\n        seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n            Defaults to `42`.\n\n    Runtime parameters:\n        - `seed`: The seed to be set for `numpy` in order to randomly pick a mutation method.\n\n    Input columns:\n        - instruction (`str`): The instruction that was used to generate the `responses`.\n        - response (`str`): The responses to be rewritten.\n\n    Output columns:\n        - evolved_response (`str`): The evolved response if `store_evolutions=False`.\n        - evolved_responses (`List[str]`): The evolved responses if `store_evolutions=True`.\n        - model_name (`str`): The name of the LLM used to evolve the responses.\n\n    Categories:\n        - evol\n        - response\n        - deita\n\n    References:\n        - [`What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685)\n\n    Examples:\n        Evolve the quality of the responses given a prompt:\n\n        ```python\n        from distilabel.steps.tasks import EvolQuality\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        evol_quality = EvolQuality(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            num_evolutions=2,\n        )\n\n        evol_quality.load()\n\n        result = next(\n            evol_quality.process(\n                [\n                    {\"instruction\": \"common instruction\", \"response\": \"a response\"},\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'instruction': 'common instruction',\n        #         'response': 'a response',\n        #         'evolved_response': 'evolved response',\n        #         'model_name': '\"mistralai/Mistral-7B-Instruct-v0.2\"'\n        #     }\n        # ]\n        ```\n\n    Citations:\n        ```\n        @misc{liu2024makesgooddataalignment,\n            title={What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n            author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n            year={2024},\n            eprint={2312.15685},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2312.15685},\n        }\n        ```\n    \"\"\"\n\n    num_evolutions: int\n    store_evolutions: bool = False\n    include_original_response: bool = False\n    mutation_templates: Dict[str, str] = MUTATION_TEMPLATES\n\n    seed: RuntimeParameter[int] = Field(\n        default=42,\n        description=\"As `numpy` is being used in order to randomly pick a mutation method, then is nice to set a random seed.\",\n    )\n\n    @override\n    def model_post_init(self, __context: Any) -> None:\n        \"\"\"Override this method to perform additional initialization after `__init__` and `model_construct`.\n        This is useful if you want to do some validation that requires the entire model to be initialized.\n        \"\"\"\n        super().model_post_init(__context)\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task are the `instruction` and `response`.\"\"\"\n        return [\"instruction\", \"response\"]\n\n    def format_input(self, input: str) -> ChatType:  # type: ignore\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation. 
And the\n        `system_prompt` is added as the first message if it exists.\"\"\"\n        return [{\"role\": \"user\", \"content\": input}]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task are the `evolved_response/s` and the `model_name`.\"\"\"\n        # TODO: having to define a `model_name` column every time as the `Task.outputs` is not ideal,\n        # this could be handled always and the value could be included within the DAG validation when\n        # a `Task` is used, since all the `Task` subclasses will have an `llm` with a `model_name` attr.\n        _outputs = [\n            (\"evolved_response\" if not self.store_evolutions else \"evolved_responses\"),\n            \"model_name\",\n        ]\n\n        return _outputs\n\n    def format_output(self, responses: Union[str, List[str]]) -> Dict[str, Any]:  # type: ignore\n        \"\"\"The output for the task is a dict with: `evolved_response` or `evolved_responses`,\n        depending whether the value is either `False` or `True` for `store_evolutions`, respectively;\n        and, finally, the `model_name`.\n\n        Args:\n            responses: The responses to be included within the output.\n\n        Returns:\n            if `store_evolutions=False` return {\"evolved_response\": ..., \"model_name\": ...};\n            if `store_evolutions=True` return {\"evolved_responses\": ..., \"model_name\": ...}.\n        \"\"\"\n        _output = {}\n\n        if not self.store_evolutions:\n            _output[\"evolved_response\"] = responses[-1]\n        else:\n            _output[\"evolved_responses\"] = responses\n\n        _output[\"model_name\"] = self.llm.model_name\n        return _output\n\n    @property\n    def mutation_templates_names(self) -> List[str]:\n        \"\"\"Returns the names i.e. 
keys of the provided `mutation_templates` enum.\"\"\"\n        return list(self.mutation_templates.keys())\n\n    def _apply_random_mutation(self, instruction: str, response: str) -> str:\n        \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n        enum, and returns the provided instruction within the mutation prompt.\n\n        Args:\n            instruction: The instruction to be included within the mutation prompt.\n\n        Returns:\n            A random mutation prompt with the provided instruction.\n        \"\"\"\n        mutation = np.random.choice(self.mutation_templates_names)\n        return (\n            self.mutation_templates[mutation]\n            .replace(\"<PROMPT>\", instruction)\n            .replace(\"<RESPONSE>\", response)\n        )\n\n    def _evolve_reponses(self, inputs: \"StepInput\") -> List[List[str]]:\n        \"\"\"Evolves the instructions provided as part of the inputs of the task.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Returns:\n            A list where each item is a list with either the last evolved instruction if\n            `store_evolutions=False` or all the evolved instructions if `store_evolutions=True`.\n        \"\"\"\n        np.random.seed(self.seed)\n        instructions: List[List[str]] = [[input[\"instruction\"]] for input in inputs]\n        responses: List[List[str]] = [[input[\"response\"]] for input in inputs]\n\n        for iter_no in range(self.num_evolutions):\n            formatted_prompts = []\n            for instruction, response in zip(instructions, responses):\n                formatted_prompts.append(\n                    self._apply_random_mutation(instruction[-1], response[-1])\n                )\n\n            formatted_prompts = [\n                self.format_input(prompt) for prompt in formatted_prompts\n            ]\n\n            generated_responses = self.llm.generate(\n         
       formatted_prompts,\n                **self.llm.generation_kwargs,  # type: ignore\n            )\n\n            if self.store_evolutions:\n                responses = [\n                    response + [evolved_response[0]]\n                    for response, evolved_response in zip(\n                        responses, generated_responses\n                    )\n                ]\n            else:\n                responses = [\n                    [evolved_response[0]] for evolved_response in generated_responses\n                ]\n\n            self._logger.info(\n                f\"\ud83d\udd04 Ran iteration {iter_no} evolving {len(responses)} responses!\"\n            )\n\n        return responses\n\n    @override\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Returns:\n            A list of Python dictionaries with the outputs of the task.\n        \"\"\"\n\n        responses = self._evolve_reponses(inputs)\n\n        if self.store_evolutions:\n            # Remove the input instruction from the `evolved_responses` list\n            from_ = 1 if not self.include_original_response else 0\n            responses = [response[from_:] for response in responses]\n\n        for input, response in zip(inputs, responses):\n            input.update(self.format_output(response))\n        yield inputs\n\n        self._logger.info(f\"\ud83c\udf89 Finished evolving {len(responses)} instructions!\")\n\n    @override\n    def _sample_input(self) -> ChatType:\n        return self.format_input(\"<PLACEHOLDER_INSTRUCTION>\")\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.inputs","title":"inputs: List[str] property","text":"

The inputs for the task are the instruction and response.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.outputs","title":"outputs: List[str] property","text":"

The outputs for the task are the evolved_response/s and the model_name.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.mutation_templates_names","title":"mutation_templates_names: List[str] property","text":"

Returns the names, i.e. the keys, of the provided mutation_templates dictionary.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.model_post_init","title":"model_post_init(__context)","text":"

Override this method to perform additional initialization after __init__ and model_construct. This is useful if you want to do some validation that requires the entire model to be initialized.

Source code in src/distilabel/steps/tasks/evol_quality/base.py
@override\ndef model_post_init(self, __context: Any) -> None:\n    \"\"\"Override this method to perform additional initialization after `__init__` and `model_construct`.\n    This is useful if you want to do some validation that requires the entire model to be initialized.\n    \"\"\"\n    super().model_post_init(__context)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType, assuming that the instruction is the first interaction from the user within a conversation. Note that the returned conversation contains only that single user message; no system_prompt message is added by this method.

Source code in src/distilabel/steps/tasks/evol_quality/base.py
def format_input(self, input: str) -> ChatType:  # type: ignore\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation. And the\n    `system_prompt` is added as the first message if it exists.\"\"\"\n    return [{\"role\": \"user\", \"content\": input}]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.format_output","title":"format_output(responses)","text":"

The output for the task is a dict with: evolved_response or evolved_responses, depending on whether store_evolutions is False or True, respectively; and, finally, the model_name.

Parameters:

Name Type Description Default responses Union[str, List[str]]

The responses to be included within the output.

required

Returns:

Type Description Dict[str, Any]

if store_evolutions=False return {\"evolved_response\": ..., \"model_name\": ...};

Dict[str, Any]

if store_evolutions=True return {\"evolved_responses\": ..., \"model_name\": ...}.

Source code in src/distilabel/steps/tasks/evol_quality/base.py
def format_output(self, responses: Union[str, List[str]]) -> Dict[str, Any]:  # type: ignore\n    \"\"\"The output for the task is a dict with: `evolved_response` or `evolved_responses`,\n    depending whether the value is either `False` or `True` for `store_evolutions`, respectively;\n    and, finally, the `model_name`.\n\n    Args:\n        responses: The responses to be included within the output.\n\n    Returns:\n        if `store_evolutions=False` return {\"evolved_response\": ..., \"model_name\": ...};\n        if `store_evolutions=True` return {\"evolved_responses\": ..., \"model_name\": ...}.\n    \"\"\"\n    _output = {}\n\n    if not self.store_evolutions:\n        _output[\"evolved_response\"] = responses[-1]\n    else:\n        _output[\"evolved_responses\"] = responses\n\n    _output[\"model_name\"] = self.llm.model_name\n    return _output\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality._apply_random_mutation","title":"_apply_random_mutation(instruction, response)","text":"

Applies a random mutation from the ones provided as part of the mutation_templates dictionary, and returns the mutation prompt with the provided instruction and response filled in.

Parameters:

Name Type Description Default instruction str

The instruction to be included within the mutation prompt.

required

Returns:

Type Description str

A random mutation prompt with the provided instruction and response.

Source code in src/distilabel/steps/tasks/evol_quality/base.py
def _apply_random_mutation(self, instruction: str, response: str) -> str:\n    \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n    enum, and returns the provided instruction within the mutation prompt.\n\n    Args:\n        instruction: The instruction to be included within the mutation prompt.\n\n    Returns:\n        A random mutation prompt with the provided instruction.\n    \"\"\"\n    mutation = np.random.choice(self.mutation_templates_names)\n    return (\n        self.mutation_templates[mutation]\n        .replace(\"<PROMPT>\", instruction)\n        .replace(\"<RESPONSE>\", response)\n    )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality._evolve_reponses","title":"_evolve_reponses(inputs)","text":"

Evolves the responses provided as part of the inputs of the task.

Parameters:

Name Type Description Default inputs StepInput

A list of Python dictionaries with the inputs of the task.

required

Returns:

Type Description List[List[str]]

A list where each item is a list with either the last evolved response if

List[List[str]]

store_evolutions=False or all the evolved responses if store_evolutions=True.

Source code in src/distilabel/steps/tasks/evol_quality/base.py
def _evolve_reponses(self, inputs: \"StepInput\") -> List[List[str]]:\n    \"\"\"Evolves the instructions provided as part of the inputs of the task.\n\n    Args:\n        inputs: A list of Python dictionaries with the inputs of the task.\n\n    Returns:\n        A list where each item is a list with either the last evolved instruction if\n        `store_evolutions=False` or all the evolved instructions if `store_evolutions=True`.\n    \"\"\"\n    np.random.seed(self.seed)\n    instructions: List[List[str]] = [[input[\"instruction\"]] for input in inputs]\n    responses: List[List[str]] = [[input[\"response\"]] for input in inputs]\n\n    for iter_no in range(self.num_evolutions):\n        formatted_prompts = []\n        for instruction, response in zip(instructions, responses):\n            formatted_prompts.append(\n                self._apply_random_mutation(instruction[-1], response[-1])\n            )\n\n        formatted_prompts = [\n            self.format_input(prompt) for prompt in formatted_prompts\n        ]\n\n        generated_responses = self.llm.generate(\n            formatted_prompts,\n            **self.llm.generation_kwargs,  # type: ignore\n        )\n\n        if self.store_evolutions:\n            responses = [\n                response + [evolved_response[0]]\n                for response, evolved_response in zip(\n                    responses, generated_responses\n                )\n            ]\n        else:\n            responses = [\n                [evolved_response[0]] for evolved_response in generated_responses\n            ]\n\n        self._logger.info(\n            f\"\ud83d\udd04 Ran iteration {iter_no} evolving {len(responses)} responses!\"\n        )\n\n    return responses\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.process","title":"process(inputs)","text":"

Processes the inputs of the task and generates the outputs using the LLM.

Parameters:

Name Type Description Default inputs StepInput

A list of Python dictionaries with the inputs of the task.

required

Returns:

Type Description StepOutput

A list of Python dictionaries with the outputs of the task.

Source code in src/distilabel/steps/tasks/evol_quality/base.py
@override\ndef process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n    Args:\n        inputs: A list of Python dictionaries with the inputs of the task.\n\n    Returns:\n        A list of Python dictionaries with the outputs of the task.\n    \"\"\"\n\n    responses = self._evolve_reponses(inputs)\n\n    if self.store_evolutions:\n        # Remove the input instruction from the `evolved_responses` list\n        from_ = 1 if not self.include_original_response else 0\n        responses = [response[from_:] for response in responses]\n\n    for input, response in zip(inputs, responses):\n        input.update(self.format_output(response))\n    yield inputs\n\n    self._logger.info(f\"\ud83c\udf89 Finished evolving {len(responses)} instructions!\")\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings","title":"GenerateEmbeddings","text":"

Bases: Step

Generate embeddings using the last hidden state of an LLM.

Generate embeddings for a text input using the last hidden state of an LLM, as described in the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.

Attributes:

Name Type Description llm LLM

The LLM to use to generate the embeddings.

Input columns
  • text (str, List[Dict[str, str]]): The input text or conversation to generate embeddings for.
Output columns
  • embedding (List[float]): The embedding of the input text or conversation.
  • model_name (str): The model name used to generate the embeddings.
Categories
  • embedding
  • llm
References
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning

Examples:

Generate embeddings for a text input:

from distilabel.steps.tasks import GenerateEmbeddings\nfrom distilabel.models.llms.huggingface import TransformersLLM\n\n# Consider this as a placeholder for your actual LLM.\nembedder = GenerateEmbeddings(\n    llm=TransformersLLM(\n        model=\"TaylorAI/bge-micro-v2\",\n        model_kwargs={\"is_decoder\": True},\n        cuda_devices=[],\n    )\n)\nembedder.load()\n\nresult = next(\n    embedder.process(\n        [\n            {\"text\": \"Hello, how are you?\"},\n        ]\n    )\n)\n
Citations
@misc{liu2024makesgooddataalignment,\n    title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n    author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n    year={2024},\n    eprint={2312.15685},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2312.15685},\n}\n
Source code in src/distilabel/steps/tasks/generate_embeddings.py
class GenerateEmbeddings(Step):\n    \"\"\"Generate embeddings using the last hidden state of an `LLM`.\n\n    Generate embeddings for a text input using the last hidden state of an `LLM`, as\n    described in the paper 'What Makes Good Data for Alignment? A Comprehensive Study of\n    Automatic Data Selection in Instruction Tuning'.\n\n    Attributes:\n        llm: The `LLM` to use to generate the embeddings.\n\n    Input columns:\n        - text (`str`, `List[Dict[str, str]]`): The input text or conversation to generate\n            embeddings for.\n\n    Output columns:\n        - embedding (`List[float]`): The embedding of the input text or conversation.\n        - model_name (`str`): The model name used to generate the embeddings.\n\n    Categories:\n        - embedding\n        - llm\n\n    References:\n        - [What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning](https://arxiv.org/abs/2312.15685)\n\n    Examples:\n        Rank LLM candidates:\n\n        ```python\n        from distilabel.steps.tasks import GenerateEmbeddings\n        from distilabel.models.llms.huggingface import TransformersLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        embedder = GenerateEmbeddings(\n            llm=TransformersLLM(\n                model=\"TaylorAI/bge-micro-v2\",\n                model_kwargs={\"is_decoder\": True},\n                cuda_devices=[],\n            )\n        )\n        embedder.load()\n\n        result = next(\n            embedder.process(\n                [\n                    {\"text\": \"Hello, how are you?\"},\n                ]\n            )\n        )\n        ```\n\n    Citations:\n        ```\n        @misc{liu2024makesgooddataalignment,\n            title={What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n            author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n            year={2024},\n            eprint={2312.15685},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2312.15685},\n        }\n        ```\n    \"\"\"\n\n    llm: LLM\n\n    def load(self) -> None:\n        \"\"\"Loads the `LLM` used to generate the embeddings.\"\"\"\n        super().load()\n\n        self.llm.load()\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The inputs for the task is a `text` column containing either a string or a\n        list of dictionaries in OpenAI chat-like format.\"\"\"\n        return [\"text\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The outputs for the task is an `embedding` column containing the embedding of\n        the `text` input.\"\"\"\n        return [\"embedding\", \"model_name\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"Formats the input to be used by the LLM to generate the embeddings. The input\n        can be in `ChatType` format or a string. If a string, it will be converted to a\n        list of dictionaries in OpenAI chat-like format.\n\n        Args:\n            input: The input to format.\n\n        Returns:\n            The OpenAI chat-like format of the input.\n        \"\"\"\n        text = input[\"text\"] = input[\"text\"]\n\n        # input is in `ChatType` format\n        if isinstance(text, str):\n            return [{\"role\": \"user\", \"content\": text}]\n\n        if is_openai_format(text):\n            return text\n\n        raise DistilabelUserError(\n            f\"Couldn't format input for step {self.name}. 
The `text` input column has to\"\n            \" be a string or a list of dictionaries in OpenAI chat-like format.\",\n            page=\"components-gallery/tasks/generateembeddings/\",\n        )\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Generates an embedding for each input using the last hidden state of the `LLM`.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Yields:\n            A list of Python dictionaries with the outputs of the task.\n        \"\"\"\n        formatted_inputs = [self.format_input(input) for input in inputs]\n        last_hidden_states = self.llm.get_last_hidden_states(formatted_inputs)\n        for input, hidden_state in zip(inputs, last_hidden_states):\n            input[\"embedding\"] = hidden_state[-1].tolist()\n            input[\"model_name\"] = self.llm.model_name\n        yield inputs\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.inputs","title":"inputs: StepColumns property","text":"

The input for the task is a text column containing either a string or a list of dictionaries in OpenAI chat-like format.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.outputs","title":"outputs: StepColumns property","text":"

The outputs for the task are an embedding column containing the embedding of the text input, and a model_name column.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.load","title":"load()","text":"

Loads the LLM used to generate the embeddings.

Source code in src/distilabel/steps/tasks/generate_embeddings.py
def load(self) -> None:\n    \"\"\"Loads the `LLM` used to generate the embeddings.\"\"\"\n    super().load()\n\n    self.llm.load()\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.format_input","title":"format_input(input)","text":"

Formats the input to be used by the LLM to generate the embeddings. The input can be in ChatType format or a string. If a string, it will be converted to a list of dictionaries in OpenAI chat-like format.

Parameters:

Name Type Description Default input Dict[str, Any]

The input to format.

required

Returns:

Type Description ChatType

The OpenAI chat-like format of the input.

Source code in src/distilabel/steps/tasks/generate_embeddings.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"Formats the input to be used by the LLM to generate the embeddings. The input\n    can be in `ChatType` format or a string. If a string, it will be converted to a\n    list of dictionaries in OpenAI chat-like format.\n\n    Args:\n        input: The input to format.\n\n    Returns:\n        The OpenAI chat-like format of the input.\n    \"\"\"\n    text = input[\"text\"] = input[\"text\"]\n\n    # input is in `ChatType` format\n    if isinstance(text, str):\n        return [{\"role\": \"user\", \"content\": text}]\n\n    if is_openai_format(text):\n        return text\n\n    raise DistilabelUserError(\n        f\"Couldn't format input for step {self.name}. The `text` input column has to\"\n        \" be a string or a list of dictionaries in OpenAI chat-like format.\",\n        page=\"components-gallery/tasks/generateembeddings/\",\n    )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.process","title":"process(inputs)","text":"

Generates an embedding for each input using the last hidden state of the LLM.

Parameters:

Name Type Description Default inputs StepInput

A list of Python dictionaries with the inputs of the task.

required

Yields:

Type Description StepOutput

A list of Python dictionaries with the outputs of the task.

Source code in src/distilabel/steps/tasks/generate_embeddings.py
def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Generates an embedding for each input using the last hidden state of the `LLM`.\n\n    Args:\n        inputs: A list of Python dictionaries with the inputs of the task.\n\n    Yields:\n        A list of Python dictionaries with the outputs of the task.\n    \"\"\"\n    formatted_inputs = [self.format_input(input) for input in inputs]\n    last_hidden_states = self.llm.get_last_hidden_states(formatted_inputs)\n    for input, hidden_state in zip(inputs, last_hidden_states):\n        input[\"embedding\"] = hidden_state[-1].tolist()\n        input[\"model_name\"] = self.llm.model_name\n    yield inputs\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct","title":"Genstruct","text":"

Bases: Task

Generate a pair of instruction-response from a document using an LLM.

Genstruct is a pre-defined task designed to generate valid instructions from a given raw document, with the title and the content, enabling the creation of new, partially synthetic instruction finetuning datasets from any raw-text corpus. The task is based on the Genstruct 7B model by Nous Research, which is inspired by the Ada-Instruct paper.

Note

The Genstruct prompt, i.e. the task, can be used with any model really, but the safest / recommended option is to use NousResearch/Genstruct-7B as the LLM provided to the task, since it was trained for this specific task.

Attributes:

Name Type Description _template Union[Template, None]

a Jinja2 template used to format the input for the LLM.

Input columns
  • title (str): The title of the document.
  • content (str): The content of the document.
Output columns
  • user (str): The user's instruction based on the document.
  • assistant (str): The assistant's response based on the user's instruction.
  • model_name (str): The model name used to generate the user instruction and the assistant response.
Categories
  • text-generation
  • instruction
  • response
References
  • Genstruct 7B by Nous Research
  • Ada-Instruct: Adapting Instruction Generators for Complex Reasoning

Examples:

Generate instructions from raw documents using the title and content:

from distilabel.steps.tasks import Genstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\ngenstruct = Genstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"NousResearch/Genstruct-7B\",\n    ),\n)\n\ngenstruct.load()\n\nresult = next(\n    genstruct.process(\n        [\n            {\"title\": \"common instruction\", \"content\": \"content of the document\"},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'title': 'An instruction',\n#         'content': 'content of the document',\n#         'model_name': 'test',\n#         'user': 'An instruction',\n#         'assistant': 'content of the document',\n#     }\n# ]\n
Citations
@misc{cui2023adainstructadaptinginstructiongenerators,\n    title={Ada-Instruct: Adapting Instruction Generators for Complex Reasoning},\n    author={Wanyun Cui and Qianle Wang},\n    year={2023},\n    eprint={2310.04484},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2310.04484},\n}\n
Source code in src/distilabel/steps/tasks/genstruct.py
class Genstruct(Task):\n    \"\"\"Generate a pair of instruction-response from a document using an `LLM`.\n\n    `Genstruct` is a pre-defined task designed to generate valid instructions from a given raw document,\n    with the title and the content, enabling the creation of new, partially synthetic instruction finetuning\n    datasets from any raw-text corpus. The task is based on the Genstruct 7B model by Nous Research, which is\n    inspired in the Ada-Instruct paper.\n\n    Note:\n        The Genstruct prompt i.e. the task, can be used with any model really, but the safest / recommended\n        option is to use `NousResearch/Genstruct-7B` as the LLM provided to the task, since it was trained\n        for this specific task.\n\n    Attributes:\n        _template: a Jinja2 template used to format the input for the LLM.\n\n    Input columns:\n        - title (`str`): The title of the document.\n        - content (`str`): The content of the document.\n\n    Output columns:\n        - user (`str`): The user's instruction based on the document.\n        - assistant (`str`): The assistant's response based on the user's instruction.\n        - model_name (`str`): The model name used to generate the `feedback` and `result`.\n\n    Categories:\n        - text-generation\n        - instruction\n        - response\n\n    References:\n        - [Genstruct 7B by Nous Research](https://huggingface.co/NousResearch/Genstruct-7B)\n        - [Ada-Instruct: Adapting Instruction Generators for Complex Reasoning](https://arxiv.org/abs/2310.04484)\n\n    Examples:\n        Generate instructions from raw documents using the title and content:\n\n        ```python\n        from distilabel.steps.tasks import Genstruct\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        genstruct = Genstruct(\n            llm=InferenceEndpointsLLM(\n                model_id=\"NousResearch/Genstruct-7B\",\n            ),\n 
       )\n\n        genstruct.load()\n\n        result = next(\n            genstruct.process(\n                [\n                    {\"title\": \"common instruction\", \"content\": \"content of the document\"},\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'title': 'An instruction',\n        #         'content': 'content of the document',\n        #         'model_name': 'test',\n        #         'user': 'An instruction',\n        #         'assistant': 'content of the document',\n        #     }\n        # ]\n        ```\n\n    Citations:\n        ```\n        @misc{cui2023adainstructadaptinginstructiongenerators,\n            title={Ada-Instruct: Adapting Instruction Generators for Complex Reasoning},\n            author={Wanyun Cui and Qianle Wang},\n            year={2023},\n            eprint={2310.04484},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2310.04484},\n        }\n        ```\n    \"\"\"\n\n    _template: Union[Template, None] = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"genstruct.jinja2\"\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The inputs for the task are the `title` and the `content`.\"\"\"\n        return [\"title\", \"content\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": 
self._template.render(  # type: ignore\n                    title=input[\"title\"], content=input[\"content\"]\n                ),\n            }\n        ]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task are the `user` instruction based on the provided document\n        and the `assistant` response based on the user's instruction.\"\"\"\n        return [\"user\", \"assistant\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted so that both the user and the assistant messages are\n        captured.\n\n        Args:\n            output: the raw output of the LLM.\n            input: the input to the task. Used for obtaining the number of responses.\n\n        Returns:\n            A dict with the keys `user` and `assistant` containing the content for each role.\n        \"\"\"\n        if output is None:\n            return {\"user\": None, \"assistant\": None}\n\n        matches = re.search(_PARSE_GENSTRUCT_OUTPUT_REGEX, output, re.DOTALL)\n        if not matches:\n            return {\"user\": None, \"assistant\": None}\n\n        return {\n            \"user\": matches.group(1).strip(),\n            \"assistant\": matches.group(2).strip(),\n        }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.inputs","title":"inputs: List[str] property","text":"

The inputs for the task are the title and the content.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.outputs","title":"outputs: List[str] property","text":"

The outputs for the task are the user instruction based on the provided document, the assistant response based on the user's instruction, and the model_name.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.load","title":"load()","text":"

Loads the Jinja2 template.

Source code in src/distilabel/steps/tasks/genstruct.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"genstruct.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/genstruct.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    return [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(  # type: ignore\n                title=input[\"title\"], content=input[\"content\"]\n            ),\n        }\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.format_output","title":"format_output(output, input)","text":"

The output is formatted so that both the user and the assistant messages are captured.

Parameters:

Name Type Description Default output Union[str, None]

the raw output of the LLM.

required input Dict[str, Any]

the input to the task. It is not used when parsing the output.

required

Returns:

Type Description Dict[str, Any]

A dict with the keys user and assistant containing the content for each role.

Source code in src/distilabel/steps/tasks/genstruct.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted so that both the user and the assistant messages are\n    captured.\n\n    Args:\n        output: the raw output of the LLM.\n        input: the input to the task. Used for obtaining the number of responses.\n\n    Returns:\n        A dict with the keys `user` and `assistant` containing the content for each role.\n    \"\"\"\n    if output is None:\n        return {\"user\": None, \"assistant\": None}\n\n    matches = re.search(_PARSE_GENSTRUCT_OUTPUT_REGEX, output, re.DOTALL)\n    if not matches:\n        return {\"user\": None, \"assistant\": None}\n\n    return {\n        \"user\": matches.group(1).strip(),\n        \"assistant\": matches.group(2).strip(),\n    }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.BitextRetrievalGenerator","title":"BitextRetrievalGenerator","text":"

Bases: _EmbeddingDataGenerator

Generate bitext retrieval data with an LLM to later on train an embedding model.

BitextRetrievalGenerator is a GeneratorTask that generates bitext retrieval data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

Attributes:

Name Type Description source_language str

The source language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

target_language str

The target language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

unit Optional[Literal['sentence', 'phrase', 'passage']]

The unit of the data to be generated, which can be sentence, phrase, or passage. Defaults to None, meaning that it will be randomly sampled.

difficulty Optional[Literal['elementary school', 'high school', 'college']]

The difficulty of the query to be generated, which can be elementary school, high school, or college. Defaults to None, meaning that it will be randomly sampled.

high_score Optional[Literal['4', '4.5', '5']]

The high score of the query to be generated, which can be 4, 4.5, or 5. Defaults to None, meaning that it will be randomly sampled.

low_score Optional[Literal['2.5', '3', '3.5']]

The low score of the query to be generated, which can be 2.5, 3, or 3.5. Defaults to None, meaning that it will be randomly sampled.

seed int

The random seed to be set in case there's any sampling within the format_input method.

Output columns
  • S1 (str): the first sentence generated by the LLM.
  • S2 (str): the second sentence generated by the LLM.
  • S3 (str): the third sentence generated by the LLM.
  • model_name (str): the name of the model used to generate the bitext retrieval data.

Examples:

Generate bitext retrieval data for training embedding models:

from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import BitextRetrievalGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = BitextRetrievalGenerator(\n        source_language=\"English\",\n        target_language=\"Spanish\",\n        unit=\"sentence\",\n        difficulty=\"elementary school\",\n        high_score=\"4\",\n        low_score=\"2.5\",\n        llm=...,\n    )\n\n    ...\n\n    task >> ...\n
Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
class BitextRetrievalGenerator(_EmbeddingDataGenerator):\n    \"\"\"Generate bitext retrieval data with an `LLM` to later on train an embedding model.\n\n    `BitextRetrievalGenerator` is a `GeneratorTask` that generates bitext retrieval data with an\n    `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n    Text Embeddings with Large Language Models\" and the data is generated based on the\n    provided attributes, or randomly sampled if not provided.\n\n    Attributes:\n        source_language: The source language of the data to be generated, which can be any of the languages\n            retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n        target_language: The target language of the data to be generated, which can be any of the languages\n            retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n        unit: The unit of the data to be generated, which can be `sentence`, `phrase`, or `passage`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        difficulty: The difficulty of the query to be generated, which can be `elementary school`, `high school`, or `college`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        high_score: The high score of the query to be generated, which can be `4`, `4.5`, or `5`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        low_score: The low score of the query to be generated, which can be `2.5`, `3`, or `3.5`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        seed: The random seed to be set in case there's any sampling within the `format_input` method.\n\n    Output columns:\n        - S1 (`str`): the first sentence generated by the `LLM`.\n        - S2 (`str`): the second sentence generated by the `LLM`.\n        - S3 (`str`): the third sentence 
generated by the `LLM`.\n        - model_name (`str`): the name of the model used to generate the bitext retrieval\n            data.\n\n    Examples:\n        Generate bitext retrieval data for training embedding models:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps.tasks import BitextRetrievalGenerator\n\n        with Pipeline(\"my-pipeline\") as pipeline:\n            task = BitextRetrievalGenerator(\n                source_language=\"English\",\n                target_language=\"Spanish\",\n                unit=\"sentence\",\n                difficulty=\"elementary school\",\n                high_score=\"4\",\n                low_score=\"2.5\",\n                llm=...,\n            )\n\n            ...\n\n            task >> ...\n        ```\n    \"\"\"\n\n    source_language: str = Field(\n        default=\"English\",\n        description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n    )\n    target_language: str = Field(\n        default=...,\n        description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n    )\n\n    unit: Optional[Literal[\"sentence\", \"phrase\", \"passage\"]] = None\n    difficulty: Optional[Literal[\"elementary school\", \"high school\", \"college\"]] = None\n    high_score: Optional[Literal[\"4\", \"4.5\", \"5\"]] = None\n    low_score: Optional[Literal[\"2.5\", \"3\", \"3.5\"]] = None\n\n    _template_name: str = PrivateAttr(default=\"bitext-retrieval\")\n    _can_be_used_with_offline_batch_generation = True\n\n    @property\n    def prompt(self) -> ChatType:\n        \"\"\"Contains the `prompt` to be used in the `process` method, rendering the `_template`; and\n        formatted as an OpenAI formatted chat i.e. 
a `ChatType`, assuming that there's only one turn,\n        being from the user with the content being the rendered `_template`.\n        \"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    source_language=self.source_language,\n                    target_language=self.target_language,\n                    unit=self.unit or random.choice([\"sentence\", \"phrase\", \"passage\"]),\n                    difficulty=self.difficulty\n                    or random.choice([\"elementary school\", \"high school\", \"college\"]),\n                    high_score=self.high_score or random.choice([\"4\", \"4.5\", \"5\"]),\n                    low_score=self.low_score or random.choice([\"2.5\", \"3\", \"3.5\"]),\n                ).strip(),\n            }\n        ]  # type: ignore\n\n    @property\n    def keys(self) -> List[str]:\n        \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n        return [\"S1\", \"S2\", \"S3\"]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.BitextRetrievalGenerator.prompt","title":"prompt: ChatType property","text":"

Contains the prompt to be used in the process method, rendering the _template; and formatted as an OpenAI formatted chat i.e. a ChatType, assuming that there's only one turn, being from the user with the content being the rendered _template.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.BitextRetrievalGenerator.keys","title":"keys: List[str] property","text":"

Contains the keys that will be parsed from the LLM output into a Python dict.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateLongTextMatchingData","title":"GenerateLongTextMatchingData","text":"

Bases: _EmbeddingDataGeneration

Generate long text matching data with an LLM to later on train an embedding model.

GenerateLongTextMatchingData is a Task that generates long text matching data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

Note

Ideally this task should be used together with EmbeddingTaskGenerator, configured with flatten_tasks=True and category=\"text-matching-long\", so that the LLM generates a list of tasks that is then flattened, with each row containing a single task for the text-matching-long category.

Attributes:

Name Type Description language str

The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

seed str

The random seed to be set in case there's any sampling within the format_input method. Note that in this task the seed has no effect since there are no sampling params.

Input columns
  • task (str): The task description to be used in the generation.
Output columns
  • input (str): the input generated by the LLM.
  • positive_document (str): the positive document generated by the LLM.
  • model_name (str): the name of the model used to generate the long text matching data.
References
  • Improving Text Embeddings with Large Language Models

Examples:

Generate synthetic long text matching data for training embedding models:

from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateLongTextMatchingData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = EmbeddingTaskGenerator(\n        category=\"text-matching-long\",\n        flatten_tasks=True,\n        llm=...,  # LLM instance\n    )\n\n    generate = GenerateLongTextMatchingData(\n        language=\"English\",\n        llm=...,  # LLM instance\n    )\n\n    task >> generate\n
Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
class GenerateLongTextMatchingData(_EmbeddingDataGeneration):\n    \"\"\"Generate long text matching data with an `LLM` to later on train an embedding model.\n\n    `GenerateLongTextMatchingData` is a `Task` that generates long text matching data with an\n    `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n    Text Embeddings with Large Language Models\" and the data is generated based on the\n    provided attributes, or randomly sampled if not provided.\n\n    Note:\n        Ideally this task should be used with `EmbeddingTaskGenerator` with `flatten_tasks=True`\n        with the `category=\"text-matching-long\"`; so that the `LLM` generates a list of tasks that\n        are flattened so that each row contains a single task for the text-matching-long category.\n\n    Attributes:\n        language: The language of the data to be generated, which can be any of the languages\n            retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n        seed: The random seed to be set in case there's any sampling within the `format_input` method.\n            Note that in this task the `seed` has no effect since there are no sampling params.\n\n    Input columns:\n        - task (`str`): The task description to be used in the generation.\n\n    Output columns:\n        - input (`str`): the input generated by the `LLM`.\n        - positive_document (`str`): the positive document generated by the `LLM`.\n        - model_name (`str`): the name of the model used to generate the long text matching\n            data.\n\n    References:\n        - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368)\n\n    Examples:\n        Generate synthetic long text matching data for training embedding models:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps.tasks import EmbeddingTaskGenerator, 
GenerateLongTextMatchingData\n\n        with Pipeline(\"my-pipeline\") as pipeline:\n            task = EmbeddingTaskGenerator(\n                category=\"text-matching-long\",\n                flatten_tasks=True,\n                llm=...,  # LLM instance\n            )\n\n            generate = GenerateLongTextMatchingData(\n                language=\"English\",\n                llm=...,  # LLM instance\n            )\n\n            task >> generate\n        ```\n    \"\"\"\n\n    language: str = Field(\n        default=\"English\",\n        description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n    )\n\n    _template_name: str = PrivateAttr(default=\"long-text-matching\")\n    _can_be_used_with_offline_batch_generation = True\n\n    def format_input(self, input: Dict[str, Any]) -> ChatType:\n        \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n        randomly sampling those if not provided. This method will render the `_template` with\n        the provided arguments and return an OpenAI formatted chat i.e. 
a `ChatType`, assuming that\n        there's only one turn, being from the user with the content being the rendered `_template`.\n\n        Args:\n            input: The input dictionary containing the `task` to be used in the `_template`.\n\n        Returns:\n            A list with a single chat containing the user's message with the rendered `_template`.\n        \"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    task=input[\"task\"],\n                    language=self.language,\n                ).strip(),\n            }\n        ]\n\n    @property\n    def keys(self) -> List[str]:\n        \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n        return [\"input\", \"positive_document\"]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateLongTextMatchingData.keys","title":"keys: List[str] property","text":"

Contains the keys that will be parsed from the LLM output into a Python dict.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateLongTextMatchingData.format_input","title":"format_input(input)","text":"

Method to format the input based on the task and the provided attributes, or just randomly sampling those if not provided. This method will render the _template with the provided arguments and return an OpenAI formatted chat i.e. a ChatType, assuming that there's only one turn, being from the user with the content being the rendered _template.

Parameters:

Name Type Description Default input Dict[str, Any]

The input dictionary containing the task to be used in the _template.

required

Returns:

Type Description ChatType

A list with a single chat containing the user's message with the rendered _template.

Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
def format_input(self, input: Dict[str, Any]) -> ChatType:\n    \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n    randomly sampling those if not provided. This method will render the `_template` with\n    the provided arguments and return an OpenAI formatted chat i.e. a `ChatType`, assuming that\n    there's only one turn, being from the user with the content being the rendered `_template`.\n\n    Args:\n        input: The input dictionary containing the `task` to be used in the `_template`.\n\n    Returns:\n        A list with a single chat containing the user's message with the rendered `_template`.\n    \"\"\"\n    return [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(  # type: ignore\n                task=input[\"task\"],\n                language=self.language,\n            ).strip(),\n        }\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateShortTextMatchingData","title":"GenerateShortTextMatchingData","text":"

Bases: _EmbeddingDataGeneration

Generate short text matching data with an LLM to later on train an embedding model.

GenerateShortTextMatchingData is a Task that generates short text matching data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

Note

Ideally this task should be used together with EmbeddingTaskGenerator, configured with flatten_tasks=True and category=\"text-matching-short\", so that the LLM generates a list of tasks that is then flattened, with each row containing a single task for the text-matching-short category.

Attributes:

Name Type Description language str

The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

seed str

The random seed to be set in case there's any sampling within the format_input method. Note that in this task the seed has no effect since there are no sampling params.

Input columns
  • task (str): The task description to be used in the generation.
Output columns
  • input (str): the input generated by the LLM.
  • positive_document (str): the positive document generated by the LLM.
  • model_name (str): the name of the model used to generate the short text matching data.
References
  • Improving Text Embeddings with Large Language Models

Examples:

Generate synthetic short text matching data for training embedding models:

from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateShortTextMatchingData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = EmbeddingTaskGenerator(\n        category=\"text-matching-short\",\n        flatten_tasks=True,\n        llm=...,  # LLM instance\n    )\n\n    generate = GenerateShortTextMatchingData(\n        language=\"English\",\n        llm=...,  # LLM instance\n    )\n\n    task >> generate\n
Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
class GenerateShortTextMatchingData(_EmbeddingDataGeneration):\n    \"\"\"Generate short text matching data with an `LLM` to later on train an embedding model.\n\n    `GenerateShortTextMatchingData` is a `Task` that generates short text matching data with an\n    `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n    Text Embeddings with Large Language Models\" and the data is generated based on the\n    provided attributes, or randomly sampled if not provided.\n\n    Note:\n        Ideally this task should be used with `EmbeddingTaskGenerator` with `flatten_tasks=True`\n        with the `category=\"text-matching-short\"`; so that the `LLM` generates a list of tasks that\n        are flattened so that each row contains a single task for the text-matching-short category.\n\n    Attributes:\n        language: The language of the data to be generated, which can be any of the languages\n            retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n        seed: The random seed to be set in case there's any sampling within the `format_input` method.\n            Note that in this task the `seed` has no effect since there are no sampling params.\n\n    Input columns:\n        - task (`str`): The task description to be used in the generation.\n\n    Output columns:\n        - input (`str`): the input generated by the `LLM`.\n        - positive_document (`str`): the positive document generated by the `LLM`.\n        - model_name (`str`): the name of the model used to generate the short text matching\n            data.\n\n    References:\n        - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368)\n\n    Examples:\n        Generate synthetic short text matching data for training embedding models:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps.tasks import EmbeddingTaskGenerator, 
GenerateShortTextMatchingData\n\n        with Pipeline(\"my-pipeline\") as pipeline:\n            task = EmbeddingTaskGenerator(\n                category=\"text-matching-short\",\n                flatten_tasks=True,\n                llm=...,  # LLM instance\n            )\n\n            generate = GenerateShortTextMatchingData(\n                language=\"English\",\n                llm=...,  # LLM instance\n            )\n\n            task >> generate\n        ```\n    \"\"\"\n\n    language: str = Field(\n        default=\"English\",\n        description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n    )\n\n    _template_name: str = PrivateAttr(default=\"short-text-matching\")\n    _can_be_used_with_offline_batch_generation = True\n\n    def format_input(self, input: Dict[str, Any]) -> ChatType:\n        \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n        randomly sampling those if not provided. This method will render the `_template` with\n                the provided arguments and return an OpenAI formatted chat i.e. 
a `ChatType`, assuming that\n                there's only one turn, being from the user with the content being the rendered `_template`.\n\n                Args:\n                    input: The input dictionary containing the `task` to be used in the `_template`.\n\n                Returns:\n                    A list with a single chat containing the user's message with the rendered `_template`.\n        \"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    task=input[\"task\"],\n                    language=self.language,\n                ).strip(),\n            }\n        ]\n\n    @property\n    def keys(self) -> List[str]:\n        \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n        return [\"input\", \"positive_document\"]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateShortTextMatchingData.keys","title":"keys: List[str] property","text":"

Contains the keys that will be parsed from the LLM output into a Python dict.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateShortTextMatchingData.format_input","title":"format_input(input)","text":"

Method to format the input based on the task and the provided attributes, or just randomly sampling those if not provided. This method will render the _template with the provided arguments and return an OpenAI formatted chat i.e. a ChatType, assuming that there's only one turn, being from the user with the content being the rendered _template.

Parameters:

Name Type Description Default input Dict[str, Any]

The input dictionary containing the task to be used in the _template.

required

Returns:

Type Description ChatType

A list with a single chat containing the user's message with the rendered _template.
Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
def format_input(self, input: Dict[str, Any]) -> ChatType:\n    \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n    randomly sampling those if not provided. This method will render the `_template` with\n            the provided arguments and return an OpenAI formatted chat i.e. a `ChatType`, assuming that\n            there's only one turn, being from the user with the content being the rendered `_template`.\n\n            Args:\n                input: The input dictionary containing the `task` to be used in the `_template`.\n\n            Returns:\n                A list with a single chat containing the user's message with the rendered `_template`.\n    \"\"\"\n    return [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(  # type: ignore\n                task=input[\"task\"],\n                language=self.language,\n            ).strip(),\n        }\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextClassificationData","title":"GenerateTextClassificationData","text":"

Bases: _EmbeddingDataGeneration

Generate text classification data with an LLM to later on train an embedding model.

GenerateTextClassificationData is a Task that generates text classification data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

Note

Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-classification\"; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-classification category.

Attributes:

Name Type Description language str

The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

difficulty Optional[Literal['high school', 'college', 'PhD']]

The difficulty of the query to be generated, which can be high school, college, or PhD. Defaults to None, meaning that it will be randomly sampled.

clarity Optional[Literal['clear', 'understandable with some effort', 'ambiguous']]

The clarity of the query to be generated, which can be clear, understandable with some effort, or ambiguous. Defaults to None, meaning that it will be randomly sampled.

seed int

The random seed to be set in case there's any sampling within the format_input method.

Input columns
  • task (str): The task description to be used in the generation.
Output columns
  • input_text (str): the input text generated by the LLM.
  • label (str): the label generated by the LLM.
  • misleading_label (str): the misleading label generated by the LLM.
  • model_name (str): the name of the model used to generate the text classification data.
References
  • Improving Text Embeddings with Large Language Models

Examples:

Generate synthetic text classification data for training embedding models:

from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextClassificationData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = EmbeddingTaskGenerator(\n        category=\"text-classification\",\n        flatten_tasks=True,\n        llm=...,  # LLM instance\n    )\n\n    generate = GenerateTextClassificationData(\n        language=\"English\",\n        difficulty=\"high school\",\n        clarity=\"clear\",\n        llm=...,  # LLM instance\n    )\n\n    task >> generate\n
Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
class GenerateTextClassificationData(_EmbeddingDataGeneration):\n    \"\"\"Generate text classification data with an `LLM` to later on train an embedding model.\n\n    `GenerateTextClassificationData` is a `Task` that generates text classification data with an\n    `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n    Text Embeddings with Large Language Models\" and the data is generated based on the\n    provided attributes, or randomly sampled if not provided.\n\n    Note:\n        Ideally this task should be used with `EmbeddingTaskGenerator` with `flatten_tasks=True`\n        with the `category=\"text-classification\"`; so that the `LLM` generates a list of tasks that\n        are flattened so that each row contains a single task for the text-classification category.\n\n    Attributes:\n        language: The language of the data to be generated, which can be any of the languages\n            retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n        difficulty: The difficulty of the query to be generated, which can be `high school`, `college`, or `PhD`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        clarity: The clarity of the query to be generated, which can be `clear`, `understandable with some effort`,\n            or `ambiguous`. 
Defaults to `None`, meaning that it will be randomly sampled.\n        seed: The random seed to be set in case there's any sampling within the `format_input` method.\n\n    Input columns:\n        - task (`str`): The task description to be used in the generation.\n\n    Output columns:\n        - input_text (`str`): the input text generated by the `LLM`.\n        - label (`str`): the label generated by the `LLM`.\n        - misleading_label (`str`): the misleading label generated by the `LLM`.\n        - model_name (`str`): the name of the model used to generate the text classification\n            data.\n\n    References:\n        - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368)\n\n    Examples:\n        Generate synthetic text classification data for training embedding models:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextClassificationData\n\n        with Pipeline(\"my-pipeline\") as pipeline:\n            task = EmbeddingTaskGenerator(\n                category=\"text-classification\",\n                flatten_tasks=True,\n                llm=...,  # LLM instance\n            )\n\n            generate = GenerateTextClassificationData(\n                language=\"English\",\n                difficulty=\"high school\",\n                clarity=\"clear\",\n                llm=...,  # LLM instance\n            )\n\n            task >> generate\n        ```\n    \"\"\"\n\n    language: str = Field(\n        default=\"English\",\n        description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n    )\n\n    difficulty: Optional[Literal[\"high school\", \"college\", \"PhD\"]] = None\n    clarity: Optional[\n        Literal[\"clear\", \"understandable with some effort\", \"ambiguous\"]\n    ] = None\n\n    _template_name: str = 
PrivateAttr(default=\"text-classification\")\n    _can_be_used_with_offline_batch_generation = True\n\n    def format_input(self, input: Dict[str, Any]) -> ChatType:\n        \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n        randomly sampling those if not provided. This method will render the `_template` with\n        the provided arguments and return an OpenAI formatted chat i.e. a `ChatType`, assuming that\n        there's only one turn, being from the user with the content being the rendered `_template`.\n\n        Args:\n            input: The input dictionary containing the `task` to be used in the `_template`.\n\n        Returns:\n            A list with a single chat containing the user's message with the rendered `_template`.\n        \"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    task=input[\"task\"],\n                    language=self.language,\n                    difficulty=self.difficulty\n                    or random.choice([\"high school\", \"college\", \"PhD\"]),\n                    clarity=self.clarity\n                    or random.choice(\n                        [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n                    ),\n                ).strip(),\n            }\n        ]\n\n    @property\n    def keys(self) -> List[str]:\n        \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n        return [\"input_text\", \"label\", \"misleading_label\"]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextClassificationData.keys","title":"keys: List[str] property","text":"

Contains the keys that will be parsed from the LLM output into a Python dict.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextClassificationData.format_input","title":"format_input(input)","text":"

Method to format the input based on the task and the provided attributes, or just randomly sampling those if not provided. This method will render the _template with the provided arguments and return an OpenAI formatted chat i.e. a ChatType, assuming that there's only one turn, being from the user with the content being the rendered _template.

Parameters:

Name Type Description Default input Dict[str, Any]

The input dictionary containing the task to be used in the _template.

required

Returns:

Type Description ChatType

A list with a single chat containing the user's message with the rendered _template.

Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
def format_input(self, input: Dict[str, Any]) -> ChatType:\n    \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n    randomly sampling those if not provided. This method will render the `_template` with\n    the provided arguments and return an OpenAI formatted chat i.e. a `ChatType`, assuming that\n    there's only one turn, being from the user with the content being the rendered `_template`.\n\n    Args:\n        input: The input dictionary containing the `task` to be used in the `_template`.\n\n    Returns:\n        A list with a single chat containing the user's message with the rendered `_template`.\n    \"\"\"\n    return [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(  # type: ignore\n                task=input[\"task\"],\n                language=self.language,\n                difficulty=self.difficulty\n                or random.choice([\"high school\", \"college\", \"PhD\"]),\n                clarity=self.clarity\n                or random.choice(\n                    [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n                ),\n            ).strip(),\n        }\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextRetrievalData","title":"GenerateTextRetrievalData","text":"

Bases: _EmbeddingDataGeneration

Generate text retrieval data with an LLM to later on train an embedding model.

GenerateTextRetrievalData is a Task that generates text retrieval data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

Note

Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-retrieval\"; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-retrieval category.

Attributes:

Name Type Description language str

The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

query_type Optional[Literal['extremely long-tail', 'long-tail', 'common']]

The type of query to be generated, which can be extremely long-tail, long-tail, or common. Defaults to None, meaning that it will be randomly sampled.

query_length Optional[Literal['less than 5 words', '5 to 15 words', 'at least 10 words']]

The length of the query to be generated, which can be less than 5 words, 5 to 15 words, or at least 10 words. Defaults to None, meaning that it will be randomly sampled.

difficulty Optional[Literal['high school', 'college', 'PhD']]

The difficulty of the query to be generated, which can be high school, college, or PhD. Defaults to None, meaning that it will be randomly sampled.

clarity Optional[Literal['clear', 'understandable with some effort', 'ambiguous']]

The clarity of the query to be generated, which can be clear, understandable with some effort, or ambiguous. Defaults to None, meaning that it will be randomly sampled.

num_words Optional[Literal[50, 100, 200, 300, 400, 500]]

The number of words in the query to be generated, which can be 50, 100, 200, 300, 400, or 500. Defaults to None, meaning that it will be randomly sampled.

seed int

The random seed to be set in case there's any sampling within the format_input method.

Input columns
  • task (str): The task description to be used in the generation.
Output columns
  • user_query (str): the user query generated by the LLM.
  • positive_document (str): the positive document generated by the LLM.
  • hard_negative_document (str): the hard negative document generated by the LLM.
  • model_name (str): the name of the model used to generate the text retrieval data.
References
  • Improving Text Embeddings with Large Language Models

Examples:

Generate synthetic text retrieval data for training embedding models:

from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextRetrievalData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = EmbeddingTaskGenerator(\n        category=\"text-retrieval\",\n        flatten_tasks=True,\n        llm=...,  # LLM instance\n    )\n\n    generate = GenerateTextRetrievalData(\n        language=\"English\",\n        query_type=\"common\",\n        query_length=\"5 to 15 words\",\n        difficulty=\"high school\",\n        clarity=\"clear\",\n        num_words=100,\n        llm=...,  # LLM instance\n    )\n\n    task >> generate\n
Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
class GenerateTextRetrievalData(_EmbeddingDataGeneration):\n    \"\"\"Generate text retrieval data with an `LLM` to later on train an embedding model.\n\n    `GenerateTextRetrievalData` is a `Task` that generates text retrieval data with an\n    `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n    Text Embeddings with Large Language Models\" and the data is generated based on the\n    provided attributes, or randomly sampled if not provided.\n\n    Note:\n        Ideally this task should be used with `EmbeddingTaskGenerator` with `flatten_tasks=True`\n        with the `category=\"text-retrieval\"`; so that the `LLM` generates a list of tasks that\n        are flattened so that each row contains a single task for the text-retrieval category.\n\n    Attributes:\n        language: The language of the data to be generated, which can be any of the languages\n            retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n        query_type: The type of query to be generated, which can be `extremely long-tail`, `long-tail`,\n            or `common`. Defaults to `None`, meaning that it will be randomly sampled.\n        query_length: The length of the query to be generated, which can be `less than 5 words`, `5 to 15 words`,\n            or `at least 10 words`. Defaults to `None`, meaning that it will be randomly sampled.\n        difficulty: The difficulty of the query to be generated, which can be `high school`, `college`, or `PhD`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        clarity: The clarity of the query to be generated, which can be `clear`, `understandable with some effort`,\n            or `ambiguous`. 
Defaults to `None`, meaning that it will be randomly sampled.\n        num_words: The number of words in the query to be generated, which can be `50`, `100`, `200`, `300`, `400`, or `500`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        seed: The random seed to be set in case there's any sampling within the `format_input` method.\n\n    Input columns:\n        - task (`str`): The task description to be used in the generation.\n\n    Output columns:\n        - user_query (`str`): the user query generated by the `LLM`.\n        - positive_document (`str`): the positive document generated by the `LLM`.\n        - hard_negative_document (`str`): the hard negative document generated by the `LLM`.\n        - model_name (`str`): the name of the model used to generate the text retrieval data.\n\n    References:\n        - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368)\n\n    Examples:\n        Generate synthetic text retrieval data for training embedding models:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextRetrievalData\n\n        with Pipeline(\"my-pipeline\") as pipeline:\n            task = EmbeddingTaskGenerator(\n                category=\"text-retrieval\",\n                flatten_tasks=True,\n                llm=...,  # LLM instance\n            )\n\n            generate = GenerateTextRetrievalData(\n                language=\"English\",\n                query_type=\"common\",\n                query_length=\"5 to 15 words\",\n                difficulty=\"high school\",\n                clarity=\"clear\",\n                num_words=100,\n                llm=...,  # LLM instance\n            )\n\n            task >> generate\n        ```\n    \"\"\"\n\n    language: str = Field(\n        default=\"English\",\n        description=\"The languages are retrieved from the list of XLM-R in the 
Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n    )\n\n    query_type: Optional[Literal[\"extremely long-tail\", \"long-tail\", \"common\"]] = None\n    query_length: Optional[\n        Literal[\"less than 5 words\", \"5 to 15 words\", \"at least 10 words\"]\n    ] = None\n    difficulty: Optional[Literal[\"high school\", \"college\", \"PhD\"]] = None\n    clarity: Optional[\n        Literal[\"clear\", \"understandable with some effort\", \"ambiguous\"]\n    ] = None\n    num_words: Optional[Literal[50, 100, 200, 300, 400, 500]] = None\n\n    _template_name: str = PrivateAttr(default=\"text-retrieval\")\n    _can_be_used_with_offline_batch_generation = True\n\n    def format_input(self, input: Dict[str, Any]) -> ChatType:\n        \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n        randomly sampling those if not provided. This method will render the `_template` with\n        the provided arguments and return an OpenAI formatted chat i.e. 
a `ChatType`, assuming that\n        there's only one turn, being from the user with the content being the rendered `_template`.\n\n        Args:\n            input: The input dictionary containing the `task` to be used in the `_template`.\n\n        Returns:\n            A list with a single chat containing the user's message with the rendered `_template`.\n        \"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    task=input[\"task\"],\n                    language=self.language,\n                    query_type=self.query_type\n                    or random.choice([\"extremely long-tail\", \"long-tail\", \"common\"]),\n                    query_length=self.query_length\n                    or random.choice(\n                        [\"less than 5 words\", \"5 to 15 words\", \"at least 10 words\"]\n                    ),\n                    difficulty=self.difficulty\n                    or random.choice([\"high school\", \"college\", \"PhD\"]),\n                    clarity=self.clarity\n                    or random.choice(\n                        [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n                    ),\n                    num_words=self.num_words\n                    or random.choice([50, 100, 200, 300, 400, 500]),\n                ).strip(),\n            }\n        ]\n\n    @property\n    def keys(self) -> List[str]:\n        \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n        return [\n            \"user_query\",\n            \"positive_document\",\n            \"hard_negative_document\",\n        ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextRetrievalData.keys","title":"keys: List[str] property","text":"

Contains the keys that will be parsed from the LLM output into a Python dict.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextRetrievalData.format_input","title":"format_input(input)","text":"

Method to format the input based on the task and the provided attributes, or just randomly sampling those if not provided. This method will render the _template with the provided arguments and return an OpenAI formatted chat i.e. a ChatType, assuming that there's only one turn, being from the user with the content being the rendered _template.

Parameters:

Name Type Description Default input Dict[str, Any]

The input dictionary containing the task to be used in the _template.

required

Returns:

Type Description ChatType

A list with a single chat containing the user's message with the rendered _template.

Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
def format_input(self, input: Dict[str, Any]) -> ChatType:\n    \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n    randomly sampling those if not provided. This method will render the `_template` with\n    the provided arguments and return an OpenAI formatted chat i.e. a `ChatType`, assuming that\n    there's only one turn, being from the user with the content being the rendered `_template`.\n\n    Args:\n        input: The input dictionary containing the `task` to be used in the `_template`.\n\n    Returns:\n        A list with a single chat containing the user's message with the rendered `_template`.\n    \"\"\"\n    return [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(  # type: ignore\n                task=input[\"task\"],\n                language=self.language,\n                query_type=self.query_type\n                or random.choice([\"extremely long-tail\", \"long-tail\", \"common\"]),\n                query_length=self.query_length\n                or random.choice(\n                    [\"less than 5 words\", \"5 to 15 words\", \"at least 10 words\"]\n                ),\n                difficulty=self.difficulty\n                or random.choice([\"high school\", \"college\", \"PhD\"]),\n                clarity=self.clarity\n                or random.choice(\n                    [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n                ),\n                num_words=self.num_words\n                or random.choice([50, 100, 200, 300, 400, 500]),\n            ).strip(),\n        }\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MonolingualTripletGenerator","title":"MonolingualTripletGenerator","text":"

Bases: _EmbeddingDataGenerator

Generate monolingual triplets with an LLM to later on train an embedding model.

MonolingualTripletGenerator is a GeneratorTask that generates monolingual triplets with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

Attributes:

Name Type Description language str

The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

unit Optional[Literal['sentence', 'phrase', 'passage']]

The unit of the data to be generated, which can be sentence, phrase, or passage. Defaults to None, meaning that it will be randomly sampled.

difficulty Optional[Literal['elementary school', 'high school', 'college']]

The difficulty of the query to be generated, which can be elementary school, high school, or college. Defaults to None, meaning that it will be randomly sampled.

high_score Optional[Literal['4', '4.5', '5']]

The high score of the query to be generated, which can be 4, 4.5, or 5. Defaults to None, meaning that it will be randomly sampled.

low_score Optional[Literal['2.5', '3', '3.5']]

The low score of the query to be generated, which can be 2.5, 3, or 3.5. Defaults to None, meaning that it will be randomly sampled.

seed int

The random seed to be set in case there's any sampling within the format_input method.

Output columns
  • S1 (str): the first sentence generated by the LLM.
  • S2 (str): the second sentence generated by the LLM.
  • S3 (str): the third sentence generated by the LLM.
  • model_name (str): the name of the model used to generate the monolingual triplets.

Examples:

Generate monolingual triplets for training embedding models:

from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import MonolingualTripletGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = MonolingualTripletGenerator(\n        language=\"English\",\n        unit=\"sentence\",\n        difficulty=\"elementary school\",\n        high_score=\"4\",\n        low_score=\"2.5\",\n        llm=...,\n    )\n\n    ...\n\n    task >> ...\n
Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
class MonolingualTripletGenerator(_EmbeddingDataGenerator):\n    \"\"\"Generate monolingual triplets with an `LLM` to later on train an embedding model.\n\n    `MonolingualTripletGenerator` is a `GeneratorTask` that generates monolingual triplets with an\n    `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n    Text Embeddings with Large Language Models\" and the data is generated based on the\n    provided attributes, or randomly sampled if not provided.\n\n    Attributes:\n        language: The language of the data to be generated, which can be any of the languages\n            retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n        unit: The unit of the data to be generated, which can be `sentence`, `phrase`, or `passage`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        difficulty: The difficulty of the query to be generated, which can be `elementary school`, `high school`, or `college`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        high_score: The high score of the query to be generated, which can be `4`, `4.5`, or `5`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        low_score: The low score of the query to be generated, which can be `2.5`, `3`, or `3.5`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        seed: The random seed to be set in case there's any sampling within the `format_input` method.\n\n    Output columns:\n        - S1 (`str`): the first sentence generated by the `LLM`.\n        - S2 (`str`): the second sentence generated by the `LLM`.\n        - S3 (`str`): the third sentence generated by the `LLM`.\n        - model_name (`str`): the name of the model used to generate the monolingual triplets.\n\n    Examples:\n        Generate monolingual triplets for training embedding models:\n\n        ```python\n        
from distilabel.pipeline import Pipeline\n        from distilabel.steps.tasks import MonolingualTripletGenerator\n\n        with Pipeline(\"my-pipeline\") as pipeline:\n            task = MonolingualTripletGenerator(\n                language=\"English\",\n                unit=\"sentence\",\n                difficulty=\"elementary school\",\n                high_score=\"4\",\n                low_score=\"2.5\",\n                llm=...,\n            )\n\n            ...\n\n            task >> ...\n        ```\n    \"\"\"\n\n    language: str = Field(\n        default=\"English\",\n        description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n    )\n\n    unit: Optional[Literal[\"sentence\", \"phrase\", \"passage\"]] = None\n    difficulty: Optional[Literal[\"elementary school\", \"high school\", \"college\"]] = None\n    high_score: Optional[Literal[\"4\", \"4.5\", \"5\"]] = None\n    low_score: Optional[Literal[\"2.5\", \"3\", \"3.5\"]] = None\n\n    _template_name: str = PrivateAttr(default=\"monolingual-triplet\")\n    _can_be_used_with_offline_batch_generation = True\n\n    @property\n    def prompt(self) -> ChatType:\n        \"\"\"Contains the `prompt` to be used in the `process` method, rendering the `_template`; and\n        formatted as an OpenAI formatted chat i.e. 
a `ChatType`, assuming that there's only one turn,\n        being from the user with the content being the rendered `_template`.\n        \"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    language=self.language,\n                    unit=self.unit or random.choice([\"sentence\", \"phrase\", \"passage\"]),\n                    difficulty=self.difficulty\n                    or random.choice([\"elementary school\", \"high school\", \"college\"]),\n                    high_score=self.high_score or random.choice([\"4\", \"4.5\", \"5\"]),\n                    low_score=self.low_score or random.choice([\"2.5\", \"3\", \"3.5\"]),\n                ).strip(),\n            }\n        ]  # type: ignore\n\n    @property\n    def keys(self) -> List[str]:\n        \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n        return [\"S1\", \"S2\", \"S3\"]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MonolingualTripletGenerator.prompt","title":"prompt: ChatType property","text":"

Contains the prompt to be used in the process method, rendering the _template; and formatted as an OpenAI formatted chat i.e. a ChatType, assuming that there's only one turn, being from the user with the content being the rendered _template.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MonolingualTripletGenerator.keys","title":"keys: List[str] property","text":"

Contains the keys that will be parsed from the LLM output into a Python dict.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation","title":"InstructionBacktranslation","text":"

Bases: Task

Self-Alignment with Instruction Backtranslation.

Attributes:

Name Type Description _template Optional[Template]

the Jinja2 template to use for the Instruction Backtranslation task.

Input columns
  • instruction (str): The reference instruction to evaluate the text output.
  • generation (str): The text output to evaluate for the given instruction.
Output columns
  • score (str): The score for the generation based on the given instruction.
  • reason (str): The reason for the provided score.
  • model_name (str): The model name used to score the generation.
Categories
  • critique
References
  • Self-Alignment with Instruction Backtranslation

Examples:

Generate a score and reason for a given instruction and generation:

from distilabel.steps.tasks import InstructionBacktranslation\n\ninstruction_backtranslation = InstructionBacktranslation(\n        name=\"instruction_backtranslation\",\n        llm=llm,\n        input_batch_size=10,\n        output_mappings={\"model_name\": \"scoring_model\"},\n    )\ninstruction_backtranslation.load()\n\nresult = next(\n    instruction_backtranslation.process(\n        [\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"generation\": \"4\",\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         \"instruction\": \"How much is 2+2?\",\n#         \"generation\": \"4\",\n#         \"score\": 3,\n#         \"reason\": \"Reason for the generation.\",\n#         \"model_name\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n#     }\n# ]\n
Citations
@misc{li2024selfalignmentinstructionbacktranslation,\n    title={Self-Alignment with Instruction Backtranslation},\n    author={Xian Li and Ping Yu and Chunting Zhou and Timo Schick and Omer Levy and Luke Zettlemoyer and Jason Weston and Mike Lewis},\n    year={2024},\n    eprint={2308.06259},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2308.06259},\n}\n
Source code in src/distilabel/steps/tasks/instruction_backtranslation.py
class InstructionBacktranslation(Task):\n    \"\"\"Self-Alignment with Instruction Backtranslation.\n\n    Attributes:\n        _template: the Jinja2 template to use for the Instruction Backtranslation task.\n\n    Input columns:\n        - instruction (`str`): The reference instruction to evaluate the text output.\n        - generation (`str`): The text output to evaluate for the given instruction.\n\n    Output columns:\n        - score (`str`): The score for the generation based on the given instruction.\n        - reason (`str`): The reason for the provided score.\n        - model_name (`str`): The model name used to score the generation.\n\n    Categories:\n        - critique\n\n    References:\n        - [`Self-Alignment with Instruction Backtranslation`](https://arxiv.org/abs/2308.06259)\n\n    Examples:\n        Generate a score and reason for a given instruction and generation:\n\n        ```python\n        from distilabel.steps.tasks import InstructionBacktranslation\n\n        instruction_backtranslation = InstructionBacktranslation(\n                name=\"instruction_backtranslation\",\n                llm=llm,\n                input_batch_size=10,\n                output_mappings={\"model_name\": \"scoring_model\"},\n            )\n        instruction_backtranslation.load()\n\n        result = next(\n            instruction_backtranslation.process(\n                [\n                    {\n                        \"instruction\": \"How much is 2+2?\",\n                        \"generation\": \"4\",\n                    }\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         \"instruction\": \"How much is 2+2?\",\n        #         \"generation\": \"4\",\n        #         \"score\": 3,\n        #         \"reason\": \"Reason for the generation.\",\n        #         \"model_name\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n        #     }\n        # ]\n        ```\n\n    Citations:\n        
```\n        @misc{li2024selfalignmentinstructionbacktranslation,\n            title={Self-Alignment with Instruction Backtranslation},\n            author={Xian Li and Ping Yu and Chunting Zhou and Timo Schick and Omer Levy and Luke Zettlemoyer and Jason Weston and Mike Lewis},\n            year={2024},\n            eprint={2308.06259},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2308.06259},\n        }\n        ```\n    \"\"\"\n\n    _template: Optional[\"Template\"] = PrivateAttr(default=...)\n    _can_be_used_with_offline_batch_generation = True\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"instruction-backtranslation.jinja2\"\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task is the `instruction`, and the `generation` for it.\"\"\"\n        return [\"instruction\", \"generation\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    instruction=input[\"instruction\"], generation=input[\"generation\"]\n                ),\n            },\n        ]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task is the `score`, `reason` and the `model_name`.\"\"\"\n        return [\"score\", \"reason\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, 
Any]\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a dictionary with the `score` and `reason`. The\n        `model_name` will be automatically included within the `process` method of `Task`.\n\n        Args:\n            output: a string representing the output of the LLM via the `process` method.\n            input: the input to the task, as required by some tasks to format the output.\n\n        Returns:\n            A dictionary containing the `score` and the `reason` for the provided `score`.\n        \"\"\"\n        pattern = r\"(.+?)Score: (\\d)\"\n\n        matches = None\n        if output is not None:\n            matches = re.findall(pattern, output, re.DOTALL)\n        if matches is None:\n            return {\"score\": None, \"reason\": None}\n\n        return {\n            \"score\": int(matches[0][1]),\n            \"reason\": matches[0][0].strip(),\n        }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.inputs","title":"inputs: List[str] property","text":"

The input for the task is the instruction, and the generation for it.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.outputs","title":"outputs: List[str] property","text":"

The output for the task is the score, reason and the model_name.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.load","title":"load()","text":"

Loads the Jinja2 template.

Source code in src/distilabel/steps/tasks/instruction_backtranslation.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"instruction-backtranslation.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/instruction_backtranslation.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    return [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(  # type: ignore\n                instruction=input[\"instruction\"], generation=input[\"generation\"]\n            ),\n        },\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.format_output","title":"format_output(output, input)","text":"

The output is formatted as a dictionary with the score and reason. The model_name will be automatically included within the process method of Task.

Parameters:

Name Type Description Default output Union[str, None]

a string representing the output of the LLM via the process method.

required input Dict[str, Any]

the input to the task, as required by some tasks to format the output.

required

Returns:

Type Description Dict[str, Any]

A dictionary containing the score and the reason for the provided score.

Source code in src/distilabel/steps/tasks/instruction_backtranslation.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a dictionary with the `score` and `reason`. The\n    `model_name` will be automatically included within the `process` method of `Task`.\n\n    Args:\n        output: a string representing the output of the LLM via the `process` method.\n        input: the input to the task, as required by some tasks to format the output.\n\n    Returns:\n        A dictionary containing the `score` and the `reason` for the provided `score`.\n    \"\"\"\n    pattern = r\"(.+?)Score: (\\d)\"\n\n    matches = None\n    if output is not None:\n        matches = re.findall(pattern, output, re.DOTALL)\n    if matches is None:\n        return {\"score\": None, \"reason\": None}\n\n    return {\n        \"score\": int(matches[0][1]),\n        \"reason\": matches[0][0].strip(),\n    }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie","title":"Magpie","text":"

Bases: Task, MagpieBase

Generates conversations using an instruct fine-tuned LLM.

Magpie is a neat method that allows generating user instructions with no seed data or specific system prompt thanks to the autoregressive capabilities of the instruct fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the LLM without any user message, then the LLM will continue generating tokens as if it was the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM. After this instruct is generated, it can be sent again to the LLM to generate this time an assistant response. This process can be repeated N times allowing to build a multi-turn conversation. This method was described in the paper 'Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing'.

Attributes:

Name Type Description n_turns

the number of turns that the generated conversation will have. Defaults to 1.

end_with_user

whether the conversation should end with a user message. Defaults to False.

include_system_prompt

whether to include the system prompt used in the generated conversation. Defaults to False.

only_instruction

whether to generate only the instruction. If this argument is True, then n_turns will be ignored. Defaults to False.

system_prompt

an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Defaults to None.

Runtime parameters
  • n_turns: the number of turns that the generated conversation will have. Defaults to 1.
  • end_with_user: whether the conversation should end with a user message. Defaults to False.
  • include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False.
  • only_instruction: whether to generate only the instruction. If this argument is True, then n_turns will be ignored. Defaults to False.
  • system_prompt: an optional system prompt or list of system prompts that can be used to steer the LLM to generate content of a certain topic, guide the style, etc. If it's a list of system prompts, then a random system prompt will be chosen per input/output batch. If the provided inputs contain a system_prompt column, then this runtime parameter will be ignored and the one from the column will be used. Defaults to None.
  • system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic.
Input columns
  • system_prompt (str, optional): an optional system prompt that can be provided to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic.
Output columns
  • conversation (ChatType): the generated conversation which is a list of chat items with a role and a message. Only if only_instruction=False.
  • instruction (str): the generated instructions if only_instruction=True or n_turns==1.
  • response (str): the generated response if n_turns==1.
  • system_prompt_key (str, optional): the key of the system prompt used to generate the conversation or instruction. Only if system_prompt is a dictionary.
  • model_name (str): The model name used to generate the conversation or instruction.
Categories
  • text-generation
  • instruction
References
  • Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing

Examples:

Generating instructions with Llama 3 8B Instruct and TransformersLLM:

from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import Magpie\n\nmagpie = Magpie(\n    llm=TransformersLLM(\n        model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 1.0,\n            \"max_new_tokens\": 64,\n        },\n        device=\"mps\",\n    ),\n    only_instruction=True,\n)\n\nmagpie.load()\n\nresult = next(\n    magpie.process(\n        inputs=[\n            {\n                \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n            },\n            {\n                \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n            },\n        ]\n    )\n)\n# [\n#     {'instruction': \"That's me! I'd love some help with solving calculus problems! What kind of calculation are you most effective at? Linear Algebra, derivatives, integrals, optimization?\"},\n#     {'instruction': 'I was wondering if there are certain flowers and plants that can be used for pest control?'}\n# ]\n

Generating conversations with Llama 3 8B Instruct and TransformersLLM:

from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import Magpie\n\nmagpie = Magpie(\n    llm=TransformersLLM(\n        model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 1.0,\n            \"max_new_tokens\": 256,\n        },\n        device=\"mps\",\n    ),\n    n_turns=2,\n)\n\nmagpie.load()\n\nresult = next(\n    magpie.process(\n        inputs=[\n            {\n                \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n            },\n            {\n                \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n            },\n        ]\n    )\n)\n# [\n#     {\n#         'conversation': [\n#             {'role': 'system', 'content': \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"},\n#             {\n#                 'role': 'user',\n#                 'content': 'I'm having trouble solving the limits of functions in calculus. Could you explain how to work with them? Limits of functions are denoted by lim x\u2192a f(x) or lim x\u2192a [f(x)]. It is read as \"the limit as x approaches a of f\n# of x\".'\n#             },\n#             {\n#                 'role': 'assistant',\n#                 'content': 'Limits are indeed a fundamental concept in calculus, and understanding them can be a bit tricky at first, but don't worry, I'm here to help! The notation lim x\u2192a f(x) indeed means \"the limit as x approaches a of f of\n# x\". 
What it's asking us to do is find the'\n#             }\n#         ]\n#     },\n#     {\n#         'conversation': [\n#             {'role': 'system', 'content': \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"},\n#             {\n#                 'role': 'user',\n#                 'content': \"As a flower shop owner, I'm noticing some unusual worm-like creatures causing damage to my roses and other flowers. Can you help me identify what the problem is? Based on your expertise as a florist AI assistant, I think it\n# might be pests or diseases, but I'm not sure which.\"\n#             },\n#             {\n#                 'role': 'assistant',\n#                 'content': \"I'd be delighted to help you investigate the issue! Since you've noticed worm-like creatures damaging your roses and other flowers, I'll take a closer look at the possibilities. Here are a few potential culprits: 1.\n# **Aphids**: These small, soft-bodied insects can secrete a sticky substance called\"\n#             }\n#         ]\n#     }\n# ]\n
Source code in src/distilabel/steps/tasks/magpie/base.py
class Magpie(Task, MagpieBase):\n    \"\"\"Generates conversations using an instruct fine-tuned LLM.\n\n    Magpie is a neat method that allows generating user instructions with no seed data\n    or specific system prompt thanks to the autoregressive capabilities of the instruct\n    fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message\n    and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query\n    or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the\n    LLM without any user message, then the LLM will continue generating tokens as if it was\n    the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM.\n    After this instruct is generated, it can be sent again to the LLM to generate this time\n    an assistant response. This process can be repeated N times allowing to build a multi-turn\n    conversation. This method was described in the paper 'Magpie: Alignment Data Synthesis from\n    Scratch by Prompting Aligned LLMs with Nothing'.\n\n    Attributes:\n        n_turns: the number of turns that the generated conversation will have.\n            Defaults to `1`.\n        end_with_user: whether the conversation should end with a user message.\n            Defaults to `False`.\n        include_system_prompt: whether to include the system prompt used in the generated\n            conversation. Defaults to `False`.\n        only_instruction: whether to generate only the instruction. If this argument is\n            `True`, then `n_turns` will be ignored. Defaults to `False`.\n        system_prompt: an optional system prompt, or a list of system prompts from which\n            a random one will be chosen, or a dictionary of system prompts from which a\n            random one will be choosen, or a dictionary of system prompts with their probability\n            of being chosen. 
The random system prompt will be chosen per input/output batch.\n            This system prompt can be used to guide the generation of the instruct LLM and\n            steer it to generate instructions of a certain topic. Defaults to `None`.\n\n    Runtime parameters:\n        - `n_turns`: the number of turns that the generated conversation will have. Defaults\n            to `1`.\n        - `end_with_user`: whether the conversation should end with a user message.\n            Defaults to `False`.\n        - `include_system_prompt`: whether to include the system prompt used in the generated\n            conversation. Defaults to `False`.\n        - `only_instruction`: whether to generate only the instruction. If this argument is\n            `True`, then `n_turns` will be ignored. Defaults to `False`.\n        - `system_prompt`: an optional system prompt or list of system prompts that can\n            be used to steer the LLM to generate content of certain topic, guide the style,\n            etc. If it's a list of system prompts, then a random system prompt will be chosen\n            per input/output batch. If the provided inputs contains a `system_prompt` column,\n            then this runtime parameter will be ignored and the one from the column will\n            be used. Defaults to `None`.\n        - `system_prompt`: an optional system prompt, or a list of system prompts from which\n            a random one will be chosen, or a dictionary of system prompts from which a\n            random one will be choosen, or a dictionary of system prompts with their probability\n            of being chosen. 
The random system prompt will be chosen per input/output batch.\n            This system prompt can be used to guide the generation of the instruct LLM and\n            steer it to generate instructions of a certain topic.\n\n    Input columns:\n        - system_prompt (`str`, optional): an optional system prompt that can be provided\n            to guide the generation of the instruct LLM and steer it to generate instructions\n            of certain topic.\n\n    Output columns:\n        - conversation (`ChatType`): the generated conversation which is a list of chat\n            items with a role and a message. Only if `only_instruction=False`.\n        - instruction (`str`): the generated instructions if `only_instruction=True` or `n_turns==1`.\n        - response (`str`): the generated response if `n_turns==1`.\n        - system_prompt_key (`str`, optional): the key of the system prompt used to generate\n            the conversation or instruction. Only if `system_prompt` is a dictionary.\n        - model_name (`str`): The model name used to generate the `conversation` or `instruction`.\n\n    Categories:\n        - text-generation\n        - instruction\n\n    References:\n        - [Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing](https://arxiv.org/abs/2406.08464)\n\n    Examples:\n        Generating instructions with Llama 3 8B Instruct and TransformersLLM:\n\n        ```python\n        from distilabel.models import TransformersLLM\n        from distilabel.steps.tasks import Magpie\n\n        magpie = Magpie(\n            llm=TransformersLLM(\n                model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n                magpie_pre_query_template=\"llama3\",\n                generation_kwargs={\n                    \"temperature\": 1.0,\n                    \"max_new_tokens\": 64,\n                },\n                device=\"mps\",\n            ),\n            only_instruction=True,\n        )\n\n        
magpie.load()\n\n        result = next(\n            magpie.process(\n                inputs=[\n                    {\n                        \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n                    },\n                    {\n                        \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n                    },\n                ]\n            )\n        )\n        # [\n        #     {'instruction': \"That's me! I'd love some help with solving calculus problems! What kind of calculation are you most effective at? Linear Algebra, derivatives, integrals, optimization?\"},\n        #     {'instruction': 'I was wondering if there are certain flowers and plants that can be used for pest control?'}\n        # ]\n        ```\n\n        Generating conversations with Llama 3 8B Instruct and TransformersLLM:\n\n        ```python\n        from distilabel.models import TransformersLLM\n        from distilabel.steps.tasks import Magpie\n\n        magpie = Magpie(\n            llm=TransformersLLM(\n                model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n                magpie_pre_query_template=\"llama3\",\n                generation_kwargs={\n                    \"temperature\": 1.0,\n                    \"max_new_tokens\": 256,\n                },\n                device=\"mps\",\n            ),\n            n_turns=2,\n        )\n\n        magpie.load()\n\n        result = next(\n            magpie.process(\n                inputs=[\n                    {\n                        \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n                    },\n                    {\n                        \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n                    
},\n                ]\n            )\n        )\n        # [\n        #     {\n        #         'conversation': [\n        #             {'role': 'system', 'content': \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"},\n        #             {\n        #                 'role': 'user',\n        #                 'content': 'I\\'m having trouble solving the limits of functions in calculus. Could you explain how to work with them? Limits of functions are denoted by lim x\u2192a f(x) or lim x\u2192a [f(x)]. It is read as \"the limit as x approaches a of f\n        # of x\".'\n        #             },\n        #             {\n        #                 'role': 'assistant',\n        #                 'content': 'Limits are indeed a fundamental concept in calculus, and understanding them can be a bit tricky at first, but don\\'t worry, I\\'m here to help! The notation lim x\u2192a f(x) indeed means \"the limit as x approaches a of f of\n        # x\". What it\\'s asking us to do is find the'\n        #             }\n        #         ]\n        #     },\n        #     {\n        #         'conversation': [\n        #             {'role': 'system', 'content': \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"},\n        #             {\n        #                 'role': 'user',\n        #                 'content': \"As a flower shop owner, I'm noticing some unusual worm-like creatures causing damage to my roses and other flowers. Can you help me identify what the problem is? Based on your expertise as a florist AI assistant, I think it\n        # might be pests or diseases, but I'm not sure which.\"\n        #             },\n        #             {\n        #                 'role': 'assistant',\n        #                 'content': \"I'd be delighted to help you investigate the issue! 
Since you've noticed worm-like creatures damaging your roses and other flowers, I'll take a closer look at the possibilities. Here are a few potential culprits: 1.\n        # **Aphids**: These small, soft-bodied insects can secrete a sticky substance called\"\n        #             }\n        #         ]\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    def model_post_init(self, __context: Any) -> None:\n        \"\"\"Checks that the provided `LLM` uses the `MagpieChatTemplateMixin`.\"\"\"\n        super().model_post_init(__context)\n\n        if not isinstance(self.llm, MagpieChatTemplateMixin):\n            raise DistilabelUserError(\n                f\"`Magpie` task can only be used with an `LLM` that uses the `MagpieChatTemplateMixin`.\"\n                f\"`{self.llm.__class__.__name__}` doesn't use the aforementioned mixin.\",\n                page=\"components-gallery/tasks/magpie/\",\n            )\n\n        self.llm.use_magpie_template = True\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        return {\"system_prompt\": False}\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"Does nothing.\"\"\"\n        return []\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"Either a multi-turn conversation or the instruction generated.\"\"\"\n        outputs = []\n\n        if self.only_instruction:\n            outputs.append(\"instruction\")\n        elif self.n_turns == 1:\n            outputs.extend([\"instruction\", \"response\"])\n        else:\n            outputs.append(\"conversation\")\n\n        if isinstance(self.system_prompt, dict):\n            outputs.append(\"system_prompt_key\")\n\n        outputs.append(\"model_name\")\n\n        return outputs\n\n    def format_output(\n        self,\n        output: Union[str, None],\n        input: Union[Dict[str, Any], None] = None,\n    ) -> Dict[str, Any]:\n        \"\"\"Does nothing.\"\"\"\n        return {}\n\n    def 
process(self, inputs: StepInput) -> \"StepOutput\":\n        \"\"\"Generate a list of instructions or conversations of the specified number of turns.\n\n        Args:\n            inputs: a list of dictionaries that can contain a `system_prompt` key.\n\n        Yields:\n            The list of generated conversations.\n        \"\"\"\n        yield self._generate_with_pre_query_template(inputs)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.outputs","title":"outputs: StepColumns property","text":"

Either a multi-turn conversation or the instruction generated.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.model_post_init","title":"model_post_init(__context)","text":"

Checks that the provided LLM uses the MagpieChatTemplateMixin.

Source code in src/distilabel/steps/tasks/magpie/base.py
def model_post_init(self, __context: Any) -> None:\n    \"\"\"Checks that the provided `LLM` uses the `MagpieChatTemplateMixin`.\"\"\"\n    super().model_post_init(__context)\n\n    if not isinstance(self.llm, MagpieChatTemplateMixin):\n        raise DistilabelUserError(\n            f\"`Magpie` task can only be used with an `LLM` that uses the `MagpieChatTemplateMixin`.\"\n            f\"`{self.llm.__class__.__name__}` doesn't use the aforementioned mixin.\",\n            page=\"components-gallery/tasks/magpie/\",\n        )\n\n    self.llm.use_magpie_template = True\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.format_input","title":"format_input(input)","text":"

Does nothing.

Source code in src/distilabel/steps/tasks/magpie/base.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"Does nothing.\"\"\"\n    return []\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.format_output","title":"format_output(output, input=None)","text":"

Does nothing.

Source code in src/distilabel/steps/tasks/magpie/base.py
def format_output(\n    self,\n    output: Union[str, None],\n    input: Union[Dict[str, Any], None] = None,\n) -> Dict[str, Any]:\n    \"\"\"Does nothing.\"\"\"\n    return {}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.process","title":"process(inputs)","text":"

Generate a list of instructions or conversations of the specified number of turns.

Parameters:

Name Type Description Default inputs StepInput

a list of dictionaries that can contain a system_prompt key.

required

Yields:

Type Description StepOutput

The list of generated conversations.

Source code in src/distilabel/steps/tasks/magpie/base.py
def process(self, inputs: StepInput) -> \"StepOutput\":\n    \"\"\"Generate a list of instructions or conversations of the specified number of turns.\n\n    Args:\n        inputs: a list of dictionaries that can contain a `system_prompt` key.\n\n    Yields:\n        The list of generated conversations.\n    \"\"\"\n    yield self._generate_with_pre_query_template(inputs)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator","title":"MagpieGenerator","text":"

Bases: GeneratorTask, MagpieBase

Generator task that generates instructions or conversations using Magpie.

Magpie is a neat method that allows generating user instructions with no seed data or specific system prompt thanks to the autoregressive capabilities of the instruct fine-tuned LLMs. As they were fine-tuned using a chat template composed of a user message and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the LLM without any user message, then the LLM will continue generating tokens as if it were the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM. After this instruction is generated, it can be sent again to the LLM, this time to generate an assistant response. This process can be repeated N times, allowing a multi-turn conversation to be built. This method was described in the paper 'Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing'.

Attributes:

Name Type Description n_turns

the number of turns that the generated conversation will have. Defaults to 1.

end_with_user

whether the conversation should end with a user message. Defaults to False.

include_system_prompt

whether to include the system prompt used in the generated conversation. Defaults to False.

only_instruction

whether to generate only the instruction. If this argument is True, then n_turns will be ignored. Defaults to False.

system_prompt

an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Defaults to None.

num_rows RuntimeParameter[int]

the number of rows to be generated.

Runtime parameters
  • n_turns: the number of turns that the generated conversation will have. Defaults to 1.
  • end_with_user: whether the conversation should end with a user message. Defaults to False.
  • include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False.
  • only_instruction: whether to generate only the instruction. If this argument is True, then n_turns will be ignored. Defaults to False.
  • system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic.
  • num_rows: the number of rows to be generated.
Output columns
  • conversation (ChatType): the generated conversation which is a list of chat items with a role and a message.
  • instruction (str): the generated instructions if only_instruction=True.
  • response (str): the generated response if n_turns==1.
  • system_prompt_key (str, optional): the key of the system prompt used to generate the conversation or instruction. Only if system_prompt is a dictionary.
  • model_name (str): The model name used to generate the conversation or instruction.
Categories
  • text-generation
  • instruction
  • generator
References
  • Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing

Examples:

Generating instructions with Llama 3 8B Instruct and TransformersLLM:

from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\ngenerator = MagpieGenerator(\n    llm=TransformersLLM(\n        model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 1.0,\n            \"max_new_tokens\": 256,\n        },\n        device=\"mps\",\n    ),\n    only_instruction=True,\n    num_rows=5,\n)\n\ngenerator.load()\n\nresult = next(generator.process())\n# (\n#       [\n#           {\"instruction\": \"I've just bought a new phone and I're excited to start using it.\"},\n#           {\"instruction\": \"What are the most common types of companies that use digital signage?\"}\n#       ],\n#       True\n# )\n

Generating a conversation with Llama 3 8B Instruct and TransformersLLM:

from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\ngenerator = MagpieGenerator(\n    llm=TransformersLLM(\n        model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 1.0,\n            \"max_new_tokens\": 64,\n        },\n        device=\"mps\",\n    ),\n    n_turns=3,\n    num_rows=5,\n)\n\ngenerator.load()\n\nresult = next(generator.process())\n# (\n#     [\n#         {\n#             'conversation': [\n#                 {\n#                     'role': 'system',\n#                     'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n# insightful responses to help the user with their queries.'\n#                 },\n#                 {'role': 'user', 'content': \"I'm considering starting a social media campaign for my small business and I're not sure where to start. Can you help?\"},\n#                 {\n#                     'role': 'assistant',\n#                     'content': \"Exciting endeavor! Creating a social media campaign can be a great way to increase brand awareness, drive website traffic, and ultimately boost sales. I'd be happy to guide you through the process. To get started,\n# let's break down the basics. First, we need to identify your goals and target audience. What do\"\n#                 },\n#                 {\n#                     'role': 'user',\n#                     'content': \"Before I start a social media campaign, what kind of costs ammol should I expect to pay? There are several factors that contribute to the total cost of running a social media campaign. Let me outline some of the main\n# expenses you might encounter: 1. 
Time: As the business owner, you'll likely spend time creating\"\n#                 },\n#                 {\n#                     'role': 'assistant',\n#                     'content': 'Time is indeed one of the biggest investments when it comes to running a social media campaign! Besides time, you may also incur costs associated with: 2. Content creation: You might need to hire freelancers or\n# agencies to create high-quality content (images, videos, captions) for your social media platforms. 3. Advertising'\n#                 }\n#             ]\n#         },\n#         {\n#             'conversation': [\n#                 {\n#                     'role': 'system',\n#                     'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n# insightful responses to help the user with their queries.'\n#                 },\n#                 {'role': 'user', 'content': \"I am thinking of buying a new laptop or computer. What are some important factors I should consider when making your decision? I'll make sure to let you know if any other favorites or needs come up!\"},\n#                 {\n#                     'role': 'assistant',\n#                     'content': 'Exciting times ahead! When considering a new laptop or computer, there are several key factors to think about to ensure you find the right one for your needs. Here are some crucial ones to get you started: 1.\n# **Purpose**: How will you use your laptop or computer? For work, gaming, video editing,'\n#                 },\n#                 {\n#                     'role': 'user',\n#                     'content': 'Let me stop you there. Let's explore this \"purpose\" factor that you mentioned earlier. Can you elaborate more on what type of devices would be suitable for different purposes? 
For example, if I're primarily using my\n# laptop for general usage like browsing, email, and word processing, would a budget-friendly laptop be sufficient'\n#                 },\n#                 {\n#                     'role': 'assistant',\n#                     'content': \"Understanding your purpose can greatly impact the type of device you'll need. **General Usage (Browsing, Email, Word Processing)**: For casual users who mainly use their laptop for daily tasks, a budget-friendly\n# option can be sufficient. Look for laptops with: * Intel Core i3 or i5 processor* \"\n#                 }\n#             ]\n#         }\n#     ],\n#     True\n# )\n

Generating with system prompts with probabilities:

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\nmagpie = MagpieGenerator(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 0.8,\n            \"max_new_tokens\": 256,\n        },\n    ),\n    n_turns=2,\n    system_prompt={\n        \"math\": (\"You're an expert AI assistant.\", 0.8),\n        \"writing\": (\"You're an expert writing assistant.\", 0.2),\n    },\n)\n\nmagpie.load()\n\nresult = next(magpie.process())\n
Citations
@misc{xu2024magpiealignmentdatasynthesis,\n    title={Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing},\n    author={Zhangchen Xu and Fengqing Jiang and Luyao Niu and Yuntian Deng and Radha Poovendran and Yejin Choi and Bill Yuchen Lin},\n    year={2024},\n    eprint={2406.08464},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2406.08464},\n}\n
Source code in src/distilabel/steps/tasks/magpie/generator.py
class MagpieGenerator(GeneratorTask, MagpieBase):\n    \"\"\"Generator task the generates instructions or conversations using Magpie.\n\n    Magpie is a neat method that allows generating user instructions with no seed data\n    or specific system prompt thanks to the autoregressive capabilities of the instruct\n    fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message\n    and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query\n    or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the\n    LLM without any user message, then the LLM will continue generating tokens as it was\n    the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM.\n    After this instruct is generated, it can be sent again to the LLM to generate this time\n    an assistant response. This process can be repeated N times allowing to build a multi-turn\n    conversation. This method was described in the paper 'Magpie: Alignment Data Synthesis from\n    Scratch by Prompting Aligned LLMs with Nothing'.\n\n    Attributes:\n        n_turns: the number of turns that the generated conversation will have.\n            Defaults to `1`.\n        end_with_user: whether the conversation should end with a user message.\n            Defaults to `False`.\n        include_system_prompt: whether to include the system prompt used in the generated\n            conversation. Defaults to `False`.\n        only_instruction: whether to generate only the instruction. If this argument is\n            `True`, then `n_turns` will be ignored. Defaults to `False`.\n        system_prompt: an optional system prompt, or a list of system prompts from which\n            a random one will be chosen, or a dictionary of system prompts from which a\n            random one will be choosen, or a dictionary of system prompts with their probability\n            of being chosen. 
The random system prompt will be chosen per input/output batch.\n            This system prompt can be used to guide the generation of the instruct LLM and\n            steer it to generate instructions of a certain topic. Defaults to `None`.\n        num_rows: the number of rows to be generated.\n\n    Runtime parameters:\n        - `n_turns`: the number of turns that the generated conversation will have. Defaults\n            to `1`.\n        - `end_with_user`: whether the conversation should end with a user message.\n            Defaults to `False`.\n        - `include_system_prompt`: whether to include the system prompt used in the generated\n            conversation. Defaults to `False`.\n        - `only_instruction`: whether to generate only the instruction. If this argument is\n            `True`, then `n_turns` will be ignored. Defaults to `False`.\n        - `system_prompt`: an optional system prompt, or a list of system prompts from which\n            a random one will be chosen, or a dictionary of system prompts from which a\n            random one will be choosen, or a dictionary of system prompts with their probability\n            of being chosen. The random system prompt will be chosen per input/output batch.\n            This system prompt can be used to guide the generation of the instruct LLM and\n            steer it to generate instructions of a certain topic.\n        - `num_rows`: the number of rows to be generated.\n\n    Output columns:\n        - conversation (`ChatType`): the generated conversation which is a list of chat\n            items with a role and a message.\n        - instruction (`str`): the generated instructions if `only_instruction=True`.\n        - response (`str`): the generated response if `n_turns==1`.\n        - system_prompt_key (`str`, optional): the key of the system prompt used to generate\n            the conversation or instruction. 
Only if `system_prompt` is a dictionary.\n        - model_name (`str`): The model name used to generate the `conversation` or `instruction`.\n\n    Categories:\n        - text-generation\n        - instruction\n        - generator\n\n    References:\n        - [Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing](https://arxiv.org/abs/2406.08464)\n\n    Examples:\n        Generating instructions with Llama 3 8B Instruct and TransformersLLM:\n\n        ```python\n        from distilabel.models import TransformersLLM\n        from distilabel.steps.tasks import MagpieGenerator\n\n        generator = MagpieGenerator(\n            llm=TransformersLLM(\n                model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n                magpie_pre_query_template=\"llama3\",\n                generation_kwargs={\n                    \"temperature\": 1.0,\n                    \"max_new_tokens\": 256,\n                },\n                device=\"mps\",\n            ),\n            only_instruction=True,\n            num_rows=5,\n        )\n\n        generator.load()\n\n        result = next(generator.process())\n        # (\n        #       [\n        #           {\"instruction\": \"I've just bought a new phone and I're excited to start using it.\"},\n        #           {\"instruction\": \"What are the most common types of companies that use digital signage?\"}\n        #       ],\n        #       True\n        # )\n        ```\n\n        Generating a conversation with Llama 3 8B Instruct and TransformersLLM:\n\n        ```python\n        from distilabel.models import TransformersLLM\n        from distilabel.steps.tasks import MagpieGenerator\n\n        generator = MagpieGenerator(\n            llm=TransformersLLM(\n                model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n                magpie_pre_query_template=\"llama3\",\n                generation_kwargs={\n                    \"temperature\": 1.0,\n                    
\"max_new_tokens\": 64,\n                },\n                device=\"mps\",\n            ),\n            n_turns=3,\n            num_rows=5,\n        )\n\n        generator.load()\n\n        result = next(generator.process())\n        # (\n        #     [\n        #         {\n        #             'conversation': [\n        #                 {\n        #                     'role': 'system',\n        #                     'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n        # insightful responses to help the user with their queries.'\n        #                 },\n        #                 {'role': 'user', 'content': \"I'm considering starting a social media campaign for my small business and I're not sure where to start. Can you help?\"},\n        #                 {\n        #                     'role': 'assistant',\n        #                     'content': \"Exciting endeavor! Creating a social media campaign can be a great way to increase brand awareness, drive website traffic, and ultimately boost sales. I'd be happy to guide you through the process. To get started,\n        # let's break down the basics. First, we need to identify your goals and target audience. What do\"\n        #                 },\n        #                 {\n        #                     'role': 'user',\n        #                     'content': \"Before I start a social media campaign, what kind of costs ammol should I expect to pay? There are several factors that contribute to the total cost of running a social media campaign. Let me outline some of the main\n        # expenses you might encounter: 1. 
Time: As the business owner, you'll likely spend time creating\"\n        #                 },\n        #                 {\n        #                     'role': 'assistant',\n        #                     'content': 'Time is indeed one of the biggest investments when it comes to running a social media campaign! Besides time, you may also incur costs associated with: 2. Content creation: You might need to hire freelancers or\n        # agencies to create high-quality content (images, videos, captions) for your social media platforms. 3. Advertising'\n        #                 }\n        #             ]\n        #         },\n        #         {\n        #             'conversation': [\n        #                 {\n        #                     'role': 'system',\n        #                     'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n        # insightful responses to help the user with their queries.'\n        #                 },\n        #                 {'role': 'user', 'content': \"I am thinking of buying a new laptop or computer. What are some important factors I should consider when making your decision? I'll make sure to let you know if any other favorites or needs come up!\"},\n        #                 {\n        #                     'role': 'assistant',\n        #                     'content': 'Exciting times ahead! When considering a new laptop or computer, there are several key factors to think about to ensure you find the right one for your needs. Here are some crucial ones to get you started: 1.\n        # **Purpose**: How will you use your laptop or computer? 
For work, gaming, video editing,'\n        #                 },\n        #                 {\n        #                     'role': 'user',\n        #                     'content': 'Let me stop you there. Let\\'s explore this \"purpose\" factor that you mentioned earlier. Can you elaborate more on what type of devices would be suitable for different purposes? For example, if I\\'re primarily using my\n        # laptop for general usage like browsing, email, and word processing, would a budget-friendly laptop be sufficient'\n        #                 },\n        #                 {\n        #                     'role': 'assistant',\n        #                     'content': \"Understanding your purpose can greatly impact the type of device you'll need. **General Usage (Browsing, Email, Word Processing)**: For casual users who mainly use their laptop for daily tasks, a budget-friendly\n        # option can be sufficient. Look for laptops with: * Intel Core i3 or i5 processor* \"\n        #                 }\n        #             ]\n        #         }\n        #     ],\n        #     True\n        # )\n        ```\n\n        Generating with system prompts with probabilities:\n\n        ```python\n        from distilabel.models import InferenceEndpointsLLM\n        from distilabel.steps.tasks import MagpieGenerator\n\n        magpie = MagpieGenerator(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n                magpie_pre_query_template=\"llama3\",\n                generation_kwargs={\n                    \"temperature\": 0.8,\n                    \"max_new_tokens\": 256,\n                },\n            ),\n            n_turns=2,\n            system_prompt={\n                \"math\": (\"You're an expert AI assistant.\", 0.8),\n                \"writing\": (\"You're an expert writing assistant.\", 0.2),\n            },\n        )\n\n 
       magpie.load()\n\n        result = next(magpie.process())\n        ```\n\n    Citations:\n        ```\n        @misc{xu2024magpiealignmentdatasynthesis,\n            title={Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing},\n            author={Zhangchen Xu and Fengqing Jiang and Luyao Niu and Yuntian Deng and Radha Poovendran and Yejin Choi and Bill Yuchen Lin},\n            year={2024},\n            eprint={2406.08464},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2406.08464},\n        }\n        ```\n    \"\"\"\n\n    # TODO: move this to `GeneratorTask`\n    num_rows: RuntimeParameter[int] = Field(\n        default=None, description=\"The number of rows to generate.\"\n    )\n\n    def model_post_init(self, __context: Any) -> None:\n        \"\"\"Checks that the provided `LLM` uses the `MagpieChatTemplateMixin`.\"\"\"\n        super().model_post_init(__context)\n\n        if not isinstance(self.llm, MagpieChatTemplateMixin):\n            raise DistilabelUserError(\n                f\"`Magpie` task can only be used with an `LLM` that uses the `MagpieChatTemplateMixin`.\"\n                f\"`{self.llm.__class__.__name__}` doesn't use the aforementioned mixin.\",\n                page=\"components-gallery/tasks/magpiegenerator/\",\n            )\n\n        self.llm.use_magpie_template = True\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"Either a multi-turn conversation or the instruction generated.\"\"\"\n        outputs = []\n\n        if self.only_instruction:\n            outputs.append(\"instruction\")\n        elif self.n_turns == 1:\n            outputs.extend([\"instruction\", \"response\"])\n        else:\n            outputs.append(\"conversation\")\n\n        if isinstance(self.system_prompt, dict):\n            outputs.append(\"system_prompt_key\")\n\n        outputs.append(\"model_name\")\n\n        return 
outputs\n\n    def format_output(\n        self,\n        output: Union[str, None],\n        input: Union[Dict[str, Any], None] = None,\n    ) -> Dict[str, Any]:\n        \"\"\"Does nothing.\"\"\"\n        return {}\n\n    def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n        \"\"\"Generates the desired number of instructions or conversations using Magpie.\n\n        Args:\n            offset: The offset to start the generation from. Defaults to `0`.\n\n        Yields:\n            The generated instructions or conversations.\n        \"\"\"\n        generated = offset\n\n        while generated <= self.num_rows:  # type: ignore\n            rows_to_generate = (\n                self.num_rows if self.num_rows < self.batch_size else self.batch_size  # type: ignore\n            )\n            conversations = self._generate_with_pre_query_template(\n                inputs=[{} for _ in range(rows_to_generate)]  # type: ignore\n            )\n            generated += rows_to_generate  # type: ignore\n            yield (conversations, generated == self.num_rows)\n\n    @override\n    def _sample_input(self) -> \"ChatType\":\n        return self._generate_with_pre_query_template(inputs=[{}])\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator.outputs","title":"outputs: StepColumns property","text":"

Either a multi-turn conversation or the instruction generated.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator.model_post_init","title":"model_post_init(__context)","text":"

Checks that the provided LLM uses the MagpieChatTemplateMixin.

Source code in src/distilabel/steps/tasks/magpie/generator.py
def model_post_init(self, __context: Any) -> None:\n    \"\"\"Checks that the provided `LLM` uses the `MagpieChatTemplateMixin`.\"\"\"\n    super().model_post_init(__context)\n\n    if not isinstance(self.llm, MagpieChatTemplateMixin):\n        raise DistilabelUserError(\n            f\"`Magpie` task can only be used with an `LLM` that uses the `MagpieChatTemplateMixin`.\"\n            f\"`{self.llm.__class__.__name__}` doesn't use the aforementioned mixin.\",\n            page=\"components-gallery/tasks/magpiegenerator/\",\n        )\n\n    self.llm.use_magpie_template = True\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator.format_output","title":"format_output(output, input=None)","text":"

Does nothing.

Source code in src/distilabel/steps/tasks/magpie/generator.py
def format_output(\n    self,\n    output: Union[str, None],\n    input: Union[Dict[str, Any], None] = None,\n) -> Dict[str, Any]:\n    \"\"\"Does nothing.\"\"\"\n    return {}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator.process","title":"process(offset=0)","text":"

Generates the desired number of instructions or conversations using Magpie.

Parameters:

Name Type Description Default offset int

The offset to start the generation from. Defaults to 0.

0

Yields:

Type Description GeneratorStepOutput

The generated instructions or conversations.

Source code in src/distilabel/steps/tasks/magpie/generator.py
def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n    \"\"\"Generates the desired number of instructions or conversations using Magpie.\n\n    Args:\n        offset: The offset to start the generation from. Defaults to `0`.\n\n    Yields:\n        The generated instructions or conversations.\n    \"\"\"\n    generated = offset\n\n    while generated <= self.num_rows:  # type: ignore\n        rows_to_generate = (\n            self.num_rows if self.num_rows < self.batch_size else self.batch_size  # type: ignore\n        )\n        conversations = self._generate_with_pre_query_template(\n            inputs=[{} for _ in range(rows_to_generate)]  # type: ignore\n        )\n        generated += rows_to_generate  # type: ignore\n        yield (conversations, generated == self.num_rows)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM","title":"PairRM","text":"

Bases: Step

Rank the candidates based on the input using the LLM model.

Attributes:

Name Type Description model str

The model to use for the ranking. Defaults to \"llm-blender/PairRM\".

instructions Optional[str]

The instructions to use for the model. Defaults to None.

Input columns
  • inputs (List[Dict[str, Any]]): The input text or conversation to rank the candidates for.
  • candidates (List[Dict[str, Any]]): The candidates to rank.
Output columns
  • ranks (List[int]): The ranks of the candidates based on the input.
  • ranked_candidates (List[Dict[str, Any]]): The candidates ranked based on the input.
  • model_name (str): The model name used to rank the candidate responses. Defaults to \"llm-blender/PairRM\".
References
  • LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion.
  • Pair Ranking Model.
Categories
  • preference
Note

This step differs from other tasks as there is a single implementation of this model currently, and we will use a specific LLM.

Examples:

Rank LLM candidates:

from distilabel.steps.tasks import PairRM\n\n# Consider this as a placeholder for your actual LLM.\npair_rm = PairRM()\n\npair_rm.load()\n\nresult = next(\n    scorer.process(\n        [\n            {\"input\": \"Hello, how are you?\", \"candidates\": [\"fine\", \"good\", \"bad\"]},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'input': 'Hello, how are you?',\n#         'candidates': ['fine', 'good', 'bad'],\n#         'ranks': [2, 1, 3],\n#         'ranked_candidates': ['good', 'fine', 'bad'],\n#         'model_name': 'llm-blender/PairRM',\n#     }\n# ]\n
Citations
@misc{jiang2023llmblenderensemblinglargelanguage,\n    title={LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion},\n    author={Dongfu Jiang and Xiang Ren and Bill Yuchen Lin},\n    year={2023},\n    eprint={2306.02561},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2306.02561},\n}\n
Source code in src/distilabel/steps/tasks/pair_rm.py
class PairRM(Step):\n    \"\"\"Rank the candidates based on the input using the `LLM` model.\n\n    Attributes:\n        model: The model to use for the ranking. Defaults to `\"llm-blender/PairRM\"`.\n        instructions: The instructions to use for the model. Defaults to `None`.\n\n    Input columns:\n        - inputs (`List[Dict[str, Any]]`): The input text or conversation to rank the candidates for.\n        - candidates (`List[Dict[str, Any]]`): The candidates to rank.\n\n    Output columns:\n        - ranks (`List[int]`): The ranks of the candidates based on the input.\n        - ranked_candidates (`List[Dict[str, Any]]`): The candidates ranked based on the input.\n        - model_name (`str`): The model name used to rank the candidate responses. Defaults to `\"llm-blender/PairRM\"`.\n\n    References:\n        - [LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion](https://arxiv.org/abs/2306.02561).\n        - [Pair Ranking Model](https://huggingface.co/llm-blender/PairRM).\n\n    Categories:\n        - preference\n\n    Note:\n        This step differs to other tasks as there is a single implementation of this model\n        currently, and we will use a specific `LLM`.\n\n    Examples:\n        Rank LLM candidates:\n\n        ```python\n        from distilabel.steps.tasks import PairRM\n\n        # Consider this as a placeholder for your actual LLM.\n        pair_rm = PairRM()\n\n        pair_rm.load()\n\n        result = next(\n            scorer.process(\n                [\n                    {\"input\": \"Hello, how are you?\", \"candidates\": [\"fine\", \"good\", \"bad\"]},\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'input': 'Hello, how are you?',\n        #         'candidates': ['fine', 'good', 'bad'],\n        #         'ranks': [2, 1, 3],\n        #         'ranked_candidates': ['good', 'fine', 'bad'],\n        #         'model_name': 
'llm-blender/PairRM',\n        #     }\n        # ]\n        ```\n\n    Citations:\n        ```\n        @misc{jiang2023llmblenderensemblinglargelanguage,\n            title={LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion},\n            author={Dongfu Jiang and Xiang Ren and Bill Yuchen Lin},\n            year={2023},\n            eprint={2306.02561},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2306.02561},\n        }\n        ```\n    \"\"\"\n\n    model: str = \"llm-blender/PairRM\"\n    instructions: Optional[str] = None\n\n    def load(self) -> None:\n        \"\"\"Loads the PairRM model provided via `model` with `llm_blender.Blender`, which is the\n        custom library for running the inference for the PairRM models.\"\"\"\n        try:\n            import llm_blender\n        except ImportError as e:\n            raise ImportError(\n                \"The `llm_blender` package is required to use the `PairRM` class.\"\n                \"Please install it with `pip install git+https://github.com/yuchenlin/LLM-Blender.git`.\"\n            ) from e\n\n        self._blender = llm_blender.Blender()\n        self._blender.loadranker(self.model)\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The input columns correspond to the two required arguments from `Blender.rank`:\n        `inputs` and `candidates`.\"\"\"\n        return [\"input\", \"candidates\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The outputs will include the `ranks` and the `ranked_candidates`.\"\"\"\n        return [\"ranks\", \"ranked_candidates\", \"model_name\"]\n\n    def format_input(self, input: Dict[str, Any]) -> Dict[str, Any]:\n        \"\"\"The input is expected to be a dictionary with the keys `input` and `candidates`,\n        where the `input` corresponds to the instruction of a model and `candidates` are a\n        
list of responses to be ranked.\n        \"\"\"\n        return {\"input\": input[\"input\"], \"candidates\": input[\"candidates\"]}\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Generates the ranks for the candidates based on the input.\n\n        The ranks are the positions of the candidates, where lower is better,\n        and the ranked candidates correspond to the candidates sorted according to the\n        ranks obtained.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Yields:\n            An iterator with the inputs containing the `ranks`, `ranked_candidates`, and `model_name`.\n        \"\"\"\n        input_texts = []\n        candidates = []\n        for input in inputs:\n            formatted_input = self.format_input(input)\n            input_texts.append(formatted_input[\"input\"])\n            candidates.append(formatted_input[\"candidates\"])\n\n        instructions = (\n            [self.instructions] * len(input_texts) if self.instructions else None\n        )\n\n        ranks = self._blender.rank(\n            input_texts,\n            candidates,\n            instructions=instructions,\n            return_scores=False,\n            batch_size=self.input_batch_size,\n        )\n        # Sort the candidates based on the ranks\n        ranked_candidates = np.take_along_axis(\n            np.array(candidates), ranks - 1, axis=1\n        ).tolist()\n        ranks = ranks.tolist()\n        for input, rank, ranked_candidate in zip(inputs, ranks, ranked_candidates):\n            input[\"ranks\"] = rank\n            input[\"ranked_candidates\"] = ranked_candidate\n            input[\"model_name\"] = self.model\n\n        yield inputs\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.inputs","title":"inputs: StepColumns property","text":"

The input columns correspond to the two required arguments from Blender.rank: inputs and candidates.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.outputs","title":"outputs: StepColumns property","text":"

The outputs will include the ranks and the ranked_candidates.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.load","title":"load()","text":"

Loads the PairRM model provided via model with llm_blender.Blender, which is the custom library for running the inference for the PairRM models.

Source code in src/distilabel/steps/tasks/pair_rm.py
def load(self) -> None:\n    \"\"\"Loads the PairRM model provided via `model` with `llm_blender.Blender`, which is the\n    custom library for running the inference for the PairRM models.\"\"\"\n    try:\n        import llm_blender\n    except ImportError as e:\n        raise ImportError(\n            \"The `llm_blender` package is required to use the `PairRM` class.\"\n            \"Please install it with `pip install git+https://github.com/yuchenlin/LLM-Blender.git`.\"\n        ) from e\n\n    self._blender = llm_blender.Blender()\n    self._blender.loadranker(self.model)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.format_input","title":"format_input(input)","text":"

The input is expected to be a dictionary with the keys input and candidates, where the input corresponds to the instruction of a model and candidates are a list of responses to be ranked.

Source code in src/distilabel/steps/tasks/pair_rm.py
def format_input(self, input: Dict[str, Any]) -> Dict[str, Any]:\n    \"\"\"The input is expected to be a dictionary with the keys `input` and `candidates`,\n    where the `input` corresponds to the instruction of a model and `candidates` are a\n    list of responses to be ranked.\n    \"\"\"\n    return {\"input\": input[\"input\"], \"candidates\": input[\"candidates\"]}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.process","title":"process(inputs)","text":"

Generates the ranks for the candidates based on the input.

The ranks are the positions of the candidates, where lower is better, and the ranked candidates correspond to the candidates sorted according to the ranks obtained.

Parameters:

Name Type Description Default inputs StepInput

A list of Python dictionaries with the inputs of the task.

required

Yields:

Type Description StepOutput

An iterator with the inputs containing the ranks, ranked_candidates, and model_name.

Source code in src/distilabel/steps/tasks/pair_rm.py
def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Generates the ranks for the candidates based on the input.\n\n    The ranks are the positions of the candidates, where lower is better,\n    and the ranked candidates correspond to the candidates sorted according to the\n    ranks obtained.\n\n    Args:\n        inputs: A list of Python dictionaries with the inputs of the task.\n\n    Yields:\n        An iterator with the inputs containing the `ranks`, `ranked_candidates`, and `model_name`.\n    \"\"\"\n    input_texts = []\n    candidates = []\n    for input in inputs:\n        formatted_input = self.format_input(input)\n        input_texts.append(formatted_input[\"input\"])\n        candidates.append(formatted_input[\"candidates\"])\n\n    instructions = (\n        [self.instructions] * len(input_texts) if self.instructions else None\n    )\n\n    ranks = self._blender.rank(\n        input_texts,\n        candidates,\n        instructions=instructions,\n        return_scores=False,\n        batch_size=self.input_batch_size,\n    )\n    # Sort the candidates based on the ranks\n    ranked_candidates = np.take_along_axis(\n        np.array(candidates), ranks - 1, axis=1\n    ).tolist()\n    ranks = ranks.tolist()\n    for input, rank, ranked_candidate in zip(inputs, ranks, ranked_candidates):\n        input[\"ranks\"] = rank\n        input[\"ranked_candidates\"] = ranked_candidate\n        input[\"model_name\"] = self.model\n\n    yield inputs\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval","title":"PrometheusEval","text":"

Bases: Task

Critique and rank the quality of generations from an LLM using Prometheus 2.0.

PrometheusEval is a task created for Prometheus 2.0, covering both the absolute and relative evaluations. The absolute evaluation i.e. mode=\"absolute\" is used to evaluate a single generation from an LLM for a given instruction. The relative evaluation i.e. mode=\"relative\" is used to evaluate two generations from an LLM for a given instruction. Both evaluations provide the possibility of using a reference answer to compare with or without the reference attribute, and both are based on a score rubric that critiques the generation/s based on the following default aspects: helpfulness, harmlessness, honesty, factual-validity, and reasoning, that can be overridden via rubrics, and the selected rubric is set via the attribute rubric.

Note

The PrometheusEval task is better suited and intended to be used with any of the Prometheus 2.0 models released by Kaist AI, being: https://huggingface.co/prometheus-eval/prometheus-7b-v2.0, and https://huggingface.co/prometheus-eval/prometheus-8x7b-v2.0. The critique assessment formatting and quality is not guaranteed if using another model, even though some other models may be able to correctly follow the formatting and generate insightful critiques too.

Attributes:

Name Type Description mode Literal['absolute', 'relative']

the evaluation mode to use, either absolute or relative. It defines whether the task will evaluate one or two generations.

rubric str

the score rubric to use within the prompt to run the critique based on different aspects. Can be any existing key in the rubrics attribute, which by default means that it can be: helpfulness, harmlessness, honesty, factual-validity, or reasoning. Those will only work if using the default rubrics, otherwise, the provided rubrics should be used.

rubrics Optional[Dict[str, str]]

a dictionary containing the different rubrics to use for the critique, where the keys are the rubric names and the values are the rubric descriptions. The default rubrics are the following: helpfulness, harmlessness, honesty, factual-validity, and reasoning.

reference bool

a boolean flag to indicate whether a reference answer / completion will be provided, so that the model critique is based on the comparison with it. It implies that the column reference needs to be provided within the input data in addition to the rest of the inputs.

_template Union[Template, None]

a Jinja2 template used to format the input for the LLM.

Input columns
  • instruction (str): The instruction to use as reference.
  • generation (str, optional): The generated text from the given instruction. This column is required if mode=absolute.
  • generations (List[str], optional): The generated texts from the given instruction. It should contain 2 generations only. This column is required if mode=relative.
  • reference (str, optional): The reference / golden answer for the instruction, to be used by the LLM for comparison against.
Output columns
  • feedback (str): The feedback explaining the result below, as critiqued by the LLM using the pre-defined score rubric, compared against reference if provided.
  • result (Union[int, Literal[\"A\", \"B\"]]): If mode=absolute, then the result contains the score for the generation in a likert-scale from 1-5, otherwise, if mode=relative, then the result contains either \"A\" or \"B\", the \"winning\" one being the generation in the index 0 of generations if result='A' or the index 1 if result='B'.
  • model_name (str): The model name used to generate the feedback and result.
Categories
  • critique
  • preference
References
  • Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models
  • prometheus-eval: Evaluate your LLM's response with Prometheus \ud83d\udcaf

Examples:

Critique and evaluate LLM generation quality using Prometheus 2.0:

from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n    llm=vLLM(\n        model=\"prometheus-eval/prometheus-7b-v2.0\",\n        chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n    ),\n    mode=\"absolute\",\n    rubric=\"factual-validity\"\n)\n\nprometheus.load()\n\nresult = next(\n    prometheus.process(\n        [\n            {\"instruction\": \"make something\", \"generation\": \"something done\"},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'make something',\n#         'generation': 'something done',\n#         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n#         'feedback': 'the feedback',\n#         'result': 6,\n#     }\n# ]\n

Critique for relative evaluation:

from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n    llm=vLLM(\n        model=\"prometheus-eval/prometheus-7b-v2.0\",\n        chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n    ),\n    mode=\"relative\",\n    rubric=\"honesty\"\n)\n\nprometheus.load()\n\nresult = next(\n    prometheus.process(\n        [\n            {\"instruction\": \"make something\", \"generations\": [\"something done\", \"other thing\"]},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'make something',\n#         'generations': ['something done', 'other thing'],\n#         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n#         'feedback': 'the feedback',\n#         'result': 'something done',\n#     }\n# ]\n

Critique with a custom rubric:

from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n    llm=vLLM(\n        model=\"prometheus-eval/prometheus-7b-v2.0\",\n        chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n    ),\n    mode=\"absolute\",\n    rubric=\"custom\",\n    rubrics={\n        \"custom\": \"[A]\\nScore 1: A\\nScore 2: B\\nScore 3: C\\nScore 4: D\\nScore 5: E\"\n    }\n)\n\nprometheus.load()\n\nresult = next(\n    prometheus.process(\n        [\n            {\"instruction\": \"make something\", \"generation\": \"something done\"},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'make something',\n#         'generation': 'something done',\n#         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n#         'feedback': 'the feedback',\n#         'result': 6,\n#     }\n# ]\n

Critique using a reference answer:

from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n    llm=vLLM(\n        model=\"prometheus-eval/prometheus-7b-v2.0\",\n        chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n    ),\n    mode=\"absolute\",\n    rubric=\"helpfulness\",\n    reference=True,\n)\n\nprometheus.load()\n\nresult = next(\n    prometheus.process(\n        [\n            {\n                \"instruction\": \"make something\",\n                \"generation\": \"something done\",\n                \"reference\": \"this is a reference answer\",\n            },\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'make something',\n#         'generation': 'something done',\n#         'reference': 'this is a reference answer',\n#         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n#         'feedback': 'the feedback',\n#         'result': 6,\n#     }\n# ]\n
Citations
@misc{kim2024prometheus2opensource,\n    title={Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models},\n    author={Seungone Kim and Juyoung Suk and Shayne Longpre and Bill Yuchen Lin and Jamin Shin and Sean Welleck and Graham Neubig and Moontae Lee and Kyungjae Lee and Minjoon Seo},\n    year={2024},\n    eprint={2405.01535},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2405.01535},\n}\n
Source code in src/distilabel/steps/tasks/prometheus_eval.py
class PrometheusEval(Task):\n    \"\"\"Critique and rank the quality of generations from an `LLM` using Prometheus 2.0.\n\n    `PrometheusEval` is a task created for Prometheus 2.0, covering both the absolute and relative\n    evaluations. The absolute evaluation i.e. `mode=\"absolute\"` is used to evaluate a single generation from\n    an LLM for a given instruction. The relative evaluation i.e. `mode=\"relative\"` is used to evaluate two generations from an LLM\n    for a given instruction.\n    Both evaluations provide the possibility of using a reference answer to compare with or withoug\n    the `reference` attribute, and both are based on a score rubric that critiques the generation/s\n    based on the following default aspects: `helpfulness`, `harmlessness`, `honesty`, `factual-validity`,\n    and `reasoning`, that can be overridden via `rubrics`, and the selected rubric is set via the attribute\n    `rubric`.\n\n    Note:\n        The `PrometheusEval` task is better suited and intended to be used with any of the Prometheus 2.0\n        models released by Kaist AI, being: https://huggingface.co/prometheus-eval/prometheus-7b-v2.0,\n        and https://huggingface.co/prometheus-eval/prometheus-8x7b-v2.0. The critique assessment formatting\n        and quality is not guaranteed if using another model, even though some other models may be able to\n        correctly follow the formatting and generate insightful critiques too.\n\n    Attributes:\n        mode: the evaluation mode to use, either `absolute` or `relative`. It defines whether the task\n            will evaluate one or two generations.\n        rubric: the score rubric to use within the prompt to run the critique based on different aspects.\n            Can be any existing key in the `rubrics` attribute, which by default means that it can be:\n            `helpfulness`, `harmlessness`, `honesty`, `factual-validity`, or `reasoning`. 
Those will only\n            work if using the default `rubrics`, otherwise, the provided `rubrics` should be used.\n        rubrics: a dictionary containing the different rubrics to use for the critique, where the keys are\n            the rubric names and the values are the rubric descriptions. The default rubrics are the following:\n            `helpfulness`, `harmlessness`, `honesty`, `factual-validity`, and `reasoning`.\n        reference: a boolean flag to indicate whether a reference answer / completion will be provided, so\n            that the model critique is based on the comparison with it. It implies that the column `reference`\n            needs to be provided within the input data in addition to the rest of the inputs.\n        _template: a Jinja2 template used to format the input for the LLM.\n\n    Input columns:\n        - instruction (`str`): The instruction to use as reference.\n        - generation (`str`, optional): The generated text from the given `instruction`. This column is required\n            if `mode=absolute`.\n        - generations (`List[str]`, optional): The generated texts from the given `instruction`. It should\n            contain 2 generations only. 
This column is required if `mode=relative`.\n        - reference (`str`, optional): The reference / golden answer for the `instruction`, to be used by the LLM\n            for comparison against.\n\n    Output columns:\n        - feedback (`str`): The feedback explaining the result below, as critiqued by the LLM using the\n            pre-defined score rubric, compared against `reference` if provided.\n        - result (`Union[int, Literal[\"A\", \"B\"]]`): If `mode=absolute`, then the result contains the score for the\n            `generation` in a likert-scale from 1-5, otherwise, if `mode=relative`, then the result contains either\n            \"A\" or \"B\", the \"winning\" one being the generation in the index 0 of `generations` if `result='A'` or the\n            index 1 if `result='B'`.\n        - model_name (`str`): The model name used to generate the `feedback` and `result`.\n\n    Categories:\n        - critique\n        - preference\n\n    References:\n        - [Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models](https://arxiv.org/abs/2405.01535)\n        - [prometheus-eval: Evaluate your LLM's response with Prometheus \ud83d\udcaf](https://github.com/prometheus-eval/prometheus-eval)\n\n    Examples:\n        Critique and evaluate LLM generation quality using Prometheus 2_0:\n\n        ```python\n        from distilabel.steps.tasks import PrometheusEval\n        from distilabel.models import vLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        prometheus = PrometheusEval(\n            llm=vLLM(\n                model=\"prometheus-eval/prometheus-7b-v2.0\",\n                chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n            ),\n            mode=\"absolute\",\n            rubric=\"factual-validity\"\n        )\n\n        prometheus.load()\n\n        result = next(\n            prometheus.process(\n                [\n          
          {\"instruction\": \"make something\", \"generation\": \"something done\"},\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'instruction': 'make something',\n        #         'generation': 'something done',\n        #         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n        #         'feedback': 'the feedback',\n        #         'result': 6,\n        #     }\n        # ]\n        ```\n\n        Critique for relative evaluation:\n\n        ```python\n        from distilabel.steps.tasks import PrometheusEval\n        from distilabel.models import vLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        prometheus = PrometheusEval(\n            llm=vLLM(\n                model=\"prometheus-eval/prometheus-7b-v2.0\",\n                chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n            ),\n            mode=\"relative\",\n            rubric=\"honesty\"\n        )\n\n        prometheus.load()\n\n        result = next(\n            prometheus.process(\n                [\n                    {\"instruction\": \"make something\", \"generations\": [\"something done\", \"other thing\"]},\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'instruction': 'make something',\n        #         'generations': ['something done', 'other thing'],\n        #         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n        #         'feedback': 'the feedback',\n        #         'result': 'something done',\n        #     }\n        # ]\n        ```\n\n        Critique with a custom rubric:\n\n        ```python\n        from distilabel.steps.tasks import PrometheusEval\n        from distilabel.models import vLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        prometheus = PrometheusEval(\n            llm=vLLM(\n                
model=\"prometheus-eval/prometheus-7b-v2.0\",\n                chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n            ),\n            mode=\"absolute\",\n            rubric=\"custom\",\n            rubrics={\n                \"custom\": \"[A]\\\\nScore 1: A\\\\nScore 2: B\\\\nScore 3: C\\\\nScore 4: D\\\\nScore 5: E\"\n            }\n        )\n\n        prometheus.load()\n\n        result = next(\n            prometheus.process(\n                [\n                    {\"instruction\": \"make something\", \"generation\": \"something done\"},\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'instruction': 'make something',\n        #         'generation': 'something done',\n        #         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n        #         'feedback': 'the feedback',\n        #         'result': 6,\n        #     }\n        # ]\n        ```\n\n        Critique using a reference answer:\n\n        ```python\n        from distilabel.steps.tasks import PrometheusEval\n        from distilabel.models import vLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        prometheus = PrometheusEval(\n            llm=vLLM(\n                model=\"prometheus-eval/prometheus-7b-v2.0\",\n                chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n            ),\n            mode=\"absolute\",\n            rubric=\"helpfulness\",\n            reference=True,\n        )\n\n        prometheus.load()\n\n        result = next(\n            prometheus.process(\n                [\n                    {\n                        \"instruction\": \"make something\",\n                        \"generation\": \"something done\",\n                        \"reference\": \"this is a reference answer\",\n                    },\n                ]\n            )\n        )\n  
      # result\n        # [\n        #     {\n        #         'instruction': 'make something',\n        #         'generation': 'something done',\n        #         'reference': 'this is a reference answer',\n        #         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n        #         'feedback': 'the feedback',\n        #         'result': 6,\n        #     }\n        # ]\n        ```\n\n    Citations:\n        ```\n        @misc{kim2024prometheus2opensource,\n            title={Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models},\n            author={Seungone Kim and Juyoung Suk and Shayne Longpre and Bill Yuchen Lin and Jamin Shin and Sean Welleck and Graham Neubig and Moontae Lee and Kyungjae Lee and Minjoon Seo},\n            year={2024},\n            eprint={2405.01535},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2405.01535},\n        }\n        ```\n    \"\"\"\n\n    mode: Literal[\"absolute\", \"relative\"]\n    rubric: str\n    rubrics: Optional[Dict[str, str]] = Field(default=_DEFAULT_RUBRICS)\n    reference: bool = False\n\n    _template: Union[Template, None] = PrivateAttr(...)\n\n    @model_validator(mode=\"after\")\n    def validate_rubric_and_rubrics(self) -> Self:\n        if not isinstance(self.rubrics, dict) or len(self.rubrics) < 1:\n            raise DistilabelUserError(\n                \"Provided `rubrics` must be a Python dictionary with string keys and string values.\",\n                page=\"components-gallery/tasks/prometheuseval/\",\n            )\n\n        def rubric_matches_pattern(rubric: str) -> bool:\n            \"\"\"Checks if the provided rubric matches the pattern of the default rubrics.\"\"\"\n            pattern = r\"^\\[.*?\\]\\n(?:Score [1-4]: .*?\\n){4}(?:Score 5: .*?)\"\n            return bool(re.match(pattern, rubric, re.MULTILINE))\n\n        if not all(rubric_matches_pattern(value) for value in 
self.rubrics.values()):\n            raise DistilabelUserError(\n                \"Provided rubrics should match the format of the default rubrics, which\"\n                \" is as follows: `[<scoring criteria>]\\nScore 1: <description>\\nScore 2: <description>\\n\"\n                \"Score 3: <description>\\nScore 4: <description>\\nScore 5: <description>`; replacing\"\n                \" `<scoring criteria>` and `<description>` with the actual criteria and description\"\n                \" for each or the scores, respectively.\",\n                page=\"components-gallery/tasks/prometheuseval/\",\n            )\n\n        if self.rubric not in self.rubrics:\n            raise DistilabelUserError(\n                f\"Provided rubric '{self.rubric}' is not among the available rubrics: {', '.join(self.rubrics.keys())}.\",\n                page=\"components-gallery/tasks/prometheuseval/\",\n            )\n\n        return self\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template for Prometheus 2.0 either absolute or relative evaluation\n        depending on the `mode` value, and either with or without reference, depending on the\n        value of `reference`.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"prometheus\"\n            / (\n                f\"{self.mode}_without_reference.jinja2\"\n                if self.reference is False\n                else f\"{self.mode}_with_reference.jinja2\"\n            )\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The default inputs for the task are the `instruction` and the `generation`\n        if `reference=False`, otherwise, the inputs are `instruction`, `generation`, and\n        `reference`.\"\"\"\n        if self.mode == \"absolute\":\n            if 
self.reference:\n                return [\"instruction\", \"generation\", \"reference\"]\n            return [\"instruction\", \"generation\"]\n        else:\n            if self.reference:\n                return [\"instruction\", \"generations\", \"reference\"]\n            return [\"instruction\", \"generations\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType` where the prompt is formatted according\n        to the selected Jinja2 template for Prometheus 2.0, assuming that's the first interaction\n        from the user, including a pre-defined system prompt.\"\"\"\n        template_kwargs = {\n            \"instruction\": input[\"instruction\"],\n            \"rubric\": self.rubrics[self.rubric],\n        }\n        if self.reference:\n            template_kwargs[\"reference\"] = input[\"reference\"]\n\n        if self.mode == \"absolute\":\n            if not isinstance(input[\"generation\"], str):\n                raise DistilabelUserError(\n                    f\"Provided `generation` is of type {type(input['generation'])} but a string\"\n                    \" should be provided instead.\",\n                    page=\"components-gallery/tasks/prometheuseval/\",\n                )\n\n            template_kwargs[\"generation\"] = input[\"generation\"]\n            system_message = (\n                \"You are a fair judge assistant tasked with providing clear, objective feedback based\"\n                \" on specific criteria, ensuring each assessment reflects the absolute standards set\"\n                \" for performance.\"\n            )\n        else:  # self.mode == \"relative\"\n            if (\n                not isinstance(input[\"generations\"], list)\n                or not all(\n                    isinstance(generation, str) for generation in input[\"generations\"]\n                )\n                or len(input[\"generations\"]) != 2\n            ):\n                
raise DistilabelUserError(\n                    f\"Provided `generations` is of type {type(input['generations'])} but a list of strings with length 2 should be provided instead.\",\n                    page=\"components-gallery/tasks/prometheuseval/\",\n                )\n\n            template_kwargs[\"generations\"] = input[\"generations\"]\n            system_message = (\n                \"You are a fair judge assistant assigned to deliver insightful feedback that compares\"\n                \" individual performances, highlighting how each stands relative to others within the\"\n                \" same cohort.\"\n            )\n\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": system_message,\n            },\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(**template_kwargs),  # type: ignore\n            },\n        ]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task are the `feedback` and the `result` generated by Prometheus,\n        as well as the `model_name` which is automatically included based on the `LLM` used.\n        \"\"\"\n        return [\"feedback\", \"result\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a dict with the keys `feedback` and `result` captured\n        using a regex from the Prometheus output.\n\n        Args:\n            output: the raw output of the LLM.\n            input: the input to the task. 
Optionally provided in case it's useful to build the output.\n\n        Returns:\n            A dict with the keys `feedback` and `result` generated by the LLM.\n        \"\"\"\n        if output is None:\n            return {\"feedback\": None, \"result\": None}\n\n        parts = output.split(\"[RESULT]\")\n        if len(parts) != 2:\n            return {\"feedback\": None, \"result\": None}\n\n        feedback, result = parts[0].strip(), parts[1].strip()\n        if feedback.startswith(\"Feedback:\"):\n            feedback = feedback[len(\"Feedback:\") :].strip()\n        if self.mode == \"absolute\":\n            if not result.isdigit() or result not in [\"1\", \"2\", \"3\", \"4\", \"5\"]:\n                return {\"feedback\": None, \"result\": None}\n            return {\"feedback\": feedback, \"result\": int(result)}\n        else:  # self.mode == \"relative\"\n            if result not in [\"A\", \"B\"]:\n                return {\"feedback\": None, \"result\": None}\n            return {\"feedback\": feedback, \"result\": result}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.inputs","title":"inputs: List[str] property","text":"

The default inputs for the task are the instruction and the generation (generations when mode is relative) if reference=False; otherwise, the inputs additionally include reference.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.outputs","title":"outputs: List[str] property","text":"

The outputs for the task are the feedback and the result generated by Prometheus, as well as the model_name, which is automatically included based on the LLM used.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.load","title":"load()","text":"

Loads the Jinja2 template for Prometheus 2.0 either absolute or relative evaluation depending on the mode value, and either with or without reference, depending on the value of reference.

Source code in src/distilabel/steps/tasks/prometheus_eval.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template for Prometheus 2.0 either absolute or relative evaluation\n    depending on the `mode` value, and either with or without reference, depending on the\n    value of `reference`.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"prometheus\"\n        / (\n            f\"{self.mode}_without_reference.jinja2\"\n            if self.reference is False\n            else f\"{self.mode}_with_reference.jinja2\"\n        )\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType where the prompt is formatted according to the selected Jinja2 template for Prometheus 2.0, assuming that's the first interaction from the user, including a pre-defined system prompt.

Source code in src/distilabel/steps/tasks/prometheus_eval.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType` where the prompt is formatted according\n    to the selected Jinja2 template for Prometheus 2.0, assuming that's the first interaction\n    from the user, including a pre-defined system prompt.\"\"\"\n    template_kwargs = {\n        \"instruction\": input[\"instruction\"],\n        \"rubric\": self.rubrics[self.rubric],\n    }\n    if self.reference:\n        template_kwargs[\"reference\"] = input[\"reference\"]\n\n    if self.mode == \"absolute\":\n        if not isinstance(input[\"generation\"], str):\n            raise DistilabelUserError(\n                f\"Provided `generation` is of type {type(input['generation'])} but a string\"\n                \" should be provided instead.\",\n                page=\"components-gallery/tasks/prometheuseval/\",\n            )\n\n        template_kwargs[\"generation\"] = input[\"generation\"]\n        system_message = (\n            \"You are a fair judge assistant tasked with providing clear, objective feedback based\"\n            \" on specific criteria, ensuring each assessment reflects the absolute standards set\"\n            \" for performance.\"\n        )\n    else:  # self.mode == \"relative\"\n        if (\n            not isinstance(input[\"generations\"], list)\n            or not all(\n                isinstance(generation, str) for generation in input[\"generations\"]\n            )\n            or len(input[\"generations\"]) != 2\n        ):\n            raise DistilabelUserError(\n                f\"Provided `generations` is of type {type(input['generations'])} but a list of strings with length 2 should be provided instead.\",\n                page=\"components-gallery/tasks/prometheuseval/\",\n            )\n\n        template_kwargs[\"generations\"] = input[\"generations\"]\n        system_message = (\n            \"You are a fair judge assistant assigned to deliver insightful feedback that 
compares\"\n            \" individual performances, highlighting how each stands relative to others within the\"\n            \" same cohort.\"\n        )\n\n    return [\n        {\n            \"role\": \"system\",\n            \"content\": system_message,\n        },\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(**template_kwargs),  # type: ignore\n        },\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.format_output","title":"format_output(output, input)","text":"

The output is formatted as a dict with the keys feedback and result, extracted by splitting the Prometheus output on the [RESULT] marker; both values are None when the output cannot be parsed.

Parameters:

Name Type Description Default output Union[str, None]

the raw output of the LLM.

required input Dict[str, Any]

the input to the task. Optionally provided in case it's useful to build the output.

required

Returns:

Type Description Dict[str, Any]

A dict with the keys feedback and result generated by the LLM.

Source code in src/distilabel/steps/tasks/prometheus_eval.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a dict with the keys `feedback` and `result` captured\n    using a regex from the Prometheus output.\n\n    Args:\n        output: the raw output of the LLM.\n        input: the input to the task. Optionally provided in case it's useful to build the output.\n\n    Returns:\n        A dict with the keys `feedback` and `result` generated by the LLM.\n    \"\"\"\n    if output is None:\n        return {\"feedback\": None, \"result\": None}\n\n    parts = output.split(\"[RESULT]\")\n    if len(parts) != 2:\n        return {\"feedback\": None, \"result\": None}\n\n    feedback, result = parts[0].strip(), parts[1].strip()\n    if feedback.startswith(\"Feedback:\"):\n        feedback = feedback[len(\"Feedback:\") :].strip()\n    if self.mode == \"absolute\":\n        if not result.isdigit() or result not in [\"1\", \"2\", \"3\", \"4\", \"5\"]:\n            return {\"feedback\": None, \"result\": None}\n        return {\"feedback\": feedback, \"result\": int(result)}\n    else:  # self.mode == \"relative\"\n        if result not in [\"A\", \"B\"]:\n            return {\"feedback\": None, \"result\": None}\n        return {\"feedback\": feedback, \"result\": result}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer","title":"QualityScorer","text":"

Bases: Task

Score responses based on their quality using an LLM.

QualityScorer is a pre-defined task that defines the instruction and responses as the inputs and scores as the output. This task is used to rate the quality of instructions and responses. It's an implementation of the quality score task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. The task follows the same scheme as the Complexity Scorer, but the instruction-response pairs are scored in terms of quality, obtaining a quality score for each response.

Attributes:

Name Type Description _template Union[Template, None]

a Jinja2 template used to format the input for the LLM.

Input columns
  • instruction (str): The instruction that was used to generate the responses.
  • responses (List[str]): The responses to be scored. Each response forms a pair with the instruction.
Output columns
  • scores (List[float]): The score for each response.
  • model_name (str): The model name used to generate the scores.
Categories
  • scorer
  • quality
  • response
References
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning

Examples:

Evaluate the quality of your instructions:

from distilabel.steps.tasks import QualityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = QualityScorer(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    )\n)\n\nscorer.load()\n\nresult = next(\n    scorer.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n            }\n        ]\n    )\n)\n# result\n[\n    {\n        'instructions': 'instruction',\n        'model_name': 'test',\n        'scores': [5, 3, 1],\n    }\n]\n

Generate structured output with default schema:

from distilabel.steps.tasks import QualityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\nscorer = QualityScorer(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    use_default_structured_output=True\n)\n\nscorer.load()\n\nresult = next(\n    scorer.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n            }\n        ]\n    )\n)\n\n# result\n[{'instruction': 'instruction',\n'responses': ['good response', 'weird response', 'bad response'],\n'scores': [1, 2, 3],\n'distilabel_metadata': {'raw_output_quality_scorer_0': '{  \"scores\": [1, 2, 3] }'},\n'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
Citations
@misc{liu2024makesgooddataalignment,\n    title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n    author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n    year={2024},\n    eprint={2312.15685},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2312.15685},\n}\n
Source code in src/distilabel/steps/tasks/quality_scorer.py
class QualityScorer(Task):\n    \"\"\"Score responses based on their quality using an `LLM`.\n\n    `QualityScorer` is a pre-defined task that defines the `instruction` as the input\n    and `score` as the output. This task is used to rate the quality of instructions and responses.\n    It's an implementation of the quality score task from the paper 'What Makes Good Data\n    for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.\n    The task follows the same scheme as the Complexity Scorer, but the instruction-response pairs\n    are scored in terms of quality, obtaining a quality score for each instruction.\n\n    Attributes:\n        _template: a Jinja2 template used to format the input for the LLM.\n\n    Input columns:\n        - instruction (`str`): The instruction that was used to generate the `responses`.\n        - responses (`List[str]`): The responses to be scored. Each response forms a pair with the instruction.\n\n    Output columns:\n        - scores (`List[float]`): The score for each instruction.\n        - model_name (`str`): The model name used to generate the scores.\n\n    Categories:\n        - scorer\n        - quality\n        - response\n\n    References:\n        - [`What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685)\n\n    Examples:\n        Evaluate the quality of your instructions:\n\n        ```python\n        from distilabel.steps.tasks import QualityScorer\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        scorer = QualityScorer(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            )\n        )\n\n        scorer.load()\n\n        result = next(\n            scorer.process(\n                [\n                    {\n                        \"instruction\": \"instruction\",\n                        \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n                    }\n                ]\n            )\n        )\n        # result\n        [\n            {\n                'instructions': 'instruction',\n                'model_name': 'test',\n                'scores': [5, 3, 1],\n            }\n        ]\n        ```\n\n        Generate structured output with default schema:\n\n        ```python\n        from distilabel.steps.tasks import QualityScorer\n        from distilabel.models import InferenceEndpointsLLM\n\n        scorer = QualityScorer(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            use_default_structured_output=True\n        )\n\n        scorer.load()\n\n        result = next(\n            scorer.process(\n                [\n                    {\n                        \"instruction\": \"instruction\",\n                        \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n                    }\n                ]\n            )\n        )\n\n        # result\n        [{'instruction': 'instruction',\n        'responses': ['good response', 'weird response', 'bad 
response'],\n        'scores': [1, 2, 3],\n        'distilabel_metadata': {'raw_output_quality_scorer_0': '{  \"scores\": [1, 2, 3] }'},\n        'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n\n    Citations:\n        ```\n        @misc{liu2024makesgooddataalignment,\n            title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n            author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n            year={2024},\n            eprint={2312.15685},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2312.15685},\n        }\n        ```\n    \"\"\"\n\n    _template: Union[Template, None] = PrivateAttr(...)\n    _can_be_used_with_offline_batch_generation = True\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"quality-scorer.jinja2\"\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The inputs for the task are `instruction` and `responses`.\"\"\"\n        return [\"instruction\", \"responses\"]\n\n    def format_input(self, input: Dict[str, Any]) -> ChatType:  # type: ignore\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    instruction=input[\"instruction\"], responses=input[\"responses\"]\n                ),\n            }\n        ]\n\n    @property\n    def outputs(self):\n        \"\"\"The output for the task is 
a list of `scores` containing the quality score for each\n        response in `responses`.\"\"\"\n        return [\"scores\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a list with the score of each instruction-response pair.\n\n        Args:\n            output: the raw output of the LLM.\n            input: the input to the task. Used for obtaining the number of responses.\n\n        Returns:\n            A dict with the key `scores` containing the scores for each instruction-response pair.\n        \"\"\"\n        if output is None:\n            return {\"scores\": [None] * len(input[\"responses\"])}\n\n        if self.use_default_structured_output:\n            return self._format_structured_output(output, input)\n\n        scores = []\n        score_lines = output.split(\"\\n\")\n\n        for i, line in enumerate(score_lines):\n            match = _PARSE_SCORE_LINE_REGEX.match(line)\n            score = float(match.group(1)) if match else None\n            scores.append(score)\n            if i == len(input[\"responses\"]) - 1:\n                break\n        return {\"scores\": scores}\n\n    @override\n    def get_structured_output(self) -> Dict[str, Any]:\n        \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n        a dictionary with the output which can be directly parsed as a python dictionary.\n\n        The schema corresponds to the following:\n\n        ```python\n        from pydantic import BaseModel\n        from typing import List\n\n        class SchemaQualityScorer(BaseModel):\n            scores: List[int]\n        ```\n\n        Returns:\n            JSON Schema of the response to enforce.\n        \"\"\"\n        return {\n            \"properties\": {\n                \"scores\": {\n                    \"items\": {\"type\": \"integer\"},\n                    \"title\": 
\"Scores\",\n                    \"type\": \"array\",\n                }\n            },\n            \"required\": [\"scores\"],\n            \"title\": \"SchemaQualityScorer\",\n            \"type\": \"object\",\n        }\n\n    def _format_structured_output(\n        self, output: str, input: Dict[str, Any]\n    ) -> Dict[str, str]:\n        \"\"\"Parses the structured response, which should correspond to a dictionary\n        with the scores, and a list with them.\n\n        Args:\n            output: The output from the `LLM`.\n\n        Returns:\n            Formatted output.\n        \"\"\"\n        try:\n            return orjson.loads(output)\n        except orjson.JSONDecodeError:\n            return {\"scores\": [None] * len(input[\"responses\"])}\n\n    @override\n    def _sample_input(self) -> ChatType:\n        return self.format_input(\n            {\n                \"instruction\": f\"<PLACEHOLDER_{'instruction'.upper()}>\",\n                \"responses\": [\n                    f\"<PLACEHOLDER_{f'RESPONSE_{i}'.upper()}>\" for i in range(2)\n                ],\n            }\n        )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.inputs","title":"inputs: List[str] property","text":"

The inputs for the task are instruction and responses.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.outputs","title":"outputs property","text":"

The output for the task is a list of scores containing the quality score for each response in responses.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.load","title":"load()","text":"

Loads the Jinja2 template.

Source code in src/distilabel/steps/tasks/quality_scorer.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"quality-scorer.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/quality_scorer.py
def format_input(self, input: Dict[str, Any]) -> ChatType:  # type: ignore\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    return [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(  # type: ignore\n                instruction=input[\"instruction\"], responses=input[\"responses\"]\n            ),\n        }\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.format_output","title":"format_output(output, input)","text":"

The output is formatted as a list with the score of each instruction-response pair.

Parameters:

Name Type Description Default output Union[str, None]

the raw output of the LLM.

required input Dict[str, Any]

the input to the task. Used for obtaining the number of responses.

required

Returns:

Type Description Dict[str, Any]

A dict with the key scores containing the scores for each instruction-response pair.

Source code in src/distilabel/steps/tasks/quality_scorer.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a list with the score of each instruction-response pair.\n\n    Args:\n        output: the raw output of the LLM.\n        input: the input to the task. Used for obtaining the number of responses.\n\n    Returns:\n        A dict with the key `scores` containing the scores for each instruction-response pair.\n    \"\"\"\n    if output is None:\n        return {\"scores\": [None] * len(input[\"responses\"])}\n\n    if self.use_default_structured_output:\n        return self._format_structured_output(output, input)\n\n    scores = []\n    score_lines = output.split(\"\\n\")\n\n    for i, line in enumerate(score_lines):\n        match = _PARSE_SCORE_LINE_REGEX.match(line)\n        score = float(match.group(1)) if match else None\n        scores.append(score)\n        if i == len(input[\"responses\"]) - 1:\n            break\n    return {\"scores\": scores}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.get_structured_output","title":"get_structured_output()","text":"

Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary.

The schema corresponds to the following:

from pydantic import BaseModel\nfrom typing import List\n\nclass SchemaQualityScorer(BaseModel):\n    scores: List[int]\n

Returns:

Type Description Dict[str, Any]

JSON Schema of the response to enforce.

Source code in src/distilabel/steps/tasks/quality_scorer.py
@override\ndef get_structured_output(self) -> Dict[str, Any]:\n    \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n    a dictionary with the output which can be directly parsed as a python dictionary.\n\n    The schema corresponds to the following:\n\n    ```python\n    from pydantic import BaseModel\n    from typing import List\n\n    class SchemaQualityScorer(BaseModel):\n        scores: List[int]\n    ```\n\n    Returns:\n        JSON Schema of the response to enforce.\n    \"\"\"\n    return {\n        \"properties\": {\n            \"scores\": {\n                \"items\": {\"type\": \"integer\"},\n                \"title\": \"Scores\",\n                \"type\": \"array\",\n            }\n        },\n        \"required\": [\"scores\"],\n        \"title\": \"SchemaQualityScorer\",\n        \"type\": \"object\",\n    }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer._format_structured_output","title":"_format_structured_output(output, input)","text":"

Parses the structured response, which should correspond to a dictionary whose scores key holds the list of scores.

Parameters:

Name Type Description Default output str

The output from the LLM.

required

Returns:

Type Description Dict[str, str]

Formatted output.

Source code in src/distilabel/steps/tasks/quality_scorer.py
def _format_structured_output(\n    self, output: str, input: Dict[str, Any]\n) -> Dict[str, str]:\n    \"\"\"Parses the structured response, which should correspond to a dictionary\n    with the scores, and a list with them.\n\n    Args:\n        output: The output from the `LLM`.\n\n    Returns:\n        Formatted output.\n    \"\"\"\n    try:\n        return orjson.loads(output)\n    except orjson.JSONDecodeError:\n        return {\"scores\": [None] * len(input[\"responses\"])}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct","title":"SelfInstruct","text":"

Bases: Task

Generate instructions based on a given input using an LLM.

SelfInstruct is a pre-defined task that, given a number of instructions, certain criteria for query generation, an application description, and an input, generates a number of instructions related to the given input, following what is stated in the criteria for query generation and the application description. It is based on the SelfInstruct framework from the paper \"Self-Instruct: Aligning Language Models with Self-Generated Instructions\".

Attributes:

Name Type Description num_instructions int

The number of instructions to be generated. Defaults to 5.

criteria_for_query_generation str

The criteria for the query generation. Defaults to the criteria defined within the paper.

application_description str

The description of the AI application that one wants to build with these instructions. Defaults to AI assistant.

Input columns
  • input (str): The input to generate the instructions. It's also called seed in the paper.
Output columns
  • instructions (List[str]): The generated instructions.
  • model_name (str): The model name used to generate the instructions.
Categories
  • text-generation
Reference
  • Self-Instruct: Aligning Language Models with Self-Generated Instructions

Examples:

Generate instructions based on a given input:

from distilabel.steps.tasks import SelfInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\nself_instruct = SelfInstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_instructions=5,  # This is the default value\n)\n\nself_instruct.load()\n\nresult = next(self_instruct.process([{\"input\": \"instruction\"}]))\n# result\n# [\n#     {\n#         'input': 'instruction',\n#         'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n#         'instructions': [\"instruction 1\", \"instruction 2\", \"instruction 3\", \"instruction 4\", \"instruction 5\"],\n#     }\n# ]\n
Citations
@misc{wang2023selfinstructaligninglanguagemodels,\n    title={Self-Instruct: Aligning Language Models with Self-Generated Instructions},\n    author={Yizhong Wang and Yeganeh Kordi and Swaroop Mishra and Alisa Liu and Noah A. Smith and Daniel Khashabi and Hannaneh Hajishirzi},\n    year={2023},\n    eprint={2212.10560},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2212.10560},\n}\n
Source code in src/distilabel/steps/tasks/self_instruct.py
class SelfInstruct(Task):\n    \"\"\"Generate instructions based on a given input using an `LLM`.\n\n    `SelfInstruct` is a pre-defined task that, given a number of instructions, a\n    certain criteria for query generations, an application description, and an input,\n    generates a number of instruction related to the given input and following what\n    is stated in the criteria for query generation and the application description.\n    It is based in the SelfInstruct framework from the paper \"Self-Instruct: Aligning\n    Language Models with Self-Generated Instructions\".\n\n    Attributes:\n        num_instructions: The number of instructions to be generated. Defaults to 5.\n        criteria_for_query_generation: The criteria for the query generation. Defaults\n            to the criteria defined within the paper.\n        application_description: The description of the AI application that one want\n            to build with these instructions. Defaults to `AI assistant`.\n\n    Input columns:\n        - input (`str`): The input to generate the instructions. 
It's also called seed in\n            the paper.\n\n    Output columns:\n        - instructions (`List[str]`): The generated instructions.\n        - model_name (`str`): The model name used to generate the instructions.\n\n    Categories:\n        - text-generation\n\n    Reference:\n        - [`Self-Instruct: Aligning Language Models with Self-Generated Instructions`](https://arxiv.org/abs/2212.10560)\n\n    Examples:\n        Generate instructions based on a given input:\n\n        ```python\n        from distilabel.steps.tasks import SelfInstruct\n        from distilabel.models import InferenceEndpointsLLM\n\n        self_instruct = SelfInstruct(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            num_instructions=5,  # This is the default value\n        )\n\n        self_instruct.load()\n\n        result = next(self_instruct.process([{\"input\": \"instruction\"}]))\n        # result\n        # [\n        #     {\n        #         'input': 'instruction',\n        #         'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n        #         'instructions': [\"instruction 1\", \"instruction 2\", \"instruction 3\", \"instruction 4\", \"instruction 5\"],\n        #     }\n        # ]\n        ```\n\n    Citations:\n        ```\n        @misc{wang2023selfinstructaligninglanguagemodels,\n            title={Self-Instruct: Aligning Language Models with Self-Generated Instructions},\n            author={Yizhong Wang and Yeganeh Kordi and Swaroop Mishra and Alisa Liu and Noah A. 
Smith and Daniel Khashabi and Hannaneh Hajishirzi},\n            year={2023},\n            eprint={2212.10560},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2212.10560},\n        }\n        ```\n    \"\"\"\n\n    num_instructions: int = 5\n    criteria_for_query_generation: str = (\n        \"Incorporate a diverse range of verbs, avoiding repetition.\\n\"\n        \"Ensure queries are compatible with AI model's text generation functions and are limited to 1-2 sentences.\\n\"\n        \"Design queries to be self-contained and standalone.\\n\"\n        'Blend interrogative (e.g., \"What is the significance of x?\") and imperative (e.g., \"Detail the process of x.\") styles.'\n    )\n    application_description: str = \"AI assistant\"\n\n    _template: Union[Template, None] = PrivateAttr(...)\n    _can_be_used_with_offline_batch_generation = True\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"self-instruct.jinja2\"\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task is the `input` i.e. 
seed text.\"\"\"\n        return [\"input\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(\n                    input=input[\"input\"],\n                    application_description=self.application_description,\n                    criteria_for_query_generation=self.criteria_for_query_generation,\n                    num_instructions=self.num_instructions,\n                ),\n            }\n        ]\n\n    @property\n    def outputs(self):\n        \"\"\"The output for the task is a list of `instructions` containing the generated instructions.\"\"\"\n        return [\"instructions\", \"model_name\"]\n\n    def format_output(\n        self,\n        output: Union[str, None],\n        input: Optional[Dict[str, Any]] = None,\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a list with the generated instructions.\n\n        Args:\n            output: the raw output of the LLM.\n            input: the input to the task. Used for obtaining the number of responses.\n\n        Returns:\n            A dict with containing the generated instructions.\n        \"\"\"\n        if output is None:\n            return {\"instructions\": []}\n        return {\"instructions\": [line for line in output.split(\"\\n\") if line != \"\"]}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.inputs","title":"inputs: List[str] property","text":"

The input for the task is the input i.e. seed text.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.outputs","title":"outputs property","text":"

The output for the task is a list of instructions containing the generated instructions.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.load","title":"load()","text":"

Loads the Jinja2 template.

Source code in src/distilabel/steps/tasks/self_instruct.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"self-instruct.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/self_instruct.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    return [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(\n                input=input[\"input\"],\n                application_description=self.application_description,\n                criteria_for_query_generation=self.criteria_for_query_generation,\n                num_instructions=self.num_instructions,\n            ),\n        }\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.format_output","title":"format_output(output, input=None)","text":"

The output is formatted as a list with the generated instructions.

Parameters:

Name Type Description Default output Union[str, None]

the raw output of the LLM.

required input Optional[Dict[str, Any]]

the input to the task. Not used by this task when formatting the output.

None

Returns:

Type Description Dict[str, Any]

A dict containing the generated instructions.

Source code in src/distilabel/steps/tasks/self_instruct.py
def format_output(\n    self,\n    output: Union[str, None],\n    input: Optional[Dict[str, Any]] = None,\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a list with the generated instructions.\n\n    Args:\n        output: the raw output of the LLM.\n        input: the input to the task. Used for obtaining the number of responses.\n\n    Returns:\n        A dict with containing the generated instructions.\n    \"\"\"\n    if output is None:\n        return {\"instructions\": []}\n    return {\"instructions\": [line for line in output.split(\"\\n\") if line != \"\"]}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair","title":"GenerateSentencePair","text":"

Bases: Task

Generate a positive and (optionally) a negative sentence given an anchor sentence.

GenerateSentencePair is a pre-defined task that, given an anchor sentence, generates a positive sentence related to the anchor and optionally a negative sentence unrelated to the anchor or similar to it. Optionally, you can give a context to guide the LLM towards more specific behavior. This task is useful to generate training datasets for training embedding models.

Attributes:

Name Type Description triplet bool

a flag to indicate if the task should generate a triplet of sentences (anchor, positive, negative). Defaults to False.

action GenerationAction

the action to perform to generate the positive sentence.

context str

the context to use for the generation. Can be helpful to guide the LLM towards more specific context. Not used by default.

hard_negative bool

A flag to indicate if the negative should be a hard-negative or not. Hard negatives make it hard for the model to distinguish against the positive, with a higher degree of semantic similarity.

Input columns
  • anchor (str): The anchor sentence to generate the positive and negative sentences.
Output columns
  • positive (str): The positive sentence related to the anchor.
  • negative (str): The negative sentence unrelated to the anchor if triplet=True, or more similar to the positive to make it more challenging for a model to distinguish in case hard_negative=True.
  • model_name (str): The name of the model that was used to generate the sentences.
Categories
  • embedding

Examples:

Paraphrasing:

from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"paraphrase\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n

Generating semantically similar sentences:

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import GenerateSentencePair\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"semantically-similar\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"How does 3D printing work?\"}])\n

Generating queries:

from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"query\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"Argilla is an open-source data curation platform for LLMs. Using Argilla, ...\"}])\n

Generating answers:

from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"answer\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n

Generating queries with context (applies to every action):

from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"query\",\n    context=\"Argilla is an open-source data curation platform for LLMs.\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n

Generating Hard-negatives (applies to every action):

from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"query\",\n    context=\"Argilla is an open-source data curation platform for LLMs.\",\n    hard_negative=True,\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n

Generating structured data with default schema (applies to every action):

from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"query\",\n    context=\"Argilla is an open-source data curation platform for LLMs.\",\n    hard_negative=True,\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n    use_default_structured_output=True\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n
Source code in src/distilabel/steps/tasks/sentence_transformers.py
class GenerateSentencePair(Task):\n    \"\"\"Generate a positive and negative (optionally) sentences given an anchor sentence.\n\n    `GenerateSentencePair` is a pre-defined task that given an anchor sentence generates\n    a positive sentence related to the anchor and optionally a negative sentence unrelated\n    to the anchor or similar to it. Optionally, you can give a context to guide the LLM\n    towards more specific behavior. This task is useful to generate training datasets for\n    training embeddings models.\n\n    Attributes:\n        triplet: a flag to indicate if the task should generate a triplet of sentences\n            (anchor, positive, negative). Defaults to `False`.\n        action: the action to perform to generate the positive sentence.\n        context: the context to use for the generation. Can be helpful to guide the LLM\n            towards more specific context. Not used by default.\n        hard_negative: A flag to indicate if the negative should be a hard-negative or not.\n            Hard negatives make it hard for the model to distinguish against the positive,\n            with a higher degree of semantic similarity.\n\n    Input columns:\n        - anchor (`str`): The anchor sentence to generate the positive and negative sentences.\n\n    Output columns:\n        - positive (`str`): The positive sentence related to the `anchor`.\n        - negative (`str`): The negative sentence unrelated to the `anchor` if `triplet=True`,\n            or more similar to the positive to make it more challenging for a model to distinguish\n            in case `hard_negative=True`.\n        - model_name (`str`): The name of the model that was used to generate the sentences.\n\n    Categories:\n        - embedding\n\n    Examples:\n        Paraphrasing:\n\n        ```python\n        from distilabel.steps.tasks import GenerateSentencePair\n        from distilabel.models import InferenceEndpointsLLM\n\n        generate_sentence_pair = 
GenerateSentencePair(\n            triplet=True, # `False` to generate only positive\n            action=\"paraphrase\",\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            input_batch_size=10,\n        )\n\n        generate_sentence_pair.load()\n\n        result = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n        ```\n\n        Generating semantically similar sentences:\n\n        ```python\n        from distilabel.models import InferenceEndpointsLLM\n        from distilabel.steps.tasks import GenerateSentencePair\n\n        generate_sentence_pair = GenerateSentencePair(\n            triplet=True, # `False` to generate only positive\n            action=\"semantically-similar\",\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            input_batch_size=10,\n        )\n\n        generate_sentence_pair.load()\n\n        result = generate_sentence_pair.process([{\"anchor\": \"How does 3D printing work?\"}])\n        ```\n\n        Generating queries:\n\n        ```python\n        from distilabel.steps.tasks import GenerateSentencePair\n        from distilabel.models import InferenceEndpointsLLM\n\n        generate_sentence_pair = GenerateSentencePair(\n            triplet=True, # `False` to generate only positive\n            action=\"query\",\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            input_batch_size=10,\n        )\n\n        generate_sentence_pair.load()\n\n        result = 
generate_sentence_pair.process([{\"anchor\": \"Argilla is an open-source data curation platform for LLMs. Using Argilla, ...\"}])\n        ```\n\n        Generating answers:\n\n        ```python\n        from distilabel.steps.tasks import GenerateSentencePair\n        from distilabel.models import InferenceEndpointsLLM\n\n        generate_sentence_pair = GenerateSentencePair(\n            triplet=True, # `False` to generate only positive\n            action=\"answer\",\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            input_batch_size=10,\n        )\n\n        generate_sentence_pair.load()\n\n        result = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n        ```\n\n        Generating queries with context (**applies to every action**):\n\n        ```python\n        from distilabel.steps.tasks import GenerateSentencePair\n        from distilabel.models import InferenceEndpointsLLM\n\n        generate_sentence_pair = GenerateSentencePair(\n            triplet=True, # `False` to generate only positive\n            action=\"query\",\n            context=\"Argilla is an open-source data curation platform for LLMs.\",\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            input_batch_size=10,\n        )\n\n        generate_sentence_pair.load()\n\n        result = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n        ```\n\n        Generating Hard-negatives (**applies to every action**):\n\n        ```python\n        from distilabel.steps.tasks import GenerateSentencePair\n        from distilabel.models import 
InferenceEndpointsLLM\n\n        generate_sentence_pair = GenerateSentencePair(\n            triplet=True, # `False` to generate only positive\n            action=\"query\",\n            context=\"Argilla is an open-source data curation platform for LLMs.\",\n            hard_negative=True,\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            input_batch_size=10,\n        )\n\n        generate_sentence_pair.load()\n\n        result = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n        ```\n\n        Generating structured data with default schema (**applies to every action**):\n\n        ```python\n        from distilabel.steps.tasks import GenerateSentencePair\n        from distilabel.models import InferenceEndpointsLLM\n\n        generate_sentence_pair = GenerateSentencePair(\n            triplet=True, # `False` to generate only positive\n            action=\"query\",\n            context=\"Argilla is an open-source data curation platform for LLMs.\",\n            hard_negative=True,\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            input_batch_size=10,\n            use_default_structured_output=True\n        )\n\n        generate_sentence_pair.load()\n\n        result = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n        ```\n    \"\"\"\n\n    triplet: bool = False\n    action: GenerationAction\n    hard_negative: bool = False\n    context: str = \"\"\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / 
\"generate-sentence-pair.jinja2\"\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The inputs for the task is the `anchor` sentence.\"\"\"\n        return [\"anchor\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The inputs are formatted as a `ChatType`, with a system prompt describing the\n        task of generating a positive and negative sentences for the anchor sentence. The\n        anchor is provided as the first user interaction in the conversation.\n\n        Args:\n            input: The input containing the `anchor` sentence.\n\n        Returns:\n            A list of dictionaries containing the system and user interactions.\n        \"\"\"\n        action_sentence = GENERATION_ACTION_SENTENCES[self.action]\n\n        format_system_prompt = {\n            \"action_sentence\": action_sentence,\n            \"context\": CONTEXT_INTRO if self.context else \"\",\n        }\n        if self.triplet:\n            format_system_prompt[\"negative_style\"] = NEGATIVE_STYLE[\n                \"hard-negative\" if self.hard_negative else \"negative\"\n            ]\n\n        system_prompt = (\n            POSITIVE_NEGATIVE_SYSTEM_PROMPT if self.triplet else POSITIVE_SYSTEM_PROMPT\n        ).format(**format_system_prompt)\n\n        return [\n            {\"role\": \"system\", \"content\": system_prompt},\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(\n                    anchor=input[\"anchor\"],\n                    context=self.context if self.context else None,\n                ),\n            },\n        ]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The outputs for the task are the `positive` and `negative` sentences, as well\n        as the `model_name` used to generate the sentences.\"\"\"\n        columns = [\"positive\", \"negative\"] if self.triplet 
else [\"positive\"]\n        columns += [\"model_name\"]\n        return columns\n\n    def format_output(\n        self, output: Union[str, None], input: Optional[Dict[str, Any]] = None\n    ) -> Dict[str, Any]:\n        \"\"\"Formats the output of the LLM, to extract the `positive` and `negative` sentences\n        generated. If the output is `None` or the regex doesn't match, then the outputs\n        will be set to `None` as well.\n\n        Args:\n            output: The output of the LLM.\n            input: The input used to generate the output.\n\n        Returns:\n            The formatted output containing the `positive` and `negative` sentences.\n        \"\"\"\n        if output is None:\n            return {\"positive\": None, \"negative\": None}\n\n        if self.use_default_structured_output:\n            return self._format_structured_output(output)\n\n        match = POSITIVE_NEGATIVE_PAIR_REGEX.match(output)\n        if match is None:\n            formatted_output = {\"positive\": None}\n            if self.triplet:\n                formatted_output[\"negative\"] = None\n            return formatted_output\n\n        groups = match.groups()\n        if self.triplet:\n            return {\n                \"positive\": groups[0].strip(),\n                \"negative\": (\n                    groups[1].strip()\n                    if len(groups) > 1 and groups[1] is not None\n                    else None\n                ),\n            }\n\n        return {\"positive\": groups[0].strip()}\n\n    @override\n    def get_structured_output(self) -> Dict[str, Any]:\n        \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n        a dictionary with the output which can be directly parsed as a python dictionary.\n\n        Returns:\n            JSON Schema of the response to enforce.\n        \"\"\"\n        if self.triplet:\n            return {\n                \"properties\": {\n                    \"positive\": 
{\"title\": \"Positive\", \"type\": \"string\"},\n                    \"negative\": {\"title\": \"Negative\", \"type\": \"string\"},\n                },\n                \"required\": [\"positive\", \"negative\"],\n                \"title\": \"Schema\",\n                \"type\": \"object\",\n            }\n        return {\n            \"properties\": {\"positive\": {\"title\": \"Positive\", \"type\": \"string\"}},\n            \"required\": [\"positive\"],\n            \"title\": \"Schema\",\n            \"type\": \"object\",\n        }\n\n    def _format_structured_output(self, output: str) -> Dict[str, str]:\n        \"\"\"Parses the structured response, which should correspond to a dictionary\n        with either `positive`, or `positive` and `negative` keys.\n\n        Args:\n            output: The output from the `LLM`.\n\n        Returns:\n            Formatted output.\n        \"\"\"\n        try:\n            return orjson.loads(output)\n        except orjson.JSONDecodeError:\n            if self.triplet:\n                return {\"positive\": None, \"negative\": None}\n            return {\"positive\": None}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.inputs","title":"inputs: List[str] property","text":"

The input for the task is the anchor sentence.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.outputs","title":"outputs: List[str] property","text":"

The outputs for the task are the positive sentence, the negative sentence (only when triplet=True), and the model_name used to generate the sentences.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.load","title":"load()","text":"

Loads the Jinja2 template.

Source code in src/distilabel/steps/tasks/sentence_transformers.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"generate-sentence-pair.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.format_input","title":"format_input(input)","text":"

The inputs are formatted as a ChatType, with a system prompt describing the task of generating a positive and (optionally) a negative sentence for the anchor sentence. The anchor is provided as the first user interaction in the conversation.

Parameters:

Name Type Description Default input Dict[str, Any]

The input containing the anchor sentence.

required

Returns:

Type Description ChatType

A list of dictionaries containing the system and user interactions.

Source code in src/distilabel/steps/tasks/sentence_transformers.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The inputs are formatted as a `ChatType`, with a system prompt describing the\n    task of generating a positive and negative sentences for the anchor sentence. The\n    anchor is provided as the first user interaction in the conversation.\n\n    Args:\n        input: The input containing the `anchor` sentence.\n\n    Returns:\n        A list of dictionaries containing the system and user interactions.\n    \"\"\"\n    action_sentence = GENERATION_ACTION_SENTENCES[self.action]\n\n    format_system_prompt = {\n        \"action_sentence\": action_sentence,\n        \"context\": CONTEXT_INTRO if self.context else \"\",\n    }\n    if self.triplet:\n        format_system_prompt[\"negative_style\"] = NEGATIVE_STYLE[\n            \"hard-negative\" if self.hard_negative else \"negative\"\n        ]\n\n    system_prompt = (\n        POSITIVE_NEGATIVE_SYSTEM_PROMPT if self.triplet else POSITIVE_SYSTEM_PROMPT\n    ).format(**format_system_prompt)\n\n    return [\n        {\"role\": \"system\", \"content\": system_prompt},\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(\n                anchor=input[\"anchor\"],\n                context=self.context if self.context else None,\n            ),\n        },\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.format_output","title":"format_output(output, input=None)","text":"

Formats the output of the LLM, to extract the positive and negative sentences generated. If the output is None or the regex doesn't match, then the outputs will be set to None as well.

Parameters:

Name Type Description Default output Union[str, None]

The output of the LLM.

required input Optional[Dict[str, Any]]

The input used to generate the output.

None

Returns:

Type Description Dict[str, Any]

The formatted output containing the positive and negative sentences.

Source code in src/distilabel/steps/tasks/sentence_transformers.py
def format_output(\n    self, output: Union[str, None], input: Optional[Dict[str, Any]] = None\n) -> Dict[str, Any]:\n    \"\"\"Formats the output of the LLM, to extract the `positive` and `negative` sentences\n    generated. If the output is `None` or the regex doesn't match, then the outputs\n    will be set to `None` as well.\n\n    Args:\n        output: The output of the LLM.\n        input: The input used to generate the output.\n\n    Returns:\n        The formatted output containing the `positive` and `negative` sentences.\n    \"\"\"\n    if output is None:\n        return {\"positive\": None, \"negative\": None}\n\n    if self.use_default_structured_output:\n        return self._format_structured_output(output)\n\n    match = POSITIVE_NEGATIVE_PAIR_REGEX.match(output)\n    if match is None:\n        formatted_output = {\"positive\": None}\n        if self.triplet:\n            formatted_output[\"negative\"] = None\n        return formatted_output\n\n    groups = match.groups()\n    if self.triplet:\n        return {\n            \"positive\": groups[0].strip(),\n            \"negative\": (\n                groups[1].strip()\n                if len(groups) > 1 and groups[1] is not None\n                else None\n            ),\n        }\n\n    return {\"positive\": groups[0].strip()}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.get_structured_output","title":"get_structured_output()","text":"

Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary.

Returns:

Type Description Dict[str, Any]

JSON Schema of the response to enforce.

Source code in src/distilabel/steps/tasks/sentence_transformers.py
@override\ndef get_structured_output(self) -> Dict[str, Any]:\n    \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n    a dictionary with the output which can be directly parsed as a python dictionary.\n\n    Returns:\n        JSON Schema of the response to enforce.\n    \"\"\"\n    if self.triplet:\n        return {\n            \"properties\": {\n                \"positive\": {\"title\": \"Positive\", \"type\": \"string\"},\n                \"negative\": {\"title\": \"Negative\", \"type\": \"string\"},\n            },\n            \"required\": [\"positive\", \"negative\"],\n            \"title\": \"Schema\",\n            \"type\": \"object\",\n        }\n    return {\n        \"properties\": {\"positive\": {\"title\": \"Positive\", \"type\": \"string\"}},\n        \"required\": [\"positive\"],\n        \"title\": \"Schema\",\n        \"type\": \"object\",\n    }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair._format_structured_output","title":"_format_structured_output(output)","text":"

Parses the structured response, which should correspond to a dictionary with either positive, or positive and negative keys.

Parameters:

Name Type Description Default output str

The output from the LLM.

required

Returns:

Type Description Dict[str, str]

Formatted output.

Source code in src/distilabel/steps/tasks/sentence_transformers.py
def _format_structured_output(self, output: str) -> Dict[str, str]:\n    \"\"\"Parses the structured response, which should correspond to a dictionary\n    with either `positive`, or `positive` and `negative` keys.\n\n    Args:\n        output: The output from the `LLM`.\n\n    Returns:\n        Formatted output.\n    \"\"\"\n    try:\n        return orjson.loads(output)\n    except orjson.JSONDecodeError:\n        if self.triplet:\n            return {\"positive\": None, \"negative\": None}\n        return {\"positive\": None}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration","title":"StructuredGeneration","text":"

Bases: Task

Generate structured content for a given instruction using an LLM.

StructuredGeneration is a pre-defined task that defines the instruction and the structured_output as the inputs, and generation as the output. This task is used to generate structured content based on the input instruction and following the schema provided within the structured_output column for each instruction. The model_name is also returned as part of the output in order to enhance it.

Attributes:

Name Type Description use_system_prompt bool

Whether to use the system prompt in the generation. Defaults to True, which means that if the column system_prompt is defined within the input batch, then the system_prompt will be used, otherwise, it will be ignored.

Input columns
  • instruction (str): The instruction to generate structured content from.
  • structured_output (Dict[str, Any]): The structured_output to generate structured content from. It should be a Python dictionary with the keys format and schema, where format should be one of json or regex, and the schema should be either the JSON schema or the regex pattern, respectively.
Output columns
  • generation (str): The generated text matching the provided schema, if possible.
  • model_name (str): The name of the model used to generate the text.
Categories
  • outlines
  • structured-generation

Examples:

Generate structured output from a JSON schema:

from distilabel.steps.tasks import StructuredGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nstructured_gen = StructuredGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    ),\n)\n\nstructured_gen.load()\n\nresult = next(\n    structured_gen.process(\n        [\n            {\n                \"instruction\": \"Create an RPG character\",\n                \"structured_output\": {\n                    \"format\": \"json\",\n                    \"schema\": {\n                        \"properties\": {\n                            \"name\": {\n                                \"title\": \"Name\",\n                                \"type\": \"string\"\n                            },\n                            \"description\": {\n                                \"title\": \"Description\",\n                                \"type\": \"string\"\n                            },\n                            \"role\": {\n                                \"title\": \"Role\",\n                                \"type\": \"string\"\n                            },\n                            \"weapon\": {\n                                \"title\": \"Weapon\",\n                                \"type\": \"string\"\n                            }\n                        },\n                        \"required\": [\n                            \"name\",\n                            \"description\",\n                            \"role\",\n                            \"weapon\"\n                        ],\n                        \"title\": \"Character\",\n                        \"type\": \"object\"\n                    }\n                },\n            }\n        ]\n    )\n)\n

Generate structured output from a regex pattern (only works with LLMs that support regex, i.e. the providers using outlines):

from distilabel.steps.tasks import StructuredGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nstructured_gen = StructuredGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    ),\n)\n\nstructured_gen.load()\n\nresult = next(\n    structured_gen.process(\n        [\n            {\n                \"instruction\": \"What's the weather like today in Seattle in Celsius degrees?\",\n                \"structured_output\": {\n                    \"format\": \"regex\",\n                    \"schema\": r\"(\\d{1,2})\u00b0C\"\n                },\n\n            }\n        ]\n    )\n)\n
Source code in src/distilabel/steps/tasks/structured_generation.py
class StructuredGeneration(Task):\n    \"\"\"Generate structured content for a given `instruction` using an `LLM`.\n\n    `StructuredGeneration` is a pre-defined task that defines the `instruction` and the `structured_output`\n    as the inputs, and `generation` as the output. This task is used to generate structured content based on\n    the input instruction and following the schema provided within the `structured_output` column per each\n    `instruction`. The `model_name` also returned as part of the output in order to enhance it.\n\n    Attributes:\n        use_system_prompt: Whether to use the system prompt in the generation. Defaults to `True`,\n            which means that if the column `system_prompt` is  defined within the input batch, then\n            the `system_prompt` will be used, otherwise, it will be ignored.\n\n    Input columns:\n        - instruction (`str`): The instruction to generate structured content from.\n        - structured_output (`Dict[str, Any]`): The structured_output to generate structured content from. 
It should be a\n            Python dictionary with the keys `format` and `schema`, where `format` should be one of `json` or\n            `regex`, and the `schema` should be either the JSON schema or the regex pattern, respectively.\n\n    Output columns:\n        - generation (`str`): The generated text matching the provided schema, if possible.\n        - model_name (`str`): The name of the model used to generate the text.\n\n    Categories:\n        - outlines\n        - structured-generation\n\n    Examples:\n        Generate structured output from a JSON schema:\n\n        ```python\n        from distilabel.steps.tasks import StructuredGeneration\n        from distilabel.models import InferenceEndpointsLLM\n\n        structured_gen = StructuredGeneration(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n            ),\n        )\n\n        structured_gen.load()\n\n        result = next(\n            structured_gen.process(\n                [\n                    {\n                        \"instruction\": \"Create an RPG character\",\n                        \"structured_output\": {\n                            \"format\": \"json\",\n                            \"schema\": {\n                                \"properties\": {\n                                    \"name\": {\n                                        \"title\": \"Name\",\n                                        \"type\": \"string\"\n                                    },\n                                    \"description\": {\n                                        \"title\": \"Description\",\n                                        \"type\": \"string\"\n                                    },\n                                    \"role\": {\n                                        \"title\": \"Role\",\n                                        \"type\": \"string\"\n    
                                },\n                                    \"weapon\": {\n                                        \"title\": \"Weapon\",\n                                        \"type\": \"string\"\n                                    }\n                                },\n                                \"required\": [\n                                    \"name\",\n                                    \"description\",\n                                    \"role\",\n                                    \"weapon\"\n                                ],\n                                \"title\": \"Character\",\n                                \"type\": \"object\"\n                            }\n                        },\n                    }\n                ]\n            )\n        )\n        ```\n\n        Generate structured output from a regex pattern (only works with LLMs that support regex, the providers using outlines):\n\n        ```python\n        from distilabel.steps.tasks import StructuredGeneration\n        from distilabel.models import InferenceEndpointsLLM\n\n        structured_gen = StructuredGeneration(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n            ),\n        )\n\n        structured_gen.load()\n\n        result = next(\n            structured_gen.process(\n                [\n                    {\n                        \"instruction\": \"What's the weather like today in Seattle in Celsius degrees?\",\n                        \"structured_output\": {\n                            \"format\": \"regex\",\n                            \"schema\": r\"(\\\\d{1,2})\u00b0C\"\n                        },\n\n                    }\n                ]\n            )\n        )\n        ```\n    \"\"\"\n\n    use_system_prompt: bool = False\n\n    @property\n    def inputs(self) -> List[str]:\n        
\"\"\"The input for the task are the `instruction` and the `structured_output`.\n        Optionally, if the `use_system_prompt` flag is set to True, then the\n        `system_prompt` will be used too.\"\"\"\n        columns = [\"instruction\", \"structured_output\"]\n        if self.use_system_prompt:\n            columns = [\"system_prompt\"] + columns\n        return columns\n\n    def format_input(self, input: Dict[str, Any]) -> StructuredInput:\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        if not isinstance(input[\"instruction\"], str):\n            raise DistilabelUserError(\n                f\"Input `instruction` must be a string. Got: {input['instruction']}.\",\n                page=\"components-gallery/tasks/structuredgeneration/\",\n            )\n\n        messages = [{\"role\": \"user\", \"content\": input[\"instruction\"]}]\n        if self.use_system_prompt:\n            if \"system_prompt\" in input:\n                messages.insert(\n                    0, {\"role\": \"system\", \"content\": input[\"system_prompt\"]}\n                )\n            else:\n                warnings.warn(\n                    \"`use_system_prompt` is set to `True`, but no `system_prompt` in input batch, so it will be ignored.\",\n                    UserWarning,\n                    stacklevel=2,\n                )\n\n        return (messages, input.get(\"structured_output\", None))  # type: ignore\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n        return [\"generation\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a dictionary with the `generation`. 
The `model_name`\n        will be automatically included within the `process` method of `Task`. Note that even\n        if the `structured_output` is defined to produce a JSON schema, this method will return the raw\n        output i.e. a string without any parsing.\"\"\"\n        return {\"generation\": output}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration.inputs","title":"inputs: List[str] property","text":"

The inputs for the task are the instruction and the structured_output. Optionally, if the use_system_prompt flag is set to True, the system_prompt will be used too.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration.outputs","title":"outputs: List[str] property","text":"

The outputs for the task are the generation and the model_name.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/structured_generation.py
def format_input(self, input: Dict[str, Any]) -> StructuredInput:\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    if not isinstance(input[\"instruction\"], str):\n        raise DistilabelUserError(\n            f\"Input `instruction` must be a string. Got: {input['instruction']}.\",\n            page=\"components-gallery/tasks/structuredgeneration/\",\n        )\n\n    messages = [{\"role\": \"user\", \"content\": input[\"instruction\"]}]\n    if self.use_system_prompt:\n        if \"system_prompt\" in input:\n            messages.insert(\n                0, {\"role\": \"system\", \"content\": input[\"system_prompt\"]}\n            )\n        else:\n            warnings.warn(\n                \"`use_system_prompt` is set to `True`, but no `system_prompt` in input batch, so it will be ignored.\",\n                UserWarning,\n                stacklevel=2,\n            )\n\n    return (messages, input.get(\"structured_output\", None))  # type: ignore\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration.format_output","title":"format_output(output, input)","text":"

The output is formatted as a dictionary with the generation. The model_name will be automatically included within the process method of Task. Note that even if the structured_output is defined to produce a JSON schema, this method will return the raw output i.e. a string without any parsing.

Source code in src/distilabel/steps/tasks/structured_generation.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n    will be automatically included within the `process` method of `Task`. Note that even\n    if the `structured_output` is defined to produce a JSON schema, this method will return the raw\n    output i.e. a string without any parsing.\"\"\"\n    return {\"generation\": output}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification","title":"TextClassification","text":"

Bases: Task

Classifies text into one or more categories or labels.

This task can be used for text classification problems, where the goal is to assign one or multiple labels to a given text. By default, it uses structured generation as per the reference paper, which can help to generate more concise labels. See section 4.1 in the reference.

Input columns
  • text (str): The reference text we want to obtain labels for.
Output columns
  • labels (Union[str, List[str]]): The label or list of labels for the text.
  • model_name (str): The name of the model used to generate the label/s.
Categories
  • text-classification
References
  • Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models

Attributes:

Name Type Description system_prompt Optional[str]

A prompt to display to the user before the task starts. Contains a default message to make the model behave like a classifier specialist.

n PositiveInt

Number of labels to generate. If only 1 is required, it corresponds to a single-label classification problem; if >1, it will attempt to return the \"n\" labels most representative of the text. Defaults to 1.

context Optional[str]

Context to use when generating the labels. By default contains a generic message, but can be used to customize the context for the task.

examples Optional[List[str]]

List of few-shot examples to help the model understand the task.

available_labels Optional[Union[List[str], Dict[str, str]]]

List of available labels to choose from when classifying the text, or a dictionary with the labels and their descriptions.

default_label Optional[Union[str, List[str]]]

Default label to use when the text is ambiguous or lacks sufficient information for classification. Can be a list in case of multiple labels (n>1).

Examples:

Assigning a sentiment to a text:

from distilabel.steps.tasks import TextClassification\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n)\n\ntext_classification = TextClassification(\n    llm=llm,\n    context=\"You are an AI system specialized in assigning sentiment to movies.\",\n    available_labels=[\"positive\", \"negative\"],\n)\n\ntext_classification.load()\n\nresult = next(\n    text_classification.process(\n        [{\"text\": \"This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\"}]\n    )\n)\n# result\n# [{'text': 'This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.',\n# 'labels': 'positive',\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n    \"labels\": \"positive\"\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n#     'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n#     {'role': 'user',\n#     'content': '# Instruction\\nPlease classify the user query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nYou are an AI system specialized in assigning sentiment to movie the user queries.\\n## Labeling the user input\\nUse the available labels to classify the user query. 
Analyze the context of each label specifically:\\navailable_labels = [\\n    \"positive\",  # The text shows positive sentiment\\n    \"negative\",  # The text shows negative sentiment\\n]\\n\\n\\n## User Query\\n```\\nThis was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n    \"labels\": \"label\"\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n

Assigning predefined labels with specified descriptions:

from distilabel.steps.tasks import TextClassification\n\ntext_classification = TextClassification(\n    llm=llm,\n    n=1,\n    context=\"Determine the intent of the text.\",\n    available_labels={\n        \"complaint\": \"A statement expressing dissatisfaction or annoyance about a product, service, or experience. It's a negative expression of discontent, often with the intention of seeking a resolution or compensation.\",\n        \"inquiry\": \"A question or request for information about a product, service, or situation. It's a neutral or curious expression seeking clarification or details.\",\n        \"feedback\": \"A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\",\n        \"praise\": \"A statement expressing admiration, approval, or appreciation for a product, service, or experience. It's a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\"\n    },\n    query_title=\"Customer Query\",\n)\n\ntext_classification.load()\n\nresult = next(\n    text_classification.process(\n        [{\"text\": \"Can you tell me more about your return policy?\"}]\n    )\n)\n# result\n# [{'text': 'Can you tell me more about your return policy?',\n# 'labels': 'inquiry',\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n    \"labels\": \"inquiry\"\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n#     'content': 'You are an AI system specialized in generating labels to classify pieces of text. 
Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n#     {'role': 'user',\n#     'content': '# Instruction\\nPlease classify the customer query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nDetermine the intent of the text.\\n## Labeling the user input\\nUse the available labels to classify the user query. Analyze the context of each label specifically:\\navailable_labels = [\\n    \"complaint\",  # A statement expressing dissatisfaction or annoyance about a product, service, or experience. It\\'s a negative expression of discontent, often with the intention of seeking a resolution or compensation.\\n    \"inquiry\",  # A question or request for information about a product, service, or situation. It\\'s a neutral or curious expression seeking clarification or details.\\n    \"feedback\",  # A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\\n    \"praise\",  # A statement expressing admiration, approval, or appreciation for a product, service, or experience. It\\'s a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\\n]\\n\\n\\n## Customer Query\\n```\\nCan you tell me more about your return policy?\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n    \"labels\": \"label\"\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n

Free multi label classification without predefined labels:

from distilabel.steps.tasks import TextClassification\n\ntext_classification = TextClassification(\n    llm=llm,\n    n=3,\n    context=(\n        \"Describe the main themes, topics, or categories that could describe the \"\n        \"following type of persona.\"\n    ),\n    query_title=\"Example of Persona\",\n)\n\ntext_classification.load()\n\nresult = next(\n    text_classification.process(\n        [{\"text\": \"A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\"}]\n    )\n)\n# result\n# [{'text': 'A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.',\n# 'labels': ['Historical Researcher',\n# 'Cultural Specialist',\n# 'Ethnic Studies Expert'],\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n    \"labels\": [\"Historical Researcher\", \"Cultural Specialist\", \"Ethnic Studies Expert\"]\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n#     'content': 'You are an AI system specialized in generating labels to classify pieces of text. 
Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n#     {'role': 'user',\n#     'content': '# Instruction\\nPlease classify the example of persona by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide a list of 3 labels that best describe the text.\\nDescribe the main themes, topics, or categories that could describe the following type of persona.\\nUse clear, widely understood terms for labels.Avoid overly specific or obscure labels unless the text demands it.\\n\\n\\n## Example of Persona\\n```\\nA historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n    \"labels\": [\"label_0\", \"label_1\", \"label_2\"]\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
Source code in src/distilabel/steps/tasks/text_classification.py
class TextClassification(Task):\n    r\"\"\"Classifies text into one or more categories or labels.\n\n    This task can be used for text classification problems, where the goal is to assign\n    one or multiple labels to a given text.\n    It uses structured generation as per the reference paper by default,\n    it can help to generate more concise labels. See section 4.1 in the reference.\n\n    Input columns:\n        - text (`str`): The reference text we want to obtain labels for.\n\n    Output columns:\n        - labels (`Union[str, List[str]]`): The label or list of labels for the text.\n        - model_name (`str`): The name of the model used to generate the label/s.\n\n    Categories:\n        - text-classification\n\n    References:\n        - [`Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models`](https://arxiv.org/abs/2408.02442)\n\n    Attributes:\n        system_prompt: A prompt to display to the user before the task starts. Contains a default\n            message to make the model behave like a classifier specialist.\n        n: Number of labels to generate If only 1 is required, corresponds to a label\n            classification problem, if >1 it will intend return the \"n\" labels most representative\n            for the text. Defaults to 1.\n        context: Context to use when generating the labels. By default contains a generic message,\n            but can be used to customize the context for the task.\n        examples: List of examples to help the model understand the task, few shots.\n        available_labels: List of available labels to choose from when classifying the text, or\n            a dictionary with the labels and their descriptions.\n        default_label: Default label to use when the text is ambiguous or lacks sufficient information for\n            classification. 
Can be a list in case of multiple labels (n>1).\n\n    Examples:\n        Assigning a sentiment to a text:\n\n        ```python\n        from distilabel.steps.tasks import TextClassification\n        from distilabel.models import InferenceEndpointsLLM\n\n        llm = InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        )\n\n        text_classification = TextClassification(\n            llm=llm,\n            context=\"You are an AI system specialized in assigning sentiment to movies.\",\n            available_labels=[\"positive\", \"negative\"],\n        )\n\n        text_classification.load()\n\n        result = next(\n            text_classification.process(\n                [{\"text\": \"This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\"}]\n            )\n        )\n        # result\n        # [{'text': 'This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.',\n        # 'labels': 'positive',\n        # 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n    \"labels\": \"positive\"\\n}',\n        # 'raw_input_text_classification_0': [{'role': 'system',\n        #     'content': 'You are an AI system specialized in generating labels to classify pieces of text. 
Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n        #     {'role': 'user',\n        #     'content': '# Instruction\\nPlease classify the user query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nYou are an AI system specialized in assigning sentiment to movie the user queries.\\n## Labeling the user input\\nUse the available labels to classify the user query. Analyze the context of each label specifically:\\navailable_labels = [\\n    \"positive\",  # The text shows positive sentiment\\n    \"negative\",  # The text shows negative sentiment\\n]\\n\\n\\n## User Query\\n```\\nThis was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n    \"labels\": \"label\"\\n}\\n```'}]},\n        # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n\n        Assigning predefined labels with specified descriptions:\n\n        ```python\n        from distilabel.steps.tasks import TextClassification\n\n        text_classification = TextClassification(\n            llm=llm,\n            n=1,\n            context=\"Determine the intent of the text.\",\n            available_labels={\n                \"complaint\": \"A statement expressing dissatisfaction or annoyance about a product, service, or experience. It's a negative expression of discontent, often with the intention of seeking a resolution or compensation.\",\n                \"inquiry\": \"A question or request for information about a product, service, or situation. 
It's a neutral or curious expression seeking clarification or details.\",\n                \"feedback\": \"A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\",\n                \"praise\": \"A statement expressing admiration, approval, or appreciation for a product, service, or experience. It's a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\"\n            },\n            query_title=\"Customer Query\",\n        )\n\n        text_classification.load()\n\n        result = next(\n            text_classification.process(\n                [{\"text\": \"Can you tell me more about your return policy?\"}]\n            )\n        )\n        # result\n        # [{'text': 'Can you tell me more about your return policy?',\n        # 'labels': 'inquiry',\n        # 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n    \"labels\": \"inquiry\"\\n}',\n        # 'raw_input_text_classification_0': [{'role': 'system',\n        #     'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n        #     {'role': 'user',\n        #     'content': '# Instruction\\nPlease classify the customer query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nDetermine the intent of the text.\\n## Labeling the user input\\nUse the available labels to classify the user query. 
Analyze the context of each label specifically:\\navailable_labels = [\\n    \"complaint\",  # A statement expressing dissatisfaction or annoyance about a product, service, or experience. It\\'s a negative expression of discontent, often with the intention of seeking a resolution or compensation.\\n    \"inquiry\",  # A question or request for information about a product, service, or situation. It\\'s a neutral or curious expression seeking clarification or details.\\n    \"feedback\",  # A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\\n    \"praise\",  # A statement expressing admiration, approval, or appreciation for a product, service, or experience. It\\'s a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\\n]\\n\\n\\n## Customer Query\\n```\\nCan you tell me more about your return policy?\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n    \"labels\": \"label\"\\n}\\n```'}]},\n        # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n\n        Free multi label classification without predefined labels:\n\n        ```python\n        from distilabel.steps.tasks import TextClassification\n\n        text_classification = TextClassification(\n            llm=llm,\n            n=3,\n            context=(\n                \"Describe the main themes, topics, or categories that could describe the \"\n                \"following type of persona.\"\n            ),\n            query_title=\"Example of Persona\",\n        )\n\n        text_classification.load()\n\n        result = next(\n            text_classification.process(\n                [{\"text\": \"A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical 
impact of the Mexican presence in the United States.\"}]\n            )\n        )\n        # result\n        # [{'text': 'A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.',\n        # 'labels': ['Historical Researcher',\n        # 'Cultural Specialist',\n        # 'Ethnic Studies Expert'],\n        # 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n    \"labels\": [\"Historical Researcher\", \"Cultural Specialist\", \"Ethnic Studies Expert\"]\\n}',\n        # 'raw_input_text_classification_0': [{'role': 'system',\n        #     'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n        #     {'role': 'user',\n        #     'content': '# Instruction\\nPlease classify the example of persona by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide a list of 3 labels that best describe the text.\\nDescribe the main themes, topics, or categories that could describe the following type of persona.\\nUse clear, widely understood terms for labels.Avoid overly specific or obscure labels unless the text demands it.\\n\\n\\n## Example of Persona\\n```\\nA historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n    \"labels\": [\"label_0\", \"label_1\", \"label_2\"]\\n}\\n```'}]},\n        # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n    \"\"\"\n\n    system_prompt: 
Optional[str] = (\n        \"You are an AI system specialized in generating labels to classify pieces of text. \"\n        \"Your sole purpose is to analyze the given text and provide appropriate classification labels.\"\n    )\n    n: PositiveInt = Field(\n        default=1,\n        description=\"Number of labels to generate. Defaults to 1.\",\n    )\n    context: Optional[str] = Field(\n        default=\"Generate concise, relevant labels that accurately represent the text's main themes, topics, or categories.\",\n        description=\"Context to use when generating the labels.\",\n    )\n    examples: Optional[List[str]] = Field(\n        default=None,\n        description=\"List of examples to help the model understand the task, few shots.\",\n    )\n    available_labels: Optional[Union[List[str], Dict[str, str]]] = Field(\n        default=None,\n        description=(\n            \"List of available labels to choose from when classifying the text, or \"\n            \"a dictionary with the labels and their descriptions.\"\n        ),\n    )\n    default_label: Optional[Union[str, List[str]]] = Field(\n        default=\"Unclassified\",\n        description=(\n            \"Default label to use when the text is ambiguous or lacks sufficient information for \"\n            \"classification. 
Can be a list in case of multiple labels (n>1).\"\n        ),\n    )\n    query_title: str = Field(\n        default=\"User Query\",\n        description=\"Title of the query used to show the example/s to classify.\",\n    )\n    use_default_structured_output: bool = True\n\n    _template: Optional[Template] = PrivateAttr(default=None)\n\n    def load(self) -> None:\n        super().load()\n        self._template = Template(TEXT_CLASSIFICATION_TEMPLATE)\n        self._labels_format: str = (\n            '\"label\"'\n            if self.n == 1\n            else \"[\" + \", \".join([f'\"label_{i}\"' for i in range(self.n)]) + \"]\"\n        )\n        self._labels_message: str = (\n            \"Provide the label that best describes the text.\"\n            if self.n == 1\n            else f\"Provide a list of {self.n} labels that best describe the text.\"\n        )\n        self._available_labels_message: str = self._get_available_labels_message()\n        self._examples: str = self._get_examples_message()\n\n    def _get_available_labels_message(self) -> str:\n        \"\"\"Prepares the message to display depending on the available labels (if any),\n        and whether the labels have a specific context.\n        \"\"\"\n        if self.available_labels is None:\n            return (\n                \"Use clear, widely understood terms for labels.\"\n                \"Avoid overly specific or obscure labels unless the text demands it.\"\n            )\n\n        msg = (\n            \"## Labeling the user input\\n\"\n            \"Use the available labels to classify the user query{label_context}:\\n\"\n            \"available_labels = {available_labels}\"\n        )\n        if isinstance(self.available_labels, list):\n            specific_msg = (\n                \"[\\n\"\n                + indent(\n                    \"\".join([f'\"{label}\",\\n' for label in self.available_labels]),\n                    prefix=\" \" * 4,\n                )\n                + 
\"]\"\n            )\n            return msg.format(label_context=\"\", available_labels=specific_msg)\n\n        elif isinstance(self.available_labels, dict):\n            specific_msg = \"\"\n            for label, description in self.available_labels.items():\n                specific_msg += indent(\n                    f'\"{label}\",  # {description}' + \"\\n\", prefix=\" \" * 4\n                )\n\n            specific_msg = \"[\\n\" + specific_msg + \"]\"\n            return msg.format(\n                label_context=\". Analyze the context of each label specifically\",\n                available_labels=specific_msg,\n            )\n\n    def _get_examples_message(self) -> str:\n        \"\"\"Prepares the message to display depending on the examples provided.\"\"\"\n        if self.examples is None:\n            return \"\"\n\n        examples_msg = \"\\n\".join([f\"- {ex}\" for ex in self.examples])\n\n        return (\n            \"\\n## Examples\\n\"\n            \"Here are some examples to help you understand the task:\\n\"\n            f\"{examples_msg}\"\n        )\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task is the `instruction`.\"\"\"\n        return [\"text\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n        return [\"labels\", \"model_name\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        messages = [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    context=f\"\\n{self.context}\",\n                    labels_message=self._labels_message,\n                    available_labels=self._available_labels_message,\n                
    examples=self._examples,\n                    default_label=self.default_label,\n                    labels_format=self._labels_format,\n                    query_title=self.query_title,\n                    text=input[\"text\"],\n                ),\n            },\n        ]\n        if self.system_prompt:\n            messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n        return messages\n\n    def format_output(\n        self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n        will be automatically included within the `process` method of `Task`.\"\"\"\n        return self._format_structured_output(output)\n\n    @override\n    def get_structured_output(self) -> Dict[str, Any]:\n        \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n        a dictionary with the output which can be directly parsed as a python dictionary.\n\n        Returns:\n            JSON Schema of the response to enforce.\n        \"\"\"\n        if self.n > 1:\n\n            class MultiLabelSchema(BaseModel):\n                labels: List[str]\n\n            return MultiLabelSchema.model_json_schema()\n\n        class SingleLabelSchema(BaseModel):\n            labels: str\n\n        return SingleLabelSchema.model_json_schema()\n\n    def _format_structured_output(\n        self, output: str\n    ) -> Dict[str, Union[str, List[str]]]:\n        \"\"\"Parses the structured response, which should correspond to a dictionary\n        with the `labels`, and either a string or a list of strings with the labels.\n\n        Args:\n            output: The output from the `LLM`.\n\n        Returns:\n            Formatted output.\n        \"\"\"\n        try:\n            return orjson.loads(output)\n        except orjson.JSONDecodeError:\n            if self.n > 1:\n                return 
{\"labels\": [None for _ in range(self.n)]}\n            return {\"labels\": None}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.inputs","title":"inputs: List[str] property","text":"

The input for the task is the text.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.outputs","title":"outputs: List[str] property","text":"

The output for the task is the labels and the model_name.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification._get_available_labels_message","title":"_get_available_labels_message()","text":"

Prepares the message to display depending on the available labels (if any), and whether the labels have a specific context.

Source code in src/distilabel/steps/tasks/text_classification.py
def _get_available_labels_message(self) -> str:\n    \"\"\"Prepares the message to display depending on the available labels (if any),\n    and whether the labels have a specific context.\n    \"\"\"\n    if self.available_labels is None:\n        return (\n            \"Use clear, widely understood terms for labels.\"\n            \"Avoid overly specific or obscure labels unless the text demands it.\"\n        )\n\n    msg = (\n        \"## Labeling the user input\\n\"\n        \"Use the available labels to classify the user query{label_context}:\\n\"\n        \"available_labels = {available_labels}\"\n    )\n    if isinstance(self.available_labels, list):\n        specific_msg = (\n            \"[\\n\"\n            + indent(\n                \"\".join([f'\"{label}\",\\n' for label in self.available_labels]),\n                prefix=\" \" * 4,\n            )\n            + \"]\"\n        )\n        return msg.format(label_context=\"\", available_labels=specific_msg)\n\n    elif isinstance(self.available_labels, dict):\n        specific_msg = \"\"\n        for label, description in self.available_labels.items():\n            specific_msg += indent(\n                f'\"{label}\",  # {description}' + \"\\n\", prefix=\" \" * 4\n            )\n\n        specific_msg = \"[\\n\" + specific_msg + \"]\"\n        return msg.format(\n            label_context=\". Analyze the context of each label specifically\",\n            available_labels=specific_msg,\n        )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification._get_examples_message","title":"_get_examples_message()","text":"

Prepares the message to display depending on the examples provided.

Source code in src/distilabel/steps/tasks/text_classification.py
def _get_examples_message(self) -> str:\n    \"\"\"Prepares the message to display depending on the examples provided.\"\"\"\n    if self.examples is None:\n        return \"\"\n\n    examples_msg = \"\\n\".join([f\"- {ex}\" for ex in self.examples])\n\n    return (\n        \"\\n## Examples\\n\"\n        \"Here are some examples to help you understand the task:\\n\"\n        f\"{examples_msg}\"\n    )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/text_classification.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    messages = [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(  # type: ignore\n                context=f\"\\n{self.context}\",\n                labels_message=self._labels_message,\n                available_labels=self._available_labels_message,\n                examples=self._examples,\n                default_label=self.default_label,\n                labels_format=self._labels_format,\n                query_title=self.query_title,\n                text=input[\"text\"],\n            ),\n        },\n    ]\n    if self.system_prompt:\n        messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n    return messages\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.format_output","title":"format_output(output, input=None)","text":"

The output is formatted as a dictionary with the labels. The model_name will be automatically included within the process method of Task.

Source code in src/distilabel/steps/tasks/text_classification.py
def format_output(\n    self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n    will be automatically included within the `process` method of `Task`.\"\"\"\n    return self._format_structured_output(output)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.get_structured_output","title":"get_structured_output()","text":"

Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary.

Returns:

Type Description Dict[str, Any]

JSON Schema of the response to enforce.

Source code in src/distilabel/steps/tasks/text_classification.py
@override\ndef get_structured_output(self) -> Dict[str, Any]:\n    \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n    a dictionary with the output which can be directly parsed as a python dictionary.\n\n    Returns:\n        JSON Schema of the response to enforce.\n    \"\"\"\n    if self.n > 1:\n\n        class MultiLabelSchema(BaseModel):\n            labels: List[str]\n\n        return MultiLabelSchema.model_json_schema()\n\n    class SingleLabelSchema(BaseModel):\n        labels: str\n\n    return SingleLabelSchema.model_json_schema()\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification._format_structured_output","title":"_format_structured_output(output)","text":"

Parses the structured response, which should correspond to a dictionary with the labels, and either a string or a list of strings with the labels.

Parameters:

Name Type Description Default output str

The output from the LLM.

required

Returns:

Type Description Dict[str, Union[str, List[str]]]

Formatted output.

Source code in src/distilabel/steps/tasks/text_classification.py
def _format_structured_output(\n    self, output: str\n) -> Dict[str, Union[str, List[str]]]:\n    \"\"\"Parses the structured response, which should correspond to a dictionary\n    with the `labels`, and either a string or a list of strings with the labels.\n\n    Args:\n        output: The output from the `LLM`.\n\n    Returns:\n        Formatted output.\n    \"\"\"\n    try:\n        return orjson.loads(output)\n    except orjson.JSONDecodeError:\n        if self.n > 1:\n            return {\"labels\": [None for _ in range(self.n)]}\n        return {\"labels\": None}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration","title":"ChatGeneration","text":"

Bases: Task

Generates text based on a conversation.

ChatGeneration is a pre-defined task that defines the messages as the input and generation as the output. This task is used to generate text based on a conversation. The model_name is also returned as part of the output in order to enhance it.

Input columns
  • messages (List[Dict[Literal[\"role\", \"content\"], str]]): The messages to generate the follow up completion from.
Output columns
  • generation (str): The generated text from the assistant.
  • model_name (str): The model name used to generate the text.
Categories
  • chat-generation
Icon

:material-chat:

Examples:

Generate text from a conversation in OpenAI chat format:

from distilabel.steps.tasks import ChatGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nchat = ChatGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    )\n)\n\nchat.load()\n\nresult = next(\n    chat.process(\n        [\n            {\n                \"messages\": [\n                    {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n                ]\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'messages': [{'role': 'user', 'content': 'How much is 2+2?'}],\n#         'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n#         'generation': '4',\n#     }\n# ]\n
Source code in src/distilabel/steps/tasks/text_generation.py
class ChatGeneration(Task):\n    \"\"\"Generates text based on a conversation.\n\n    `ChatGeneration` is a pre-defined task that defines the `messages` as the input\n    and `generation` as the output. This task is used to generate text based on a conversation.\n    The `model_name` is also returned as part of the output in order to enhance it.\n\n    Input columns:\n        - messages (`List[Dict[Literal[\"role\", \"content\"], str]]`): The messages to generate the\n            follow up completion from.\n\n    Output columns:\n        - generation (`str`): The generated text from the assistant.\n        - model_name (`str`): The model name used to generate the text.\n\n    Categories:\n        - chat-generation\n\n    Icon:\n        `:material-chat:`\n\n    Examples:\n        Generate text from a conversation in OpenAI chat format:\n\n        ```python\n        from distilabel.steps.tasks import ChatGeneration\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        chat = ChatGeneration(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            )\n        )\n\n        chat.load()\n\n        result = next(\n            chat.process(\n                [\n                    {\n                        \"messages\": [\n                            {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n                        ]\n                    }\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'messages': [{'role': 'user', 'content': 'How much is 2+2?'}],\n        #         'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n        #         'generation': '4',\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task are the `messages`.\"\"\"\n        return 
[\"messages\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType` assuming that the messages provided\n        are already formatted that way i.e. following the OpenAI chat format.\"\"\"\n\n        if not is_openai_format(input[\"messages\"]):\n            raise DistilabelUserError(\n                \"Input `messages` must be an OpenAI chat-like format conversation. \"\n                f\"Got: {input['messages']}. Please check: 'https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models'.\",\n                page=\"components-gallery/tasks/chatgeneration/\",\n            )\n\n        if input[\"messages\"][-1][\"role\"] != \"user\":\n            raise DistilabelUserError(\n                \"The last message must be from the user. Please check: \"\n                \"'https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models'.\",\n                page=\"components-gallery/tasks/chatgeneration/\",\n            )\n\n        return input[\"messages\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n        return [\"generation\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n        will be automatically included within the `process` method of `Task`.\"\"\"\n        return {\"generation\": output}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration.inputs","title":"inputs: List[str] property","text":"

The inputs for the task are the messages.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration.outputs","title":"outputs: List[str] property","text":"

The output for the task is the generation and the model_name.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the messages provided are already formatted that way i.e. following the OpenAI chat format.

Source code in src/distilabel/steps/tasks/text_generation.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType` assuming that the messages provided\n    are already formatted that way i.e. following the OpenAI chat format.\"\"\"\n\n    if not is_openai_format(input[\"messages\"]):\n        raise DistilabelUserError(\n            \"Input `messages` must be an OpenAI chat-like format conversation. \"\n            f\"Got: {input['messages']}. Please check: 'https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models'.\",\n            page=\"components-gallery/tasks/chatgeneration/\",\n        )\n\n    if input[\"messages\"][-1][\"role\"] != \"user\":\n        raise DistilabelUserError(\n            \"The last message must be from the user. Please check: \"\n            \"'https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models'.\",\n            page=\"components-gallery/tasks/chatgeneration/\",\n        )\n\n    return input[\"messages\"]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration.format_output","title":"format_output(output, input=None)","text":"

The output is formatted as a dictionary with the generation. The model_name will be automatically included within the process method of Task.

Source code in src/distilabel/steps/tasks/text_generation.py
def format_output(\n    self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n    will be automatically included within the `process` method of `Task`.\"\"\"\n    return {\"generation\": output}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration","title":"TextGeneration","text":"

Bases: Task

Text generation with an LLM given a prompt.

TextGeneration is a pre-defined task that allows passing a custom prompt using the Jinja2 syntax. By default, an instruction is expected in the inputs, but by using the template and columns attributes one can define a custom prompt and the columns expected from the text. This task should be good enough for tasks that don't need post-processing of the responses generated by the LLM.

Attributes:

Name Type Description system_prompt Union[str, None]

The system prompt to use in the generation. If not provided, then it will check if the input row has a column named system_prompt and use it. If not, then no system prompt will be used. Defaults to None.

template str

The template to use for the generation. It must follow the Jinja2 template syntax. If not provided, it will assume the text passed is an instruction and construct the appropriate template.

columns Union[str, List[str]]

A string with the column, or a list with columns expected in the template. Take a look at the examples for more information. Defaults to instruction.

use_system_prompt bool

DEPRECATED. To be removed in 1.5.0. Whether to use the system prompt in the generation. Defaults to True, which means that if the column system_prompt is defined within the input batch, then the system_prompt will be used, otherwise, it will be ignored.

Input columns
  • dynamic (determined by the columns attribute): By default it will be set to instruction. The columns can point to either a str or a List[str] to be used in the template.
Output columns
  • generation (str): The generated text.
  • model_name (str): The name of the model used to generate the text.
Categories
  • text-generation
References
  • Jinja2 Template Designer Documentation

Examples:

Generate text from an instruction:

from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\ntext_gen = TextGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    )\n)\n\ntext_gen.load()\n\nresult = next(\n    text_gen.process(\n        [{\"instruction\": \"your instruction\"}]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'your instruction',\n#         'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n#         'generation': 'generation',\n#     }\n# ]\n

Use a custom template to generate text:

from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nCUSTOM_TEMPLATE = '''Document:\n{{ document }}\n\nQuestion: {{ question }}\n\nPlease provide a clear and concise answer to the question based on the information in the document and your general knowledge:\n'''.rstrip()\n\ntext_gen = TextGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    system_prompt=\"You are a helpful AI assistant. Your task is to answer the following question based on the provided document. If the answer is not explicitly stated in the document, use your knowledge to provide the most relevant and accurate answer possible. If you cannot answer the question based on the given information, state that clearly.\",\n    template=CUSTOM_TEMPLATE,\n    columns=[\"document\", \"question\"],\n)\n\ntext_gen.load()\n\nresult = next(\n    text_gen.process(\n        [\n            {\n                \"document\": \"The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.\",\n                \"question\": \"What is the main threat to the Great Barrier Reef mentioned in the document?\"\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'document': 'The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. 
However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.',\n#         'question': 'What is the main threat to the Great Barrier Reef mentioned in the document?',\n#         'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n#         'generation': 'According to the document, the main threat to the Great Barrier Reef is climate change, specifically rising sea temperatures causing coral bleaching events.',\n#     }\n# ]\n

Few shot learning with different system prompts:

from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nCUSTOM_TEMPLATE = '''Generate a clear, single-sentence instruction based on the following examples:\n\n{% for example in examples %}\nExample {{ loop.index }}:\nInstruction: {{ example }}\n\n{% endfor %}\nNow, generate a new instruction in a similar style:\n'''.rstrip()\n\ntext_gen = TextGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    template=CUSTOM_TEMPLATE,\n    columns=\"examples\",\n)\n\ntext_gen.load()\n\nresult = next(\n    text_gen.process(\n        [\n            {\n                \"examples\": [\"This is an example\", \"Another relevant example\"],\n                \"system_prompt\": \"You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.\"\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'examples': ['This is an example', 'Another relevant example'],\n#         'system_prompt': 'You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.',\n#         'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n#         'generation': 'Disable the firewall on the router',\n#     }\n# ]\n
Source code in src/distilabel/steps/tasks/text_generation.py
class TextGeneration(Task):\n    \"\"\"Text generation with an `LLM` given a prompt.\n\n    `TextGeneration` is a pre-defined task that allows passing a custom prompt using the\n    Jinja2 syntax. By default, a `instruction` is expected in the inputs, but the using\n    `template` and `columns` attributes one can define a custom prompt and columns expected\n    from the text. This task should be good enough for tasks that don't need post-processing\n    of the responses generated by the LLM.\n\n    Attributes:\n        system_prompt: The system prompt to use in the generation. If not provided, then\n            it will check if the input row has a column named `system_prompt` and use it.\n            If not, then no system prompt will be used. Defaults to `None`.\n        template: The template to use for the generation. It must follow the Jinja2 template\n            syntax. If not provided, it will assume the text passed is an instruction and\n            construct the appropriate template.\n        columns: A string with the column, or a list with columns expected in the template.\n            Take a look at the examples for more information. Defaults to `instruction`.\n        use_system_prompt: DEPRECATED. To be removed in 1.5.0. Whether to use the system\n            prompt in the generation. 
Defaults to `True`, which means that if the column\n            `system_prompt` is defined within the input batch, then the `system_prompt`\n            will be used, otherwise, it will be ignored.\n\n    Input columns:\n        - dynamic (determined by `columns` attribute): By default will be set to `instruction`.\n            The columns can point both to a `str` or a `List[str]` to be used in the template.\n\n    Output columns:\n        - generation (`str`): The generated text.\n        - model_name (`str`): The name of the model used to generate the text.\n\n    Categories:\n        - text-generation\n\n    References:\n        - [Jinja2 Template Designer Documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/)\n\n    Examples:\n        Generate text from an instruction:\n\n        ```python\n        from distilabel.steps.tasks import TextGeneration\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        text_gen = TextGeneration(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            )\n        )\n\n        text_gen.load()\n\n        result = next(\n            text_gen.process(\n                [{\"instruction\": \"your instruction\"}]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'instruction': 'your instruction',\n        #         'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n        #         'generation': 'generation',\n        #     }\n        # ]\n        ```\n\n        Use a custom template to generate text:\n\n        ```python\n        from distilabel.steps.tasks import TextGeneration\n        from distilabel.models import InferenceEndpointsLLM\n\n        CUSTOM_TEMPLATE = '''Document:\n        {{ document }}\n\n        Question: {{ question }}\n\n        Please provide a clear and concise answer to the question based on the 
information in the document and your general knowledge:\n        '''.rstrip()\n\n        text_gen = TextGeneration(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            system_prompt=\"You are a helpful AI assistant. Your task is to answer the following question based on the provided document. If the answer is not explicitly stated in the document, use your knowledge to provide the most relevant and accurate answer possible. If you cannot answer the question based on the given information, state that clearly.\",\n            template=CUSTOM_TEMPLATE,\n            columns=[\"document\", \"question\"],\n        )\n\n        text_gen.load()\n\n        result = next(\n            text_gen.process(\n                [\n                    {\n                        \"document\": \"The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.\",\n                        \"question\": \"What is the main threat to the Great Barrier Reef mentioned in the document?\"\n                    }\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'document': 'The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. 
However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.',\n        #         'question': 'What is the main threat to the Great Barrier Reef mentioned in the document?',\n        #         'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n        #         'generation': 'According to the document, the main threat to the Great Barrier Reef is climate change, specifically rising sea temperatures causing coral bleaching events.',\n        #     }\n        # ]\n        ```\n\n        Few shot learning with different system prompts:\n\n        ```python\n        from distilabel.steps.tasks import TextGeneration\n        from distilabel.models import InferenceEndpointsLLM\n\n        CUSTOM_TEMPLATE = '''Generate a clear, single-sentence instruction based on the following examples:\n\n        {% for example in examples %}\n        Example {{ loop.index }}:\n        Instruction: {{ example }}\n\n        {% endfor %}\n        Now, generate a new instruction in a similar style:\n        '''.rstrip()\n\n        text_gen = TextGeneration(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            template=CUSTOM_TEMPLATE,\n            columns=\"examples\",\n        )\n\n        text_gen.load()\n\n        result = next(\n            text_gen.process(\n                [\n                    {\n                        \"examples\": [\"This is an example\", \"Another relevant example\"],\n                        \"system_prompt\": \"You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.\"\n                    }\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'examples': ['This is an example', 'Another relevant example'],\n        #         'system_prompt': 'You are 
an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.',\n        #         'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n        #         'generation': 'Disable the firewall on the router',\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    system_prompt: Union[str, None] = None\n    use_system_prompt: bool = Field(default=True, deprecated=True)\n    template: str = Field(\n        default=\"{{ instruction }}\",\n        description=(\n            \"This is a template or prompt to use for the generation. \"\n            \"If not provided, it is assumed a `instruction` is placed in the inputs, \"\n            \"to be used as is.\"\n        ),\n    )\n    columns: Union[str, List[str]] = Field(\n        default=\"instruction\",\n        description=(\n            \"Custom column or list of columns to include in the input. \"\n            \"If a `template` is provided which needs custom column names, \"\n            \"then they should be provided here. 
By default it will use `instruction`.\"\n        ),\n    )\n\n    _can_be_used_with_offline_batch_generation = True\n    _template: Optional[\"Template\"] = PrivateAttr(default=...)\n\n    def model_post_init(self, __context: Any) -> None:\n        self.columns = [self.columns] if isinstance(self.columns, str) else self.columns\n        super().model_post_init(__context)\n\n    def load(self) -> None:\n        super().load()\n\n        def check_column_in_template(column, template):\n            pattern = (\n                r\"(?:{%.*?\\b\"\n                + re.escape(column)\n                + r\"\\b.*?%}|{{\\s*\"\n                + re.escape(column)\n                + r\"\\s*}})\"\n            )\n            if not re.search(pattern, template):\n                raise DistilabelUserError(\n                    (\n                        f\"You required column name '{column}', but is not present in the template, \"\n                        \"ensure the 'columns' match with the 'template' to avoid errors.\"\n                    ),\n                    page=\"components-gallery/tasks/textgeneration/\",\n                )\n\n        for column in self.columns:\n            check_column_in_template(column, self.template)\n\n        self._template = Template(self.template)\n\n    def unload(self) -> None:\n        super().unload()\n        self._template = None\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The input for the task is the `instruction` by default, or the `columns` given as input.\"\"\"\n        columns = {column: True for column in self.columns}\n        columns[\"system_prompt\"] = False\n        return columns\n\n    def _prepare_message_content(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"Prepares the content for the template and returns the formatted messages.\"\"\"\n        fields = {column: input[column] for column in self.columns}\n        return [{\"role\": \"user\", \"content\": 
self._template.render(**fields)}]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        # Handle the previous expected errors, in case of custom columns there's more freedom\n        # and we cannot check it so easily.\n        if self.columns == [\"instruction\"]:\n            if is_openai_format(input[\"instruction\"]):\n                raise DistilabelUserError(\n                    \"Providing `instruction` formatted as an OpenAI chat / conversation is\"\n                    \" deprecated, you should use `ChatGeneration` with `messages` as input instead.\",\n                    page=\"components-gallery/tasks/textgeneration/\",\n                )\n\n            if not isinstance(input[\"instruction\"], str):\n                raise DistilabelUserError(\n                    f\"Input `instruction` must be a string. Got: {input['instruction']}.\",\n                    page=\"components-gallery/tasks/textgeneration/\",\n                )\n\n        messages = self._prepare_message_content(input)\n\n        row_system_prompt = input.get(\"system_prompt\")\n        if row_system_prompt:\n            messages.insert(0, {\"role\": \"system\", \"content\": row_system_prompt})\n\n        if self.system_prompt and not row_system_prompt:\n            messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n\n        return messages  # type: ignore\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n        return [\"generation\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a dictionary with the `generation`. 
The `model_name`\n        will be automatically included within the `process` method of `Task`.\"\"\"\n        return {\"generation\": output}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration.inputs","title":"inputs: StepColumns property","text":"

The input for the task is the instruction by default, or the columns given as input.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration.outputs","title":"outputs: List[str] property","text":"

The output for the task is the generation and the model_name.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration._prepare_message_content","title":"_prepare_message_content(input)","text":"

Prepares the content for the template and returns the formatted messages.

Source code in src/distilabel/steps/tasks/text_generation.py
def _prepare_message_content(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"Prepares the content for the template and returns the formatted messages.\"\"\"\n    fields = {column: input[column] for column in self.columns}\n    return [{\"role\": \"user\", \"content\": self._template.render(**fields)}]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/text_generation.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    # Handle the previous expected errors, in case of custom columns there's more freedom\n    # and we cannot check it so easily.\n    if self.columns == [\"instruction\"]:\n        if is_openai_format(input[\"instruction\"]):\n            raise DistilabelUserError(\n                \"Providing `instruction` formatted as an OpenAI chat / conversation is\"\n                \" deprecated, you should use `ChatGeneration` with `messages` as input instead.\",\n                page=\"components-gallery/tasks/textgeneration/\",\n            )\n\n        if not isinstance(input[\"instruction\"], str):\n            raise DistilabelUserError(\n                f\"Input `instruction` must be a string. Got: {input['instruction']}.\",\n                page=\"components-gallery/tasks/textgeneration/\",\n            )\n\n    messages = self._prepare_message_content(input)\n\n    row_system_prompt = input.get(\"system_prompt\")\n    if row_system_prompt:\n        messages.insert(0, {\"role\": \"system\", \"content\": row_system_prompt})\n\n    if self.system_prompt and not row_system_prompt:\n        messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n\n    return messages  # type: ignore\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration.format_output","title":"format_output(output, input=None)","text":"

The output is formatted as a dictionary with the generation. The model_name will be automatically included within the process method of Task.

Source code in src/distilabel/steps/tasks/text_generation.py
def format_output(\n    self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n    will be automatically included within the `process` method of `Task`.\"\"\"\n    return {\"generation\": output}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback","title":"UltraFeedback","text":"

Bases: Task

Rank generations focusing on different aspects using an LLM.

UltraFeedback: Boosting Language Models with High-quality Feedback.

Attributes:

Name Type Description aspect Literal['helpfulness', 'honesty', 'instruction-following', 'truthfulness', 'overall-rating']

The aspect to perform with the UltraFeedback model. The available aspects are: - helpfulness: Evaluate text outputs based on helpfulness. - honesty: Evaluate text outputs based on honesty. - instruction-following: Evaluate text outputs based on given instructions. - truthfulness: Evaluate text outputs based on truthfulness. Additionally, a custom aspect has been defined by Argilla, so as to evaluate the overall assessment of the text outputs within a single prompt. The custom aspect is: - overall-rating: Evaluate text outputs based on an overall assessment. Defaults to \"overall-rating\".

Input columns
  • instruction (str): The reference instruction to evaluate the text outputs.
  • generations (List[str]): The text outputs to evaluate for the given instruction.
Output columns
  • ratings (List[float]): The ratings for each of the provided text outputs.
  • rationales (List[str]): The rationales for each of the provided text outputs.
  • model_name (str): The name of the model used to generate the ratings and rationales.
Categories
  • preference
References
  • UltraFeedback: Boosting Language Models with High-quality Feedback
  • UltraFeedback - GitHub Repository

Examples:

Rate generations from different LLMs based on the selected aspect:

from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    use_default_structured_output=False\n)\n\nultrafeedback.load()\n\nresult = next(\n    ultrafeedback.process(\n        [\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"generations\": [\"4\", \"and a car\"],\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'How much is 2+2?',\n#         'generations': ['4', 'and a car'],\n#         'ratings': [1, 2],\n#         'rationales': ['explanation for 4', 'explanation for and a car'],\n#         'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n#     }\n# ]\n

Rate generations from different LLMs based on honesty, using the default structured output:

from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    aspect=\"honesty\"\n)\n\nultrafeedback.load()\n\nresult = next(\n    ultrafeedback.process(\n        [\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"generations\": [\"4\", \"and a car\"],\n            }\n        ]\n    )\n)\n# result\n# [{'instruction': 'How much is 2+2?',\n# 'generations': ['4', 'and a car'],\n# 'ratings': [5, 1],\n# 'rationales': ['The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.',\n# \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"],\n# 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{\"ratings\": [\\n    5,\\n    1\\n] \\n\\n,\"rationales\": [\\n    \"The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.\",\\n    \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"\\n] }'},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n

Rate generations from different LLMs based on helpfulness, using the default structured output:

from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        generation_kwargs={\"max_new_tokens\": 512},\n    ),\n    aspect=\"helpfulness\"\n)\n\nultrafeedback.load()\n\nresult = next(\n    ultrafeedback.process(\n        [\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"generations\": [\"4\", \"and a car\"],\n            }\n        ]\n    )\n)\n# result\n# [{'instruction': 'How much is 2+2?',\n#   'generations': ['4', 'and a car'],\n#   'ratings': [1, 5],\n#   'rationales': ['Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.',\n#    'Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question.'],\n#   'rationales_for_rating': ['Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.',\n#    'Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.'],\n#   'types': [1, 3, 1],\n#   'distilabel_metadata': {'raw_output_ultra_feedback_0': '{ \\n  \"ratings\": [\\n    1,\\n    5\\n  ]\\n ,\\n  \"rationales\": [\\n    \"Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.\",\\n    \"Text 2 is neither clear nor relevant to the task. 
It does not provide any useful information and seems unrelated to the question.\"\\n  ]\\n ,\\n  \"rationales_for_rating\": [\\n    \"Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.\",\\n    \"Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.\"\\n  ]\\n ,\\n  \"types\": [\\n    1, 3,\\n    1\\n  ]\\n  }'},\n#   'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
Citations
@misc{cui2024ultrafeedbackboostinglanguagemodels,\n    title={UltraFeedback: Boosting Language Models with Scaled AI Feedback},\n    author={Ganqu Cui and Lifan Yuan and Ning Ding and Guanming Yao and Bingxiang He and Wei Zhu and Yuan Ni and Guotong Xie and Ruobing Xie and Yankai Lin and Zhiyuan Liu and Maosong Sun},\n    year={2024},\n    eprint={2310.01377},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2310.01377},\n}\n
Source code in src/distilabel/steps/tasks/ultrafeedback.py
class UltraFeedback(Task):\n    \"\"\"Rank generations focusing on different aspects using an `LLM`.\n\n    UltraFeedback: Boosting Language Models with High-quality Feedback.\n\n    Attributes:\n        aspect: The aspect to perform with the `UltraFeedback` model. The available aspects are:\n            - `helpfulness`: Evaluate text outputs based on helpfulness.\n            - `honesty`: Evaluate text outputs based on honesty.\n            - `instruction-following`: Evaluate text outputs based on given instructions.\n            - `truthfulness`: Evaluate text outputs based on truthfulness.\n            Additionally, a custom aspect has been defined by Argilla, so as to evaluate the overall\n            assessment of the text outputs within a single prompt. The custom aspect is:\n            - `overall-rating`: Evaluate text outputs based on an overall assessment.\n            Defaults to `\"overall-rating\"`.\n\n    Input columns:\n        - instruction (`str`): The reference instruction to evaluate the text outputs.\n        - generations (`List[str]`): The text outputs to evaluate for the given instruction.\n\n    Output columns:\n        - ratings (`List[float]`): The ratings for each of the provided text outputs.\n        - rationales (`List[str]`): The rationales for each of the provided text outputs.\n        - model_name (`str`): The name of the model used to generate the ratings and rationales.\n\n    Categories:\n        - preference\n\n    References:\n        - [`UltraFeedback: Boosting Language Models with High-quality Feedback`](https://arxiv.org/abs/2310.01377)\n        - [`UltraFeedback - GitHub Repository`](https://github.com/OpenBMB/UltraFeedback)\n\n    Examples:\n        Rate generations from different LLMs based on the selected aspect:\n\n        ```python\n        from distilabel.steps.tasks import UltraFeedback\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n     
   ultrafeedback = UltraFeedback(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            use_default_structured_output=False\n        )\n\n        ultrafeedback.load()\n\n        result = next(\n            ultrafeedback.process(\n                [\n                    {\n                        \"instruction\": \"How much is 2+2?\",\n                        \"generations\": [\"4\", \"and a car\"],\n                    }\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'instruction': 'How much is 2+2?',\n        #         'generations': ['4', 'and a car'],\n        #         'ratings': [1, 2],\n        #         'rationales': ['explanation for 4', 'explanation for and a car'],\n        #         'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n        #     }\n        # ]\n        ```\n\n        Rate generations from different LLMs based on the honesty, using the default structured output:\n\n        ```python\n        from distilabel.steps.tasks import UltraFeedback\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        ultrafeedback = UltraFeedback(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            aspect=\"honesty\"\n        )\n\n        ultrafeedback.load()\n\n        result = next(\n            ultrafeedback.process(\n                [\n                    {\n                        \"instruction\": \"How much is 2+2?\",\n                        \"generations\": [\"4\", \"and a car\"],\n                    }\n                ]\n            )\n        )\n        # result\n        # [{'instruction': 'How much is 2+2?',\n        # 'generations': ['4', 'and a car'],\n        # 'ratings': [5, 1],\n        # 'rationales': ['The response is correct and 
confident, as it directly answers the question without expressing any uncertainty or doubt.',\n        # \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"],\n        # 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{\"ratings\": [\\\\n    5,\\\\n    1\\\\n] \\\\n\\\\n,\"rationales\": [\\\\n    \"The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.\",\\\\n    \"The response is confidently incorrect, as it provides unrelated information (\\'a car\\') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"\\\\n] }'},\n        # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n\n        Rate generations from different LLMs based on the helpfulness, using the default structured output:\n\n        ```python\n        from distilabel.steps.tasks import UltraFeedback\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        ultrafeedback = UltraFeedback(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                generation_kwargs={\"max_new_tokens\": 512},\n            ),\n            aspect=\"helpfulness\"\n        )\n\n        ultrafeedback.load()\n\n        result = next(\n            ultrafeedback.process(\n                [\n                    {\n                        \"instruction\": \"How much is 2+2?\",\n                        \"generations\": [\"4\", \"and a car\"],\n                    }\n                ]\n            )\n        )\n        # result\n        # [{'instruction': 'How much is 2+2?',\n        #   'generations': ['4', 'and a car'],\n        #   'ratings': [1, 5],\n        #   'rationales': ['Text 1 
is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.',\n        #    'Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question.'],\n        #   'rationales_for_rating': ['Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.',\n        #    'Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.'],\n        #   'types': [1, 3, 1],\n        #   'distilabel_metadata': {'raw_output_ultra_feedback_0': '{ \\\\n  \"ratings\": [\\\\n    1,\\\\n    5\\\\n  ]\\\\n ,\\\\n  \"rationales\": [\\\\n    \"Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.\",\\\\n    \"Text 2 is neither clear nor relevant to the task. 
It does not provide any useful information and seems unrelated to the question.\"\\\\n  ]\\\\n ,\\\\n  \"rationales_for_rating\": [\\\\n    \"Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.\",\\\\n    \"Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.\"\\\\n  ]\\\\n ,\\\\n  \"types\": [\\\\n    1, 3,\\\\n    1\\\\n  ]\\\\n  }'},\n        #   'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n\n    Citations:\n        ```\n        @misc{cui2024ultrafeedbackboostinglanguagemodels,\n            title={UltraFeedback: Boosting Language Models with Scaled AI Feedback},\n            author={Ganqu Cui and Lifan Yuan and Ning Ding and Guanming Yao and Bingxiang He and Wei Zhu and Yuan Ni and Guotong Xie and Ruobing Xie and Yankai Lin and Zhiyuan Liu and Maosong Sun},\n            year={2024},\n            eprint={2310.01377},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2310.01377},\n        }\n        ```\n    \"\"\"\n\n    aspect: Literal[\n        \"helpfulness\",\n        \"honesty\",\n        \"instruction-following\",\n        \"truthfulness\",\n        # Custom aspects\n        \"overall-rating\",\n    ] = \"overall-rating\"\n\n    _system_prompt: str = PrivateAttr(\n        default=(\n            \"Your role is to evaluate text quality based on given criteria.\\n\"\n            'You\\'ll receive an instructional description (\"Instruction\") and {no_texts} text outputs (\"Text\").\\n'\n            \"Understand and interpret instructions to evaluate effectively.\\n\"\n            \"Provide annotations for each text with a rating and rationale.\\n\"\n            \"The {no_texts} texts given are independent, and should be evaluated separately.\\n\"\n        )\n    )\n    _template: Optional[\"Template\"] 
= PrivateAttr(default=...)\n    _can_be_used_with_offline_batch_generation = True\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template for the given `aspect`.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"ultrafeedback\"\n            / f\"{self.aspect}.jinja2\"\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task is the `instruction`, and the `generations` for it.\"\"\"\n        return [\"instruction\", \"generations\"]\n\n    def format_input(self, input: Dict[str, Any]) -> ChatType:\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": self._system_prompt.format(\n                    no_texts=len(input[\"generations\"])\n                ),\n            },\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    instruction=input[\"instruction\"], generations=input[\"generations\"]\n                ),\n            },\n        ]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n        columns = []\n        if self.aspect in [\"honesty\", \"instruction-following\", \"overall-rating\"]:\n            columns = [\"ratings\", \"rationales\"]\n        elif self.aspect in [\"helpfulness\", \"truthfulness\"]:\n            columns = [\"types\", \"rationales\", \"ratings\", \"rationales-for-ratings\"]\n        return columns + [\"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: 
Union[Dict[str, Any], None] = None\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a dictionary with the `ratings` and `rationales` for\n        each of the provided `generations` for the given `instruction`. The `model_name`\n        will be automatically included within the `process` method of `Task`.\n\n        Args:\n            output: a string representing the output of the LLM via the `process` method.\n            input: the input to the task, as required by some tasks to format the output.\n\n        Returns:\n            A dictionary containing either the `ratings` and `rationales` for each of the provided\n            `generations` for the given `instruction` if the provided aspect is either `honesty`,\n            `instruction-following`, or `overall-rating`; or the `types`, `rationales`,\n            `ratings`, and `rationales-for-ratings` for each of the provided `generations` for the\n            given `instruction` if the provided aspect is either `helpfulness` or `truthfulness`.\n        \"\"\"\n        assert input is not None, \"Input is required to format the output.\"\n\n        if self.aspect in [\n            \"honesty\",\n            \"instruction-following\",\n            \"overall-rating\",\n        ]:\n            return self._format_ratings_rationales_output(output, input)\n\n        return self._format_types_ratings_rationales_output(output, input)\n\n    def _format_ratings_rationales_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, List[Any]]:\n        \"\"\"Formats the output when the aspect is either `honesty`, `instruction-following`, or `overall-rating`.\"\"\"\n        if output is None:\n            return {\n                \"ratings\": [None] * len(input[\"generations\"]),\n                \"rationales\": [None] * len(input[\"generations\"]),\n            }\n\n        if self.use_default_structured_output:\n            return self._format_structured_output(output, 
input)\n\n        pattern = r\"Rating: (.+?)\\nRationale: (.+)\"\n        sections = output.split(\"\\n\\n\")\n\n        formatted_outputs = []\n        for section in sections:\n            matches = None\n            if section is not None and section != \"\":\n                matches = re.search(pattern, section, re.DOTALL)\n            if not matches:\n                formatted_outputs.append({\"ratings\": None, \"rationales\": None})\n                continue\n\n            formatted_outputs.append(\n                {\n                    \"ratings\": (\n                        int(re.findall(r\"\\b\\d+\\b\", matches.group(1))[0])\n                        if matches.group(1) not in [\"None\", \"N/A\"]\n                        else None\n                    ),\n                    \"rationales\": matches.group(2),\n                }\n            )\n        return group_dicts(*formatted_outputs)\n\n    def _format_types_ratings_rationales_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, List[Any]]:\n        \"\"\"Formats the output when the aspect is either `helpfulness` or `truthfulness`.\"\"\"\n        if output is None:\n            return {\n                \"types\": [None] * len(input[\"generations\"]),\n                \"rationales\": [None] * len(input[\"generations\"]),\n                \"ratings\": [None] * len(input[\"generations\"]),\n                \"rationales-for-ratings\": [None] * len(input[\"generations\"]),\n            }\n\n        if self.use_default_structured_output:\n            return self._format_structured_output(output, input)\n\n        pattern = r\"Type: (.+?)\\nRationale: (.+?)\\nRating: (.+?)\\nRationale: (.+)\"\n\n        sections = output.split(\"\\n\\n\")\n\n        formatted_outputs = []\n        for section in sections:\n            matches = None\n            if section is not None and section != \"\":\n                matches = re.search(pattern, section, re.DOTALL)\n            if 
not matches:\n                formatted_outputs.append(\n                    {\n                        \"types\": None,\n                        \"rationales\": None,\n                        \"ratings\": None,\n                        \"rationales-for-ratings\": None,\n                    }\n                )\n                continue\n\n            formatted_outputs.append(\n                {\n                    \"types\": (\n                        int(re.findall(r\"\\b\\d+\\b\", matches.group(1))[0])\n                        if matches.group(1) not in [\"None\", \"N/A\"]\n                        else None\n                    ),\n                    \"rationales\": matches.group(2),\n                    \"ratings\": (\n                        int(re.findall(r\"\\b\\d+\\b\", matches.group(3))[0])\n                        if matches.group(3) not in [\"None\", \"N/A\"]\n                        else None\n                    ),\n                    \"rationales-for-ratings\": matches.group(4),\n                }\n            )\n        return group_dicts(*formatted_outputs)\n\n    @override\n    def get_structured_output(self) -> Dict[str, Any]:\n        \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n        a dictionary with the output which can be directly parsed as a python dictionary.\n\n        The schema corresponds to the following:\n\n        ```python\n        from pydantic import BaseModel\n        from typing import List\n\n        class SchemaUltraFeedback(BaseModel):\n            ratings: List[int]\n            rationales: List[str]\n\n        class SchemaUltraFeedbackWithType(BaseModel):\n            types: List[Optional[int]]\n            ratings: List[int]\n            rationales: List[str]\n            rationales_for_rating: List[str]\n        ```\n\n        Returns:\n            JSON Schema of the response to enforce.\n        \"\"\"\n        if self.aspect in [\n            \"honesty\",\n            
\"instruction-following\",\n            \"overall-rating\",\n        ]:\n            return {\n                \"properties\": {\n                    \"ratings\": {\n                        \"items\": {\"type\": \"integer\"},\n                        \"title\": \"Ratings\",\n                        \"type\": \"array\",\n                    },\n                    \"rationales\": {\n                        \"items\": {\"type\": \"string\"},\n                        \"title\": \"Rationales\",\n                        \"type\": \"array\",\n                    },\n                },\n                \"required\": [\"ratings\", \"rationales\"],\n                \"title\": \"SchemaUltraFeedback\",\n                \"type\": \"object\",\n            }\n        return {\n            \"properties\": {\n                \"types\": {\n                    \"items\": {\"anyOf\": [{\"type\": \"integer\"}, {\"type\": \"null\"}]},\n                    \"title\": \"Types\",\n                    \"type\": \"array\",\n                },\n                \"ratings\": {\n                    \"items\": {\"type\": \"integer\"},\n                    \"title\": \"Ratings\",\n                    \"type\": \"array\",\n                },\n                \"rationales\": {\n                    \"items\": {\"type\": \"string\"},\n                    \"title\": \"Rationales\",\n                    \"type\": \"array\",\n                },\n                \"rationales_for_rating\": {\n                    \"items\": {\"type\": \"string\"},\n                    \"title\": \"Rationales For Rating\",\n                    \"type\": \"array\",\n                },\n            },\n            \"required\": [\"types\", \"ratings\", \"rationales\", \"rationales_for_rating\"],\n            \"title\": \"SchemaUltraFeedbackWithType\",\n            \"type\": \"object\",\n        }\n\n    def _format_structured_output(\n        self, output: str, input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        
\"\"\"Parses the structured response, which should correspond to a dictionary\n        with either `positive`, or `positive` and `negative` keys.\n\n        Args:\n            output: The output from the `LLM`.\n\n        Returns:\n            Formatted output.\n        \"\"\"\n        try:\n            return orjson.loads(output)\n        except orjson.JSONDecodeError:\n            if self.aspect in [\n                \"honesty\",\n                \"instruction-following\",\n                \"overall-rating\",\n            ]:\n                return {\n                    \"ratings\": [None] * len(input[\"generations\"]),\n                    \"rationales\": [None] * len(input[\"generations\"]),\n                }\n            return {\n                \"ratings\": [None] * len(input[\"generations\"]),\n                \"rationales\": [None] * len(input[\"generations\"]),\n                \"types\": [None] * len(input[\"generations\"]),\n                \"rationales-for-ratings\": [None] * len(input[\"generations\"]),\n            }\n\n    @override\n    def _sample_input(self) -> ChatType:\n        return self.format_input(\n            {\n                \"instruction\": f\"<PLACEHOLDER_{'instruction'.upper()}>\",\n                \"generations\": [\n                    f\"<PLACEHOLDER_{f'GENERATION_{i}'.upper()}>\" for i in range(2)\n                ],\n            }\n        )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.inputs","title":"inputs: List[str] property","text":"

The input for the task is the instruction, and the generations for it.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.outputs","title":"outputs: List[str] property","text":"

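The output for the task is the ratings and rationales (plus the types and rationales-for-ratings when the aspect is helpfulness or truthfulness), together with the model_name.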

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.load","title":"load()","text":"

Loads the Jinja2 template for the given aspect.

Source code in src/distilabel/steps/tasks/ultrafeedback.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template for the given `aspect`.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"ultrafeedback\"\n        / f\"{self.aspect}.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/ultrafeedback.py
def format_input(self, input: Dict[str, Any]) -> ChatType:\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    return [\n        {\n            \"role\": \"system\",\n            \"content\": self._system_prompt.format(\n                no_texts=len(input[\"generations\"])\n            ),\n        },\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(  # type: ignore\n                instruction=input[\"instruction\"], generations=input[\"generations\"]\n            ),\n        },\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.format_output","title":"format_output(output, input=None)","text":"

The output is formatted as a dictionary with the ratings and rationales for each of the provided generations for the given instruction. The model_name will be automatically included within the process method of Task.

Parameters:

Name Type Description Default output Union[str, None]

a string representing the output of the LLM via the process method.

required input Union[Dict[str, Any], None]

the input to the task, as required by some tasks to format the output.

None

Returns:

Type Description Dict[str, Any]

A dictionary containing either the ratings and rationales for each of the provided generations for the given instruction if the provided aspect is either honesty, instruction-following, or overall-rating; or the types, rationales, ratings, and rationales-for-ratings for each of the provided generations for the given instruction if the provided aspect is either helpfulness or truthfulness.

Source code in src/distilabel/steps/tasks/ultrafeedback.py
def format_output(\n    self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a dictionary with the `ratings` and `rationales` for\n    each of the provided `generations` for the given `instruction`. The `model_name`\n    will be automatically included within the `process` method of `Task`.\n\n    Args:\n        output: a string representing the output of the LLM via the `process` method.\n        input: the input to the task, as required by some tasks to format the output.\n\n    Returns:\n        A dictionary containing either the `ratings` and `rationales` for each of the provided\n        `generations` for the given `instruction` if the provided aspect is either `honesty`,\n        `instruction-following`, or `overall-rating`; or the `types`, `rationales`,\n        `ratings`, and `rationales-for-ratings` for each of the provided `generations` for the\n        given `instruction` if the provided aspect is either `helpfulness` or `truthfulness`.\n    \"\"\"\n    assert input is not None, \"Input is required to format the output.\"\n\n    if self.aspect in [\n        \"honesty\",\n        \"instruction-following\",\n        \"overall-rating\",\n    ]:\n        return self._format_ratings_rationales_output(output, input)\n\n    return self._format_types_ratings_rationales_output(output, input)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback._format_ratings_rationales_output","title":"_format_ratings_rationales_output(output, input)","text":"

Formats the output when the aspect is either honesty, instruction-following, or overall-rating.

Source code in src/distilabel/steps/tasks/ultrafeedback.py
def _format_ratings_rationales_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, List[Any]]:\n    \"\"\"Formats the output when the aspect is either `honesty`, `instruction-following`, or `overall-rating`.\"\"\"\n    if output is None:\n        return {\n            \"ratings\": [None] * len(input[\"generations\"]),\n            \"rationales\": [None] * len(input[\"generations\"]),\n        }\n\n    if self.use_default_structured_output:\n        return self._format_structured_output(output, input)\n\n    pattern = r\"Rating: (.+?)\\nRationale: (.+)\"\n    sections = output.split(\"\\n\\n\")\n\n    formatted_outputs = []\n    for section in sections:\n        matches = None\n        if section is not None and section != \"\":\n            matches = re.search(pattern, section, re.DOTALL)\n        if not matches:\n            formatted_outputs.append({\"ratings\": None, \"rationales\": None})\n            continue\n\n        formatted_outputs.append(\n            {\n                \"ratings\": (\n                    int(re.findall(r\"\\b\\d+\\b\", matches.group(1))[0])\n                    if matches.group(1) not in [\"None\", \"N/A\"]\n                    else None\n                ),\n                \"rationales\": matches.group(2),\n            }\n        )\n    return group_dicts(*formatted_outputs)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback._format_types_ratings_rationales_output","title":"_format_types_ratings_rationales_output(output, input)","text":"

Formats the output when the aspect is either helpfulness or truthfulness.

Source code in src/distilabel/steps/tasks/ultrafeedback.py
def _format_types_ratings_rationales_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, List[Any]]:\n    \"\"\"Formats the output when the aspect is either `helpfulness` or `truthfulness`.\"\"\"\n    if output is None:\n        return {\n            \"types\": [None] * len(input[\"generations\"]),\n            \"rationales\": [None] * len(input[\"generations\"]),\n            \"ratings\": [None] * len(input[\"generations\"]),\n            \"rationales-for-ratings\": [None] * len(input[\"generations\"]),\n        }\n\n    if self.use_default_structured_output:\n        return self._format_structured_output(output, input)\n\n    pattern = r\"Type: (.+?)\\nRationale: (.+?)\\nRating: (.+?)\\nRationale: (.+)\"\n\n    sections = output.split(\"\\n\\n\")\n\n    formatted_outputs = []\n    for section in sections:\n        matches = None\n        if section is not None and section != \"\":\n            matches = re.search(pattern, section, re.DOTALL)\n        if not matches:\n            formatted_outputs.append(\n                {\n                    \"types\": None,\n                    \"rationales\": None,\n                    \"ratings\": None,\n                    \"rationales-for-ratings\": None,\n                }\n            )\n            continue\n\n        formatted_outputs.append(\n            {\n                \"types\": (\n                    int(re.findall(r\"\\b\\d+\\b\", matches.group(1))[0])\n                    if matches.group(1) not in [\"None\", \"N/A\"]\n                    else None\n                ),\n                \"rationales\": matches.group(2),\n                \"ratings\": (\n                    int(re.findall(r\"\\b\\d+\\b\", matches.group(3))[0])\n                    if matches.group(3) not in [\"None\", \"N/A\"]\n                    else None\n                ),\n                \"rationales-for-ratings\": matches.group(4),\n            }\n        )\n    return group_dicts(*formatted_outputs)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.get_structured_output","title":"get_structured_output()","text":"

Creates the JSON schema to be passed to the LLM, to enforce generating output that can be parsed directly as a Python dictionary.

The schema corresponds to the following:

from pydantic import BaseModel\nfrom typing import List\n\nclass SchemaUltraFeedback(BaseModel):\n    ratings: List[int]\n    rationales: List[str]\n\nclass SchemaUltraFeedbackWithType(BaseModel):\n    types: List[Optional[int]]\n    ratings: List[int]\n    rationales: List[str]\n    rationales_for_rating: List[str]\n

Returns:

Type Description Dict[str, Any]

JSON Schema of the response to enforce.

Source code in src/distilabel/steps/tasks/ultrafeedback.py
@override\ndef get_structured_output(self) -> Dict[str, Any]:\n    \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n    a dictionary with the output which can be directly parsed as a python dictionary.\n\n    The schema corresponds to the following:\n\n    ```python\n    from pydantic import BaseModel\n    from typing import List\n\n    class SchemaUltraFeedback(BaseModel):\n        ratings: List[int]\n        rationales: List[str]\n\n    class SchemaUltraFeedbackWithType(BaseModel):\n        types: List[Optional[int]]\n        ratings: List[int]\n        rationales: List[str]\n        rationales_for_rating: List[str]\n    ```\n\n    Returns:\n        JSON Schema of the response to enforce.\n    \"\"\"\n    if self.aspect in [\n        \"honesty\",\n        \"instruction-following\",\n        \"overall-rating\",\n    ]:\n        return {\n            \"properties\": {\n                \"ratings\": {\n                    \"items\": {\"type\": \"integer\"},\n                    \"title\": \"Ratings\",\n                    \"type\": \"array\",\n                },\n                \"rationales\": {\n                    \"items\": {\"type\": \"string\"},\n                    \"title\": \"Rationales\",\n                    \"type\": \"array\",\n                },\n            },\n            \"required\": [\"ratings\", \"rationales\"],\n            \"title\": \"SchemaUltraFeedback\",\n            \"type\": \"object\",\n        }\n    return {\n        \"properties\": {\n            \"types\": {\n                \"items\": {\"anyOf\": [{\"type\": \"integer\"}, {\"type\": \"null\"}]},\n                \"title\": \"Types\",\n                \"type\": \"array\",\n            },\n            \"ratings\": {\n                \"items\": {\"type\": \"integer\"},\n                \"title\": \"Ratings\",\n                \"type\": \"array\",\n            },\n            \"rationales\": {\n                \"items\": {\"type\": \"string\"},\n              
  \"title\": \"Rationales\",\n                \"type\": \"array\",\n            },\n            \"rationales_for_rating\": {\n                \"items\": {\"type\": \"string\"},\n                \"title\": \"Rationales For Rating\",\n                \"type\": \"array\",\n            },\n        },\n        \"required\": [\"types\", \"ratings\", \"rationales\", \"rationales_for_rating\"],\n        \"title\": \"SchemaUltraFeedbackWithType\",\n        \"type\": \"object\",\n    }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback._format_structured_output","title":"_format_structured_output(output, input)","text":"

Parses the structured response, which should correspond to a dictionary with either ratings and rationales keys, or types, ratings, rationales and rationales_for_rating keys.

Parameters:

Name Type Description Default output str

The output from the LLM.

required

Returns:

Type Description Dict[str, Any]

Formatted output.

Source code in src/distilabel/steps/tasks/ultrafeedback.py
def _format_structured_output(\n    self, output: str, input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"Parses the structured response, which should correspond to a dictionary\n    with either `positive`, or `positive` and `negative` keys.\n\n    Args:\n        output: The output from the `LLM`.\n\n    Returns:\n        Formatted output.\n    \"\"\"\n    try:\n        return orjson.loads(output)\n    except orjson.JSONDecodeError:\n        if self.aspect in [\n            \"honesty\",\n            \"instruction-following\",\n            \"overall-rating\",\n        ]:\n            return {\n                \"ratings\": [None] * len(input[\"generations\"]),\n                \"rationales\": [None] * len(input[\"generations\"]),\n            }\n        return {\n            \"ratings\": [None] * len(input[\"generations\"]),\n            \"rationales\": [None] * len(input[\"generations\"]),\n            \"types\": [None] * len(input[\"generations\"]),\n            \"rationales-for-ratings\": [None] * len(input[\"generations\"]),\n        }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.URIAL","title":"URIAL","text":"

Bases: Task

Generates a response using a non-instruct fine-tuned model.

URIAL is a pre-defined task that generates a response using a non-instruct fine-tuned model. This task is used to generate a response based on the conversation provided as input.

Input columns
  • instruction (str, optional): The instruction to generate a response from.
  • conversation (List[Dict[str, str]], optional): The conversation to generate a response from (the last message must be from the user).
Output columns
  • generation (str): The generated response.
  • model_name (str): The name of the model used to generate the response.
Categories
  • text-generation
References
  • The Unlocking Spell on Base LLMs: Rethinking Alignment via In-Context Learning

Examples:

Generate text from an instruction:

from distilabel.models import vLLM\nfrom distilabel.steps.tasks import URIAL\n\nstep = URIAL(\n    llm=vLLM(\n        model=\"meta-llama/Meta-Llama-3.1-8B\",\n        generation_kwargs={\"temperature\": 0.7},\n    ),\n)\n\nstep.load()\n\nresults = next(\n    step.process(inputs=[{\"instruction\": \"What's the most most common type of cloud?\"}])\n)\n# [\n#     {\n#         'instruction': \"What's the most most common type of cloud?\",\n#         'generation': 'Clouds are classified into three main types, high, middle, and low. The most common type of cloud is the middle cloud.',\n#         'distilabel_metadata': {...},\n#         'model_name': 'meta-llama/Meta-Llama-3.1-8B'\n#     }\n# ]\n
Source code in src/distilabel/steps/tasks/urial.py
class URIAL(Task):\n    \"\"\"Generates a response using a non-instruct fine-tuned model.\n\n    `URIAL` is a pre-defined task that generates a response using a non-instruct fine-tuned\n    model. This task is used to generate a response based on the conversation provided as\n    input.\n\n    Input columns:\n        - instruction (`str`, optional): The instruction to generate a response from.\n        - conversation (`List[Dict[str, str]]`, optional): The conversation to generate\n            a response from (the last message must be from the user).\n\n    Output columns:\n        - generation (`str`): The generated response.\n        - model_name (`str`): The name of the model used to generate the response.\n\n    Categories:\n        - text-generation\n\n    References:\n        - [The Unlocking Spell on Base LLMs: Rethinking Alignment via In-Context Learning](https://arxiv.org/abs/2312.01552)\n\n    Examples:\n        Generate text from an instruction:\n\n        ```python\n        from distilabel.models import vLLM\n        from distilabel.steps.tasks import URIAL\n\n        step = URIAL(\n            llm=vLLM(\n                model=\"meta-llama/Meta-Llama-3.1-8B\",\n                generation_kwargs={\"temperature\": 0.7},\n            ),\n        )\n\n        step.load()\n\n        results = next(\n            step.process(inputs=[{\"instruction\": \"What's the most most common type of cloud?\"}])\n        )\n        # [\n        #     {\n        #         'instruction': \"What's the most most common type of cloud?\",\n        #         'generation': 'Clouds are classified into three main types, high, middle, and low. 
The most common type of cloud is the middle cloud.',\n        #         'distilabel_metadata': {...},\n        #         'model_name': 'meta-llama/Meta-Llama-3.1-8B'\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template for the given `aspect`.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"urial.jinja2\"\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        return {\"instruction\": False, \"conversation\": False}\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        messages = (\n            [{\"role\": \"user\", \"content\": input[\"instruction\"]}]\n            if \"instruction\" in input\n            else input[\"conversation\"]\n        )\n\n        if messages[-1][\"role\"] != \"user\":\n            raise ValueError(\"The last message must be from the user.\")\n\n        return [{\"role\": \"user\", \"content\": self._template.render(messages=messages)}]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        return [\"generation\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n    ) -> Dict[str, Any]:\n        if output is None:\n            return {\"generation\": None}\n\n        response = output.split(\"\\n\\n# User\")[0]\n        if response.startswith(\"\\n\\n\"):\n            response = response[2:]\n        response = response.strip()\n\n        return {\"generation\": response}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.URIAL.load","title":"load()","text":"

Loads the URIAL Jinja2 template (urial.jinja2).

Source code in src/distilabel/steps/tasks/urial.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template for the given `aspect`.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"urial.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.task","title":"task(inputs=None, outputs=None)","text":"

Creates a Task from a formatting output function.

Parameters:

Name Type Description Default inputs Union[StepColumns, None]

a list containing the name of the inputs columns/keys or a dictionary where the keys are the columns and the values are booleans indicating whether the column is required or not, that are required by the step. If not provided the default will be an empty list [] and it will be assumed that the step doesn't need any specific columns. Defaults to None.

None outputs Union[StepColumns, None]

a list containing the name of the outputs columns/keys or a dictionary where the keys are the columns and the values are booleans indicating whether the column will be generated or not. If not provided the default will be an empty list [] and it will be assumed that the step doesn't need any specific columns. Defaults to None.

None Source code in src/distilabel/steps/tasks/decorator.py
def task(\n    inputs: Union[\"StepColumns\", None] = None,\n    outputs: Union[\"StepColumns\", None] = None,\n) -> Callable[..., Type[\"Task\"]]:\n    \"\"\"Creates a `Task` from a formatting output function.\n\n    Args:\n        inputs: a list containing the name of the inputs columns/keys or a dictionary\n            where the keys are the columns and the values are booleans indicating whether\n            the column is required or not, that are required by the step. If not provided\n            the default will be an empty list `[]` and it will be assumed that the step\n            doesn't need any specific columns. Defaults to `None`.\n        outputs: a list containing the name of the outputs columns/keys or a dictionary\n            where the keys are the columns and the values are booleans indicating whether\n            the column will be generated or not. If not provided the default will be an\n            empty list `[]` and it will be assumed that the step doesn't need any specific\n            columns. Defaults to `None`.\n    \"\"\"\n\n    inputs = inputs or []\n    outputs = outputs or []\n\n    def decorator(func: TaskFormattingOutputFunc) -> Type[\"Task\"]:\n        doc = inspect.getdoc(func)\n        if doc is None:\n            raise DistilabelUserError(\n                \"When using the `task` decorator, including a docstring in the formatting\"\n                \" function is mandatory. 
The docstring must follow the format described\"\n                \" in the documentation.\",\n                page=\"\",\n            )\n\n        system_prompt, user_message_template = _parse_docstring(doc)\n        _validate_templates(inputs, system_prompt, user_message_template)\n\n        def inputs_property(self) -> \"StepColumns\":\n            return inputs\n\n        def outputs_property(self) -> \"StepColumns\":\n            return outputs\n\n        def format_input(self, input: Dict[str, Any]) -> \"FormattedInput\":\n            return [\n                {\"role\": \"system\", \"content\": system_prompt.format(**input)},\n                {\"role\": \"user\", \"content\": user_message_template.format(**input)},\n            ]\n\n        def format_output(\n            self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n        ) -> Dict[str, Any]:\n            return func(output, input)\n\n        return type(\n            func.__name__,\n            (Task,),\n            {\n                \"inputs\": property(inputs_property),\n                \"outputs\": property(outputs_property),\n                \"__module__\": func.__module__,\n                \"format_input\": format_input,\n                \"format_output\": format_output,\n            },\n        )\n\n    return decorator\n
"},{"location":"api/task/typing/","title":"Task Typing","text":""},{"location":"api/task/typing/#distilabel.steps.tasks.typing","title":"typing","text":""},{"location":"api/task/typing/#distilabel.steps.tasks.typing.ChatType","title":"ChatType = List[ChatItem] module-attribute","text":"

ChatType is a type alias for a list of dicts following the OpenAI conversational format.

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.StructuredOutputType","title":"StructuredOutputType = Union[OutlinesStructuredOutputType, InstructorStructuredOutputType] module-attribute","text":"

StructuredOutputType is an alias for the union of OutlinesStructuredOutputType and InstructorStructuredOutputType.

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.StandardInput","title":"StandardInput = ChatType module-attribute","text":"

StandardInput is an alias for ChatType that defines the default / standard input produced by format_input.

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.StructuredInput","title":"StructuredInput = Tuple[StandardInput, Union[StructuredOutputType, None]] module-attribute","text":"

StructuredInput defines a type produced by format_input when using either StructuredGeneration or a subclass of it.

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.FormattedInput","title":"FormattedInput = Union[StandardInput, StructuredInput] module-attribute","text":"

FormattedInput is an alias for the union of StandardInput and StructuredInput as generated by format_input and expected by the LLMs.

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.OutlinesStructuredOutputType","title":"OutlinesStructuredOutputType","text":"

Bases: TypedDict

TypedDict to represent the structured output configuration from outlines.

Source code in src/distilabel/steps/tasks/typing.py
class OutlinesStructuredOutputType(TypedDict, total=False):\n    \"\"\"TypedDict to represent the structured output configuration from `outlines`.\"\"\"\n\n    format: Literal[\"json\", \"regex\"]\n    \"\"\"One of \"json\" or \"regex\".\"\"\"\n    schema: Union[str, Type[BaseModel], Dict[str, Any]]\n    \"\"\"The schema to use for the structured output. If \"json\", it\n    can be a pydantic.BaseModel class, or the schema as a string,\n    as obtained from `model_to_schema(BaseModel)`, if \"regex\", it\n    should be a regex pattern as a string.\n    \"\"\"\n    whitespace_pattern: Optional[Union[str, List[str]]]\n    \"\"\"If \"json\" corresponds to a string or a list of\n    strings with a pattern (doesn't impact string literals).\n    For example, to allow only a single space or newline with\n    `whitespace_pattern=r\"[\\n ]?\"`\n    \"\"\"\n
"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.OutlinesStructuredOutputType.format","title":"format: Literal['json', 'regex'] instance-attribute","text":"

One of \"json\" or \"regex\".

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.OutlinesStructuredOutputType.schema","title":"schema: Union[str, Type[BaseModel], Dict[str, Any]] instance-attribute","text":"

The schema to use for the structured output. If \"json\", it can be a pydantic.BaseModel class, or the schema as a string, as obtained from model_to_schema(BaseModel); if \"regex\", it should be a regex pattern as a string.

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.OutlinesStructuredOutputType.whitespace_pattern","title":"whitespace_pattern: Optional[Union[str, List[str]]] instance-attribute","text":"

If \"json\" corresponds to a string or a list of strings with a pattern (doesn't impact string literals). For example, to allow only a single space or newline, use whitespace_pattern=r\"[\\n ]?\".

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType","title":"InstructorStructuredOutputType","text":"

Bases: TypedDict

TypedDict to represent the structured output configuration from instructor.

Source code in src/distilabel/steps/tasks/typing.py
class InstructorStructuredOutputType(TypedDict, total=False):\n    \"\"\"TypedDict to represent the structured output configuration from `instructor`.\"\"\"\n\n    format: Optional[Literal[\"json\"]]\n    \"\"\"One of \"json\".\"\"\"\n    schema: Union[Type[BaseModel], Dict[str, Any]]\n    \"\"\"The schema to use for the structured output, a `pydantic.BaseModel` class. \"\"\"\n    mode: Optional[str]\n    \"\"\"Generation mode. Take a look at `instructor.Mode` for more information, if not informed it will\n    be determined automatically. \"\"\"\n    max_retries: int\n    \"\"\"Number of times to reask the model in case of error, if not set will default to the model's default. \"\"\"\n
"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType.format","title":"format: Optional[Literal['json']] instance-attribute","text":"

One of \"json\".

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType.schema","title":"schema: Union[Type[BaseModel], Dict[str, Any]] instance-attribute","text":"

The schema to use for the structured output, a pydantic.BaseModel class.

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType.mode","title":"mode: Optional[str] instance-attribute","text":"

Generation mode. Take a look at instructor.Mode for more information; if not provided, it will be determined automatically.

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType.max_retries","title":"max_retries: int instance-attribute","text":"

Number of times to re-ask the model in case of error; if not set, it will default to the model's default.

"},{"location":"sections/community/","title":"Community","text":"

We are an open-source community-driven project not only focused on building a great product but also on building a great community, where you can get support, share your experiences, and contribute to the project! We would love to hear from you and help you get started with distilabel.

  • Discord

    In our Discord channels (#argilla-general and #argilla-help), you can get direct support from the community.

    Discord \u2197

  • Community Meetup

    We host bi-weekly community meetups where you can listen in or present your work.

    Community Meetup \u2197

  • Changelog

    The changelog is where you can find the latest updates and changes to the distilabel project.

    Changelog \u2197

  • Roadmap

    We love to discuss our plans with the community. Feel encouraged to participate in our roadmap discussions.

    Roadmap \u2197

"},{"location":"sections/community/#badges","title":"Badges","text":"

If you build something cool with distilabel consider adding one of these badges to your dataset or model card.

[<img src=\"https://raw.githubusercontent.com/argilla-io/distilabel/main/docs/assets/distilabel-badge-light.png\" alt=\"Built with Distilabel\" width=\"200\" height=\"32\"/>](https://github.com/argilla-io/distilabel)\n

[<img src=\"https://raw.githubusercontent.com/argilla-io/distilabel/main/docs/assets/distilabel-badge-dark.png\" alt=\"Built with Distilabel\" width=\"200\" height=\"32\"/>](https://github.com/argilla-io/distilabel)\n

"},{"location":"sections/community/#contribute","title":"Contribute","text":"

To contribute directly to distilabel, check our good first issues or open a new one.

"},{"location":"sections/community/contributor/","title":"How to contribute?","text":"

Thank you for investing your time in contributing to the project! Any contribution you make will be reflected in the most recent version of distilabel \ud83e\udd29.

New to contributing in general?

If you're a new contributor, read the README to get an overview of the project. In addition, here are some resources to help you get started with open-source contributions:

  • Discord: You are welcome to join the distilabel Discord community, where you can keep in touch with other users, contributors and the distilabel team. In the following section, you can find more information on how to get started in Discord.
  • Git: This is a very useful tool to keep track of the changes in your files. Using the command-line interface (CLI), you can make your contributions easily. For that, you need to have it installed and updated on your computer.
  • GitHub: It is a platform and cloud-based service that uses git and allows developers to collaborate on projects. To contribute to distilabel, you'll need to create an account. Check the Contributor Workflow with Git and Github for more info.
  • Developer Documentation: To collaborate, you'll need to set up an efficient environment. Check the Installation guide to know how to do it.
"},{"location":"sections/community/contributor/#first-contact-in-discord","title":"First Contact in Discord","text":"

Discord is a handy tool for more casual conversations and to answer day-to-day questions. As part of Hugging Face, we have set up some distilabel channels on the server. Click here to join the Hugging Face Discord community effortlessly.

When part of the Hugging Face Discord, you can select \"Channels & roles\" and select \"Argilla\" along with any of the other groups that are interesting to you. \"Argilla\" will cover anything about argilla and distilabel. You can join the following channels:

  • #argilla-distilabel-announcements: \ud83d\udce3 Stay up-to-date.
  • #argilla-distilabel-general: \ud83d\udcac For general discussions.
  • #argilla-distilabel-help: \ud83d\ude4b\u200d\u2640\ufe0f Need assistance? We're always here to help. Select the appropriate label (argilla or distilabel) for your issue and post it.

So now there is only one thing left to do: introduce yourself and talk to the community. You'll always be welcome! \ud83e\udd17\ud83d\udc4b

"},{"location":"sections/community/contributor/#contributor-workflow-with-git-and-github","title":"Contributor Workflow with Git and GitHub","text":"

If you're working with distilabel and suddenly a new idea comes to your mind or you find an issue that can be improved, it's time to actively participate and contribute to the project!

"},{"location":"sections/community/contributor/#report-an-issue","title":"Report an issue","text":"

If you spot a problem, first search to see whether an issue already exists; you can use the Label filter to narrow the results. If one exists, participate in the conversation. If it does not exist, create an issue by clicking on New Issue. This will show various templates; choose the one that best suits your issue. Once you choose one, you will need to fill it in following the guidelines. Try to be as clear as possible. In addition, you can assign yourself to the issue and add or choose the right labels. Finally, click on Submit new issue.

"},{"location":"sections/community/contributor/#work-with-a-fork","title":"Work with a fork","text":""},{"location":"sections/community/contributor/#fork-the-distilabel-repository","title":"Fork the distilabel repository","text":"

After having reported the issue, you can start working on it. For that, you will need to create a fork of the project. To do that, click on the Fork button. Now, fill in the information. Remember to uncheck the Copy develop branch only if you are going to work in or from another branch (for instance, to fix documentation, the main branch is used). Then, click on Create fork.

You will be redirected to your fork. You can see that you are in your fork because the name of the repository will be your username/distilabel, and it will indicate forked from argilla-io/distilabel.

"},{"location":"sections/community/contributor/#clone-your-forked-repository","title":"Clone your forked repository","text":"

In order to make the required adjustments, clone the forked repository to your local machine. Choose the destination folder and run the following command:

git clone https://github.com/[your-github-username]/distilabel.git\ncd distilabel\n

To keep your fork\u2019s main/develop branch up to date with our repo, add it as an upstream remote branch.

git remote add upstream https://github.com/argilla-io/distilabel.git\n
"},{"location":"sections/community/contributor/#create-a-new-branch","title":"Create a new branch","text":"

For each issue you're addressing, it's advisable to create a new branch. GitHub offers a straightforward method to streamline this process.

\u26a0\ufe0f Never work directly on the main or develop branch. Always create a new branch for your changes.

Navigate to your issue, and on the right column, select Create a branch.

After the new window pops up, the branch will be named after the issue and include a prefix such as feature/, bug/, or docs/ to facilitate quick recognition of the issue type. In the Repository destination, pick your fork ( [your-github-username]/distilabel), and then select Change branch source to specify the source branch for creating the new one. Complete the process by clicking Create branch.

\ud83e\udd14 Remember that the main branch is only used to work with the documentation. For any other changes, use the develop branch.

Now, locally, change to the new branch you just created.

git fetch origin\ngit checkout [branch-name]\n
"},{"location":"sections/community/contributor/#make-changes-and-push-them","title":"Make changes and push them","text":"

Make the changes you want in your local repository, and test that everything works and you are following the guidelines.

Once you have finished, you can check the status of your repository and synchronize with the upstream repo with the following commands:

# Check the status of your repository\ngit status\n\n# Synchronize with the upstreaming repo\ngit checkout [branch-name]\ngit rebase [default-branch]\n

If everything is right, we need to commit and push the changes to your fork. For that, run the following commands:

# Add the changes to the staging area\ngit add filename\n\n# Commit the changes by writing a proper message\ngit commit -m \"commit-message\"\n\n# Push the changes to your fork\ngit push origin [branch-name]\n

When pushing, you will be asked to enter your GitHub login credentials. Once the push is complete, all local commits will be on your GitHub repository.

"},{"location":"sections/community/contributor/#create-a-pull-request","title":"Create a pull request","text":"

Come back to GitHub, navigate to the original repository where you created your fork, and click on Compare & pull request.

First, click on compare across forks and select the right repositories and branches.

In the base repository, keep in mind that you should select either main or develop based on the modifications made. In the head repository, indicate your forked repository and the branch corresponding to the issue.

Then, fill in the pull request template. You should add a prefix to the PR name, as we did with the branch above. If you are working on a new feature, you can name your PR as feat: TITLE. If your PR consists of a solution for a bug, you can name your PR as bug: TITLE. And, if your work is for improving the documentation, you can name your PR as docs: TITLE.

In addition, on the right side, you can select a reviewer (for instance, if you discussed the issue with a member of the team) and assign the pull request to yourself. It is highly advisable to add labels to the PR as well. You can do this in the labels section, also on the right side of the screen. For instance, if you are addressing a bug, add the bug label, or if the PR is related to the documentation, add the documentation label. This way, PRs can be easily filtered.

Finally, fill in the template carefully and follow the guidelines. Remember to link the original issue and enable the checkbox to allow maintainer edits so the branch can be updated for a merge. Then, click on Create pull request.

For the PR body, ensure you give a description of what the PR contains, and add examples if possible (and if they apply to the contribution) to help with the review process. You can take a look at #PR 974 or #PR 983 for examples of typical PRs.

"},{"location":"sections/community/contributor/#review-your-pull-request","title":"Review your pull request","text":"

Once you submit your PR, a team member will review your proposal. We may ask questions, request additional information, or ask for changes to be made before a PR can be merged, either using suggested changes or pull request comments.

You can apply the changes directly through the UI (check the files changed and click on the right-corner three dots; see image below) or from your fork, and then commit them to your branch. The PR will be updated automatically, and the suggestions will appear as outdated.

If you run into any merge issues, check out this git tutorial to help you resolve merge conflicts and other issues.

"},{"location":"sections/community/contributor/#your-pr-is-merged","title":"Your PR is merged!","text":"

Congratulations \ud83c\udf89\ud83c\udf8a We thank you \ud83e\udd29

Once your PR is merged, your contributions will be publicly visible on the distilabel GitHub.

Additionally, we will include your changes in the next release based on our development branch.

"},{"location":"sections/community/contributor/#additional-resources","title":"Additional resources","text":"

Here are some helpful resources for your reference.

  • Configuring Discord, a guide to learning how to get started with Discord.
  • Pro Git, a book to learn Git.
  • Git in VSCode, a guide to learning how to easily use Git in VSCode.
  • GitHub Skills, an interactive course for learning GitHub.
"},{"location":"sections/community/developer_documentation/","title":"Developer Documentation","text":"

Thank you for investing your time in contributing to the project!

If you don't have the repository locally, and need any help, go to the contributor guide and read the contributor workflow with Git and GitHub first.

"},{"location":"sections/community/developer_documentation/#set-up-the-python-environment","title":"Set up the Python environment","text":"

To work on distilabel, you must install the package on your system.

Tip

This guide uses uv, but pip and venv can be used as well; the steps are very similar with either option.

From the root of the cloned Distilabel repository, you should move to the distilabel folder in your terminal.

cd distilabel\n
"},{"location":"sections/community/developer_documentation/#create-a-virtual-environment","title":"Create a virtual environment","text":"

The first step is to create a virtual environment to keep our dependencies isolated. Here we choose Python 3.11 (see the uv venv documentation) and then activate the environment:

uv venv .venv --python 3.11\nsource .venv/bin/activate\n
"},{"location":"sections/community/developer_documentation/#install-the-project","title":"Install the project","text":"

Installing from local (we are using uv pip):

uv pip install -e .\n

We provide extra dependencies grouped by name. Depending on the part you are working on, you may want to install some of them (take a look at pyproject.toml in the repo to see all the extra dependencies):

uv pip install -e \".[vllm,outlines]\"\n
"},{"location":"sections/community/developer_documentation/#linting-and-formatting","title":"Linting and formatting","text":"

To maintain a consistent code format, install the pre-commit hooks to run before each commit automatically (we rely heavily on ruff):

uv pip install -e \".[dev]\"\npre-commit install\n
"},{"location":"sections/community/developer_documentation/#running-tests","title":"Running tests","text":"

All the changes you add to the codebase should come with tests, either unit or integration tests, depending on the type of change, which are placed under tests/unit and tests/integration respectively.

Start by installing the tests dependencies:

uv pip install \".[tests]\"\n

Running the whole tests suite may take some time, and you will need all the dependencies installed, so just run your tests, and the whole tests suite will be run for you in the CI:

# Run specific tests\npytest tests/unit/steps/generators/test_data.py\n
"},{"location":"sections/community/developer_documentation/#set-up-the-documentation","title":"Set up the documentation","text":"

To contribute to the documentation and generate it locally, ensure you have installed the development dependencies:

uv pip install -e \".[docs]\"\n

And run the following command to create the development server with mkdocs:

mkdocs serve\n
"},{"location":"sections/community/developer_documentation/#documentation-guidelines","title":"Documentation guidelines","text":"

As mentioned, we use mkdocs to build the documentation. You can write the documentation in markdown format, and it will automatically be converted to HTML. In addition, you can include elements such as tables, tabs, images, and others, as shown in this guide. We recommend following these guidelines:

  • Use clear and concise language: Ensure the documentation is easy to understand for all users by using straightforward language and including meaningful examples. Images are not easy to maintain, so use them only when necessary and place them in the appropriate folder within the docs/assets/images directory.

  • Verify code snippets: Double-check that all code snippets are correct and runnable.

  • Review spelling and grammar: Check the spelling and grammar of the documentation.

  • Update the table of contents: If you add a new page, include it in the relevant index.md or the mkdocs.yml file.

"},{"location":"sections/community/developer_documentation/#components-gallery","title":"Components gallery","text":"

The components gallery section of the documentation is automatically generated by a custom plugin, which runs when mkdocs serve is called. This guide to the steps helps us visualize each step, along with examples of its use.

Note

Changes done to the docstrings of Steps/Tasks/LLMs won't appear in the components gallery automatically, you will have to stop the mkdocs server and run it again to see the changes, everything else is reloaded automatically.

"},{"location":"sections/community/popular_issues/","title":"Issue dashboard","text":"Most engaging open issuesLatest issues open by the communityPlanned issues for upcoming releases Rank Issue Reactions Comments 1 368 - [FEATURE] create a pipeline playground UI \ud83d\udc4d 3 \ud83d\udcac 1 2 1041 - [FEATURE] Add Offline batch generation for open models with EXXA API \ud83d\udc4d 2 \ud83d\udcac 1 3 995 - [FEATURE] mlx-lm integration \ud83d\udc4d 2 \ud83d\udcac 1 4 737 - [FEATURE] Allow FormatTextGenerationSFT to include tools/function calls in the formatted messages. \ud83d\udc4d 2 \ud83d\udcac 0 5 829 - [FEATURE] Add Callable and GlobalCallable that takes custom callable as argument \ud83d\udc4d 1 \ud83d\udcac 3 6 797 - [FEATURE] synthetic data generation for predictive NLP tasks \ud83d\udc4d 1 \ud83d\udcac 1 7 914 - [FEATURE] Use Step.resources to set tensor_parallel_size and pipeline_parallel_size in vLLM \ud83d\udc4d 1 \ud83d\udcac 0 8 839 - [REFACTOR] unify singular/plural semantic naming of columns \ud83d\udc4d 1 \ud83d\udcac 0 9 788 - [DOCS] add embedded Datasetviewer to places where data is loaded from the hub \ud83d\udc4d 1 \ud83d\udcac 0 10 588 - [FEATURE] Single request caching \ud83d\udc4d 1 \ud83d\udcac 0 Rank Issue Author 1 \ud83d\udfe2 1049 - [BUG] vLLM Task not utilizing multiple GPUs in parallel when replicas > 1 by adamlin120 2 \ud83d\udfe2 1048 - [BUG] OepnAI JSON format by tinyrolls 3 \ud83d\udfe2 1047 - Failed to load all the steps. Could not run pipeline. 
by yuqie 4 \ud83d\udfe2 1046 - [FEATURE] Compute the input/output tokens of a dataset by plaguss 5 \ud83d\udfe3 1044 - Receiving error: The number of required GPUs exceeds the total number of available GPUs in the placement group by saurabhbbjain 6 \ud83d\udfe3 1042 - CUDA_VISIBLE_DEVICES does not work with distilabel code by yuqie 7 \ud83d\udfe2 1041 - [FEATURE] Add Offline batch generation for open models with EXXA API by etiennebalit 8 \ud83d\udfe2 1030 - [FEATURE] Trim inputs by arthrod 9 \ud83d\udfe2 1025 - [FEATURE] Update outlines integration to the new version by plaguss 10 \ud83d\udfe3 1020 - [BUG] Error when wrapping the step by sdiazlor Rank Issue Milestone 1 \ud83d\udfe2 880 - [FEATURE] Add exclude_from_signature attribute 1.4.0 2 \ud83d\udfe2 771 - [FEATURE] Allow passing path to YAML file containing pipeline runtime parameters in distilabel run 1.4.0 3 \ud83d\udfe2 773 - [DOCS] Include section/guide describing pipeline patterns 1.4.0 4 \ud83d\udfe2 802 - [FEATURE] Add defaults to Steps and Tasks so they can be more easily connected 1.4.0 5 \ud83d\udfe2 662 - [FEATURE] Allow passing self to steps created with step decorator 1.4.0 6 \ud83d\udfe2 889 - [FEATURE] Replace extra_sampling_params for normal arguments in vLLM 1.4.0 7 \ud83d\udfe2 942 - [BUG] make_generator_step can fail when setting the _dataset_info internally 1.4.0 8 \ud83d\udfe2 579 - [FEATURE] Sequential execution for local pipeline 1.4.0 9 \ud83d\udfe2 944 - [FEATURE] Improve the Argilla steps 1.5.0 10 \ud83d\udfe2 738 - [FEATURE] Update LLM.generate interface to allow returning arbitrary/extra stuff related to the generation 1.5.0

Last update: 2024-11-07

"},{"location":"sections/getting_started/faq/","title":"Frequently Asked Questions (FAQ)","text":"How can I rename the columns in a batch?

Every Step has both input_mappings and output_mappings attributes that can be used to rename the columns in each batch.

But input_mappings will only map, meaning that if you have a batch with the column A and you want to rename it to B, you should use input_mappings={\"A\": \"B\"}, but that will only be applied to that specific Step meaning that the next step in the pipeline will still have the column A instead of B.

While output_mappings will indeed apply the rename, meaning that if the Step produces the column A and you want to rename to B, you should use output_mappings={\"A\": \"B\"}, and that will be applied to the next Step in the pipeline.

Will the API Keys be exposed when sharing the pipeline?

No, those will be masked out using pydantic.SecretStr, meaning that those won't be exposed when sharing the pipeline.

This also means that if you want to re-run your own pipeline and the API keys have not been provided via environment variable but either via an attribute or runtime parameter, you will need to provide them again.

Does it work for Windows?

Yes, but you may need to set the multiprocessing context in advance to ensure that the spawn method is used since the default method fork is not available on Windows.

import multiprocessing as mp\n\nmp.set_start_method(\"spawn\")\n
Will the custom Steps / Tasks / LLMs be serialized too?

No, at the moment, only the references to the classes within the distilabel library will be serialized, meaning that if you define a custom class used within the pipeline, the serialization won't break, but the deserialization will fail since the class won't be available unless used from the same file.

What happens if Pipeline.run fails? Do I lose all the data?

No. We use a cache mechanism to store all the intermediate results on disk, so if a Step fails, the pipeline can be re-run from that point without losing the data, provided that nothing is changed in the Pipeline.

All the data will be stored in .cache/distilabel, but the only data that will persist at the end of the Pipeline.run execution is the one from the leaf step/s, so bear that in mind.

For more information on the caching mechanism in distilabel, you can check the Learn - Advanced - Caching section.

Also, note that when running a Step or a Task standalone, the cache mechanism won't be used, so if you want to use that, you should use the Pipeline context manager.

How can I use the same LLM across several tasks without having to load it several times?

You can serve the LLM using a solution like TGI or vLLM, and then connect to it using an AsyncLLM client like InferenceEndpointsLLM or OpenAILLM. Please refer to Serving LLMs guide for more information.

Can distilabel be used with OpenAI Batch API?

Yes, distilabel is integrated with OpenAI Batch API via OpenAILLM. Check LLMs - Offline Batch Generation for a small example on how to use it and Advanced - Offline Batch Generation for a more detailed guide.

Prevent overloads on Free Serverless Endpoints

When running a task using the InferenceEndpointsLLM with Free Serverless Endpoints, you may face errors such as Model is overloaded if you leave the batch size at the default (set at 50). To fix the issue, lower the value or, even better, set input_batch_size=1 in your task. It may take longer to finish, but please remember this is a free service.

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps import TextGeneration\n\nTextGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=1\n)\n
"},{"location":"sections/getting_started/installation/","title":"Installation","text":"

You will need to have at least Python 3.9 or higher, up to Python 3.12, since support for the latter is still a work in progress.

To install the latest release of the package from PyPI you can use the following command:

pip install distilabel --upgrade\n

Alternatively, you may also want to install it from source i.e. the latest unreleased version, you can use the following command:

pip install \"distilabel @ git+https://github.com/argilla-io/distilabel.git@develop\" --upgrade\n

Note

We are installing from develop since that's the branch we use to collect all the features, bug fixes, and improvements that will be part of the next release. If you want to install from a specific branch, you can replace develop with the branch name.

"},{"location":"sections/getting_started/installation/#extras","title":"Extras","text":"

Additionally, as part of distilabel some extra dependencies are available, mainly to add support for some of the LLM integrations we support. Here's a list of the available extras:

"},{"location":"sections/getting_started/installation/#llms","title":"LLMs","text":"
  • anthropic: for using models available in Anthropic API via the AnthropicLLM integration.

  • argilla: for exporting the generated datasets to Argilla.

  • cohere: for using models available in Cohere via the CohereLLM integration.

  • groq: for using models available in Groq using groq Python client via the GroqLLM integration.

  • hf-inference-endpoints: for using the Hugging Face Inference Endpoints via the InferenceEndpointsLLM integration.

  • hf-transformers: for using models available in transformers package via the TransformersLLM integration.

  • litellm: for using LiteLLM to call any LLM using OpenAI format via the LiteLLM integration.

  • llama-cpp: for using llama-cpp-python Python bindings for llama.cpp via the LlamaCppLLM integration.

  • mistralai: for using models available in Mistral AI API via the MistralAILLM integration.

  • ollama: for using Ollama and their available models via OllamaLLM integration.

  • openai: for using OpenAI API models via the OpenAILLM integration, or the rest of the integrations based on OpenAI and relying on its client as AnyscaleLLM, AzureOpenAILLM, and TogetherLLM.

  • vertexai: for using Google Vertex AI proprietary models via the VertexAILLM integration.

  • vllm: for using vllm serving engine via the vLLM integration.

  • sentence-transformers: for generating sentence embeddings using sentence-transformers.

"},{"location":"sections/getting_started/installation/#data-processing","title":"Data processing","text":"
  • ray: for scaling and distributing a pipeline with Ray.

  • faiss-cpu and faiss-gpu: for generating sentence embeddings using faiss.

  • minhash: for using minhash for duplicate detection with datasketch and nltk.

  • text-clustering: for using text clustering with UMAP and Scikit-learn.

"},{"location":"sections/getting_started/installation/#structured-generation","title":"Structured generation","text":"
  • outlines: for using structured generation of LLMs with outlines.

  • instructor: for using structured generation of LLMs with Instructor.

"},{"location":"sections/getting_started/installation/#recommendations-notes","title":"Recommendations / Notes","text":"

The mistralai dependency requires Python 3.9 or higher, so if you're willing to use the distilabel.models.llms.MistralLLM implementation, you will need to have Python 3.9 or higher.

In some cases like transformers and vllm, the installation of flash-attn is recommended if you are using a GPU accelerator since it will speed up the inference process, but the installation needs to be done separately, as it's not included in the distilabel dependencies.

pip install flash-attn --no-build-isolation\n

Also, if you want to use the llama-cpp-python integration for running local LLMs, note that the installation process may get a bit trickier depending on which OS you are using, so we recommend reading through the Installation section in their docs.

"},{"location":"sections/getting_started/quickstart/","title":"Quickstart","text":""},{"location":"sections/getting_started/quickstart/#quickstart","title":"Quickstart","text":"

Distilabel provides all the tools you need to build scalable and reliable pipelines for synthetic data generation and AI-feedback. Pipelines are used to generate data, evaluate models, manipulate data, or any other general task. They are made up of different components: Steps, Tasks and LLMs, which are chained together in a directed acyclic graph (DAG).

  • Steps: These are the building blocks of your pipeline. Normal steps are used for basic executions like loading data, applying some transformations, or any other general task.
  • Tasks: These are steps that rely on LLMs and prompts to perform generative tasks. For example, they can be used to generate data, evaluate models or manipulate data.
  • LLMs: These are the models that will perform the task. They can be local or remote models, and open-source or commercial models.

Pipelines are designed to be scalable and reliable. They can be executed in a distributed manner, and they can be cached and recovered. This is useful when dealing with large datasets or when you want to ensure that your pipeline is reproducible.

Besides that, pipelines are designed to be modular and flexible. You can easily add new steps, tasks, or LLMs to your pipeline, and you can also easily modify or remove them. An example architecture of a pipeline to generate a dataset of preferences is the following:

"},{"location":"sections/getting_started/quickstart/#installation","title":"Installation","text":"

To install the latest release with hf-inference-endpoints extra of the package from PyPI you can use the following command:

pip install distilabel[hf-inference-endpoints] --upgrade\n
"},{"location":"sections/getting_started/quickstart/#define-a-pipeline","title":"Define a pipeline","text":"

In this guide we will walk you through the process of creating a simple pipeline that uses the InferenceEndpointsLLM class to generate text. The Pipeline will load a dataset that contains a column named prompt from the Hugging Face Hub via the step LoadDataFromHub and then use the InferenceEndpointsLLM class to generate text based on the dataset using the TextGeneration task.

You can check the available models in the Hugging Face Model Hub and filter by Inference status.

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(  # (1)\n    name=\"simple-text-generation-pipeline\",\n    description=\"A simple text generation pipeline\",\n) as pipeline:  # (2)\n    load_dataset = LoadDataFromHub(  # (3)\n        output_mappings={\"prompt\": \"instruction\"},\n    )\n\n    text_generation = TextGeneration(  # (4)\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n        ),  # (5)\n        system_prompt=\"You are a creative AI Assistant writer.\",\n        template=\"Follow the following instruction: {{ instruction }}\"  # (6)\n    )\n\n    load_dataset >> text_generation  # (7)\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(  # (8)\n        parameters={\n            load_dataset.name: {\n                \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n                \"split\": \"test\",\n            },\n            text_generation.name: {\n                \"llm\": {\n                    \"generation_kwargs\": {\n                        \"temperature\": 0.7,\n                        \"max_new_tokens\": 512,\n                    }\n                }\n            },\n        },\n    )\n    distiset.push_to_hub(repo_id=\"distilabel-example\")  # (9)\n
  1. We define a Pipeline with the name simple-text-generation-pipeline and a description A simple text generation pipeline. Note that the name is mandatory and will be used to calculate the cache signature path, so changing the name will change the cache path and will be identified as a different pipeline.

  2. We are using the Pipeline context manager, meaning that every Step subclass that is defined within the context manager will be added to the pipeline automatically.

  3. We define a LoadDataFromHub step named load_dataset that will load a dataset from the Hugging Face Hub, as provided via runtime parameters in the pipeline.run method below, but it can also be defined within the class instance via the arg repo_id=.... This step will produce output batches with the rows from the dataset, and the column prompt will be mapped to the instruction field.

  4. We define a TextGeneration task named text_generation that will generate text based on the instruction field from the dataset. This task will use the InferenceEndpointsLLM class with the model Meta-Llama-3.1-8B-Instruct.

  5. We define the InferenceEndpointsLLM class with the model Meta-Llama-3.1-8B-Instruct that will be used by the TextGeneration task. In this case, since the InferenceEndpointsLLM is used, we assume that the HF_TOKEN environment variable is set.

  6. Both system_prompt and template are optional fields. The template must be provided as a string following the Jinja2 template format, and the fields that appear there (\"instruction\" in this case, which corresponds to the default) must be listed in the columns attribute. The component gallery for TextGeneration has examples to get you started.

  7. We connect the load_dataset step to the text_generation task using the rshift operator, meaning that the output from the load_dataset step will be used as input for the text_generation task.

  8. We run the pipeline with the parameters for the load_dataset and text_generation steps. The load_dataset step will use the repository distilabel-internal-testing/instruction-dataset-mini and the test split, and the text_generation task will use the generation_kwargs with the temperature set to 0.7 and the max_new_tokens set to 512.

  9. Optionally, we can push the generated Distiset to the Hugging Face Hub repository distilabel-example. This will allow you to share the generated dataset with others and use it in other pipelines.

"},{"location":"sections/how_to_guides/","title":"How-to guides","text":"

Welcome to the how-to guides section! Here you will find a collection of guides that will help you get started with Distilabel. We have divided the guides into two categories: basic and advanced. The basic guides will help you get started with the core concepts of Distilabel, while the advanced guides will help you explore more advanced features.

"},{"location":"sections/how_to_guides/#basic","title":"Basic","text":"
  • Define Steps for your Pipeline

    Steps are the building blocks of your pipeline. They can be used to generate data, evaluate models, manipulate data, or any other general task.

    Define Steps

  • Define Tasks that rely on LLMs

    Tasks are a specific type of step that rely on Language Models (LLMs) to generate data.

    Define Tasks

  • Define LLMs as local or remote models

    LLMs are the core of your tasks. They are used to integrate with local models or remote APIs.

    Define LLMs

  • Execute Steps and Tasks in a Pipeline

    Pipeline is where you put all your steps and tasks together to create a workflow.

    Execute Pipeline

"},{"location":"sections/how_to_guides/#advanced","title":"Advanced","text":"
  • Using the Distiset dataset object

    Distiset is a dataset object based on the datasets library that can be used to store and manipulate data.

    Distiset

  • Export data to Argilla

    Argilla is a platform that can be used to store, search, and apply feedback to datasets. Argilla

  • Using a file system to pass data of batches between steps

    File system can be used to pass data between steps in a pipeline.

    File System

  • Using CLI to explore and re-run existing Pipelines

    CLI can be used to explore and re-run existing pipelines through the command line.

    CLI

  • Cache and recover pipeline executions

    Caching can be used to recover pipeline executions to avoid losing data and precious LLM calls.

    Caching

  • Structured data generation

    Structured data generation can be used to generate data with a specific structure like JSON, function calls, etc.

    Structured Generation

  • Serving an LLM for sharing it between several tasks

    Serve an LLM via TGI or vLLM to make requests and connect using a client like InferenceEndpointsLLM or OpenAILLM to avoid wasting resources.

    Sharing an LLM across tasks

  • Impose requirements to your pipelines and steps

    Add requirements to steps in a pipeline to ensure they are installed and avoid errors.

    Pipeline requirements

"},{"location":"sections/how_to_guides/advanced/argilla/","title":"Export data to Argilla","text":"

Being able to export the generated synthetic datasets to Argilla is a core feature within distilabel. We believe in the potential of synthetic data, but without removing the impact a human annotator or group of annotators can bring. To that end, the Argilla integration makes it straightforward to push a dataset to Argilla while the Pipeline is running, so that you can follow the generation process in Argilla's UI and annotate the records on the fly. One can include a Step within the Pipeline to easily export the datasets to Argilla with a pre-defined configuration, suiting the annotation purposes.

Before using any of the steps about to be described below, you should first have an Argilla instance up and running, so that you can successfully upload the data to Argilla. In order to deploy Argilla, the easiest and most straightforward way is to deploy it via the Argilla Template in Hugging Face Spaces as simply as following the steps there, or just via the following button:

"},{"location":"sections/how_to_guides/advanced/argilla/#text-generation","title":"Text Generation","text":"

For text generation scenarios, i.e. when the Pipeline contains a single TextGeneration step, we have designed the task TextGenerationToArgilla, which will seamlessly push the generated data to Argilla, and allow the annotator to review the records.

The dataset will be pushed with the following configuration:

  • Fields: instruction and generation, both being fields of type argilla.TextField, plus the automatically generated id for the given instruction to be able to search for other records with the same instruction in the dataset. The field instruction must always be a string, while the field generation can either be a single string or a list of strings (useful when there are multiple parent nodes of type TextGeneration); even though each record will always contain at most one instruction-generation pair.

  • Questions: quality will be the only question for the annotators to answer, i.e., to annotate, and it will be an argilla.LabelQuestion referring to the quality of the provided generation for the given instruction. It can be annotated as either \ud83d\udc4e (bad) or \ud83d\udc4d (good).

Note

The TextGenerationToArgilla step will only work as is if the Pipeline contains one or multiple TextGeneration steps, or if the columns instruction and generation are available within the batch data. Otherwise, the variable input_mappings will need to be set so that either both or one of instruction and generation are mapped to one of the existing columns in the batch data.

from distilabel.models import OpenAILLM\nfrom distilabel.steps import LoadDataFromDicts, TextGenerationToArgilla\nfrom distilabel.steps.tasks import TextGeneration\n\n\nwith Pipeline(name=\"my-pipeline\") as pipeline:\n    load_dataset = LoadDataFromDicts(\n        name=\"load_dataset\",\n        data=[\n            {\n                \"instruction\": \"Write a short story about a dragon that saves a princess from a tower.\",\n            },\n        ],\n    )\n\n    text_generation = TextGeneration(\n        name=\"text_generation\",\n        llm=OpenAILLM(model=\"gpt-4\"),\n    )\n\n    to_argilla = TextGenerationToArgilla(\n        dataset_name=\"my-dataset\",\n        dataset_workspace=\"admin\",\n        api_url=\"<ARGILLA_API_URL>\",\n        api_key=\"<ARGILLA_API_KEY>\",\n    )\n\n    load_dataset >> text_generation >> to_argilla\n\npipeline.run()\n

"},{"location":"sections/how_to_guides/advanced/argilla/#preference","title":"Preference","text":"

For preference scenarios, i.e. when the Pipeline contains multiple TextGeneration steps, we have designed the task PreferenceToArgilla, which will seamlessly push the generated data to Argilla, and allow the annotator to review the records.

The dataset will be pushed with the following configuration:

  • Fields: instruction and generations, both being fields of type argilla.TextField, plus the automatically generated id for the given instruction to be able to search for other records with the same instruction in the dataset. The field instruction must always be a string, while the field generations must be a list of strings, containing the generated texts for the given instruction so that at least there are two generations to compare. Other than that, the number of generation fields within each record in Argilla will be defined by the value of the variable num_generations to be provided in the PreferenceToArgilla step.

  • Questions: rating and rationale will be the pairs of questions to be defined per each generation i.e. per each value within the range from 0 to num_generations, and those will be of types argilla.RatingQuestion and argilla.TextQuestion, respectively. Note that only the first pair of questions will be mandatory, since only one generation is ensured to be within the batch data. Additionally, note that the provided ratings will range from 1 to 5, since Argilla only supports values above 0.

Note

The PreferenceToArgilla step will only work if the Pipeline contains multiple TextGeneration steps, or if the columns instruction and generations are available within the batch data. Otherwise, the variable input_mappings will need to be set so that either both or one of instruction and generations are mapped to one of the existing columns in the batch data.

Note

Additionally, if the Pipeline contains an UltraFeedback step, the ratings and rationales will also be available and be automatically injected as suggestions to the existing dataset.

from distilabel.models import OpenAILLM\nfrom distilabel.steps import LoadDataFromDicts, PreferenceToArgilla\nfrom distilabel.steps.tasks import TextGeneration\n\n\nwith Pipeline(name=\"my-pipeline\") as pipeline:\n    load_dataset = LoadDataFromDicts(\n        name=\"load_dataset\",\n        data=[\n            {\n                \"instruction\": \"Write a short story about a dragon that saves a princess from a tower.\",\n            },\n        ],\n    )\n\n    text_generation = TextGeneration(\n        name=\"text_generation\",\n        llm=OpenAILLM(model=\"gpt-4\"),\n        num_generations=4,\n        group_generations=True,\n    )\n\n    to_argilla = PreferenceToArgilla(\n        dataset_name=\"my-dataset\",\n        dataset_workspace=\"admin\",\n        api_url=\"<ARGILLA_API_URL>\",\n        api_key=\"<ARGILLA_API_KEY>\",\n        num_generations=4,\n    )\n\n    load_dataset >> text_generation >> to_argilla\n\nif __name__ == \"__main__\":\n    pipeline.run()\n

"},{"location":"sections/how_to_guides/advanced/assigning_resources_to_step/","title":"Assigning resources to a Step","text":"

When dealing with complex pipelines that get executed in a distributed environment with abundant resources (CPUs and GPUs), sometimes it's necessary to allocate these resources judiciously among the Steps. This is why distilabel allows you to specify the number of replicas, cpus and gpus for each Step. Let's see that with an example:

from distilabel.pipeline import Pipeline\nfrom distilabel.models import vLLM\nfrom distilabel.steps import StepResources\nfrom distilabel.steps.tasks import PrometheusEval\n\n\nwith Pipeline(name=\"resources\") as pipeline:\n    ...\n\n    prometheus = PrometheusEval(\n        llm=vLLM(\n            model=\"prometheus-eval/prometheus-7b-v2.0\",\n            chat_template=\"[INST] {{ messages[0]['content'] }}\\\\n{{ messages[1]['content'] }}[/INST]\",\n        ),\n        resources=StepResources(replicas=2, cpus=1, gpus=1)\n        mode=\"absolute\",\n        rubric=\"factual-validity\",\n        reference=False,\n        num_generations=1,\n        group_generations=False,\n    )\n

In the example above, we're creating a PrometheusEval task (remember that Tasks are Steps) that will use vLLM to serve prometheus-eval/prometheus-7b-v2.0 model. This task is resource intensive as it requires an LLM, which in turn requires a GPU to run fast. With that in mind, we have specified the resources required for the task using the StepResources class, and we have defined that we need 1 GPU and 1 CPU per replica of the task. In addition, we have defined that we need 2 replicas i.e. we will run two instances of the task so the computation for the whole dataset runs faster. In addition, StepResources uses the RuntimeParametersMixin, so we can also specify the resources for each step when running the pipeline:

...\n\nif __name__ == \"__main__\":\n    pipeline.run(\n        parameters={\n            prometheus.name: {\"resources\": {\"replicas\": 2, \"cpus\": 1, \"gpus\": 1}}\n        }\n    )\n

And that's it! When running the pipeline, distilabel will create the tasks in nodes that have the specified resources available.

"},{"location":"sections/how_to_guides/advanced/caching/","title":"Pipeline cache","text":"

distilabel will automatically save all the intermediate outputs generated by each Step of a Pipeline, so these outputs can be reused to recover the state of a pipeline execution that was stopped before finishing or to not have to re-execute steps from a pipeline after adding a new downstream step.

"},{"location":"sections/how_to_guides/advanced/caching/#how-to-enabledisable-the-cache","title":"How to enable/disable the cache","text":"

The use of the cache can be toggled using the use_cache parameter of the Pipeline.run method. If True, then distilabel will reuse the outputs of previous executions for the new execution. If False, then distilabel will re-execute all the steps of the pipeline to generate new outputs for all the steps.

with Pipeline(name=\"my-pipeline\") as pipeline:\n    ...\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(use_cache=False)  # (1)\n
  1. Pipeline cache is disabled

In addition, the cache can be enabled/disabled at Step level using its use_cache attribute. If True, then the outputs of the step will be reused in the new pipeline execution. If False, then the step will be re-executed to generate new outputs. If the cache of one step is disabled and the outputs have to be regenerated, then the outputs of the steps that depend on this step will also be regenerated.

with Pipeline(name=\"writting-assistant\") as pipeline:\n    load_data = LoadDataFromDicts(\n        data=[\n            {\n                \"instruction\": \"How much is 2+2?\"\n            }\n        ]\n    )\n\n    generation = TextGeneration(\n        llm=InferenceEndpointsLLM(\n            model_id=\"Qwen/Qwen2.5-72B-Instruct\",\n            generation_kwargs={\n                \"temperature\": 0.8,\n                \"max_new_tokens\": 512,\n            },\n        ),\n        use_cache=False  # (1)\n    )\n\n    load_data >> generation\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run()\n
  1. Step cache is disabled and every time the pipeline is executed, this step will be re-executed
"},{"location":"sections/how_to_guides/advanced/caching/#how-a-cache-hit-is-triggered","title":"How a cache hit is triggered","text":"

distilabel groups information and data generated by a Pipeline using the name of the pipeline, so the first factor that triggers a cache hit is the name of the pipeline. The second factor is the Pipeline.signature property. This property returns a hash that is generated using the names of the steps used in the pipeline and their connections. The third factor is the Pipeline.aggregated_steps_signature property, which is used to determine if the new pipeline execution is exactly the same as one of the previous ones, i.e. the pipeline contains exactly the same steps, with exactly the same connections, and the steps are using exactly the same parameters. If these three factors are met, then the cache hit is triggered and the pipeline won't get re-executed; instead, the function create_distiset will be used to create the resulting Distiset using the outputs of the previous execution, as can be seen in the following image:

If the new pipeline execution has a different Pipeline.aggregated_steps_signature, i.e. at least one step has changed its parameters, distilabel will reuse the outputs of the steps that have not changed and re-execute the steps that have changed, as can be seen in the following image:

The same pipeline from above gets executed a third time, but this time the last step text_generation_1 changed, so it needs to be re-executed. The other steps, as they have not been changed, don't need to be re-executed and their outputs are reused.

"},{"location":"sections/how_to_guides/advanced/distiset/","title":"Using the Distiset dataset object","text":"

A Pipeline in distilabel returns a special type of Hugging Face datasets.DatasetDict which is called Distiset.

The Distiset is a dictionary-like object that contains the different configurations generated by the Pipeline, where each configuration corresponds to each leaf step in the DAG built by the Pipeline. Each configuration corresponds to a different subset of the dataset. This is a concept taken from \ud83e\udd17 datasets that lets you upload different configurations of the same dataset within the same repository and can contain different columns i.e. different configurations, which can be seamlessly pushed to the Hugging Face Hub.

Below you can find an example of how to create a Distiset object that resembles a datasets.DatasetDict:

from datasets import Dataset\nfrom distilabel.distiset import Distiset\n\ndistiset = Distiset(\n    {\n        \"leaf_step_1\": Dataset.from_dict({\"instruction\": [1, 2, 3]}),\n        \"leaf_step_2\": Dataset.from_dict(\n            {\"instruction\": [1, 2, 3, 4], \"generation\": [5, 6, 7, 8]}\n        ),\n    }\n)\n

Note

If there's only one leaf node, i.e., only one step at the end of the Pipeline, then the configuration name won't be the name of the last step, but it will be set to \"default\" instead, as that's more aligned with standard datasets within the Hugging Face Hub.

"},{"location":"sections/how_to_guides/advanced/distiset/#distiset-methods","title":"Distiset methods","text":"

We can interact with the different pieces generated by the Pipeline and treat them as different configurations. The Distiset contains just two methods:

"},{"location":"sections/how_to_guides/advanced/distiset/#traintest-split","title":"Train/Test split","text":"

Create a train/test split partition of the dataset for the different configurations or subsets.

>>> distiset.train_test_split(train_size=0.9)\nDistiset({\n    leaf_step_1: DatasetDict({\n        train: Dataset({\n            features: ['instruction'],\n            num_rows: 2\n        })\n        test: Dataset({\n            features: ['instruction'],\n            num_rows: 1\n        })\n    })\n    leaf_step_2: DatasetDict({\n        train: Dataset({\n            features: ['instruction', 'generation'],\n            num_rows: 3\n        })\n        test: Dataset({\n            features: ['instruction', 'generation'],\n            num_rows: 1\n        })\n    })\n})\n
"},{"location":"sections/how_to_guides/advanced/distiset/#push-to-hugging-face-hub","title":"Push to Hugging Face Hub","text":"

Push the Distiset to a Hugging Face repository, where each one of the subsets will correspond to a different configuration:

distiset.push_to_hub(\n    \"my-org/my-dataset\",\n    commit_message=\"Initial commit\",\n    private=False,\n    token=os.getenv(\"HF_TOKEN\"),\n    generate_card=True,\n    include_script=False\n)\n

New since version 1.3.0

Since version 1.3.0 you can automatically push the script that created your pipeline to the same repository. For example, assuming you have a file like the following:

sample_pipe.py
with Pipeline() as pipe:\n    ...\ndistiset = pipe.run()\ndistiset.push_to_hub(\n    \"my-org/my-dataset,\n    include_script=True\n)\n

After running the command, you could visit the repository and the file sample_pipe.py will be stored to simplify sharing your pipeline with the community.

"},{"location":"sections/how_to_guides/advanced/distiset/#custom-docstrings","title":"Custom Docstrings","text":"

distilabel contains a custom plugin to automatically generate a gallery for the different components. The information is extracted by parsing the Step's docstrings. You can take a look at the docstrings in the source code of the UltraFeedback, and take a look at the corresponding entry in the components gallery to see an example of how the docstrings are rendered.

If you create your own components and want the Citations automatically rendered in the README card (in case you are sharing your final distiset in the Hugging Face Hub), you may want to add the citation section. This is an example for the MagpieGenerator Task:

class MagpieGenerator(GeneratorTask, MagpieBase):\n    r\"\"\"Generator task the generates instructions or conversations using Magpie.\n    ...\n\n    Citations:\n\n        ```\n        @misc{xu2024magpiealignmentdatasynthesis,\n            title={Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing},\n            author={Zhangchen Xu and Fengqing Jiang and Luyao Niu and Yuntian Deng and Radha Poovendran and Yejin Choi and Bill Yuchen Lin},\n            year={2024},\n            eprint={2406.08464},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2406.08464},\n        }\n        ```\n    \"\"\"\n

The Citations section can include any number of bibtex references. To define them, you can add as many elements as needed just like in the example: each citation will be a block of the form: ```@misc{...}```. This information will be automatically used in the README of your Distiset if you decide to call distiset.push_to_hub. Alternatively, if the Citations section is not found, but the References section contains any URLs pointing to https://arxiv.org/, we will try to obtain the Bibtex equivalent automatically. This way, Hugging Face can automatically track the paper for you and it's easier to find other datasets citing the same paper, or to directly visit the paper page.

"},{"location":"sections/how_to_guides/advanced/distiset/#save-and-load-from-disk","title":"Save and load from disk","text":"

Take into account that these methods work as datasets.load_from_disk and datasets.Dataset.save_to_disk so the arguments are directly passed to those methods. This means you can also make use of storage_options argument to save your Distiset in your cloud provider, including the distilabel artifacts (pipeline.yaml, pipeline.log and the README.md with the dataset card). You can read more in datasets documentation here.

Save to diskLoad from disk (local)Load from disk (cloud)

Save the Distiset to disk, and optionally (will be done by default) saves the dataset card, the pipeline config file and logs:

distiset.save_to_disk(\n    \"my-dataset\",\n    save_card=True,\n    save_pipeline_config=True,\n    save_pipeline_log=True\n)\n

Load a Distiset that was saved using Distiset.save_to_disk just the same way:

distiset = Distiset.load_from_disk(\"my-dataset\")\n

Load a Distiset from a remote location, like S3, GCS. You can pass the storage_options argument to authenticate with the cloud provider:

distiset = Distiset.load_from_disk(\n    \"s3://path/to/my_dataset\",  # gcs:// or any filesystem tolerated by fsspec\n    storage_options={\n        \"key\": os.environ[\"S3_ACCESS_KEY\"],\n        \"secret\": os.environ[\"S3_SECRET_KEY\"],\n        ...\n    }\n)\n

Take a look at the remaining arguments at Distiset.save_to_disk and Distiset.load_from_disk.

"},{"location":"sections/how_to_guides/advanced/distiset/#dataset-card","title":"Dataset card","text":"

Having this special type of dataset comes with an added advantage when calling Distiset.push_to_hub, which is the automatically generated dataset card in the Hugging Face Hub. Note that it is enabled by default, but can be disabled by setting generate_card=False:

distiset.push_to_hub(\"my-org/my-dataset\", generate_card=True)\n

We will have an automatic dataset card (an example can be seen here) with some handy information like reproducing the Pipeline with the CLI, or examples of the records from the different subsets.

"},{"location":"sections/how_to_guides/advanced/distiset/#create_distiset-helper","title":"create_distiset helper","text":"

Lastly, we presented the create_distiset function in the caching section. You can take a look at that section to see how to create a Distiset from the cache folder, using the helper function to automatically include all the relevant data.

"},{"location":"sections/how_to_guides/advanced/fs_to_pass_data/","title":"Using a file system to pass data of batches between steps","text":"

In some situations, it can happen that the batches contain so much data that it is faster to write it to disk and read it back in the next step, instead of passing it using the queue. To solve this issue, distilabel uses fsspec to allow providing a file system configuration and specifying whether this file system should be used to pass data between steps in the run method of the distilabel pipelines:

Warning

In order to use a specific file system/cloud storage, you will need to install the specific package providing the fsspec implementation for that file system. For instance, to use Google Cloud Storage you will need to install gcsfs:

pip install gcsfs\n

Check the available implementations: fsspec - Other known implementations

from distilabel.pipeline import Pipeline\n\nwith Pipeline(name=\"my-pipeline\") as pipeline:\n  ...\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        ..., \n        storage_parameters={\"path\": \"gcs://my-bucket\"},\n        use_fs_to_pass_data=True\n    )\n

The code above sets up a file system (in this case Google Cloud Storage) and sets the flag use_fs_to_pass_data to specify that the data of the batches should be passed to the steps using the file system. The storage_parameters argument is optional, and in case it's not provided but use_fs_to_pass_data==True, distilabel will use the local file system.

Note

As GlobalSteps receive all the data from the previous steps in one single batch accumulating all the data, it's very likely that the data of the batch will be too big to be passed using the queue. In this case and even if use_fs_to_pass_data==False, distilabel will use the file system to pass the data to the GlobalStep.

"},{"location":"sections/how_to_guides/advanced/offline_batch_generation/","title":"Offline Batch Generation","text":"

The offline batch generation is a feature that some LLMs implemented in distilabel offer, allowing you to send the inputs to an LLM-as-a-service platform and wait for the outputs in an asynchronous manner. LLM-as-a-service platforms offer this feature as it allows them to gather many inputs and create batches as big as the hardware allows, maximizing the hardware utilization and reducing the cost of the service. In exchange, the user has to wait a certain time for the outputs to be ready, but the cost per token is usually much lower.

distilabel pipelines are able to handle LLMs that offer this feature in the following way:

  • The first time the pipeline gets executed, the LLM will send the inputs to the platform. The platform will return jobs ids that can be used later to check the status of the jobs and retrieve the results. The LLM will save these jobs ids in its jobs_ids attribute and raise a special exception DistilabelOfflineBatchGenerationNotFinishedException that will be handled by the Pipeline. The jobs ids will be saved in the pipeline cache, so they can be used in subsequent calls.
  • The second time and subsequent calls will recover the pipeline execution and the LLM won't send the inputs again to the platform. This time, as it has the jobs_ids, it will check if the jobs have finished, and if they have then it will retrieve the results and return the outputs. If they haven't finished, then it will raise DistilabelOfflineBatchGenerationNotFinishedException again.
  • In addition, LLMs with offline batch generation can be specified to do polling until the jobs have finished, blocking the pipeline until they are done. If for some reason the polling needs to be stopped, one can press Ctrl+C or Cmd+C depending on your OS (or send a SIGINT to the main process) which will stop the polling and raise DistilabelOfflineBatchGenerationNotFinishedException that will be handled by the pipeline as described above.

Warning

In order to recover the pipeline execution and retrieve the results, the pipeline cache must be enabled. If the pipeline cache is disabled, then it will send the inputs again and create different jobs, incurring extra costs.

"},{"location":"sections/how_to_guides/advanced/offline_batch_generation/#example-pipeline-using-openaillm-with-offline-batch-generation","title":"Example pipeline using OpenAILLM with offline batch generation","text":"
from distilabel.models import OpenAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline() as pipeline:\n    load_data = LoadDataFromHub(output_mappings={\"prompt\": \"instruction\"})\n\n    text_generation = TextGeneration(\n        llm=OpenAILLM(\n            model=\"gpt-3.5-turbo\",\n            use_offline_batch_generation=True,  # (1)\n        )\n    )\n\n    load_data >> text_generation\n\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        parameters={\n            load_data.name: {\n                \"repo_id\": \"distilabel-internal-testing/instruction-dataset\",\n                \"split\": \"test\",\n                \"batch_size\": 500,\n            },\n        }\n    )\n
  1. Indicate that the OpenAILLM should use offline batch generation.
"},{"location":"sections/how_to_guides/advanced/pipeline_requirements/","title":"Add requirements to run a Pipeline","text":"

When sharing a Pipeline that contains custom Steps or Tasks, you may want to add the specific requirements that are needed to run them. distilabel will take this list of requirements and warn the user if any are missing.

Let's see how we can add additional requirements with an example. The first thing we're going to do is to add requirements for our CustomStep. To do so we use the requirements decorator to specify that the step has nltk as a dependency (we can also use version specifiers, e.g. nltk>=3.8). In addition, we're going to specify at Pipeline level that we need distilabel>=1.3.0 to run it.

from typing import List\n\nfrom distilabel.steps import Step\nfrom distilabel.steps.base import StepInput\nfrom distilabel.steps.typing import StepOutput\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.utils.requirements import requirements\nfrom distilabel.pipeline import Pipeline\n\n\n@requirements([\"nltk\"])\nclass CustomStep(Step):\n    @property\n    def inputs(self) -> List[str]:\n        return [\"instruction\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"response\"]\n\n    def process(self, inputs: StepInput) -> StepOutput:  # type: ignore\n        for input in inputs:\n            input[\"response\"] = nltk.word_tokenize(input)\n        yield inputs\n\n\nwith Pipeline(\n    name=\"pipeline-with-requirements\", requirements=[\"distilabel>=1.3.0\"]\n) as pipeline:\n    loader = LoadDataFromDicts(data=[{\"instruction\": \"sample sentence\"}])\n    step1 = CustomStep()\n    loader >> step1\n\nif __name__ == \"__main__\":\n    pipeline.run()\n

Once we call pipeline.run(), if any of the requirements specified at the Step or Pipeline level is missing, a ValueError will be raised telling us that we should install the list of dependencies:

>>> pipeline.run()\n[06/27/24 11:07:33] ERROR    ['distilabel.pipeline'] Please install the following requirements to run the pipeline:                                                                                                                                     base.py:350\n                             distilabel>=1.3.0\n...\nValueError: Please install the following requirements to run the pipeline:\ndistilabel>=1.3.0\n
"},{"location":"sections/how_to_guides/advanced/saving_step_generated_artifacts/","title":"Saving step generated artifacts","text":"

Some Steps might need to produce an auxiliary artifact that is not a result of the computation, but is needed for the computation. For example, the FaissNearestNeighbour needs to create a Faiss index to compute the output of the step, which are the top k nearest neighbours for each input. Generating the Faiss index takes time and it could potentially be reused outside of the distilabel pipeline, so it would be a shame not to save it.

For this reason, Steps have a method called save_artifact that allows saving artifacts that will be included along the outputs of the pipeline in the generated Distiset. The generated artifacts will be uploaded and saved when using Distiset.push_to_hub or Distiset.save_to_disk respectively. Let's see how to use it with a simple example.

from typing import List, TYPE_CHECKING\nfrom distilabel.steps import GlobalStep, StepInput, StepOutput\nimport matplotlib.pyplot as plt\n\nif TYPE_CHECKING:\n    from distilabel.steps import StepOutput\n\n\nclass CountTextCharacters(GlobalStep):\n    @property\n    def inputs(self) -> List[str]:\n        return [\"text\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"text_character_count\"]\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        character_counts = []\n\n        for input in inputs:\n            text_character_count = len(input[\"text\"])\n            input[\"text_character_count\"] = text_character_count\n            character_counts.append(text_character_count)\n\n        # Generate plot with the distribution of text character counts\n        plt.figure(figsize=(10, 6))\n        plt.hist(character_counts, bins=30, edgecolor=\"black\")\n        plt.title(\"Distribution of Text Character Counts\")\n        plt.xlabel(\"Character Count\")\n        plt.ylabel(\"Frequency\")\n\n        # Save the plot as an artifact of the step\n        self.save_artifact(\n            name=\"text_character_count_distribution\",\n            write_function=lambda path: plt.savefig(path / \"figure.png\"),\n            metadata={\"type\": \"image\", \"library\": \"matplotlib\"},\n        )\n\n        plt.close()\n\n        yield inputs\n

As it can be seen in the example above, we have created a simple step that counts the number of characters in each input text and generates a histogram with the distribution of the character counts. We save the histogram as an artifact of the step using the save_artifact method. The method takes three arguments:

  • name: The name we want to give to the artifact.
  • write_function: A function that writes the artifact to the desired path. The function will receive a path argument which is a pathlib.Path object pointing to the directory where the artifact should be saved.
  • metadata: A dictionary with metadata about the artifact. This metadata will be saved along with the artifact.

Let's execute the step with a simple pipeline and push the resulting Distiset to the Hugging Face Hub:

Example full code
from typing import TYPE_CHECKING, List\n\nimport matplotlib.pyplot as plt\nfrom datasets import load_dataset\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GlobalStep, StepInput, StepOutput\n\nif TYPE_CHECKING:\n    from distilabel.steps import StepOutput\n\n\nclass CountTextCharacters(GlobalStep):\n    @property\n    def inputs(self) -> List[str]:\n        return [\"text\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"text_character_count\"]\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        character_counts = []\n\n        for input in inputs:\n            text_character_count = len(input[\"text\"])\n            input[\"text_character_count\"] = text_character_count\n            character_counts.append(text_character_count)\n\n        # Generate plot with the distribution of text character counts\n        plt.figure(figsize=(10, 6))\n        plt.hist(character_counts, bins=30, edgecolor=\"black\")\n        plt.title(\"Distribution of Text Character Counts\")\n        plt.xlabel(\"Character Count\")\n        plt.ylabel(\"Frequency\")\n\n        # Save the plot as an artifact of the step\n        self.save_artifact(\n            name=\"text_character_count_distribution\",\n            write_function=lambda path: plt.savefig(path / \"figure.png\"),\n            metadata={\"type\": \"image\", \"library\": \"matplotlib\"},\n        )\n\n        plt.close()\n\n        yield inputs\n\n\nwith Pipeline() as pipeline:\n    count_text_characters = CountTextCharacters()\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        dataset=load_dataset(\n            \"HuggingFaceH4/instruction-dataset\", split=\"test\"\n        ).rename_column(\"prompt\", \"text\"),\n    )\n\n    distiset.push_to_hub(\"distilabel-internal-testing/distilabel-artifacts-example\")\n

The generated distilabel-internal-testing/distilabel-artifacts-example dataset repository has a section in its card describing the artifacts generated by the pipeline and the generated plot can be seen here.

"},{"location":"sections/how_to_guides/advanced/scaling_with_ray/","title":"Scaling and distributing a pipeline with Ray","text":"

Although the local Pipeline based on multiprocessing + serving LLMs with an external service is enough for executing most of the pipelines used to create SFT and preference datasets, there are scenarios where you might need to scale your pipeline across multiple machines. In such cases, distilabel leverages Ray to distribute the workload efficiently. This allows you to generate larger datasets, reduce execution time, and maximize resource utilization across a cluster of machines, without needing to change a single line of code.

"},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#relation-between-distilabel-steps-and-ray-actors","title":"Relation between distilabel steps and Ray Actors","text":"

A distilabel pipeline consists of several Steps. A Step is a class that defines a basic life-cycle:

  1. It will load or create the resources (LLMs, clients, etc) required to run its logic.
  2. It will run a loop waiting for incoming batches received using a queue. Once it receives one batch, it will process it and put the processed batch into an output queue.
  3. When it finishes a batch that is the final one or receives a special signal, the loop will finish and the unload logic will be executed.

So a Step needs to maintain a minimum state and the best way to do that with Ray is using actors.

graph TD\n    A[Step] -->|has| B[Multiple Replicas]\n    B -->|wrapped in| C[Ray Actor]\n    C -->|maintains| D[Step Replica State]\n    C -->|executes| E[Step Lifecycle]\n    E -->|1. Load/Create Resources| F[LLMs, Clients, etc.]\n    E -->|2. Process batches from| G[Input Queue]\n    E -->|3. Processed batches are put in| H[Output Queue]\n    E -->|4. Unload| I[Cleanup]\n
"},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#executing-a-pipeline-with-ray","title":"Executing a pipeline with Ray","text":"

The recommended way to execute a distilabel pipeline using Ray is using the Ray Jobs API.

Before jumping into the explanation, let's first install the prerequisites:

pip install distilabel[ray]\n

Tip

It's recommended to create a virtual environment.

For the purpose of explaining how to execute a pipeline with Ray, we'll use the following pipeline throughout the examples:

from distilabel.models import vLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(name=\"text-generation-ray-pipeline\") as pipeline:\n    load_data_from_hub = LoadDataFromHub(output_mappings={\"prompt\": \"instruction\"})\n\n    text_generation = TextGeneration(\n        llm=vLLM(\n            model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n            tokenizer=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        )\n    )\n\n    load_data_from_hub >> text_generation\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        parameters={\n            load_data_from_hub.name: {\n                \"repo_id\": \"HuggingFaceH4/instruction-dataset\",\n                \"split\": \"test\",\n            },\n            text_generation.name: {\n                \"llm\": {\n                    \"generation_kwargs\": {\n                        \"temperature\": 0.7,\n                        \"max_new_tokens\": 4096,\n                    }\n                },\n                \"resources\": {\"replicas\": 2, \"gpus\": 1}, # (1)\n            },\n        }\n    )\n\n    distiset.push_to_hub(\n        \"<YOUR_HF_USERNAME_OR_ORGANIZATION>/text-generation-distilabel-ray\" # (2)\n    )\n
  1. We're setting resources for the text_generation step and defining that we want two replicas and one GPU per replica. distilabel will create two replicas of the step i.e. two actors in the Ray cluster, and each actor will request to be allocated in a node of the cluster that has at least one GPU. You can read more about how Ray manages the resources here.
  2. You should modify this and add your user or organization on the Hugging Face Hub.

It's a basic pipeline with just two steps: one to load a dataset from the Hub with an instruction column and one to generate a response for that instruction using Llama 3 8B Instruct with vLLM. Simple but enough to demonstrate how to distribute and scale the workload using a Ray cluster!

"},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#using-ray-jobs-api","title":"Using Ray Jobs API","text":"

If you don't know the Ray Jobs API then it's recommended to read Ray Jobs Overview. Quick summary: Ray Jobs is the recommended way to execute a job in a Ray cluster as it will handle packaging, deploying and managing the Ray application.

To execute the pipeline above, we first need to create a directory (kind of a package) with the pipeline script (or scripts) that we will submit to the Ray cluster:

mkdir ray-pipeline\n

The content of the directory ray-pipeline should be:

ray-pipeline/\n\u251c\u2500\u2500 pipeline.py\n\u2514\u2500\u2500 runtime_env.yaml\n

The first file contains the code of the pipeline, while the second one (runtime_env.yaml) is a specific Ray file containing the environment dependencies required to run the job:

pip:\n  - distilabel[ray,vllm] >= 1.3.0\nenv_vars:\n  HF_TOKEN: <YOUR_HF_TOKEN>\n

With this file we're basically informing the Ray cluster that it will have to install distilabel with the vllm and ray extra dependencies to be able to run the job. In addition, we're defining the HF_TOKEN environment variable that will be used (by the push_to_hub method) to upload the resulting dataset to the Hugging Face Hub.

After that, we can proceed to execute the ray command that will submit the job to the Ray cluster:

ray job submit \\\n    --address http://localhost:8265 \\\n    --working-dir ray-pipeline \\\n    --runtime-env ray-pipeline/runtime_env.yaml -- python pipeline.py\n

What this will do is basically upload the --working-dir to the Ray cluster, install the dependencies and then execute the python pipeline.py command from the head node.

"},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#file-system-requirements","title":"File system requirements","text":"

As described in Using a file system to pass data to steps, distilabel relies on the file system to pass the data to the GlobalSteps, so if the pipeline to be executed in the Ray cluster has any GlobalStep or you want to set use_fs_to_pass_data=True in the run method, then you will need to set up a file system to which all the nodes of the Ray cluster have access:

if __name__ == \"__main__\":\n    distiset = pipeline.run(\n        parameters={...},\n        storage_parameters={\"path\": \"file:///mnt/data\"}, # (1)\n        use_fs_to_pass_data=True,\n    )\n
  1. All the nodes of the Ray cluster should have access to /mnt/data.
"},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#executing-a-raypipeline-in-a-cluster-with-slurm","title":"Executing a RayPipeline in a cluster with Slurm","text":"

If you have access to an HPC, then you're probably also a user of Slurm, a workload manager typically used on HPCs. We can create a Slurm job that takes some nodes and deploys a Ray cluster to run a distributed distilabel pipeline:

#!/bin/bash\n#SBATCH --job-name=distilabel-ray-text-generation\n#SBATCH --partition=your-partition\n#SBATCH --qos=normal\n#SBATCH --nodes=2 # (1)\n#SBATCH --exclusive\n#SBATCH --ntasks-per-node=1 # (2)\n#SBATCH --gpus-per-node=1 # (3)\n#SBATCH --time=0:30:00\n\nset -ex\n\necho \"SLURM_JOB_ID: $SLURM_JOB_ID\"\necho \"SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST\"\n\n# Activate virtual environment\nsource /path/to/virtualenv/.venv/bin/activate\n\n# Getting the node names\nnodes=$(scontrol show hostnames \"$SLURM_JOB_NODELIST\")\nnodes_array=($nodes)\n\n# Get the IP address of the head node\nhead_node=${nodes_array[0]}\nhead_node_ip=$(srun --nodes=1 --ntasks=1 -w \"$head_node\" hostname --ip-address)\n\n# Start Ray head node\nport=6379\nip_head=$head_node_ip:$port\nexport ip_head\necho \"IP Head: $ip_head\"\n\n# Generate a unique Ray tmp dir for the head node (just in case the default one is not writable)\nhead_tmp_dir=\"/tmp/ray_tmp_${SLURM_JOB_ID}_head\"\n\necho \"Starting HEAD at $head_node\"\nOUTLINES_CACHE_DIR=\"/tmp/.outlines\" srun --nodes=1 --ntasks=1 -w \"$head_node\" \\ # (4)\n    ray start --head --node-ip-address=\"$head_node_ip\" --port=$port \\\n    --dashboard-host=0.0.0.0 \\\n    --dashboard-port=8265 \\\n    --temp-dir=\"$head_tmp_dir\" \\\n    --block &\n\n# Give some time to head node to start...\necho \"Waiting a bit before starting worker nodes...\"\nsleep 10\n\n# Start Ray worker nodes\nworker_num=$((SLURM_JOB_NUM_NODES - 1))\n\n# Start from 1 (0 is head node)\nfor ((i = 1; i <= worker_num; i++)); do\n    node_i=${nodes_array[$i]}\n    worker_tmp_dir=\"/tmp/ray_tmp_${SLURM_JOB_ID}_worker_$i\"\n    echo \"Starting WORKER $i at $node_i\"\n    OUTLINES_CACHE_DIR=\"/tmp/.outlines\" srun --nodes=1 --ntasks=1 -w \"$node_i\" \\\n        ray start --address \"$ip_head\" \\\n        --temp-dir=\"$worker_tmp_dir\" \\\n        --block &\n    sleep 5\ndone\n\n# Give some time to the Ray cluster to gather info\necho \"Waiting a bit before submitting the 
job...\"\nsleep 60\n\n# Finally submit the job to the cluster\nray job submit --address http://localhost:8265 --working-dir ray-pipeline -- python -u pipeline.py\n
  1. In this case, we just want two nodes: one to run the Ray head node and one to run a worker.
  2. We just want to run a task per node i.e. the Ray command that starts the head/worker node.
  3. We have selected 1 GPU per node, but we could have selected more depending on the pipeline.
  4. We need to set the environment variable OUTLINES_CACHE_DIR to /tmp/.outlines to avoid issues with the nodes trying to read/write the same outlines cache files, which is not possible.
"},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#vllm-and-tensor_parallel_size","title":"vLLM and tensor_parallel_size","text":"

In order to use vLLM multi-GPU and multi-node capabilities with ray, we need to make a few changes in the example pipeline from above. The first change needed is to specify a value for tensor_parallel_size aka \"In how many GPUs do I want you to load the model\", and the second one is to define ray as the distributed_executor_backend as the default one in vLLM is to use multiprocessing:

with Pipeline(name=\"text-generation-ray-pipeline\") as pipeline:\n    load_data_from_hub = LoadDataFromHub(output_mappings={\"prompt\": \"instruction\"})\n\n    text_generation = TextGeneration(\n        llm=vLLM(\n            model=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            extra_kwargs={\n                \"tensor_parallel_size\": 8,\n                \"distributed_executor_backend\": \"ray\",\n            }\n        )\n    )\n\n    load_data_from_hub >> text_generation\n

More information about distributed inference with vLLM can be found here: vLLM - Distributed Serving

"},{"location":"sections/how_to_guides/advanced/serving_an_llm_for_reuse/","title":"Serving an LLM for sharing it between several Tasks","text":"

It's very common to want to use the same LLM for several Tasks in a pipeline. To avoid loading the LLM as many times as the number of Tasks and avoid wasting resources, it's recommended to serve the model using solutions like text-generation-inference or vLLM, and then use an AsyncLLM compatible client like InferenceEndpointsLLM or OpenAILLM to communicate with the server respectively.

"},{"location":"sections/how_to_guides/advanced/serving_an_llm_for_reuse/#serving-llms-using-text-generation-inference","title":"Serving LLMs using text-generation-inference","text":"
model=meta-llama/Meta-Llama-3-8B-Instruct\nvolume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run\n\ndocker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \\\n    -e HUGGING_FACE_HUB_TOKEN=<secret> \\\n    ghcr.io/huggingface/text-generation-inference:2.0.4 \\\n    --model-id $model\n

Note

The bash command above has been copy-pasted from the official docs text-generation-inference. Please refer to the official docs for more information.

And then we can use InferenceEndpointsLLM with base_url=http://localhost:8080 (pointing to our TGI local deployment):

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration, UltraFeedback\n\nwith Pipeline(name=\"serving-llm\") as pipeline:\n    load_data = LoadDataFromDicts(\n        data=[{\"instruction\": \"Write a poem about the sun and moon.\"}]\n    )\n\n    # `base_url` points to the address of the `TGI` serving the LLM\n    llm = InferenceEndpointsLLM(base_url=\"http://192.168.1.138:8080\")\n\n    text_generation = TextGeneration(\n        llm=llm,\n        num_generations=3,\n        group_generations=True,\n        output_mappings={\"generation\": \"generations\"},\n    )\n\n    ultrafeedback = UltraFeedback(aspect=\"overall-rating\", llm=llm)\n\n    load_data >> text_generation >> ultrafeedback\n
"},{"location":"sections/how_to_guides/advanced/serving_an_llm_for_reuse/#serving-llms-using-vllm","title":"Serving LLMs using vLLM","text":"
docker run --gpus all \\\n    -v ~/.cache/huggingface:/root/.cache/huggingface \\\n    --env \"HUGGING_FACE_HUB_TOKEN=<secret>\" \\\n    -p 8000:8000 \\\n    --ipc=host \\\n    vllm/vllm-openai:latest \\\n    --model meta-llama/Meta-Llama-3-8B-Instruct\n

Note

The bash command above has been copy-pasted from the official docs vLLM. Please refer to the official docs for more information.

And then we can use OpenAILLM with base_url=http://localhost:8000 (pointing to our vLLM local deployment):

from distilabel.models import OpenAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration, UltraFeedback\n\nwith Pipeline(name=\"serving-llm\") as pipeline:\n    load_data = LoadDataFromDicts(\n        data=[{\"instruction\": \"Write a poem about the sun and moon.\"}]\n    )\n\n    # `base_url` points to the address of the `vLLM` serving the LLM\n    llm = OpenAILLM(base_url=\"http://192.168.1.138:8000\", model=\"\")\n\n    text_generation = TextGeneration(\n        llm=llm,\n        num_generations=3,\n        group_generations=True,\n        output_mappings={\"generation\": \"generations\"},\n    )\n\n    ultrafeedback = UltraFeedback(aspect=\"overall-rating\", llm=llm)\n\n    load_data >> text_generation >> ultrafeedback\n
"},{"location":"sections/how_to_guides/advanced/structured_generation/","title":"Structured data generation","text":"

Distilabel has integrations with relevant libraries to generate structured text i.e. to guide the LLM towards the generation of structured outputs following a JSON schema, a regex, etc.

"},{"location":"sections/how_to_guides/advanced/structured_generation/#outlines","title":"Outlines","text":"

Distilabel integrates outlines within some LLM subclasses. At the moment, the following LLMs integrated with outlines are supported in distilabel: TransformersLLM, vLLM or LlamaCppLLM, so that anyone can generate structured outputs in the form of JSON or a parseable regex.

The LLM has an argument named structured_output1 that determines how we can generate structured outputs with it, let's see an example using LlamaCppLLM.

Note

For outlines integration to work you may need to install the corresponding dependencies:

pip install distilabel[outlines]\n
"},{"location":"sections/how_to_guides/advanced/structured_generation/#json","title":"JSON","text":"

We will start with a JSON example, where we initially define a pydantic.BaseModel schema to guide the generation of the structured output.

Note

Take a look at StructuredOutputType to see the expected format of the structured_output dict variable.

from pydantic import BaseModel\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n

And then we provide that schema to the structured_output argument of the LLM.

from distilabel.models import LlamaCppLLM\n\nllm = LlamaCppLLM(\n    model_path=\"./openhermes-2.5-mistral-7b.Q4_K_M.gguf\",  # (1)\n    n_gpu_layers=-1,\n    n_ctx=1024,\n    structured_output={\"format\": \"json\", \"schema\": User},\n)\nllm.load()\n
  1. We have previously downloaded a GGUF model i.e. llama.cpp compatible, from the Hugging Face Hub using curl2, but any model can be used as replacement, as long as the model_path argument is updated.

And we are ready to pass our instruction as usual:

import json\n\nresult = llm.generate(\n    [[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]],\n    max_new_tokens=50\n)\n\ndata = json.loads(result[0][0])\ndata\n# {'name': 'Kathy', 'last_name': 'Smith', 'id': 4539210}\nUser(**data)\n# User(name='Kathy', last_name='Smith', id=4539210)\n

We get back a Python dictionary (formatted as a string) that we can parse using json.loads, or validate it directly using the User, which is a pydantic.BaseModel instance.

"},{"location":"sections/how_to_guides/advanced/structured_generation/#regex","title":"Regex","text":"

The following example shows text generation whose output adheres to a regular expression:

pattern = r\"<name>(.*?)</name>.*?<grade>(.*?)</grade>\"  #\u00a0the same pattern for re.compile\n\nllm=LlamaCppLLM(\n    model_path=model_path,\n    n_gpu_layers=-1,\n    n_ctx=1024,\n    structured_output={\"format\": \"regex\", \"schema\": pattern},\n)\nllm.load()\n\nresult = llm.generate(\n    [\n        [\n            {\"role\": \"system\", \"content\": \"You are Simpsons' fans who loves assigning grades from A to E, where A is the best and E is the worst.\"},\n            {\"role\": \"user\", \"content\": \"What's up with Homer Simpson?\"}\n        ]\n    ],\n    max_new_tokens=200\n)\n

We can check the output by parsing the content using the same pattern we required from the LLM.

import re\nmatch = re.search(pattern, result[0][0])\n\nif match:\n    name = match.group(1)\n    grade = match.group(2)\n    print(f\"Name: {name}, Grade: {grade}\")\n# Name: Homer Simpson, Grade: C+\n

These were some simple examples, but one can see the options this opens.

Tip

A full pipeline example can be seen in the following script: examples/structured_generation_with_outlines.py

"},{"location":"sections/how_to_guides/advanced/structured_generation/#instructor","title":"Instructor","text":"

For other LLM providers behind APIs, there's no direct way of accessing the internal logit processor like outlines does, but thanks to instructor we can generate structured output from LLM providers based on pydantic.BaseModel objects. We have integrated instructor to deal with the AsyncLLM.

Note

For instructor integration to work you may need to install the corresponding dependencies:

pip install distilabel[instructor]\n

Note

Take a look at InstructorStructuredOutputType to see the expected format of the structured_output dict variable.

The following is the same example you can see with outlines's JSON section for comparison purposes.

from pydantic import BaseModel\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n

And then we provide that schema to the structured_output argument of the LLM:

Note

In this example we are using Meta Llama 3.1 8B Instruct, keep in mind not all the models support structured outputs.

from distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n    tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n    structured_output={\"schema\": User}\n)\nllm.load()\n

And we are ready to pass our instructions as usual:

import json\n\nresult = llm.generate(\n    [[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]],\n    max_new_tokens=256\n)\n\ndata = json.loads(result[0][0])\ndata\n# {'name': 'John', 'last_name': 'Doe', 'id': 12345}\nUser(**data)\n# User(name='John', last_name='Doe', id=12345)\n

We get back a Python dictionary (formatted as a string) that we can parse using json.loads, or validate it directly using the User, which is a pydantic.BaseModel instance.

Tip

A full pipeline example can be seen in the following script: examples/structured_generation_with_instructor.py

"},{"location":"sections/how_to_guides/advanced/structured_generation/#openai-json","title":"OpenAI JSON","text":"

OpenAI offers a JSON Mode to deal with structured output via their API, let's see how to make use of it. The JSON mode instructs the model to always return a JSON object following the instruction required.

Warning

Bear in mind, for this to work, you must instruct the model in some way to generate JSON, either in the system message or in the instruction, as can be seen in the API reference.

Contrary to what we have via outlines, JSON mode will not guarantee the output matches any specific schema, only that it is valid and parses without errors. More information can be found in the OpenAI documentation.

Other than the reference to generating JSON, to ensure the model generates parseable JSON we can pass the argument response_format=\"json\"3:

from distilabel.models import OpenAILLM\nllm = OpenAILLM(model=\"gpt4-turbo\", api_key=\"api.key\")\nllm.generate(..., response_format=\"json\")\n
  1. You can check the variable type by importing it from:

    from distilabel.steps.tasks.structured_outputs.outlines import StructuredOutputType\n
    \u21a9

  2. Download the model with curl:

    curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n
    \u21a9

  3. Keep in mind that to interact with this response_format argument in a pipeline, you will have to pass it via the generation_kwargs:

    # Assuming a pipeline is already defined, and we have a task using OpenAILLM called `task_with_openai`:\npipeline.run(\n    parameters={\n        \"task_with_openai\": {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"response_format\": \"json\"\n                }\n            }\n        }\n    }\n)\n
    \u21a9

"},{"location":"sections/how_to_guides/advanced/cli/","title":"Command Line Interface (CLI)","text":"

Distilabel offers a CLI to explore and re-run existing Pipeline dumps, meaning that an existing dump can be explored to see the steps, how those are connected, the runtime parameters used, and also re-run it with the same or different runtime parameters, respectively.

"},{"location":"sections/how_to_guides/advanced/cli/#available-commands","title":"Available commands","text":"

The only available command as of the current version of distilabel is distilabel pipeline.

$ distilabel pipeline --help\n\n Usage: distilabel pipeline [OPTIONS] COMMAND [ARGS]...\n\n Commands to run and inspect Distilabel pipelines.\n\n\u256d\u2500 Options \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 --help          Show this message and exit.                                             \u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n\u256d\u2500 Commands \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 info      Get information about a Distilabel pipeline.                                  \u2502\n\u2502 run       Run a Distilabel pipeline.                                                    
\u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n

As shown above, distilabel pipeline has two subcommands: info and run, as described below. Note that for testing purposes we will be using the following dataset.

"},{"location":"sections/how_to_guides/advanced/cli/#distilabel-pipeline-info","title":"distilabel pipeline info","text":"
$ distilabel pipeline info --help\n\n Usage: distilabel pipeline info [OPTIONS]\n\n Get information about a Distilabel pipeline.\n\n\u256d\u2500 Options \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 *  --config        TEXT  Path or URL to the Distilabel pipeline configuration file. \u2502\n\u2502                          [default: None]                                            \u2502\n\u2502                          [required]                                                 \u2502\n\u2502    --help                Show this message and exit.                                \u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n

As we can see from the help message, we need to pass either a Path or a URL. This second option comes in handy for datasets stored in Hugging Face Hub, for example:

distilabel pipeline info --config \"https://huggingface.co/datasets/distilabel-internal-testing/instruction-dataset-mini-with-generations/raw/main/pipeline.yaml\"\n

If we take a look:

The pipeline information includes the steps used in the Pipeline along with the Runtime Parameter that was used, as well as a description of each of them, and also the connections between these steps. These can be helpful to explore the Pipeline locally.

"},{"location":"sections/how_to_guides/advanced/cli/#distilabel-pipeline-run","title":"distilabel pipeline run","text":"

We can also run a Pipeline from the CLI just pointing to the same pipeline.yaml file or a URL pointing to it and calling distilabel pipeline run. Alternatively, a URL pointing to a Python script containing a distilabel pipeline can be used:

$ distilabel pipeline run --help\n\n Usage: distilabel pipeline run [OPTIONS]\n\n Run a Distilabel pipeline.\n\n\u256d\u2500 Options \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 --param                                          PARSE_RUNTIME_PARAM  [default: (dynamic)]                                         \u2502\n\u2502 --config                                         TEXT                 Path or URL to the Distilabel pipeline configuration file.   \u2502\n\u2502                                                                       [default: None]                                              \u2502\n\u2502 --script                                         TEXT                 URL pointing to a python script containing a distilabel      \u2502\n\u2502                                                                       pipeline.                                                    \u2502\n\u2502                                                                       [default: None]                                              \u2502\n\u2502 --pipeline-variable-name                         TEXT                 Name of the pipeline in a script. I.e. 
the 'pipeline'        \u2502\n\u2502                                                                       variable in `with Pipeline(...) as pipeline:...`.            \u2502\n\u2502                                                                       [default: pipeline]                                          \u2502\n\u2502 --ignore-cache              --no-ignore-cache                         Whether to ignore the cache and re-run the pipeline from     \u2502\n\u2502                                                                       scratch.                                                     \u2502\n\u2502                                                                       [default: no-ignore-cache]                                   \u2502\n\u2502 --repo-id                                        TEXT                 The Hugging Face Hub repository ID to push the resulting     \u2502\n\u2502                                                                       dataset to.                                                  \u2502\n\u2502                                                                       [default: None]                                              \u2502\n\u2502 --commit-message                                 TEXT                 The commit message to use when pushing the dataset.          \u2502\n\u2502                                                                       [default: None]                                              \u2502\n\u2502 --private                   --no-private                              Whether to make the resulting dataset private on the Hub.    
\u2502\n\u2502                                                                       [default: no-private]                                        \u2502\n\u2502 --token                                          TEXT                 The Hugging Face Hub API token to use when pushing the       \u2502\n\u2502                                                                       dataset.                                                     \u2502\n\u2502                                                                       [default: None]                                              \u2502\n\u2502 --help                                                                Show this message and exit.                                  \u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n

Using the --config option, we must pass a path with a pipeline.yaml file. To specify the runtime parameters of the steps we will need to use the --param option and the value of the parameter in the following format:

distilabel pipeline run --config \"https://huggingface.co/datasets/distilabel-internal-testing/instruction-dataset-mini-with-generations/raw/main/pipeline.yaml\" \\\n    --param load_dataset.repo_id=distilabel-internal-testing/instruction-dataset-mini \\\n    --param load_dataset.split=test \\\n    --param generate_with_gpt35.llm.generation_kwargs.max_new_tokens=512 \\\n    --param generate_with_gpt35.llm.generation_kwargs.temperature=0.7 \\\n    --param to_argilla.dataset_name=text_generation_with_gpt35 \\\n    --param to_argilla.dataset_workspace=admin\n

Or using --script we can pass directly a remote python script (keep in mind --config and --script are exclusive):

distilabel pipeline run --script \"https://huggingface.co/datasets/distilabel-internal-testing/pipe_nothing_test/raw/main/pipe_nothing.py\"\n

You can also pass runtime parameters to the python script as we saw with the --config option.

Again, this helps with the reproducibility of the results, and simplifies sharing not only the final dataset but also the process to generate it.

"},{"location":"sections/how_to_guides/basic/llm/","title":"Executing Tasks with LLMs","text":""},{"location":"sections/how_to_guides/basic/llm/#working-with-llms","title":"Working with LLMs","text":"

LLM subclasses are designed to be used within a Task, but they can also be used standalone.

from distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(model=\"meta-llama/Meta-Llama-3.1-70B-Instruct\")\nllm.load()\n\nllm.generate_outputs(\n    inputs=[\n        [{\"role\": \"user\", \"content\": \"What's the capital of Spain?\"}],\n    ],\n)\n# \"The capital of Spain is Madrid.\"\n

Note

Always call the LLM.load or Task.load method when using LLMs standalone or as part of a Task. If using a Pipeline, this is done automatically in Pipeline.run().

"},{"location":"sections/how_to_guides/basic/llm/#offline-batch-generation","title":"Offline Batch Generation","text":"

By default, all LLMs will generate text in a synchronous manner, i.e. send inputs using the generate_outputs method, which will block until the outputs are generated. There are some LLMs (such as OpenAILLM) that implement what we denote as offline batch generation, which allows sending the inputs to the LLM-as-a-service, which will generate the outputs asynchronously and give us a job id that we can use later to check the status and retrieve the generated outputs when they are ready. LLM-as-a-service platforms offer this feature as a way to save costs in exchange for waiting for the outputs to be generated.

To use this feature in distilabel the only thing we need to do is to set the use_offline_batch_generation attribute to True when creating the LLM instance:

from distilabel.models import OpenAILLM\n\nllm = OpenAILLM(\n    model=\"gpt-4o\",\n    use_offline_batch_generation=True,\n)\n\nllm.load()\n\nllm.jobs_ids  # (1)\n# None\n\nllm.generate_outputs(  # (2)\n    inputs=[\n        [{\"role\": \"user\", \"content\": \"What's the capital of Spain?\"}],\n    ],\n)\n# DistilabelOfflineBatchGenerationNotFinishedException: Batch generation with jobs_ids=('batch_OGB4VjKpu2ay9nz3iiFJxt5H',) is not finished\n\nllm.jobs_ids  # (3)\n# ('batch_OGB4VjKpu2ay9nz3iiFJxt5H',)\n\n\nllm.generate_outputs(  # (4)\n    inputs=[\n        [{\"role\": \"user\", \"content\": \"What's the capital of Spain?\"}],\n    ],\n)\n# \"The capital of Spain is Madrid.\"\n
  1. At first the jobs_ids attribute is None.
  2. The first call to generate_outputs will send the inputs to the LLM-as-a-service and raise a DistilabelOfflineBatchGenerationNotFinishedException since the outputs are not ready yet.
  3. After the first call to generate_outputs the jobs_ids attribute will contain the job ids created for generating the outputs.
  4. The second call or subsequent calls to generate_outputs will return the outputs if they are ready or raise a DistilabelOfflineBatchGenerationNotFinishedException if they are not ready yet.

The offline_batch_generation_block_until_done attribute can be used to block the generate_outputs method until the outputs are ready, polling the platform every specified number of seconds.

from distilabel.models import OpenAILLM\n\nllm = OpenAILLM(\n    model=\"gpt-4o\",\n    use_offline_batch_generation=True,\n    offline_batch_generation_block_until_done=5,  # poll for results every 5 seconds\n)\nllm.load()\n\nllm.generate_outputs(\n    inputs=[\n        [{\"role\": \"user\", \"content\": \"What's the capital of Spain?\"}],\n    ],\n)\n# \"The capital of Spain is Madrid.\"\n
"},{"location":"sections/how_to_guides/basic/llm/#within-a-task","title":"Within a Task","text":"

Pass the LLM as an argument to the Task, and the task will handle the rest.

from distilabel.models import OpenAILLM\nfrom distilabel.steps.tasks import TextGeneration\n\nllm = OpenAILLM(model=\"gpt-4\")\ntask = TextGeneration(name=\"text_generation\", llm=llm)\n\ntask.load()\n\nnext(task.process(inputs=[{\"instruction\": \"What's the capital of Spain?\"}]))\n# [{'instruction': \"What's the capital of Spain?\", \"generation\": \"The capital of Spain is Madrid.\"}]\n
"},{"location":"sections/how_to_guides/basic/llm/#runtime-parameters","title":"Runtime Parameters","text":"

LLMs can have runtime parameters, such as generation_kwargs, provided via the Pipeline.run() method using the parameters argument.

Note

Runtime parameters can differ between LLM subclasses, caused by the different functionalities offered by the LLM providers.

from distilabel.pipeline import Pipeline\nfrom distilabel.models import OpenAILLM\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(name=\"text-generation-pipeline\") as pipeline:\n    load_dataset = LoadDataFromDicts(\n        name=\"load_dataset\",\n        data=[{\"instruction\": \"Write a short story about a dragon that saves a princess from a tower.\"}],\n    )\n\n    text_generation = TextGeneration(\n        name=\"text_generation\",\n        llm=OpenAILLM(model=\"gpt-4\"),\n    )\n\n    load_dataset >> text_generation\n\nif __name__ == \"__main__\":\n    pipeline.run(\n        parameters={\n            text_generation.name: {\"llm\": {\"generation_kwargs\": {\"temperature\": 0.3}}},\n        },\n    )\n
"},{"location":"sections/how_to_guides/basic/llm/#creating-custom-llms","title":"Creating custom LLMs","text":"

To create custom LLMs, subclass either LLM for synchronous or AsyncLLM for asynchronous LLMs. Implement the following methods:

  • model_name: A property containing the model's name.

  • generate: A method that takes a list of prompts and returns generated texts.

  • agenerate: A method that takes a single prompt and returns generated texts. This method is used within the generate method of the AsyncLLM class.

  • (optional) get_last_hidden_state: is a method that will take a list of prompts and return a list of hidden states. This method is optional and will be used by some tasks such as the GenerateEmbeddings task.

Custom LLMCustom AsyncLLM
from typing import Any\n\nfrom pydantic import validate_call\n\nfrom distilabel.models import LLM\nfrom distilabel.typing import GenerateOutput, HiddenState\nfrom distilabel.typing import ChatType\n\nclass CustomLLM(LLM):\n    @property\n    def model_name(self) -> str:\n        return \"my-model\"\n\n    @validate_call\n    def generate(self, inputs: List[ChatType], num_generations: int = 1, **kwargs: Any) -> List[GenerateOutput]:\n        for _ in range(num_generations):\n            ...\n\n    def get_last_hidden_state(self, inputs: List[ChatType]) -> List[HiddenState]:\n        ...\n
from typing import Any\n\nfrom pydantic import validate_call\n\nfrom distilabel.models import AsyncLLM\nfrom distilabel.typing import GenerateOutput, HiddenState\nfrom distilabel.typing import ChatType\n\nclass CustomAsyncLLM(AsyncLLM):\n    @property\n    def model_name(self) -> str:\n        return \"my-model\"\n\n    @validate_call\n    async def agenerate(self, input: ChatType, num_generations: int = 1, **kwargs: Any) -> GenerateOutput:\n        for _ in range(num_generations):\n            ...\n\n    def get_last_hidden_state(self, inputs: List[ChatType]) -> List[HiddenState]:\n        ...\n

The keyword arguments of generate and agenerate (except input and num_generations) are considered RuntimeParameters, so a value can be passed to them via the parameters argument of the Pipeline.run method.

Note

To have the arguments of generate and agenerate coerced to the expected types, the validate_call decorator is used, which will automatically coerce the arguments to the expected types, and raise an error if the types are not correct. This is especially useful when providing a value for an argument of generate or agenerate from the CLI, since the CLI will always provide the arguments as strings.

"},{"location":"sections/how_to_guides/basic/llm/#available-llms","title":"Available LLMs","text":"

Our LLM gallery shows a list of the available LLMs that can be used within the distilabel library.

"},{"location":"sections/how_to_guides/basic/pipeline/","title":"Execute Steps and Tasks in a Pipeline","text":""},{"location":"sections/how_to_guides/basic/pipeline/#how-to-create-a-pipeline","title":"How to create a pipeline","text":"

A Pipeline organises the Steps and Tasks in a sequence, where the output of one step is the input of the next one. A Pipeline should be created by making use of the context manager along with passing a name, and optionally a description.

from distilabel.pipeline import Pipeline\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    ...\n
"},{"location":"sections/how_to_guides/basic/pipeline/#connecting-steps-with-the-stepconnect-method","title":"Connecting steps with the Step.connect method","text":"

Now, we can define the steps of our Pipeline.

Note

Steps without predecessors (i.e. root steps), need to be GeneratorSteps such as LoadDataFromDicts or LoadDataFromHub. After this, other steps can be defined.

from distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    load_dataset = LoadDataFromHub(name=\"load_dataset\")\n    ...\n

Easily load your datasets

If you are already used to working with Hugging Face's Dataset via load_dataset or pd.DataFrame, you can create the GeneratorStep directly from the dataset (or dataframe), and create the step with the help of make_generator_step:

From a list of dictsFrom datasets.DatasetFrom pd.DataFrame
from distilabel.pipeline import Pipeline\nfrom distilabel.steps import make_generator_step\n\ndataset = [{\"instruction\": \"Tell me a joke.\"}]\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    loader = make_generator_step(dataset, output_mappings={\"prompt\": \"instruction\"})\n    ...\n
from datasets import load_dataset\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import make_generator_step\n\ndataset = load_dataset(\n    \"DIBT/10k_prompts_ranked\",\n    split=\"train\"\n).filter(\n    lambda r: r[\"avg_rating\"]>=4 and r[\"num_responses\"]>=2\n).select(range(500))\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    loader = make_generator_step(dataset, output_mappings={\"prompt\": \"instruction\"})\n    ...\n
import pandas as pd\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import make_generator_step\n\ndataset = pd.read_csv(\"path/to/dataset.csv\")\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    loader = make_generator_step(dataset, output_mappings={\"prompt\": \"instruction\"})\n    ...\n

Next, we will use the prompt column from the dataset obtained through LoadDataFromHub and use several LLMs to execute a TextGeneration task. We will also use the Task.connect() method to connect the steps, so the output of one step is the input of the next one.

Note

The order of the execution of the steps will be determined by the connections of the steps. In this case, the TextGeneration tasks will be executed after the LoadDataFromHub step.

from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    load_dataset = LoadDataFromHub(name=\"load_dataset\")\n\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.5-pro\"),\n    ):\n        task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n        task.connect(load_dataset)\n\n    ...\n

For each row of the dataset, the TextGeneration task will generate a text based on the instruction column and the LLM model, and store the result (a single string) in a new column called generation. Because we need to have the responses in the same column, we will add GroupColumns to combine them all in the same column as a list of strings.

Note

In this case, the GroupColumns tasks will be executed after all TextGeneration steps.

from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    load_dataset = LoadDataFromHub(name=\"load_dataset\")\n\n    combine_generations = GroupColumns(\n        name=\"combine_generations\",\n        columns=[\"generation\", \"model_name\"],\n        output_columns=[\"generations\", \"model_names\"],\n    )\n\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.5-pro\"),\n    ):\n        task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n        load_dataset.connect(task)\n        task.connect(combine_generations)\n
"},{"location":"sections/how_to_guides/basic/pipeline/#connecting-steps-with-the-operator","title":"Connecting steps with the >> operator","text":"

Besides the Step.connect method: step1.connect(step2), there's an alternative way by making use of the >> operator. We can connect steps in a more readable way, and it's also possible to connect multiple steps at once.

Step per stepMultiple steps at once

Each call to step1.connect(step2) has been replaced by step1 >> step2 within the loop.

from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    load_dataset = LoadDataFromHub(name=\"load_dataset\")\n\n    combine_generations = GroupColumns(\n        name=\"combine_generations\",\n        columns=[\"generation\", \"model_name\"],\n        output_columns=[\"generations\", \"model_names\"],\n    )\n\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.5-pro\"),\n    ):\n        task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n        load_dataset >> task >> combine_generations\n

Each task is first appended to a list, and then all the calls to connections are done in a single call.

from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    load_dataset = LoadDataFromHub(name=\"load_dataset\")\n\n    combine_generations = GroupColumns(\n        name=\"combine_generations\",\n        columns=[\"generation\", \"model_name\"],\n        output_columns=[\"generations\", \"model_names\"],\n    )\n\n    tasks = []\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.5-pro\"),\n    ):\n        tasks.append(\n            TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n        )\n\n    load_dataset >> tasks >> combine_generations\n
"},{"location":"sections/how_to_guides/basic/pipeline/#routing-batches-to-specific-downstream-steps","title":"Routing batches to specific downstream steps","text":"

In some pipelines, you may want to send batches from a single upstream step to specific downstream steps based on certain conditions. To achieve this, you can use a routing_batch_function. This function takes a list of downstream steps and returns a list of step names to which each batch should be routed.

Let's update the example above to route the batches loaded by the LoadDataFromHub step to just 2 of the TextGeneration tasks. First, we will create our custom routing_batch_function, and then we will update the pipeline to use it:

import random\nfrom distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline, routing_batch_function\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\n@routing_batch_function\ndef sample_two_steps(steps: list[str]) -> list[str]:\n    return random.sample(steps, 2)\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    load_dataset = LoadDataFromHub(\n        name=\"load_dataset\",\n        output_mappings={\"prompt\": \"instruction\"},\n    )\n\n    tasks = []\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.0-pro\"),\n    ):\n        tasks.append(\n            TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n        )\n\n    combine_generations = GroupColumns(\n        name=\"combine_generations\",\n        columns=[\"generation\", \"model_name\"],\n        output_columns=[\"generations\", \"model_names\"],\n    )\n\n    load_dataset >> sample_two_steps >> tasks >> combine_generations\n

The routing_batch_function that we just built is a common one, so distilabel comes with a builtin function that can be used to achieve the same behavior:

from distilabel.pipeline import sample_n_steps\n\nsample_two_steps = sample_n_steps(2)\n
"},{"location":"sections/how_to_guides/basic/pipeline/#running-the-pipeline","title":"Running the pipeline","text":""},{"location":"sections/how_to_guides/basic/pipeline/#pipelinedry_run","title":"Pipeline.dry_run","text":"

Before running the Pipeline we can check if the pipeline is valid using the Pipeline.dry_run() method. It takes the same parameters as the run method which we will discuss in the following section, plus the batch_size we want the dry run to use (by default set to 1).

with Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    ...\n\nif __name__ == \"__main__\":\n    distiset = pipeline.dry_run(parameters=..., batch_size=1)\n
"},{"location":"sections/how_to_guides/basic/pipeline/#pipelinerun","title":"Pipeline.run","text":"

After testing, we can now execute the full Pipeline using the Pipeline.run() method.

with Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    ...\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        parameters={\n            \"load_dataset\": {\n                \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n                \"split\": \"test\",\n            },\n            \"text_generation_with_gpt-4-0125-preview\": {\n                \"llm\": {\n                    \"generation_kwargs\": {\n                        \"temperature\": 0.7,\n                        \"max_new_tokens\": 512,\n                    }\n                }\n            },\n            \"text_generation_with_mistral-large-2402\": {\n                \"llm\": {\n                    \"generation_kwargs\": {\n                        \"temperature\": 0.7,\n                        \"max_new_tokens\": 512,\n                    }\n                }\n            },\n            \"text_generation_with_gemini-1.0-pro\": {\n                \"llm\": {\n                    \"generation_kwargs\": {\n                        \"temperature\": 0.7,\n                        \"max_new_tokens\": 512,\n                    }\n                }\n            },\n        },\n    )\n

But if we run the pipeline above, we will see that the run method will fail:

ValueError: Step 'text_generation_with_gpt-4-0125-preview' requires inputs ['instruction'], but only the inputs=['prompt', 'completion', 'meta'] are available, which means that the inputs=['instruction'] are missing or not available\nwhen the step gets to be executed in the pipeline. Please make sure previous steps to 'text_generation_with_gpt-4-0125-preview' are generating the required inputs.\n

This is because, before actually running the pipeline, we must ensure each step has the necessary input columns to be executed. In this case, the TextGeneration task requires the instruction column, but the LoadDataFromHub step generates the prompt column. To solve this, we can use the output_mappings or input_mappings arguments of individual Steps, to map columns from one step to another.

with Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    load_dataset = LoadDataFromHub(\n        name=\"load_dataset\",\n        output_mappings={\"prompt\": \"instruction\"}\n    )\n\n    ...\n

If we execute the pipeline again, it will run successfully and we will have a Distiset with the outputs of all the leaf steps of the pipeline which we can push to the Hugging Face Hub.

if __name__ == \"__main__\":\n    distiset = pipeline.run(...)\n    distiset.push_to_hub(\"distilabel-internal-testing/instruction-dataset-mini-with-generations\")\n
"},{"location":"sections/how_to_guides/basic/pipeline/#pipelinerun-with-a-dataset","title":"Pipeline.run with a dataset","text":"

Note that in most cases if you don't need the extra flexibility the GeneratorSteps bring you, you can create a dataset as you would normally do and pass it to the Pipeline.run method directly. Look at the highlighted lines to see the updated lines:

import random\nfrom distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline, routing_batch_function\nfrom distilabel.steps import GroupColumns\nfrom distilabel.steps.tasks import TextGeneration\n\n@routing_batch_function\ndef sample_two_steps(steps: list[str]) -> list[str]:\n    return random.sample(steps, 2)\n\ndataset = load_dataset(\n    \"distilabel-internal-testing/instruction-dataset-mini\",\n    split=\"test\"\n)\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    tasks = []\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.0-pro\"),\n    ):\n        tasks.append(\n            TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n        )\n\n    combine_generations = GroupColumns(\n        name=\"combine_generations\",\n        columns=[\"generation\", \"model_name\"],\n        output_columns=[\"generations\", \"model_names\"],\n    )\n\n    sample_two_steps >> tasks >> combine_generations\n\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        dataset=dataset,\n        parameters=...\n    )\n
"},{"location":"sections/how_to_guides/basic/pipeline/#stopping-the-pipeline","title":"Stopping the pipeline","text":"

In case you want to stop the pipeline while it's running, you can press Ctrl+C or Cmd+C depending on your OS (or send a SIGINT to the main process), and the outputs will be stored in the cache. Pressing an additional time will force the pipeline to stop its execution, but this can lead to losing the generated outputs for certain batches.

"},{"location":"sections/how_to_guides/basic/pipeline/#cache","title":"Cache","text":"

If for some reason, the pipeline execution stops (for example by pressing Ctrl+C), the state of the pipeline and the outputs will be stored in the cache, so we can resume the pipeline execution from the point where it was stopped.

If we want to force the pipeline to run again without using the cache, then we can use the use_cache argument of the Pipeline.run() method:

if __name__ == \"__main__\":\n    distiset = pipeline.run(parameters={...}, use_cache=False)\n

Note

For more information on caching, we refer the reader to the caching section.

"},{"location":"sections/how_to_guides/basic/pipeline/#adjusting-the-batch-size-for-each-step","title":"Adjusting the batch size for each step","text":"

Memory issues can arise when processing large datasets or when using large models. To avoid this, we can use the input_batch_size argument of individual tasks. In the example below, each TextGeneration task will receive batches of 5 dictionaries, while the LoadDataFromHub step will send 10 dictionaries per batch:

from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    load_dataset = LoadDataFromHub(\n        name=\"load_dataset\",\n        output_mappings={\"prompt\": \"instruction\"},\n        batch_size=10\n    )\n\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.5-pro\"),\n    ):\n        task = TextGeneration(\n            name=f\"text_generation_with_{llm.model_name.replace('.', '-')}\",\n            llm=llm,\n            input_batch_size=5,\n        )\n\n    ...\n
"},{"location":"sections/how_to_guides/basic/pipeline/#serializing-the-pipeline","title":"Serializing the pipeline","text":"

Sharing a pipeline with others is very easy, as we can serialize the pipeline object using the save method. We can save the pipeline in different formats, such as yaml or json:

yamljson
if __name__ == \"__main__\":\n    pipeline.save(\"pipeline.yaml\", format=\"yaml\")\n
if __name__ == \"__main__\":\n    pipeline.save(\"pipeline.json\", format=\"json\")\n

To load the pipeline, we can use the from_yaml or from_json methods:

yamljson
pipeline = Pipeline.from_yaml(\"pipeline.yaml\")\n
pipeline = Pipeline.from_json(\"pipeline.json\")\n

Serializing the pipeline is very useful when we want to share the pipeline with others, or when we want to store the pipeline for future use. It can even be hosted online, so the pipeline can be executed directly using the CLI.

"},{"location":"sections/how_to_guides/basic/pipeline/#visualizing-the-pipeline","title":"Visualizing the pipeline","text":"

We can visualize the pipeline using the Pipeline.draw() method. This will create a mermaid graph, and return the path to the image.

path_to_image = pipeline.draw(\n    top_to_bottom=True,\n    show_edge_labels=True,\n)\n

Within notebooks, we can simply call pipeline and the graph will be displayed. Alternatively, we can use the Pipeline.draw() method to have more control over the graph visualization and use IPython to display it.

from IPython.display import Image, display\n\ndisplay(Image(path_to_image))\n

Let's now see what the pipeline of the fully working example looks like.

"},{"location":"sections/how_to_guides/basic/pipeline/#fully-working-example","title":"Fully working example","text":"

To sum up, here is the full code of the pipeline we have created in this section. Note that you will need to change the name of the Hugging Face repository where the resulting dataset will be pushed, set the OPENAI_API_KEY environment variable, set MISTRAL_API_KEY and have gcloud installed and configured:

Code
from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    load_dataset = LoadDataFromHub(\n        name=\"load_dataset\",\n        output_mappings={\"prompt\": \"instruction\"},\n    )\n\n    combine_generations = GroupColumns(\n        name=\"combine_generations\",\n        columns=[\"generation\", \"model_name\"],\n        output_columns=[\"generations\", \"model_names\"],\n    )\n\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.0-pro\"),\n    ):\n        task = TextGeneration(\n            name=f\"text_generation_with_{llm.model_name.replace('.', '-')}\", llm=llm\n        )\n        load_dataset.connect(task)\n        task.connect(combine_generations)\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        parameters={\n            \"load_dataset\": {\n                \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n                \"split\": \"test\",\n            },\n            \"text_generation_with_gpt-4-0125-preview\": {\n                \"llm\": {\n                    \"generation_kwargs\": {\n                        \"temperature\": 0.7,\n                        \"max_new_tokens\": 512,\n                    }\n                }\n            },\n            \"text_generation_with_mistral-large-2402\": {\n                \"llm\": {\n                    \"generation_kwargs\": {\n                        \"temperature\": 0.7,\n                        \"max_new_tokens\": 512,\n                    }\n                }\n            },\n            \"text_generation_with_gemini-1.0-pro\": {\n                \"llm\": {\n                    \"generation_kwargs\": {\n                   
     \"temperature\": 0.7,\n                        \"max_new_tokens\": 512,\n                    }\n                }\n            },\n        },\n    )\n    distiset.push_to_hub(\n        \"distilabel-internal-testing/instruction-dataset-mini-with-generations\"\n    )\n
"},{"location":"sections/how_to_guides/basic/step/","title":"Steps for processing data","text":""},{"location":"sections/how_to_guides/basic/step/#working-with-steps","title":"Working with Steps","text":"

The Step is intended to be used within the scope of a Pipeline, which will orchestrate the different steps defined but can also be used standalone.

Assuming that we have a Step already defined as follows:

from typing import TYPE_CHECKING\nfrom distilabel.steps import Step, StepInput\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepColumns, StepOutput\n\nclass MyStep(Step):\n    @property\n    def inputs(self) -> \"StepColumns\":\n        return [\"input_field\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        return [\"output_field\"]\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":\n        for input in inputs:\n            input[\"output_field\"] = input[\"input_field\"]\n        yield inputs\n

Then we can use it as follows:

step = MyStep(name=\"my-step\")\nstep.load()\n\nnext(step.process([{\"input_field\": \"value\"}]))\n# [{'input_field': 'value', 'output_field': 'value'}]\n

Note

The Step.load() always needs to be executed when being used as a standalone. Within a pipeline, this will be done automatically during pipeline execution.

"},{"location":"sections/how_to_guides/basic/step/#arguments","title":"Arguments","text":"
  • input_mappings, is a dictionary that maps keys from the input dictionaries to the keys expected by the step. For example, input_mappings={\"instruction\": \"prompt\"} means that the input key prompt will be used as the key instruction for the current step.

  • output_mappings, is a dictionary that can be used to map the outputs of the step to other names. For example, output_mappings={\"conversation\": \"prompt\"} means that the output key conversation will be renamed to prompt for the next step.

  • input_batch_size (by default set to 50), is independent for every step and will determine how many input dictionaries it will process at once.

"},{"location":"sections/how_to_guides/basic/step/#runtime-parameters","title":"Runtime parameters","text":"

Steps can also have RuntimeParameters, which are parameters that can only be used after the pipeline initialisation, when calling Pipeline.run.

from distilabel.mixins.runtime_parameters import RuntimeParameter\n\nclass Step(...):\n    input_batch_size: RuntimeParameter[PositiveInt] = Field(\n        default=DEFAULT_INPUT_BATCH_SIZE,\n        description=\"The number of rows that will contain the batches processed by the\"\n        \" step.\",\n    )\n
"},{"location":"sections/how_to_guides/basic/step/#types-of-steps","title":"Types of Steps","text":"

There are three special types of Step in distilabel:

  • GeneratorStep: is a step that only generates data, and it doesn't need any input data from previous steps and normally is the first node in a Pipeline. More information: Components -> Step - GeneratorStep.

  • GlobalStep: is a step with the standard interface i.e. receives inputs and generates outputs, but it processes all the data at once, and often is the final step in the Pipeline. A GlobalStep requires the previous steps to finish before being able to start. More information: Components - Step - GlobalStep.

  • Task, is essentially the same as a default Step, but it relies on an LLM as an attribute, and the process method will be in charge of calling that LLM. More information: Components - Task.

"},{"location":"sections/how_to_guides/basic/step/#defining-custom-steps","title":"Defining custom Steps","text":"

We can define a custom step by creating a new subclass of the Step and defining the following:

  • inputs: is a property that returns a list of strings with the names of the required input fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not.

  • outputs: is a property that returns a list of strings with the names of the output fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not.

  • process: is a method that receives the input data and returns the output data, and it should be a generator, meaning that it should yield the output data.

Note

The default signature for the process method is process(self, *inputs: StepInput) -> StepOutput. The argument inputs should be respected, no more arguments can be provided, and the type-hints and return type-hints should be respected too because it should be able to receive any number of inputs by default i.e. more than one Step at a time could be connected to the current one.

Warning

For the custom Step subclasses to work properly with distilabel and with the validation and serialization performed by default over each Step in the Pipeline, the type-hint for both StepInput and StepOutput should be used and not surrounded with double-quotes or imported under typing.TYPE_CHECKING, otherwise, the validation and/or serialization will fail.

Inherit from StepUsing the @step decorator

We can inherit from the Step class and define the inputs, outputs, and process methods as follows:

from typing import TYPE_CHECKING\nfrom distilabel.steps import Step, StepInput\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepColumns, StepOutput\n\nclass CustomStep(Step):\n    @property\n    def inputs(self) -> \"StepColumns\":\n        ...\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        ...\n\n    def process(self, *inputs: StepInput) -> \"StepOutput\":\n        for upstream_step_inputs in inputs:\n            ...\n            yield item\n\n    # When overridden (ideally under the `typing_extensions.override` decorator)\n    # @typing_extensions.override\n    # def process(self, inputs: StepInput) -> StepOutput:\n    #     for input in inputs:\n    #         ...\n    #     yield inputs\n

The @step decorator will take care of the boilerplate code, and will allow you to define the inputs, outputs, and process methods in a more straightforward way. One downside is that it won't let you access the self attributes, if any, nor set them, so if you need to access or set any attribute, you should go with the first approach of defining the custom Step subclass.

from typing import TYPE_CHECKING\nfrom distilabel.steps import StepInput, step\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepOutput\n\n@step(inputs=[...], outputs=[...])\ndef CustomStep(inputs: StepInput) -> \"StepOutput\":\n    for input in inputs:\n        ...\n    yield inputs\n\nstep = CustomStep(name=\"my-step\")\n
"},{"location":"sections/how_to_guides/basic/step/generator_step/","title":"GeneratorStep","text":"

The GeneratorStep is a subclass of Step that is intended to be used as the first step within a Pipeline, because it doesn't require input and generates data that can be used by other steps. Alternatively, it can also be used as a standalone.

from typing import List, TYPE_CHECKING\nfrom typing_extensions import override\n\nfrom distilabel.steps import GeneratorStep\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepColumns, GeneratorStepOutput\n\nclass MyGeneratorStep(GeneratorStep):\n    instructions: List[str]\n\n    @override\n    def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n        if offset:\n            self.instructions = self.instructions[offset:]\n\n        while self.instructions:\n            batch = [\n                {\n                    \"instruction\": instruction\n                } for instruction in self.instructions[: self.batch_size]\n            ]\n            self.instructions = self.instructions[self.batch_size :]\n            yield (\n                batch,\n                True if len(self.instructions) == 0 else False,\n            )\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        return [\"instruction\"]\n

Then we can use it as follows:

step = MyGeneratorStep(\n    name=\"my-generator-step\",\n    instructions=[\"Tell me a joke.\", \"Tell me a story.\"],\n    batch_size=1,\n)\nstep.load()\n\nnext(step.process(offset=0))\n# ([{'instruction': 'Tell me a joke.'}], False)\nnext(step.process(offset=1))\n# ([{'instruction': 'Tell me a story.'}], True)\n

Note

The Step.load() always needs to be executed when being used as a standalone. Within a pipeline, this will be done automatically during pipeline execution.

"},{"location":"sections/how_to_guides/basic/step/generator_step/#defining-custom-generatorsteps","title":"Defining custom GeneratorSteps","text":"

We can define a custom generator step by creating a new subclass of the GeneratorStep and defining the following:

  • outputs: is a property that returns a list of strings with the names of the output fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not.

  • process: is a method that yields output data and a boolean flag indicating whether that's the last batch to be generated.

Note

The default signature for the process method is process(self, offset: int = 0) -> GeneratorStepOutput. The argument offset should be respected, no more arguments can be provided, and the type-hints and return type-hints should be respected too because it should be able to receive any number of inputs by default i.e. more than one Step at a time could be connected to the current one.

Warning

For the custom Step subclasses to work properly with distilabel and with the validation and serialization performed by default over each Step in the Pipeline, the type-hint for both StepInput and StepOutput should be used and not surrounded with double-quotes or imported under typing.TYPE_CHECKING, otherwise, the validation and/or serialization will fail.

Inherit from GeneratorStepUsing the @step decorator

We can inherit from the GeneratorStep class and define the outputs, and process methods as follows:

from typing import List, TYPE_CHECKING\nfrom typing_extensions import override\n\nfrom distilabel.steps import GeneratorStep\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepColumns, GeneratorStepOutput\n\nclass MyGeneratorStep(GeneratorStep):\n    instructions: List[str]\n\n    @override\n    def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n        ...\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        ...\n

The @step decorator will take care of the boilerplate code, and will allow you to define the outputs and process methods in a more straightforward way. One downside is that it won't let you access the self attributes, if any, nor set them, so if you need to access or set any attribute, you should go with the first approach of defining the custom GeneratorStep subclass.

from typing import TYPE_CHECKING\nfrom distilabel.steps import step\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import GeneratorStepOutput\n\n@step(outputs=[...], step_type=\"generator\")\ndef CustomGeneratorStep(offset: int = 0) -> \"GeneratorStepOutput\":\n    yield (\n        ...,\n        True if offset == 10 else False,\n    )\n\nstep = CustomGeneratorStep(name=\"my-step\")\n
"},{"location":"sections/how_to_guides/basic/step/global_step/","title":"GlobalStep","text":"

The GlobalStep is a subclass of Step that is used to define a step that requires the previous steps to be completed to run, since it will wait until all the input batches are received before running. This step is useful when you need to run a step that requires all the input data to be processed before running. Alternatively, it can also be used as a standalone.

"},{"location":"sections/how_to_guides/basic/step/global_step/#defining-custom-globalsteps","title":"Defining custom GlobalSteps","text":"

We can define a custom step by creating a new subclass of the GlobalStep and defining the following:

  • inputs: is a property that returns a list of strings with the names of the required input fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not.

  • outputs: is a property that returns a list of strings with the names of the output fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not.

  • process: is a method that receives the input data and returns the output data, and it should be a generator, meaning that it should yield the output data.

Note

The default signature for the process method is process(self, *inputs: StepInput) -> StepOutput. The argument inputs should be respected, no more arguments can be provided, and the type-hints and return type-hints should be respected too because it should be able to receive any number of inputs by default i.e. more than one Step at a time could be connected to the current one.

Warning

For the custom GlobalStep subclasses to work properly with distilabel and with the validation and serialization performed by default over each Step in the Pipeline, the type-hint for both StepInput and StepOutput should be used and not surrounded with double-quotes or imported under typing.TYPE_CHECKING, otherwise, the validation and/or serialization will fail.

Inherit from GlobalStepUsing the @step decorator

We can inherit from the GlobalStep class and define the inputs, outputs, and process methods as follows:

from typing import TYPE_CHECKING\nfrom distilabel.steps import GlobalStep, StepInput\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepColumns, StepOutput\n\nclass CustomStep(Step):\n    @property\n    def inputs(self) -> \"StepColumns\":\n        ...\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        ...\n\n    def process(self, *inputs: StepInput) -> StepOutput:\n        for upstream_step_inputs in inputs:\n            for item in input:\n                ...\n            yield item\n\n    # When overridden (ideally under the `typing_extensions.override` decorator)\n    # @typing_extensions.override\n    # def process(self, inputs: StepInput) -> StepOutput:\n    #     for input in inputs:\n    #         ...\n    #     yield inputs\n

The @step decorator will take care of the boilerplate code, and will allow you to define the inputs, outputs, and process methods in a more straightforward way. One downside is that it won't let you access the self attributes, if any, nor set them, so if you need to access or set any attribute, you should go with the first approach of defining the custom GlobalStep subclass.

from typing import TYPE_CHECKING\nfrom distilabel.steps import StepInput, step\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepOutput\n\n@step(inputs=[...], outputs=[...], step_type=\"global\")\ndef CustomStep(inputs: StepInput) -> \"StepOutput\":\n    for input in inputs:\n        ...\n    yield inputs\n\nstep = CustomStep(name=\"my-step\")\n
"},{"location":"sections/how_to_guides/basic/task/","title":"Tasks for generating and judging with LLMs","text":""},{"location":"sections/how_to_guides/basic/task/#working-with-tasks","title":"Working with Tasks","text":"

The Task is a special kind of Step that includes the LLM as a mandatory argument. As with a Step, it is normally used within a Pipeline but can also be used standalone.

For example, the most basic task is the TextGeneration task, which generates text based on a given instruction.

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import TextGeneration\n\ntask = TextGeneration(\n    name=\"text-generation\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    ),\n)\ntask.load()\n\nnext(task.process([{\"instruction\": \"What's the capital of Spain?\"}]))\n# [\n#     {\n#         'instruction': \"What's the capital of Spain?\",\n#         'generation': 'The capital of Spain is Madrid.',\n#         'distilabel_metadata': {\n#               'raw_output_text-generation': 'The capital of Spain is Madrid.',\n#               'raw_input_text-generation': [\n#                   {'role': 'user', 'content': \"What's the capital of Spain?\"}\n#               ]\n#         },\n#         'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'\n#     }\n# ]\n

Note

The Step.load() always needs to be executed when being used as a standalone. Within a pipeline, this will be done automatically during pipeline execution.

As shown above, the TextGeneration task adds a generation based on the instruction.

Tip

Since version 1.2.0, we provide some metadata about the LLM call through distilabel_metadata. This can be disabled by setting the add_raw_output attribute to False when creating the task.

Additionally, since version 1.4.0, the formatted input can also be included, which can be helpful when testing custom templates (testing the pipeline using the dry_run method).

disable raw input and output
task = TextGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    add_raw_output=False,\n    add_raw_input=False\n)\n
"},{"location":"sections/how_to_guides/basic/task/#taskprint","title":"Task.print","text":"

Info

New since version 1.4.0, the Task.print method.

The Tasks include a handy method to show what the prompt formatted for an LLM would look like, let's see an example with UltraFeedback, but it applies to any other Task.

from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\nuf = UltraFeedback(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n)\nuf.load()\nuf.print()\n

The result will be a rendered prompt, with the System prompt (if contained for the task) and the User prompt, rendered with rich (it will show exactly the same in a jupyter notebook).

In case you want to test with a custom input, you can pass an example to the task's format_input method (or generate it on your own depending on the task), and pass it to the print method so that it shows your example:

uf.print(\n    uf.format_input({\"instruction\": \"test\", \"generations\": [\"1\", \"2\"]})\n)\n
Using a DummyLLM to avoid loading one

In case you don't want to load an LLM to render the template, you can create a dummy one like the ones we could use for testing.

from distilabel.models import LLM\nfrom distilabel.models.mixins import MagpieChatTemplateMixin\n\nclass DummyLLM(AsyncLLM, MagpieChatTemplateMixin):\n    structured_output: Any = None\n    magpie_pre_query_template: str = \"llama3\"\n\n    def load(self) -> None:\n        pass\n\n    @property\n    def model_name(self) -> str:\n        return \"test\"\n\n    def generate(\n        self, input: \"FormattedInput\", num_generations: int = 1\n    ) -> \"GenerateOutput\":\n        return [\"output\" for _ in range(num_generations)]\n

You can use this LLM just as any of the other ones to load your task and call print:

uf = UltraFeedback(llm=DummyLLM())\nuf.load()\nuf.print()\n

Note

When creating a custom task, the print method will be available by default, but it is limited to the most common scenarios for the inputs. If you test your new task and find it's not working as expected (for example, if your task contains one input consisting of a list of texts instead of a single one), you should override the _sample_input method. You can inspect the UltraFeedback source code for this.

"},{"location":"sections/how_to_guides/basic/task/#specifying-the-number-of-generations-and-grouping-generations","title":"Specifying the number of generations and grouping generations","text":"

All the Tasks have a num_generations attribute that allows defining the number of generations that we want to have per input. We can update the example above to generate 3 completions per input:

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import TextGeneration\n\ntask = TextGeneration(\n    name=\"text-generation\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    ),\n    num_generations=3,\n)\ntask.load()\n\nnext(task.process([{\"instruction\": \"What's the capital of Spain?\"}]))\n# [\n#     {\n#         'instruction': \"What's the capital of Spain?\",\n#         'generation': 'The capital of Spain is Madrid.',\n#         'distilabel_metadata': {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n#         'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'\n#     },\n#     {\n#         'instruction': \"What's the capital of Spain?\",\n#         'generation': 'The capital of Spain is Madrid.',\n#         'distilabel_metadata': {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n#         'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'\n#     },\n#     {\n#         'instruction': \"What's the capital of Spain?\",\n#         'generation': 'The capital of Spain is Madrid.',\n#         'distilabel_metadata': {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n#         'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'\n#     }\n# ]\n

In addition, we might want to group the generations in a single output row as maybe one downstream step expects a single row with multiple generations. We can achieve this by setting the group_generations attribute to True:

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import TextGeneration\n\ntask = TextGeneration(\n    name=\"text-generation\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    ),\n    num_generations=3,\n    group_generations=True\n)\ntask.load()\n\nnext(task.process([{\"instruction\": \"What's the capital of Spain?\"}]))\n# [\n#     {\n#         'instruction': \"What's the capital of Spain?\",\n#         'generation': ['The capital of Spain is Madrid.', 'The capital of Spain is Madrid.', 'The capital of Spain is Madrid.'],\n#         'distilabel_metadata': [\n#             {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n#             {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n#             {'raw_output_text-generation': 'The capital of Spain is Madrid.'}\n#         ],\n#         'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'\n#     }\n# ]\n
"},{"location":"sections/how_to_guides/basic/task/#defining-custom-tasks","title":"Defining custom Tasks","text":"

We can define a custom task by creating a new subclass of the Task and defining the following:

  • inputs: is a property that returns a list of strings with the names of the required input fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not.

  • format_input: is a method that receives a dictionary with the input data and returns a ChatType following the chat-completion OpenAI message formatting.

  • outputs: is a property that returns a list of strings with the names of the output fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not. This property should always include model_name as one of the outputs since that's automatically injected from the LLM.

  • format_output: is a method that receives the output from the LLM and optionally also the input data (which may be useful to build the output in some scenarios), and returns a dictionary with the output data formatted as needed i.e. with the values for the columns in outputs. Note that there's no need to include the model_name in the output.

Inherit from TaskUsing the @task decorator

When using the Task class inheritance method for creating a custom task, we can also optionally override the Task.process method to define a more complex processing logic involving an LLM, as the default one just calls the LLM.generate method once previously formatting the input and subsequently formatting the output. For example, EvolInstruct task overrides this method to call the LLM.generate multiple times (one for each evolution).

from typing import Any, Dict, List, Union, TYPE_CHECKING\n\nfrom distilabel.steps.tasks import Task\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepColumns\n    from distilabel.steps.tasks.typing import ChatType\n\n\nclass MyCustomTask(Task):\n    @property\n    def inputs(self) -> \"StepColumns\":\n        return [\"input_field\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": input[\"input_field\"],\n            },\n        ]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        return [\"output_field\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        return {\"output_field\": output}\n

If your task just needs a system prompt, a user message template and a way to format the output given by the LLM, then you can use the @task decorator to avoid writing too much boilerplate code.

from typing import Any, Dict, Union\nfrom distilabel.steps.tasks import task\n\n\n@task(inputs=[\"input_field\"], outputs=[\"output_field\"])\ndef MyCustomTask(output: Union[str, None], input: Union[Dict[str, Any], None] = None) -> Dict[str, Any]:\n    \"\"\"\n    ---\n    system_prompt: |\n        My custom system prompt\n\n    user_message_template: |\n        My custom user message template: {input_field}\n    ---\n    \"\"\"\n    # Format the `LLM` output here\n    return {\"output_field\": output}\n
"},{"location":"sections/how_to_guides/basic/task/generator_task/","title":"GeneratorTask that produces output","text":""},{"location":"sections/how_to_guides/basic/task/generator_task/#working-with-generatortasks","title":"Working with GeneratorTasks","text":"

The GeneratorTask is a custom implementation of a Task based on the GeneratorStep. As with a Task, it is normally used within a Pipeline but can also be used standalone.

Warning

This task is still experimental and may be subject to changes in the future.

from typing import Any, Dict, List, Union\nfrom typing_extensions import override\n\nfrom distilabel.steps.tasks.base import GeneratorTask\nfrom distilabel.steps.tasks.typing import ChatType\nfrom distilabel.steps.typing import GeneratorOutput\n\n\nclass MyCustomTask(GeneratorTask):\n    instruction: str\n\n    @override\n    def process(self, offset: int = 0) -> GeneratorOutput:\n        output = self.llm.generate(\n            inputs=[\n                [\n                    {\"role\": \"user\", \"content\": self.instruction},\n                ],\n            ],\n        )\n        output = {\"model_name\": self.llm.model_name}\n        output.update(\n            self.format_output(output=output, input=None)\n        )\n        yield output\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"output_field\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        return {\"output_field\": output}\n

We can then use it as follows:

task = MyCustomTask(\n    name=\"custom-generation\",\n    instruction=\"Tell me a joke.\",\n    llm=OpenAILLM(model=\"gpt-4\"),\n)\ntask.load()\n\nnext(task.process())\n# [{'output_field\": \"Why did the scarecrow win an award? Because he was outstanding!\", \"model_name\": \"gpt-4\"}]\n

Note

Most of the time you will need to override the default process method, as it's suited for the standard Task and not for the GeneratorTask. But within the context of the process function you can freely use the llm to generate data in any way.

Note

The Step.load() always needs to be executed when being used as a standalone. Within a pipeline, this will be done automatically during pipeline execution.

"},{"location":"sections/how_to_guides/basic/task/generator_task/#defining-custom-generatortasks","title":"Defining custom GeneratorTasks","text":"

We can define a custom generator task by creating a new subclass of the GeneratorTask and defining the following:

  • process: is a method that generates the data based on the LLM and the instruction provided within the class instance, and returns a dictionary with the output data formatted as needed i.e. with the values for the columns in outputs. Note that the inputs argument is not allowed in this function since this is a GeneratorTask. The signature only expects the offset argument, which is used to keep track of the current iteration in the generator.

  • outputs: is a property that returns a list of strings with the names of the output fields, this property should always include model_name as one of the outputs since that's automatically injected from the LLM.

  • format_output: is a method that receives the output from the LLM and optionally also the input data (which may be useful to build the output in some scenarios), and returns a dictionary with the output data formatted as needed i.e. with the values for the columns in outputs. Note that there's no need to include the model_name in the output.

from typing import Any, Dict, List, Union\n\nfrom distilabel.steps.tasks.base import GeneratorTask\nfrom distilabel.steps.tasks.typing import ChatType\n\n\nclass MyCustomTask(GeneratorTask):\n    @override\n    def process(self, offset: int = 0) -> GeneratorOutput:\n        output = self.llm.generate(\n            inputs=[\n                [{\"role\": \"user\", \"content\": \"Tell me a joke.\"}],\n            ],\n        )\n        output = {\"model_name\": self.llm.model_name}\n        output.update(\n            self.format_output(output=output, input=None)\n        )\n        yield output\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"output_field\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        return {\"output_field\": output}\n
"},{"location":"sections/pipeline_samples/","title":"Tutorials","text":"
  • End-to-end tutorials provide detailed step-by-step explanations and the code used for end-to-end workflows.
  • Paper implementations provide reproductions of fundamental papers in the synthetic data domain.
  • Examples don't provide explanations but simply show code for different tasks.
"},{"location":"sections/pipeline_samples/#end-to-end-tutorials","title":"End-to-end tutorials","text":"
  • Generate a preference dataset

    Learn about synthetic data generation for ORPO and DPO.

    Tutorial

  • Clean an existing preference dataset

    Learn about how to provide AI feedback to clean an existing dataset.

    Tutorial

  • Retrieval and reranking models

    Learn about synthetic data generation for fine-tuning custom retrieval and reranking models.

    Tutorial

  • Generate text classification data

    Learn about how synthetic data generation for text classification can help address data imbalance or scarcity.

    Tutorial

"},{"location":"sections/pipeline_samples/#paper-implementations","title":"Paper Implementations","text":"
  • Deepseek Prover

    Learn about an approach to generate mathematical proofs for theorems generated from informal math problems.

    Example

  • DEITA

    Learn about prompt, response tuning for complexity and quality and LLMs as judges for automatic data selection.

    Paper

  • Instruction Backtranslation

    Learn about automatically labeling human-written text with corresponding instructions.

    Paper

  • Prometheus 2

    Learn about using open-source models as judges for direct assessment and pair-wise ranking.

    Paper

  • UltraFeedback

    Learn about a large-scale, fine-grained, diverse preference dataset, used for training powerful reward and critic models.

    Paper

  • APIGen

    Learn how to create verifiable high-quality datasets for function-calling applications.

    Paper

  • CLAIR

    Learn about Contrastive Learning from AI Revisions (CLAIR), a data-creation method which leads to more contrastive preference pairs.

    Paper

"},{"location":"sections/pipeline_samples/#examples","title":"Examples","text":"
  • Benchmarking with distilabel

    Learn about reproducing the Arena Hard benchmark with distilabel.

    Example

  • Structured generation with outlines

    Learn about generating RPG characters following a pydantic.BaseModel with outlines in distilabel.

    Example

  • Structured generation with instructor

    Learn about answering instructions with knowledge graphs defined as pydantic.BaseModel objects using instructor in distilabel.

    Example

  • Create a social network with FinePersonas

    Learn how to leverage FinePersonas to create a synthetic social network and fine-tune adapters for Multi-LoRA.

    Example

"},{"location":"sections/pipeline_samples/examples/benchmarking_with_distilabel/","title":"Benchmarking with distilabel","text":"

Benchmark LLMs with distilabel: reproducing the Arena Hard benchmark.

The script below first defines both the ArenaHard and the ArenaHardResults tasks, so as to generate responses for a given collection of prompts/questions with up to two LLMs, and then calculate the results as per the original implementation, respectively. Additionally, the second part of the example builds a Pipeline to run the generation on top of the prompts with InferenceEndpointsLLM while streaming the rest of the generations from a pre-computed set of GPT-4 generations, and then evaluate one against the other with OpenAILLM generating an alternate response, a comparison between the responses, and a result as A>>B, A>B, B>A, B>>A, or tie.

To run this example you will first need to install the Arena Hard optional dependencies, being pandas, scikit-learn, and numpy.

Run
python examples/arena_hard.py\n
arena_hard.py
# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport re\nfrom typing import Any, Dict, List, Optional, Union\n\nfrom typing_extensions import override\n\nfrom distilabel.steps import GlobalStep, StepInput\nfrom distilabel.steps.tasks.base import Task\nfrom distilabel.steps.tasks.typing import ChatType\nfrom distilabel.steps.typing import StepOutput\n\n\nclass ArenaHard(Task):\n    \"\"\"Evaluates two assistant responses using an LLM as judge.\n\n    This `Task` is based on the \"From Live Data to High-Quality Benchmarks: The\n    Arena-Hard Pipeline\" paper that presents Arena Hard, which is a benchmark for\n    instruction-tuned LLMs that contains 500 challenging user queries. 
GPT-4 is used\n    as the judge to compare the model responses against a baseline model, which defaults\n    to `gpt-4-0314`.\n\n    Note:\n        Arena-Hard-Auto has the highest correlation and separability to Chatbot Arena\n        among popular open-ended LLM benchmarks.\n\n    Input columns:\n        - instruction (`str`): The instruction to evaluate the responses.\n        - generations (`List[str]`): The responses generated by two, and only two, LLMs.\n\n    Output columns:\n        - evaluation (`str`): The evaluation of the responses generated by the LLMs.\n        - score (`str`): The score extracted from the evaluation.\n        - model_name (`str`): The model name used to generate the evaluation.\n\n    Categories:\n        - benchmark\n\n    References:\n        - [From Live Data to High-Quality Benchmarks: The Arena-Hard Pipeline](https://lmsys.org/blog/2024-04-19-arena-hard/)\n        - [`arena-hard-auto`](https://github.com/lm-sys/arena-hard-auto/tree/main)\n\n    Examples:\n\n        Evaluate two assistant responses for a given instruction using Arean Hard prompts:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps import GroupColumns, LoadDataFromDicts\n        from distilabel.steps.tasks import ArenaHard, TextGeneration\n\n        with Pipeline() as pipeline:\n            load_data = LoadDataFromDicts(\n                data=[{\"instruction\": \"What is the capital of France?\"}],\n            )\n\n            text_generation_a = TextGeneration(\n                llm=...,  # LLM instance\n                output_mappings={\"model_name\": \"generation_model\"},\n            )\n\n            text_generation_b = TextGeneration(\n                llm=...,  # LLM instance\n                output_mappings={\"model_name\": \"generation_model\"},\n            )\n\n            combine = GroupColumns(\n                columns=[\"generation\", \"generation_model\"],\n                
output_columns=[\"generations\", \"generation_models\"],\n            )\n\n            arena_hard = ArenaHard(\n                llm=...,  # LLM instance\n            )\n\n            load_data >> [text_generation_a, text_generation_b] >> combine >> arena_hard\n        ```\n    \"\"\"\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The inputs required by this task are the `instruction` and the `generations`,\n        which are the responses generated by two, and only two, LLMs.\"\"\"\n        return [\"instruction\", \"generations\"]\n\n    def format_input(self, input: Dict[str, Any]) -> ChatType:\n        \"\"\"This method formats the input data as a `ChatType` using the prompt defined\n        by the Arena Hard benchmark, which consists on a `system_prompt` plus a template\n        for the user first message that contains the `instruction` and both `generations`.\n        \"\"\"\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": \"Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\\n\\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\\n\\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\\n\\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. 
Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\\n\\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\\n\\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\\n\\n1. Assistant A is significantly better: [[A>>B]]\\n2. Assistant A is slightly better: [[A>B]]\\n3. Tie, relatively the same: [[A=B]]\\n4. Assistant B is slightly better: [[B>A]]\\n5. Assistant B is significantly better: [[B>>A]]\\n\\nExample output: \\\"My final verdict is tie: [[A=B]]\\\".\",\n            },\n            {\n                \"role\": \"user\",\n                \"content\": f\"<|User Prompt|>\\n{input['instruction']}\\n\\n<|The Start of Assistant A's Answer|>\\n{input['generations'][0]}\\n<|The End of Assistant A's Answer|>\\n\\n<|The Start of Assistant B's Answer|>\\n{input['generations'][1]}\\n<|The End of Assistant B's Answer|>\",\n            },\n        ]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The outputs generated by this task are the `evaluation`, the `score` and\n        the `model_name` (which is automatically injected within the `process` method\n        of the parent task).\"\"\"\n        return [\"evaluation\", \"score\", \"model_name\"]\n\n    def format_output(\n        self,\n        output: Union[str, None],\n        input: Union[Dict[str, Any], None] = None,\n    ) -> Dict[str, Any]:\n        \"\"\"This method formats the output generated by the LLM as a Python dictionary\n        containing the `evaluation` which is the raw output generated by the LLM (consisting\n        of the judge LLM alternate generation for the given instruction, plus an explanation\n        on the 
evaluation of the given responses; plus the `score` extracted from the output.\n\n        Args:\n            output: the raw output of the LLM.\n            input: the input to the task. Is provided in case it needs to be used to enrich\n                the output if needed.\n\n        Returns:\n            A dict with the keys `evaluation` with the raw output which contains the LLM\n            evaluation and the extracted `score` if possible.\n        \"\"\"\n        if output is None:\n            return {\"evaluation\": None, \"score\": None}\n        pattern = re.compile(r\"\\[\\[([AB<>=]+)\\]\\]\")\n        match = pattern.search(output)\n        if match is None:\n            return {\"evaluation\": output, \"score\": None}\n        return {\"evaluation\": output, \"score\": match.group(1)}\n\n\nclass ArenaHardResults(GlobalStep):\n    \"\"\"Process Arena Hard results to calculate the ELO scores.\n\n    This `Step` is based on the \"From Live Data to High-Quality Benchmarks: The\n    Arena-Hard Pipeline\" paper that presents Arena Hard, which is a benchmark for\n    instruction-tuned LLMs that contains 500 challenging user queries. 
This step is\n    a `GlobalStep` that should run right after the `ArenaHard` task to calculate the\n    ELO scores for the evaluated models.\n\n    Note:\n        Arena-Hard-Auto has the highest correlation and separability to Chatbot Arena\n        among popular open-ended LLM benchmarks.\n\n    Input columns:\n        - evaluation (`str`): The evaluation of the responses generated by the LLMs.\n        - score (`str`): The score extracted from the evaluation.\n\n    References:\n        - [From Live Data to High-Quality Benchmarks: The Arena-Hard Pipeline](https://lmsys.org/blog/2024-04-19-arena-hard/)\n        - [`arena-hard-auto`](https://github.com/lm-sys/arena-hard-auto/tree/main)\n\n    Examples:\n\n        Rate the ELO scores for two assistant responses for a given an evaluation / comparison between both using Arean Hard prompts:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps import GroupColumns, LoadDataFromDicts\n        from distilabel.steps.tasks import ArenaHard, TextGeneration\n\n        with Pipeline() as pipeline:\n            load_data = LoadDataFromDicts(\n                data=[{\"instruction\": \"What is the capital of France?\"}],\n            )\n\n            text_generation_a = TextGeneration(\n                llm=...,  # LLM instance\n                output_mappings={\"model_name\": \"generation_model\"},\n            )\n\n            text_generation_b = TextGeneration(\n                llm=...,  # LLM instance\n                output_mappings={\"model_name\": \"generation_model\"},\n            )\n\n            combine = GroupColumns(\n                columns=[\"generation\", \"generation_model\"],\n                output_columns=[\"generations\", \"generation_models\"],\n            )\n\n            arena_hard = ArenaHard(\n                llm=...,  # LLM instance\n            )\n\n            arena_hard_results = ArenaHardResults(\n                
custom_model_column=\"generation_models\",\n                custom_weights={\"A>B\": 1, \"A>>B\": 3, \"B>A\": 1, \"B>>A\": 3},\n            )\n\n            load_data >> [text_generation_a, text_generation_b] >> combine >> arena_hard >> arena_hard_results\n        ```\n\n    \"\"\"\n\n    custom_model_column: Optional[str] = None\n    custom_weights: Dict[str, int] = {\"A>B\": 1, \"A>>B\": 3, \"B>A\": 1, \"B>>A\": 3}\n\n    def load(self) -> None:\n        \"\"\"Ensures that the required dependencies are installed.\"\"\"\n        super().load()\n\n        try:\n            import numpy as np  # noqa: F401\n            import pandas as pd  # noqa: F401\n            from sklearn.linear_model import LogisticRegression  # noqa: F401\n        except ImportError as e:\n            raise ImportError(\n                \"In order to run `ArenaHardResults`, the `arena-hard` extra dependencies\"\n                \" must be installed i.e. `numpy`, `pandas`, and `scikit-learn`.\\n\"\n                \"Please install the dependencies by running `pip install distilabel[arena-hard]`.\"\n            ) from e\n\n    # TODO: the `evaluation` is not really required as an input, so it could be removed, since\n    # only `score` is used / required\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The inputs required by this step are the `evaluation` and the `score` generated\n        by the `ArenaHard` task. 
Since this step does use the identifiers `model_a` and `model_b`,\n        optionally one can set `custom_model_column` to use the model names if existing within\n        the input data, ideally this value should be `model_name` if connected from the `ArenaHard`\n        step.\"\"\"\n        columns = [\"evaluation\", \"score\"]\n        if self.custom_model_column:\n            columns.append(self.custom_model_column)\n        return columns\n\n    @override\n    def process(self, inputs: StepInput) -> StepOutput:  # type: ignore\n        \"\"\"This method processes the inputs generated by the `ArenaHard` task to calculate the\n        win rates for each of the models to evaluate. Since this step inherits from the `GlobalStep`,\n        it will wait for all the input batches to be processed, and then the output will be yielded in\n        case there's a follow up step, since this step won't modify the received inputs.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Yields:\n            A list of Python dictionaries with the outputs of the task.\n\n        References:\n            - https://github.com/lm-sys/arena-hard-auto/blob/main/show_result.py\n        \"\"\"\n        import numpy as np\n        import pandas as pd\n        from sklearn.linear_model import LogisticRegression\n\n        models = [\"A\", \"B\"]\n        if self.custom_model_column:\n            models = inputs[0][self.custom_model_column]\n\n        # TODO: the battles are only calculated for the first game, even though the official\n        # implementation also covers the possibility of a second game (not within the released\n        # dataset yet)\n        battles = pd.DataFrame()\n        for input in inputs:\n            output = {\n                # TODO: \"question_id\": input[\"question_id\"],\n                \"model_a\": models[0],\n                \"model_b\": models[1],\n            }\n            if input[\"score\"] in 
[\"A>B\", \"A>>B\"]:\n                output[\"winner\"] = models[0]\n                rows = [output] * self.custom_weights[input[\"score\"]]\n            elif input[\"score\"] in [\"B>A\", \"B>>A\"]:\n                output[\"winner\"] = models[1]\n                rows = [output] * self.custom_weights[input[\"score\"]]\n            elif input[\"score\"] == \"A=B\":\n                output[\"winner\"] = \"tie\"\n                rows = [output]\n            else:\n                continue\n\n            battles = pd.concat([battles, pd.DataFrame(rows)])\n\n        models = pd.concat([battles[\"model_a\"], battles[\"model_b\"]]).unique()\n        models = pd.Series(np.arange(len(models)), index=models)\n\n        battles = pd.concat([battles, battles], ignore_index=True)\n        p = len(models.index)\n        n = battles.shape[0]\n\n        X = np.zeros([n, p])\n        X[np.arange(n), models[battles[\"model_a\"]]] = +np.log(10)\n        X[np.arange(n), models[battles[\"model_b\"]]] = -np.log(10)\n\n        Y = np.zeros(n)\n        Y[battles[\"winner\"] == \"model_a\"] = 1.0\n\n        tie_idx = battles[\"winner\"] == \"tie\"\n        tie_idx[len(tie_idx) // 2 :] = False\n        Y[tie_idx] = 1.0\n\n        lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)  # type: ignore\n        lr.fit(X, Y)\n\n        # The ELO scores are calculated assuming that the reference is `gpt-4-0314`\n        # with an starting ELO of 1000, so that the evaluated models are compared with\n        # `gtp-4-0314` only if it's available within the models\n        elo_scores = 400 * lr.coef_[0] + 1000\n        # TODO: we could parametrize the reference / anchor model, but left as is to be faithful to the\n        # original implementation\n        if \"gpt-4-0314\" in models.index:\n            elo_scores += 1000 - elo_scores[models[\"gpt-4-0314\"]]\n\n        output = pd.Series(elo_scores, index=models.index).sort_values(ascending=False)\n        self._logger.info(f\"Arena 
Hard ELO: {output}\")\n\n        # Here only so that if follow up steps are connected the inputs are preserved,\n        # since this step doesn't modify nor generate new inputs\n        yield inputs\n\n\nif __name__ == \"__main__\":\n    import json\n\n    from distilabel.models import InferenceEndpointsLLM, OpenAILLM\n    from distilabel.pipeline import Pipeline\n    from distilabel.steps import (\n        GroupColumns,\n        KeepColumns,\n        LoadDataFromHub,\n        StepInput,\n        step,\n    )\n    from distilabel.steps.tasks import TextGeneration\n    from distilabel.steps.typing import StepOutput\n\n    @step(inputs=[\"turns\"], outputs=[\"system_prompt\", \"instruction\"])\n    def PrepareForTextGeneration(*inputs: StepInput) -> StepOutput:\n        for input in inputs:\n            for item in input:\n                item[\"system_prompt\"] = \"You are a helpful assistant.\"\n                item[\"instruction\"] = item[\"turns\"][0][\"content\"]\n            yield input\n\n    @step(\n        inputs=[\"question_id\"],\n        outputs=[\"generation\", \"generation_model\"],\n        step_type=\"global\",\n    )\n    def LoadReference(*inputs: StepInput) -> StepOutput:\n        # File downloaded from https://raw.githubusercontent.com/lm-sys/arena-hard-auto/e0a8ea1df42c1df76451a6cd04b14e31ff992b87/data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl\n        lines = open(\"gpt-4-0314.jsonl\", mode=\"r\").readlines()\n        for input in inputs:\n            for item in input:\n                for line in lines:\n                    data = json.loads(line)\n                    if data[\"question_id\"] == item[\"question_id\"]:\n                        item[\"generation\"] = data[\"choices\"][0][\"turns\"][0][\"content\"]\n                        item[\"generation_model\"] = data[\"model_id\"]\n                        break\n            yield input\n\n    with Pipeline(name=\"arena-hard-v0.1\") as pipeline:\n        load_dataset = 
LoadDataFromHub(\n            name=\"load_dataset\",\n            repo_id=\"alvarobartt/lmsys-arena-hard-v0.1\",\n            split=\"test\",\n            num_examples=5,\n        )\n\n        load_reference = LoadReference(name=\"load_reference\")\n\n        prepare = PrepareForTextGeneration(name=\"prepare\")\n\n        text_generation_cohere = TextGeneration(\n            name=\"text_generation_cohere\",\n            llm=InferenceEndpointsLLM(\n                model_id=\"CohereForAI/c4ai-command-r-plus\",\n                tokenizer_id=\"CohereForAI/c4ai-command-r-plus\",\n            ),\n            use_system_prompt=True,\n            input_batch_size=10,\n            output_mappings={\"model_name\": \"generation_model\"},\n        )\n\n        combine_columns = GroupColumns(\n            name=\"combine_columns\",\n            columns=[\"generation\", \"generation_model\"],\n            output_columns=[\"generations\", \"generation_models\"],\n        )\n\n        arena_hard = ArenaHard(\n            name=\"arena_hard\",\n            llm=OpenAILLM(model=\"gpt-4-1106-preview\"),\n            output_mappings={\"model_name\": \"evaluation_model\"},\n        )\n\n        keep_columns = KeepColumns(\n            name=\"keep_columns\",\n            columns=[\n                \"question_id\",\n                \"category\",\n                \"cluster\",\n                \"system_prompt\",\n                \"instruction\",\n                \"generations\",\n                \"generation_models\",\n                \"evaluation\",\n                \"score\",\n                \"evaluation_model\",\n            ],\n        )\n\n        win_rates = ArenaHardResults(\n            name=\"win_rates\", custom_model_column=\"generation_models\"\n        )\n\n        load_dataset >> load_reference  # type: ignore\n        load_dataset >> prepare >> text_generation_cohere  # type: ignore\n        (  # type: ignore\n            [load_reference, text_generation_cohere]\n            >> 
combine_columns\n            >> arena_hard\n            >> keep_columns\n            >> win_rates\n        )\n\n        distiset = pipeline.run(\n            parameters={  # type: ignore\n                text_generation_cohere.name: {\n                    \"llm\": {\n                        \"generation_kwargs\": {\n                            \"temperature\": 0.7,\n                            \"max_new_tokens\": 4096,\n                            \"stop_sequences\": [\"<EOS_TOKEN>\", \"<|END_OF_TURN_TOKEN|>\"],\n                        }\n                    }\n                },\n                arena_hard.name: {\n                    \"llm\": {\n                        \"generation_kwargs\": {\n                            \"temperature\": 0.0,\n                            \"max_new_tokens\": 4096,\n                        }\n                    }\n                },\n            },\n        )\n        if distiset is not None:\n            distiset.push_to_hub(\"arena-hard-results\")\n
"},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/","title":"Create a social network with FinePersonas","text":"

In this example, we'll explore the creation of specialized user personas for social network interactions using the FinePersonas-v0.1 dataset from Hugging Face. The final dataset will be ready to fine-tune a chat model with specific traits and characteristics.

"},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#introduction","title":"Introduction","text":"

We'll delve into the process of fine-tuning different LoRA (Low-Rank Adaptation) models to imbue these personas with specific traits and characteristics.

This approach draws inspiration from Michael Sayman's work on SocialAI (visit the profile to see some examples), to leverage FinePersonas-v0.1 for building models that can emulate bots with specific behaviour.

By fine-tuning these adapters, we can potentially create AI personas with distinct characteristics, communication styles, and areas of expertise. The result? AI interactions that feel more natural and tailored to specific contexts or user needs. For those interested in the technical aspects of this approach, we recommend the insightful blog post on Multi-LoRA serving. It provides a clear and comprehensive explanation of the technology behind this innovative method.

Let's jump to the demo.

"},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#creating-our-socialai-task","title":"Creating our SocialAI Task","text":"

Building on the new TextGeneration, creating custom tasks is easier than ever before. This powerful tool opens up a world of possibilities for creating tailored text-based content with ease and precision. We will create a SocialAI task that will be in charge of generating responses to user interactions, taking into account a given follower_type, and use the perspective from a given persona:

from distilabel.steps.tasks import TextGeneration\n\nclass SocialAI(TextGeneration):\n    follower_type: Literal[\"supporter\", \"troll\", \"alarmist\"] = \"supporter\"\n    system_prompt: str = (\n        \"You are an AI assistant expert at simulating user interactions. \"\n        \"You must answer as if you were a '{follower_type}', be concise answer with no more than 200 characters, nothing else.\"\n        \"Here are some traits to use for your personality:\\n\\n\"\n        \"{traits}\"\n    )  #\u00a0(1)\n    template: str = \"You are the folowing persona:\\n\\n{{ persona }}\\n\\nWhat would you say to the following?\\n\\n {{ post }}\"  # (2)\n    columns: str | list[str] = [\"persona\", \"post\"]  # (3)\n\n    _follower_traits: dict[str, str] = {\n        \"supporter\": (\n            \"- Encouraging and positive\\n\"\n            \"- Tends to prioritize enjoyment and relaxation\\n\"\n            \"- Focuses on the present moment and short-term pleasure\\n\"\n            \"- Often uses humor and playful language\\n\"\n            \"- Wants to help others feel good and have fun\\n\"\n        ),\n        \"troll\": (\n            \"- Provocative and confrontational\\n\"\n            \"- Enjoys stirring up controversy and conflict\\n\"\n            \"- Often uses sarcasm, irony, and mocking language\\n\"\n            \"- Tends to belittle or dismiss others' opinions and feelings\\n\"\n            \"- Seeks to get a rise out of others and create drama\\n\"\n        ),\n        \"alarmist\": (\n            \"- Anxious and warning-oriented\\n\"\n            \"- Focuses on potential risks and negative consequences\\n\"\n            \"- Often uses dramatic or sensational language\\n\"\n            \"- Tends to be serious and stern in tone\\n\"\n            \"- Seeks to alert others to potential dangers and protect them from harm (even if it's excessive or unwarranted)\\n\"\n        ),\n    }\n\n    def load(self) -> None:\n        super().load()\n        
self.system_prompt = self.system_prompt.format(\n            follower_type=self.follower_type,\n            traits=self._follower_traits[self.follower_type]\n        )  # (4)\n
  1. We have a custom system prompt that will depend on the follower_type we decide for our model.

  2. The base template or prompt will respond to the post we have, from the point of view of a persona.

  3. We will need our dataset to have both persona and post columns to populate the prompt.

  4. In the load method we place the specific traits for our follower type in the system prompt.

"},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#data-preparation","title":"Data preparation","text":"

This is an example, so let's keep it short. We will use 3 posts, and 3 different types of personas. While there's potential to enhance this process (perhaps by implementing random persona selection or leveraging semantic similarity) we'll opt for a straightforward method in this demonstration.

Our goal is to create a set of nine examples, each pairing a post with a persona. To achieve this, we'll employ an LLM to respond to each post from the perspective of a specific persona, effectively simulating how different characters might engage with the content.

posts = [\n    {\n        \"post\": \"Hmm, ok now I'm torn: should I go for healthy chicken tacos or unhealthy beef tacos for late night cravings?\"\n    },\n    {\n        \"post\": \"I need to develop a training course for my company on communication skills. Need to decide how deliver it remotely.\"\n    },\n    {\n        \"post\": \"I'm always 10 minutes late to meetups but no one's complained. Could this be annoying to them?\"\n    },\n]\n\npersonas = (\n    load_dataset(\"argilla/FinePersonas-v0.1-clustering-100k\", split=\"train\")\n    .shuffle()\n    .select(range(3))\n    .select_columns(\"persona\")\n    .to_list()\n)\n\ndata = []\nfor post in posts:\n    for persona in personas:\n        data.append({\"post\": post[\"post\"], \"persona\": persona[\"persona\"]})\n

Each row will have the following format:

import json\nprint(json.dumps(data[0], indent=4))\n{\n    \"post\": \"Hmm, ok now I'm torn: should I go for healthy chicken tacos or unhealthy beef tacos for late night cravings?\",\n    \"persona\": \"A high school or college environmental science teacher or an ecology student specializing in biogeography and ecosystem dynamics.\"\n}\n

This will be our dataset, that we can ingest using the LoadDataFromDicts:

loader = LoadDataFromDicts(data=data)\n
"},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#simulating-from-different-types-of-followers","title":"Simulating from different types of followers","text":"

With our data in hand, we're ready to explore the capabilities of our SocialAI task. For this demonstration, we'll make use of meta-llama/Meta-Llama-3.1-70B-Instruct. While this model has become something of a go-to choice recently, it's worth noting that experimenting with a variety of models could yield even more interesting results:

from distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    generation_kwargs={\n        \"temperature\": 0.7,\n        \"max_new_tokens\": 256,\n    },\n)\nfollower_type = \"supporter\"\n\nfollower = SocialAI(\n    llm=llm,\n    follower_type=follower_type,\n    name=f\"{follower_type}_user\",\n)\n

This setup simplifies the process: we only need to input the follower type, and the system handles the rest. We could also update this to pick a random type of follower by default, and simulate from a bunch of different personalities.

"},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#building-our-pipeline","title":"Building our Pipeline","text":"

The foundation of our pipeline is now in place. At its core is a single, powerful LLM. This versatile model will be repurposed to drive three distinct SocialAI Tasks, each tailored to a specific TextGeneration task, and each one of them will be prepared for Supervised Fine Tuning using FormatTextGenerationSFT:

with Pipeline(name=\"Social AI Personas\") as pipeline:\n    loader = LoadDataFromDicts(data=data, batch_size=1)\n\n    llm = InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        generation_kwargs={\n            \"temperature\": 0.7,\n            \"max_new_tokens\": 256,\n        },\n    )\n\n    for follower_type in [\"supporter\", \"troll\", \"alarmist\"]:\n        follower = SocialAI(\n            llm=llm,\n            follower_type=follower_type,\n            name=f\"{follower_type}_user\",  # (1)\n            output_mappings={\n                \"generation\": f\"interaction_{follower_type}\"  # (2)\n            }\n        )\n        format_sft = FormatTextGenerationSFT(\n            name=f\"format_sft_{follower_type}\",\n            input_mappings={\n                \"instruction\": \"post\",\n                \"generation\": f\"interaction_{follower_type}\"  # (3)\n            },\n        )\n        loader >> follower >> format_sft  # (4)\n
  1. We update the name of the step to keep track in the pipeline.

  2. The generation column from each LLM will be mapped to avoid them being overridden, as we are reusing the same task.

  3. As we have modified the output column from SocialAI, we redirect each one of the \"follower_type\" responses.

  4. Connect the loader to each one of the follower tasks and format_sft to obtain 3 different subsets.

The outcome of this pipeline will be three specialized models, each fine-tuned to a unique follower type crafted by the SocialAI task. These models will generate SFT-formatted datasets, where each post is paired with its corresponding interaction data for a specific follower type. This setup enables seamless fine-tuning using your preferred framework, such as TRL, or any other training framework of your choice.

"},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#script-and-final-dataset","title":"Script and final dataset","text":"

All the pieces are in place for our script. The full pipeline can be seen here:

Run
python examples/finepersonas_social_ai.py\n
finepersonas_social_ai.py
# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import Literal\n\nfrom datasets import load_dataset\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import FormatTextGenerationSFT, LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\n\nclass SocialAI(TextGeneration):\n    follower_type: Literal[\"supporter\", \"troll\", \"alarmist\"] = \"supporter\"\n    system_prompt: str = (\n        \"You are an AI assistant expert at simulating user interactions. 
\"\n        \"You must answer as if you were a '{follower_type}', be concise answer with no more than 200 characters, nothing else.\"\n        \"Here are some traits to use for your personality:\\n\\n\"\n        \"{traits}\"\n    )\n    template: str = \"You are the folowing persona:\\n\\n{{ persona }}\\n\\nWhat would you say to the following?\\n\\n {{ post }}\"\n    columns: str | list[str] = [\"persona\", \"post\"]\n\n    _follower_traits: dict[str, str] = {\n        \"supporter\": (\n            \"- Encouraging and positive\\n\"\n            \"- Tends to prioritize enjoyment and relaxation\\n\"\n            \"- Focuses on the present moment and short-term pleasure\\n\"\n            \"- Often uses humor and playful language\\n\"\n            \"- Wants to help others feel good and have fun\\n\"\n        ),\n        \"troll\": (\n            \"- Provocative and confrontational\\n\"\n            \"- Enjoys stirring up controversy and conflict\\n\"\n            \"- Often uses sarcasm, irony, and mocking language\\n\"\n            \"- Tends to belittle or dismiss others' opinions and feelings\\n\"\n            \"- Seeks to get a rise out of others and create drama\\n\"\n        ),\n        \"alarmist\": (\n            \"- Anxious and warning-oriented\\n\"\n            \"- Focuses on potential risks and negative consequences\\n\"\n            \"- Often uses dramatic or sensational language\\n\"\n            \"- Tends to be serious and stern in tone\\n\"\n            \"- Seeks to alert others to potential dangers and protect them from harm (even if it's excessive or unwarranted)\\n\"\n        ),\n    }\n\n    def load(self) -> None:\n        super().load()\n        self.system_prompt = self.system_prompt.format(\n            follower_type=self.follower_type,\n            traits=self._follower_traits[self.follower_type],\n        )\n\n\nposts = [\n    {\n        \"post\": \"Hmm, ok now I'm torn: should I go for healthy chicken tacos or unhealthy beef tacos for late night 
cravings?\"\n    },\n    {\n        \"post\": \"I need to develop a training course for my company on communication skills. Need to decide how deliver it remotely.\"\n    },\n    {\n        \"post\": \"I'm always 10 minutes late to meetups but no one's complained. Could this be annoying to them?\"\n    },\n]\n\npersonas = (\n    load_dataset(\"argilla/FinePersonas-v0.1-clustering-100k\", split=\"train\")\n    .shuffle()\n    .select(range(3))\n    .select_columns(\"persona\")\n    .to_list()\n)\n\ndata = []\nfor post in posts:\n    for persona in personas:\n        data.append({\"post\": post[\"post\"], \"persona\": persona[\"persona\"]})\n\n\nwith Pipeline(name=\"Social AI Personas\") as pipeline:\n    loader = LoadDataFromDicts(data=data, batch_size=1)\n\n    llm = InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        generation_kwargs={\n            \"temperature\": 0.7,\n            \"max_new_tokens\": 256,\n        },\n    )\n\n    for follower_type in [\"supporter\", \"troll\", \"alarmist\"]:\n        follower = SocialAI(\n            llm=llm,\n            follower_type=follower_type,\n            name=f\"{follower_type}_user\",\n            output_mappings={\"generation\": f\"interaction_{follower_type}\"},\n        )\n        format_sft = FormatTextGenerationSFT(\n            name=f\"format_sft_{follower_type}\",\n            input_mappings={\n                \"instruction\": \"post\",\n                \"generation\": f\"interaction_{follower_type}\",\n            },\n        )\n        loader >> follower >> format_sft\n\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(use_cache=False)\n    distiset.push_to_hub(\"plaguss/FinePersonas-SocialAI-test\", include_script=True)\n

This is the final toy dataset we obtain: FinePersonas-SocialAI-test

Here is an example of how to load one of the subsets to fine-tune a model:

from datasets import load_dataset\n\nds = load_dataset(\"plaguss/FinePersonas-SocialAI-test\", \"format_sft_troll\")\n

And a sample of the generated field with the corresponding post and persona:

{\n    \"post\": \"Hmm, ok now I\\u0027m torn: should I go for healthy chicken tacos or unhealthy beef tacos for late night cravings?\",\n    \"persona\": \"A high school or undergraduate physics or chemistry teacher, likely with a focus on experimental instruction.\",\n    \"interaction_troll\": \"\\\"Late night cravings? More like late night brain drain. Either way, it\\u0027s just a collision of molecules in your stomach. Choose the one with more calories, at least that\\u0027s some decent kinetic energy.\\\"\",\n}\n

There's a lot of room for improvement, but quite a promising start.

"},{"location":"sections/pipeline_samples/examples/llama_cpp_with_outlines/","title":"Structured generation with outlines","text":"

Generate RPG characters following a pydantic.BaseModel with outlines in distilabel.

This script makes use of LlamaCppLLM and the structured output capabilities thanks to outlines to generate RPG characters that adhere to a JSON schema.

It makes use of a local model which can be downloaded using curl (explained in the script itself), and can be exchanged with other LLMs like vLLM.

Run
python examples/structured_generation_with_outlines.py\n
structured_generation_with_outlines.py
# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom enum import Enum\nfrom pathlib import Path\n\nfrom pydantic import BaseModel, StringConstraints, conint\nfrom typing_extensions import Annotated\n\nfrom distilabel.models import LlamaCppLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\n\nclass Weapon(str, Enum):\n    sword = \"sword\"\n    axe = \"axe\"\n    mace = \"mace\"\n    spear = \"spear\"\n    bow = \"bow\"\n    crossbow = \"crossbow\"\n\n\nclass Armor(str, Enum):\n    leather = \"leather\"\n    chainmail = \"chainmail\"\n    plate = \"plate\"\n    mithril = \"mithril\"\n\n\nclass Character(BaseModel):\n    name: Annotated[str, StringConstraints(max_length=30)]\n    age: conint(gt=1, lt=3000)\n    armor: Armor\n    weapon: Weapon\n\n\n# Download the model with\n# curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nwith Pipeline(\"RPG-characters\") as pipeline:\n    system_prompt = (\n        \"You are a leading role play gamer. 
You have seen thousands of different characters and their attributes.\"\n        \" Please return a JSON object with common attributes of an RPG character.\"\n    )\n\n    load_dataset = LoadDataFromDicts(\n        name=\"load_instructions\",\n        data=[\n            {\n                \"system_prompt\": system_prompt,\n                \"instruction\": f\"Give me a character description for a {char}\",\n            }\n            for char in [\"dwarf\", \"elf\", \"human\", \"ork\"]\n        ],\n    )\n    llm = LlamaCppLLM(\n        model_path=str(Path.home() / model_path),  # type: ignore\n        n_gpu_layers=-1,\n        n_ctx=1024,\n        structured_output={\"format\": \"json\", \"schema\": Character},\n    )\n    # Change to vLLM as such:\n    # llm = vLLM(\n    #     model=\"teknium/OpenHermes-2.5-Mistral-7B\",\n    #     extra_kwargs={\"tensor_parallel_size\": 1},\n    #     structured_output={\"format\": \"json\", \"schema\": Character},\n    # )\n\n    text_generation = TextGeneration(\n        name=\"text_generation_rpg\",\n        llm=llm,\n        input_batch_size=8,\n        output_mappings={\"model_name\": \"generation_model\"},\n    )\n    load_dataset >> text_generation\n\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        parameters={\n            text_generation.name: {\n                \"llm\": {\"generation_kwargs\": {\"max_new_tokens\": 256}}\n            }\n        },\n        use_cache=False,\n    )\n    for num, character in enumerate(distiset[\"default\"][\"train\"][\"generation\"]):\n        print(f\"Character: {num}\")\n        print(character)\n\n# Character: 0\n# {\n# \"name\": \"Gimli\",\n# \"age\": 42,\n# \"armor\": \"plate\",\n# \"weapon\": \"axe\" }\n# Character: 1\n# {\"name\":\"Gaelen\",\"age\":600,\"armor\":\"leather\",\"weapon\":\"bow\"}\n# Character: 2\n# {\"name\": \"John Smith\",\"age\": 35,\"armor\": \"leather\",\"weapon\": \"sword\"}\n# Character: 3\n# { \"name\": \"Grug\", \"age\": 35, \"armor\": 
\"leather\", \"weapon\": \"axe\"}\n
"},{"location":"sections/pipeline_samples/examples/mistralai_with_instructor/","title":"Structured generation with instructor","text":"

Answer instructions with knowledge graphs defined as pydantic.BaseModel objects using instructor in distilabel.

This script makes use of MistralLLM and the structured output capabilities thanks to instructor to generate knowledge graphs from complex topics.

This example is translated from this awesome example from instructor cookbook.

Run
python examples/structured_generation_with_instructor.py\n
structured_generation_with_instructor.py
# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List\n\nfrom pydantic import BaseModel, Field\n\nfrom distilabel.models import MistralLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\n\nclass Node(BaseModel):\n    id: int\n    label: str\n    color: str\n\n\nclass Edge(BaseModel):\n    source: int\n    target: int\n    label: str\n    color: str = \"black\"\n\n\nclass KnowledgeGraph(BaseModel):\n    nodes: List[Node] = Field(..., default_factory=list)\n    edges: List[Edge] = Field(..., default_factory=list)\n\n\nwith Pipeline(\n    name=\"Knowledge-Graphs\",\n    description=(\n        \"Generate knowledge graphs to answer questions, this type of dataset can be used to \"\n        \"steer a model to answer questions with a knowledge graph.\"\n    ),\n) as pipeline:\n    sample_questions = [\n        \"Teach me about quantum mechanics\",\n        \"Who is who in The Simpsons family?\",\n        \"Tell me about the evolution of programming languages\",\n    ]\n\n    load_dataset = LoadDataFromDicts(\n        name=\"load_instructions\",\n        data=[\n            {\n                \"system_prompt\": \"You are a knowledge graph expert generator. 
Help me understand by describing everything as a detailed knowledge graph.\",\n                \"instruction\": f\"{question}\",\n            }\n            for question in sample_questions\n        ],\n    )\n\n    text_generation = TextGeneration(\n        name=\"knowledge_graph_generation\",\n        llm=MistralLLM(\n            model=\"open-mixtral-8x22b\", structured_output={\"schema\": KnowledgeGraph}\n        ),\n        input_batch_size=8,\n        output_mappings={\"model_name\": \"generation_model\"},\n    )\n    load_dataset >> text_generation\n\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        parameters={\n            text_generation.name: {\n                \"llm\": {\"generation_kwargs\": {\"max_new_tokens\": 2048}}\n            }\n        },\n        use_cache=False,\n    )\n\n    distiset.push_to_hub(\"distilabel-internal-testing/knowledge_graphs\")\n
Visualizing the graphs

Want to see how to visualize the graphs? You can test it using the following script. Generate some samples on your own and take a look:

Note

This example uses graphviz to render the graph; you can install it with pip in the following way:

pip install graphviz\n
python examples/draw_kg.py 2  # You can pass 0,1,2 to visualize each of the samples.\n

"},{"location":"sections/pipeline_samples/papers/apigen/","title":"Create Function-Calling datasets with APIGen","text":"

This example will introduce APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets, a data generation pipeline designed to synthesize verifiable high-quality datasets for function-calling applications.

"},{"location":"sections/pipeline_samples/papers/apigen/#replication","title":"Replication","text":"

The following figure showcases the APIGen framework:

Now, let's walk through the key steps illustrated in the figure:

  • DataSampler: With the help of this step and the original Salesforce/xlam-function-calling-60k we are getting the Seed QA Data Sampler for the prompt template.

  • APIGenGenerator: This step does the job of the Query-Answer Generator, including the format checker from Stage 1: Format Checker thanks to the structured output generation.

  • APIGenExecutionChecker: This step is in charge of the Stage 2: Execution Checker.

  • APIGenSemanticChecker: Step in charge of running Stage 3: Semantic Checker. It can use the same or a different LLM; we are using the same one as in the APIGenGenerator step.

The current implementation hasn't utilized the Diverse Prompt Library. To incorporate it, one could either adjust the prompt template within the APIGenGenerator or develop a new sampler specifically for this purpose. As for the API Sampler, while no specific data is shared here, we've created illustrative examples to demonstrate the pipeline's functionality. These examples represent a mix of data that could be used to replicate the sampler's output.

"},{"location":"sections/pipeline_samples/papers/apigen/#data-preparation","title":"Data preparation","text":"

The original paper describes the data its authors used and gives some hints, but the data itself was not shared. In this example, we will write a handful of examples by hand to showcase how this pipeline can be built.

Assume we have the following function names, and corresponding descriptions of their behaviour:

data = [\n    {\n        \"func_name\": \"final_velocity\",\n        \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n    },\n    {\n        \"func_name\": \"permutation_count\",\n        \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n    },\n    {\n        \"func_name\": \"getdivision\",\n        \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n    },\n    {\n        \"func_name\": \"binary_addition\",\n        \"func_desc\": \"Adds two binary numbers and returns the result as a binary string.\",\n    },\n    {\n        \"func_name\": \"swapi_planet_resource\",\n        \"func_desc\": \"get a specific planets resource\",\n    },\n    {\n        \"func_name\": \"disney_character\",\n        \"func_desc\": \"Find a specific character using this endpoint\",\n    }\n]\n

The original paper refers to both Python functions and APIs, but we will make use of Python functions exclusively for simplicity. In order to execute and check these functions/APIs, we need access to the code, which we have moved to a Python file: lib_apigen.py. All these functions are executable, but we also need access to their tool representation. For this, we will make use of transformers' get_json_schema function1.

We have all the machinery prepared in our libpath, except for the tool definition. With the help of our helper function load_module_from_path we will load this Python module, collect all the tools, and add them to each row in our data variable.

from distilabel.steps.tasks.apigen.utils import load_module_from_path\n\nlibpath_module = load_module_from_path(libpath)\ntools = getattr(libpath_module, \"get_tools\")()  # call get_tools()\n\nfor row in data:\n    #\u00a0The tools should have a mix where both the correct and irrelevant tools are present.\n    row.update({\"tools\": [tools[row[\"func_name\"]]]})\n

Now we have all the necessary data for our prompt. Additionally, we will make use of the original dataset as few-shot examples to enhance the model:

ds_og = (\n    load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n    .shuffle(seed=42)\n    .select(range(500))\n    .to_list()\n)\n

We have just loaded a subset and transformed it to a list of dictionaries, as we will use it in the DataSampler GeneratorStep, grabbing random examples from the original dataset.

"},{"location":"sections/pipeline_samples/papers/apigen/#building-the-pipeline","title":"Building the Pipeline","text":"

Now that we've walked through each component, it's time to see how it all comes together, here's the Pipeline code:

with Pipeline(name=\"apigen-example\") as pipeline:\n    loader_seeds = LoadDataFromDicts(data=data)  # (1)\n\n    sampler = DataSampler(  # (2)\n        data=ds_og,\n        size=2,\n        samples=len(data),\n        batch_size=8,\n    )\n\n    prep_examples = PrepareExamples()  # This step will add the 'examples' column\n\n    combine_steps = CombineOutputs()  # (3)\n\n    model_id = \"meta-llama/Meta-Llama-3.1-70B-Instruct\"\n    llm=InferenceEndpointsLLM(  # (4)\n        model_id=model_id,\n        tokenizer_id=model_id,\n        generation_kwargs={\n            \"temperature\": 0.7,\n            \"max_new_tokens\": 2048,\n        },\n    )\n    apigen = APIGenGenerator(  # (5)\n        llm=llm,\n        use_default_structured_output=True,\n    )\n\n    execution_checker = APIGenExecutionChecker(libpath=str(libpath))  # (6)\n    semantic_checker = APIGenSemanticChecker(llm=llm)  # (7)\n\n    sampler >> prep_examples\n    (\n        [loader_seeds, prep_examples] \n        >> combine_steps \n        >> apigen\n        >> execution_checker\n        >> semantic_checker\n    )\n
  1. Load the data seeds we are going to use to generate our function calling dataset.

  2. The DataSampler together with PrepareExamples will be used to help us create the few-shot examples from the original dataset to be fed in our prompt.

  3. Combine both columns to obtain a single stream of data

  4. Will reuse the same LLM for the generation and the semantic checks.

  5. Creates the query and answers that will be used together with the tools to fine-tune a new model. Will generate the structured outputs to ensure we have valid JSON formatted answers.

  6. Adds columns keep_row_after_execution_check and execution_result.

  7. Adds columns keep_row_after_semantic_check and thought.

"},{"location":"sections/pipeline_samples/papers/apigen/#script-and-final-dataset","title":"Script and final dataset","text":"

To see all the pieces in place, take a look at the full pipeline, as well as an example row that would be generated from this pipeline.

Run
python examples/pipeline_apigen.py\n
pipeline_apigen.py
# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom pathlib import Path\n\nfrom datasets import load_dataset\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import CombineOutputs, DataSampler, LoadDataFromDicts\nfrom distilabel.steps.tasks import (\n    APIGenExecutionChecker,\n    APIGenGenerator,\n    APIGenSemanticChecker,\n)\nfrom distilabel.steps.tasks.apigen.utils import PrepareExamples, load_module_from_path\n\nlibpath = Path(__file__).parent / \"lib_apigen.py\"\n\ndata = [\n    {\n        \"func_name\": \"final_velocity\",\n        \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n    },\n    {\n        \"func_name\": \"permutation_count\",\n        \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n    },\n    {\n        \"func_name\": \"getdivision\",\n        \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n    },\n    {\n        \"func_name\": \"binary_addition\",\n        \"func_desc\": \"Adds two binary numbers and returns the result as a binary string.\",\n    },\n    {\n        \"func_name\": \"swapi_planet_resource\",\n        \"func_desc\": \"get a specific planets resource\",\n    },\n    {\n        \"func_name\": \"disney_character\",\n        \"func_desc\": \"Find a specific 
character using this endpoint\",\n    },\n]\n\nlibpath_module = load_module_from_path(libpath)\ntools = libpath_module.get_tools()  # call get_tools()\n\n# TODO: Add in the tools between 0 and 2 extra tools to make the task more challenging.\nfor row in data:\n    # The tools should have a mix where both the correct and irrelevant tools are present.\n    row.update({\"tools\": [tools[row[\"func_name\"]]]})\n\n\nds_og = (\n    load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n    .shuffle(seed=42)\n    .select(range(500))\n    .to_list()\n)\n\n\nwith Pipeline(name=\"APIGenPipeline\") as pipeline:\n    loader_seeds = LoadDataFromDicts(data=data)\n    sampler = DataSampler(\n        data=ds_og,\n        size=2,\n        samples=len(data),\n        batch_size=8,\n    )\n\n    prep_examples = PrepareExamples()\n\n    model_id = \"meta-llama/Meta-Llama-3.1-70B-Instruct\"\n    llm = InferenceEndpointsLLM(\n        model_id=model_id,\n        tokenizer_id=model_id,\n        generation_kwargs={\n            \"temperature\": 0.7,\n            \"max_new_tokens\": 2048,\n        },\n    )\n    apigen = APIGenGenerator(\n        llm=llm,\n        use_default_structured_output=True,\n    )\n    combine_steps = CombineOutputs()\n\n    execution_checker = APIGenExecutionChecker(libpath=str(libpath))\n    semantic_checker = APIGenSemanticChecker(llm=llm)\n\n    sampler >> prep_examples\n    (\n        [loader_seeds, prep_examples]\n        >> combine_steps\n        >> apigen\n        >> execution_checker\n        >> semantic_checker\n    )\n\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run()\n    print(distiset[\"default\"][\"train\"][0])\n

Example row:

{\n  \"func_name\": \"final_velocity\",\n  \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n  \"tools\": [\n    {\n      \"function\": {\n        \"description\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n        \"name\": \"final_velocity\",\n        \"parameters\": {\n          \"properties\": {\n            \"acceleration\": {\n              \"description\": \"The acceleration of the object.\",\n              \"type\": \"number\"\n            },\n            \"initial_velocity\": {\n              \"description\": \"The initial velocity of the object.\",\n              \"type\": \"number\"\n            },\n            \"time\": {\n              \"description\": \"The time elapsed.\",\n              \"type\": \"number\"\n            }\n          },\n          \"required\": [\n            \"initial_velocity\",\n            \"acceleration\",\n            \"time\"\n          ],\n          \"type\": \"object\"\n        }\n      },\n      \"type\": \"function\"\n    }\n  ],\n  \"examples\": \"## Query:\\nRetrieve the first 15 comments for post ID '12345' from the Tokapi mobile API.\\n## Answers:\\n[{\\\"name\\\": \\\"v1_post_post_id_comments\\\", \\\"arguments\\\": {\\\"post_id\\\": \\\"12345\\\", \\\"count\\\": 15}}]\\n\\n## Query:\\nRetrieve the detailed recipe for the cake with ID 'cake101'.\\n## Answers:\\n[{\\\"name\\\": \\\"detailed_cake_recipe_by_id\\\", \\\"arguments\\\": {\\\"is_id\\\": \\\"cake101\\\"}}]\\n\\n## Query:\\nWhat are the frequently asked questions and their answers for Coca-Cola Company? 
Also, what are the suggested tickers based on Coca-Cola Company?\\n## Answers:\\n[{\\\"name\\\": \\\"symbols_faq\\\", \\\"arguments\\\": {\\\"ticker_slug\\\": \\\"KO\\\"}}, {\\\"name\\\": \\\"symbols_suggested\\\", \\\"arguments\\\": {\\\"ticker_slug\\\": \\\"KO\\\"}}]\",\n  \"query\": \"What would be the final velocity of an object that starts at rest and accelerates at 9.8 m/s^2 for 10 seconds.\",\n  \"answers\": \"[{\\\"arguments\\\": {\\\"acceleration\\\": \\\"9.8\\\", \\\"initial_velocity\\\": \\\"0\\\", \\\"time\\\": \\\"10\\\"}, \\\"name\\\": \\\"final_velocity\\\"}]\",\n  \"distilabel_metadata\": {\n    \"raw_input_a_p_i_gen_generator_0\": [\n      {\n        \"content\": \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\\n\\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.\\n\\nEnsure the query:\\n- Is clear and concise\\n- Demonstrates typical use cases\\n- Includes all necessary parameters in a meaningful way. 
For numerical parameters, it could be either numbers or words\\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\\n- The corresponding result's parameter types and ranges match with the function's descriptions\\n\\nEnsure the answer:\\n- Is a list of function calls in JSON format\\n- The length of the answer list should be equal to the number of requests in the query\\n- Can solve all the requests in the query effectively\",\n        \"role\": \"system\"\n      },\n      {\n        \"content\": \"Here are examples of queries and the corresponding answers for similar functions:\\n## Query:\\nRetrieve the first 15 comments for post ID '12345' from the Tokapi mobile API.\\n## Answers:\\n[{\\\"name\\\": \\\"v1_post_post_id_comments\\\", \\\"arguments\\\": {\\\"post_id\\\": \\\"12345\\\", \\\"count\\\": 15}}]\\n\\n## Query:\\nRetrieve the detailed recipe for the cake with ID 'cake101'.\\n## Answers:\\n[{\\\"name\\\": \\\"detailed_cake_recipe_by_id\\\", \\\"arguments\\\": {\\\"is_id\\\": \\\"cake101\\\"}}]\\n\\n## Query:\\nWhat are the frequently asked questions and their answers for Coca-Cola Company? 
Also, what are the suggested tickers based on Coca-Cola Company?\\n## Answers:\\n[{\\\"name\\\": \\\"symbols_faq\\\", \\\"arguments\\\": {\\\"ticker_slug\\\": \\\"KO\\\"}}, {\\\"name\\\": \\\"symbols_suggested\\\", \\\"arguments\\\": {\\\"ticker_slug\\\": \\\"KO\\\"}}]\\n\\nNote that the query could be interpreted as a combination of several independent requests.\\n\\nBased on these examples, generate 1 diverse query and answer pairs for the function `final_velocity`.\\nThe detailed function description is the following:\\nCalculates the final velocity of an object given its initial velocity, acceleration, and time.\\n\\nThese are the available tools to help you:\\n[{'type': 'function', 'function': {'name': 'final_velocity', 'description': 'Calculates the final velocity of an object given its initial velocity, acceleration, and time.', 'parameters': {'type': 'object', 'properties': {'initial_velocity': {'type': 'number', 'description': 'The initial velocity of the object.'}, 'acceleration': {'type': 'number', 'description': 'The acceleration of the object.'}, 'time': {'type': 'number', 'description': 'The time elapsed.'}}, 'required': ['initial_velocity', 'acceleration', 'time']}}}]\\n\\nThe output MUST strictly adhere to the following JSON format, and NO other text MUST be included:\\n```json\\n[\\n   {\\n       \\\"query\\\": \\\"The generated query.\\\",\\n       \\\"answers\\\": [\\n           {\\n               \\\"name\\\": \\\"api_name\\\",\\n               \\\"arguments\\\": {\\n                   \\\"arg_name\\\": \\\"value\\\"\\n                   ... (more arguments as required)\\n               }\\n           },\\n           ... 
(more API calls as required)\\n       ]\\n   }\\n]\\n```\\n\\nNow please generate 1 diverse query and answer pairs following the above format.\",\n        \"role\": \"user\"\n      }\n    ],\n    \"raw_input_a_p_i_gen_semantic_checker_0\": [\n      {\n        \"content\": \"As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\",\n        \"role\": \"system\"\n      },\n      {\n        \"content\": \"Given Information:\\n- All Available Functions:\\nCalculates the final velocity of an object given its initial velocity, acceleration, and time.\\n- User Query: What would be the final velocity of an object that starts at rest and accelerates at 9.8 m/s^2 for 10 seconds.\\n- Generated Function Calls: [{\\\"arguments\\\": {\\\"acceleration\\\": \\\"9.8\\\", \\\"initial_velocity\\\": \\\"0\\\", \\\"time\\\": \\\"10\\\"}, \\\"name\\\": \\\"final_velocity\\\"}]\\n- Execution Results: ['9.8']\\n\\nNote: The query may have multiple intentions. 
Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query's intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n```\\n{\\n   \\\"thought\\\": \\\"Concisely describe your reasoning here\\\",\\n   \\\"passes\\\": \\\"yes\\\" or \\\"no\\\"\\n}\\n```\\n\",\n        \"role\": \"user\"\n      }\n    ],\n    \"raw_output_a_p_i_gen_generator_0\": \"{\\\"pairs\\\": [\\n   {\\n       \\\"answers\\\": [\\n           {\\n               \\\"arguments\\\": {\\n                   \\\"acceleration\\\": \\\"9.8\\\",\\n                   \\\"initial_velocity\\\": \\\"0\\\",\\n                   \\\"time\\\": \\\"10\\\"\\n               },\\n               \\\"name\\\": \\\"final_velocity\\\"\\n           }\\n       ],\\n       \\\"query\\\": \\\"What would be the final velocity of an object that starts at rest and accelerates at 9.8 m/s^2 for 10 seconds.\\\"\\n   }\\n]}\",\n    \"raw_output_a_p_i_gen_semantic_checker_0\": \"{\\n   \\\"thought\\\": \\\"\\\",\\n   \\\"passes\\\": \\\"yes\\\"\\n}\"\n  },\n  \"model_name\": \"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n  \"keep_row_after_execution_check\": true,\n  \"execution_result\": [\n    \"9.8\"\n  ],\n  \"thought\": \"\",\n  \"keep_row_after_semantic_check\": true\n}\n
  1. Read this nice blog post for more information on tools and the reasoning behind get_json_schema: Tool Use, Unified.\u00a0\u21a9

"},{"location":"sections/pipeline_samples/papers/clair/","title":"Contrastive Learning From AI Revisions (CLAIR)","text":"

\"Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment\" introduces both Contrastive Learning from AI Revisions (CLAIR), a data-creation method which leads to more contrastive preference pairs, and Anchored Preference Optimization (APO), a controllable and more stable alignment objective. While APO can be found in TRL, we have implemented a task for CLAIR in distilabel.

CLAIR is a method for creating preference pairs which minimally revises one output to express a preference, resulting in a more precise learning signal as opposed to conventional methods which use a judge to select a preferred response.

The authors of the original paper shared a collection of datasets from CLAIR and APO, where ContextualAI/ultrafeedback_clair_32k corresponds to the CLAIR implementation.

"},{"location":"sections/pipeline_samples/papers/clair/#replication","title":"Replication","text":"

Note

The section is named Replication, but in this case we are showing how to use the CLAIR task to create revisions for your generations using distilabel.

To showcase CLAIR we will be using the CLAIR task implemented in distilabel and we are reusing a small sample of the already generated dataset by ContextualAI ContextualAI/ultrafeedback_clair_32k for testing.

"},{"location":"sections/pipeline_samples/papers/clair/#installation","title":"Installation","text":"

To reproduce the code below, one will need to install distilabel as follows:

pip install \"distilabel>=1.4.0\"\n

Depending on the LLM provider you want to use, the requirements may vary; take a look at the dependencies in that case. For this example we are using the free inference endpoints from Hugging Face, but that won't be suitable for a bigger dataset.

"},{"location":"sections/pipeline_samples/papers/clair/#building-blocks","title":"Building blocks","text":"

In this case where we already have instructions and their generations, we will just need to load the data and the corresponding CLAIR task for the revisions:

  • CLAIR to generate the revisions.
"},{"location":"sections/pipeline_samples/papers/clair/#code","title":"Code","text":"

Let's see the full pipeline applied to ContextualAI/ultrafeedback_clair_32k in distilabel:

from typing import Any, Dict\n\nfrom datasets import load_dataset\n\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import CLAIR\nfrom distilabel.models import InferenceEndpointsLLM\n\n\ndef transform_ultrafeedback(example: Dict[str, Any]) -> Dict[str, Any]:\n    return {\n        \"task\": example[\"prompt\"],\n        \"student_solution\": example[\"rejected\"][1][\"content\"],\n    }\n\ndataset = (\n    load_dataset(\"ContextualAI/ultrafeedback_clair_32k\", split=\"train\")\n    .select(range(10))             #\u00a0We collect just 10 examples\n    .map(transform_ultrafeedback)  # Apply the transformation to get just the text\n)\n\nwith Pipeline(name=\"CLAIR UltraFeedback sample\") as pipeline:\n    clair = CLAIR(  # (1)\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            generation_kwargs={\n                \"temperature\": 0.7,\n                \"max_new_tokens\": 4096\n            }\n        )\n    )\n\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(dataset=dataset)  # (2)\n    distiset.push_to_hub(repo_id=\"username/clair-test\", include_script=True)  # (3)\n
  1. This Pipeline uses just CLAIR because we already have the generations, but one can just include a first task to create generations from instructions, and then the revisions with CLAIR.

  2. Include the dataset directly in the run method for simplicity.

  3. Push the distiset to the hub with the script for reproducibility.

An example dataset can be found at: distilabel-internal-testing/clair-test.

"},{"location":"sections/pipeline_samples/papers/deepseek_prover/","title":"DeepSeek Prover","text":"

\"DeepSeek-Prover: Advancing Theorem Proving in LLMs through Large-Scale Synthetic Data\" presents an approach to generate mathematical proofs for theorems generated from informal math problems. This approach shows promising results to advance the capabilities of models towards theorem proving using synthetic data. Until this moment the dataset and the model trained on top of it haven't been opened, let's see how the approach works to reproduce the pipeline using distilabel. The following figure depicts the approach taken to generate the dataset:

The authors propose a method for generating Lean 4 proof data from informal mathematical problems. Their approach translates high-school and undergraduate-level mathematical competition problems into formal statements.

Here we show how to deal with steps 1 and 2, but the authors ensure the theorems are checked using the lean4 program on the generated proofs, and iterate for a series of steps, fine-tuning a model on the synthetic data (DeepSeek prover 7B), regenerating the dataset, and continuing the process until no further improvement is found.

"},{"location":"sections/pipeline_samples/papers/deepseek_prover/#replication","title":"Replication","text":"

Note

The section is named Replication but we will show how we can use distilabel to create the different steps outlined in the DeepSeek-Prover approach. We intentionally left some steps out of the pipeline, but this can easily be extended.

We will define the components needed to generate a dataset like the one depicted in the previous figure (we won't call lean4 or do the fine-tuning, this last step can be done outside of distilabel). The different blocks will have all the docstrings as we would have in the internal steps to showcase how they are done, but they can be omitted for brevity.

"},{"location":"sections/pipeline_samples/papers/deepseek_prover/#installation","title":"Installation","text":"

To reproduce the code below, we need to install distilabel as follows:

pip install \"distilabel[hf-inference-endpoints]\"\n

We have decided to use InferenceEndpointsLLM, but any other provider with a strong model could work.

"},{"location":"sections/pipeline_samples/papers/deepseek_prover/#building-blocks","title":"Building blocks","text":"

There are three components we need to define for this pipeline, matching the different components in the paper: a task to formalize the original statements, another one to assess the relevance of the theorems, and a final one to generate proofs for the theorems.

Note

We will use the same LLM for all the tasks, so we will define it once and reuse it for the different tasks:

llm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n)\n
"},{"location":"sections/pipeline_samples/papers/deepseek_prover/#deepseekproverautoformalization","title":"DeepSeekProverAutoFormalization","text":"

This Task corresponds to the first step in the figure. Given an informal statement, it will formalize it for us in Lean 4 language, meaning it will translate from an informal statement that could be gathered from the internet, to the lean4 structured language.

DeepSeekProverAutoFormalization
_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX = r\"```lean4(.*?)```\"\n\ntemplate_deepseek_prover_auto_formalization = \"\"\"\\\nMathematical Problem in Natural Language:\n{{ informal_statement }}\n{%- if few_shot %}\n\nPlease use the following examples to guide you with the answer:\n{%- for example in examples %}\n- {{ example }}\n{%- endfor %}\n{% endif -%}\"\"\"\n\n\nclass DeepSeekProverAutoFormalization(Task):\n    examples: Optional[List[str]] = None\n    system_prompt: str = \"Translate the problem to Lean 4 (only the core declaration):\\n```lean4\\nformal statement goes here\\n```\"\n    _template: Union[Template, None] = PrivateAttr(...)\n    _few_shot: bool = PrivateAttr(default=False)\n\n    def load(self) -> None:\n        super().load()\n        self._template = Template(template_deepseek_prover_auto_formalization)\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"informal_statement\"]\n\n    @property\n    def outputs(self):\n        return [\"formal_statement\", \"model_name\"]\n\n    def format_input(self, input: str) -> ChatType:  # type: ignore\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": self.system_prompt,\n            },\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(\n                    informal_statement=input[self.inputs[0]],\n                    few_shot=bool(self.examples),\n                    examples=self.examples,\n                ),\n            },\n        ]\n\n    @override\n    def format_output(  # type: ignore\n        self, output: Union[str, None], input: Dict[str, Any] = None\n    ) -> Dict[str, Any]:  # type: ignore\n        match = re.search(_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX, output, re.DOTALL)\n        if match:\n            match = match.group(1).strip()\n        return {\"formal_statement\": match}\n

According to the paper, the model yields better results if it uses examples in a few-shot setting, so this class accepts some examples to help in generating the formulation. Let's see an example of how we can instantiate it:

from textwrap import dedent\n\nexamples = [\n    dedent(\"\"\"\n    ## Statement in natural language:\n    For real numbers k and x:\n    If x is equal to (13 - \u221a131) / 4, and\n    If the equation 2x\u00b2 - 13x + k = 0 is satisfied,\n    Then k must be equal to 19/4.\n    ## Formalized:\n    theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n        (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n    dedent(\"\"\"\n    ## Statement in natural language:\n    The greatest common divisor (GCD) of 20 factorial (20!) and 200,000 is equal to 40,000.\n    ## Formalized:\n    theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n        (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n    dedent(\"\"\"\n    ## Statement in natural language:\n    Given two integers x and y:\n    If y is positive (greater than 0),\n    And y is less than x,\n    And the equation x + y + xy = 80 is true,\n    Then x must be equal to 26.\n    ## Formalized:\n    theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n        (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n]\n\nauto_formalization = DeepSeekProverAutoFormalization(\n    name=\"auto_formalization\",\n    input_batch_size=8,\n    llm=llm,\n    examples=examples\n)\n
"},{"location":"sections/pipeline_samples/papers/deepseek_prover/#deepseekproverscorer","title":"DeepSeekProverScorer","text":"

The next Task corresponds to the second step, the model scoring and assessment. It uses an LLM as judge to evaluate the relevance of the theorem, and assigns a score so it can be filtered afterwards.

DeepSeekProverScorer
template_deepseek_prover_scorer = \"\"\"\\\nTo evaluate whether a formal Lean4 statement will be of interest to the community, consider the following criteria:\n\n1. Relevance to Current Research: Does the statement address a problem or concept that is actively being researched in mathematics or related fields? Higher relevance scores indicate greater potential interest.\n2. Complexity and Depth: Is the statement complex enough to challenge existing theories and methodologies, yet deep enough to provide significant insights or advancements? Complexity and depth showcase Lean4's capabilities and attract interest.\n3. Interdisciplinary Potential: Does the statement offer opportunities for interdisciplinary research, connecting mathematics with other fields such as computer science, physics, or biology? Interdisciplinary projects often garner wide interest.\n4. Community Needs and Gaps: Does the statement fill an identified need or gap within the Lean4 community or the broader mathematical community? Addressing these needs directly correlates with interest.\n5. Innovativeness: How innovative is the statement? Does it propose new methods, concepts, or applications? Innovation drives interest and engagement.\n\nCustomize your evaluation for each problem accordingly, assessing it as 'excellent', 'good', 'above average', 'fair' or 'poor'.\n\nYou should respond in the following format for each statement:\n\n'''\nNatural language: (Detailed explanation of the informal statement, including any relevant background information, assumptions, and definitions.)\nAnalysis: (Provide a brief justification for each score, highlighting why the statement scored as it did across the criteria.)\nAssessment: (Based on the criteria, rate the statement as 'excellent', 'good', 'above average', 'fair' or 'poor'. 
JUST the Assessment.)\n'''\"\"\"\n\nclass DeepSeekProverScorer(Task):\n    _template: Union[Template, None] = PrivateAttr(...)\n\n    def load(self) -> None:\n        super().load()\n        self._template = Template(template_deepseek_prover_scorer)\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"informal_statement\", \"formal_statement\"]\n\n    @property\n    def outputs(self):\n        return [\"natural_language\", \"analysis\", \"assessment\", \"model_name\"]\n\n    def format_input(self, input: str) -> ChatType:\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": self._template.render(),\n            },\n            {\n                \"role\": \"user\",\n                \"content\": f\"## Informal statement:\\n{input[self.inputs[0]]}\\n\\n ## Formal statement:\\n{input[self.inputs[1]]}\",\n            },\n        ]\n\n    @override\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any] = None\n    ) -> Dict[str, Any]:\n        try:\n            result = output.split(\"Natural language:\")[1].strip()\n            natural_language, analysis = result.split(\"Analysis:\")\n            analysis, assessment = analysis.split(\"Assessment:\")\n            natural_language = natural_language.strip()\n            analysis = analysis.strip()\n            assessment = assessment.strip()\n        except Exception:\n            natural_language = analysis = assessment = None\n\n        return {\n            \"natural_language\": natural_language,\n            \"analysis\": analysis,\n            \"assessment\": assessment\n        }\n
"},{"location":"sections/pipeline_samples/papers/deepseek_prover/#deepseekproversolver","title":"DeepSeekProverSolver","text":"

The last task is in charge of generating a proof for the theorems generated in the previous steps.

DeepSeekProverSolver
class DeepSeekProverSolver(Task):\n    system_prompt: str = (\n        \"You are an expert in proving mathematical theorems formalized in lean4 language. \"\n        \"Your answers consist just in the proof to the theorem given, and nothing else.\"\n    )\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"formal_statement\"]\n\n    @property\n    def outputs(self):\n        return [\"proof\"]\n\n    def format_input(self, input: str) -> ChatType:\n        prompt = dedent(\"\"\"\n            Give me a proof for the following theorem:\n            ```lean4\n            {theorem}\n            ```\"\"\"\n        )\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": self.system_prompt,\n            },\n            {\n                \"role\": \"user\",\n                \"content\": prompt.format(theorem=input[\"formal_statement\"]),\n            },\n        ]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any] = None\n    ) -> Dict[str, Any]:\n        import re\n        match = re.search(_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX, output, re.DOTALL)\n        if match:\n            match = match.group(1).strip()\n        return {\"proof\": match}\n

Additionally, the original pipeline defined in the paper includes a step to check the final proofs using the Lean 4 language, which we have omitted for simplicity. The fine-tuning can be done completely offline, returning to the pipeline after each iteration/training run.

All the docstrings have been removed from the code blocks, but can be seen in the full pipeline.

"},{"location":"sections/pipeline_samples/papers/deepseek_prover/#code","title":"Code","text":"

Let's put the building blocks together to create the final pipeline with distilabel. For this example we have generated a sample dataset plaguss/informal-mathematical-statements-tiny of informal mathematical statements starting from casey-martin/multilingual-mathematical-autoformalization, but as the paper mentions, we can create formal statements and their corresponding proofs starting from informal ones:

Click to see the full pipeline deepseek_prover.py
# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport re\nfrom pathlib import Path\nfrom textwrap import dedent\nfrom typing import Any, Dict, List, Optional, Union\n\nfrom jinja2 import Template\nfrom pydantic import PrivateAttr\nfrom typing_extensions import override\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks.base import Task\nfrom distilabel.steps.tasks.typing import ChatType\n\n_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX = r\"```lean4(.*?)```\"\n\n\ntemplate_deepseek_prover_auto_formalization = \"\"\"\\\nMathematical Problem in Natural Language:\n{{ informal_statement }}\n{%- if few_shot %}\n\nPlease use the following examples to guide you with the answer:\n{%- for example in examples %}\n- {{ example }}\n{%- endfor %}\n{% endif -%}\"\"\"\n\n\nclass DeepSeekProverAutoFormalization(Task):\n    \"\"\"Task to translate a mathematical problem from natural language to Lean 4.\n\n    Note:\n        A related dataset (MMA from the paper) can be found in Hugging Face:\n        [casey-martin/multilingual-mathematical-autoformalization](https://huggingface.co/datasets/casey-martin/multilingual-mathematical-autoformalization).\n\n    Input columns:\n        - informal_statement (`str`): The statement to be formalized using Lean 4.\n\n    Output columns:\n        - formal_statement (`str`): The 
formalized statement using Lean 4, to be analysed.\n\n    Categories:\n        - generation\n\n    References:\n        - [`DeepSeek-Prover: Advancing Theorem Proving in LLMs through Large-Scale Synthetic Data`](https://arxiv.org/abs/2405.14333).\n        - [`Lean 4`](https://github.com/leanprover/lean4).\n\n    Examples:\n\n        Formalize a mathematical problem from natural language to Lean 4:\n\n        ```python\n        from distilabel.steps.tasks import DeepSeekProverAutoFormalization\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        prover_autoformal = DeepSeekProverAutoFormalization(\n            llm=InferenceEndpointsLLM(\n                model_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n                tokenizer_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n            ),\n        )\n\n        prover_autoformal.load()\n\n        result = next(\n            prover_autoformal.process(\n                [\n                    {\"informal_statement\": \"If a polynomial g is monic, then the root of g is integral over the ring R.\"},\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'informal_statement': 'If a polynomial g is monic, then the root of g is integral over the ring R.',\n        #         'formal_statement': 'theorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=',\n        #         'distilabel_metadata': {\n        #             'raw_output_deep_seek_prover_auto_formalization_0': '```lean4\\ntheorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=\\n```'\n        #         },\n        #         'model_name': 'deepseek-prover'\n        #     }\n        # ]\n        ```\n\n        Use a few-shot setting to formalize a mathematical problem from natural language to Lean 4:\n\n        ```python\n        from distilabel.steps.tasks import DeepSeekProverAutoFormalization\n        from 
distilabel.models import InferenceEndpointsLLM\n\n        # You can gain inspiration from the following examples to create your own few-shot examples:\n        # https://github.com/yangky11/miniF2F-lean4/blob/main/MiniF2F/Valid.lean\n        # Consider this as a placeholder for your actual LLM.\n        prover_autoformal = DeepSeekProverAutoFormalization(\n            llm=InferenceEndpointsLLM(\n                model_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n                tokenizer_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n            ),\n            examples=[\n                \"theorem amc12a_2019_p21 (z : \u2102) (h\u2080 : z = (1 + Complex.I) / Real.sqrt 2) :\\n\\n((\u2211 k : \u2124 in Finset.Icc 1 12, z ^ k ^ 2) * (\u2211 k : \u2124 in Finset.Icc 1 12, 1 / z ^ k ^ 2)) = 36 := by\\n\\nsorry\",\n                \"theorem amc12a_2015_p10 (x y : \u2124) (h\u2080 : 0 < y) (h\u2081 : y < x) (h\u2082 : x + y + x * y = 80) : x = 26 := by\\n\\nsorry\"\n            ]\n        )\n\n        prover_autoformal.load()\n\n        result = next(\n            prover_autoformal.process(\n                [\n                    {\"informal_statement\": \"If a polynomial g is monic, then the root of g is integral over the ring R.\"},\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'informal_statement': 'If a polynomial g is monic, then the root of g is integral over the ring R.',\n        #         'formal_statement': 'theorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=',\n        #         'distilabel_metadata': {\n        #             'raw_output_deep_seek_prover_auto_formalization_0': '```lean4\\ntheorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=\\n```'\n        #         },\n        #         'model_name': 'deepseek-prover'\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    examples: Optional[List[str]] = None\n    system_prompt: str = \"Translate the problem to 
Lean 4 (only the core declaration):\\n```lean4\\nformal statement goes here\\n```\"\n    _template: Union[Template, None] = PrivateAttr(...)\n    _few_shot: bool = PrivateAttr(default=False)\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template.\"\"\"\n        super().load()\n\n        self._template = Template(template_deepseek_prover_auto_formalization)\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task is the `instruction`.\"\"\"\n        return [\"informal_statement\"]\n\n    @property\n    def outputs(self):\n        \"\"\"The output for the task is a list of `instructions` containing the generated instructions.\"\"\"\n        return [\"formal_statement\", \"model_name\"]\n\n    def format_input(self, input: str) -> ChatType:  # type: ignore\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation. And the\n        `system_prompt` is added as the first message if it exists.\"\"\"\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": self.system_prompt,\n            },\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(\n                    informal_statement=input[self.inputs[0]],\n                    few_shot=bool(self.examples),\n                    examples=self.examples,\n                ),\n            },\n        ]\n\n    @override\n    def format_output(  # type: ignore\n        self, output: Union[str, None], input: Dict[str, Any] = None\n    ) -> Dict[str, Any]:  # type: ignore\n        \"\"\"Extracts the formal statement from the Lean 4 output.\"\"\"\n        match = re.search(_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX, output, re.DOTALL)\n        if match:\n            match = match.group(1).strip()\n        return {\"formal_statement\": match}\n\n\ntemplate_deepseek_prover_scorer = \"\"\"\\\nTo 
evaluate whether a formal Lean4 statement will be of interest to the community, consider the following criteria:\n\n1. Relevance to Current Research: Does the statement address a problem or concept that is actively being researched in mathematics or related fields? Higher relevance scores indicate greater potential interest.\n2. Complexity and Depth: Is the statement complex enough to challenge existing theories and methodologies, yet deep enough to provide significant insights or advancements? Complexity and depth showcase Lean4's capabilities and attract interest.\n3. Interdisciplinary Potential: Does the statement offer opportunities for interdisciplinary research, connecting mathematics with other fields such as computer science, physics, or biology? Interdisciplinary projects often garner wide interest.\n4. Community Needs and Gaps: Does the statement fill an identified need or gap within the Lean4 community or the broader mathematical community? Addressing these needs directly correlates with interest.\n5. Innovativeness: How innovative is the statement? Does it propose new methods, concepts, or applications? Innovation drives interest and engagement.\n\nCustomize your evaluation for each problem accordingly, assessing it as 'excellent', 'good', 'above average', 'fair' or 'poor'.\n\nYou should respond in the following format for each statement:\n\n'''\nNatural language: (Detailed explanation of the informal statement, including any relevant background information, assumptions, and definitions.)\nAnalysis: (Provide a brief justification for each score, highlighting why the statement scored as it did across the criteria.)\nAssessment: (Based on the criteria, rate the statement as 'excellent', 'good', 'above average', 'fair' or 'poor'. 
JUST the Assessment.)\n'''\"\"\"\n\n\nclass DeepSeekProverScorer(Task):\n    \"\"\"Task to evaluate the quality of a formalized mathematical problem in Lean 4,\n    inspired by the DeepSeek-Prover task for scoring.\n\n    Note:\n        A related dataset (MMA from the paper) can be found in Hugging Face:\n        [casey-martin/multilingual-mathematical-autoformalization](https://huggingface.co/datasets/casey-martin/multilingual-mathematical-autoformalization).\n\n    Input columns:\n        - informal_statement (`str`): The statement to be formalized using Lean 4.\n        - formal_statement (`str`): The formalized statement using Lean 4, to be analysed.\n\n    Output columns:\n        - natural_language (`str`): Explanation for the problem.\n        - analysis (`str`): Analysis of the different points defined in the prompt.\n        - assessment (`str`): Result of the assessment.\n\n    Categories:\n        - scorer\n        - quality\n        - response\n\n    References:\n        - [`DeepSeek-Prover: Advancing Theorem Proving in LLMs through Large-Scale Synthetic Data`](https://arxiv.org/abs/2405.14333).\n        - [`Lean 4`](https://github.com/leanprover/lean4).\n\n    Examples:\n\n        Analyse a formal statement in Lean 4:\n\n        ```python\n        from distilabel.steps.tasks import DeepSeekProverScorer\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        prover_scorer = DeepSeekProverAutoFormalization(\n            llm=InferenceEndpointsLLM(\n                model_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n                tokenizer_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n            ),\n        )\n\n        prover_scorer.load()\n\n        result = next(\n            prover_scorer.process(\n                [\n                    {\"formal_statement\": \"theorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=\"},\n                ]\n            )\n       
 )\n        # result\n        # [\n        #     {\n        #         'formal_statement': 'theorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=',\n        #         'informal_statement': 'INFORMAL',\n        #         'analysis': 'ANALYSIS',\n        #         'assessment': 'ASSESSMENT',\n        #         'distilabel_metadata': {\n        #             'raw_output_deep_seek_prover_scorer_0': 'Natural language:\\nINFORMAL\\nAnalysis:\\nANALYSIS\\nAssessment:\\nASSESSMENT'\n        #         },\n        #         'model_name': 'deepseek-prover-scorer'\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    _template: Union[Template, None] = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template.\"\"\"\n        super().load()\n\n        self._template = Template(template_deepseek_prover_scorer)\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task is the `instruction`.\"\"\"\n        return [\"informal_statement\", \"formal_statement\"]\n\n    @property\n    def outputs(self):\n        \"\"\"The output for the task is a list of `instructions` containing the generated instructions.\"\"\"\n        return [\"natural_language\", \"analysis\", \"assessment\", \"model_name\"]\n\n    def format_input(self, input: str) -> ChatType:  # type: ignore\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation. 
And the\n        `system_prompt` is added as the first message if it exists.\"\"\"\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": self._template.render(),\n            },\n            {\n                \"role\": \"user\",\n                \"content\": f\"## Informal statement:\\n{input[self.inputs[0]]}\\n\\n ## Formal statement:\\n{input[self.inputs[1]]}\",\n            },\n        ]\n\n    @override\n    def format_output(  # type: ignore\n        self, output: Union[str, None], input: Dict[str, Any] = None\n    ) -> Dict[str, Any]:  # type: ignore\n        \"\"\"Analyses the formal statement with Lean 4 output and generates an assessment\n        and the corresponding informal assessment.\"\"\"\n\n        try:\n            result = output.split(\"Natural language:\")[1].strip()\n            natural_language, analysis = result.split(\"Analysis:\")\n            analysis, assessment = analysis.split(\"Assessment:\")\n            natural_language = natural_language.strip()\n            analysis = analysis.strip()\n            assessment = assessment.strip()\n        except Exception:\n            natural_language = analysis = assessment = None\n\n        return {\n            \"natural_language\": natural_language,\n            \"analysis\": analysis,\n            \"assessment\": assessment,\n        }\n\n\nclass DeepSeekProverSolver(Task):\n    \"\"\"Task to generate a proof for a formal statement (theorem) in lean4.\n\n    Input columns:\n        - formal_statement (`str`): The formalized statement using Lean 4.\n\n    Output columns:\n        - proof (`str`): The proof for the formal statement theorem.\n\n    Categories:\n        - scorer\n        - quality\n        - response\n\n    References:\n        - [`DeepSeek-Prover: Advancing Theorem Proving in LLMs through Large-Scale Synthetic Data`](https://arxiv.org/abs/2405.14333).\n    \"\"\"\n\n    system_prompt: str = (\n        \"You are an expert in proving 
mathematical theorems formalized in lean4 language. \"\n        \"Your answers consist just in the proof to the theorem given, and nothing else.\"\n    )\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task is the `formal_statement`.\"\"\"\n        return [\"formal_statement\"]\n\n    @property\n    def outputs(self):\n        \"\"\"The output for the task is the proof for the formal statement theorem.\"\"\"\n        return [\"proof\"]\n\n    def format_input(self, input: str) -> ChatType:  # type: ignore\n        \"\"\"The input is formatted as a `ChatType`, with a system prompt to guide our model.\"\"\"\n        prompt = dedent(\"\"\"\n            Give me a proof for the following theorem:\n            ```lean4\n            {theorem}\n            ```\"\"\")\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": self.system_prompt,\n            },\n            {\n                \"role\": \"user\",\n                \"content\": prompt.format(theorem=input[\"formal_statement\"]),\n            },\n        ]\n\n    def format_output(  # type: ignore\n        self, output: Union[str, None], input: Dict[str, Any] = None\n    ) -> Dict[str, Any]:  # type: ignore\n        import re\n\n        match = re.search(_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX, output, re.DOTALL)\n        if match:\n            match = match.group(1).strip()\n        return {\"proof\": match}\n\n\nexamples = [\n    dedent(\"\"\"\n    ## Statement in natural language:\n    For real numbers k and x:\n    If x is equal to (13 - \u221a131) / 4, and\n    If the equation 2x\u00b2 - 13x + k = 0 is satisfied,\n    Then k must be equal to 19/4.\n    ## Formalized:\n    theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n        (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n    dedent(\"\"\"\n    ## Statement in natural language:\n    The greatest common divisor (GCD) of 20 
factorial (20!) and 200,000 is equal to 40,000.\n    ## Formalized:\n    theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n        (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n    dedent(\"\"\"\n    ## Statement in natural language:\n    Given two integers x and y:\n    If y is positive (greater than 0),\n    And y is less than x,\n    And the equation x + y + xy = 80 is true,\n    Then x must be equal to 26.\n    ## Formalized:\n    theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n        (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n]\n\n\nwith Pipeline(name=\"test_deepseek_prover\") as pipeline:\n    data_loader = LoadDataFromHub(\n        repo_id=\"plaguss/informal-mathematical-statements-tiny\",\n        split=\"val\",\n        batch_size=8,\n    )\n\n    llm = InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    )\n    auto_formalization = DeepSeekProverAutoFormalization(\n        name=\"auto_formalization\", input_batch_size=8, llm=llm, examples=examples\n    )\n    prover_scorer = DeepSeekProverScorer(\n        name=\"prover_scorer\",\n        input_batch_size=8,\n        llm=llm,\n    )\n    proof_generator = DeepSeekProverSolver(\n        name=\"proof_generator\", input_batch_size=8, llm=llm\n    )\n\n    (data_loader >> auto_formalization >> prover_scorer >> proof_generator)\n\n\nif __name__ == \"__main__\":\n    import argparse\n\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        \"-d\",\n        \"--dry-run\",\n        action=argparse.BooleanOptionalAction,\n        help=\"Do a dry run for testing purposes.\",\n    )\n    args = parser.parse_args()\n\n    pipeline_parameters = {\n        data_loader.name: {\"split\": \"val\"},\n        auto_formalization.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"temperature\": 0.6,\n                    
\"top_p\": 0.9,\n                    \"max_new_tokens\": 512,\n                }\n            }\n        },\n        prover_scorer.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"temperature\": 0.6,\n                    \"top_p\": 0.9,\n                    \"max_new_tokens\": 512,\n                }\n            }\n        },\n    }\n\n    ds_name = \"test_deepseek_prover\"\n\n    if args.dry_run:\n        distiset = pipeline.dry_run(batch_size=1, parameters=pipeline_parameters)\n        distiset.save_to_disk(Path.home() / f\"Downloads/{ds_name}\")\n\n        import pprint\n\n        pprint.pprint(distiset[\"default\"][\"train\"][0])\n\n    else:\n        distiset = pipeline.run(parameters=pipeline_parameters)\n        distiset.push_to_hub(ds_name, include_script=True)\n

The script can be run as a dry run or not, depending on the argument (the pipeline will run without a dry run by default), and the dataset will be pushed to the hub with the name your_username/test_deepseek_prover:

python deepseek_prover.py [-d | --dry-run | --no-dry-run]\n

Final dataset: plaguss/test_deepseek_prover.

"},{"location":"sections/pipeline_samples/papers/deita/","title":"DEITA","text":"

DEITA (Data-Efficient Instruction Tuning for Alignment) studies an automatic data selection process in two steps: first, it quantifies data quality based on complexity, quality and diversity; second, it selects the best potential combination from an open-source dataset that fits into the budget you allocate to tune your own LLM.

In most settings we cannot allocate unlimited resources for instruction-tuning LLMs. Therefore, the DEITA authors investigated how to select high-quality data for instruction tuning based on the principle of fewer high-quality samples. Liu et al. tackle the issue of first defining good data and second identifying it to respect an initial budget to instruct-tune your LLM.

The strategy utilizes LLMs to replace human effort in time-intensive data quality tasks on instruction-tuning datasets. DEITA introduces a way to measure data quality across three critical dimensions: complexity, quality and diversity.

You can see that we again encounter the dataset of instructions/responses, and that we are essentially reproducing the second step, in which we learn how to optimize the responses to an instruction by comparing several possibilities.

"},{"location":"sections/pipeline_samples/papers/deita/#datasets-and-budget","title":"Datasets and budget","text":"

We will dive deeper into the whole process. We will investigate each stage to efficiently select the final dataset used for supervised fine-tuning with a budget constraint. We will tackle technical challenges by explaining exactly how you would assess good data as presented in the paper.

As a reminder, we're looking for a strategy to automatically select good data for the instruction-tuning step when you want to fine-tune an LLM to your own use case taking into account a resource constraint. This means that you cannot blindly train a model on any data you encounter on the internet.

The DEITA authors assume that you have access to open-source datasets that fit your use case. This may not be the case entirely. But with open-source communities tackling many use cases, with projects such as BLOOM or AYA, it's likely that your use case will be tackled at some point. Furthermore, you could generate your own instruction/response pairs with methods such as self-generated instructions using distilabel. This tutorial assumes that we have a data pool with excessive samples for the project's cost constraint. In short, we aim to achieve adequate performance from fewer samples.

The authors claim that the subsample size \"correlates proportionally with the computation consumed in instruction tuning\". Hence, to a first approximation, reducing the sample size means reducing computation consumption and therefore the total development cost. Following the paper's notation, we will associate the budget m with a number of instruction/response pairs that you can set depending on your real budget.

To match the experimental set-up, dataset X_sota is a meta-dataset combining major open-source datasets available to instruct-tune LLMs. This dataset is composed of ShareGPT (58k instruction/response pairs), UltraChat (105k instruction/response pairs) and WizardLM (143k instruction/response pairs). It sums to more than 300k instruction/response pairs. We aim to reduce the final subsample to 6k instruction/response pairs.

"},{"location":"sections/pipeline_samples/papers/deita/#setup-the-notebook-and-packages","title":"Setup the notebook and packages","text":"

Let's prepare our dependencies:

pip install \"distilabel[openai,hf-transformers]>=1.0.0\"\npip install pynvml huggingface_hub argilla\n

Import distilabel:

from distilabel.models import TransformersLLM, OpenAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import ConversationTemplate, DeitaFiltering, ExpandColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import ComplexityScorer, EvolInstruct, EvolQuality, GenerateEmbeddings, QualityScorer\n

Define the distilabel Pipeline and load the dataset from the Hugging Face Hub.

pipeline = Pipeline(name=\"DEITA\")\n\nload_data = LoadDataFromHub(\n    name=\"load_data\", batch_size=100, output_mappings={\"prompt\": \"instruction\"}, pipeline=pipeline\n)\n
"},{"location":"sections/pipeline_samples/papers/deita/#evol-instruct-generate-instructions-with-an-llm","title":"EVOL-INSTRUCT: Generate Instructions with an LLM","text":"

Evol-Instruct automates the creation of complex instruction data for training large language models (LLMs) by progressively rewriting an initial set of instructions into more complex forms. This generated data is then used to fine-tune a model named WizardLM.

Evaluations show that instructions from Evol-Instruct are superior to human-created ones, and WizardLM achieves performance close to or exceeding GPT3.5-turbo in many skills. In distilabel, we initialise each step of the data generation pipeline. Later, we'll connect them together.

evol_instruction_complexity = EvolInstruct(\n    name=\"evol_instruction_complexity\",\n    llm=OpenAILLM(model=\"gpt-3.5-turbo\"),\n    num_evolutions=5,\n    store_evolutions=True,\n    generate_answers=True,\n    include_original_instruction=True,\n    pipeline=pipeline,\n)\n\nevol_instruction_complexity.load()\n\n_evolved_instructions = next(evol_instruction_complexity.process(\n    ([{\"instruction\": \"How many fish are there in a dozen fish?\"}]))\n)\n\nprint(*_evolved_instructions, sep=\"\\n\")\n

Output:

( 1, 'How many fish are there in a dozen fish?')\n( 2, 'How many rainbow trout are there in a dozen rainbow trout?')\n( 3, 'What is the average weight in pounds of a dozen rainbow trout caught in a specific river in Alaska during the month of May?')\n
"},{"location":"sections/pipeline_samples/papers/deita/#evol-complexity-evaluate-complexity-of-generated-instructions","title":"EVOL COMPLEXITY: Evaluate complexity of generated instructions","text":"

The second step is the evaluation of complexity for an instruction in a given instruction-response pair. Like EVOL-INSTRUCT, this method uses LLMs instead of humans to automatically improve instructions, specifically through their complexity. From any instruction-response pair, \\((I, R)\\), we first generate new instructions following the In-Depth Evolving Response. We generate more complex instructions through prompting, as explained by the authors, by adding some constraints or reasoning steps. Let's take an example from GPT-4-LLM, which aims to generate observations by GPT-4 to instruct-tune LLMs with supervised fine-tuning. We start from the instruction \\(instruction_0\\):

instruction_0 = \"Give three tips for staying healthy.\"\n

To make it more complex, you can use, as the authors did, some prompt templates to add constraints or deepen the instruction. They provided some prompts in the paper appendix. For instance, this one was used to add constraints:

PROMPT = \"\"\"I want you act as a Prompt Rewriter.\nYour objective is to rewrite a given prompt into a more complex version to\nmake those famous AI systems (e.g., ChatGPT and GPT4) a bit harder to handle.\nBut the rewritten prompt must be reasonable and must be understood and\nresponded by humans.\nYour rewriting cannot omit the non-text parts such as the table and code in\n#Given Prompt#:. Also, please do not omit the input in #Given Prompt#.\nYou SHOULD complicate the given prompt using the following method:\nPlease add one more constraints/requirements into #Given Prompt#\nYou should try your best not to make the #Rewritten Prompt# become verbose,\n#Rewritten Prompt# can only add 10 to 20 words into #Given Prompt#.\n\u2018#Given Prompt#\u2019, \u2018#Rewritten Prompt#\u2019, \u2018given prompt\u2019 and \u2018rewritten prompt\u2019\nare not allowed to appear in #Rewritten Prompt#\n#Given Prompt#:\n<Here is instruction>\n#Rewritten Prompt#:\n\"\"\"\n

Prompting this to an LLM, you automatically get a more complex instruction, called \\(instruction_1\\), from an initial instruction \\(instruction_0\\):

instruction_1 = \"Provide three recommendations for maintaining well-being, ensuring one focuses on mental health.\"\n

With sequences of evolved instructions, we use a further LLM to automatically rank and score them. We provide the 6 instructions at the same time. By providing all instructions together, we force the scoring model to look at minor complexity differences between evolved instructions, encouraging it to discriminate between them. Taking the example below, \\(instruction_0\\) and \\(instruction_1\\) could deserve the same score when judged independently, but when compared together we would notice the slight difference that makes \\(instruction_1\\) more complex.

In distilabel, we implement this like so:

instruction_complexity_scorer = ComplexityScorer(\n    name=\"instruction_complexity_scorer\",\n    llm=OpenAILLM(model=\"gpt-3.5-turbo\"),\n    input_mappings={\"instructions\": \"evolved_instructions\"},\n    pipeline=pipeline,\n)\n\nexpand_evolved_instructions = ExpandColumns(\n    name=\"expand_evolved_instructions\",\n    columns=[\"evolved_instructions\", \"answers\", \"scores\"],\n    output_mappings={\n        \"evolved_instructions\": \"evolved_instruction\",\n        \"answers\": \"answer\",\n        \"scores\": \"evol_instruction_score\",\n    },\n    pipeline=pipeline,\n)\n\ninstruction_complexity_scorer.load()\n\n_evolved_instructions = next(instruction_complexity_scorer.process(([{\"evolved_instructions\": [PROMPT + instruction_1]}])))\n\nprint(\"Original Instruction:\")\nprint(instruction_1)\nprint(\"\\nEvolved Instruction:\")\nprint(_evolved_instructions[0][\"evolved_instructions\"][0].split(\"#Rewritten Prompt#:\\n\")[1])\n

Output:

Original Instruction:\nProvide three recommendations for maintaining well-being, ensuring one focuses on mental health.\n\nEvolved Instruction:\nSuggest three strategies for nurturing overall well-being, with the stipulation that at least one explicitly addresses the enhancement of mental health, incorporating evidence-based practices.\n
"},{"location":"sections/pipeline_samples/papers/deita/#evol-quality-quality-evaluation","title":"EVOL-QUALITY: Quality Evaluation","text":"

Now that we have scored the complexity of the instructions, we will focus on the quality of the responses. Similar to EVOL COMPLEXITY, the authors introduced EVOL QUALITY, a method based on LLMs, instead of humans, to automatically score the quality of the response.

From an instruction-response pair, \\((I, R)\\), the goal is to make the response evolve into a more helpful and relevant response. The key difference is that we need to also provide the first instruction to guide evolution. Let's take back our example from GPT-4-LLM.

Here we have the response \\(response_0\\) and its initial instruction \\(instruction_0\\):

instruction_0 = \"Give three tips for staying healthy.\"\nreponse_0 = \"1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases. 2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week. 3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.\"\n

Again the authors provided several prompts you could use to make your response evolve according to some guidelines. For example, this one was used to enrich the answer:

PROMPT = \"\"\"I want you to act as a Response Rewriter\nYour goal is to enhance the quality of the response given by an AI assistant\nto the #Given Prompt# through rewriting.\nBut the rewritten response must be reasonable and must be understood by humans.\nYour rewriting cannot omit the non-text parts such as the table and code in\n#Given Prompt# and #Given Response#. Also, please do not omit the input\nin #Given Prompt#.\nYou Should enhance the quality of the response using the following method:\nPlease make the Response more in-depth\nYou should try your best not to make the #Rewritten Response# become verbose,\n#Rewritten Response# can only add 10 to 20 words into #Given Response#.\n\u2018#Given Response#\u2019, \u2018#Rewritten Response#\u2019, \u2018given response\u2019 and \u2018rewritten response\u2019\nare not allowed to appear in #Rewritten Response#\n#Given Prompt#:\n<instruction_0>\n#Given Response#:\n<response_0>\n#Rewritten Response#:\n\"\"\"\n

Prompting this to an LLM, you will automatically get a more enriched response, called \\(response_1\\), from an initial response \\(response_0\\) and initial instruction \\(instruction_0\\):

evol_response_quality = EvolQuality(\n    name=\"evol_response_quality\",\n    llm=OpenAILLM(model=\"gpt-3.5-turbo\"),\n    num_evolutions=5,\n    store_evolutions=True,\n    include_original_response=True,\n    input_mappings={\n        \"instruction\": \"evolved_instruction\",\n        \"response\": \"answer\",\n    },\n    pipeline=pipeline,\n)\n\nevol_response_quality.load()\n\n_evolved_responses = next(evol_response_quality.process([{\"instruction\": PROMPT + instruction_0, \"response\": reponse_0}]))\n\nprint(\"Original Response:\")\nprint(reponse_0)\nprint(\"\\nEvolved Response:\")\nprint(*_evolved_responses[0]['evolved_responses'], sep=\"\\n\")\n

And now, as in EVOL COMPLEXITY you iterate through this path and use different prompts to make your responses more relevant, helpful or creative. In the paper, they make 4 more iterations to get 5 evolved responses \\((R0, R1, R2, R3, R4)\\) which makes 5 different responses for one initial instruction at the end of this step.

response_quality_scorer = QualityScorer(\n    name=\"response_quality_scorer\",\n    llm=OpenAILLM(model=\"gpt-3.5-turbo\"),\n    input_mappings={\n        \"instruction\": \"evolved_instruction\",\n        \"responses\": \"evolved_responses\",\n    },\n    pipeline=pipeline,\n)\n\nexpand_evolved_responses = ExpandColumns(\n    name=\"expand_evolved_responses\",\n    columns=[\"evolved_responses\", \"scores\"],\n    output_mappings={\n        \"evolved_responses\": \"evolved_response\",\n        \"scores\": \"evol_response_score\",\n    },\n    pipeline=pipeline,\n)\n\nresponse_quality_scorer.load()\n\n_scored_responses = next(response_quality_scorer.process([{\"instruction\": PROMPT + instruction_0, \"responses\": _evolved_responses[0]['evolved_responses']}]))\n\nprint(\"Original Response:\")\nprint(reponse_0)\n\nprint(\"\\nScore, Evolved Response:\")\nprint(*zip(_scored_responses[0][\"scores\"], _evolved_responses[0]['evolved_responses']), sep=\"\\n\")\n

Output:

Original Response:\n1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases. 2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week. 3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.\n\nScore, Evolved Response:\n(4.0, 'Here are three essential tips for maintaining good health: \\n1. Prioritize regular exercise \\n2. Eat a balanced diet with plenty of fruits and vegetables \\n3. Get an adequate amount of sleep each night.')\n(2.0, 'Here are three effective strategies to maintain a healthy lifestyle.')\n(5.0, 'Here are three practical tips to maintain good health: Ensure a balanced diet, engage in regular exercise, and prioritize sufficient sleep. These practices support overall well-being.')\n
"},{"location":"sections/pipeline_samples/papers/deita/#improving-data-diversity","title":"Improving Data Diversity","text":"

One main component of good data to instruct-tune LLMs is diversity. Real-world data can often contain redundancy due to repetitive and homogeneous data.

The authors of the DEITA paper tackle the challenge of ensuring data diversity when instruction-tuning LLMs, to avoid the pitfalls of data redundancy that can lead to over-fitting or poor generalization. They propose an embedding-based method to filter data for diversity. This method, called Repr Filter, uses embeddings generated by the Llama 1 13B model to represent instruction-response pairs in a vector space. The diversity of a new data sample is assessed based on the cosine distance between its embedding and that of its nearest neighbor in the already selected dataset. If this distance is greater than a specified threshold, the sample is considered diverse and is added to the selection. This process prioritizes diversity by assessing each sample's contribution to the variety of the dataset until the data selection budget is met. This approach effectively maintains the diversity of the data used for instruction tuning, as demonstrated by the DEITA models outperforming or matching state-of-the-art models with significantly less training data. In this implementation of DEITA we use the hidden state of the last layer of the Llama 2 model to generate embeddings, instead of a sentence transformer model, because we found that it improved the diversity of the data selection.

generate_conversation = ConversationTemplate(\n    name=\"generate_conversation\",\n    input_mappings={\n        \"instruction\": \"evolved_instruction\",\n        \"response\": \"evolved_response\",\n    },\n    pipeline=pipeline,\n)\n\ngenerate_embeddings = GenerateEmbeddings(\n    name=\"generate_embeddings\",\n    llm=TransformersLLM(\n        model=\"TinyLlama/TinyLlama-1.1B-Chat-v1.0\",\n        device=\"cuda\",\n        torch_dtype=\"float16\",\n    ),\n    input_mappings={\"text\": \"conversation\"},\n    input_batch_size=5,\n    pipeline=pipeline,\n)\n\ndeita_filtering = DeitaFiltering(name=\"deita_filtering\", pipeline=pipeline)\n
"},{"location":"sections/pipeline_samples/papers/deita/#build-the-distilabel-pipeline","title":"Build the \u2697 distilabel Pipeline","text":"

Now we're ready to build a distilabel pipeline using the DEITA method:

load_data.connect(evol_instruction_complexity)\nevol_instruction_complexity.connect(instruction_complexity_scorer)\ninstruction_complexity_scorer.connect(expand_evolved_instructions)\nexpand_evolved_instructions.connect(evol_response_quality)\nevol_response_quality.connect(response_quality_scorer)\nresponse_quality_scorer.connect(expand_evolved_responses)\nexpand_evolved_responses.connect(generate_conversation)\ngenerate_conversation.connect(generate_embeddings)\ngenerate_embeddings.connect(deita_filtering)\n

Now we can run the pipeline. We use the step names to reference them in the pipeline configuration:

distiset = pipeline.run(\n    parameters={\n        \"load_data\": {\n            \"repo_id\": \"distilabel-internal-testing/instruction-dataset-50\",\n            \"split\": \"train\",\n        },\n        \"evol_instruction_complexity\": {\n            \"llm\": {\"generation_kwargs\": {\"max_new_tokens\": 512, \"temperature\": 0.7}}\n        },\n        \"instruction_complexity_scorer\": {\n            \"llm\": {\"generation_kwargs\": {\"temperature\": 0.0}}\n        },\n        \"evol_response_quality\": {\n            \"llm\": {\"generation_kwargs\": {\"max_new_tokens\": 512, \"temperature\": 0.7}}\n        },\n        \"response_quality_scorer\": {\"llm\": {\"generation_kwargs\": {\"temperature\": 0.0}}},\n        \"deita_filtering\": {\"data_budget\": 500, \"diversity_threshold\": 0.04},\n    },\n    use_cache=False,\n)\n

We can push the results to the Hugging Face Hub:

distiset.push_to_hub(\"distilabel-internal-testing/deita-colab\")\n
"},{"location":"sections/pipeline_samples/papers/deita/#results","title":"Results","text":"

Again, to show the relevance of the EVOL QUALITY method, the authors evaluated on MT-bench several models fine-tuned with different data selections, each selection reflecting a different way of defining the quality of a response to an instruction. Each time they selected 6k data according to the quality score:

Credit: Liu et al. (2023)

The score is much better when selecting data with the EVOL QUALITY method than when we select randomly or according to length (i.e. assuming that a longer response is a higher-quality one). However, the margin is thinner than the one we may have seen for the complexity score; we'll discuss the strategy in a later part. Nevertheless, this strategy appears to improve the fine-tuning compared to the baselines, and now we're interested in mixing quality and complexity assessment with a diversity evaluation to find the right trade-off in our selection process.

"},{"location":"sections/pipeline_samples/papers/deita/#conclusion","title":"Conclusion","text":"

In conclusion, if you are looking for some efficient method to align an open-source LLM to your business case with a constrained budget, the solutions provided by DEITA are really worth the shot. This data-centric approach enables one to focus on the content of the dataset to have the best results instead of \"just\" scaling the instruction-tuning with more, and surely less qualitative, data. In a nutshell, the strategy developed, through automatically scoring instructions-responses, aims to substitute the human preference step proprietary models such as GPT-4 have been trained with. There are a few improvements we could think about when it comes to how to select the good data, but it opens a really great way in instruct-tuning LLM with lower computational needs making the whole process intellectually relevant and more sustainable than most of the other methods. We'd be happy to help you out with aligning an LLM with your business case drawing inspiration from such a methodology.

"},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/","title":"Instruction Backtranslation","text":"

\"Self Alignment with Instruction Backtranslation\" presents a scalable method to build a high-quality instruction-following language model by automatically labeling human-written text with corresponding instructions. Their approach, named instruction backtranslation, starts with a language model finetuned on a small amount of seed data, and a given web corpus. The seed model is used to construct training examples by generating instruction prompts for web documents (self-augmentation), and then selecting high-quality examples from among these candidates (self-curation). This data is then used to finetune a stronger model.

Their self-training approach assumes access to a base language model, a small amount of seed data, and a collection of unlabelled examples, e.g. a web corpus. The unlabelled data is a large, diverse set of human-written documents that includes writing about all manner of topics humans are interested in \u2013 but crucially is not paired with instructions.

A first key assumption is that there exists some subset of this very large human-written text that would be suitable as gold generations for some user instructions. A second key assumption is that they can predict instructions for these candidate gold answers that can be used as high-quality example pairs to train an instruction-following model.

Their overall process, called instruction backtranslation, performs two core steps:

  1. Self-augment: Generate instructions for unlabelled data, i.e. the web corpus, to produce candidate training data of (instruction, output) pairs for instruction tuning.

  2. Self-curate: Self-select high-quality demonstration examples as training data to finetune the base model to follow instructions. This approach is done iteratively where a better intermediate instruction-following model can improve on selecting data for finetuning in the next iteration.

This replication covers the self-curation step i.e. the second/latter step as mentioned above, so as to be able to use the proposed prompting approach to rate the quality of the generated text, which can either be synthetically generated or real human-written text.

"},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/#replication","title":"Replication","text":"

To replicate the paper we will be using distilabel and a smaller dataset created by the Hugging Face H4 team named HuggingFaceH4/instruction-dataset for testing purposes.

"},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/#installation","title":"Installation","text":"

To replicate Self Alignment with Instruction Backtranslation one will need to install distilabel as follows:

pip install \"distilabel[hf-inference-endpoints,openai]>=1.0.0\"\n

And since we will be using InferenceEndpointsLLM (installed via the extra hf-inference-endpoints) we will need to deploy those in advance either locally or in the Hugging Face Hub (alternatively, the serverless endpoints can also be used, but most of the time the inference is slower, and there's a limited quota since those are free) and set both the HF_TOKEN (to use the InferenceEndpointsLLM) and the OPENAI_API_KEY (to use the OpenAILLM) environment variables.

"},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/#building-blocks","title":"Building blocks","text":"
  • LoadDataFromHub: Generator Step to load a dataset from the Hugging Face Hub.
  • TextGeneration: Task to generate responses for a given instruction using an LLM.
    • InferenceEndpointsLLM: LLM that runs a model from an Inference Endpoint in the Hugging Face Hub.
  • InstructionBacktranslation: Task that generates a score and a reason for a response for a given instruction using the Self Alignment with Instruction Backtranslation prompt.
    • OpenAILLM: LLM that loads a model from OpenAI.
"},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/#code","title":"Code","text":"

As mentioned before, we will put the previously mentioned building blocks together to replicate Self Alignment with Instruction Backtranslation.

from distilabel.models import InferenceEndpointsLLM, OpenAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub, KeepColumns\nfrom distilabel.steps.tasks import InstructionBacktranslation, TextGeneration\n\n\nwith Pipeline(name=\"self-alignment-with-instruction-backtranslation\") as pipeline:\n    load_hub_dataset = LoadDataFromHub(\n        name=\"load_dataset\",\n        output_mappings={\"prompt\": \"instruction\"},\n    )\n\n    text_generation = TextGeneration(\n        name=\"text_generation\",\n        llm=InferenceEndpointsLLM(\n            base_url=\"<INFERENCE_ENDPOINT_URL>\",\n            tokenizer_id=\"argilla/notus-7b-v1\",\n            model_display_name=\"argilla/notus-7b-v1\",\n        ),\n        input_batch_size=10,\n        output_mappings={\"model_name\": \"generation_model\"},\n    )\n\n    instruction_backtranslation = InstructionBacktranslation(\n        name=\"instruction_backtranslation\",\n        llm=OpenAILLM(model=\"gpt-4\"),\n        input_batch_size=10,\n        output_mappings={\"model_name\": \"scoring_model\"},\n    )\n\n    keep_columns = KeepColumns(\n        name=\"keep_columns\",\n        columns=[\n            \"instruction\",\n            \"generation\",\n            \"generation_model\",\n            \"score\",\n            \"reason\",\n            \"scoring_model\",\n        ],\n    )\n\n    load_hub_dataset >> text_generation >> instruction_backtranslation >> keep_columns\n

Then we need to call pipeline.run with the runtime parameters so that the pipeline can be launched.

distiset = pipeline.run(\n    parameters={\n        load_hub_dataset.name: {\n            \"repo_id\": \"HuggingFaceH4/instruction-dataset\",\n            \"split\": \"test\",\n        },\n        text_generation.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 1024,\n                    \"temperature\": 0.7,\n                },\n            },\n        },\n        instruction_backtranslation.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 1024,\n                    \"temperature\": 0.7,\n                },\n            },\n        },\n    },\n)\n

Finally, we can optionally push the generated dataset, named Distiset, to the Hugging Face Hub via the push_to_hub method, so that each subset generated in the leaf steps is pushed to the Hub.

distiset.push_to_hub(\n    \"instruction-backtranslation-instruction-dataset\",\n    private=True,\n)\n
"},{"location":"sections/pipeline_samples/papers/prometheus/","title":"Prometheus 2","text":"

\"Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models\" presents Prometheus 2, a new and more powerful evaluator LLM compared to Prometheus (its predecessor) presented in \"Prometheus: Inducing Fine-grained Evaluation Capability in Language Models\". GPT-4, as well as other proprietary LLMs, are commonly used to assess the quality of the responses of various LLMs, but concerns about transparency, controllability, and affordability motivate the need for open-source LLMs specialized in evaluation.

Existing open evaluator LMs exhibit critical shortcomings:

  1. They issue scores that significantly diverge from those assigned by humans.
  2. They lack the flexibility to perform both direct assessment and pairwise ranking, the two most prevalent forms of assessment.

Additionally, they do not possess the ability to evaluate based on custom evaluation criteria, focusing instead on general attributes like helpfulness and harmlessness. Prometheus 2 is capable of processing both direct assessment and pair-wise ranking formats grouped with user-defined evaluation criteria.

Prometheus 2 released two variants:

  • prometheus-eval/prometheus-7b-v2.0: fine-tuned on top of mistralai/Mistral-7B-Instruct-v0.2
  • prometheus-eval/prometheus-8x7b-v2.0: fine-tuned on top of mistralai/Mixtral-8x7B-Instruct-v0.1

Both models have been fine-tuned for both direct assessment and pairwise ranking tasks i.e. assessing the quality of a single isolated response for a given instruction with or without a reference answer and assessing the quality of one response against another one for a given instruction with or without a reference answer, respectively.

On four direct assessment benchmarks and four pairwise ranking benchmarks, Prometheus 2 scores the highest correlation and agreement with humans and proprietary LM judges among all tested open evaluator LMs. Their models, code, and data are all publicly available at prometheus-eval/prometheus-eval.

"},{"location":"sections/pipeline_samples/papers/prometheus/#replication","title":"Replication","text":"

Note

The section is named Replication but in this case we're not replicating the Prometheus 2 paper per se, but rather showing how to use the PrometheusEval task implemented within distilabel to evaluate the quality of the responses from a given instruction using the Prometheus 2 model.

To showcase Prometheus 2 we will be using the PrometheusEval task implemented in distilabel and a smaller dataset created by the Hugging Face H4 team named HuggingFaceH4/instruction-dataset for testing purposes.

"},{"location":"sections/pipeline_samples/papers/prometheus/#installation","title":"Installation","text":"

To reproduce the code below, one will need to install distilabel as follows:

pip install \"distilabel[vllm]>=1.1.0\"\n

Additionally, it's recommended to install Dao-AILab/flash-attention to benefit from Flash Attention 2 speed ups during inference via vllm.

pip install flash-attn --no-build-isolation\n

Note

The installation notes above assume that you are using a VM with one GPU accelerator with at least the required VRAM to fit prometheus-eval/prometheus-7b-v2.0 in bfloat16 (28GB); but if you have enough VRAM to fit their 8x7B model in bfloat16 (~90GB) you can use prometheus-eval/prometheus-8x7b-v2.0 instead.

"},{"location":"sections/pipeline_samples/papers/prometheus/#building-blocks","title":"Building blocks","text":"
  • LoadDataFromHub: GeneratorStep to load a dataset from the Hugging Face Hub.

  • PrometheusEval: Task that assesses the quality of a response for a given instruction using any of the Prometheus 2 models.

    • vLLM: LLM that loads a model from the Hugging Face Hub via vllm-project/vllm.

    Note

    Since the Prometheus 2 models use a slightly different chat template than mistralai/Mistral-7B-Instruct-v0.2, we need to set the chat_template parameter to [INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST] so as to properly format the input for Prometheus 2.

  • (Optional) KeepColumns: Task that keeps only the specified columns in the dataset, used to remove the undesired columns.

"},{"location":"sections/pipeline_samples/papers/prometheus/#code","title":"Code","text":"

As mentioned before, we will put the previously mentioned building blocks together to see how Prometheus 2 can be used via distilabel.

from distilabel.models import vLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import KeepColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import PrometheusEval\n\nif __name__ == \"__main__\":\n    with Pipeline(name=\"prometheus\") as pipeline:\n        load_dataset = LoadDataFromHub(\n            name=\"load_dataset\",\n            repo_id=\"HuggingFaceH4/instruction-dataset\",\n            split=\"test\",\n            output_mappings={\"prompt\": \"instruction\", \"completion\": \"generation\"},\n        )\n\n        task = PrometheusEval(\n            name=\"task\",\n            llm=vLLM(\n                model=\"prometheus-eval/prometheus-7b-v2.0\",\n                chat_template=\"[INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST]\",\n            ),\n            mode=\"absolute\",\n            rubric=\"factual-validity\",\n            reference=False,\n            num_generations=1,\n            group_generations=False,\n        )\n\n        keep_columns = KeepColumns(\n            name=\"keep_columns\",\n            columns=[\"instruction\", \"generation\", \"feedback\", \"result\", \"model_name\"],\n        )\n\n        load_dataset >> task >> keep_columns\n

Then we need to call pipeline.run with the runtime parameters so that the pipeline can be launched.

distiset = pipeline.run(\n    parameters={\n        task.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 1024,\n                    \"temperature\": 0.7,\n                },\n            },\n        },\n    },\n)\n

Finally, we can optionally push the generated dataset, named Distiset, to the Hugging Face Hub via the push_to_hub method, so that each subset generated in the leaf steps is pushed to the Hub.

distiset.push_to_hub(\n    \"instruction-dataset-prometheus\",\n    private=True,\n)\n
"},{"location":"sections/pipeline_samples/papers/ultrafeedback/","title":"UltraFeedback","text":"

UltraFeedback: Boosting Language Models with High-quality Feedback is a paper published by OpenBMB which proposes UltraFeedback, a large-scale, fine-grained, diverse preference dataset, used for training powerful reward models and critic models.

UltraFeedback collects about 64k prompts from diverse resources (including UltraChat, ShareGPT, Evol-Instruct, TruthfulQA, FalseQA, and FLAN), then they use these prompts to query multiple LLMs (commercial models, Llama models ranging 7B to 70B, and non-Llama models) and generate four different responses for each prompt, resulting in a total of 256k samples i.e. the UltraFeedback will rate four responses on every OpenAI request.

To collect high-quality preference and textual feedback, they design a fine-grained annotation instruction, which contains four different aspects, namely instruction-following, truthfulness, honesty and helpfulness (even though within the paper they also mention a fifth one named verbalized calibration). Finally, GPT-4 is used to generate the ratings for the generated responses to the given prompt using the previously mentioned aspects.

"},{"location":"sections/pipeline_samples/papers/ultrafeedback/#replication","title":"Replication","text":"

To replicate the paper we will be using distilabel and a smaller dataset created by the Hugging Face H4 team named HuggingFaceH4/instruction-dataset for testing purposes.

Also for testing purposes we will just show how to evaluate the generated responses for a given prompt using a new global aspect named overall-rating defined by Argilla, that computes the average of the four aspects, so as to reduce the number of requests to be sent to OpenAI, but note that all the aspects are implemented within distilabel and can be used instead for a more faithful reproduction. Besides that we will generate three responses for each instruction using three LLMs selected from a pool of six: HuggingFaceH4/zephyr-7b-beta, argilla/notus-7b-v1, google/gemma-1.1-7b-it, meta-llama/Meta-Llama-3-8B-Instruct, HuggingFaceH4/zephyr-7b-gemma-v0.1 and mlabonne/UltraMerge-7B.

"},{"location":"sections/pipeline_samples/papers/ultrafeedback/#installation","title":"Installation","text":"

To replicate UltraFeedback one will need to install distilabel as follows:

pip install \"distilabel[argilla,openai,vllm]>=1.0.0\"\n

And since we will be using vllm we will need to use a VM with at least 6 NVIDIA GPUs with at least 16GB of memory each to run the text generation, and set the OPENAI_API_KEY environment variable value.

"},{"location":"sections/pipeline_samples/papers/ultrafeedback/#building-blocks","title":"Building blocks","text":"
  • LoadDataFromHub: Generator Step to load a dataset from the Hugging Face Hub.
  • sample_n_steps: Function to create a routing_batch_function that samples n downstream steps for each batch generated by the upstream step. This is the key to replicate the LLM pooling mechanism described in the paper.
  • TextGeneration: Task to generate responses for a given instruction using an LLM.
    • vLLM: LLM that loads a model from the Hugging Face Hub using vllm.
  • GroupColumns: Task that combines multiple columns into a single one i.e. from string to list of strings. Useful when there are multiple parallel steps that are connected to the same node.
  • UltraFeedback: Task that generates ratings for the responses of a given instruction using the UltraFeedback prompt.
    • OpenAILLM: LLM that loads a model from OpenAI.
  • KeepColumns: Task to keep the desired columns while removing the not needed ones, as well as defining the order for those.
  • (optional) PreferenceToArgilla: Task to optionally push the generated dataset to Argilla to do some further analysis and human annotation.
"},{"location":"sections/pipeline_samples/papers/ultrafeedback/#code","title":"Code","text":"

As mentioned before, we will put the previously mentioned building blocks together to replicate UltraFeedback.

from distilabel.models import OpenAILLM, vLLM\nfrom distilabel.pipeline import Pipeline, sample_n_steps\nfrom distilabel.steps import (\n    GroupColumns,\n    KeepColumns,\n    LoadDataFromHub,\n    PreferenceToArgilla,\n)\nfrom distilabel.steps.tasks import TextGeneration, UltraFeedback\n\nsample_three_llms = sample_n_steps(n=3)\n\n\nwith Pipeline(name=\"ultrafeedback-pipeline\") as pipeline:\n    load_hub_dataset = LoadDataFromHub(\n        name=\"load_dataset\",\n        output_mappings={\"prompt\": \"instruction\"},\n        batch_size=2,\n    )\n\n    text_generation_with_notus = TextGeneration(\n        name=\"text_generation_with_notus\",\n        llm=vLLM(model=\"argilla/notus-7b-v1\"),\n        input_batch_size=2,\n        output_mappings={\"model_name\": \"generation_model\"},\n    )\n    text_generation_with_zephyr = TextGeneration(\n        name=\"text_generation_with_zephyr\",\n        llm=vLLM(model=\"HuggingFaceH4/zephyr-7b-gemma-v0.1\"),\n        input_batch_size=2,\n        output_mappings={\"model_name\": \"generation_model\"},\n    )\n    text_generation_with_gemma = TextGeneration(\n        name=\"text_generation_with_gemma\",\n        llm=vLLM(model=\"google/gemma-1.1-7b-it\"),\n        input_batch_size=2,\n        output_mappings={\"model_name\": \"generation_model\"},\n    )\n    text_generation_with_zephyr_gemma = TextGeneration(\n        name=\"text_generation_with_zephyr_gemma\",\n        llm=vLLM(model=\"HuggingFaceH4/zephyr-7b-gemma-v0.1\"),\n        input_batch_size=2,\n        output_mappings={\"model_name\": \"generation_model\"},\n    )\n    text_generation_with_llama = TextGeneration(\n        name=\"text_generation_with_llama\",\n        llm=vLLM(model=\"meta-llama/Meta-Llama-3-8B-Instruct\"),\n        input_batch_size=2,\n        output_mappings={\"model_name\": \"generation_model\"},\n    )\n    text_generation_with_ultramerge = TextGeneration(\n        name=\"text_generation_with_ultramerge\",\n        
llm=vLLM(model=\"mlabonne/UltraMerge-7B\"),\n        input_batch_size=2,\n        output_mappings={\"model_name\": \"generation_model\"},\n    )\n\n    combine_columns = GroupColumns(\n        name=\"combine_columns\",\n        columns=[\"generation\", \"generation_model\"],\n        output_columns=[\"generations\", \"generation_models\"],\n        input_batch_size=2\n    )\n\n    ultrafeedback = UltraFeedback(\n        name=\"ultrafeedback_openai\",\n        llm=OpenAILLM(model=\"gpt-4-turbo-2024-04-09\"),\n        aspect=\"overall-rating\",\n        output_mappings={\"model_name\": \"ultrafeedback_model\"},\n    )\n\n    keep_columns = KeepColumns(\n        name=\"keep_columns\",\n        columns=[\n            \"instruction\",\n            \"generations\",\n            \"generation_models\",\n            \"ratings\",\n            \"rationales\",\n            \"ultrafeedback_model\",\n        ],\n    )\n\n    (\n        load_hub_dataset\n        >> sample_three_llms\n        >> [\n            text_generation_with_notus,\n            text_generation_with_zephyr,\n            text_generation_with_gemma,\n            text_generation_with_llama,\n            text_generation_with_zephyr_gemma,\n            text_generation_with_ultramerge\n        ]\n        >> combine_columns\n        >> ultrafeedback\n        >> keep_columns\n    )\n\n    # Optional: Push the generated dataset to Argilla, but will need to `pip install argilla` first\n    # push_to_argilla = PreferenceToArgilla(\n    #     name=\"push_to_argilla\",\n    #     api_url=\"<ARGILLA_API_URL>\",\n    #     api_key=\"<ARGILLA_API_KEY>\",  # type: ignore\n    #     dataset_name=\"ultrafeedback\",\n    #     dataset_workspace=\"admin\",\n    #     num_generations=2,\n    # )\n    # keep_columns >> push_to_argilla\n

Note

As we're using a relatively small dataset, we're setting a low batch_size and input_batch_size so we have more batches for the routing_batch_function i.e. we will have more variety on the LLMs used to generate the responses. When using a large dataset, it's recommended to use a larger batch_size and input_batch_size to benefit from the vLLM optimizations for larger batch sizes, which makes the pipeline execution faster.

Then we need to call pipeline.run with the runtime parameters so that the pipeline can be launched.

distiset = pipeline.run(\n    parameters={\n        load_hub_dataset.name: {\n            \"repo_id\": \"HuggingFaceH4/instruction-dataset\",\n            \"split\": \"test\",\n        },\n        text_generation_with_notus.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 512,\n                    \"temperature\": 0.7,\n                }\n            },\n        },\n        text_generation_with_zephyr.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 512,\n                    \"temperature\": 0.7,\n                }\n            },\n        },\n        text_generation_with_gemma.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 512,\n                    \"temperature\": 0.7,\n                }\n            },\n        },\n        text_generation_with_llama.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 512,\n                    \"temperature\": 0.7,\n                }\n            },\n        },\n        text_generation_with_zephyr_gemma.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 512,\n                    \"temperature\": 0.7,\n                }\n            },\n        },\n        text_generation_with_ultramerge.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 512,\n                    \"temperature\": 0.7,\n                }\n            },\n        },\n        ultrafeedback.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 2048,\n                    \"temperature\": 0.7,\n                }\n            },\n        },\n    }\n)\n

Finally, we can optionally push the generated dataset, named Distiset, to the Hugging Face Hub via the push_to_hub method, so that each subset generated in the leaf steps is pushed to the Hub.

distiset.push_to_hub(\n    \"ultrafeedback-instruction-dataset\",\n    private=True,\n)\n
"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/","title":"Synthetic data generation for fine-tuning custom retrieval and reranking models","text":"
!pip install \"distilabel[hf-inference-endpoints]\"\n
!pip install \"sentence-transformers~=3.0\"\n

Let's make the needed imports:

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.steps import LoadDataFromHub\n\nfrom sentence_transformers import SentenceTransformer, CrossEncoder\nimport torch\n

You'll need an HF_TOKEN to use the HF Inference Endpoints. Login to use it directly within this notebook.

import os\nfrom huggingface_hub import login\n\nlogin(token=os.getenv(\"HF_TOKEN\"), add_to_git_credential=True)\n
!pip install \"distilabel[argilla, hf-inference-endpoints]\"\n

Let's make the extra needed imports:

import argilla as rg\n
context = (\n\"\"\"\nThe text is a chunk from technical Python SDK documentation of Argilla.\nArgilla is a collaboration tool for AI engineers and domain experts to build high-quality datasets.\nAlong with prose explanations, the text chunk may include code snippets and Python references.\n\"\"\"\n)\n
llm = InferenceEndpointsLLM(\n    model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    tokenizer_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n)\n\nwith Pipeline(name=\"generate\") as pipeline:\n    load_dataset = LoadDataFromHub(\n        num_examples=15,\n        output_mappings={\"chunks\": \"anchor\"},\n    )\n    generate_retrieval_pairs = GenerateSentencePair(\n        name=\"generate_retrieval_pairs\",\n        triplet=True,\n        hard_negative=True,\n        action=\"query\",\n        llm=llm,\n        input_batch_size=10,\n        context=context,\n    )\n    generate_reranking_pairs = GenerateSentencePair(\n        name=\"generate_reranking_pairs\",\n        triplet=True,\n        hard_negative=False,  # to potentially generate non-relevant pairs\n        action=\"semantically-similar\",\n        llm=llm,\n        input_batch_size=10,\n        context=context,\n    )\n\n    load_dataset.connect(generate_retrieval_pairs, generate_reranking_pairs)\n

Next, we can execute this using pipeline.run. We will provide some parameters to specific components within our pipeline.

generation_kwargs = {\n    \"llm\": {\n        \"generation_kwargs\": {\n            \"temperature\": 0.7,\n            \"max_new_tokens\": 512,\n        }\n    }\n}\n\ndistiset = pipeline.run(  \n    parameters={\n        load_dataset.name: {\n            \"repo_id\": \"plaguss/argilla_sdk_docs_raw_unstructured\",\n            \"split\": \"train\",\n        },\n        generate_retrieval_pairs.name: generation_kwargs,\n        generate_reranking_pairs.name: generation_kwargs,\n    },\n    use_cache=False,  # False for demo\n)\n

Data generation can be expensive, so it is recommended to store the data somewhere. For now, we will store it on the Hugging Face Hub, using our push_to_hub method.

distiset.push_to_hub(\"[your-owner-name]/example-retrieval-reranking-dataset\")\n

We have got 2 different leaf/end nodes, therefore we've got two Distiset configurations we can access, one for the retrieval data, and one for the reranking data.

Looking at these initial examples, we can see they nicely capture the essence of the chunks column but we will need to evaluate the quality of the data a bit more before we can use it for fine-tuning.

model_id = \"Snowflake/snowflake-arctic-embed-m\"  # Hugging Face model ID\n\nmodel_retrieval = SentenceTransformer(\n    model_id, device=\"cuda\" if torch.cuda.is_available() else \"cpu\"\n)\n

Next, we will encode the generated text pairs and compute the similarities.

from sklearn.metrics.pairwise import cosine_similarity\n\ndef get_embeddings(texts):\n    vectors = model_retrieval.encode(texts)\n    return [vector.tolist() for vector in vectors]\n\n\ndef get_similarities(vector_batch_a, vector_batch_b):\n    similarities = []\n    for vector_a, vector_b in zip(vector_batch_a, vector_batch_b):\n        similarity = cosine_similarity([vector_a], [vector_b])[0][0]\n        similarities.append(similarity)\n    return similarities\n\ndef format_data_retriever(batch):# -&gt; Any:\n    batch[\"anchor-vector\"] = get_embeddings(batch[\"anchor\"])\n    batch[\"positive-vector\"] = get_embeddings(batch[\"positive\"])\n    batch[\"negative-vector\"] = get_embeddings(batch[\"negative\"])    \n    batch[\"similarity-positive-negative\"] = get_similarities(batch[\"positive-vector\"], batch[\"negative-vector\"])\n    batch[\"similarity-anchor-positive\"] = get_similarities(batch[\"anchor-vector\"], batch[\"positive-vector\"])\n    batch[\"similarity-anchor-negative\"] = get_similarities(batch[\"anchor-vector\"], batch[\"negative-vector\"])\n    return batch\n\ndataset_generate_retrieval_pairs = distiset[\"generate_retrieval_pairs\"][\"train\"].map(format_data_retriever, batched=True, batch_size=250)\n
model_id = \"sentence-transformers/all-MiniLM-L12-v2\"\n\nmodel = CrossEncoder(model_id)\n

Next, we will compute the similarity for the generated text pairs using the reranker. On top of that, we will compute an anchor-vector to allow for doing semantic search.

def format_data_retriever(batch):# -&gt; Any:\n    batch[\"anchor-vector\"] = get_embeddings(batch[\"anchor\"])\n    batch[\"similarity-positive-negative\"] = model.predict(zip(batch[\"positive-vector\"], batch[\"negative-vector\"]))\n    batch[\"similarity-anchor-positive\"] = model.predict(zip(batch[\"anchor-vector\"], batch[\"positive-vector\"]))\n    batch[\"similarity-anchor-negative\"] = model.predict(zip(batch[\"anchor-vector\"], batch[\"negative-vector\"]))\n    return batch\n\ndataset_generate_reranking_pairs = distiset[\"generate_reranking_pairs\"][\"train\"].map(format_data_retriever, batched=True, batch_size=250)\n

And voila, we have our proxies for quality evaluation which we can use to filter out the best and worst examples.

First, we need to define the setting for our Argilla dataset. We will create two different datasets, one for the retrieval data and one for the reranking data to ensure our annotators can focus on the task at hand.

import argilla as rg\nfrom argilla._exceptions import ConflictError\n\napi_key = \"ohh so secret\"\napi_url = \"https://[your-owner-name]-[your-space-name].hf.space\"\n\nclient = rg.Argilla(api_url=api_url, api_key=api_key)\n\nsettings = rg.Settings(\n    fields=[\n        rg.TextField(\"anchor\")\n    ],\n    questions=[\n        rg.TextQuestion(\"positive\"),\n        rg.TextQuestion(\"negative\"),\n        rg.LabelQuestion(\n            name=\"is_positive_relevant\",\n            title=\"Is the positive query relevant?\",\n            labels=[\"yes\", \"no\"],\n        ),\n        rg.LabelQuestion(\n            name=\"is_negative_irrelevant\",\n            title=\"Is the negative query irrelevant?\",\n            labels=[\"yes\", \"no\"],\n        )\n    ],\n    metadata=[\n        rg.TermsMetadataProperty(\"filename\"),\n        rg.FloatMetadataProperty(\"similarity-positive-negative\"),\n        rg.FloatMetadataProperty(\"similarity-anchor-positive\"),\n        rg.FloatMetadataProperty(\"similarity-anchor-negative\"),\n    ],\n    vectors=[\n        rg.VectorField(\"anchor-vector\", dimensions=model.get_sentence_embedding_dimension())\n    ]\n)\nrg_datasets = []\nfor dataset_name in [\"generate_retrieval_pairs\", \"generate_reranking_pairs\"]:\n    ds = rg.Dataset(\n        name=dataset_name,\n        settings=settings\n    )\n    try:\n        ds.create()\n    except ConflictError:\n        ds = client.datasets(dataset_name)\n    rg_datasets.append(ds)\n

Now, we've got our dataset definitions setup in Argilla, we can upload our data to Argilla.

ds_datasets = [dataset_generate_retrieval_pairs, dataset_generate_reranking_pairs]\n\nrecords = []\n\nfor rg_dataset, ds_dataset in zip(rg_datasets, ds_datasets):\n    for idx, entry in enumerate(ds_dataset):\n        records.append(\n            rg.Record(\n                id=idx,\n                fields={\"anchor\": entry[\"anchor\"]},\n                suggestions=[\n                    rg.Suggestion(\"positive\", value=entry[\"positive\"], agent=\"gpt-4o\", type=\"model\"),\n                    rg.Suggestion(\"negative\", value=entry[\"negative\"], agent=\"gpt-4o\", type=\"model\"),\n                ],\n                metadata={\n                    \"filename\": entry[\"filename\"],\n                    \"similarity-positive-negative\": entry[\"similarity-positive-negative\"],\n                    \"similarity-anchor-positive\": entry[\"similarity-anchor-positive\"],\n                    \"similarity-anchor-negative\": entry[\"similarity-anchor-negative\"]\n                },\n                vectors={\"anchor-vector\": entry[\"anchor-vector\"]}\n            )\n        )\n    rg_dataset.records.log(records)\n

Now, we can explore the UI and add a final human touch to get the most out of our dataset.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#synthetic-data-generation-for-fine-tuning-custom-retrieval-and-reranking-models","title":"Synthetic data generation for fine-tuning custom retrieval and reranking models","text":"
  • Goal: Bootstrap, optimize and maintain your embedding models and rerankers through synthetic data generation and human feedback.
  • Libraries: argilla, hf-inference-endpoints, sentence-transformers
  • Components: LoadDataFromHub, GenerateSentencePair, InferenceEndpointsLLM

Note

For a comprehensive overview on optimizing the retrieval performance in a RAG pipeline, check this guide in collaboration with ZenML, an open-source MLOps framework designed for building portable and production-ready machine learning pipelines.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#getting-started","title":"Getting started","text":""},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#install-the-dependencies","title":"Install the dependencies","text":"

To complete this tutorial, you need to install the distilabel SDK and a few third-party libraries via pip. We will be using the free but rate-limited Hugging Face serverless Inference API for this tutorial, so we need to install this as an extra distilabel dependency. You can install them by running the following command:

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#optional-deploy-argilla","title":"(optional) Deploy Argilla","text":"

You can skip this step or replace it with any other data evaluation tool, but the quality of your model will suffer from a lack of data quality, so we do recommend looking at your data. If you already deployed Argilla, you can skip this step. Otherwise, you can quickly deploy Argilla following this guide.

Along with that, you will need to install Argilla as a distilabel extra.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#the-dataset","title":"The dataset","text":"

Before starting any project, it is always important to look at your data. Our data is publicly available on the Hugging Face Hub so we can have a quick look through their dataset viewer within an embedded iFrame.

As we can see, our dataset contains a column called chunks, which was obtained from the Argilla docs. Normally, you would need to download and chunk the data but we will not cover that in this tutorial. To read a full explanation for how this dataset was generated, please refer to How we leveraged distilabel to create an Argilla 2.0 Chatbot.

Alternatively, we can load the entire dataset to disk with datasets.load_dataset.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#synthetic-data-generation","title":"Synthetic data generation","text":"

The GenerateSentencePair component from distilabel can be used to generate training datasets for embeddings models.

It is a pre-defined Task that, given an anchor sentence, generates data for a specific action. Supported actions are: \"paraphrase\", \"semantically-similar\", \"query\", \"answer\". In our case the chunks column corresponds to the anchor. This means we will use query to generate potential queries for fine-tuning a retrieval model and that we will use semantically-similar to generate texts that are similar to the initial anchor for fine-tuning a reranking model.

We will set triplet=True in order to generate both positive and negative examples, which should help the model generalize better during fine-tuning, and we will set hard_negative=True to generate more challenging examples that are closer to the anchor and discussed topics.

Lastly, we can seed the LLM with context to generate more relevant examples.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#retrieval","title":"Retrieval","text":"

For retrieval, we will thus generate queries that are similar to the chunks column. We will use the query action to generate potential queries for fine-tuning a retrieval model.

generate_sentence_pair = GenerateSentencePair(\n    triplet=True,  \n    hard_negative=True,\n    action=\"query\",\n    llm=llm,\n    input_batch_size=10,\n    context=context,\n)\n
"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#reranking","title":"Reranking","text":"

For reranking, we will generate texts that are similar to the initial anchor. We will use the semantically-similar action to generate texts that are similar to the initial anchor for fine-tuning a reranking model. In this case, we set hard_negative=False to generate more diverse and potentially wrong examples, which can be used as negative examples for similarity fine-tuning because rerankers cannot be fine-tuned using triplets.

generate_sentence_pair = GenerateSentencePair(\n    triplet=True,\n    hard_negative=False,\n    action=\"semantically-similar\",\n    llm=llm,\n    input_batch_size=10,\n    context=context,\n)\n
"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#combined-pipeline","title":"Combined pipeline","text":"

We will now use the GenerateSentencePair task to generate synthetic data for both retrieval and reranking models in a single pipeline. Note that we map the chunks column to the anchor argument.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#data-quality-evaluation","title":"Data quality evaluation","text":"

Data is never as clean as it can be, and this also holds for synthetically generated data; therefore, it is always good to spend some time looking at your data.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#feature-engineering","title":"Feature engineering","text":"

In order to evaluate the quality of our data we will use features of the models that we intend to fine-tune as a proxy for data quality. We can then use these features to filter out the best examples.

In order to choose a good default model, we will use the Massive Text Embedding Benchmark (MTEB) Leaderboard. We want to optimize for size and speed, so we will set model size <100M and then filter for Retrieval and Reranking based on the highest average score, resulting in Snowflake/snowflake-arctic-embed-s and sentence-transformers/all-MiniLM-L12-v2 respectively.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#retrieval_1","title":"Retrieval","text":"

For retrieval, we will compute similarities for the current embeddings of anchor-positive, positive-negative and anchor-negative pairs. We assume that an overlap of these similarities will cause the model to have difficulties generalizing and therefore we can use these features to evaluate the quality of our data.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#reranking_1","title":"Reranking","text":"

For reranking, we will compute the relevance scores from an existing reranker model for anchor-positive, positive-negative and anchor-negative pairs and make a similar assumption as for the retrieval model.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#optional-argilla","title":"(Optional) Argilla","text":"

To get the most out of your data and actually look at our data, we will use Argilla. If you are not familiar with Argilla, we recommend taking a look at the Argilla quickstart docs. Alternatively, you can use your Hugging Face account to login to the Argilla demo Space.

To start exploring data, we first need to define an argilla.Dataset. We will create a basic dataset with some input TextFields for the anchor and output TextQuestions for the positive and negative pairs. Additionally, we will use the file_name as MetaDataProperty. Lastly, we will be re-using the vectors obtained from our previous step to allow for semantic search and we will add the similarity scores for some basic filtering and sorting.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#fine-tuning","title":"Fine-tuning","text":"

At last, we can fine-tune our models. We will use the sentence-transformers library to fine-tune our models.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#retrieval_2","title":"Retrieval","text":"

For retrieval, we have created a script that fine-tunes a model on our generated data, based on https://github.com/argilla-io/argilla-sdk-chatbot/blob/main/train_embedding.ipynb. You can also open it in Google Colab directly.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#reranking_2","title":"Reranking","text":"

For reranking, sentence-transformers provides a script that shows how to fine-tune CrossEncoder models. As of now, there is some uncertainty over fine-tuning CrossEncoder models with triplets but you can still use the positive and anchor pairs.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#conclusions","title":"Conclusions","text":"

In this tutorial, we present an end-to-end example of fine-tuning retrievers and rerankers for RAG. This serves as a good starting point for optimizing and maintaining your data and model but needs to be adapted to your specific use case.

We started with some seed data from the Argilla docs, generated synthetic data for retrieval and reranking models, evaluated the quality of the data, and showed how to fine-tune the models. We also used Argilla to get a human touch on the data.

"},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/","title":"Clean an existing preference dataset","text":"
  • Goal: Clean an existing preference dataset by providing AI feedback on the quality of the data.
  • Libraries: argilla, hf-inference-endpoints
  • Components: LoadDataFromDicts, UltraFeedback, KeepColumns, PreferenceToArgilla, InferenceEndpointsLLM, GlobalStep
!pip install \"distilabel[hf-inference-endpoints]\"\n
!pip install \"transformers~=4.0\" \"torch~=2.0\"\n

Let's make the required imports:

import random\n\nfrom datasets import load_dataset\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import (\n    KeepColumns,\n    LoadDataFromDicts,\n    PreferenceToArgilla,\n)\nfrom distilabel.steps.tasks import UltraFeedback\n

You'll need an HF_TOKEN to use the HF Inference Endpoints. Login to use it directly within this notebook.

import os\nfrom huggingface_hub import login\n\nlogin(token=os.getenv(\"HF_TOKEN\"), add_to_git_credential=True)\n
!pip install \"distilabel[argilla, hf-inference-endpoints]\"\n

In this case, we will clean a preference dataset, so we will use the Intel/orca_dpo_pairs dataset from the Hugging Face Hub.

dataset = load_dataset(\"Intel/orca_dpo_pairs\", split=\"train[:20]\")\n

Next, we will shuffle the chosen and rejected columns to avoid any bias in the dataset.

def shuffle_and_track(chosen, rejected):\n    pair = [chosen, rejected]\n    random.shuffle(pair)\n    order = [\"chosen\" if x == chosen else \"rejected\" for x in pair]\n    return {\"generations\": pair, \"order\": order}\n\ndataset = dataset.map(lambda x: shuffle_and_track(x[\"chosen\"], x[\"rejected\"]))\n
dataset = dataset.to_list()\n
As a custom step

You can also create a custom step in a separate module, import it and add it to the pipeline after loading the orca_dpo_pairs dataset using the LoadDataFromHub step.

shuffle_step.py
from typing import TYPE_CHECKING, List\nfrom distilabel.steps import GlobalStep, StepInput\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepOutput\n\nimport random\n\nclass ShuffleStep(GlobalStep):\n    @property\n    def inputs(self):\n        \"\"\"Returns List[str]: The inputs of the step.\"\"\"\n        return [\"instruction\", \"chosen\", \"rejected\"]\n\n    @property\n    def outputs(self):\n        \"\"\"Returns List[str]: The outputs of the step.\"\"\"\n        return [\"instruction\", \"generations\", \"order\"]\n\n    def process(self, inputs: StepInput):\n        \"\"\"Returns StepOutput: The outputs of the step.\"\"\"\n        outputs = []\n\n        for input in inputs:\n            chosen = input[\"chosen\"]\n            rejected = input[\"rejected\"]\n            pair = [chosen, rejected]\n            random.shuffle(pair)\n            order = [\"chosen\" if x == chosen else \"rejected\" for x in pair]\n\n            outputs.append({\"instruction\": input[\"instruction\"], \"generations\": pair, \"order\": order})\n\n        yield outputs\n
from shuffle_step import ShuffleStep\n

To clean an existing preference dataset, we will need to define a Pipeline with all the necessary steps. However, a similar workflow can be used to clean an SFT dataset. Below, we will go over each step in detail.

load_dataset = LoadDataFromDicts(\n    data=dataset[:1],\n    output_mappings={\"question\": \"instruction\"},\n    pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nload_dataset.load()\nnext(load_dataset.process())\n
\n([{'system': '',\n   'question': \"You will be given a definition of a task first, then some input of the task.\\nThis task is about using the specified sentence and converting the sentence to Resource Description Framework (RDF) triplets of the form (subject, predicate object). The RDF triplets generated must be such that the triplets accurately capture the structure and semantics of the input sentence. The input is a sentence and the output is a list of triplets of the form [subject, predicate, object] that capture the relationships present in the sentence. When a sentence has more than 1 RDF triplet possible, the output must contain all of them.\\n\\nAFC Ajax (amateurs)'s ground is Sportpark De Toekomst where Ajax Youth Academy also play.\\nOutput:\",\n   'chosen': '[\\n  [\"AFC Ajax (amateurs)\", \"has ground\", \"Sportpark De Toekomst\"],\\n  [\"Ajax Youth Academy\", \"plays at\", \"Sportpark De Toekomst\"]\\n]',\n   'rejected': \" Sure, I'd be happy to help! Here are the RDF triplets for the input sentence:\\n\\n[AFC Ajax (amateurs), hasGround, Sportpark De Toekomst]\\n[Ajax Youth Academy, playsAt, Sportpark De Toekomst]\\n\\nExplanation:\\n\\n* AFC Ajax (amateurs) is the subject of the first triplet, and hasGround is the predicate that describes the relationship between AFC Ajax (amateurs) and Sportpark De Toekomst.\\n* Ajax Youth Academy is the subject of the second triplet, and playsAt is the predicate that describes the relationship between Ajax Youth Academy and Sportpark De Toekomst.\\n\\nNote that there may be other possible RDF triplets that could be derived from the input sentence, but the above triplets capture the main relationships present in the sentence.\",\n   'generations': [\" Sure, I'd be happy to help! 
Here are the RDF triplets for the input sentence:\\n\\n[AFC Ajax (amateurs), hasGround, Sportpark De Toekomst]\\n[Ajax Youth Academy, playsAt, Sportpark De Toekomst]\\n\\nExplanation:\\n\\n* AFC Ajax (amateurs) is the subject of the first triplet, and hasGround is the predicate that describes the relationship between AFC Ajax (amateurs) and Sportpark De Toekomst.\\n* Ajax Youth Academy is the subject of the second triplet, and playsAt is the predicate that describes the relationship between Ajax Youth Academy and Sportpark De Toekomst.\\n\\nNote that there may be other possible RDF triplets that could be derived from the input sentence, but the above triplets capture the main relationships present in the sentence.\",\n    '[\\n  [\"AFC Ajax (amateurs)\", \"has ground\", \"Sportpark De Toekomst\"],\\n  [\"Ajax Youth Academy\", \"plays at\", \"Sportpark De Toekomst\"]\\n]'],\n   'order': ['rejected', 'chosen']}],\n True)\n
evaluate_responses = UltraFeedback(\n    aspect=\"overall-rating\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n    ),\n    pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nevaluate_responses.load()\nnext(\n    evaluate_responses.process(\n        [\n            {\n                \"instruction\": \"What's the capital of Spain?\",\n                \"generations\": [\"Madrid\", \"Barcelona\"],\n            }\n        ]\n    )\n)\n
\n[{'instruction': \"What's the capital of Spain?\",\n  'generations': ['Madrid', 'Barcelona'],\n  'ratings': [5, 1],\n  'rationales': [\"The answer is correct, directly addressing the question, and is free of hallucinations or unnecessary details. It confidently provides the accurate information, aligning perfectly with the user's intent.\",\n   \"The answer is incorrect as Barcelona is not the capital of Spain. This introduces a significant inaccuracy, failing to provide helpful information and deviating entirely from the user's intent.\"],\n  'distilabel_metadata': {'raw_output_ultra_feedback_0': \"#### Output for Text 1\\nRating: 5 (Excellent)\\nRationale: The answer is correct, directly addressing the question, and is free of hallucinations or unnecessary details. It confidently provides the accurate information, aligning perfectly with the user's intent.\\n\\n#### Output for Text 2\\nRating: 1 (Low Quality)\\nRationale: The answer is incorrect as Barcelona is not the capital of Spain. This introduces a significant inaccuracy, failing to provide helpful information and deviating entirely from the user's intent.\"},\n  'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
keep_columns = KeepColumns(\n    columns=[\n        \"instruction\",\n        \"generations\",\n        \"order\",\n        \"ratings\",\n        \"rationales\",\n        \"model_name\",\n    ],\n    pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nkeep_columns.load()\nnext(\n    keep_columns.process(\n        [\n            {\n                \"system\": \"\",\n                \"instruction\": \"What's the capital of Spain?\",\n                \"chosen\": \"Madrid\",\n                \"rejected\": \"Barcelona\",\n                \"generations\": [\"Madrid\", \"Barcelona\"],\n                \"order\": [\"chosen\", \"rejected\"],\n                \"ratings\": [5, 1],\n                \"rationales\": [\"\", \"\"],\n                \"model_name\": \"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            }\n        ]\n    )\n)\n
\n[{'instruction': \"What's the capital of Spain?\",\n  'generations': ['Madrid', 'Barcelona'],\n  'order': ['chosen', 'rejected'],\n  'ratings': [5, 1],\n  'rationales': ['', ''],\n  'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
to_argilla = PreferenceToArgilla(\n    dataset_name=\"cleaned-dataset\",\n    dataset_workspace=\"argilla\",\n    api_url=\"https://[your-owner-name]-[your-space-name].hf.space\",\n    api_key=\"[your-api-key]\",\n    num_generations=2\n)\n

Below, you can see the full pipeline definition:

with Pipeline(name=\"clean-dataset\") as pipeline:\n\n    load_dataset = LoadDataFromDicts(\n        data=dataset, output_mappings={\"question\": \"instruction\"}\n    )\n\n    evaluate_responses = UltraFeedback(\n        aspect=\"overall-rating\",\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n        ),\n    )\n\n    keep_columns = KeepColumns(\n        columns=[\n            \"instruction\",\n            \"generations\",\n            \"order\",\n            \"ratings\",\n            \"rationales\",\n            \"model_name\",\n        ]\n    )\n\n    to_argilla = PreferenceToArgilla(\n        dataset_name=\"cleaned-dataset\",\n        dataset_workspace=\"argilla\",\n        api_url=\"https://[your-owner-name]-[your-space-name].hf.space\",\n        api_key=\"[your-api-key]\",\n        num_generations=2,\n    )\n\n    load_dataset.connect(evaluate_responses)\n    evaluate_responses.connect(keep_columns)\n    keep_columns.connect(to_argilla)\n

Let's now run the pipeline and clean our preference dataset.

distiset = pipeline.run()\n

Let's check it! If you have loaded the data to Argilla, you can start annotating in the Argilla UI.

You can push the dataset to the Hub for sharing with the community and embed it to explore the data.

distiset.push_to_hub(\"[your-owner-name]/example-cleaned-preference-dataset\")\n

In this tutorial, we showcased the detailed steps to build a pipeline for cleaning a preference dataset using distilabel. However, you can customize this pipeline for your own use cases, such as cleaning an SFT dataset or adding custom steps.

We used a preference dataset as our starting point and shuffled the data to avoid any bias. Next, we evaluated the responses using a model through the serverless Hugging Face Inference API, following the UltraFeedback standards. Finally, we kept the needed columns and used Argilla for further curation.

"},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#clean-an-existing-preference-dataset","title":"Clean an existing preference dataset","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#getting-started","title":"Getting Started","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#install-the-dependencies","title":"Install the dependencies","text":"

To complete this tutorial, you need to install the distilabel SDK and a few third-party libraries via pip. We will be using the free but rate-limited Hugging Face serverless Inference API for this tutorial, so we need to install this as an extra distilabel dependency. You can install them by running the following command:

"},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#optional-deploy-argilla","title":"(optional) Deploy Argilla","text":"

You can skip this step or replace it with any other data evaluation tool, but the quality of your model will suffer from a lack of data quality, so we do recommend looking at your data. If you already deployed Argilla, you can skip this step. Otherwise, you can quickly deploy Argilla following this guide.

Along with that, you will need to install Argilla as a distilabel extra.

"},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#the-dataset","title":"The dataset","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#define-the-pipeline","title":"Define the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#load-the-dataset","title":"Load the dataset","text":"

We will use the dataset we just shuffled as source data.

  • Component: LoadDataFromDicts
  • Input columns: system, question, chosen, rejected, generations and order, the same keys as in the loaded list of dictionaries.
  • Output columns: system, instruction, chosen, rejected, generations and order. We will use output_mappings to rename the columns.
"},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#evaluate-the-responses","title":"Evaluate the responses","text":"

To evaluate the quality of the responses, we will use meta-llama/Meta-Llama-3.1-70B-Instruct, applying the UltraFeedback task that judges the responses according to different dimensions (helpfulness, honesty, instruction-following, truthfulness). For an SFT dataset, you can use PrometheusEval instead.

  • Component: UltraFeedback task with LLMs using InferenceEndpointsLLM
  • Input columns: instruction, generations
  • Output columns: ratings, rationales, distilabel_metadata, model_name

For your use case and to improve the results, you can use any other LLM of your choice.

"},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#keep-only-the-required-columns","title":"Keep only the required columns","text":"

We will get rid of the unneeded columns.

  • Component: KeepColumns
  • Input columns: system, instruction, chosen, rejected, generations, order, ratings, rationales, distilabel_metadata and model_name
  • Output columns: instruction, generations, order, ratings, rationales and model_name
"},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#optional-further-data-curation","title":"(Optional) Further data curation","text":"

You can use Argilla to further curate your data.

  • Component: PreferenceToArgilla step
  • Input columns: instruction, generations, generation_models, ratings
  • Output columns: instruction, generations, generation_models, ratings
"},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#run-the-pipeline","title":"Run the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#conclusions","title":"Conclusions","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/","title":"Generate a preference dataset","text":"
  • Goal: Generate a synthetic preference dataset for DPO/ORPO.
  • Libraries: argilla, hf-inference-endpoints
  • Components: LoadDataFromHub, TextGeneration, UltraFeedback, GroupColumns, FormatTextGenerationDPO, PreferenceToArgilla, InferenceEndpointsLLM
!pip install \"distilabel[hf-inference-endpoints]\"\n
!pip install \"transformers~=4.0\" \"torch~=2.0\"\n

Let's make the required imports:

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import (\n    LoadDataFromHub,\n    GroupColumns,\n    FormatTextGenerationDPO,\n    PreferenceToArgilla,\n)\nfrom distilabel.steps.tasks import TextGeneration, UltraFeedback\n

You'll need an HF_TOKEN to use the HF Inference Endpoints. Log in to use it directly within this notebook.

import os\nfrom huggingface_hub import login\n\nlogin(token=os.getenv(\"HF_TOKEN\"), add_to_git_credential=True)\n
!pip install \"distilabel[argilla, hf-inference-endpoints]\"\n

To generate our preference dataset, we will need to define a Pipeline with all the necessary steps. Below, we will go over each step in detail.

load_dataset = LoadDataFromHub(\n        repo_id= \"argilla/10Kprompts-mini\",\n        num_examples=1,\n        pipeline=Pipeline(name=\"showcase-pipeline\"),\n    )\nload_dataset.load()\nnext(load_dataset.process())\n
\n([{'instruction': 'How can I create an efficient and robust workflow that utilizes advanced automation techniques to extract targeted data, including customer information, from diverse PDF documents and effortlessly integrate it into a designated Google Sheet? Furthermore, I am interested in establishing a comprehensive and seamless system that promptly activates an SMS notification on my mobile device whenever a new PDF document is uploaded to the Google Sheet, ensuring real-time updates and enhanced accessibility.',\n   'topic': 'Software Development'}],\n True)\n
generate_responses = [\n    TextGeneration(\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n            generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n        ),\n        pipeline=Pipeline(name=\"showcase-pipeline\"),\n    ),\n    TextGeneration(\n        llm=InferenceEndpointsLLM(\n            model_id=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n            tokenizer_id=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n            generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n        ),\n        pipeline=Pipeline(name=\"showcase-pipeline\"),\n    ),\n]\nfor task in generate_responses:\n    task.load()\n    print(next(task.process([{\"instruction\": \"Which are the top cities in Spain?\"}])))\n
\n[{'instruction': 'Which are the top cities in Spain?', 'generation': 'Spain is a country with a rich culture, history, and architecture, and it has many great cities to visit. Here are some of the top cities in Spain:\\n\\n1. **Madrid**: The capital city of Spain, known for its vibrant nightlife, museums, and historic landmarks like the Royal Palace and Prado Museum.\\n2. **Barcelona**: The second-largest city in Spain, famous for its modernist architecture, beaches, and iconic landmarks like La Sagrada Fam\u00edlia and Park G\u00fcell, designed by Antoni Gaud\u00ed.\\n3. **Valencia**: Located on the Mediterranean coast, Valencia is known for its beautiful beaches, City of Arts and Sciences, and delicious local cuisine, such as paella.\\n4. **Seville**: The capital of Andalusia, Seville is famous for its stunning cathedral, Royal Alc\u00e1zar Palace, and lively flamenco music scene.\\n5. **M\u00e1laga**: A coastal city in southern Spain, M\u00e1laga is known for its rich history, beautiful beaches, and being the birthplace of Pablo Picasso.\\n6. **Zaragoza**: Located in the northeastern region of Aragon, Zaragoza is a city with a rich history, known for its Roman ruins, Gothic cathedral, and beautiful parks.\\n7. **Granada**: A city in the Andalusian region, Granada is famous for its stunning Alhambra palace and generalife gardens, a UNESCO World Heritage Site.\\n8. **Bilbao**: A city in the Basque Country, Bilbao is known for its modern architecture, including the Guggenheim Museum, and its rich cultural heritage.\\n9. **Alicante**: A coastal city in the Valencia region, Alicante is famous for its beautiful beaches, historic castle, and lively nightlife.\\n10. 
**San Sebasti\u00e1n**: A city in the Basque Country, San Sebasti\u00e1n is known for its stunning beaches, gastronomic scene, and cultural events like the San Sebasti\u00e1n International Film Festival.\\n\\nThese are just a few of the many great cities in Spain, each with its own unique character and attractions.', 'distilabel_metadata': {'raw_output_text_generation_0': 'Spain is a country with a rich culture, history, and architecture, and it has many great cities to visit. Here are some of the top cities in Spain:\\n\\n1. **Madrid**: The capital city of Spain, known for its vibrant nightlife, museums, and historic landmarks like the Royal Palace and Prado Museum.\\n2. **Barcelona**: The second-largest city in Spain, famous for its modernist architecture, beaches, and iconic landmarks like La Sagrada Fam\u00edlia and Park G\u00fcell, designed by Antoni Gaud\u00ed.\\n3. **Valencia**: Located on the Mediterranean coast, Valencia is known for its beautiful beaches, City of Arts and Sciences, and delicious local cuisine, such as paella.\\n4. **Seville**: The capital of Andalusia, Seville is famous for its stunning cathedral, Royal Alc\u00e1zar Palace, and lively flamenco music scene.\\n5. **M\u00e1laga**: A coastal city in southern Spain, M\u00e1laga is known for its rich history, beautiful beaches, and being the birthplace of Pablo Picasso.\\n6. **Zaragoza**: Located in the northeastern region of Aragon, Zaragoza is a city with a rich history, known for its Roman ruins, Gothic cathedral, and beautiful parks.\\n7. **Granada**: A city in the Andalusian region, Granada is famous for its stunning Alhambra palace and generalife gardens, a UNESCO World Heritage Site.\\n8. **Bilbao**: A city in the Basque Country, Bilbao is known for its modern architecture, including the Guggenheim Museum, and its rich cultural heritage.\\n9. **Alicante**: A coastal city in the Valencia region, Alicante is famous for its beautiful beaches, historic castle, and lively nightlife.\\n10. 
**San Sebasti\u00e1n**: A city in the Basque Country, San Sebasti\u00e1n is known for its stunning beaches, gastronomic scene, and cultural events like the San Sebasti\u00e1n International Film Festival.\\n\\nThese are just a few of the many great cities in Spain, each with its own unique character and attractions.'}, 'model_name': 'meta-llama/Meta-Llama-3-8B-Instruct'}]\n[{'instruction': 'Which are the top cities in Spain?', 'generation': ' Here are some of the top cities in Spain based on various factors such as tourism, culture, history, and quality of life:\\n\\n1. Madrid: The capital and largest city in Spain, Madrid is known for its vibrant nightlife, world-class museums (such as the Prado Museum and Reina Sofia Museum), stunning parks (such as the Retiro Park), and delicious food.\\n\\n2. Barcelona: Famous for its unique architecture, Barcelona is home to several UNESCO World Heritage sites designed by Antoni Gaud\u00ed, including the Sagrada Familia and Park G\u00fcell. The city also boasts beautiful beaches, a lively arts scene, and delicious Catalan cuisine.\\n\\n3. Valencia: A coastal city located in the east of Spain, Valencia is known for its City of Arts and Sciences, a modern architectural complex that includes a planetarium, opera house, and museum of interactive science. The city is also famous for its paella, a traditional Spanish dish made with rice, vegetables, and seafood.\\n\\n4. Seville: The capital of Andalusia, Seville is famous for its flamenco dancing, stunning cathedral (the largest Gothic cathedral in the world), and the Alc\u00e1zar, a beautiful palace made up of a series of rooms and courtyards.\\n\\n5. Granada: Located in the foothills of the Sierra Nevada mountains, Granada is known for its stunning Alhambra palace, a Moorish fortress that dates back to the 9th century. The city is also famous for its tapas, a traditional Spanish dish that is often served for free with drinks.\\n\\n6. 
Bilbao: A city in the Basque Country, Bilbao is famous for its modern architecture, including the Guggenheim Museum, a contemporary art museum designed by Frank Gehry. The city is also known for its pintxos, a type of Basque tapas that are served in bars and restaurants.\\n\\n7. M\u00e1laga: A coastal city in Andalusia, M\u00e1laga is known for its beautiful beaches, historic sites (including the Alcazaba and Gibralfaro castles), and the Picasso Museum, which is dedicated to the famous Spanish artist who was born in the city.\\n\\nThese are just a few of the many wonderful cities in Spain.', 'distilabel_metadata': {'raw_output_text_generation_0': ' Here are some of the top cities in Spain based on various factors such as tourism, culture, history, and quality of life:\\n\\n1. Madrid: The capital and largest city in Spain, Madrid is known for its vibrant nightlife, world-class museums (such as the Prado Museum and Reina Sofia Museum), stunning parks (such as the Retiro Park), and delicious food.\\n\\n2. Barcelona: Famous for its unique architecture, Barcelona is home to several UNESCO World Heritage sites designed by Antoni Gaud\u00ed, including the Sagrada Familia and Park G\u00fcell. The city also boasts beautiful beaches, a lively arts scene, and delicious Catalan cuisine.\\n\\n3. Valencia: A coastal city located in the east of Spain, Valencia is known for its City of Arts and Sciences, a modern architectural complex that includes a planetarium, opera house, and museum of interactive science. The city is also famous for its paella, a traditional Spanish dish made with rice, vegetables, and seafood.\\n\\n4. Seville: The capital of Andalusia, Seville is famous for its flamenco dancing, stunning cathedral (the largest Gothic cathedral in the world), and the Alc\u00e1zar, a beautiful palace made up of a series of rooms and courtyards.\\n\\n5. 
Granada: Located in the foothills of the Sierra Nevada mountains, Granada is known for its stunning Alhambra palace, a Moorish fortress that dates back to the 9th century. The city is also famous for its tapas, a traditional Spanish dish that is often served for free with drinks.\\n\\n6. Bilbao: A city in the Basque Country, Bilbao is famous for its modern architecture, including the Guggenheim Museum, a contemporary art museum designed by Frank Gehry. The city is also known for its pintxos, a type of Basque tapas that are served in bars and restaurants.\\n\\n7. M\u00e1laga: A coastal city in Andalusia, M\u00e1laga is known for its beautiful beaches, historic sites (including the Alcazaba and Gibralfaro castles), and the Picasso Museum, which is dedicated to the famous Spanish artist who was born in the city.\\n\\nThese are just a few of the many wonderful cities in Spain.'}, 'model_name': 'mistralai/Mixtral-8x7B-Instruct-v0.1'}]\n\n
group_responses = GroupColumns(\n    columns=[\"generation\", \"model_name\"],\n    output_columns=[\"generations\", \"model_names\"],\n    pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nnext(\n    group_responses.process(\n        [\n            {\n                \"generation\": \"Madrid\",\n                \"model_name\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n            },\n        ],\n        [\n            {\n                \"generation\": \"Barcelona\",\n                \"model_name\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n            }\n        ],\n    )\n)\n
\n[{'generations': ['Madrid', 'Barcelona'],\n  'model_names': ['meta-llama/Meta-Llama-3-8B-Instruct',\n   'mistralai/Mixtral-8x7B-Instruct-v0.1']}]\n
evaluate_responses = UltraFeedback(\n    aspect=\"overall-rating\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n    ),\n    pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nevaluate_responses.load()\nnext(\n    evaluate_responses.process(\n        [\n            {\n                \"instruction\": \"What's the capital of Spain?\",\n                \"generations\": [\"Madrid\", \"Barcelona\"],\n            }\n        ]\n    )\n)\n
\n[{'instruction': \"What's the capital of Spain?\",\n  'generations': ['Madrid', 'Barcelona'],\n  'ratings': [5, 1],\n  'rationales': [\"The answer is correct, directly addressing the question, and is free of hallucinations or unnecessary details. It confidently provides the accurate information, aligning perfectly with the user's intent.\",\n   \"The answer is incorrect as Barcelona is not the capital of Spain. This introduces a significant inaccuracy, failing to provide helpful information and deviating entirely from the user's intent.\"],\n  'distilabel_metadata': {'raw_output_ultra_feedback_0': \"#### Output for Text 1\\nRating: 5 (Excellent)\\nRationale: The answer is correct, directly addressing the question, and is free of hallucinations or unnecessary details. It confidently provides the accurate information, aligning perfectly with the user's intent.\\n\\n#### Output for Text 2\\nRating: 1 (Low Quality)\\nRationale: The answer is incorrect as Barcelona is not the capital of Spain. This introduces a significant inaccuracy, failing to provide helpful information and deviating entirely from the user's intent.\"},\n  'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'}]\n
format_dpo = FormatTextGenerationDPO(pipeline=Pipeline(name=\"showcase-pipeline\"))\nformat_dpo.load()\nnext(\n    format_dpo.process(\n        [\n            {\n                \"instruction\": \"What's the capital of Spain?\",\n                \"generations\": [\"Madrid\", \"Barcelona\"],\n                \"generation_models\": [\n                    \"Meta-Llama-3-8B-Instruct\",\n                    \"Mixtral-8x7B-Instruct-v0.1\",\n                ],\n                \"ratings\": [5, 1],\n            }\n        ]\n    )\n)\n
\n[{'instruction': \"What's the capital of Spain?\",\n  'generations': ['Madrid', 'Barcelona'],\n  'generation_models': ['Meta-Llama-3-8B-Instruct',\n   'Mixtral-8x7B-Instruct-v0.1'],\n  'ratings': [5, 1],\n  'prompt': \"What's the capital of Spain?\",\n  'prompt_id': '26174c953df26b3049484e4721102dca6b25d2de9e3aa22aa84f25ed1c798512',\n  'chosen': [{'role': 'user', 'content': \"What's the capital of Spain?\"},\n   {'role': 'assistant', 'content': 'Madrid'}],\n  'chosen_model': 'Meta-Llama-3-8B-Instruct',\n  'chosen_rating': 5,\n  'rejected': [{'role': 'user', 'content': \"What's the capital of Spain?\"},\n   {'role': 'assistant', 'content': 'Barcelona'}],\n  'rejected_model': 'Mixtral-8x7B-Instruct-v0.1',\n  'rejected_rating': 1}]\n
  • Or you can use Argilla to manually label the data and convert it to a preference dataset.
    • Component: PreferenceToArgilla step
    • Input columns: instruction, generations, generation_models, ratings
    • Output columns: instruction, generations, generation_models, ratings
to_argilla = PreferenceToArgilla(\n    dataset_name=\"preference-dataset\",\n    dataset_workspace=\"argilla\",\n    api_url=\"https://[your-owner-name]-[your-space-name].hf.space\",\n    api_key=\"[your-api-key]\",\n    num_generations=2\n)\n

Below, you can see the full pipeline definition:

with Pipeline(name=\"generate-dataset\") as pipeline:\n\n    load_dataset = LoadDataFromHub(repo_id=\"argilla/10Kprompts-mini\")\n\n    generate_responses = [\n        TextGeneration(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n                generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n            )\n        ),\n        TextGeneration(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n                tokenizer_id=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n                generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n            )\n        ),\n    ]\n\n    group_responses = GroupColumns(\n        columns=[\"generation\", \"model_name\"],\n        output_columns=[\"generations\", \"model_names\"],\n    )\n\n    evaluate_responses = UltraFeedback(\n        aspect=\"overall-rating\",\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n            generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n        )\n    )\n\n    format_dpo = FormatTextGenerationDPO()\n\n    to_argilla = PreferenceToArgilla(\n        dataset_name=\"preference-dataset\",\n        dataset_workspace=\"argilla\",\n        api_url=\"https://[your-owner-name]-[your-space-name].hf.space\",\n        api_key=\"[your-api-key]\",\n        num_generations=2\n    )\n\n    for task in generate_responses:\n        load_dataset.connect(task)\n        task.connect(group_responses)\n    group_responses.connect(evaluate_responses)\n    evaluate_responses.connect(format_dpo, to_argilla)\n

Let's now run the pipeline and generate the preference dataset.

distiset = pipeline.run()\n

Let's check the preference dataset! If you have loaded the data to Argilla, you can start annotating in the Argilla UI.

You can push the dataset to the Hub for sharing with the community and embed it to explore the data.

distiset.push_to_hub(\"[your-owner-name]/example-preference-dataset\")\n

In this tutorial, we showcased the detailed steps to build a pipeline for generating a preference dataset using distilabel. You can customize this pipeline for your own use cases and share your datasets with the community through the Hugging Face Hub, or use them to train a model for DPO or ORPO.

We used a dataset containing prompts to generate responses using two different models through the serverless Hugging Face Inference API. Next, we evaluated the responses using a third model, following the UltraFeedback standards. Finally, we converted the data to a preference dataset and used Argilla for further curation.

"},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#generate-a-preference-dataset","title":"Generate a preference dataset","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#getting-started","title":"Getting started","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#install-the-dependencies","title":"Install the dependencies","text":"

To complete this tutorial, you need to install the distilabel SDK and a few third-party libraries via pip. We will be using the free but rate-limited Hugging Face serverless Inference API for this tutorial, so we need to install this as an extra distilabel dependency. You can install them by running the following command:

"},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#optional-deploy-argilla","title":"(optional) Deploy Argilla","text":"

You can skip this step or replace it with any other data evaluation tool, but the quality of your model will suffer from a lack of data quality, so we do recommend looking at your data. If you already deployed Argilla, you can skip this step. Otherwise, you can quickly deploy Argilla following this guide.

Along with that, you will need to install Argilla as a distilabel extra.

"},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#define-the-pipeline","title":"Define the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#load-the-dataset","title":"Load the dataset","text":"

We will use as source data the argilla/10Kprompts-mini dataset from the Hugging Face Hub.

  • Component: LoadDataFromHub
  • Input columns: instruction and topic, the same as in the loaded dataset
  • Output columns: instruction and topic
"},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#generate-responses","title":"Generate responses","text":"

We need to generate the responses for the given instructions. We will use two different models available on the Hugging Face Hub through the Serverless Inference API: meta-llama/Meta-Llama-3-8B-Instruct and mistralai/Mixtral-8x7B-Instruct-v0.1. We will also indicate the generation parameters for each model.

  • Component: TextGeneration task with LLMs using InferenceEndpointsLLM
  • Input columns: instruction
  • Output columns: generation, distilabel_metadata, model_name for each model

For your use case and to improve the results, you can use any other LLM of your choice.

"},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#group-the-responses","title":"Group the responses","text":"

The task to evaluate the responses needs as input a list of generations. However, each model response was saved in the generation column of the subsets text_generation_0 and text_generation_1. We will combine these two columns into a single column and the default subset.

  • Component: GroupColumns
  • Input columns: generation and model_name from text_generation_0 and text_generation_1
  • Output columns: generations and model_names
"},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#evaluate-the-responses","title":"Evaluate the responses","text":"

To build our preference dataset, we need to evaluate the responses generated by the models. We will use meta-llama/Meta-Llama-3-70B-Instruct for this, applying the UltraFeedback task that judges the responses according to different dimensions (helpfulness, honesty, instruction-following, truthfulness).

  • Component: UltraFeedback task with LLMs using InferenceEndpointsLLM
  • Input columns: instruction, generations
  • Output columns: ratings, rationales, distilabel_metadata, model_name

For your use case and to improve the results, you can use any other LLM of your choice.

"},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#convert-to-a-preference-dataset","title":"Convert to a preference dataset","text":"
  • You can automatically convert it to a preference dataset with the chosen and rejected columns.
    • Component: FormatTextGenerationDPO step
    • Input columns: instruction, generations, generation_models, ratings
    • Output columns: prompt, prompt_id, chosen, chosen_model, chosen_rating, rejected, rejected_model, rejected_rating
"},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#run-the-pipeline","title":"Run the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#conclusions","title":"Conclusions","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/","title":"Generate synthetic text classification data","text":"
  • Goal: Generate synthetic text classification data to augment an imbalanced and limited dataset for training a topic classifier. In addition, generate new data for training a fact-based versus opinion-based classifier to add a new label.
  • Libraries: argilla, hf-inference-endpoints, SetFit
  • Components: LoadDataFromDicts, EmbeddingTaskGenerator, GenerateTextClassificationData
!pip install \"distilabel[hf-inference-endpoints]\"\n
!pip install \"transformers~=4.40\" \"torch~=2.0\" \"setfit~=1.0\"\n

Let's make the required imports:

import random\nfrom collections import Counter\n\nfrom datasets import load_dataset, Dataset\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import (\n    GenerateTextClassificationData,\n)\nfrom setfit import SetFitModel, Trainer, sample_dataset\n

You'll need an HF_TOKEN to use the HF Inference Endpoints. Log in to use it directly within this notebook.

import os\nfrom huggingface_hub import login\n\nlogin(token=os.getenv(\"HF_TOKEN\"), add_to_git_credential=True)\n
!pip install \"distilabel[argilla, hf-inference-endpoints]\"\n

We will use the fancyzhx/ag_news dataset from the Hugging Face Hub as our original data source. To simulate a real-world scenario with imbalanced and limited data, we will load only 20 samples from this dataset.

hf_dataset = load_dataset(\"fancyzhx/ag_news\", split=\"train[-20:]\")\n

Now, we can retrieve the available labels in the dataset and examine the current data distribution.

labels_topic = hf_dataset.features[\"label\"].names\nid2str = {i: labels_topic[i] for i in range(len(labels_topic))}\nprint(id2str)\nprint(Counter(hf_dataset[\"label\"]))\n
\n{0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}\nCounter({0: 12, 1: 6, 2: 2})\n\n

As observed, the dataset is imbalanced, with most samples falling under the World category, while the Sci/Tech category is entirely missing. Moreover, there are insufficient samples to effectively train a topic classification model.

We will also define the labels for the new classification task.

labels_fact_opinion = [\"Fact-based\", \"Opinion-based\"]\n

To generate the data, we will use the GenerateTextClassificationData task. This task takes classification tasks as input, and we can define the language, difficulty and clarity required for the generated data.

task = GenerateTextClassificationData(\n    language=\"English\",\n    difficulty=\"college\",\n    clarity=\"clear\",\n    num_generations=1,\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n        generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.4},\n    ),\n    input_batch_size=5,\n)\ntask.load()\nresult = next(\n    task.process([{\"task\": \"Classify the news article as fact-based or opinion-based\"}])\n)\nprint(result[0][\"distilabel_metadata\"][\"raw_input_generate_text_classification_data_0\"])\n
\n[{'role': 'user', 'content': 'You have been assigned a text classification task: Classify the news article as fact-based or opinion-based\\n\\nYour mission is to write one text classification example for this task in JSON format. The JSON object must contain the following keys:\\n - \"input_text\": a string, the input text specified by the classification task.\\n - \"label\": a string, the correct label of the input text.\\n - \"misleading_label\": a string, an incorrect label that is related to the task.\\n\\nPlease adhere to the following guidelines:\\n - The \"input_text\" should be diverse in expression.\\n - The \"misleading_label\" must be a valid label for the given task, but not as appropriate as the \"label\" for the \"input_text\".\\n - The values for all fields should be in English.\\n - Avoid including the values of the \"label\" and \"misleading_label\" fields in the \"input_text\", that would make the task too easy.\\n - The \"input_text\" is clear and requires college level education to comprehend.\\n\\nYour output must always be a JSON object only, do not explain yourself or output anything else. Be creative!'}]\n\n

For our use case, we only need to generate data for two tasks: a topic classification task and a fact versus opinion classification task. Therefore, we will define the tasks accordingly. As we will be using a smaller model for generation, we will select 2 random labels for each topic classification task and change the order for the fact versus opinion classification task, ensuring more diversity in the generated data.

task_templates = [\n    \"Determine the news article as {}\",\n    \"Classify news article as {}\",\n    \"Identify the news article as {}\",\n    \"Categorize the news article as {}\",\n    \"Label the news article using {}\",\n    \"Annotate the news article based on {}\",\n    \"Determine the theme of a news article from {}\",\n    \"Recognize the topic of the news article as {}\",\n]\n\nclassification_tasks = [\n    {\"task\": action.format(\" or \".join(random.sample(labels_topic, 2)))}\n    for action in task_templates for _ in range(4)\n] + [\n    {\"task\": action.format(\" or \".join(random.sample(labels_fact_opinion, 2)))}\n    for action in task_templates\n]\n

Now, it's time to define and run the pipeline. As mentioned, we will load the written tasks and feed them into the GenerateTextClassificationData task. For our use case, we will be using Meta-Llama-3.1-8B-Instruct via the InferenceEndpointsLLM, with different degrees of difficulty and clarity.

difficulties = [\"college\", \"high school\", \"PhD\"]\nclarity = [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n\nwith Pipeline(\"texcat-generation-pipeline\") as pipeline:\n\n    tasks_generator = LoadDataFromDicts(data=classification_tasks)\n\n    generate_data = []\n    for difficulty in difficulties:\n        for clarity_level in clarity:\n            task = GenerateTextClassificationData(\n                language=\"English\",\n                difficulty=difficulty,\n                clarity=clarity_level,\n                num_generations=2,\n                llm=InferenceEndpointsLLM(\n                    model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n                    tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n                    generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n                ),\n                input_batch_size=5,\n            )\n            generate_data.append(task)\n\n    for task in generate_data:\n        tasks_generator.connect(task)\n

Let's now run the pipeline and generate the synthetic data.

distiset = pipeline.run()\n
distiset[\"generate_text_classification_data_0\"][\"train\"][0]\n
\n{'task': 'Determine the news article as Business or World',\n 'input_text': \"The recent decision by the European Central Bank to raise interest rates will likely have a significant impact on the eurozone's economic growth, with some analysts predicting a 0.5% contraction in GDP due to the increased borrowing costs. The move is seen as a measure to combat inflation, which has been rising steadily over the past year.\",\n 'label': 'Business',\n 'misleading_label': 'World',\n 'distilabel_metadata': {'raw_output_generate_text_classification_data_0': '{\\n  \"input_text\": \"The recent decision by the European Central Bank to raise interest rates will likely have a significant impact on the eurozone\\'s economic growth, with some analysts predicting a 0.5% contraction in GDP due to the increased borrowing costs. The move is seen as a measure to combat inflation, which has been rising steadily over the past year.\",\\n  \"label\": \"Business\",\\n  \"misleading_label\": \"World\"\\n}'},\n 'model_name': 'meta-llama/Meta-Llama-3.1-8B-Instruct'}\n

You can push the dataset to the Hub for sharing with the community and embed it to explore the data.

distiset.push_to_hub(\"[your-owner-name]/example-texcat-generation-dataset\")\n

By examining the distiset distribution, we can confirm that it includes at least the 8 required samples for each label to train our classification models with SetFit.

all_labels = [\n    entry[\"label\"]\n    for dataset_name in distiset\n    for entry in distiset[dataset_name][\"train\"]\n]\n\nCounter(all_labels)\n
\nCounter({'Sci/Tech': 275,\n         'Business': 130,\n         'World': 86,\n         'Fact-based': 86,\n         'Sports': 64,\n         'Opinion-based': 54,\n         None: 20,\n         'Opinion Based': 1,\n         'News/Opinion': 1,\n         'Science': 1,\n         'Environment': 1,\n         'Opinion': 1})\n

We will create two datasets with the required labels and data for our use cases.

def extract_rows(distiset, labels):\n    return [\n        {\n            \"text\": entry[\"input_text\"],\n            \"label\": entry[\"label\"],\n            \"id\": i\n        }\n        for dataset_name in distiset\n        for i, entry in enumerate(distiset[dataset_name][\"train\"])\n        if entry[\"label\"] in labels\n    ]\n\ndata_topic = extract_rows(distiset, labels_topic)\ndata_fact_opinion = extract_rows(distiset, labels_fact_opinion)\n

Get started in Argilla

If you are not familiar with Argilla, we recommend taking a look at the Argilla quickstart docs. Alternatively, you can use your Hugging Face account to login to the Argilla demo Space.

To get the most out of our data, we will use Argilla. First, we need to connect to the Argilla instance.

import argilla as rg\n\n# Replace api_url with your url if using Docker\n# Replace api_key with your API key under \"My Settings\" in the UI\n# Uncomment the last line and set your HF_TOKEN if your space is private\nclient = rg.Argilla(\n    api_url=\"https://[your-owner-name]-[your_space_name].hf.space\",\n    api_key=\"[your-api-key]\",\n    # headers={\"Authorization\": f\"Bearer {HF_TOKEN}\"}\n)\n

We will create a Dataset for each task, with an input TextField for the text classification text and a LabelQuestion to ensure the generated labels are correct.

def create_texcat_dataset(dataset_name, labels):\n    settings = rg.Settings(\n        fields=[rg.TextField(\"text\")],\n        questions=[\n            rg.LabelQuestion(\n                name=\"label\",\n                title=\"Classify the texts according to the following labels\",\n                labels=labels,\n            ),\n        ],\n    )\n    return rg.Dataset(name=dataset_name, settings=settings).create()\n\n\nrg_dataset_topic = create_texcat_dataset(\"topic-classification\", labels_topic)\nrg_dataset_fact_opinion = create_texcat_dataset(\n    \"fact-opinion-classification\", labels_fact_opinion\n)\n

Now, we can upload the generated data to Argilla and evaluate it. We will use the generated labels as suggestions.

rg_dataset_topic.records.log(data_topic)\nrg_dataset_fact_opinion.records.log(data_fact_opinion)\n

Now, we can start the annotation process. Just open the dataset in the Argilla UI and start annotating the records. If the suggestions are correct, you can just click on Submit. Otherwise, you can select the correct label.

Note

Check this how-to guide to know more about annotating in the UI.

Once you get the annotations, let's continue by retrieving the data from Argilla and formatting it as a dataset with the required data.

rg_dataset_topic = client.datasets(\"topic-classification\")\nrg_dataset_fact_opinion = client.datasets(\"fact-opinion-classification\")\n
status_filter = rg.Query(filter=rg.Filter((\"response.status\", \"==\", \"submitted\")))\n\nsubmitted_topic = rg_dataset_topic.records(status_filter).to_list(flatten=True)\nsubmitted_fact_opinion = rg_dataset_fact_opinion.records(status_filter).to_list(\n    flatten=True\n)\n
def format_submitted(submitted):\n    return [\n        {\n            \"text\": r[\"text\"],\n            \"label\": r[\"label.responses\"][0],\n            \"id\": i,\n        }\n        for i, r in enumerate(submitted)\n    ]\n\ndata_topic = format_submitted(submitted_topic)\ndata_fact_opinion = format_submitted(submitted_fact_opinion)\n

In our case, we will fine-tune using SetFit. However, you can select the framework that best fits your requirements.

The next step will be to format the data to be compatible with SetFit. In the case of the topic classification, we will need to combine the synthetic data with the original data.

hf_topic = hf_dataset.to_list()\nnum = len(data_topic)\n\ndata_topic.extend(\n    [\n        {\n            \"text\": r[\"text\"],\n            \"label\": id2str[r[\"label\"]],\n            \"id\": num + i,\n        }\n        for i, r in enumerate(hf_topic)\n    ]\n)\n

If we check the data distribution now, we can see that we have enough samples for each label to train our models.

labels = [record[\"label\"] for record in data_topic]\nCounter(labels)\n
\nCounter({'Sci/Tech': 275, 'Business': 132, 'World': 98, 'Sports': 70})\n
labels = [record[\"label\"] for record in data_fact_opinion]\nCounter(labels)\n
\nCounter({'Fact-based': 86, 'Opinion-based': 54})\n

Now, let's create our training and validation datasets. The training dataset will gather 8 samples per label. In this case, the validation datasets will contain the remaining samples not included in the training datasets.

def sample_and_split(dataset, label_column, num_samples):\n    train_dataset = sample_dataset(\n        dataset, label_column=label_column, num_samples=num_samples\n    )\n    eval_dataset = dataset.filter(lambda x: x[\"id\"] not in set(train_dataset[\"id\"]))\n    return train_dataset, eval_dataset\n\n\ndataset_topic_full = Dataset.from_list(data_topic)\ndataset_fact_opinion_full = Dataset.from_list(data_fact_opinion)\n\ntrain_dataset_topic, eval_dataset_topic = sample_and_split(\n    dataset_topic_full, \"label\", 8\n)\ntrain_dataset_fact_opinion, eval_dataset_fact_opinion = sample_and_split(\n    dataset_fact_opinion_full, \"label\", 8\n)\n

Let's train our models for each task! We will use TaylorAI/bge-micro-v2, available in the Hugging Face Hub. You can check the MTEB leaderboard to select the best model for your use case.

def train_model(model_name, dataset, eval_dataset):\n    model = SetFitModel.from_pretrained(model_name)\n\n    trainer = Trainer(\n        model=model,\n        train_dataset=dataset,\n    )\n    trainer.train()\n    metrics = trainer.evaluate(eval_dataset)\n    print(metrics)\n\n    return model\n
model_topic = train_model(\n    model_name=\"TaylorAI/bge-micro-v2\",\n    dataset=train_dataset_topic,\n    eval_dataset=eval_dataset_topic,\n)\nmodel_topic.save_pretrained(\"topic_classification_model\")\nmodel_topic = SetFitModel.from_pretrained(\"topic_classification_model\")\n
\n***** Running training *****\n  Num unique pairs = 768\n  Batch size = 16\n  Num epochs = 1\n  Total optimization steps = 48\n\n
\n{'embedding_loss': 0.1873, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.02}\n\n
\n***** Running evaluation *****\n\n
\n{'train_runtime': 4.9767, 'train_samples_per_second': 154.318, 'train_steps_per_second': 9.645, 'epoch': 1.0}\n{'accuracy': 0.8333333333333334}\n\n
model_fact_opinion = train_model(\n    model_name=\"TaylorAI/bge-micro-v2\",\n    dataset=train_dataset_fact_opinion,\n    eval_dataset=eval_dataset_fact_opinion,\n)\nmodel_fact_opinion.save_pretrained(\"fact_opinion_classification_model\")\nmodel_fact_opinion = SetFitModel.from_pretrained(\"fact_opinion_classification_model\")\n
\n***** Running training *****\n  Num unique pairs = 144\n  Batch size = 16\n  Num epochs = 1\n  Total optimization steps = 9\n\n
\n{'embedding_loss': 0.2985, 'learning_rate': 2e-05, 'epoch': 0.11}\n\n
\n***** Running evaluation *****\n\n
\n{'train_runtime': 0.8327, 'train_samples_per_second': 172.931, 'train_steps_per_second': 10.808, 'epoch': 1.0}\n{'accuracy': 0.9090909090909091}\n\n

Voil\u00e0! The models are now trained and ready to be used. You can start making predictions to check the model's performance and add the new label. Optionally, you can continue using distilabel to generate additional data or Argilla to verify the quality of the predictions.

def predict(model, input, labels):\n    model.labels = labels\n    prediction = model.predict([input])\n    return prediction[0]\n
predict(\n    model_topic, \"The new iPhone is expected to be released next month.\", labels_topic\n)\n
\n'Sci/Tech'\n
predict(\n    model_fact_opinion,\n    \"The new iPhone is expected to be released next month.\",\n    labels_fact_opinion,\n)\n
\n'Opinion-based'\n

In this tutorial, we showcased the detailed steps to build a pipeline for generating text classification data using distilabel. You can customize this pipeline for your own use cases and share your datasets with the community through the Hugging Face Hub.

We defined two text classification tasks\u2014a topic classification task and a fact versus opinion classification task\u2014and generated new data using various models via the serverless Hugging Face Inference API. Then, we curated the generated data with Argilla. Finally, we trained the models with SetFit using both the original and synthetic data.

"},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#generate-synthetic-text-classification-data","title":"Generate synthetic text classification data","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#getting-started","title":"Getting started","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#install-the-dependencies","title":"Install the dependencies","text":"

To complete this tutorial, you need to install the distilabel SDK and a few third-party libraries via pip. We will be using the free but rate-limited Hugging Face serverless Inference API for this tutorial, so we need to install this as an extra distilabel dependency. You can install them by running the following command:

"},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#optional-deploy-argilla","title":"(optional) Deploy Argilla","text":"

You can skip this step or replace it with any other data evaluation tool, but the quality of your model will suffer from a lack of data quality, so we do recommend looking at your data. If you already deployed Argilla, you can skip this step. Otherwise, you can quickly deploy Argilla following this guide.

Along with that, you will need to install Argilla as a distilabel extra.

"},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#the-dataset","title":"The dataset","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#define-the-text-classification-task","title":"Define the text classification task","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#run-the-pipeline","title":"Run the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#optional-evaluate-with-argilla","title":"(Optional) Evaluate with Argilla","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#train-your-models","title":"Train your models","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#formatting-the-data","title":"Formatting the data","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#the-actual-training","title":"The actual training","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#conclusions","title":"Conclusions","text":""},{"location":"components-gallery/","title":"Components Gallery","text":"
  • Steps

    Explore all the available Steps that can be used for data manipulation.

    Steps

  • Tasks

    Explore all the available Tasks that can be used with an LLM to perform data generation, annotation, and more.

    Tasks

  • LLMs

    Explore all the available LLMs integrated with distilabel.

    LLMs

  • Embeddings

    Explore all the available Embeddings models integrated with distilabel.

    Embeddings

"},{"location":"components-gallery/steps/","title":"Steps Gallery","text":"Category Overview

The gallery page showcases the different types of components within distilabel.

Icon Category Description text-generation Text generation steps are used to generate text based on a given prompt. chat-generation Chat generation steps are used to generate text based on a conversation. text-classification Text classification steps are used to classify text into a category. text-manipulation Text manipulation steps are used to manipulate or rewrite an input text. evol Evol steps are used to rewrite input text and evolve it to a higher quality. critique Critique steps are used to provide feedback on the quality of the data with a written explanation. scorer Scorer steps are used to evaluate and score the data with a numerical value. preference Preference steps are used to collect preferences on the data with numerical values or ranks. embedding Embedding steps are used to generate embeddings for the data. clustering Clustering steps are used to group similar data points together. columns Columns steps are used to manipulate columns in the data. filtering Filtering steps are used to filter the data based on some criteria. format Format steps are used to format the data. load Load steps are used to load the data. execution Executes python functions. save Save steps are used to save the data.
  • PreferenceToArgilla

    Creates a preference dataset in Argilla.

    PreferenceToArgilla

  • TextGenerationToArgilla

    Creates a text generation dataset in Argilla.

    TextGenerationToArgilla

  • CombineColumns

    CombineColumns is deprecated and will be removed in version 1.5.0, use GroupColumns instead.

    CombineColumns

  • PushToHub

    Push data to a Hugging Face Hub dataset.

    PushToHub

  • LoadDataFromDicts

    Loads a dataset from a list of dictionaries.

    LoadDataFromDicts

  • DataSampler

    Step to sample from a dataset.

    DataSampler

  • LoadDataFromHub

    Loads a dataset from the Hugging Face Hub.

    LoadDataFromHub

  • LoadDataFromFileSystem

    Loads a dataset from a file in your filesystem.

    LoadDataFromFileSystem

  • LoadDataFromDisk

    Load a dataset that was previously saved to disk.

    LoadDataFromDisk

  • PrepareExamples

    Helper step to create examples from query and answers pairs used as Few Shots in APIGen.

    PrepareExamples

  • ConversationTemplate

    Generate a conversation template from an instruction and a response.

    ConversationTemplate

  • FormatTextGenerationDPO

    Format the output of your LLMs for Direct Preference Optimization (DPO).

    FormatTextGenerationDPO

  • FormatChatGenerationDPO

    Format the output of a combination of a ChatGeneration + a preference task for Direct Preference Optimization (DPO).

    FormatChatGenerationDPO

  • FormatTextGenerationSFT

    Format the output of a TextGeneration task for Supervised Fine-Tuning (SFT).

    FormatTextGenerationSFT

  • FormatChatGenerationSFT

    Format the output of a ChatGeneration task for Supervised Fine-Tuning (SFT).

    FormatChatGenerationSFT

  • DeitaFiltering

    Filter dataset rows using DEITA filtering strategy.

    DeitaFiltering

  • EmbeddingDedup

    Deduplicates text using embeddings.

    EmbeddingDedup

  • APIGenExecutionChecker

    Executes the generated function calls.

    APIGenExecutionChecker

  • MinHashDedup

    Deduplicates text using MinHash and MinHashLSH.

    MinHashDedup

  • CombineOutputs

    Combine the outputs of several upstream steps.

    CombineOutputs

  • ExpandColumns

    Expand columns that contain lists into multiple rows.

    ExpandColumns

  • GroupColumns

    Combines columns from a list of StepInput.

    GroupColumns

  • KeepColumns

    Keeps selected columns in the dataset.

    KeepColumns

  • MergeColumns

    Merge columns from a row.

    MergeColumns

  • DBSCAN

    DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core samples in regions of high density and expands clusters from them.

    DBSCAN

  • UMAP

    UMAP is a general purpose manifold learning and dimension reduction algorithm.

    UMAP

  • FaissNearestNeighbour

    Create a faiss index to get the nearest neighbours.

    FaissNearestNeighbour

  • EmbeddingGeneration

    Generate embeddings using an Embeddings model.

    EmbeddingGeneration

  • RewardModelScore

    Assign a score to a response using a Reward Model.

    RewardModelScore

  • TruncateTextColumn

    Truncate a row using a tokenizer or the number of characters.

    TruncateTextColumn

"},{"location":"components-gallery/steps/preferencetoargilla/","title":"PreferenceToArgilla","text":"

Creates a preference dataset in Argilla.

Step that creates a dataset in Argilla during the load phase, and then pushes the input batches into it as records. This dataset is a preference dataset, where there's one field for the instruction and one extra field per each generation within the same record, and then a rating question per each of the generation fields. The rating question asks the annotator to set a rating from 1 to 5 for each of the provided generations.

"},{"location":"components-gallery/steps/preferencetoargilla/#note","title":"Note","text":"

This step is meant to be used in conjunction with the UltraFeedback step, or any other step generating both ratings and responses for a given instruction and its set of generations. Alternatively, it can also be used with any other task or step generating only the instruction and generations, as the ratings and rationales are optional.

"},{"location":"components-gallery/steps/preferencetoargilla/#attributes","title":"Attributes","text":"
  • num_generations: The number of generations to include in the dataset.

  • dataset_name: The name of the dataset in Argilla.

  • dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to None, which means it will be created in the default workspace.

  • api_url: The URL of the Argilla API. Defaults to None, which means it will be read from the ARGILLA_API_URL environment variable.

  • api_key: The API key to authenticate with Argilla. Defaults to None, which means it will be read from the ARGILLA_API_KEY environment variable.

"},{"location":"components-gallery/steps/preferencetoargilla/#runtime-parameters","title":"Runtime Parameters","text":"
  • api_url: The base URL to use for the Argilla API requests.

  • api_key: The API key to authenticate the requests to the Argilla API.

"},{"location":"components-gallery/steps/preferencetoargilla/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[generations]\n            ICOL2[ratings]\n            ICOL3[rationales]\n        end\n    end\n\n    subgraph PreferenceToArgilla\n        StepInput[Input Columns: instruction, generations, ratings, rationales]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    ICOL3 --> StepInput\n
"},{"location":"components-gallery/steps/preferencetoargilla/#inputs","title":"Inputs","text":"
  • instruction (str): The instruction that was used to generate the completion.

  • generations (List[str]): The completions that were generated based on the input instruction.

  • ratings (List[str], optional): The ratings for the generations. If not provided, the generated ratings won't be pushed to Argilla.

  • rationales (List[str], optional): The rationales for the ratings. If not provided, the generated rationales won't be pushed to Argilla.

"},{"location":"components-gallery/steps/preferencetoargilla/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/preferencetoargilla/#push-a-preference-dataset-to-an-argilla-instance","title":"Push a preference dataset to an Argilla instance","text":"
from distilabel.steps import PreferenceToArgilla\n\nto_argilla = PreferenceToArgilla(\n    num_generations=2,\n    api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n    api_key=\"api.key\",\n    dataset_name=\"argilla_dataset\",\n    dataset_workspace=\"my_workspace\",\n)\nto_argilla.load()\n\nresult = next(\n    to_argilla.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"generations\": [\"first_generation\", \"second_generation\"],\n            }\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'instruction', 'generations': ['first_generation', 'second_generation']}]\n
"},{"location":"components-gallery/steps/preferencetoargilla/#it-can-also-include-ratings-and-rationales","title":"It can also include ratings and rationales","text":"
result = next(\n    to_argilla.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"generations\": [\"first_generation\", \"second_generation\"],\n                \"ratings\": [\"4\", \"5\"],\n                \"rationales\": [\"rationale for 4\", \"rationale for 5\"],\n            }\n        ],\n    )\n)\n# >>> result\n# [\n#     {\n#         'instruction': 'instruction',\n#         'generations': ['first_generation', 'second_generation'],\n#         'ratings': ['4', '5'],\n#         'rationales': ['rationale for 4', 'rationale for 5']\n#     }\n# ]\n
"},{"location":"components-gallery/steps/textgenerationtoargilla/","title":"TextGenerationToArgilla","text":"

Creates a text generation dataset in Argilla.

Step that creates a dataset in Argilla during the load phase, and then pushes the input batches into it as records. This dataset is a text-generation dataset, where there's one field per each input, and then a label question to rate the quality of the completion in either bad (represented with \ud83d\udc4e) or good (represented with \ud83d\udc4d).

"},{"location":"components-gallery/steps/textgenerationtoargilla/#note","title":"Note","text":"

This step is meant to be used in conjunction with a TextGeneration step and no column mapping is needed, as it will use the default values for the instruction and generation columns.

"},{"location":"components-gallery/steps/textgenerationtoargilla/#attributes","title":"Attributes","text":"
  • dataset_name: The name of the dataset in Argilla.

  • dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to None, which means it will be created in the default workspace.

  • api_url: The URL of the Argilla API. Defaults to None, which means it will be read from the ARGILLA_API_URL environment variable.

  • api_key: The API key to authenticate with Argilla. Defaults to None, which means it will be read from the ARGILLA_API_KEY environment variable.

"},{"location":"components-gallery/steps/textgenerationtoargilla/#runtime-parameters","title":"Runtime Parameters","text":"
  • api_url: The base URL to use for the Argilla API requests.

  • api_key: The API key to authenticate the requests to the Argilla API.

"},{"location":"components-gallery/steps/textgenerationtoargilla/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[generation]\n        end\n    end\n\n    subgraph TextGenerationToArgilla\n        StepInput[Input Columns: instruction, generation]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n
"},{"location":"components-gallery/steps/textgenerationtoargilla/#inputs","title":"Inputs","text":"
  • instruction (str): The instruction that was used to generate the completion.

  • generation (str or List[str]): The completions that were generated based on the input instruction.

"},{"location":"components-gallery/steps/textgenerationtoargilla/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/textgenerationtoargilla/#push-a-text-generation-dataset-to-an-argilla-instance","title":"Push a text generation dataset to an Argilla instance","text":"
from distilabel.steps import TextGenerationToArgilla\n\nto_argilla = TextGenerationToArgilla(\n    num_generations=2,\n    api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n    api_key=\"api.key\",\n    dataset_name=\"argilla_dataset\",\n    dataset_workspace=\"my_workspace\",\n)\nto_argilla.load()\n\nresult = next(\n    to_argilla.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"generation\": \"generation\",\n            }\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'instruction', 'generation': 'generation'}]\n
"},{"location":"components-gallery/steps/combinecolumns/","title":"CombineColumns","text":"

CombineColumns is deprecated and will be removed in version 1.5.0, use GroupColumns instead.

"},{"location":"components-gallery/steps/combinecolumns/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n    end\n\n    subgraph CombineColumns\n    end\n\n
"},{"location":"components-gallery/steps/pushtohub/","title":"PushToHub","text":"

Push data to a Hugging Face Hub dataset.

A GlobalStep which creates a datasets.Dataset with the input data and pushes it to the Hugging Face Hub.

"},{"location":"components-gallery/steps/pushtohub/#attributes","title":"Attributes","text":"
  • repo_id: The Hugging Face Hub repository ID where the dataset will be uploaded.

  • split: The split of the dataset that will be pushed. Defaults to \"train\".

  • private: Whether the dataset to be pushed should be private or not. Defaults to False.

  • token: The token that will be used to authenticate in the Hub. If not provided, the step will try to obtain the token from the environment variable HF_TOKEN. If it is not provided using one of the previous methods, then the huggingface_hub library will try to use the token from the local Hugging Face CLI configuration. Defaults to None.

"},{"location":"components-gallery/steps/pushtohub/#runtime-parameters","title":"Runtime Parameters","text":"
  • repo_id: The Hugging Face Hub repository ID where the dataset will be uploaded.

  • split: The split of the dataset that will be pushed.

  • private: Whether the dataset to be pushed should be private or not.

  • token: The token that will be used to authenticate in the Hub.

"},{"location":"components-gallery/steps/pushtohub/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[dynamic]\n        end\n    end\n\n    subgraph PushToHub\n        StepInput[Input Columns: dynamic]\n    end\n\n    ICOL0 --> StepInput\n
"},{"location":"components-gallery/steps/pushtohub/#inputs","title":"Inputs","text":"
  • dynamic (all): all columns from the input will be used to create the dataset.
"},{"location":"components-gallery/steps/pushtohub/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/pushtohub/#push-batches-of-your-dataset-to-the-hugging-face-hub-repository","title":"Push batches of your dataset to the Hugging Face Hub repository","text":"
from distilabel.steps import PushToHub\n\npush = PushToHub(repo_id=\"path_to/repo\")\npush.load()\n\nresult = next(\n    push.process(\n        [\n            {\n                \"instruction\": \"instruction \",\n                \"generation\": \"generation\"\n            }\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'instruction ', 'generation': 'generation'}]\n
"},{"location":"components-gallery/steps/loaddatafromdicts/","title":"LoadDataFromDicts","text":"

Loads a dataset from a list of dictionaries.

GeneratorStep that loads a dataset from a list of dictionaries and yields it in batches.

"},{"location":"components-gallery/steps/loaddatafromdicts/#attributes","title":"Attributes","text":"
  • data: The list of dictionaries to load the data from.
"},{"location":"components-gallery/steps/loaddatafromdicts/#runtime-parameters","title":"Runtime Parameters","text":"
  • batch_size: The batch size to use when processing the data.
"},{"location":"components-gallery/steps/loaddatafromdicts/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph LoadDataFromDicts\n        StepOutput[Output Columns: dynamic]\n    end\n\n    StepOutput --> OCOL0\n
"},{"location":"components-gallery/steps/loaddatafromdicts/#outputs","title":"Outputs","text":"
  • dynamic (based on the keys found on the first dictionary of the list): The columns of the dataset.
"},{"location":"components-gallery/steps/loaddatafromdicts/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/loaddatafromdicts/#load-data-from-a-list-of-dictionaries","title":"Load data from a list of dictionaries","text":"
from distilabel.steps import LoadDataFromDicts\n\nloader = LoadDataFromDicts(\n    data=[{\"instruction\": \"What are 2+2?\"}] * 5,\n    batch_size=2\n)\nloader.load()\n\nresult = next(loader.process())\n# >>> result\n# ([{'instruction': 'What are 2+2?'}, {'instruction': 'What are 2+2?'}], False)\n
"},{"location":"components-gallery/steps/datasampler/","title":"DataSampler","text":"

Step to sample from a dataset.

GeneratorStep that samples from a dataset and yields it in batches. This step is useful when you have a pipeline that can benefit from using examples in the prompts, for example as few-shot examples that change on each row. For example, you can pass a list of dictionaries with N examples and generate M samples from it (assuming you have another step loading data, M should match the number of rows loaded in that step). The size argument S is the number of samples generated per row, so each generated row contains S items to be used as examples.

"},{"location":"components-gallery/steps/datasampler/#attributes","title":"Attributes","text":"
  • data: The list of dictionaries to sample from.

  • size: Number of samples per example. For example in a few-shot learning scenario, the number of few-shot examples that will be generated per example. Defaults to 2.

  • samples: Number of examples that will be generated by the step in total. If used with another loader step, this should be the same as the number of samples in the loader step. Defaults to 100.

"},{"location":"components-gallery/steps/datasampler/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph DataSampler\n        StepOutput[Output Columns: dynamic]\n    end\n\n    StepOutput --> OCOL0\n
"},{"location":"components-gallery/steps/datasampler/#outputs","title":"Outputs","text":"
  • dynamic (based on the keys found on the first dictionary of the list): The columns of the dataset.
"},{"location":"components-gallery/steps/datasampler/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/datasampler/#sample-data-from-a-list-of-dictionaries","title":"Sample data from a list of dictionaries","text":"
from distilabel.steps import DataSampler\n\nsampler = DataSampler(\n    data=[{\"sample\": f\"sample {i}\"} for i in range(30)],\n    samples=10,\n    size=2,\n    batch_size=4\n)\nsampler.load()\n\nresult = next(sampler.process())\n# >>> result\n# ([{'sample': ['sample 7', 'sample 0']}, {'sample': ['sample 2', 'sample 21']}, {'sample': ['sample 17', 'sample 12']}, {'sample': ['sample 2', 'sample 14']}], False)\n
"},{"location":"components-gallery/steps/datasampler/#pipeline-with-a-loader-and-a-sampler-combined-in-a-single-stream","title":"Pipeline with a loader and a sampler combined in a single stream","text":"
from datasets import load_dataset\n\nfrom distilabel.steps import LoadDataFromDicts, DataSampler\nfrom distilabel.steps.tasks.apigen.utils import PrepareExamples\nfrom distilabel.pipeline import Pipeline\n\nds = (\n    load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n    .shuffle(seed=42)\n    .select(range(500))\n    .to_list()\n)\ndata = [\n    {\n        \"func_name\": \"final_velocity\",\n        \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n    },\n    {\n        \"func_name\": \"permutation_count\",\n        \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n    },\n    {\n        \"func_name\": \"getdivision\",\n        \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n    },\n]\nwith Pipeline(name=\"APIGenPipeline\") as pipeline:\n    loader_seeds = LoadDataFromDicts(data=data)\n    sampler = DataSampler(\n        data=ds,\n        size=2,\n        samples=len(data),\n        batch_size=8,\n    )\n    prep_examples = PrepareExamples()\n\n    sampler >> prep_examples\n    (\n        [loader_seeds, prep_examples]\n        >> combine_steps\n    )\n# Now we have a single stream of data with the loader and the sampler data\n
"},{"location":"components-gallery/steps/loaddatafromhub/","title":"LoadDataFromHub","text":"

Loads a dataset from the Hugging Face Hub.

GeneratorStep that loads a dataset from the Hugging Face Hub using the datasets library.

"},{"location":"components-gallery/steps/loaddatafromhub/#attributes","title":"Attributes","text":"
  • repo_id: The Hugging Face Hub repository ID of the dataset to load.

  • split: The split of the dataset to load.

  • config: The configuration of the dataset to load. This is optional and only needed if the dataset has multiple configurations.

"},{"location":"components-gallery/steps/loaddatafromhub/#runtime-parameters","title":"Runtime Parameters","text":"
  • batch_size: The batch size to use when processing the data.

  • repo_id: The Hugging Face Hub repository ID of the dataset to load.

  • split: The split of the dataset to load. Defaults to 'train'.

  • config: The configuration of the dataset to load. This is optional and only needed if the dataset has multiple configurations.

  • revision: The revision of the dataset to load. Defaults to the latest revision.

  • streaming: Whether to load the dataset in streaming mode or not. Defaults to False.

  • num_examples: The number of examples to load from the dataset. By default will load all examples.

  • storage_options: Key/value pairs to be passed on to the file-system backend, if any. Defaults to None.

"},{"location":"components-gallery/steps/loaddatafromhub/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph LoadDataFromHub\n        StepOutput[Output Columns: dynamic]\n    end\n\n    StepOutput --> OCOL0\n
"},{"location":"components-gallery/steps/loaddatafromhub/#outputs","title":"Outputs","text":"
  • dynamic (all): The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub.
"},{"location":"components-gallery/steps/loaddatafromhub/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/loaddatafromhub/#load-data-from-a-dataset-in-hugging-face-hub","title":"Load data from a dataset in Hugging Face Hub","text":"
from distilabel.steps import LoadDataFromHub\n\nloader = LoadDataFromHub(\n    repo_id=\"distilabel-internal-testing/instruction-dataset-mini\",\n    split=\"test\",\n    batch_size=2\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'prompt': 'Arianna has 12...', False)\n
"},{"location":"components-gallery/steps/loaddatafromfilesystem/","title":"LoadDataFromFileSystem","text":"

Loads a dataset from a file in your filesystem.

GeneratorStep that creates a dataset from a file in the filesystem, using the Hugging Face datasets library. Take a look at Hugging Face Datasets for more information about the supported file types.

"},{"location":"components-gallery/steps/loaddatafromfilesystem/#attributes","title":"Attributes","text":"
  • data_files: The path to the file, or directory containing the files that make up the dataset.

  • split: The split of the dataset to load (typically will be train, test or validation).

"},{"location":"components-gallery/steps/loaddatafromfilesystem/#runtime-parameters","title":"Runtime Parameters","text":"
  • batch_size: The batch size to use when processing the data.

  • data_files: The path to the file, or directory containing the files that make up the dataset.

  • split: The split of the dataset to load. Defaults to 'train'.

  • streaming: Whether to load the dataset in streaming mode or not. Defaults to False.

  • num_examples: The number of examples to load from the dataset. By default will load all examples.

  • storage_options: Key/value pairs to be passed on to the file-system backend, if any. Defaults to None.

  • filetype: The expected filetype. If not provided, it will be inferred from the file extension. For more than one file, it will be inferred from the first file.

"},{"location":"components-gallery/steps/loaddatafromfilesystem/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph LoadDataFromFileSystem\n        StepOutput[Output Columns: dynamic]\n    end\n\n    StepOutput --> OCOL0\n
"},{"location":"components-gallery/steps/loaddatafromfilesystem/#outputs","title":"Outputs","text":"
  • dynamic (all): The columns that will be generated by this step, based on the dataset loaded from the filesystem.
"},{"location":"components-gallery/steps/loaddatafromfilesystem/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/loaddatafromfilesystem/#load-data-from-a-hugging-face-dataset-in-your-file-system","title":"Load data from a Hugging Face dataset in your file system","text":"
from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(data_files=\"path/to/dataset.jsonl\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n
"},{"location":"components-gallery/steps/loaddatafromfilesystem/#specify-a-filetype-if-the-file-extension-is-not-expected","title":"Specify a filetype if the file extension is not expected","text":"
from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(filetype=\"csv\", data_files=\"path/to/dataset.txtr\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n
"},{"location":"components-gallery/steps/loaddatafromfilesystem/#load-data-from-a-file-in-your-cloud-provider","title":"Load data from a file in your cloud provider","text":"
from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(\n    data_files=\"gcs://path/to/dataset\",\n    storage_options={\"project\": \"experiments-0001\"}\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n
"},{"location":"components-gallery/steps/loaddatafromfilesystem/#load-data-passing-a-glob-pattern","title":"Load data passing a glob pattern","text":"
from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(\n    data_files=\"path/to/dataset/*.jsonl\",\n    streaming=True\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n
"},{"location":"components-gallery/steps/loaddatafromdisk/","title":"LoadDataFromDisk","text":"

Load a dataset that was previously saved to disk.

If you previously saved your dataset using the save_to_disk method, or Distiset.save_to_disk, you can load it again to build a new pipeline using this class.

"},{"location":"components-gallery/steps/loaddatafromdisk/#attributes","title":"Attributes","text":"
  • dataset_path: The path to the dataset or distiset.

  • split: The split of the dataset to load (typically will be train, test or validation).

  • config: The configuration of the dataset to load. Defaults to default; if there are multiple configurations in the dataset, this must be supplied or an error is raised.

"},{"location":"components-gallery/steps/loaddatafromdisk/#runtime-parameters","title":"Runtime Parameters","text":"
  • batch_size: The batch size to use when processing the data.

  • dataset_path: The path to the dataset or distiset.

  • is_distiset: Whether the dataset to load is a Distiset or not. Defaults to False.

  • split: The split of the dataset to load. Defaults to 'train'.

  • config: The configuration of the dataset to load. Defaults to default; if there are multiple configurations in the dataset, this must be supplied or an error is raised.

  • num_examples: The number of examples to load from the dataset. By default will load all examples.

  • storage_options: Key/value pairs to be passed on to the file-system backend, if any. Defaults to None.

"},{"location":"components-gallery/steps/loaddatafromdisk/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph LoadDataFromDisk\n        StepOutput[Output Columns: dynamic]\n    end\n\n    StepOutput --> OCOL0\n
"},{"location":"components-gallery/steps/loaddatafromdisk/#outputs","title":"Outputs","text":"
  • dynamic (all): The columns that will be generated by this step, based on the dataset loaded from disk.
"},{"location":"components-gallery/steps/loaddatafromdisk/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/loaddatafromdisk/#load-data-from-a-hugging-face-dataset","title":"Load data from a Hugging Face Dataset","text":"
from distilabel.steps import LoadDataFromDisk\n\nloader = LoadDataFromDisk(dataset_path=\"path/to/dataset\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n
"},{"location":"components-gallery/steps/loaddatafromdisk/#load-data-from-a-distilabel-distiset","title":"Load data from a distilabel Distiset","text":"
from distilabel.steps import LoadDataFromDisk\n\n# Specify the configuration to load.\nloader = LoadDataFromDisk(\n    dataset_path=\"path/to/dataset\",\n    is_distiset=True,\n    config=\"leaf_step_1\"\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'a': 1}, {'a': 2}, {'a': 3}], True)\n
"},{"location":"components-gallery/steps/loaddatafromdisk/#load-data-from-a-hugging-face-dataset-or-distiset-in-your-cloud-provider","title":"Load data from a Hugging Face Dataset or Distiset in your cloud provider","text":"
from distilabel.steps import LoadDataFromDisk\n\nloader = LoadDataFromDisk(\n    dataset_path=\"gcs://path/to/dataset\",\n    storage_options={\"project\": \"experiments-0001\"}\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n
"},{"location":"components-gallery/steps/prepareexamples/","title":"PrepareExamples","text":"

Helper step to create examples from query and answers pairs used as Few Shots in APIGen.

"},{"location":"components-gallery/steps/prepareexamples/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[query]\n            ICOL1[answers]\n        end\n        subgraph New columns\n            OCOL0[examples]\n        end\n    end\n\n    subgraph PrepareExamples\n        StepInput[Input Columns: query, answers]\n        StepOutput[Output Columns: examples]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/prepareexamples/#inputs","title":"Inputs","text":"
  • query (str): The query to generate examples from.

  • answers (str): The answers to the query.

"},{"location":"components-gallery/steps/prepareexamples/#outputs","title":"Outputs","text":"
  • examples (str): The formatted examples.
"},{"location":"components-gallery/steps/prepareexamples/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/prepareexamples/#generate-examples-for-apigen","title":"Generate examples for APIGen","text":"
from distilabel.steps.tasks.apigen.utils import PrepareExamples\n\nprepare_examples = PrepareExamples()\nresult = next(prepare_examples.process(\n    [\n        {\n            \"query\": ['I need the area of circles with radius 2.5, 5, and 7.5 inches, please.', 'Can you provide the current locations of buses and trolleys on route 12?'],\n            \"answers\": ['[{\"name\": \"circle_area\", \"arguments\": {\"radius\": 2.5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 7.5}}]', '[{\"name\": \"bus_trolley_locations\", \"arguments\": {\"route\": \"12\"}}]']\n        }\n    ]\n)\n# result\n# [{'examples': '## Query:\\nI need the area of circles with radius 2.5, 5, and 7.5 inches, please.\\n## Answers:\\n[{\"name\": \"circle_area\", \"arguments\": {\"radius\": 2.5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 7.5}}]\\n\\n## Query:\\nCan you provide the current locations of buses and trolleys on route 12?\\n## Answers:\\n[{\"name\": \"bus_trolley_locations\", \"arguments\": {\"route\": \"12\"}}]'}, {'examples': '## Query:\\nI need the area of circles with radius 2.5, 5, and 7.5 inches, please.\\n## Answers:\\n[{\"name\": \"circle_area\", \"arguments\": {\"radius\": 2.5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 7.5}}]\\n\\n## Query:\\nCan you provide the current locations of buses and trolleys on route 12?\\n## Answers:\\n[{\"name\": \"bus_trolley_locations\", \"arguments\": {\"route\": \"12\"}}]'}]\n
"},{"location":"components-gallery/steps/conversationtemplate/","title":"ConversationTemplate","text":"

Generate a conversation template from an instruction and a response.

"},{"location":"components-gallery/steps/conversationtemplate/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[response]\n        end\n        subgraph New columns\n            OCOL0[conversation]\n        end\n    end\n\n    subgraph ConversationTemplate\n        StepInput[Input Columns: instruction, response]\n        StepOutput[Output Columns: conversation]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/conversationtemplate/#inputs","title":"Inputs","text":"
  • instruction (str): The instruction to be used in the conversation.

  • response (str): The response to be used in the conversation.

"},{"location":"components-gallery/steps/conversationtemplate/#outputs","title":"Outputs","text":"
  • conversation (ChatType): The conversation template.
"},{"location":"components-gallery/steps/conversationtemplate/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/conversationtemplate/#create-a-conversation-from-an-instruction-and-a-response","title":"Create a conversation from an instruction and a response","text":"
from distilabel.steps import ConversationTemplate\n\nconv_template = ConversationTemplate()\nconv_template.load()\n\nresult = next(\n    conv_template.process(\n        [\n            {\n                \"instruction\": \"Hello\",\n                \"response\": \"Hi\",\n            }\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'Hello', 'response': 'Hi', 'conversation': [{'role': 'user', 'content': 'Hello'}, {'role': 'assistant', 'content': 'Hi'}]}]\n
"},{"location":"components-gallery/steps/formattextgenerationdpo/","title":"FormatTextGenerationDPO","text":"

Format the output of your LLMs for Direct Preference Optimization (DPO).

FormatTextGenerationDPO is a Step that formats the output of the combination of a TextGeneration task with a preference Task i.e. a task generating ratings, so that those are used to rank the existing generations and provide the chosen and rejected generations based on the ratings. Use this step to transform the output of a combination of a TextGeneration + a preference task such as UltraFeedback following the standard formatting from frameworks such as axolotl or alignment-handbook.

"},{"location":"components-gallery/steps/formattextgenerationdpo/#note","title":"Note","text":"

The generations column should contain at least two generations, the ratings column should contain the same number of ratings as generations.

"},{"location":"components-gallery/steps/formattextgenerationdpo/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[system_prompt]\n            ICOL1[instruction]\n            ICOL2[generations]\n            ICOL3[generation_models]\n            ICOL4[ratings]\n        end\n        subgraph New columns\n            OCOL0[prompt]\n            OCOL1[prompt_id]\n            OCOL2[chosen]\n            OCOL3[chosen_model]\n            OCOL4[chosen_rating]\n            OCOL5[rejected]\n            OCOL6[rejected_model]\n            OCOL7[rejected_rating]\n        end\n    end\n\n    subgraph FormatTextGenerationDPO\n        StepInput[Input Columns: system_prompt, instruction, generations, generation_models, ratings]\n        StepOutput[Output Columns: prompt, prompt_id, chosen, chosen_model, chosen_rating, rejected, rejected_model, rejected_rating]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    ICOL3 --> StepInput\n    ICOL4 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n    StepOutput --> OCOL4\n    StepOutput --> OCOL5\n    StepOutput --> OCOL6\n    StepOutput --> OCOL7\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/formattextgenerationdpo/#inputs","title":"Inputs","text":"
  • system_prompt (str, optional): The system prompt used within the LLM to generate the generations, if available.

  • instruction (str): The instruction used to generate the generations with the LLM.

  • generations (List[str]): The generations produced by the LLM.

  • generation_models (List[str], optional): The model names used to generate the generations, only available if the model_name from the TextGeneration task/s is combined into a single column named this way, otherwise, it will be ignored.

  • ratings (List[float]): The ratings for each of the generations, produced by a preference task such as UltraFeedback.

"},{"location":"components-gallery/steps/formattextgenerationdpo/#outputs","title":"Outputs","text":"
  • prompt (str): The instruction used to generate the generations with the LLM.

  • prompt_id (str): The SHA256 hash of the prompt.

  • chosen (List[Dict[str, str]]): The chosen generation based on the ratings.

  • chosen_model (str, optional): The model name used to generate the chosen generation, if the generation_models are available.

  • chosen_rating (float): The rating of the chosen generation.

  • rejected (List[Dict[str, str]]): The rejected generation based on the ratings.

  • rejected_model (str, optional): The model name used to generate the rejected generation, if the generation_models are available.

  • rejected_rating (float): The rating of the rejected generation.

"},{"location":"components-gallery/steps/formattextgenerationdpo/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/formattextgenerationdpo/#format-your-dataset-for-dpo-fine-tuning","title":"Format your dataset for DPO fine tuning","text":"
from distilabel.steps import FormatTextGenerationDPO\n\nformat_dpo = FormatTextGenerationDPO()\nformat_dpo.load()\n\n# NOTE: Both \"system_prompt\" and \"generation_models\" can be added optionally.\nresult = next(\n    format_dpo.process(\n        [\n            {\n                \"instruction\": \"What's 2+2?\",\n                \"generations\": [\"4\", \"5\", \"6\"],\n                \"ratings\": [1, 0, -1],\n            }\n        ]\n    )\n)\n# >>> result\n# [\n#    {   'instruction': \"What's 2+2?\",\n#        'generations': ['4', '5', '6'],\n#        'ratings': [1, 0, -1],\n#        'prompt': \"What's 2+2?\",\n#        'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n#        'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n#        'chosen_rating': 1,\n#        'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n#        'rejected_rating': -1\n#    }\n# ]\n
"},{"location":"components-gallery/steps/formatchatgenerationdpo/","title":"FormatChatGenerationDPO","text":"

Format the output of a combination of a ChatGeneration + a preference task for Direct Preference Optimization (DPO).

FormatChatGenerationDPO is a Step that formats the output of the combination of a ChatGeneration task with a preference Task, i.e. a task generating ratings such as UltraFeedback, so that those are used to rank the existing generations and provide the chosen and rejected generations based on the ratings, following the standard formatting from frameworks such as axolotl or alignment-handbook.

"},{"location":"components-gallery/steps/formatchatgenerationdpo/#note","title":"Note","text":"

The messages column should contain at least one message from the user, the generations column should contain at least two generations, the ratings column should contain the same number of ratings as generations.

"},{"location":"components-gallery/steps/formatchatgenerationdpo/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[messages]\n            ICOL1[generations]\n            ICOL2[generation_models]\n            ICOL3[ratings]\n        end\n        subgraph New columns\n            OCOL0[prompt]\n            OCOL1[prompt_id]\n            OCOL2[chosen]\n            OCOL3[chosen_model]\n            OCOL4[chosen_rating]\n            OCOL5[rejected]\n            OCOL6[rejected_model]\n            OCOL7[rejected_rating]\n        end\n    end\n\n    subgraph FormatChatGenerationDPO\n        StepInput[Input Columns: messages, generations, generation_models, ratings]\n        StepOutput[Output Columns: prompt, prompt_id, chosen, chosen_model, chosen_rating, rejected, rejected_model, rejected_rating]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    ICOL3 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n    StepOutput --> OCOL4\n    StepOutput --> OCOL5\n    StepOutput --> OCOL6\n    StepOutput --> OCOL7\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/formatchatgenerationdpo/#inputs","title":"Inputs","text":"
  • messages (List[Dict[str, str]]): The conversation messages.

  • generations (List[str]): The generations produced by the LLM.

  • generation_models (List[str], optional): The model names used to generate the generations, only available if the model_name from the ChatGeneration task/s is combined into a single column named this way, otherwise, it will be ignored.

  • ratings (List[float]): The ratings for each of the generations, produced by a preference task such as UltraFeedback.

"},{"location":"components-gallery/steps/formatchatgenerationdpo/#outputs","title":"Outputs","text":"
  • prompt (str): The user message used to generate the generations with the LLM.

  • prompt_id (str): The SHA256 hash of the prompt.

  • chosen (List[Dict[str, str]]): The chosen generation based on the ratings.

  • chosen_model (str, optional): The model name used to generate the chosen generation, if the generation_models are available.

  • chosen_rating (float): The rating of the chosen generation.

  • rejected (List[Dict[str, str]]): The rejected generation based on the ratings.

  • rejected_model (str, optional): The model name used to generate the rejected generation, if the generation_models are available.

  • rejected_rating (float): The rating of the rejected generation.

"},{"location":"components-gallery/steps/formatchatgenerationdpo/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/formatchatgenerationdpo/#format-your-dataset-for-dpo-fine-tuning","title":"Format your dataset for DPO fine tuning","text":"
from distilabel.steps import FormatChatGenerationDPO\n\nformat_dpo = FormatChatGenerationDPO()\nformat_dpo.load()\n\n# NOTE: \"generation_models\" can be added optionally.\nresult = next(\n    format_dpo.process(\n        [\n            {\n                \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n                \"generations\": [\"4\", \"5\", \"6\"],\n                \"ratings\": [1, 0, -1],\n            }\n        ]\n    )\n)\n# >>> result\n# [\n#     {\n#         'messages': [{'role': 'user', 'content': \"What's 2+2?\"}],\n#         'generations': ['4', '5', '6'],\n#         'ratings': [1, 0, -1],\n#         'prompt': \"What's 2+2?\",\n#         'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n#         'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n#         'chosen_rating': 1,\n#         'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n#         'rejected_rating': -1\n#     }\n# ]\n
"},{"location":"components-gallery/steps/formattextgenerationsft/","title":"FormatTextGenerationSFT","text":"

Format the output of a TextGeneration task for Supervised Fine-Tuning (SFT).

FormatTextGenerationSFT is a Step that formats the output of a TextGeneration task for Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as axolotl or alignment-handbook. The output of the TextGeneration task is formatted into a chat-like conversation with the instruction as the user message and the generation as the assistant message. Optionally, if the system_prompt is available, it is included as the first message in the conversation.

"},{"location":"components-gallery/steps/formattextgenerationsft/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[system_prompt]\n            ICOL1[instruction]\n            ICOL2[generation]\n        end\n        subgraph New columns\n            OCOL0[prompt]\n            OCOL1[prompt_id]\n            OCOL2[messages]\n        end\n    end\n\n    subgraph FormatTextGenerationSFT\n        StepInput[Input Columns: system_prompt, instruction, generation]\n        StepOutput[Output Columns: prompt, prompt_id, messages]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/formattextgenerationsft/#inputs","title":"Inputs","text":"
  • system_prompt (str, optional): The system prompt used within the LLM to generate the generation, if available.

  • instruction (str): The instruction used to generate the generation with the LLM.

  • generation (str): The generation produced by the LLM.

"},{"location":"components-gallery/steps/formattextgenerationsft/#outputs","title":"Outputs","text":"
  • prompt (str): The instruction used to generate the generation with the LLM.

  • prompt_id (str): The SHA256 hash of the prompt.

  • messages (List[Dict[str, str]]): The chat-like conversation with the instruction as the user message and the generation as the assistant message.

"},{"location":"components-gallery/steps/formattextgenerationsft/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/formattextgenerationsft/#format-your-dataset-for-sft-fine-tuning","title":"Format your dataset for SFT fine tuning","text":"
from distilabel.steps import FormatTextGenerationSFT\n\nformat_sft = FormatTextGenerationSFT()\nformat_sft.load()\n\n# NOTE: \"system_prompt\" can be added optionally.\nresult = next(\n    format_sft.process(\n        [\n            {\n                \"instruction\": \"What's 2+2?\",\n                \"generation\": \"4\"\n            }\n        ]\n    )\n)\n# >>> result\n# [\n#     {\n#         'instruction': 'What's 2+2?',\n#         'generation': '4',\n#         'prompt': 'What's 2+2?',\n#         'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n#         'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}]\n#     }\n# ]\n
"},{"location":"components-gallery/steps/formatchatgenerationsft/","title":"FormatChatGenerationSFT","text":"

Format the output of a ChatGeneration task for Supervised Fine-Tuning (SFT).

FormatChatGenerationSFT is a Step that formats the output of a ChatGeneration task for Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as axolotl or alignment-handbook. The output of the ChatGeneration task is formatted into a chat-like conversation with the instruction as the user message and the generation as the assistant message. Optionally, if the system_prompt is available, it is included as the first message in the conversation.

"},{"location":"components-gallery/steps/formatchatgenerationsft/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[system_prompt]\n            ICOL1[instruction]\n            ICOL2[generation]\n        end\n        subgraph New columns\n            OCOL0[prompt]\n            OCOL1[prompt_id]\n            OCOL2[messages]\n        end\n    end\n\n    subgraph FormatChatGenerationSFT\n        StepInput[Input Columns: system_prompt, instruction, generation]\n        StepOutput[Output Columns: prompt, prompt_id, messages]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/formatchatgenerationsft/#inputs","title":"Inputs","text":"
  • system_prompt (str, optional): The system prompt used within the LLM to generate the generation, if available.

  • instruction (str): The instruction used to generate the generation with the LLM.

  • generation (str): The generation produced by the LLM.

"},{"location":"components-gallery/steps/formatchatgenerationsft/#outputs","title":"Outputs","text":"
  • prompt (str): The instruction used to generate the generation with the LLM.

  • prompt_id (str): The SHA256 hash of the prompt.

  • messages (List[Dict[str, str]]): The chat-like conversation with the instruction as the user message and the generation as the assistant message.

"},{"location":"components-gallery/steps/formatchatgenerationsft/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/formatchatgenerationsft/#format-your-dataset-for-sft","title":"Format your dataset for SFT","text":"
from distilabel.steps import FormatChatGenerationSFT\n\nformat_sft = FormatChatGenerationSFT()\nformat_sft.load()\n\n# NOTE: \"system_prompt\" can be added optionally.\nresult = next(\n    format_sft.process(\n        [\n            {\n                \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n                \"generation\": \"4\"\n            }\n        ]\n    )\n)\n# >>> result\n# [\n#     {\n#         'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n#         'generation': '4',\n#         'prompt': 'What's 2+2?',\n#         'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n#     }\n# ]\n
"},{"location":"components-gallery/steps/deitafiltering/","title":"DeitaFiltering","text":"

Filter dataset rows using DEITA filtering strategy.

Filter the dataset based on the DEITA score and the cosine distance between the embeddings. It's an implementation of the filtering step from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.

"},{"location":"components-gallery/steps/deitafiltering/#attributes","title":"Attributes","text":"
  • data_budget: The desired size of the dataset after filtering.

  • diversity_threshold: If a row has a cosine distance with respect to its nearest neighbor greater than this value, it will be included in the filtered dataset. Defaults to 0.9.

  • normalize_embeddings: Whether to normalize the embeddings before computing the cosine distance. Defaults to True.

"},{"location":"components-gallery/steps/deitafiltering/#runtime-parameters","title":"Runtime Parameters","text":"
  • data_budget: The desired size of the dataset after filtering.

  • diversity_threshold: If a row has a cosine distance with respect to its nearest neighbor greater than this value, it will be included in the filtered dataset.

"},{"location":"components-gallery/steps/deitafiltering/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[evol_instruction_score]\n            ICOL1[evol_response_score]\n            ICOL2[embedding]\n        end\n        subgraph New columns\n            OCOL0[deita_score]\n            OCOL1[deita_score_computed_with]\n            OCOL2[nearest_neighbor_distance]\n        end\n    end\n\n    subgraph DeitaFiltering\n        StepInput[Input Columns: evol_instruction_score, evol_response_score, embedding]\n        StepOutput[Output Columns: deita_score, deita_score_computed_with, nearest_neighbor_distance]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/deitafiltering/#inputs","title":"Inputs","text":"
  • evol_instruction_score (float): The score of the instruction generated by ComplexityScorer step.

  • evol_response_score (float): The score of the response generated by QualityScorer step.

  • embedding (List[float]): The embedding generated for the conversation of the instruction-response pair using GenerateEmbeddings step.

"},{"location":"components-gallery/steps/deitafiltering/#outputs","title":"Outputs","text":"
  • deita_score (float): The DEITA score for the instruction-response pair.

  • deita_score_computed_with (List[str]): The scores used to compute the DEITA score.

  • nearest_neighbor_distance (float): The cosine distance between the embeddings of the instruction-response pair.

"},{"location":"components-gallery/steps/deitafiltering/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/deitafiltering/#filter-the-dataset-based-on-the-deita-score-and-the-cosine-distance-between-the-embeddings","title":"Filter the dataset based on the DEITA score and the cosine distance between the embeddings","text":"
from distilabel.steps import DeitaFiltering\n\ndeita_filtering = DeitaFiltering(data_budget=1)\n\ndeita_filtering.load()\n\nresult = next(\n    deita_filtering.process(\n        [\n            {\n                \"evol_instruction_score\": 0.5,\n                \"evol_response_score\": 0.5,\n                \"embedding\": [-8.12729941, -5.24642847, -6.34003029],\n            },\n            {\n                \"evol_instruction_score\": 0.6,\n                \"evol_response_score\": 0.6,\n                \"embedding\": [2.99329242, 0.7800932, 0.7799726],\n            },\n            {\n                \"evol_instruction_score\": 0.7,\n                \"evol_response_score\": 0.7,\n                \"embedding\": [10.29041806, 14.33088073, 13.00557506],\n            },\n        ],\n    )\n)\n# >>> result\n# [{'evol_instruction_score': 0.5, 'evol_response_score': 0.5, 'embedding': [-8.12729941, -5.24642847, -6.34003029], 'deita_score': 0.25, 'deita_score_computed_with': ['evol_instruction_score', 'evol_response_score'], 'nearest_neighbor_distance': 1.9042812683723933}]\n
"},{"location":"components-gallery/steps/deitafiltering/#references","title":"References","text":"
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
"},{"location":"components-gallery/steps/embeddingdedup/","title":"EmbeddingDedup","text":"

Deduplicates text using embeddings.

EmbeddingDedup is a Step that detects near-duplicates in datasets, using embeddings to compare the similarity between the texts. The typical workflow with this step would be to have a dataset with precomputed embeddings, and then (possibly using the FaissNearestNeighbour) use the nn_indices and nn_scores to determine which texts are duplicates.

"},{"location":"components-gallery/steps/embeddingdedup/#attributes","title":"Attributes","text":"
  • threshold: the threshold to consider 2 examples as duplicates. It's dependent on the type of index that was used to generate the embeddings. For example, if the embeddings were generated using cosine similarity, a threshold of 0.9 would make all the texts with a cosine similarity above the value duplicates. Higher values detect fewer duplicates in such an index, but that should be taken into account when building it. Defaults to 0.9. Runtime Parameters: - threshold: the threshold to consider 2 examples as duplicates.
"},{"location":"components-gallery/steps/embeddingdedup/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[nn_indices]\n            ICOL1[nn_scores]\n        end\n        subgraph New columns\n            OCOL0[keep_row_after_embedding_filtering]\n        end\n    end\n\n    subgraph EmbeddingDedup\n        StepInput[Input Columns: nn_indices, nn_scores]\n        StepOutput[Output Columns: keep_row_after_embedding_filtering]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/embeddingdedup/#inputs","title":"Inputs","text":"
  • nn_indices (List[int]): a list containing the indices of the k nearest neighbours in the inputs for the row.

  • nn_scores (List[float]): a list containing the score or distance to each k nearest neighbour in the inputs.

"},{"location":"components-gallery/steps/embeddingdedup/#outputs","title":"Outputs","text":"
  • keep_row_after_embedding_filtering (bool): boolean indicating if the piece of text is not a duplicate, i.e. this text should be kept.
"},{"location":"components-gallery/steps/embeddingdedup/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/embeddingdedup/#deduplicate-a-list-of-texts-using-embedding-information","title":"Deduplicate a list of texts using embedding information","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps import EmbeddingDedup\nfrom distilabel.steps import LoadDataFromDicts\n\nwith Pipeline() as pipeline:\n    data = LoadDataFromDicts(\n        data=[\n            {\n                \"persona\": \"A chemistry student or academic researcher interested in inorganic or physical chemistry, likely at an advanced undergraduate or graduate level, studying acid-base interactions and chemical bonding.\",\n                \"embedding\": [\n                    0.018477669046149742,\n                    -0.03748236608841726,\n                    0.001919870620352492,\n                    0.024918478063770535,\n                    0.02348063521315178,\n                    0.0038251285566308375,\n                    -0.01723884983037716,\n                    0.02881971942372201,\n                ],\n                \"nn_indices\": [0, 1],\n                \"nn_scores\": [\n                    0.9164746999740601,\n                    0.782106876373291,\n                ],\n            },\n            {\n                \"persona\": \"A music teacher or instructor focused on theoretical and practical piano lessons.\",\n                \"embedding\": [\n                    -0.0023464179614082125,\n                    -0.07325472251663565,\n                    -0.06058678419516501,\n                    -0.02100326928586996,\n                    -0.013462744792362657,\n                    0.027368447064244242,\n                    -0.003916070100455717,\n                    0.01243614518480423,\n                ],\n                \"nn_indices\": [0, 2],\n                \"nn_scores\": [\n                    0.7552462220191956,\n                    0.7261884808540344,\n                ],\n            },\n            {\n                \"persona\": \"A classical guitar teacher or instructor, likely with experience teaching beginners, who focuses on breaking down complex music notation into understandable steps for their 
students.\",\n                \"embedding\": [\n                    -0.01630817942328242,\n                    -0.023760151552345232,\n                    -0.014249650090627883,\n                    -0.005713686451446624,\n                    -0.016033059279131567,\n                    0.0071440908501058786,\n                    -0.05691099643425161,\n                    0.01597412704817784,\n                ],\n                \"nn_indices\": [1, 2],\n                \"nn_scores\": [\n                    0.8107735514640808,\n                    0.7172299027442932,\n                ],\n            },\n        ],\n        batch_size=batch_size,\n    )\n    # In general you should do something like this before the deduplication step, to obtain the\n    # `nn_indices` and `nn_scores`. In this case the embeddings are already normalized, so there's\n    # no need for it.\n    # nn = FaissNearestNeighbour(\n    #     k=30,\n    #     metric_type=faiss.METRIC_INNER_PRODUCT,\n    #     search_batch_size=50,\n    #     train_size=len(dataset),              # The number of embeddings to use for training\n    #     string_factory=\"IVF300_HNSW32,Flat\"   # To use an index (optional, maybe required for big datasets)\n    # )\n    # Read more about the `string_factory` here:\n    # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index\n\n    embedding_dedup = EmbeddingDedup(\n        threshold=0.8,\n        input_batch_size=batch_size,\n    )\n\n    data >> embedding_dedup\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(use_cache=False)\n    ds = distiset[\"default\"][\"train\"]\n    # Filter out the duplicates\n    ds_dedup = ds.filter(lambda x: x[\"keep_row_after_embedding_filtering\"])\n
"},{"location":"components-gallery/steps/apigenexecutionchecker/","title":"APIGenExecutionChecker","text":"

Executes the generated function calls.

This step checks if a given answer from a model as generated by APIGenGenerator can be executed against the given library (given by libpath, which is a string pointing to a python .py file with functions).

"},{"location":"components-gallery/steps/apigenexecutionchecker/#attributes","title":"Attributes","text":"
  • libpath: The path to the library where we will retrieve the functions. It can also point to a folder with the functions. In this case, the folder layout should be a folder with .py files, each containing a single function, the name of the function being the same as the filename.

  • check_is_dangerous: Bool to exclude some potentially dangerous functions, it contains some heuristics found while testing. These functions can run subprocesses, deal with the OS, or have other potentially dangerous operations. Defaults to True.

"},{"location":"components-gallery/steps/apigenexecutionchecker/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[answers]\n        end\n        subgraph New columns\n            OCOL0[keep_row_after_execution_check]\n            OCOL1[execution_result]\n        end\n    end\n\n    subgraph APIGenExecutionChecker\n        StepInput[Input Columns: answers]\n        StepOutput[Output Columns: keep_row_after_execution_check, execution_result]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/apigenexecutionchecker/#inputs","title":"Inputs","text":"
  • answers (str): List with arguments to be passed to the function, dumped as a string from a list of dictionaries. Should be loaded using json.loads.
"},{"location":"components-gallery/steps/apigenexecutionchecker/#outputs","title":"Outputs","text":"
  • keep_row_after_execution_check (bool): Whether the function should be kept or not.

  • execution_result (str): The result from executing the function.

"},{"location":"components-gallery/steps/apigenexecutionchecker/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/apigenexecutionchecker/#execute-a-function-from-a-given-library-with-the-answer-from-an-llm","title":"Execute a function from a given library with the answer from an LLM","text":"
from distilabel.steps.tasks import APIGenExecutionChecker\n\n# For the libpath you can use as an example the file at the tests folder:\n# ../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\ntask = APIGenExecutionChecker(\n    libpath=\"../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\",\n)\ntask.load()\n\nres = next(\n    task.process(\n        [\n            {\n                \"answers\": [\n                    {\n                        \"arguments\": {\n                            \"initial_velocity\": 0.2,\n                            \"acceleration\": 0.1,\n                            \"time\": 0.5,\n                        },\n                        \"name\": \"final_velocity\",\n                    }\n                ],\n            }\n        ]\n    )\n)\nres\n#[{'answers': [{'arguments': {'initial_velocity': 0.2, 'acceleration': 0.1, 'time': 0.5}, 'name': 'final_velocity'}], 'keep_row_after_execution_check': True, 'execution_result': ['0.25']}]\n
"},{"location":"components-gallery/steps/apigenexecutionchecker/#references","title":"References","text":"
  • APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets

  • Salesforce/xlam-function-calling-60k

"},{"location":"components-gallery/steps/minhashdedup/","title":"MinHashDedup","text":"

Deduplicates text using MinHash and MinHashLSH.

MinHashDedup is a Step that detects near-duplicates in datasets. The idea roughly translates to the following steps: 1. Tokenize the text into words or ngrams. 2. Create a MinHash for each text. 3. Store the MinHashes in a MinHashLSH. 4. Check if the MinHash is already in the LSH, if so, it is a duplicate.

"},{"location":"components-gallery/steps/minhashdedup/#attributes","title":"Attributes","text":"
  • num_perm: the number of permutations to use. Defaults to 128.

  • seed: the seed to use for the MinHash. This seed must be the same used for MinHash, keep in mind when both steps are created. Defaults to 1.

  • tokenizer: the tokenizer to use. Available ones are words or ngrams. If words is selected, it tokenizes the text into words using nltk's word tokenizer. ngrams computes the ngrams of the text, using the size given by n. Defaults to words.

  • n: the size of the ngrams to use. Only relevant if tokenizer=\"ngrams\". Defaults to 5.

  • threshold: the threshold to consider two MinHashes as duplicates. Values closer to 0 detect more duplicates. Defaults to 0.9.

  • storage: the storage to use for the LSH. Can be dict to store the index in memory, or disk. Keep in mind, disk is an experimental feature not defined in datasketch, that is based on DiskCache's Index class. It should work as a dict, but backed by disk, although depending on the system it can be slower and may cause issues. Defaults to dict.

"},{"location":"components-gallery/steps/minhashdedup/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[text]\n        end\n        subgraph New columns\n            OCOL0[keep_row_after_minhash_filtering]\n        end\n    end\n\n    subgraph MinHashDedup\n        StepInput[Input Columns: text]\n        StepOutput[Output Columns: keep_row_after_minhash_filtering]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/minhashdedup/#inputs","title":"Inputs","text":"
  • text (str): the texts to be filtered.
"},{"location":"components-gallery/steps/minhashdedup/#outputs","title":"Outputs","text":"
  • keep_row_after_minhash_filtering (bool): boolean indicating if the piece of text is not a duplicate, i.e. this text should be kept.
"},{"location":"components-gallery/steps/minhashdedup/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/minhashdedup/#deduplicate-a-list-of-texts-using-minhash-and-minhashlsh","title":"Deduplicate a list of texts using MinHash and MinHashLSH","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps import MinHashDedup\nfrom distilabel.steps import LoadDataFromDicts\n\nwith Pipeline() as pipeline:\n    ds_size = 1000\n    batch_size = 500  # Bigger batch sizes work better for this step\n    data = LoadDataFromDicts(\n        data=[\n            {\"text\": \"This is a test document.\"},\n            {\"text\": \"This document is a test.\"},\n            {\"text\": \"Test document for duplication.\"},\n            {\"text\": \"Document for duplication test.\"},\n            {\"text\": \"This is another unique document.\"},\n        ]\n        * (ds_size // 5),\n        batch_size=batch_size,\n    )\n    minhash_dedup = MinHashDedup(\n        tokenizer=\"words\",\n        threshold=0.9,      # lower values will increase the number of duplicates\n        storage=\"dict\",     # or \"disk\" for bigger datasets\n    )\n\n    data >> minhash_dedup\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(use_cache=False)\n    ds = distiset[\"default\"][\"train\"]\n    # Filter out the duplicates\n    ds_dedup = ds.filter(lambda x: x[\"keep_row_after_minhash_filtering\"])\n
"},{"location":"components-gallery/steps/minhashdedup/#references","title":"References","text":"
  • datasketch documentation

  • Identifying and Filtering Near-Duplicate Documents

  • Diskcache's Index

"},{"location":"components-gallery/steps/combineoutputs/","title":"CombineOutputs","text":"

Combine the outputs of several upstream steps.

CombineOutputs is a Step that takes the outputs of several upstream steps and combines them to generate a new dictionary with all keys/columns of the upstream steps outputs.

"},{"location":"components-gallery/steps/combineoutputs/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[dynamic]\n        end\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph CombineOutputs\n        StepInput[Input Columns: dynamic]\n        StepOutput[Output Columns: dynamic]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/combineoutputs/#inputs","title":"Inputs","text":"
  • dynamic (based on the upstream Steps): All the columns of the upstream steps outputs.
"},{"location":"components-gallery/steps/combineoutputs/#outputs","title":"Outputs","text":"
  • dynamic (based on the upstream Steps): All the columns of the upstream steps outputs.
"},{"location":"components-gallery/steps/combineoutputs/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/combineoutputs/#combine-dictionaries-of-a-dataset","title":"Combine dictionaries of a dataset","text":"
from distilabel.steps import CombineOutputs\n\ncombine_outputs = CombineOutputs()\ncombine_outputs.load()\n\nresult = next(\n    combine_outputs.process(\n        [{\"a\": 1, \"b\": 2}, {\"a\": 3, \"b\": 4}],\n        [{\"c\": 5, \"d\": 6}, {\"c\": 7, \"d\": 8}],\n    )\n)\n# [\n#   {\"a\": 1, \"b\": 2, \"c\": 5, \"d\": 6},\n#   {\"a\": 3, \"b\": 4, \"c\": 7, \"d\": 8},\n# ]\n
"},{"location":"components-gallery/steps/combineoutputs/#combine-upstream-steps-outputs-in-a-pipeline","title":"Combine upstream steps outputs in a pipeline","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps import CombineOutputs\n\nwith Pipeline() as pipeline:\n    step_1 = ...\n    step_2 = ...\n    step_3 = ...\n    combine = CombineOutputs()\n\n    [step_1, step_2, step_3] >> combine\n
"},{"location":"components-gallery/steps/expandcolumns/","title":"ExpandColumns","text":"

Expand columns that contain lists into multiple rows.

ExpandColumns is a Step that takes a list of columns and expands them into multiple rows. The new rows will have the same data as the original row, except for the expanded column, which will contain a single item from the original list.

"},{"location":"components-gallery/steps/expandcolumns/#attributes","title":"Attributes","text":"
  • columns: A dictionary that maps the column to be expanded to the new column name or a list of columns to be expanded. If a list is provided, the new column name will be the same as the column name.
"},{"location":"components-gallery/steps/expandcolumns/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[dynamic]\n        end\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph ExpandColumns\n        StepInput[Input Columns: dynamic]\n        StepOutput[Output Columns: dynamic]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/expandcolumns/#inputs","title":"Inputs","text":"
  • dynamic (determined by columns attribute): The columns to be expanded into multiple rows.
"},{"location":"components-gallery/steps/expandcolumns/#outputs","title":"Outputs","text":"
  • dynamic (determined by columns attribute): The expanded columns.
"},{"location":"components-gallery/steps/expandcolumns/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/expandcolumns/#expand-the-selected-columns-into-multiple-rows","title":"Expand the selected columns into multiple rows","text":"
from distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(\n    columns=[\"generation\"],\n)\nexpand_columns.load()\n\nresult = next(\n    expand_columns.process(\n        [\n            {\n                \"instruction\": \"instruction 1\",\n                \"generation\": [\"generation 1\", \"generation 2\"]}\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]\n
"},{"location":"components-gallery/steps/groupcolumns/","title":"GroupColumns","text":"

Combines columns from a list of StepInput.

GroupColumns is a Step that implements the process method that calls the group_dicts function to handle and combine a list of StepInput. Also GroupColumns provides two attributes columns and output_columns to specify the columns to group and the output columns which will override the default value for the properties inputs and outputs, respectively.

"},{"location":"components-gallery/steps/groupcolumns/#attributes","title":"Attributes","text":"
  • columns: List of strings with the names of the columns to group.

  • output_columns: Optional list of strings with the names of the output columns.

"},{"location":"components-gallery/steps/groupcolumns/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[dynamic]\n        end\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph GroupColumns\n        StepInput[Input Columns: dynamic]\n        StepOutput[Output Columns: dynamic]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/groupcolumns/#inputs","title":"Inputs","text":"
  • dynamic (determined by columns attribute): The columns to group.
"},{"location":"components-gallery/steps/groupcolumns/#outputs","title":"Outputs","text":"
  • dynamic (determined by columns and output_columns attributes): The columns that were grouped.
"},{"location":"components-gallery/steps/groupcolumns/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/groupcolumns/#group-columns-of-a-dataset","title":"Group columns of a dataset","text":"
from distilabel.steps import GroupColumns\n\ngroup_columns = GroupColumns(\n    name=\"group_columns\",\n    columns=[\"generation\", \"model_name\"],\n)\ngroup_columns.load()\n\nresult = next(\n    group_columns.process(\n        [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n        [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n    )\n)\n# >>> result\n# [{'merged_generation': ['AI generated text', 'Other generated text'], 'merged_model_name': ['my_model']}]\n
"},{"location":"components-gallery/steps/groupcolumns/#specify-the-name-of-the-output-columns","title":"Specify the name of the output columns","text":"
from distilabel.steps import GroupColumns\n\ngroup_columns = GroupColumns(\n    name=\"group_columns\",\n    columns=[\"generation\", \"model_name\"],\n    output_columns=[\"generations\", \"generation_models\"]\n)\ngroup_columns.load()\n\nresult = next(\n    group_columns.process(\n        [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n        [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n    )\n)\n# >>> result\n#[{'generations': ['AI generated text', 'Other generated text'], 'generation_models': ['my_model']}]\n
"},{"location":"components-gallery/steps/keepcolumns/","title":"KeepColumns","text":"

Keeps selected columns in the dataset.

KeepColumns is a Step that implements the process method that keeps only the columns specified in the columns attribute. Also KeepColumns provides an attribute columns to specify the columns to keep which will override the default value for the properties inputs and outputs.

"},{"location":"components-gallery/steps/keepcolumns/#note","title":"Note","text":"

The order in which the columns are provided is important, as the output will be sorted using the provided order, which is useful before pushing either a datasets.Dataset via the PushToHub step or a distilabel.Distiset via the Pipeline.run output variable.

"},{"location":"components-gallery/steps/keepcolumns/#attributes","title":"Attributes","text":"
  • columns: List of strings with the names of the columns to keep.
"},{"location":"components-gallery/steps/keepcolumns/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[dynamic]\n        end\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph KeepColumns\n        StepInput[Input Columns: dynamic]\n        StepOutput[Output Columns: dynamic]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/keepcolumns/#inputs","title":"Inputs","text":"
  • dynamic (determined by columns attribute): The columns to keep.
"},{"location":"components-gallery/steps/keepcolumns/#outputs","title":"Outputs","text":"
  • dynamic (determined by columns attribute): The columns that were kept.
"},{"location":"components-gallery/steps/keepcolumns/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/keepcolumns/#select-the-columns-to-keep","title":"Select the columns to keep","text":"
from distilabel.steps import KeepColumns\n\nkeep_columns = KeepColumns(\n    columns=[\"instruction\", \"generation\"],\n)\nkeep_columns.load()\n\nresult = next(\n    keep_columns.process(\n        [{\"instruction\": \"What's the brightest color?\", \"generation\": \"white\", \"model_name\": \"my_model\"}],\n    )\n)\n# >>> result\n# [{'instruction': \"What's the brightest color?\", 'generation': 'white'}]\n
"},{"location":"components-gallery/steps/mergecolumns/","title":"MergeColumns","text":"

Merge columns from a row.

MergeColumns is a Step that implements the process method that calls the merge_columns function to handle and combine columns in a StepInput. MergeColumns provides two attributes columns and output_column to specify the columns to merge and the resulting output column.

This step can be useful if you have a `Task` that generates instructions for example, and you\nwant to have more examples of those. In such a case, you could for example use another `Task`\nto multiply your instructions synthetically, which would yield two separate columns.\nUsing `MergeColumns` you can merge them and use them as a single column in your dataset for\nfurther processing.\n
"},{"location":"components-gallery/steps/mergecolumns/#attributes","title":"Attributes","text":"
  • columns: List of strings with the names of the columns to merge.

  • output_column: str name of the output column

"},{"location":"components-gallery/steps/mergecolumns/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[dynamic]\n        end\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph MergeColumns\n        StepInput[Input Columns: dynamic]\n        StepOutput[Output Columns: dynamic]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/mergecolumns/#inputs","title":"Inputs","text":"
  • dynamic (determined by columns attribute): The columns to merge.
"},{"location":"components-gallery/steps/mergecolumns/#outputs","title":"Outputs","text":"
  • dynamic (determined by columns and output_column attributes): The columns that were merged.
"},{"location":"components-gallery/steps/mergecolumns/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/mergecolumns/#combine-columns-in-rows-of-a-dataset","title":"Combine columns in rows of a dataset","text":"
from distilabel.steps import MergeColumns\n\ncombiner = MergeColumns(\n    columns=[\"queries\", \"multiple_queries\"],\n    output_column=\"queries\",\n)\ncombiner.load()\n\nresult = next(\n    combiner.process(\n        [\n            {\n                \"queries\": \"How are you?\",\n                \"multiple_queries\": [\"What's up?\", \"Everything ok?\"]\n            }\n        ],\n    )\n)\n# >>> result\n# [{'queries': ['How are you?', \"What's up?\", 'Everything ok?']}]\n
"},{"location":"components-gallery/steps/dbscan/","title":"DBSCAN","text":"

DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core

samples in regions of high density and expands clusters from them. This algorithm is good for data which contains clusters of similar density.

This is a `GlobalStep` that clusters the embeddings using the DBSCAN algorithm\nfrom `sklearn`. Visit `TextClustering` step for an example of use.\nThe trained model is saved as an artifact when creating a distiset\nand pushing it to the Hugging Face Hub.\n
"},{"location":"components-gallery/steps/dbscan/#attributes","title":"Attributes","text":"
  • eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. - min_samples: The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. If min_samples is set to a higher value, DBSCAN will find denser clusters, whereas if it is set to a lower value, the found clusters will be more sparse. - metric: The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by sklearn.metrics.pairwise_distances for its metric parameter. - n_jobs: The number of parallel jobs to run.
"},{"location":"components-gallery/steps/dbscan/#runtime-parameters","title":"Runtime Parameters","text":"
  • eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.

  • min_samples: The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. If min_samples is set to a higher value, DBSCAN will find denser clusters, whereas if it is set to a lower value, the found clusters will be more sparse.

  • metric: The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by sklearn.metrics.pairwise_distances for its metric parameter.

  • n_jobs: The number of parallel jobs to run.

"},{"location":"components-gallery/steps/dbscan/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[projection]\n        end\n        subgraph New columns\n            OCOL0[cluster_label]\n        end\n    end\n\n    subgraph DBSCAN\n        StepInput[Input Columns: projection]\n        StepOutput[Output Columns: cluster_label]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/dbscan/#inputs","title":"Inputs","text":"
  • projection (List[float]): Vector representation of the text to cluster, normally the output from the UMAP step.
"},{"location":"components-gallery/steps/dbscan/#outputs","title":"Outputs","text":"
  • cluster_label (int): Integer representing the label of a given cluster. -1 means it wasn't clustered.
"},{"location":"components-gallery/steps/dbscan/#references","title":"References","text":"
  • DBSCAN demo of sklearn

  • sklearn dbscan

"},{"location":"components-gallery/steps/umap/","title":"UMAP","text":"

UMAP is a general purpose manifold learning and dimension reduction algorithm.

This is a GlobalStep that reduces the dimensionality of the embeddings using UMAP. Visit the TextClustering step for an example of use. The trained model is saved as an artifact when creating a distiset and pushing it to the Hugging Face Hub.

"},{"location":"components-gallery/steps/umap/#attributes","title":"Attributes","text":"
  • n_components: The dimension of the space to embed into. This defaults to 2 to provide easy visualization (that's probably what you want), but can reasonably be set to any integer value in the range 2 to 100. - metric: The metric to use to compute distances in high dimensional space. Visit UMAP's documentation for more information. Defaults to euclidean. - n_jobs: The number of parallel jobs to run. Defaults to 8. - random_state: The random state to use for the UMAP algorithm.
"},{"location":"components-gallery/steps/umap/#runtime-parameters","title":"Runtime Parameters","text":"
  • n_components: The dimension of the space to embed into. This defaults to 2 to provide easy visualization (that's probably what you want), but can reasonably be set to any integer value in the range 2 to 100.

  • metric: The metric to use to compute distances in high dimensional space. Visit UMAP's documentation for more information. Defaults to euclidean.

  • n_jobs: The number of parallel jobs to run. Defaults to 8.

  • random_state: The random state to use for the UMAP algorithm.

"},{"location":"components-gallery/steps/umap/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[embedding]\n        end\n        subgraph New columns\n            OCOL0[projection]\n        end\n    end\n\n    subgraph UMAP\n        StepInput[Input Columns: embedding]\n        StepOutput[Output Columns: projection]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/umap/#inputs","title":"Inputs","text":"
  • embedding (List[float]): The original embeddings whose dimensionality we want to reduce.
"},{"location":"components-gallery/steps/umap/#outputs","title":"Outputs","text":"
  • projection (List[float]): Embedding reduced to the number of components specified, the size of the new embeddings will be determined by the n_components.
"},{"location":"components-gallery/steps/umap/#references","title":"References","text":"
  • UMAP repository

  • UMAP documentation

"},{"location":"components-gallery/steps/faissnearestneighbour/","title":"FaissNearestNeighbour","text":"

Create a faiss index to get the nearest neighbours.

FaissNearestNeighbour is a GlobalStep that creates a faiss index using the Hugging Face datasets library integration, and then gets the nearest neighbours and the scores or distance of the nearest neighbours for each input row.

"},{"location":"components-gallery/steps/faissnearestneighbour/#attributes","title":"Attributes","text":"
  • device: the CUDA device ID or a list of IDs to be used. If negative integer, it will use all the available GPUs. Defaults to None.

  • string_factory: the name of the factory to be used to build the faiss index. Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes. Defaults to None.

  • metric_type: the metric to be used to measure the distance between the points. It's an integer and the recommended way to pass it is importing faiss and then passing one of faiss.METRIC_x variables. Defaults to None.

  • k: the number of nearest neighbours to search for each input row. Defaults to 1.

  • search_batch_size: the number of rows to include in a search batch. The value can be adjusted to maximize the resources usage or to avoid OOM issues. Defaults to 50.

  • train_size: If the index needs a training step, specifies how many vectors will be used to train the index.

"},{"location":"components-gallery/steps/faissnearestneighbour/#runtime-parameters","title":"Runtime Parameters","text":"
  • device: the CUDA device ID or a list of IDs to be used. If negative integer, it will use all the available GPUs. Defaults to None.

  • string_factory: the name of the factory to be used to build the faiss index. Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes. Defaults to None.

  • metric_type: the metric to be used to measure the distance between the points. It's an integer and the recommended way to pass it is importing faiss and then passing one of faiss.METRIC_x variables. Defaults to None.

  • k: the number of nearest neighbours to search for each input row. Defaults to 1.

  • search_batch_size: the number of rows to include in a search batch. The value can be adjusted to maximize the resources usage or to avoid OOM issues. Defaults to 50.

  • train_size: If the index needs a training step, specifies how many vectors will be used to train the index.

"},{"location":"components-gallery/steps/faissnearestneighbour/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[embedding]\n        end\n        subgraph New columns\n            OCOL0[nn_indices]\n            OCOL1[nn_scores]\n        end\n    end\n\n    subgraph FaissNearestNeighbour\n        StepInput[Input Columns: embedding]\n        StepOutput[Output Columns: nn_indices, nn_scores]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/faissnearestneighbour/#inputs","title":"Inputs","text":"
  • embedding (List[Union[float, int]]): a sentence embedding.
"},{"location":"components-gallery/steps/faissnearestneighbour/#outputs","title":"Outputs","text":"
  • nn_indices (List[int]): a list containing the indices of the k nearest neighbours in the inputs for the row.

  • nn_scores (List[float]): a list containing the score or distance to each k nearest neighbour in the inputs.

"},{"location":"components-gallery/steps/faissnearestneighbour/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/faissnearestneighbour/#generating-embeddings-and-getting-the-nearest-neighbours","title":"Generating embeddings and getting the nearest neighbours","text":"
from distilabel.models import SentenceTransformerEmbeddings\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import EmbeddingGeneration, FaissNearestNeighbour, LoadDataFromHub\n\nwith Pipeline(name=\"hello\") as pipeline:\n    load_data = LoadDataFromHub(output_mappings={\"prompt\": \"text\"})\n\n    embeddings = EmbeddingGeneration(\n        embeddings=SentenceTransformerEmbeddings(\n            model=\"mixedbread-ai/mxbai-embed-large-v1\"\n        )\n    )\n\n    nearest_neighbours = FaissNearestNeighbour()\n\n    load_data >> embeddings >> nearest_neighbours\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        parameters={\n            load_data.name: {\n                \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n                \"split\": \"test\",\n            },\n        },\n        use_cache=False,\n    )\n
"},{"location":"components-gallery/steps/faissnearestneighbour/#references","title":"References","text":"
  • The Faiss library
"},{"location":"components-gallery/steps/embeddinggeneration/","title":"EmbeddingGeneration","text":"

Generate embeddings using an Embeddings model.

EmbeddingGeneration is a Step that using an Embeddings model generates sentence embeddings for the provided input texts.

"},{"location":"components-gallery/steps/embeddinggeneration/#attributes","title":"Attributes","text":"
  • embeddings: the Embeddings model used to generate the sentence embeddings.
"},{"location":"components-gallery/steps/embeddinggeneration/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[text]\n        end\n        subgraph New columns\n            OCOL0[embedding]\n        end\n    end\n\n    subgraph EmbeddingGeneration\n        StepInput[Input Columns: text]\n        StepOutput[Output Columns: embedding]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/embeddinggeneration/#inputs","title":"Inputs","text":"
  • text (str): The text for which the sentence embedding has to be generated.
"},{"location":"components-gallery/steps/embeddinggeneration/#outputs","title":"Outputs","text":"
  • embedding (List[Union[float, int]]): the generated sentence embedding.
"},{"location":"components-gallery/steps/embeddinggeneration/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/embeddinggeneration/#generate-sentence-embeddings-with-sentence-transformers","title":"Generate sentence embeddings with Sentence Transformers","text":"
from distilabel.models import SentenceTransformerEmbeddings\nfrom distilabel.steps import EmbeddingGeneration\n\nembedding_generation = EmbeddingGeneration(\n    embeddings=SentenceTransformerEmbeddings(\n        model=\"mixedbread-ai/mxbai-embed-large-v1\",\n    )\n)\n\nembedding_generation.load()\n\nresult = next(embedding_generation.process([{\"text\": \"Hello, how are you?\"}]))\n# [{'text': 'Hello, how are you?', 'embedding': [0.06209656596183777, -0.015797119587659836, ...]}]\n
"},{"location":"components-gallery/steps/rewardmodelscore/","title":"RewardModelScore","text":"

Assign a score to a response using a Reward Model.

RewardModelScore is a Step that, using a Reward Model (RM) loaded using transformers, assigns a score to a response generated for an instruction, or a score to a multi-turn conversation.

"},{"location":"components-gallery/steps/rewardmodelscore/#attributes","title":"Attributes","text":"
  • model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

  • revision: if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\".

  • torch_dtype: the torch dtype to use for the model e.g. \"float16\", \"float32\", etc. Defaults to \"auto\".

  • trust_remote_code: whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False.

  • device_map: a dictionary mapping each layer of the model to a device, or a mode like \"sequential\" or \"auto\". Defaults to None.

  • token: the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment or huggingface_hub package local configuration will be used. Defaults to None.

  • truncation: whether to truncate sequences at the maximum length. Defaults to False.

  • max_length: maximum length to use for padding or truncation. Defaults to None.

"},{"location":"components-gallery/steps/rewardmodelscore/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[response]\n            ICOL2[conversation]\n        end\n        subgraph New columns\n            OCOL0[score]\n        end\n    end\n\n    subgraph RewardModelScore\n        StepInput[Input Columns: instruction, response, conversation]\n        StepOutput[Output Columns: score]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/rewardmodelscore/#inputs","title":"Inputs","text":"
  • instruction (str, optional): the instruction used to generate a response. If provided, then response must be provided too.

  • response (str, optional): the response generated for instruction. If provided, then instruction must be provided too.

  • conversation (ChatType, optional): a multi-turn conversation. If not provided, then instruction and response columns must be provided.

"},{"location":"components-gallery/steps/rewardmodelscore/#outputs","title":"Outputs","text":"
  • score (float): the score given by the reward model for the instruction-response pair or the conversation.
"},{"location":"components-gallery/steps/rewardmodelscore/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/rewardmodelscore/#response-pair","title":"response pair","text":"
from distilabel.steps import RewardModelScore\n\nstep = RewardModelScore(\n    model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n)\n\nstep.load()\n\nresult = next(\n    step.process(\n        inputs=[\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"response\": \"The output of 2+2 is 4\",\n            },\n            {\"instruction\": \"How much is 2+2?\", \"response\": \"4\"},\n        ]\n    )\n)\n# [\n#   {'instruction': 'How much is 2+2?', 'response': 'The output of 2+2 is 4', 'score': 0.11690367758274078},\n#   {'instruction': 'How much is 2+2?', 'response': '4', 'score': 0.10300665348768234}\n# ]\n
"},{"location":"components-gallery/steps/rewardmodelscore/#turn-conversation","title":"turn conversation","text":"
from distilabel.steps import RewardModelScore\n\nstep = RewardModelScore(\n    model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n)\n\nstep.load()\n\nresult = next(\n    step.process(\n        inputs=[\n            {\n                \"conversation\": [\n                    {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n                    {\"role\": \"assistant\", \"content\": \"The output of 2+2 is 4\"},\n                ],\n            },\n            {\n                \"conversation\": [\n                    {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n                    {\"role\": \"assistant\", \"content\": \"4\"},\n                ],\n            },\n        ]\n    )\n)\n# [\n#   {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': 'The output of 2+2 is 4'}], 'score': 0.11690367758274078},\n#   {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': '4'}], 'score': 0.10300665348768234}\n# ]\n
"},{"location":"components-gallery/steps/truncatetextcolumn/","title":"TruncateTextColumn","text":"

Truncate a row using a tokenizer or the number of characters.

TruncateTextColumn is a Step that truncates a row according to the max length. If the tokenizer is provided, then the row will be truncated using the tokenizer, and the max_length will be used as the maximum number of tokens, otherwise it will be used as the maximum number of characters. The TruncateTextColumn step is useful when one wants to truncate a row to a certain length, to avoid posterior errors in the model due to the length.

"},{"location":"components-gallery/steps/truncatetextcolumn/#attributes","title":"Attributes","text":"
  • column: the column to truncate. Defaults to \"text\".

  • max_length: the maximum length to use for truncation. If a tokenizer is given, corresponds to the number of tokens, otherwise corresponds to the number of characters. Defaults to 8192.

  • tokenizer: the name of the tokenizer to use. If provided, the row will be truncated using the tokenizer. Defaults to None.

"},{"location":"components-gallery/steps/truncatetextcolumn/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[dynamic]\n        end\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph TruncateTextColumn\n        StepInput[Input Columns: dynamic]\n        StepOutput[Output Columns: dynamic]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/truncatetextcolumn/#inputs","title":"Inputs","text":"
  • dynamic (determined by column attribute): The columns to be truncated, defaults to \"text\".
"},{"location":"components-gallery/steps/truncatetextcolumn/#outputs","title":"Outputs","text":"
  • dynamic (determined by column attribute): The truncated column.
"},{"location":"components-gallery/steps/truncatetextcolumn/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/truncatetextcolumn/#truncating-a-row-to-a-given-number-of-tokens","title":"Truncating a row to a given number of tokens","text":"
from distilabel.steps import TruncateTextColumn\n\ntrunc = TruncateTextColumn(\n    tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    max_length=4,\n    column=\"text\"\n)\n\ntrunc.load()\n\nresult = next(\n    trunc.process(\n        [\n            {\"text\": \"This is a sample text that is longer than 10 characters\"}\n        ]\n    )\n)\n# result\n# [{'text': 'This is a sample'}]\n
"},{"location":"components-gallery/steps/truncatetextcolumn/#truncating-a-row-to-a-given-number-of-characters","title":"Truncating a row to a given number of characters","text":"
from distilabel.steps import TruncateTextColumn\n\ntrunc = TruncateTextColumn(max_length=10)\n\ntrunc.load()\n\nresult = next(\n    trunc.process(\n        [\n            {\"text\": \"This is a sample text that is longer than 10 characters\"}\n        ]\n    )\n)\n# result\n# [{'text': 'This is a '}]\n
"},{"location":"components-gallery/tasks/","title":"Tasks Gallery","text":"Category Overview

The gallery page showcases the different types of components within distilabel.

Icon Category Description text-generation Text generation steps are used to generate text based on a given prompt. chat-generation Chat generation steps are used to generate text based on a conversation. text-classification Text classification steps are used to classify text into a category. text-manipulation Text manipulation steps are used to manipulate or rewrite an input text. evol Evol steps are used to rewrite input text and evolve it to a higher quality. critique Critique steps are used to provide feedback on the quality of the data with a written explanation. scorer Scorer steps are used to evaluate and score the data with a numerical value. preference Preference steps are used to collect preferences on the data with numerical values or ranks. embedding Embedding steps are used to generate embeddings for the data. clustering Clustering steps are used to group similar data points together. columns Columns steps are used to manipulate columns in the data. filtering Filtering steps are used to filter the data based on some criteria. format Format steps are used to format the data. load Load steps are used to load the data. execution Executes python functions. save Save steps are used to save the data.
  • APIGenGenerator

    Generate queries and answers for the given functions in JSON format.

    APIGenGenerator

  • Genstruct

    Generate a pair of instruction-response from a document using an LLM.

    Genstruct

  • Magpie

    Generates conversations using an instruct fine-tuned LLM.

    Magpie

  • SelfInstruct

    Generate instructions based on a given input using an LLM.

    SelfInstruct

  • TextGeneration

    Text generation with an LLM given a prompt.

    TextGeneration

  • URIAL

    Generates a response using a non-instruct fine-tuned model.

    URIAL

  • MagpieGenerator

    Generator task that generates instructions or conversations using Magpie.

    MagpieGenerator

  • ChatGeneration

    Generates text based on a conversation.

    ChatGeneration

  • ArgillaLabeller

    Annotate Argilla records based on input fields, example records and question settings.

    ArgillaLabeller

  • TextClassification

    Classifies text into one or more categories or labels.

    TextClassification

  • EvolInstruct

    Evolve instructions using an LLM.

    EvolInstruct

  • EvolComplexity

    Evolve instructions to make them more complex using an LLM.

    EvolComplexity

  • EvolQuality

    Evolve the quality of the responses using an LLM.

    EvolQuality

  • EvolInstructGenerator

    Generate evolved instructions using an LLM.

    EvolInstructGenerator

  • EvolComplexityGenerator

    Generate evolved instructions with increased complexity using an LLM.

    EvolComplexityGenerator

  • InstructionBacktranslation

    Self-Alignment with Instruction Backtranslation.

    InstructionBacktranslation

  • PrometheusEval

    Critique and rank the quality of generations from an LLM using Prometheus 2.0.

    PrometheusEval

  • ComplexityScorer

    Score instructions based on their complexity using an LLM.

    ComplexityScorer

  • QualityScorer

    Score responses based on their quality using an LLM.

    QualityScorer

  • CLAIR

    Contrastive Learning from AI Revisions (CLAIR).

    CLAIR

  • UltraFeedback

    Rank generations focusing on different aspects using an LLM.

    UltraFeedback

  • PairRM

    Rank the candidates based on the input using the LLM model.

    PairRM

  • GenerateSentencePair

    Generate a positive and (optionally) a negative sentence given an anchor sentence.

    GenerateSentencePair

  • GenerateEmbeddings

    Generate embeddings using the last hidden state of an LLM.

    GenerateEmbeddings

  • TextClustering

    Task that clusters a set of texts and generates summary labels for each cluster.

    TextClustering

  • TextClustering

    Task that clusters a set of texts and generates summary labels for each cluster.

    TextClustering

  • APIGenSemanticChecker

    Generate queries and answers for the given functions in JSON format.

    APIGenSemanticChecker

  • GenerateTextRetrievalData

    Generate text retrieval data with an LLM to later on train an embedding model.

    GenerateTextRetrievalData

  • GenerateShortTextMatchingData

    Generate short text matching data with an LLM to later on train an embedding model.

    GenerateShortTextMatchingData

  • GenerateLongTextMatchingData

    Generate long text matching data with an LLM to later on train an embedding model.

    GenerateLongTextMatchingData

  • GenerateTextClassificationData

    Generate text classification data with an LLM to later on train an embedding model.

    GenerateTextClassificationData

  • StructuredGeneration

    Generate structured content for a given instruction using an LLM.

    StructuredGeneration

  • MonolingualTripletGenerator

    Generate monolingual triplets with an LLM to later on train an embedding model.

    MonolingualTripletGenerator

  • BitextRetrievalGenerator

    Generate bitext retrieval data with an LLM to later on train an embedding model.

    BitextRetrievalGenerator

  • EmbeddingTaskGenerator

    Generate task descriptions for embedding-related tasks using an LLM.

    EmbeddingTaskGenerator

"},{"location":"components-gallery/tasks/apigengenerator/","title":"APIGenGenerator","text":"

Generate queries and answers for the given functions in JSON format.

The APIGenGenerator is inspired by the APIGen pipeline, which was designed to generate verifiable and diverse function-calling datasets. The task generates a set of diverse queries and corresponding answers for the given functions in JSON format.

"},{"location":"components-gallery/tasks/apigengenerator/#attributes","title":"Attributes","text":"
  • system_prompt: The system prompt to guide the user in the generation of queries and answers.

  • use_tools: Whether to use the tools available in the prompt to generate the queries and answers. In case the tools are given in the input, they will be added to the prompt.

  • number: The number of queries to generate. It can be a list, where each number will be chosen randomly, or a dictionary with the number of queries and the probability of each. I.e: number=1, number=[1, 2, 3], number={1: 0.5, 2: 0.3, 3: 0.2} are all valid inputs. It corresponds to the number of parallel queries to generate.

  • use_default_structured_output: Whether to use the default structured output or not.

"},{"location":"components-gallery/tasks/apigengenerator/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[examples]\n            ICOL1[func_name]\n            ICOL2[func_desc]\n            ICOL3[tools]\n        end\n        subgraph New columns\n            OCOL0[query]\n            OCOL1[answers]\n        end\n    end\n\n    subgraph APIGenGenerator\n        StepInput[Input Columns: examples, func_name, func_desc, tools]\n        StepOutput[Output Columns: query, answers]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    ICOL3 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/apigengenerator/#inputs","title":"Inputs","text":"
  • examples (str): Examples used as few shots to guide the model.

  • func_name (str): Name for the function to generate.

  • func_desc (str): Description of what the function should do.

  • tools (str): JSON formatted string containing the tool representation of the function.

"},{"location":"components-gallery/tasks/apigengenerator/#outputs","title":"Outputs","text":"
  • query (str): The list of queries.

  • answers (str): JSON formatted string with the list of answers, containing the info as a dictionary to be passed to the functions.

"},{"location":"components-gallery/tasks/apigengenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/apigengenerator/#generate-without-structured-output-original-implementation","title":"Generate without structured output (original implementation)","text":"
from distilabel.steps.tasks import ApiGenGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    generation_kwargs={\n        \"temperature\": 0.7,\n        \"max_new_tokens\": 1024,\n    },\n)\napigen = ApiGenGenerator(\n    use_default_structured_output=False,\n    llm=llm\n)\napigen.load()\n\nres = next(\n    apigen.process(\n        [\n            {\n                \"examples\": 'QUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n                \"func_name\": \"getrandommovie\",\n                \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\"\n            }\n        ]\n    )\n)\nres\n# [{'examples': 'QUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n# 'number': 1,\n# 'func_name': 'getrandommovie',\n# 'func_desc': 'Returns a list of random movies from a database by calling an external API.',\n# 'queries': ['I want to watch a movie tonight, can you recommend a random one from your database?',\n# 'Give me 5 random movie suggestions from your database to plan my weekend.'],\n# 'answers': [[{'name': 'getrandommovie', 'arguments': {}}],\n# [{'name': 'getrandommovie', 'arguments': {}},\n#     {'name': 'getrandommovie', 'arguments': {}},\n#     {'name': 'getrandommovie', 'arguments': {}},\n#     {'name': 'getrandommovie', 'arguments': {}},\n#     {'name': 'getrandommovie', 'arguments': {}}]],\n# 'raw_input_api_gen_generator_0': [{'role': 'system',\n#     'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\n\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. 
Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.\n\nEnsure the query:\n- Is clear and concise\n- Demonstrates typical use cases\n- Includes all necessary parameters in a meaningful way. For numerical parameters, it could be either numbers or words\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\n- The corresponding result's parameter types and ranges match with the function's descriptions\n\nEnsure the answer:\n- Is a list of function calls in JSON format\n- The length of the answer list should be equal to the number of requests in the query\n- Can solve all the requests in the query effectively\"},\n#     {'role': 'user',\n#     'content': 'Here are examples of queries and the corresponding answers for similar functions:\nQUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]\n\nNote that the query could be interpreted as a combination of several independent requests.\nBased on these examples, generate 2 diverse query and answer pairs for the function `getrandommovie`\nThe detailed function description is the following:\nReturns a list of random movies from a database by calling an external API.\n\nThe output MUST strictly adhere to the following JSON format, and NO other text MUST be included:\n
"},{"location":"components-gallery/tasks/apigengenerator/#generate-with-structured-output","title":"Generate with structured output","text":"
from distilabel.steps.tasks import ApiGenGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    generation_kwargs={\n        \"temperature\": 0.7,\n        \"max_new_tokens\": 1024,\n    },\n)\napigen = ApiGenGenerator(\n    use_default_structured_output=True,\n    llm=llm\n)\napigen.load()\n\nres_struct = next(\n    apigen.process(\n        [\n            {\n                \"examples\": 'QUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n                \"func_name\": \"getrandommovie\",\n                \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\"\n            }\n        ]\n    )\n)\nres_struct\n# [{'examples': 'QUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n# 'number': 1,\n# 'func_name': 'getrandommovie',\n# 'func_desc': 'Returns a list of random movies from a database by calling an external API.',\n# 'queries': [\"I'm bored and want to watch a movie. Can you suggest some movies?\",\n# \"My family and I are planning a movie night. We can't decide on what to watch. Can you suggest some random movie titles?\"],\n# 'answers': [[{'arguments': {}, 'name': 'getrandommovie'}],\n# [{'arguments': {}, 'name': 'getrandommovie'}]],\n# 'raw_input_api_gen_generator_0': [{'role': 'system',\n#     'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\n\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. 
For instance, if the function requires a date, use a typical and reasonable date.\n\nEnsure the query:\n- Is clear and concise\n- Demonstrates typical use cases\n- Includes all necessary parameters in a meaningful way. For numerical parameters, it could be either numbers or words\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\n- The corresponding result's parameter types and ranges match with the function's descriptions\n\nEnsure the answer:\n- Is a list of function calls in JSON format\n- The length of the answer list should be equal to the number of requests in the query\n- Can solve all the requests in the query effectively\"},\n#     {'role': 'user',\n#     'content': 'Here are examples of queries and the corresponding answers for similar functions:\nQUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]\n\nNote that the query could be interpreted as a combination of several independent requests.\nBased on these examples, generate 2 diverse query and answer pairs for the function `getrandommovie`\nThe detailed function description is the following:\nReturns a list of random movies from a database by calling an external API.\n\nNow please generate 2 diverse query and answer pairs following the above format.'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/apigengenerator/#references","title":"References","text":"
  • APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets

  • Salesforce/xlam-function-calling-60k

"},{"location":"components-gallery/tasks/genstruct/","title":"Genstruct","text":"

Generate a pair of instruction-response from a document using an LLM.

Genstruct is a pre-defined task designed to generate valid instructions from a given raw document, with the title and the content, enabling the creation of new, partially synthetic instruction finetuning datasets from any raw-text corpus. The task is based on the Genstruct 7B model by Nous Research, which is inspired by the Ada-Instruct paper.

"},{"location":"components-gallery/tasks/genstruct/#note","title":"Note","text":"

The Genstruct prompt i.e. the task, can be used with any model really, but the safest / recommended option is to use NousResearch/Genstruct-7B as the LLM provided to the task, since it was trained for this specific task.

"},{"location":"components-gallery/tasks/genstruct/#attributes","title":"Attributes","text":"
  • _template: a Jinja2 template used to format the input for the LLM.
"},{"location":"components-gallery/tasks/genstruct/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[title]\n            ICOL1[content]\n        end\n        subgraph New columns\n            OCOL0[user]\n            OCOL1[assistant]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph Genstruct\n        StepInput[Input Columns: title, content]\n        StepOutput[Output Columns: user, assistant, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/genstruct/#inputs","title":"Inputs","text":"
  • title (str): The title of the document.

  • content (str): The content of the document.

"},{"location":"components-gallery/tasks/genstruct/#outputs","title":"Outputs","text":"
  • user (str): The user's instruction based on the document.

  • assistant (str): The assistant's response based on the user's instruction.

  • model_name (str): The model name used to generate the feedback and result.

"},{"location":"components-gallery/tasks/genstruct/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/genstruct/#generate-instructions-from-raw-documents-using-the-title-and-content","title":"Generate instructions from raw documents using the title and content","text":"
from distilabel.steps.tasks import Genstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\ngenstruct = Genstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"NousResearch/Genstruct-7B\",\n    ),\n)\n\ngenstruct.load()\n\nresult = next(\n    genstruct.process(\n        [\n            {\"title\": \"common instruction\", \"content\": \"content of the document\"},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'title': 'An instruction',\n#         'content': 'content of the document',\n#         'model_name': 'test',\n#         'user': 'An instruction',\n#         'assistant': 'content of the document',\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/genstruct/#references","title":"References","text":"
  • Genstruct 7B by Nous Research

  • Ada-Instruct: Adapting Instruction Generators for Complex Reasoning

"},{"location":"components-gallery/tasks/magpie/","title":"Magpie","text":"

Generates conversations using an instruct fine-tuned LLM.

Magpie is a neat method that allows generating user instructions with no seed data or specific system prompt thanks to the autoregressive capabilities of the instruct fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the LLM without any user message, then the LLM will continue generating tokens as if it was the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM. After this instruct is generated, it can be sent again to the LLM to generate this time an assistant response. This process can be repeated N times allowing to build a multi-turn conversation. This method was described in the paper 'Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing'.

"},{"location":"components-gallery/tasks/magpie/#attributes","title":"Attributes","text":"
  • n_turns: the number of turns that the generated conversation will have. Defaults to 1.

  • end_with_user: whether the conversation should end with a user message. Defaults to False.

  • include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False.

  • only_instruction: whether to generate only the instruction. If this argument is True, then n_turns will be ignored. Defaults to False.

  • system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Defaults to None.

"},{"location":"components-gallery/tasks/magpie/#runtime-parameters","title":"Runtime Parameters","text":"
  • n_turns: the number of turns that the generated conversation will have. Defaults to 1.

  • end_with_user: whether the conversation should end with a user message. Defaults to False.

  • include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False.

  • only_instruction: whether to generate only the instruction. If this argument is True, then n_turns will be ignored. Defaults to False.

  • system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic.

"},{"location":"components-gallery/tasks/magpie/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[system_prompt]\n        end\n        subgraph New columns\n            OCOL0[conversation]\n            OCOL1[instruction]\n            OCOL2[response]\n            OCOL3[system_prompt_key]\n            OCOL4[model_name]\n        end\n    end\n\n    subgraph Magpie\n        StepInput[Input Columns: system_prompt]\n        StepOutput[Output Columns: conversation, instruction, response, system_prompt_key, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n    StepOutput --> OCOL4\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/magpie/#inputs","title":"Inputs","text":"
  • system_prompt (str, optional): an optional system prompt that can be provided to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic.
"},{"location":"components-gallery/tasks/magpie/#outputs","title":"Outputs","text":"
  • conversation (ChatType): the generated conversation which is a list of chat items with a role and a message. Only if only_instruction=False.

  • instruction (str): the generated instructions if only_instruction=True or n_turns==1.

  • response (str): the generated response if n_turns==1.

  • system_prompt_key (str, optional): the key of the system prompt used to generate the conversation or instruction. Only if system_prompt is a dictionary.

  • model_name (str): The model name used to generate the conversation or instruction.

"},{"location":"components-gallery/tasks/magpie/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/magpie/#generating-instructions-with-llama-3-8b-instruct-and-transformersllm","title":"Generating instructions with Llama 3 8B Instruct and TransformersLLM","text":"
from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import Magpie\n\nmagpie = Magpie(\n    llm=TransformersLLM(\n        model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 1.0,\n            \"max_new_tokens\": 64,\n        },\n        device=\"mps\",\n    ),\n    only_instruction=True,\n)\n\nmagpie.load()\n\nresult = next(\n    magpie.process(\n        inputs=[\n            {\n                \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n            },\n            {\n                \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n            },\n        ]\n    )\n)\n# [\n#     {'instruction': \"That's me! I'd love some help with solving calculus problems! What kind of calculation are you most effective at? Linear Algebra, derivatives, integrals, optimization?\"},\n#     {'instruction': 'I was wondering if there are certain flowers and plants that can be used for pest control?'}\n# ]\n
"},{"location":"components-gallery/tasks/magpie/#generating-conversations-with-llama-3-8b-instruct-and-transformersllm","title":"Generating conversations with Llama 3 8B Instruct and TransformersLLM","text":"
from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import Magpie\n\nmagpie = Magpie(\n    llm=TransformersLLM(\n        model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 1.0,\n            \"max_new_tokens\": 256,\n        },\n        device=\"mps\",\n    ),\n    n_turns=2,\n)\n\nmagpie.load()\n\nresult = next(\n    magpie.process(\n        inputs=[\n            {\n                \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n            },\n            {\n                \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n            },\n        ]\n    )\n)\n# [\n#     {\n#         'conversation': [\n#             {'role': 'system', 'content': \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"},\n#             {\n#                 'role': 'user',\n#                 'content': 'I'm having trouble solving the limits of functions in calculus. Could you explain how to work with them? Limits of functions are denoted by lim x\u2192a f(x) or lim x\u2192a [f(x)]. It is read as \"the limit as x approaches a of f\n# of x\".'\n#             },\n#             {\n#                 'role': 'assistant',\n#                 'content': 'Limits are indeed a fundamental concept in calculus, and understanding them can be a bit tricky at first, but don't worry, I'm here to help! The notation lim x\u2192a f(x) indeed means \"the limit as x approaches a of f of\n# x\". 
What it's asking us to do is find the'\n#             }\n#         ]\n#     },\n#     {\n#         'conversation': [\n#             {'role': 'system', 'content': \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"},\n#             {\n#                 'role': 'user',\n#                 'content': \"As a flower shop owner, I'm noticing some unusual worm-like creatures causing damage to my roses and other flowers. Can you help me identify what the problem is? Based on your expertise as a florist AI assistant, I think it\n# might be pests or diseases, but I'm not sure which.\"\n#             },\n#             {\n#                 'role': 'assistant',\n#                 'content': \"I'd be delighted to help you investigate the issue! Since you've noticed worm-like creatures damaging your roses and other flowers, I'll take a closer look at the possibilities. Here are a few potential culprits: 1.\n# **Aphids**: These small, soft-bodied insects can secrete a sticky substance called\"\n#             }\n#         ]\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/magpie/#references","title":"References","text":"
  • Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing
"},{"location":"components-gallery/tasks/selfinstruct/","title":"SelfInstruct","text":"

Generate instructions based on a given input using an LLM.

SelfInstruct is a pre-defined task that, given a number of instructions, certain criteria for query generation, an application description, and an input, generates a number of instructions related to the given input and following what is stated in the criteria for query generation and the application description. It is based on the SelfInstruct framework from the paper \"Self-Instruct: Aligning Language Models with Self-Generated Instructions\".

"},{"location":"components-gallery/tasks/selfinstruct/#attributes","title":"Attributes","text":"
  • num_instructions: The number of instructions to be generated. Defaults to 5.

  • criteria_for_query_generation: The criteria for the query generation. Defaults to the criteria defined within the paper.

  • application_description: The description of the AI application that one wants to build with these instructions. Defaults to AI assistant.

"},{"location":"components-gallery/tasks/selfinstruct/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[input]\n        end\n        subgraph New columns\n            OCOL0[instructions]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph SelfInstruct\n        StepInput[Input Columns: input]\n        StepOutput[Output Columns: instructions, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/selfinstruct/#inputs","title":"Inputs","text":"
  • input (str): The input to generate the instructions. It's also called seed in the paper.
"},{"location":"components-gallery/tasks/selfinstruct/#outputs","title":"Outputs","text":"
  • instructions (List[str]): The generated instructions.

  • model_name (str): The model name used to generate the instructions.

"},{"location":"components-gallery/tasks/selfinstruct/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/selfinstruct/#generate-instructions-based-on-a-given-input","title":"Generate instructions based on a given input","text":"
from distilabel.steps.tasks import SelfInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\nself_instruct = SelfInstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_instructions=5,  # This is the default value\n)\n\nself_instruct.load()\n\nresult = next(self_instruct.process([{\"input\": \"instruction\"}]))\n# result\n# [\n#     {\n#         'input': 'instruction',\n#         'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n#         'instructions': [\"instruction 1\", \"instruction 2\", \"instruction 3\", \"instruction 4\", \"instruction 5\"],\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/textgeneration/","title":"TextGeneration","text":"

Text generation with an LLM given a prompt.

TextGeneration is a pre-defined task that allows passing a custom prompt using the Jinja2 syntax. By default, an instruction is expected in the inputs, but using the template and columns attributes one can define a custom prompt and columns expected from the text. This task should be good enough for tasks that don't need post-processing of the responses generated by the LLM.

"},{"location":"components-gallery/tasks/textgeneration/#attributes","title":"Attributes","text":"
  • system_prompt: The system prompt to use in the generation. If not provided, then it will check if the input row has a column named system_prompt and use it. If not, then no system prompt will be used. Defaults to None.

  • template: The template to use for the generation. It must follow the Jinja2 template syntax. If not provided, it will assume the text passed is an instruction and construct the appropriate template.

  • columns: A string with the column, or a list with columns expected in the template. Take a look at the examples for more information. Defaults to instruction.

  • use_system_prompt: DEPRECATED. To be removed in 1.5.0. Whether to use the system prompt in the generation. Defaults to True, which means that if the column system_prompt is defined within the input batch, then the system_prompt will be used, otherwise, it will be ignored.

"},{"location":"components-gallery/tasks/textgeneration/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[dynamic]\n        end\n        subgraph New columns\n            OCOL0[generation]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph TextGeneration\n        StepInput[Input Columns: dynamic]\n        StepOutput[Output Columns: generation, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/textgeneration/#inputs","title":"Inputs","text":"
  • dynamic (determined by columns attribute): By default will be set to instruction. The columns can point both to a str or a List[str] to be used in the template.
"},{"location":"components-gallery/tasks/textgeneration/#outputs","title":"Outputs","text":"
  • generation (str): The generated text.

  • model_name (str): The name of the model used to generate the text.

"},{"location":"components-gallery/tasks/textgeneration/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/textgeneration/#generate-text-from-an-instruction","title":"Generate text from an instruction","text":"
from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\ntext_gen = TextGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    )\n)\n\ntext_gen.load()\n\nresult = next(\n    text_gen.process(\n        [{\"instruction\": \"your instruction\"}]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'your instruction',\n#         'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n#         'generation': 'generation',\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/textgeneration/#use-a-custom-template-to-generate-text","title":"Use a custom template to generate text","text":"
from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nCUSTOM_TEMPLATE = '''Document:\n{{ document }}\n\nQuestion: {{ question }}\n\nPlease provide a clear and concise answer to the question based on the information in the document and your general knowledge:\n'''.rstrip()\n\ntext_gen = TextGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    system_prompt=\"You are a helpful AI assistant. Your task is to answer the following question based on the provided document. If the answer is not explicitly stated in the document, use your knowledge to provide the most relevant and accurate answer possible. If you cannot answer the question based on the given information, state that clearly.\",\n    template=CUSTOM_TEMPLATE,\n    columns=[\"document\", \"question\"],\n)\n\ntext_gen.load()\n\nresult = next(\n    text_gen.process(\n        [\n            {\n                \"document\": \"The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.\",\n                \"question\": \"What is the main threat to the Great Barrier Reef mentioned in the document?\"\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'document': 'The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. 
However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.',\n#         'question': 'What is the main threat to the Great Barrier Reef mentioned in the document?',\n#         'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n#         'generation': 'According to the document, the main threat to the Great Barrier Reef is climate change, specifically rising sea temperatures causing coral bleaching events.',\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/textgeneration/#few-shot-learning-with-different-system-prompts","title":"Few shot learning with different system prompts","text":"
from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nCUSTOM_TEMPLATE = '''Generate a clear, single-sentence instruction based on the following examples:\n\n{% for example in examples %}\nExample {{ loop.index }}:\nInstruction: {{ example }}\n\n{% endfor %}\nNow, generate a new instruction in a similar style:\n'''.rstrip()\n\ntext_gen = TextGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    template=CUSTOM_TEMPLATE,\n    columns=\"examples\",\n)\n\ntext_gen.load()\n\nresult = next(\n    text_gen.process(\n        [\n            {\n                \"examples\": [\"This is an example\", \"Another relevant example\"],\n                \"system_prompt\": \"You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.\"\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'examples': ['This is an example', 'Another relevant example'],\n#         'system_prompt': 'You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.',\n#         'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n#         'generation': 'Disable the firewall on the router',\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/textgeneration/#references","title":"References","text":"
  • Jinja2 Template Designer Documentation
"},{"location":"components-gallery/tasks/urial/","title":"URIAL","text":"

Generates a response using a non-instruct fine-tuned model.

URIAL is a pre-defined task that generates a response using a non-instruct fine-tuned model. This task is used to generate a response based on the conversation provided as input.

"},{"location":"components-gallery/tasks/urial/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[conversation]\n        end\n        subgraph New columns\n            OCOL0[generation]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph URIAL\n        StepInput[Input Columns: instruction, conversation]\n        StepOutput[Output Columns: generation, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/urial/#inputs","title":"Inputs","text":"
  • instruction (str, optional): The instruction to generate a response from.

  • conversation (List[Dict[str, str]], optional): The conversation to generate a response from (the last message must be from the user).

"},{"location":"components-gallery/tasks/urial/#outputs","title":"Outputs","text":"
  • generation (str): The generated response.

  • model_name (str): The name of the model used to generate the response.

"},{"location":"components-gallery/tasks/urial/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/urial/#generate-text-from-an-instruction","title":"Generate text from an instruction","text":"
from distilabel.models import vLLM\nfrom distilabel.steps.tasks import URIAL\n\nstep = URIAL(\n    llm=vLLM(\n        model=\"meta-llama/Meta-Llama-3.1-8B\",\n        generation_kwargs={\"temperature\": 0.7},\n    ),\n)\n\nstep.load()\n\nresults = next(\n    step.process(inputs=[{\"instruction\": \"What's the most most common type of cloud?\"}])\n)\n# [\n#     {\n#         'instruction': \"What's the most most common type of cloud?\",\n#         'generation': 'Clouds are classified into three main types, high, middle, and low. The most common type of cloud is the middle cloud.',\n#         'distilabel_metadata': {...},\n#         'model_name': 'meta-llama/Meta-Llama-3.1-8B'\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/urial/#references","title":"References","text":"
  • The Unlocking Spell on Base LLMs: Rethinking Alignment via In-Context Learning
"},{"location":"components-gallery/tasks/magpiegenerator/","title":"MagpieGenerator","text":"

Generator task that generates instructions or conversations using Magpie.

Magpie is a neat method that allows generating user instructions with no seed data or specific system prompt thanks to the autoregressive capabilities of the instruct fine-tuned LLMs. As they were fine-tuned using a chat template composed of a user message and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the LLM without any user message, then the LLM will continue generating tokens as if it were the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM. After this instruction is generated, it can be sent again to the LLM, this time to generate an assistant response. This process can be repeated N times, allowing a multi-turn conversation to be built. This method was described in the paper 'Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing'.

"},{"location":"components-gallery/tasks/magpiegenerator/#attributes","title":"Attributes","text":"
  • n_turns: the number of turns that the generated conversation will have. Defaults to 1.

  • end_with_user: whether the conversation should end with a user message. Defaults to False.

  • include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False.

  • only_instruction: whether to generate only the instruction. If this argument is True, then n_turns will be ignored. Defaults to False.

  • system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Defaults to None.

  • num_rows: the number of rows to be generated.

"},{"location":"components-gallery/tasks/magpiegenerator/#runtime-parameters","title":"Runtime Parameters","text":"
  • n_turns: the number of turns that the generated conversation will have. Defaults to 1.

  • end_with_user: whether the conversation should end with a user message. Defaults to False.

  • include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False.

  • only_instruction: whether to generate only the instruction. If this argument is True, then n_turns will be ignored. Defaults to False.

  • system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic.

  • num_rows: the number of rows to be generated.

"},{"location":"components-gallery/tasks/magpiegenerator/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[conversation]\n            OCOL1[instruction]\n            OCOL2[response]\n            OCOL3[system_prompt_key]\n            OCOL4[model_name]\n        end\n    end\n\n    subgraph MagpieGenerator\n        StepOutput[Output Columns: conversation, instruction, response, system_prompt_key, model_name]\n    end\n\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n    StepOutput --> OCOL4\n
"},{"location":"components-gallery/tasks/magpiegenerator/#outputs","title":"Outputs","text":"
  • conversation (ChatType): the generated conversation which is a list of chat items with a role and a message.

  • instruction (str): the generated instructions if only_instruction=True.

  • response (str): the generated response if n_turns==1.

  • system_prompt_key (str, optional): the key of the system prompt used to generate the conversation or instruction. Only if system_prompt is a dictionary.

  • model_name (str): The model name used to generate the conversation or instruction.

"},{"location":"components-gallery/tasks/magpiegenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/magpiegenerator/#generating-instructions-with-llama-3-8b-instruct-and-transformersllm","title":"Generating instructions with Llama 3 8B Instruct and TransformersLLM","text":"
from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\ngenerator = MagpieGenerator(\n    llm=TransformersLLM(\n        model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 1.0,\n            \"max_new_tokens\": 256,\n        },\n        device=\"mps\",\n    ),\n    only_instruction=True,\n    num_rows=5,\n)\n\ngenerator.load()\n\nresult = next(generator.process())\n# (\n#       [\n#           {\"instruction\": \"I've just bought a new phone and I're excited to start using it.\"},\n#           {\"instruction\": \"What are the most common types of companies that use digital signage?\"}\n#       ],\n#       True\n# )\n
"},{"location":"components-gallery/tasks/magpiegenerator/#generating-a-conversation-with-llama-3-8b-instruct-and-transformersllm","title":"Generating a conversation with Llama 3 8B Instruct and TransformersLLM","text":"
from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\ngenerator = MagpieGenerator(\n    llm=TransformersLLM(\n        model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 1.0,\n            \"max_new_tokens\": 64,\n        },\n        device=\"mps\",\n    ),\n    n_turns=3,\n    num_rows=5,\n)\n\ngenerator.load()\n\nresult = next(generator.process())\n# (\n#     [\n#         {\n#             'conversation': [\n#                 {\n#                     'role': 'system',\n#                     'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n# insightful responses to help the user with their queries.'\n#                 },\n#                 {'role': 'user', 'content': \"I'm considering starting a social media campaign for my small business and I're not sure where to start. Can you help?\"},\n#                 {\n#                     'role': 'assistant',\n#                     'content': \"Exciting endeavor! Creating a social media campaign can be a great way to increase brand awareness, drive website traffic, and ultimately boost sales. I'd be happy to guide you through the process. To get started,\n# let's break down the basics. First, we need to identify your goals and target audience. What do\"\n#                 },\n#                 {\n#                     'role': 'user',\n#                     'content': \"Before I start a social media campaign, what kind of costs ammol should I expect to pay? There are several factors that contribute to the total cost of running a social media campaign. Let me outline some of the main\n# expenses you might encounter: 1. 
Time: As the business owner, you'll likely spend time creating\"\n#                 },\n#                 {\n#                     'role': 'assistant',\n#                     'content': 'Time is indeed one of the biggest investments when it comes to running a social media campaign! Besides time, you may also incur costs associated with: 2. Content creation: You might need to hire freelancers or\n# agencies to create high-quality content (images, videos, captions) for your social media platforms. 3. Advertising'\n#                 }\n#             ]\n#         },\n#         {\n#             'conversation': [\n#                 {\n#                     'role': 'system',\n#                     'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n# insightful responses to help the user with their queries.'\n#                 },\n#                 {'role': 'user', 'content': \"I am thinking of buying a new laptop or computer. What are some important factors I should consider when making your decision? I'll make sure to let you know if any other favorites or needs come up!\"},\n#                 {\n#                     'role': 'assistant',\n#                     'content': 'Exciting times ahead! When considering a new laptop or computer, there are several key factors to think about to ensure you find the right one for your needs. Here are some crucial ones to get you started: 1.\n# **Purpose**: How will you use your laptop or computer? For work, gaming, video editing,'\n#                 },\n#                 {\n#                     'role': 'user',\n#                     'content': 'Let me stop you there. Let's explore this \"purpose\" factor that you mentioned earlier. Can you elaborate more on what type of devices would be suitable for different purposes? 
For example, if I're primarily using my\n# laptop for general usage like browsing, email, and word processing, would a budget-friendly laptop be sufficient'\n#                 },\n#                 {\n#                     'role': 'assistant',\n#                     'content': \"Understanding your purpose can greatly impact the type of device you'll need. **General Usage (Browsing, Email, Word Processing)**: For casual users who mainly use their laptop for daily tasks, a budget-friendly\n# option can be sufficient. Look for laptops with: * Intel Core i3 or i5 processor* \"\n#                 }\n#             ]\n#         }\n#     ],\n#     True\n# )\n
"},{"location":"components-gallery/tasks/magpiegenerator/#generating-with-system-prompts-with-probabilities","title":"Generating with system prompts with probabilities","text":"
from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\nmagpie = MagpieGenerator(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 0.8,\n            \"max_new_tokens\": 256,\n        },\n    ),\n    n_turns=2,\n    system_prompt={\n        \"math\": (\"You're an expert AI assistant.\", 0.8),\n        \"writing\": (\"You're an expert writing assistant.\", 0.2),\n    },\n)\n\nmagpie.load()\n\nresult = next(magpie.process())\n
"},{"location":"components-gallery/tasks/magpiegenerator/#references","title":"References","text":"
  • Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing
"},{"location":"components-gallery/tasks/chatgeneration/","title":"ChatGeneration","text":"

Generates text based on a conversation.

ChatGeneration is a pre-defined task that defines the messages as the input and generation as the output. This task is used to generate text based on a conversation. The model_name is also returned as part of the output in order to enhance it.

"},{"location":"components-gallery/tasks/chatgeneration/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[messages]\n        end\n        subgraph New columns\n            OCOL0[generation]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph ChatGeneration\n        StepInput[Input Columns: messages]\n        StepOutput[Output Columns: generation, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/chatgeneration/#inputs","title":"Inputs","text":"
  • messages (List[Dict[Literal[\"role\", \"content\"], str]]): The messages to generate the follow up completion from.
"},{"location":"components-gallery/tasks/chatgeneration/#outputs","title":"Outputs","text":"
  • generation (str): The generated text from the assistant.

  • model_name (str): The model name used to generate the text.

"},{"location":"components-gallery/tasks/chatgeneration/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/chatgeneration/#generate-text-from-a-conversation-in-openai-chat-format","title":"Generate text from a conversation in OpenAI chat format","text":"
from distilabel.steps.tasks import ChatGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nchat = ChatGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    )\n)\n\nchat.load()\n\nresult = next(\n    chat.process(\n        [\n            {\n                \"messages\": [\n                    {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n                ]\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'messages': [{'role': 'user', 'content': 'How much is 2+2?'}],\n#         'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n#         'generation': '4',\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/argillalabeller/","title":"ArgillaLabeller","text":"

Annotate Argilla records based on input fields, example records and question settings.

This task is designed to facilitate the annotation of Argilla records by leveraging a pre-trained LLM. It uses a system prompt that guides the LLM to understand the input fields, the question type, and the question settings. The task then formats the input data and generates a response based on the question. The response is validated against the question's value model, and the final suggestion is prepared for annotation.

"},{"location":"components-gallery/tasks/argillalabeller/#attributes","title":"Attributes","text":"
  • _template: a Jinja2 template used to format the input for the LLM.
"},{"location":"components-gallery/tasks/argillalabeller/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[record]\n            ICOL1[fields]\n            ICOL2[question]\n            ICOL3[example_records]\n            ICOL4[guidelines]\n        end\n        subgraph New columns\n            OCOL0[suggestion]\n        end\n    end\n\n    subgraph ArgillaLabeller\n        StepInput[Input Columns: record, fields, question, example_records, guidelines]\n        StepOutput[Output Columns: suggestion]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    ICOL3 --> StepInput\n    ICOL4 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/argillalabeller/#inputs","title":"Inputs","text":"
  • record (argilla.Record): The record to be annotated.

  • fields (Optional[List[Dict[str, Any]]]): The list of field settings for the input fields.

  • question (Optional[Dict[str, Any]]): The question settings for the question to be answered.

  • example_records (Optional[List[Dict[str, Any]]]): The few shot example records with responses to be used to answer the question.

  • guidelines (Optional[str]): The guidelines for the annotation task.

"},{"location":"components-gallery/tasks/argillalabeller/#outputs","title":"Outputs","text":"
  • suggestion (Dict[str, Any]): The final suggestion for annotation.
"},{"location":"components-gallery/tasks/argillalabeller/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/argillalabeller/#annotate-a-record-with-the-same-dataset-and-question","title":"Annotate a record with the same dataset and question","text":"
import argilla as rg\nfrom argilla import Suggestion\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Get information from Argilla dataset definition\ndataset = rg.Dataset(\"my_dataset\")\npending_records_filter = rg.Filter((\"status\", \"==\", \"pending\"))\ncompleted_records_filter = rg.Filter((\"status\", \"==\", \"completed\"))\npending_records = list(\n    dataset.records(\n        query=rg.Query(filter=pending_records_filter),\n        limit=5,\n    )\n)\nexample_records = list(\n    dataset.records(\n        query=rg.Query(filter=completed_records_filter),\n        limit=5,\n    )\n)\nfield = dataset.settings.fields[\"text\"]\nquestion = dataset.settings.questions[\"label\"]\n\n# Initialize the labeller with the model and fields\nlabeller = ArgillaLabeller(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    fields=[field],\n    question=question,\n    example_records=example_records,\n    guidelines=dataset.guidelines\n)\nlabeller.load()\n\n# Process the pending records\nresult = next(\n    labeller.process(\n        [\n            {\n                \"record\": record\n            } for record in pending_records\n        ]\n    )\n)\n\n# Add the suggestions to the records\nfor record, suggestion in zip(pending_records, result):\n    record.suggestions.add(Suggestion(**suggestion[\"suggestion\"]))\n\n# Log the updated records\ndataset.records.log(pending_records)\n
"},{"location":"components-gallery/tasks/argillalabeller/#annotate-a-record-with-alternating-datasets-and-questions","title":"Annotate a record with alternating datasets and questions","text":"
import argilla as rg\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Get information from Argilla dataset definition\ndataset = rg.Dataset(\"my_dataset\")\nfield = dataset.settings.fields[\"text\"]\nquestion = dataset.settings.questions[\"label\"]\nquestion2 = dataset.settings.questions[\"label2\"]\n\n# Initialize the labeller with the model and fields\nlabeller = ArgillaLabeller(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    )\n)\nlabeller.load()\n\n# Process the record\nrecord = next(dataset.records())\nresult = next(\n    labeller.process(\n        [\n            {\n                \"record\": record,\n                \"fields\": [field],\n                \"question\": question,\n            },\n            {\n                \"record\": record,\n                \"fields\": [field],\n                \"question\": question2,\n            }\n        ]\n    )\n)\n\n# Add the suggestions to the record\nfor suggestion in result:\n    record.suggestions.add(rg.Suggestion(**suggestion[\"suggestion\"]))\n\n# Log the updated record\ndataset.records.log([record])\n
"},{"location":"components-gallery/tasks/argillalabeller/#overwrite-default-prompts-and-instructions","title":"Overwrite default prompts and instructions","text":"
import argilla as rg\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Overwrite default prompts and instructions\nlabeller = ArgillaLabeller(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    system_prompt=\"You are an expert annotator and labelling assistant that understands complex domains and natural language processing.\",\n    question_to_label_instruction={\n        \"label_selection\": \"Select the appropriate label from the list of provided labels.\",\n        \"multi_label_selection\": \"Select none, one or multiple labels from the list of provided labels.\",\n        \"text\": \"Provide a text response to the question.\",\n        \"rating\": \"Provide a rating for the question.\",\n    },\n)\nlabeller.load()\n
"},{"location":"components-gallery/tasks/argillalabeller/#references","title":"References","text":"
  • Argilla: Argilla is a collaboration tool for AI engineers and domain experts to build high-quality datasets
"},{"location":"components-gallery/tasks/textclassification/","title":"TextClassification","text":"

Classifies text into one or more categories or labels.

This task can be used for text classification problems, where the goal is to assign one or multiple labels to a given text. By default, it uses structured generation as described in the reference paper, which can help to generate more concise labels. See section 4.1 in the reference.

"},{"location":"components-gallery/tasks/textclassification/#attributes","title":"Attributes","text":"
  • system_prompt: A prompt to display to the user before the task starts. Contains a default message to make the model behave like a classifier specialist.

  • n: Number of labels to generate. If only 1 is required, it corresponds to a single-label classification problem; if >1, it will return the \"n\" labels most representative of the text. Defaults to 1.

  • context: Context to use when generating the labels. By default contains a generic message, but can be used to customize the context for the task.

  • examples: List of examples to help the model understand the task, few shots.

  • available_labels: List of available labels to choose from when classifying the text, or a dictionary with the labels and their descriptions.

  • default_label: Default label to use when the text is ambiguous or lacks sufficient information for classification. Can be a list in case of multiple labels (n>1).

"},{"location":"components-gallery/tasks/textclassification/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[text]\n        end\n        subgraph New columns\n            OCOL0[labels]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph TextClassification\n        StepInput[Input Columns: text]\n        StepOutput[Output Columns: labels, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/textclassification/#inputs","title":"Inputs","text":"
  • text (str): The reference text we want to obtain labels for.
"},{"location":"components-gallery/tasks/textclassification/#outputs","title":"Outputs","text":"
  • labels (Union[str, List[str]]): The label or list of labels for the text.

  • model_name (str): The name of the model used to generate the label/s.

"},{"location":"components-gallery/tasks/textclassification/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/textclassification/#assigning-a-sentiment-to-a-text","title":"Assigning a sentiment to a text","text":"
from distilabel.steps.tasks import TextClassification\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n)\n\ntext_classification = TextClassification(\n    llm=llm,\n    context=\"You are an AI system specialized in assigning sentiment to movies.\",\n    available_labels=[\"positive\", \"negative\"],\n)\n\ntext_classification.load()\n\nresult = next(\n    text_classification.process(\n        [{\"text\": \"This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\"}]\n    )\n)\n# result\n# [{'text': 'This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.',\n# 'labels': 'positive',\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n    \"labels\": \"positive\"\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n#     'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n#     {'role': 'user',\n#     'content': '# Instruction\\nPlease classify the user query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nYou are an AI system specialized in assigning sentiment to movie the user queries.\\n## Labeling the user input\\nUse the available labels to classify the user query. 
Analyze the context of each label specifically:\\navailable_labels = [\\n    \"positive\",  # The text shows positive sentiment\\n    \"negative\",  # The text shows negative sentiment\\n]\\n\\n\\n## User Query\\n```\\nThis was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n    \"labels\": \"label\"\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/textclassification/#assigning-predefined-labels-with-specified-descriptions","title":"Assigning predefined labels with specified descriptions","text":"
from distilabel.steps.tasks import TextClassification\n\ntext_classification = TextClassification(\n    llm=llm,\n    n=1,\n    context=\"Determine the intent of the text.\",\n    available_labels={\n        \"complaint\": \"A statement expressing dissatisfaction or annoyance about a product, service, or experience. It's a negative expression of discontent, often with the intention of seeking a resolution or compensation.\",\n        \"inquiry\": \"A question or request for information about a product, service, or situation. It's a neutral or curious expression seeking clarification or details.\",\n        \"feedback\": \"A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\",\n        \"praise\": \"A statement expressing admiration, approval, or appreciation for a product, service, or experience. It's a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\"\n    },\n    query_title=\"Customer Query\",\n)\n\ntext_classification.load()\n\nresult = next(\n    text_classification.process(\n        [{\"text\": \"Can you tell me more about your return policy?\"}]\n    )\n)\n# result\n# [{'text': 'Can you tell me more about your return policy?',\n# 'labels': 'inquiry',\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n    \"labels\": \"inquiry\"\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n#     'content': 'You are an AI system specialized in generating labels to classify pieces of text. 
Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n#     {'role': 'user',\n#     'content': '# Instruction\\nPlease classify the customer query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nDetermine the intent of the text.\\n## Labeling the user input\\nUse the available labels to classify the user query. Analyze the context of each label specifically:\\navailable_labels = [\\n    \"complaint\",  # A statement expressing dissatisfaction or annoyance about a product, service, or experience. It\\'s a negative expression of discontent, often with the intention of seeking a resolution or compensation.\\n    \"inquiry\",  # A question or request for information about a product, service, or situation. It\\'s a neutral or curious expression seeking clarification or details.\\n    \"feedback\",  # A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\\n    \"praise\",  # A statement expressing admiration, approval, or appreciation for a product, service, or experience. It\\'s a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\\n]\\n\\n\\n## Customer Query\\n```\\nCan you tell me more about your return policy?\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n    \"labels\": \"label\"\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/textclassification/#free-multi-label-classification-without-predefined-labels","title":"Free multi label classification without predefined labels","text":"
from distilabel.steps.tasks import TextClassification\n\ntext_classification = TextClassification(\n    llm=llm,\n    n=3,\n    context=(\n        \"Describe the main themes, topics, or categories that could describe the \"\n        \"following type of persona.\"\n    ),\n    query_title=\"Example of Persona\",\n)\n\ntext_classification.load()\n\nresult = next(\n    text_classification.process(\n        [{\"text\": \"A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\"}]\n    )\n)\n# result\n# [{'text': 'A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.',\n# 'labels': ['Historical Researcher',\n# 'Cultural Specialist',\n# 'Ethnic Studies Expert'],\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n    \"labels\": [\"Historical Researcher\", \"Cultural Specialist\", \"Ethnic Studies Expert\"]\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n#     'content': 'You are an AI system specialized in generating labels to classify pieces of text. 
Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n#     {'role': 'user',\n#     'content': '# Instruction\\nPlease classify the example of persona by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide a list of 3 labels that best describe the text.\\nDescribe the main themes, topics, or categories that could describe the following type of persona.\\nUse clear, widely understood terms for labels.Avoid overly specific or obscure labels unless the text demands it.\\n\\n\\n## Example of Persona\\n```\\nA historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n    \"labels\": [\"label_0\", \"label_1\", \"label_2\"]\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/textclassification/#references","title":"References","text":"
  • Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models
"},{"location":"components-gallery/tasks/evolinstruct/","title":"EvolInstruct","text":"

Evolve instructions using an LLM.

WizardLM: Empowering Large Language Models to Follow Complex Instructions

"},{"location":"components-gallery/tasks/evolinstruct/#attributes","title":"Attributes","text":"
  • num_evolutions: The number of evolutions to be performed.

  • store_evolutions: Whether to store all the evolutions or just the last one. Defaults to False.

  • generate_answers: Whether to generate answers for the evolved instructions. Defaults to False.

  • include_original_instruction: Whether to include the original instruction in the evolved_instructions output column. Defaults to False.

  • mutation_templates: The mutation templates to be used for evolving the instructions. Defaults to the ones provided in the utils.py file.

  • seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

"},{"location":"components-gallery/tasks/evolinstruct/#runtime-parameters","title":"Runtime Parameters","text":"
  • seed: The seed to be set for numpy in order to randomly pick a mutation method.
"},{"location":"components-gallery/tasks/evolinstruct/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n        end\n        subgraph New columns\n            OCOL0[evolved_instruction]\n            OCOL1[evolved_instructions]\n            OCOL2[model_name]\n            OCOL3[answer]\n            OCOL4[answers]\n        end\n    end\n\n    subgraph EvolInstruct\n        StepInput[Input Columns: instruction]\n        StepOutput[Output Columns: evolved_instruction, evolved_instructions, model_name, answer, answers]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n    StepOutput --> OCOL4\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/evolinstruct/#inputs","title":"Inputs","text":"
  • instruction (str): The instruction to evolve.
"},{"location":"components-gallery/tasks/evolinstruct/#outputs","title":"Outputs","text":"
  • evolved_instruction (str): The evolved instruction if store_evolutions=False.

  • evolved_instructions (List[str]): The evolved instructions if store_evolutions=True.

  • model_name (str): The name of the LLM used to evolve the instructions.

  • answer (str): The answer to the evolved instruction if generate_answers=True and store_evolutions=False.

  • answers (List[str]): The answers to the evolved instructions if generate_answers=True and store_evolutions=True.

"},{"location":"components-gallery/tasks/evolinstruct/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolinstruct/#evolve-an-instruction-using-an-llm","title":"Evolve an instruction using an LLM","text":"
from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n
"},{"location":"components-gallery/tasks/evolinstruct/#keep-the-iterations-of-the-evolutions","title":"Keep the iterations of the evolutions","text":"
from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n    store_evolutions=True,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [\n#     {\n#         'instruction': 'common instruction',\n#         'evolved_instructions': ['initial evolution', 'final evolution'],\n#         'model_name': 'model_name'\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/evolinstruct/#generate-answers-for-the-instructions-in-a-single-step","title":"Generate answers for the instructions in a single step","text":"
from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n    generate_answers=True,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [\n#     {\n#         'instruction': 'common instruction',\n#         'evolved_instruction': 'evolved instruction',\n#         'answer': 'answer to the instruction',\n#         'model_name': 'model_name'\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/evolinstruct/#references","title":"References","text":"
  • WizardLM: Empowering Large Language Models to Follow Complex Instructions

  • GitHub: h2oai/h2o-wizardlm

"},{"location":"components-gallery/tasks/evolcomplexity/","title":"EvolComplexity","text":"

Evolve instructions to make them more complex using an LLM.

EvolComplexity is a task that evolves instructions to make them more complex. It is based on the EvolInstruct task, using slightly different prompts but the exact same evolutionary approach.

"},{"location":"components-gallery/tasks/evolcomplexity/#attributes","title":"Attributes","text":"
  • num_instructions: The number of instructions to be generated.

  • generate_answers: Whether to generate answers for the instructions or not. Defaults to False.

  • mutation_templates: The mutation templates to be used for the generation of the instructions.

  • min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512.

  • max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024.

  • seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

"},{"location":"components-gallery/tasks/evolcomplexity/#runtime-parameters","title":"Runtime Parameters","text":"
  • min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.

  • max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.

  • seed: The seed to be set for numpy in order to randomly pick a mutation method.

"},{"location":"components-gallery/tasks/evolcomplexity/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n        end\n        subgraph New columns\n            OCOL0[evolved_instruction]\n            OCOL1[answer]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph EvolComplexity\n        StepInput[Input Columns: instruction]\n        StepOutput[Output Columns: evolved_instruction, answer, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/evolcomplexity/#inputs","title":"Inputs","text":"
  • instruction (str): The instruction to evolve.
"},{"location":"components-gallery/tasks/evolcomplexity/#outputs","title":"Outputs","text":"
  • evolved_instruction (str): The evolved instruction.

  • answer (str, optional): The answer to the instruction if generate_answers=True.

  • model_name (str): The name of the LLM used to evolve the instructions.

"},{"location":"components-gallery/tasks/evolcomplexity/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolcomplexity/#evolve-an-instruction-using-an-llm","title":"Evolve an instruction using an LLM","text":"
from distilabel.steps.tasks import EvolComplexity\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_complexity = EvolComplexity(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n)\n\nevol_complexity.load()\n\nresult = next(evol_complexity.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n
"},{"location":"components-gallery/tasks/evolcomplexity/#references","title":"References","text":"
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning

  • WizardLM: Empowering Large Language Models to Follow Complex Instructions

"},{"location":"components-gallery/tasks/evolquality/","title":"EvolQuality","text":"

Evolve the quality of the responses using an LLM.

EvolQuality task is used to evolve the quality of the responses given a prompt, by generating a new response with a language model. This step implements the evolution quality task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.

"},{"location":"components-gallery/tasks/evolquality/#attributes","title":"Attributes","text":"
  • num_evolutions: The number of evolutions to be performed on the responses.

  • store_evolutions: Whether to store all the evolved responses or just the last one. Defaults to False.

  • include_original_response: Whether to include the original response within the evolved responses. Defaults to False.

  • mutation_templates: The mutation templates to be used to evolve the responses.

  • seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

"},{"location":"components-gallery/tasks/evolquality/#runtime-parameters","title":"Runtime Parameters","text":"
  • seed: The seed to be set for numpy in order to randomly pick a mutation method.
"},{"location":"components-gallery/tasks/evolquality/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[response]\n        end\n        subgraph New columns\n            OCOL0[evolved_response]\n            OCOL1[evolved_responses]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph EvolQuality\n        StepInput[Input Columns: instruction, response]\n        StepOutput[Output Columns: evolved_response, evolved_responses, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/evolquality/#inputs","title":"Inputs","text":"
  • instruction (str): The instruction that was used to generate the responses.

  • response (str): The response to be rewritten.

"},{"location":"components-gallery/tasks/evolquality/#outputs","title":"Outputs","text":"
  • evolved_response (str): The evolved response if store_evolutions=False.

  • evolved_responses (List[str]): The evolved responses if store_evolutions=True.

  • model_name (str): The name of the LLM used to evolve the responses.

"},{"location":"components-gallery/tasks/evolquality/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolquality/#evolve-the-quality-of-the-responses-given-a-prompt","title":"Evolve the quality of the responses given a prompt","text":"
from distilabel.steps.tasks import EvolQuality\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_quality = EvolQuality(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n)\n\nevol_quality.load()\n\nresult = next(\n    evol_quality.process(\n        [\n            {\"instruction\": \"common instruction\", \"response\": \"a response\"},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'common instruction',\n#         'response': 'a response',\n#         'evolved_response': 'evolved response',\n#         'model_name': '\"mistralai/Mistral-7B-Instruct-v0.2\"'\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/evolquality/#references","title":"References","text":"
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
"},{"location":"components-gallery/tasks/evolinstructgenerator/","title":"EvolInstructGenerator","text":"

Generate evolved instructions using an LLM.

WizardLM: Empowering Large Language Models to Follow Complex Instructions

"},{"location":"components-gallery/tasks/evolinstructgenerator/#attributes","title":"Attributes","text":"
  • num_instructions: The number of instructions to be generated.

  • generate_answers: Whether to generate answers for the instructions or not. Defaults to False.

  • mutation_templates: The mutation templates to be used for the generation of the instructions.

  • min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512.

  • max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024.

  • seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

"},{"location":"components-gallery/tasks/evolinstructgenerator/#runtime-parameters","title":"Runtime Parameters","text":"
  • min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.

  • max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.

  • seed: The seed to be set for numpy in order to randomly pick a mutation method.

"},{"location":"components-gallery/tasks/evolinstructgenerator/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[instruction]\n            OCOL1[answer]\n            OCOL2[instructions]\n            OCOL3[model_name]\n        end\n    end\n\n    subgraph EvolInstructGenerator\n        StepOutput[Output Columns: instruction, answer, instructions, model_name]\n    end\n\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n
"},{"location":"components-gallery/tasks/evolinstructgenerator/#outputs","title":"Outputs","text":"
  • instruction (str): The generated instruction if generate_answers=False.

  • answer (str): The generated answer if generate_answers=True.

  • instructions (List[str]): The generated instructions if generate_answers=True.

  • model_name (str): The name of the LLM used to generate and evolve the instructions.

"},{"location":"components-gallery/tasks/evolinstructgenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolinstructgenerator/#generate-evolved-instructions-without-initial-instructions","title":"Generate evolved instructions without initial instructions","text":"
from distilabel.steps.tasks import EvolInstructGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct_generator = EvolInstructGenerator(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_instructions=2,\n)\n\nevol_instruct_generator.load()\n\nresult = next(evol_instruct_generator.process())\n# result\n# [{'instruction': 'generated instruction', 'model_name': 'test'}]\n
"},{"location":"components-gallery/tasks/evolinstructgenerator/#references","title":"References","text":"
  • WizardLM: Empowering Large Language Models to Follow Complex Instructions

  • GitHub: h2oai/h2o-wizardlm

"},{"location":"components-gallery/tasks/evolcomplexitygenerator/","title":"EvolComplexityGenerator","text":"

Generate evolved instructions with increased complexity using an LLM.

EvolComplexityGenerator is a generation task that evolves instructions to make them more complex. It is based on the EvolInstruct task, using slightly different prompts but the exact same evolutionary approach.

"},{"location":"components-gallery/tasks/evolcomplexitygenerator/#attributes","title":"Attributes","text":"
  • num_instructions: The number of instructions to be generated.

  • generate_answers: Whether to generate answers for the instructions or not. Defaults to False.

  • mutation_templates: The mutation templates to be used for the generation of the instructions.

  • min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512.

  • max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024.

  • seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

"},{"location":"components-gallery/tasks/evolcomplexitygenerator/#runtime-parameters","title":"Runtime Parameters","text":"
  • min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.

  • max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.

  • seed: The seed to be set for numpy in order to randomly pick a mutation method.

"},{"location":"components-gallery/tasks/evolcomplexitygenerator/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[instruction]\n            OCOL1[answer]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph EvolComplexityGenerator\n        StepOutput[Output Columns: instruction, answer, model_name]\n    end\n\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n
"},{"location":"components-gallery/tasks/evolcomplexitygenerator/#outputs","title":"Outputs","text":"
  • instruction (str): The evolved instruction.

  • answer (str, optional): The answer to the instruction if generate_answers=True.

  • model_name (str): The name of the LLM used to evolve the instructions.

"},{"location":"components-gallery/tasks/evolcomplexitygenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolcomplexitygenerator/#generate-evolved-instructions-without-initial-instructions","title":"Generate evolved instructions without initial instructions","text":"
from distilabel.steps.tasks import EvolComplexityGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_complexity_generator = EvolComplexityGenerator(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_instructions=2,\n)\n\nevol_complexity_generator.load()\n\nresult = next(evol_complexity_generator.process())\n# result\n# [{'instruction': 'generated instruction', 'model_name': 'test'}]\n
"},{"location":"components-gallery/tasks/evolcomplexitygenerator/#references","title":"References","text":"
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning

  • WizardLM: Empowering Large Language Models to Follow Complex Instructions

"},{"location":"components-gallery/tasks/instructionbacktranslation/","title":"InstructionBacktranslation","text":"

Self-Alignment with Instruction Backtranslation.

"},{"location":"components-gallery/tasks/instructionbacktranslation/#attributes","title":"Attributes","text":"
  • _template: the Jinja2 template to use for the Instruction Backtranslation task.
"},{"location":"components-gallery/tasks/instructionbacktranslation/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[generation]\n        end\n        subgraph New columns\n            OCOL0[score]\n            OCOL1[reason]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph InstructionBacktranslation\n        StepInput[Input Columns: instruction, generation]\n        StepOutput[Output Columns: score, reason, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/instructionbacktranslation/#inputs","title":"Inputs","text":"
  • instruction (str): The reference instruction to evaluate the text output.

  • generation (str): The text output to evaluate for the given instruction.

"},{"location":"components-gallery/tasks/instructionbacktranslation/#outputs","title":"Outputs","text":"
  • score (str): The score for the generation based on the given instruction.

  • reason (str): The reason for the provided score.

  • model_name (str): The model name used to score the generation.

"},{"location":"components-gallery/tasks/instructionbacktranslation/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/instructionbacktranslation/#generate-a-score-and-reason-for-a-given-instruction-and-generation","title":"Generate a score and reason for a given instruction and generation","text":"
from distilabel.steps.tasks import InstructionBacktranslation\n\ninstruction_backtranslation = InstructionBacktranslation(\n        name=\"instruction_backtranslation\",\n        llm=llm,\n        input_batch_size=10,\n        output_mappings={\"model_name\": \"scoring_model\"},\n    )\ninstruction_backtranslation.load()\n\nresult = next(\n    instruction_backtranslation.process(\n        [\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"generation\": \"4\",\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         \"instruction\": \"How much is 2+2?\",\n#         \"generation\": \"4\",\n#         \"score\": 3,\n#         \"reason\": \"Reason for the generation.\",\n#         \"model_name\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/instructionbacktranslation/#references","title":"References","text":"
  • Self-Alignment with Instruction Backtranslation
"},{"location":"components-gallery/tasks/prometheuseval/","title":"PrometheusEval","text":"

Critique and rank the quality of generations from an LLM using Prometheus 2.0.

PrometheusEval is a task created for Prometheus 2.0, covering both the absolute and relative evaluations. The absolute evaluation, i.e. mode=\"absolute\", is used to evaluate a single generation from an LLM for a given instruction. The relative evaluation, i.e. mode=\"relative\", is used to evaluate two generations from an LLM for a given instruction. Both evaluations can be run with or without a reference answer to compare against, as controlled by the reference attribute, and both are based on a score rubric that critiques the generation/s on the following default aspects: helpfulness, harmlessness, honesty, factual-validity, and reasoning. These defaults can be overridden via rubrics, and the selected rubric is set via the attribute rubric.

"},{"location":"components-gallery/tasks/prometheuseval/#note","title":"Note","text":"

The PrometheusEval task is better suited and intended to be used with any of the Prometheus 2.0 models released by Kaist AI, being: https://huggingface.co/prometheus-eval/prometheus-7b-v2.0, and https://huggingface.co/prometheus-eval/prometheus-8x7b-v2.0. The critique assessment formatting and quality are not guaranteed if using another model, even though some other models may be able to correctly follow the formatting and generate insightful critiques too.

"},{"location":"components-gallery/tasks/prometheuseval/#attributes","title":"Attributes","text":"
  • mode: the evaluation mode to use, either absolute or relative. It defines whether the task will evaluate one or two generations.

  • rubric: the score rubric to use within the prompt to run the critique based on different aspects. Can be any existing key in the rubrics attribute, which by default means that it can be: helpfulness, harmlessness, honesty, factual-validity, or reasoning. Those will only work if using the default rubrics, otherwise, the provided rubrics should be used.

  • rubrics: a dictionary containing the different rubrics to use for the critique, where the keys are the rubric names and the values are the rubric descriptions. The default rubrics are the following: helpfulness, harmlessness, honesty, factual-validity, and reasoning.

  • reference: a boolean flag to indicate whether a reference answer / completion will be provided, so that the model critique is based on the comparison with it. It implies that the column reference needs to be provided within the input data in addition to the rest of the inputs.

  • _template: a Jinja2 template used to format the input for the LLM.

"},{"location":"components-gallery/tasks/prometheuseval/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[generation]\n            ICOL2[generations]\n            ICOL3[reference]\n        end\n        subgraph New columns\n            OCOL0[feedback]\n            OCOL1[result]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph PrometheusEval\n        StepInput[Input Columns: instruction, generation, generations, reference]\n        StepOutput[Output Columns: feedback, result, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    ICOL3 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/prometheuseval/#inputs","title":"Inputs","text":"
  • instruction (str): The instruction to use as reference.

  • generation (str, optional): The generated text from the given instruction. This column is required if mode=absolute.

  • generations (List[str], optional): The generated texts from the given instruction. It should contain 2 generations only. This column is required if mode=relative.

  • reference (str, optional): The reference / golden answer for the instruction, to be used by the LLM for comparison against.

"},{"location":"components-gallery/tasks/prometheuseval/#outputs","title":"Outputs","text":"
  • feedback (str): The feedback explaining the result below, as critiqued by the LLM using the pre-defined score rubric, compared against reference if provided.

  • result (Union[int, Literal[\"A\", \"B\"]]): If mode=absolute, then the result contains the score for the generation in a likert-scale from 1-5, otherwise, if mode=relative, then the result contains either \"A\" or \"B\", the \"winning\" one being the generation in the index 0 of generations if result='A' or the index 1 if result='B'.

  • model_name (str): The model name used to generate the feedback and result.

"},{"location":"components-gallery/tasks/prometheuseval/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/prometheuseval/#critique-and-evaluate-llm-generation-quality-using-prometheus-2_0","title":"Critique and evaluate LLM generation quality using Prometheus 2_0","text":"
from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n    llm=vLLM(\n        model=\"prometheus-eval/prometheus-7b-v2.0\",\n        chat_template=\"[INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST]\",\n    ),\n    mode=\"absolute\",\n    rubric=\"factual-validity\"\n)\n\nprometheus.load()\n\nresult = next(\n    prometheus.process(\n        [\n            {\"instruction\": \"make something\", \"generation\": \"something done\"},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'make something',\n#         'generation': 'something done',\n#         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n#         'feedback': 'the feedback',\n#         'result': 5,\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/prometheuseval/#critique-for-relative-evaluation","title":"Critique for relative evaluation","text":"
from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n    llm=vLLM(\n        model=\"prometheus-eval/prometheus-7b-v2.0\",\n        chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n    ),\n    mode=\"relative\",\n    rubric=\"honesty\"\n)\n\nprometheus.load()\n\nresult = next(\n    prometheus.process(\n        [\n            {\"instruction\": \"make something\", \"generations\": [\"something done\", \"other thing\"]},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'make something',\n#         'generations': ['something done', 'other thing'],\n#         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n#         'feedback': 'the feedback',\n#         'result': 'A',\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/prometheuseval/#critique-with-a-custom-rubric","title":"Critique with a custom rubric","text":"
from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n    llm=vLLM(\n        model=\"prometheus-eval/prometheus-7b-v2.0\",\n        chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n    ),\n    mode=\"absolute\",\n    rubric=\"custom\",\n    rubrics={\n        \"custom\": \"[A]\\nScore 1: A\\nScore 2: B\\nScore 3: C\\nScore 4: D\\nScore 5: E\"\n    }\n)\n\nprometheus.load()\n\nresult = next(\n    prometheus.process(\n        [\n            {\"instruction\": \"make something\", \"generation\": \"something done\"},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'make something',\n#         'generation': 'something done',\n#         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n#         'feedback': 'the feedback',\n#         'result': 6,\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/prometheuseval/#critique-using-a-reference-answer","title":"Critique using a reference answer","text":"
from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n    llm=vLLM(\n        model=\"prometheus-eval/prometheus-7b-v2.0\",\n        chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n    ),\n    mode=\"absolute\",\n    rubric=\"helpfulness\",\n    reference=True,\n)\n\nprometheus.load()\n\nresult = next(\n    prometheus.process(\n        [\n            {\n                \"instruction\": \"make something\",\n                \"generation\": \"something done\",\n                \"reference\": \"this is a reference answer\",\n            },\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'make something',\n#         'generation': 'something done',\n#         'reference': 'this is a reference answer',\n#         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n#         'feedback': 'the feedback',\n#         'result': 6,\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/prometheuseval/#references","title":"References","text":"
  • Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models

  • prometheus-eval: Evaluate your LLM's response with Prometheus \ud83d\udcaf

"},{"location":"components-gallery/tasks/complexityscorer/","title":"ComplexityScorer","text":"

Score instructions based on their complexity using an LLM.

ComplexityScorer is a pre-defined task used to rank a list of instructions based on their complexity. It's an implementation of the complexity score task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.

"},{"location":"components-gallery/tasks/complexityscorer/#attributes","title":"Attributes","text":"
  • _template: a Jinja2 template used to format the input for the LLM.
"},{"location":"components-gallery/tasks/complexityscorer/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instructions]\n        end\n        subgraph New columns\n            OCOL0[scores]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph ComplexityScorer\n        StepInput[Input Columns: instructions]\n        StepOutput[Output Columns: scores, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/complexityscorer/#inputs","title":"Inputs","text":"
  • instructions (List[str]): The list of instructions to be scored.
"},{"location":"components-gallery/tasks/complexityscorer/#outputs","title":"Outputs","text":"
  • scores (List[float]): The score for each instruction.

  • model_name (str): The model name used to generate the scores.

"},{"location":"components-gallery/tasks/complexityscorer/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/complexityscorer/#evaluate-the-complexity-of-your-instructions","title":"Evaluate the complexity of your instructions","text":"
from distilabel.steps.tasks import ComplexityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = ComplexityScorer(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    )\n)\n\nscorer.load()\n\nresult = next(\n    scorer.process(\n        [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n    )\n)\n# result\n# [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 5], 'distilabel_metadata': {'raw_output_complexity_scorer_0': 'output'}}]\n
"},{"location":"components-gallery/tasks/complexityscorer/#generate-structured-output-with-default-schema","title":"Generate structured output with default schema","text":"
from distilabel.steps.tasks import ComplexityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = ComplexityScorer(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    use_default_structured_output=True\n)\n\nscorer.load()\n\nresult = next(\n    scorer.process(\n        [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n    )\n)\n# result\n# [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 2], 'distilabel_metadata': {'raw_output_complexity_scorer_0': '{ \\n  \"scores\": [\\n    1, \\n    2\\n  ]\\n}'}}]\n
"},{"location":"components-gallery/tasks/complexityscorer/#references","title":"References","text":"
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
"},{"location":"components-gallery/tasks/qualityscorer/","title":"QualityScorer","text":"

Score responses based on their quality using an LLM.

QualityScorer is a pre-defined task that defines the instruction as the input and score as the output. This task is used to rate the quality of instructions and responses. It's an implementation of the quality score task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. The task follows the same scheme as the Complexity Scorer, but the instruction-response pairs are scored in terms of quality, obtaining a quality score for each instruction.

"},{"location":"components-gallery/tasks/qualityscorer/#attributes","title":"Attributes","text":"
  • _template: a Jinja2 template used to format the input for the LLM.
"},{"location":"components-gallery/tasks/qualityscorer/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[responses]\n        end\n        subgraph New columns\n            OCOL0[scores]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph QualityScorer\n        StepInput[Input Columns: instruction, responses]\n        StepOutput[Output Columns: scores, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/qualityscorer/#inputs","title":"Inputs","text":"
  • instruction (str): The instruction that was used to generate the responses.

  • responses (List[str]): The responses to be scored. Each response forms a pair with the instruction.

"},{"location":"components-gallery/tasks/qualityscorer/#outputs","title":"Outputs","text":"
  • scores (List[float]): The score for each response.

  • model_name (str): The model name used to generate the scores.

"},{"location":"components-gallery/tasks/qualityscorer/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/qualityscorer/#evaluate-the-quality-of-your-instructions","title":"Evaluate the quality of your instructions","text":"
from distilabel.steps.tasks import QualityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = QualityScorer(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    )\n)\n\nscorer.load()\n\nresult = next(\n    scorer.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n            }\n        ]\n    )\n)\n# result\n[\n    {\n        'instructions': 'instruction',\n        'model_name': 'test',\n        'scores': [5, 3, 1],\n    }\n]\n
"},{"location":"components-gallery/tasks/qualityscorer/#generate-structured-output-with-default-schema","title":"Generate structured output with default schema","text":"
from distilabel.steps.tasks import QualityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\nscorer = QualityScorer(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    use_default_structured_output=True\n)\n\nscorer.load()\n\nresult = next(\n    scorer.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n            }\n        ]\n    )\n)\n\n# result\n[{'instruction': 'instruction',\n'responses': ['good response', 'weird response', 'bad response'],\n'scores': [1, 2, 3],\n'distilabel_metadata': {'raw_output_quality_scorer_0': '{  \"scores\": [1, 2, 3] }'},\n'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/qualityscorer/#references","title":"References","text":"
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
"},{"location":"components-gallery/tasks/clair/","title":"CLAIR","text":"

Contrastive Learning from AI Revisions (CLAIR).

CLAIR uses an AI system to minimally revise a solution A\u2192A\u00b4 such that the resulting preference A preferred A\u2019 is much more contrastive and precise.

"},{"location":"components-gallery/tasks/clair/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[task]\n            ICOL1[student_solution]\n        end\n        subgraph New columns\n            OCOL0[revision]\n            OCOL1[rational]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph CLAIR\n        StepInput[Input Columns: task, student_solution]\n        StepOutput[Output Columns: revision, rational, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/clair/#inputs","title":"Inputs","text":"
  • task (str): The task or instruction.

  • student_solution (str): An answer to the task that is to be revised.

"},{"location":"components-gallery/tasks/clair/#outputs","title":"Outputs","text":"
  • revision (str): The revised text.

  • rational (str): The rationale for the provided revision.

  • model_name (str): The name of the model used to generate the revision and rationale.

"},{"location":"components-gallery/tasks/clair/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/clair/#create-contrastive-preference-pairs","title":"Create contrastive preference pairs","text":"
from distilabel.steps.tasks import CLAIR\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    generation_kwargs={\n        \"temperature\": 0.7,\n        \"max_new_tokens\": 4096,\n    },\n)\nclair_task = CLAIR(llm=llm)\n\nclair_task.load()\n\nresult = next(\n    clair_task.process(\n        [\n            {\n                \"task\": \"How many gaps are there between the earth and the moon?\",\n                \"student_solution\": 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon's orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.'\n            }\n        ]\n    )\n)\n# result\n# [{'task': 'How many gaps are there between the earth and the moon?',\n# 'student_solution': 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. 
The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.',\n# 'revision': 'There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n# 'rational': 'The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. 
The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.',\n# 'distilabel_metadata': {'raw_output_c_l_a_i_r_0': '{teacher_reasoning}: The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.\\n\\n{corrected_student_solution}: There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n# 'raw_input_c_l_a_i_r_0': [{'role': 'system',\n#     'content': \"You are a teacher and your task is to minimally improve a student's answer. I will give you a {task} and a {student_solution}. 
Your job is to revise the {student_solution} such that it is clearer, more correct, and more engaging. Copy all non-corrected parts of the student's answer. Do not allude to the {corrected_student_solution} being a revision or a correction in your final solution.\"},\n#     {'role': 'user',\n#     'content': '{task}: How many gaps are there between the earth and the moon?\\n\\n{student_solution}: There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.\\n\\n-----------------\\n\\nLet\\'s first think step by step with a {teacher_reasoning} to decide how to improve the {student_solution}, then give the {corrected_student_solution}. Mention the {teacher_reasoning} and {corrected_student_solution} identifiers to structure your answer.'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/clair/#references","title":"References","text":"
  • Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment

  • APO and CLAIR - GitHub Repository

"},{"location":"components-gallery/tasks/ultrafeedback/","title":"UltraFeedback","text":"

Rank generations focusing on different aspects using an LLM.

UltraFeedback: Boosting Language Models with High-quality Feedback.

"},{"location":"components-gallery/tasks/ultrafeedback/#attributes","title":"Attributes","text":"
  • aspect: The aspect to perform with the UltraFeedback model. The available aspects are: - helpfulness: Evaluate text outputs based on helpfulness. - honesty: Evaluate text outputs based on honesty. - instruction-following: Evaluate text outputs based on given instructions. - truthfulness: Evaluate text outputs based on truthfulness. Additionally, a custom aspect has been defined by Argilla, so as to evaluate the overall assessment of the text outputs within a single prompt. The custom aspect is: - overall-rating: Evaluate text outputs based on an overall assessment. Defaults to \"overall-rating\".
"},{"location":"components-gallery/tasks/ultrafeedback/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[generations]\n        end\n        subgraph New columns\n            OCOL0[ratings]\n            OCOL1[rationales]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph UltraFeedback\n        StepInput[Input Columns: instruction, generations]\n        StepOutput[Output Columns: ratings, rationales, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/ultrafeedback/#inputs","title":"Inputs","text":"
  • instruction (str): The reference instruction to evaluate the text outputs.

  • generations (List[str]): The text outputs to evaluate for the given instruction.

"},{"location":"components-gallery/tasks/ultrafeedback/#outputs","title":"Outputs","text":"
  • ratings (List[float]): The ratings for each of the provided text outputs.

  • rationales (List[str]): The rationales for each of the provided text outputs.

  • model_name (str): The name of the model used to generate the ratings and rationales.

"},{"location":"components-gallery/tasks/ultrafeedback/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/ultrafeedback/#rate-generations-from-different-llms-based-on-the-selected-aspect","title":"Rate generations from different LLMs based on the selected aspect","text":"
from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    use_default_structured_output=False\n)\n\nultrafeedback.load()\n\nresult = next(\n    ultrafeedback.process(\n        [\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"generations\": [\"4\", \"and a car\"],\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'How much is 2+2?',\n#         'generations': ['4', 'and a car'],\n#         'ratings': [1, 2],\n#         'rationales': ['explanation for 4', 'explanation for and a car'],\n#         'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/ultrafeedback/#rate-generations-from-different-llms-based-on-the-honesty-using-the-default-structured-output","title":"Rate generations from different LLMs based on the honesty, using the default structured output","text":"
from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    aspect=\"honesty\"\n)\n\nultrafeedback.load()\n\nresult = next(\n    ultrafeedback.process(\n        [\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"generations\": [\"4\", \"and a car\"],\n            }\n        ]\n    )\n)\n# result\n# [{'instruction': 'How much is 2+2?',\n# 'generations': ['4', 'and a car'],\n# 'ratings': [5, 1],\n# 'rationales': ['The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.',\n# \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"],\n# 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{\"ratings\": [\\n    5,\\n    1\\n] \\n\\n,\"rationales\": [\\n    \"The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.\",\\n    \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"\\n] }'},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/ultrafeedback/#rate-generations-from-different-llms-based-on-the-helpfulness-using-the-default-structured-output","title":"Rate generations from different LLMs based on the helpfulness, using the default structured output","text":"
from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        generation_kwargs={\"max_new_tokens\": 512},\n    ),\n    aspect=\"helpfulness\"\n)\n\nultrafeedback.load()\n\nresult = next(\n    ultrafeedback.process(\n        [\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"generations\": [\"4\", \"and a car\"],\n            }\n        ]\n    )\n)\n# result\n# [{'instruction': 'How much is 2+2?',\n#   'generations': ['4', 'and a car'],\n#   'ratings': [1, 5],\n#   'rationales': ['Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.',\n#    'Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question.'],\n#   'rationales_for_rating': ['Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.',\n#    'Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.'],\n#   'types': [1, 3, 1],\n#   'distilabel_metadata': {'raw_output_ultra_feedback_0': '{ \\n  \"ratings\": [\\n    1,\\n    5\\n  ]\\n ,\\n  \"rationales\": [\\n    \"Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.\",\\n    \"Text 2 is neither clear nor relevant to the task. 
It does not provide any useful information and seems unrelated to the question.\"\\n  ]\\n ,\\n  \"rationales_for_rating\": [\\n    \"Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.\",\\n    \"Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.\"\\n  ]\\n ,\\n  \"types\": [\\n    1, 3,\\n    1\\n  ]\\n  }'},\n#   'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/ultrafeedback/#references","title":"References","text":"
  • UltraFeedback: Boosting Language Models with High-quality Feedback

  • UltraFeedback - GitHub Repository

"},{"location":"components-gallery/tasks/pairrm/","title":"PairRM","text":"

Rank the candidates based on the input using the LLM model.

"},{"location":"components-gallery/tasks/pairrm/#note","title":"Note","text":"

This step differs from other tasks as there is currently a single implementation of this model, and we will use a specific LLM.

"},{"location":"components-gallery/tasks/pairrm/#attributes","title":"Attributes","text":"
  • model: The model to use for the ranking. Defaults to \"llm-blender/PairRM\".

  • instructions: The instructions to use for the model. Defaults to None.

"},{"location":"components-gallery/tasks/pairrm/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[inputs]\n            ICOL1[candidates]\n        end\n        subgraph New columns\n            OCOL0[ranks]\n            OCOL1[ranked_candidates]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph PairRM\n        StepInput[Input Columns: inputs, candidates]\n        StepOutput[Output Columns: ranks, ranked_candidates, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/pairrm/#inputs","title":"Inputs","text":"
  • inputs (List[Dict[str, Any]]): The input text or conversation to rank the candidates for.

  • candidates (List[Dict[str, Any]]): The candidates to rank.

"},{"location":"components-gallery/tasks/pairrm/#outputs","title":"Outputs","text":"
  • ranks (List[int]): The ranks of the candidates based on the input.

  • ranked_candidates (List[Dict[str, Any]]): The candidates ranked based on the input.

  • model_name (str): The model name used to rank the candidate responses. Defaults to \"llm-blender/PairRM\".

"},{"location":"components-gallery/tasks/pairrm/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/pairrm/#rank-llm-candidates","title":"Rank LLM candidates","text":"
from distilabel.steps.tasks import PairRM\n\n# Consider this as a placeholder for your actual LLM.\npair_rm = PairRM()\n\npair_rm.load()\n\nresult = next(\n    pair_rm.process(\n        [\n            {\"input\": \"Hello, how are you?\", \"candidates\": [\"fine\", \"good\", \"bad\"]},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'input': 'Hello, how are you?',\n#         'candidates': ['fine', 'good', 'bad'],\n#         'ranks': [2, 1, 3],\n#         'ranked_candidates': ['good', 'fine', 'bad'],\n#         'model_name': 'llm-blender/PairRM',\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/pairrm/#references","title":"References","text":"
  • LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion

  • Pair Ranking Model

"},{"location":"components-gallery/tasks/generatesentencepair/","title":"GenerateSentencePair","text":"

Generate a positive and (optionally) a negative sentence given an anchor sentence.

GenerateSentencePair is a pre-defined task that, given an anchor sentence, generates a positive sentence related to the anchor and optionally a negative sentence unrelated to the anchor or similar to it. Optionally, you can give a context to guide the LLM towards more specific behavior. This task is useful to generate training datasets for training embedding models.

"},{"location":"components-gallery/tasks/generatesentencepair/#attributes","title":"Attributes","text":"
  • triplet: a flag to indicate if the task should generate a triplet of sentences (anchor, positive, negative). Defaults to False.

  • action: the action to perform to generate the positive sentence.

  • context: the context to use for the generation. Can be helpful to guide the LLM towards more specific context. Not used by default.

  • hard_negative: A flag to indicate if the negative should be a hard-negative or not. Hard negatives make it hard for the model to distinguish against the positive, with a higher degree of semantic similarity.

"},{"location":"components-gallery/tasks/generatesentencepair/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[anchor]\n        end\n        subgraph New columns\n            OCOL0[positive]\n            OCOL1[negative]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph GenerateSentencePair\n        StepInput[Input Columns: anchor]\n        StepOutput[Output Columns: positive, negative, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/generatesentencepair/#inputs","title":"Inputs","text":"
  • anchor (str): The anchor sentence to generate the positive and negative sentences.
"},{"location":"components-gallery/tasks/generatesentencepair/#outputs","title":"Outputs","text":"
  • positive (str): The positive sentence related to the anchor.

  • negative (str): The negative sentence unrelated to the anchor if triplet=True, or more similar to the positive to make it more challenging for a model to distinguish in case hard_negative=True.

  • model_name (str): The name of the model that was used to generate the sentences.

"},{"location":"components-gallery/tasks/generatesentencepair/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generatesentencepair/#paraphrasing","title":"Paraphrasing","text":"
from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"paraphrase\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n
"},{"location":"components-gallery/tasks/generatesentencepair/#generating-semantically-similar-sentences","title":"Generating semantically similar sentences","text":"
from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import GenerateSentencePair\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"semantically-similar\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"How does 3D printing work?\"}])\n
"},{"location":"components-gallery/tasks/generatesentencepair/#generating-queries","title":"Generating queries","text":"
from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"query\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"Argilla is an open-source data curation platform for LLMs. Using Argilla, ...\"}])\n
"},{"location":"components-gallery/tasks/generatesentencepair/#generating-answers","title":"Generating answers","text":"
from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"answer\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n
"},{"location":"components-gallery/tasks/generatesentencepair/#_1","title":"Generating queries with context, hard negatives and default structured output","text":"
from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"query\",\n    context=\"Argilla is an open-source data curation platform for LLMs.\",\n    hard_negative=True,\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n    use_default_structured_output=True\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n
"},{"location":"components-gallery/tasks/generateembeddings/","title":"GenerateEmbeddings","text":"

Generate embeddings using the last hidden state of an LLM.

Generate embeddings for a text input using the last hidden state of an LLM, as described in the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.

"},{"location":"components-gallery/tasks/generateembeddings/#attributes","title":"Attributes","text":"
  • llm: The LLM to use to generate the embeddings.
"},{"location":"components-gallery/tasks/generateembeddings/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[text]\n        end\n        subgraph New columns\n            OCOL0[embedding]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph GenerateEmbeddings\n        StepInput[Input Columns: text]\n        StepOutput[Output Columns: embedding, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/generateembeddings/#inputs","title":"Inputs","text":"
  • text (str, List[Dict[str, str]]): The input text or conversation to generate embeddings for.
"},{"location":"components-gallery/tasks/generateembeddings/#outputs","title":"Outputs","text":"
  • embedding (List[float]): The embedding of the input text or conversation.

  • model_name (str): The model name used to generate the embeddings.

"},{"location":"components-gallery/tasks/generateembeddings/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generateembeddings/#rank-llm-candidates","title":"Generate embeddings for a text","text":"
from distilabel.steps.tasks import GenerateEmbeddings\nfrom distilabel.models.llms.huggingface import TransformersLLM\n\n# Consider this as a placeholder for your actual LLM.\nembedder = GenerateEmbeddings(\n    llm=TransformersLLM(\n        model=\"TaylorAI/bge-micro-v2\",\n        model_kwargs={\"is_decoder\": True},\n        cuda_devices=[],\n    )\n)\nembedder.load()\n\nresult = next(\n    embedder.process(\n        [\n            {\"text\": \"Hello, how are you?\"},\n        ]\n    )\n)\n
"},{"location":"components-gallery/tasks/generateembeddings/#references","title":"References","text":"
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
"},{"location":"components-gallery/tasks/textclustering/","title":"TextClustering","text":"

Task that clusters a set of texts and generates summary labels for each cluster.

This is a GlobalTask that inherits from TextClassification, which means that all the attributes from that class are available here. Also, in this case we deal with all the inputs at once, instead of using batches. The input_batch_size is used here to send the examples to the LLM in batches (a subtle difference with the more common Task definitions). The task looks in each cluster for a given number of representative examples (the number is set by the samples_per_cluster attribute), and sends them to the LLM to get one or more labels that represent the cluster. The labels are then assigned to each text in the cluster. The clusters and projections used in the step are assumed to be obtained from the UMAP + DBSCAN steps, but could be generated by similar steps, as long as they represent the same concepts. This step runs a pipeline like the one in this repository: https://github.com/huggingface/text-clustering

"},{"location":"components-gallery/tasks/textclustering/#attributes","title":"Attributes","text":"
  • savefig: Whether to generate and save a figure with the clustering of the texts.

  • samples_per_cluster: The number of examples to use in the LLM as a sample of the cluster.
"},{"location":"components-gallery/tasks/textclustering/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[text]\n            ICOL1[projection]\n            ICOL2[cluster_label]\n        end\n        subgraph New columns\n            OCOL0[summary_label]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph TextClustering\n        StepInput[Input Columns: text, projection, cluster_label]\n        StepOutput[Output Columns: summary_label, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/textclustering/#inputs","title":"Inputs","text":"
  • text (str): The reference text we want to obtain labels for.

  • projection (List[float]): Vector representation of the text to cluster, normally the output from the UMAP step.

  • cluster_label (int): Integer representing the label of a given cluster. -1 means it wasn't clustered.

"},{"location":"components-gallery/tasks/textclustering/#outputs","title":"Outputs","text":"
  • summary_label (str): The label or list of labels for the text.

  • model_name (str): The name of the model used to generate the label/s.

"},{"location":"components-gallery/tasks/textclustering/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/textclustering/#generate-labels-for-a-set-of-texts-using-clustering","title":"Generate labels for a set of texts using clustering","text":"
from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps import UMAP, DBSCAN, TextClustering\nfrom distilabel.pipeline import Pipeline\n\nds_name = \"argilla-warehouse/personahub-fineweb-edu-4-clustering-100k\"\n\nwith Pipeline(name=\"Text clustering dataset\") as pipeline:\n    batch_size = 500\n\n    ds = load_dataset(ds_name, split=\"train\").select(range(10000))\n    loader = make_generator_step(ds, batch_size=batch_size, repo_id=ds_name)\n\n    umap = UMAP(n_components=2, metric=\"cosine\")\n    dbscan = DBSCAN(eps=0.3, min_samples=30)\n\n    text_clustering = TextClustering(\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        ),\n        n=3,  # 3 labels per example\n        query_title=\"Examples of Personas\",\n        samples_per_cluster=10,\n        context=(\n            \"Describe the main themes, topics, or categories that could describe the \"\n            \"following types of personas. All the examples of personas must share \"\n            \"the same set of labels.\"\n        ),\n        default_label=\"None\",\n        savefig=True,\n        input_batch_size=8,\n        input_mappings={\"text\": \"persona\"},\n        use_default_structured_output=True,\n    )\n\n    loader >> umap >> dbscan >> text_clustering\n
"},{"location":"components-gallery/tasks/textclustering/#references","title":"References","text":"
  • text-clustering repository
"},{"location":"components-gallery/tasks/apigensemanticchecker/","title":"APIGenSemanticChecker","text":"

Check the semantic alignment between a user query, the generated function calls and their execution results.

The APIGenSemanticChecker is inspired by the APIGen pipeline, which was designed to generate verifiable and diverse function-calling datasets. The task assesses whether the generated function calls and their execution results accurately reflect the intention of the user query, returning its reasoning and a flag that can be used to filter rows afterwards.

"},{"location":"components-gallery/tasks/apigensemanticchecker/#attributes","title":"Attributes","text":"
  • system_prompt: System prompt for the task. Has a default one.

  • exclude_failed_execution: Whether to exclude failed executions (won't run on those rows that have a False in keep_row_after_execution_check column, which comes from running APIGenExecutionChecker). Defaults to True.

"},{"location":"components-gallery/tasks/apigensemanticchecker/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[func_desc]\n            ICOL1[query]\n            ICOL2[answers]\n            ICOL3[execution_result]\n        end\n        subgraph New columns\n            OCOL0[thought]\n            OCOL1[keep_row_after_semantic_check]\n        end\n    end\n\n    subgraph APIGenSemanticChecker\n        StepInput[Input Columns: func_desc, query, answers, execution_result]\n        StepOutput[Output Columns: thought, keep_row_after_semantic_check]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    ICOL3 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/apigensemanticchecker/#inputs","title":"Inputs","text":"
  • func_desc (str): Description of what the function should do.

  • query (str): Instruction from the user.

  • answers (str): JSON encoded list with arguments to be passed to the function/API. Should be loaded using json.loads.

  • execution_result (str): Result of the function/API executed.

"},{"location":"components-gallery/tasks/apigensemanticchecker/#outputs","title":"Outputs","text":"
  • thought (str): Reasoning for the output on whether to keep this output or not.

  • keep_row_after_semantic_check (bool): True or False, can be used to filter afterwards.

"},{"location":"components-gallery/tasks/apigensemanticchecker/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/apigensemanticchecker/#semantic-checker-for-generated-function-calls-original-implementation","title":"Semantic checker for generated function calls (original implementation)","text":"
from distilabel.steps.tasks import APIGenSemanticChecker\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    generation_kwargs={\n        \"temperature\": 0.7,\n        \"max_new_tokens\": 1024,\n    },\n)\nsemantic_checker = APIGenSemanticChecker(\n    use_default_structured_output=False,\n    llm=llm\n)\nsemantic_checker.load()\n\nres = next(\n    semantic_checker.process(\n        [\n            {\n                \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n                \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n                \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n                \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n            }\n        ]\n    )\n)\nres\n# [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n# 'query': 'What information can be obtained about the Maine Coon cat breed?',\n# 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n# 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n# 'thought': '',\n# 'keep_row_after_semantic_check': True,\n# 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n#     'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. 
The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n#     {'role': 'user',\n#     'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n```\\n{\\n   \"thought\": \"Concisely describe your reasoning here\",\\n   \"pass\": \"yes\" or \"no\"\\n}\\n```\\n'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/apigensemanticchecker/#semantic-checker-for-generated-function-calls-structured-output","title":"Semantic checker for generated function calls (structured output)","text":"
from distilabel.steps.tasks import APIGenSemanticChecker\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    generation_kwargs={\n        \"temperature\": 0.7,\n        \"max_new_tokens\": 1024,\n    },\n)\nsemantic_checker = APIGenSemanticChecker(\n    use_default_structured_output=True,\n    llm=llm\n)\nsemantic_checker.load()\n\nres = next(\n    semantic_checker.process(\n        [\n            {\n                \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n                \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n                \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n                \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n            }\n        ]\n    )\n)\nres\n# [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n# 'query': 'What information can be obtained about the Maine Coon cat breed?',\n# 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n# 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n# 'keep_row_after_semantic_check': True,\n# 'thought': '',\n# 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n#     'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. 
The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n#     {'role': 'user',\n#     'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/apigensemanticchecker/#references","title":"References","text":"
  • APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets

  • Salesforce/xlam-function-calling-60k

"},{"location":"components-gallery/tasks/generatetextretrievaldata/","title":"GenerateTextRetrievalData","text":"

Generate text retrieval data with an LLM to later on train an embedding model.

GenerateTextRetrievalData is a Task that generates text retrieval data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

"},{"location":"components-gallery/tasks/generatetextretrievaldata/#note","title":"Note","text":"

Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-retrieval\"; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-retrieval category.

"},{"location":"components-gallery/tasks/generatetextretrievaldata/#attributes","title":"Attributes","text":"
  • language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

  • query_type: The type of query to be generated, which can be extremely long-tail, long-tail, or common. Defaults to None, meaning that it will be randomly sampled.

  • query_length: The length of the query to be generated, which can be less than 5 words, 5 to 15 words, or at least 10 words. Defaults to None, meaning that it will be randomly sampled.

  • difficulty: The difficulty of the query to be generated, which can be high school, college, or PhD. Defaults to None, meaning that it will be randomly sampled.

  • clarity: The clarity of the query to be generated, which can be clear, understandable with some effort, or ambiguous. Defaults to None, meaning that it will be randomly sampled.

  • num_words: The number of words in the query to be generated, which can be 50, 100, 200, 300, 400, or 500. Defaults to None, meaning that it will be randomly sampled.

  • seed: The random seed to be set in case there's any sampling within the format_input method.

"},{"location":"components-gallery/tasks/generatetextretrievaldata/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[task]\n        end\n        subgraph New columns\n            OCOL0[user_query]\n            OCOL1[positive_document]\n            OCOL2[hard_negative_document]\n            OCOL3[model_name]\n        end\n    end\n\n    subgraph GenerateTextRetrievalData\n        StepInput[Input Columns: task]\n        StepOutput[Output Columns: user_query, positive_document, hard_negative_document, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/generatetextretrievaldata/#inputs","title":"Inputs","text":"
  • task (str): The task description to be used in the generation.
"},{"location":"components-gallery/tasks/generatetextretrievaldata/#outputs","title":"Outputs","text":"
  • user_query (str): the user query generated by the LLM.

  • positive_document (str): the positive document generated by the LLM.

  • hard_negative_document (str): the hard negative document generated by the LLM.

  • model_name (str): the name of the model used to generate the text retrieval data.

"},{"location":"components-gallery/tasks/generatetextretrievaldata/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generatetextretrievaldata/#generate-synthetic-text-retrieval-data-for-training-embedding-models","title":"Generate synthetic text retrieval data for training embedding models","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextRetrievalData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = EmbeddingTaskGenerator(\n        category=\"text-retrieval\",\n        flatten_tasks=True,\n        llm=...,  # LLM instance\n    )\n\n    generate = GenerateTextRetrievalData(\n        language=\"English\",\n        query_type=\"common\",\n        query_length=\"5 to 15 words\",\n        difficulty=\"high school\",\n        clarity=\"clear\",\n        num_words=100,\n        llm=...,  # LLM instance\n    )\n\n    task >> generate\n
"},{"location":"components-gallery/tasks/generatetextretrievaldata/#references","title":"References","text":"
  • Improving Text Embeddings with Large Language Models
"},{"location":"components-gallery/tasks/generateshorttextmatchingdata/","title":"GenerateShortTextMatchingData","text":"

Generate short text matching data with an LLM to later on train an embedding model.

GenerateShortTextMatchingData is a Task that generates short text matching data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

"},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#note","title":"Note","text":"

Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-matching-short\"; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-matching-short category.

"},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#attributes","title":"Attributes","text":"
  • language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

  • seed: The random seed to be set in case there's any sampling within the format_input method. Note that in this task the seed has no effect since there are no sampling params.

"},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[task]\n        end\n        subgraph New columns\n            OCOL0[input]\n            OCOL1[positive_document]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph GenerateShortTextMatchingData\n        StepInput[Input Columns: task]\n        StepOutput[Output Columns: input, positive_document, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#inputs","title":"Inputs","text":"
  • task (str): The task description to be used in the generation.
"},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#outputs","title":"Outputs","text":"
  • input (str): the input generated by the LLM.

  • positive_document (str): the positive document generated by the LLM.

  • model_name (str): the name of the model used to generate the short text matching data.

"},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#generate-synthetic-short-text-matching-data-for-training-embedding-models","title":"Generate synthetic short text matching data for training embedding models","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateShortTextMatchingData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = EmbeddingTaskGenerator(\n        category=\"text-matching-short\",\n        flatten_tasks=True,\n        llm=...,  # LLM instance\n    )\n\n    generate = GenerateShortTextMatchingData(\n        language=\"English\",\n        llm=...,  # LLM instance\n    )\n\n    task >> generate\n
"},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#references","title":"References","text":"
  • Improving Text Embeddings with Large Language Models
"},{"location":"components-gallery/tasks/generatelongtextmatchingdata/","title":"GenerateLongTextMatchingData","text":"

Generate long text matching data with an LLM to later on train an embedding model.

GenerateLongTextMatchingData is a Task that generates long text matching data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

"},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#note","title":"Note","text":"

Ideally this task should be used together with EmbeddingTaskGenerator configured with flatten_tasks=True and category=\"text-matching-long\", so that the LLM generates a list of tasks that is then flattened, leaving each row with a single task for the text-matching-long category.

"},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#attributes","title":"Attributes","text":"
  • language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

  • seed: The random seed to be set in case there's any sampling within the format_input method. Note that in this task the seed has no effect since there are no sampling params.

"},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[task]\n        end\n        subgraph New columns\n            OCOL0[input]\n            OCOL1[positive_document]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph GenerateLongTextMatchingData\n        StepInput[Input Columns: task]\n        StepOutput[Output Columns: input, positive_document, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#inputs","title":"Inputs","text":"
  • task (str): The task description to be used in the generation.
"},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#outputs","title":"Outputs","text":"
  • input (str): the input generated by the LLM.

  • positive_document (str): the positive document generated by the LLM.

  • model_name (str): the name of the model used to generate the long text matching data.

"},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#generate-synthetic-long-text-matching-data-for-training-embedding-models","title":"Generate synthetic long text matching data for training embedding models","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateLongTextMatchingData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = EmbeddingTaskGenerator(\n        category=\"text-matching-long\",\n        flatten_tasks=True,\n        llm=...,  # LLM instance\n    )\n\n    generate = GenerateLongTextMatchingData(\n        language=\"English\",\n        llm=...,  # LLM instance\n    )\n\n    task >> generate\n
"},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#references","title":"References","text":"
  • Improving Text Embeddings with Large Language Models
"},{"location":"components-gallery/tasks/generatetextclassificationdata/","title":"GenerateTextClassificationData","text":"

Generate text classification data with an LLM to later on train an embedding model.

GenerateTextClassificationData is a Task that generates text classification data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

"},{"location":"components-gallery/tasks/generatetextclassificationdata/#note","title":"Note","text":"

Ideally this task should be used together with EmbeddingTaskGenerator configured with flatten_tasks=True and category=\"text-classification\", so that the LLM generates a list of tasks that is then flattened, leaving each row with a single task for the text-classification category.

"},{"location":"components-gallery/tasks/generatetextclassificationdata/#attributes","title":"Attributes","text":"
  • language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

  • difficulty: The difficulty of the query to be generated, which can be high school, college, or PhD. Defaults to None, meaning that it will be randomly sampled.

  • clarity: The clarity of the query to be generated, which can be clear, understandable with some effort, or ambiguous. Defaults to None, meaning that it will be randomly sampled.

  • seed: The random seed to be set in case there's any sampling within the format_input method.

"},{"location":"components-gallery/tasks/generatetextclassificationdata/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[task]\n        end\n        subgraph New columns\n            OCOL0[input_text]\n            OCOL1[label]\n            OCOL2[misleading_label]\n            OCOL3[model_name]\n        end\n    end\n\n    subgraph GenerateTextClassificationData\n        StepInput[Input Columns: task]\n        StepOutput[Output Columns: input_text, label, misleading_label, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/generatetextclassificationdata/#inputs","title":"Inputs","text":"
  • task (str): The task description to be used in the generation.
"},{"location":"components-gallery/tasks/generatetextclassificationdata/#outputs","title":"Outputs","text":"
  • input_text (str): the input text generated by the LLM.

  • label (str): the label generated by the LLM.

  • misleading_label (str): the misleading label generated by the LLM.

  • model_name (str): the name of the model used to generate the text classification data.

"},{"location":"components-gallery/tasks/generatetextclassificationdata/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generatetextclassificationdata/#generate-synthetic-text-classification-data-for-training-embedding-models","title":"Generate synthetic text classification data for training embedding models","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextClassificationData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = EmbeddingTaskGenerator(\n        category=\"text-classification\",\n        flatten_tasks=True,\n        llm=...,  # LLM instance\n    )\n\n    generate = GenerateTextClassificationData(\n        language=\"English\",\n        difficulty=\"high school\",\n        clarity=\"clear\",\n        llm=...,  # LLM instance\n    )\n\n    task >> generate\n
"},{"location":"components-gallery/tasks/generatetextclassificationdata/#references","title":"References","text":"
  • Improving Text Embeddings with Large Language Models
"},{"location":"components-gallery/tasks/structuredgeneration/","title":"StructuredGeneration","text":"

Generate structured content for a given instruction using an LLM.

StructuredGeneration is a pre-defined task that defines the instruction and the structured_output as the inputs, and generation as the output. This task is used to generate structured content based on the input instruction and following the schema provided within the structured_output column per each instruction. The model_name is also returned as part of the output in order to enhance it.

"},{"location":"components-gallery/tasks/structuredgeneration/#attributes","title":"Attributes","text":"
  • use_system_prompt: Whether to use the system prompt in the generation. Defaults to True, which means that if the column system_prompt is defined within the input batch, then the system_prompt will be used, otherwise, it will be ignored.
"},{"location":"components-gallery/tasks/structuredgeneration/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[structured_output]\n        end\n        subgraph New columns\n            OCOL0[generation]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph StructuredGeneration\n        StepInput[Input Columns: instruction, structured_output]\n        StepOutput[Output Columns: generation, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/structuredgeneration/#inputs","title":"Inputs","text":"
  • instruction (str): The instruction to generate structured content from.

  • structured_output (Dict[str, Any]): The structured_output to generate structured content from. It should be a Python dictionary with the keys format and schema, where format should be one of json or regex, and the schema should be either the JSON schema or the regex pattern, respectively.

"},{"location":"components-gallery/tasks/structuredgeneration/#outputs","title":"Outputs","text":"
  • generation (str): The generated text matching the provided schema, if possible.

  • model_name (str): The name of the model used to generate the text.

"},{"location":"components-gallery/tasks/structuredgeneration/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/structuredgeneration/#generate-structured-output-from-a-json-schema","title":"Generate structured output from a JSON schema","text":"
from distilabel.steps.tasks import StructuredGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nstructured_gen = StructuredGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    ),\n)\n\nstructured_gen.load()\n\nresult = next(\n    structured_gen.process(\n        [\n            {\n                \"instruction\": \"Create an RPG character\",\n                \"structured_output\": {\n                    \"format\": \"json\",\n                    \"schema\": {\n                        \"properties\": {\n                            \"name\": {\n                                \"title\": \"Name\",\n                                \"type\": \"string\"\n                            },\n                            \"description\": {\n                                \"title\": \"Description\",\n                                \"type\": \"string\"\n                            },\n                            \"role\": {\n                                \"title\": \"Role\",\n                                \"type\": \"string\"\n                            },\n                            \"weapon\": {\n                                \"title\": \"Weapon\",\n                                \"type\": \"string\"\n                            }\n                        },\n                        \"required\": [\n                            \"name\",\n                            \"description\",\n                            \"role\",\n                            \"weapon\"\n                        ],\n                        \"title\": \"Character\",\n                        \"type\": \"object\"\n                    }\n                },\n            }\n        ]\n    )\n)\n
"},{"location":"components-gallery/tasks/structuredgeneration/#generate-structured-output-from-a-regex-pattern-only-works-with-llms-that-support-regex-the-providers-using-outlines","title":"Generate structured output from a regex pattern (only works with LLMs that support regex, the providers using outlines)","text":"
from distilabel.steps.tasks import StructuredGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nstructured_gen = StructuredGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    ),\n)\n\nstructured_gen.load()\n\nresult = next(\n    structured_gen.process(\n        [\n            {\n                \"instruction\": \"What's the weather like today in Seattle in Celsius degrees?\",\n                \"structured_output\": {\n                    \"format\": \"regex\",\n                    \"schema\": r\"(\\d{1,2})\u00b0C\"\n                },\n\n            }\n        ]\n    )\n)\n
"},{"location":"components-gallery/tasks/monolingualtripletgenerator/","title":"MonolingualTripletGenerator","text":"

Generate monolingual triplets with an LLM to later on train an embedding model.

MonolingualTripletGenerator is a GeneratorTask that generates monolingual triplets with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

"},{"location":"components-gallery/tasks/monolingualtripletgenerator/#attributes","title":"Attributes","text":"
  • language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

  • unit: The unit of the data to be generated, which can be sentence, phrase, or passage. Defaults to None, meaning that it will be randomly sampled.

  • difficulty: The difficulty of the query to be generated, which can be elementary school, high school, or college. Defaults to None, meaning that it will be randomly sampled.

  • high_score: The high score of the query to be generated, which can be 4, 4.5, or 5. Defaults to None, meaning that it will be randomly sampled.

  • low_score: The low score of the query to be generated, which can be 2.5, 3, or 3.5. Defaults to None, meaning that it will be randomly sampled.

  • seed: The random seed to be set in case there's any sampling within the format_input method.

"},{"location":"components-gallery/tasks/monolingualtripletgenerator/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[S1]\n            OCOL1[S2]\n            OCOL2[S3]\n            OCOL3[model_name]\n        end\n    end\n\n    subgraph MonolingualTripletGenerator\n        StepOutput[Output Columns: S1, S2, S3, model_name]\n    end\n\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n
"},{"location":"components-gallery/tasks/monolingualtripletgenerator/#outputs","title":"Outputs","text":"
  • S1 (str): the first sentence generated by the LLM.

  • S2 (str): the second sentence generated by the LLM.

  • S3 (str): the third sentence generated by the LLM.

  • model_name (str): the name of the model used to generate the monolingual triplets.

"},{"location":"components-gallery/tasks/monolingualtripletgenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/monolingualtripletgenerator/#generate-monolingual-triplets-for-training-embedding-models","title":"Generate monolingual triplets for training embedding models","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import MonolingualTripletGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = MonolingualTripletGenerator(\n        language=\"English\",\n        unit=\"sentence\",\n        difficulty=\"elementary school\",\n        high_score=\"4\",\n        low_score=\"2.5\",\n        llm=...,\n    )\n\n    ...\n\n    task >> ...\n
"},{"location":"components-gallery/tasks/bitextretrievalgenerator/","title":"BitextRetrievalGenerator","text":"

Generate bitext retrieval data with an LLM to later on train an embedding model.

BitextRetrievalGenerator is a GeneratorTask that generates bitext retrieval data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

"},{"location":"components-gallery/tasks/bitextretrievalgenerator/#attributes","title":"Attributes","text":"
  • source_language: The source language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

  • target_language: The target language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

  • unit: The unit of the data to be generated, which can be sentence, phrase, or passage. Defaults to None, meaning that it will be randomly sampled.

  • difficulty: The difficulty of the query to be generated, which can be elementary school, high school, or college. Defaults to None, meaning that it will be randomly sampled.

  • high_score: The high score of the query to be generated, which can be 4, 4.5, or 5. Defaults to None, meaning that it will be randomly sampled.

  • low_score: The low score of the query to be generated, which can be 2.5, 3, or 3.5. Defaults to None, meaning that it will be randomly sampled.

  • seed: The random seed to be set in case there's any sampling within the format_input method.

"},{"location":"components-gallery/tasks/bitextretrievalgenerator/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[S1]\n            OCOL1[S2]\n            OCOL2[S3]\n            OCOL3[model_name]\n        end\n    end\n\n    subgraph BitextRetrievalGenerator\n        StepOutput[Output Columns: S1, S2, S3, model_name]\n    end\n\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n
"},{"location":"components-gallery/tasks/bitextretrievalgenerator/#outputs","title":"Outputs","text":"
  • S1 (str): the first sentence generated by the LLM.

  • S2 (str): the second sentence generated by the LLM.

  • S3 (str): the third sentence generated by the LLM.

  • model_name (str): the name of the model used to generate the bitext retrieval data.

"},{"location":"components-gallery/tasks/bitextretrievalgenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/bitextretrievalgenerator/#generate-bitext-retrieval-data-for-training-embedding-models","title":"Generate bitext retrieval data for training embedding models","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import BitextRetrievalGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = BitextRetrievalGenerator(\n        source_language=\"English\",\n        target_language=\"Spanish\",\n        unit=\"sentence\",\n        difficulty=\"elementary school\",\n        high_score=\"4\",\n        low_score=\"2.5\",\n        llm=...,\n    )\n\n    ...\n\n    task >> ...\n
"},{"location":"components-gallery/tasks/embeddingtaskgenerator/","title":"EmbeddingTaskGenerator","text":"

Generate task descriptions for embedding-related tasks using an LLM.

EmbeddingTaskGenerator is a GeneratorTask that doesn't receive any input besides the provided attributes, and that generates task descriptions for embedding-related tasks using a pre-defined prompt based on the category attribute. The category attribute should be one of the following:

- `text-retrieval`: Generate task descriptions for text retrieval tasks.\n- `text-matching-short`: Generate task descriptions for short text matching tasks.\n- `text-matching-long`: Generate task descriptions for long text matching tasks.\n- `text-classification`: Generate task descriptions for text classification tasks.\n
"},{"location":"components-gallery/tasks/embeddingtaskgenerator/#attributes","title":"Attributes","text":"
  • category: The category of the task to be generated, which can either be text-retrieval, text-matching-short, text-matching-long, or text-classification.

  • flatten_tasks: Whether to flatten the tasks. Since the LLM generates a list of tasks, this attribute indicates whether that list should be flattened or not. Defaults to False, meaning that running this task with num_generations=1 will return a distilabel.Distiset with one row only containing a list with around 20 tasks; otherwise, if set to True, it will return a distilabel.Distiset with around 20 rows, each containing one task.

"},{"location":"components-gallery/tasks/embeddingtaskgenerator/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[tasks]\n            OCOL1[task]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph EmbeddingTaskGenerator\n        StepOutput[Output Columns: tasks, task, model_name]\n    end\n\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n
"},{"location":"components-gallery/tasks/embeddingtaskgenerator/#outputs","title":"Outputs","text":"
  • tasks (List[str]): the list of tasks generated by the LLM.

  • task (str): the task generated by the LLM if flatten_tasks=True.

  • model_name (str): the name of the model used to generate the tasks.

"},{"location":"components-gallery/tasks/embeddingtaskgenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/embeddingtaskgenerator/#generate-embedding-tasks-for-text-retrieval","title":"Generate embedding tasks for text retrieval","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = EmbeddingTaskGenerator(\n        category=\"text-retrieval\",\n        flatten_tasks=True,\n        llm=...,  # LLM instance\n    )\n\n    ...\n\n    task >> ...\n
"},{"location":"components-gallery/tasks/embeddingtaskgenerator/#references","title":"References","text":"
  • Improving Text Embeddings with Large Language Models
"},{"location":"components-gallery/llms/","title":"LLMs Gallery","text":"
  • AnthropicLLM

    Anthropic LLM implementation running the Async API client.

    AnthropicLLM

  • OpenAILLM

    OpenAI LLM implementation running the async API client.

    OpenAILLM

  • AnyscaleLLM

    Anyscale LLM implementation running the async API client of OpenAI.

    AnyscaleLLM

  • AzureOpenAILLM

    Azure OpenAI LLM implementation running the async API client.

    AzureOpenAILLM

  • TogetherLLM

    TogetherLLM LLM implementation running the async API client of OpenAI.

    TogetherLLM

  • ClientvLLM

    A client for the vLLM server implementing the OpenAI API specification.

    ClientvLLM

  • CohereLLM

    Cohere API implementation using the async client for concurrent text generation.

    CohereLLM

  • GroqLLM

    Groq API implementation using the async client for concurrent text generation.

    GroqLLM

  • InferenceEndpointsLLM

    InferenceEndpoints LLM implementation running the async API client.

    InferenceEndpointsLLM

  • LiteLLM

    LiteLLM implementation running the async API client.

    LiteLLM

  • MistralLLM

    Mistral LLM implementation running the async API client.

    MistralLLM

  • MixtureOfAgentsLLM

    Mixture-of-Agents implementation.

    MixtureOfAgentsLLM

  • OllamaLLM

    Ollama LLM implementation running the Async API client.

    OllamaLLM

  • VertexAILLM

    VertexAI LLM implementation running the async API clients for Gemini.

    VertexAILLM

  • TransformersLLM

    Hugging Face transformers library LLM implementation using the text generation pipeline.

    TransformersLLM

  • LlamaCppLLM

    llama.cpp LLM implementation running the Python bindings for the C++ code.

    LlamaCppLLM

  • vLLM

    vLLM library LLM implementation.

    vLLM

"},{"location":"components-gallery/llms/anthropicllm/","title":"AnthropicLLM","text":"

Anthropic LLM implementation running the Async API client.

"},{"location":"components-gallery/llms/anthropicllm/#attributes","title":"Attributes","text":"
  • model: the name of the model to use for the LLM e.g. \"claude-3-opus-20240229\", \"claude-3-sonnet-20240229\", etc. Available models can be checked here: Anthropic: Models overview.

  • api_key: the API key to authenticate the requests to the Anthropic API. If not provided, it will be read from ANTHROPIC_API_KEY environment variable.

  • base_url: the base URL to use for the Anthropic API. Defaults to None which means that https://api.anthropic.com will be used internally.

  • timeout: the maximum time in seconds to wait for a response. Defaults to 600.0.

  • max_retries: The maximum number of times to retry the request before failing. Defaults to 6.

  • http_client: if provided, an alternative HTTP client to use for calling Anthropic API. Defaults to None.

  • structured_output: a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

  • _api_key_env_var: the name of the environment variable to use for the API key. It is meant to be used internally.

  • _aclient: the AsyncAnthropic client to use for the Anthropic API. It is meant to be used internally. Set in the load method.

"},{"location":"components-gallery/llms/anthropicllm/#runtime-parameters","title":"Runtime Parameters","text":"
  • api_key: the API key to authenticate the requests to the Anthropic API. If not provided, it will be read from ANTHROPIC_API_KEY environment variable.

  • base_url: the base URL to use for the Anthropic API. Defaults to \"https://api.anthropic.com\".

  • timeout: the maximum time in seconds to wait for a response. Defaults to 600.0.

  • max_retries: the maximum number of times to retry the request before failing. Defaults to 6.

"},{"location":"components-gallery/llms/anthropicllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/anthropicllm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import AnthropicLLM\n\nllm = AnthropicLLM(model=\"claude-3-opus-20240229\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/anthropicllm/#generate-structured-data","title":"Generate structured data","text":"
from pydantic import BaseModel\nfrom distilabel.models.llms import AnthropicLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = AnthropicLLM(\n    model=\"claude-3-opus-20240229\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
"},{"location":"components-gallery/llms/openaillm/","title":"OpenAILLM","text":"

OpenAI LLM implementation running the async API client.

"},{"location":"components-gallery/llms/openaillm/#attributes","title":"Attributes","text":"
  • model: the model name to use for the LLM e.g. \"gpt-3.5-turbo\", \"gpt-4\", etc. Supported models can be found here.

  • base_url: the base URL to use for the OpenAI API requests. Defaults to None, which means that the value set for the environment variable OPENAI_BASE_URL will be used, or \"https://api.openai.com/v1\" if not set.

  • api_key: the API key to authenticate the requests to the OpenAI API. Defaults to None which means that the value set for the environment variable OPENAI_API_KEY will be used, or None if not set.

  • max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6.

  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.

  • structured_output: a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

"},{"location":"components-gallery/llms/openaillm/#runtime-parameters","title":"Runtime Parameters","text":"
  • base_url: the base URL to use for the OpenAI API requests. Defaults to None.

  • api_key: the API key to authenticate the requests to the OpenAI API. Defaults to None.

  • max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6.

  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.

"},{"location":"components-gallery/llms/openaillm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/openaillm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/openaillm/#generate-text-from-a-custom-endpoint-following-the-openai-api","title":"Generate text from a custom endpoint following the OpenAI API","text":"
from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(\n    model=\"prometheus-eval/prometheus-7b-v2.0\",\n    base_url=r\"http://localhost:8080/v1\"\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/openaillm/#generate-structured-data","title":"Generate structured data","text":"
from pydantic import BaseModel\nfrom distilabel.models.llms import OpenAILLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = OpenAILLM(\n    model=\"gpt-4-turbo\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
"},{"location":"components-gallery/llms/openaillm/#generate-with-batch-api-offline-batch-generation","title":"Generate with Batch API (offline batch generation)","text":"
from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(\n    model=\"gpt-3.5-turbo\",\n    use_offline_batch_generation=True,\n    offline_batch_generation_block_until_done=5,  # poll for results every 5 seconds\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n# [['Hello! How can I assist you today?']]\n
"},{"location":"components-gallery/llms/anyscalellm/","title":"AnyscaleLLM","text":"

Anyscale LLM implementation running the async API client of OpenAI.

"},{"location":"components-gallery/llms/anyscalellm/#attributes","title":"Attributes","text":"
  • model: the model name to use for the LLM, e.g., google/gemma-7b-it. See the supported models under the \"Text Generation -> Supported Models\" section here.

  • base_url: the base URL to use for the Anyscale API requests. Defaults to None, which means that the value set for the environment variable ANYSCALE_BASE_URL will be used, or \"https://api.endpoints.anyscale.com/v1\" if not set.

  • api_key: the API key to authenticate the requests to the Anyscale API. Defaults to None which means that the value set for the environment variable ANYSCALE_API_KEY will be used, or None if not set.

  • _api_key_env_var: the name of the environment variable to use for the API key. It is meant to be used internally.

"},{"location":"components-gallery/llms/anyscalellm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/anyscalellm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import AnyscaleLLM\n\nllm = AnyscaleLLM(model=\"google/gemma-7b-it\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/azureopenaillm/","title":"AzureOpenAILLM","text":"

Azure OpenAI LLM implementation running the async API client.

"},{"location":"components-gallery/llms/azureopenaillm/#attributes","title":"Attributes","text":"
  • model: the model name to use for the LLM i.e. the name of the Azure deployment.

  • base_url: the base URL to use for the Azure OpenAI API can be set with AZURE_OPENAI_ENDPOINT. Defaults to None which means that the value set for the environment variable AZURE_OPENAI_ENDPOINT will be used, or None if not set.

  • api_key: the API key to authenticate the requests to the Azure OpenAI API. Defaults to None which means that the value set for the environment variable AZURE_OPENAI_API_KEY will be used, or None if not set.

  • api_version: the API version to use for the Azure OpenAI API. Defaults to None which means that the value set for the environment variable OPENAI_API_VERSION will be used, or None if not set.

"},{"location":"components-gallery/llms/azureopenaillm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/azureopenaillm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import AzureOpenAILLM\n\nllm = AzureOpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/azureopenaillm/#generate-text-from-a-custom-endpoint-following-the-openai-api","title":"Generate text from a custom endpoint following the OpenAI API","text":"
from distilabel.models.llms import AzureOpenAILLM\n\nllm = AzureOpenAILLM(\n    model=\"prometheus-eval/prometheus-7b-v2.0\",\n    base_url=r\"http://localhost:8080/v1\"\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/azureopenaillm/#generate-structured-data","title":"Generate structured data","text":"
from pydantic import BaseModel\nfrom distilabel.models.llms import AzureOpenAILLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = AzureOpenAILLM(\n    model=\"gpt-4-turbo\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
"},{"location":"components-gallery/llms/togetherllm/","title":"TogetherLLM","text":"

TogetherLLM LLM implementation running the async API client of OpenAI.

"},{"location":"components-gallery/llms/togetherllm/#attributes","title":"Attributes","text":"
  • model: the model name to use for the LLM e.g. \"mistralai/Mixtral-8x7B-Instruct-v0.1\". Supported models can be found here.

  • base_url: the base URL to use for the Together API can be set with TOGETHER_BASE_URL. Defaults to None which means that the value set for the environment variable TOGETHER_BASE_URL will be used, or \"https://api.together.xyz/v1\" if not set.

  • api_key: the API key to authenticate the requests to the Together API. Defaults to None which means that the value set for the environment variable TOGETHER_API_KEY will be used, or None if not set.

  • _api_key_env_var: the name of the environment variable to use for the API key. It is meant to be used internally.

"},{"location":"components-gallery/llms/togetherllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/togetherllm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import TogetherLLM\n\nllm = TogetherLLM(model=\"mistralai/Mixtral-8x7B-Instruct-v0.1\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/clientvllm/","title":"ClientvLLM","text":"

A client for the vLLM server implementing the OpenAI API specification.

"},{"location":"components-gallery/llms/clientvllm/#attributes","title":"Attributes","text":"
  • base_url: the base URL of the vLLM server. Defaults to \"http://localhost:8000\".

  • max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6.

  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.

  • httpx_client_kwargs: extra kwargs that will be passed to the httpx.AsyncClient created to communicate with the vLLM server. Defaults to None.

  • tokenizer: the Hugging Face Hub repo id or path of the tokenizer that will be used to apply the chat template and tokenize the inputs before sending it to the server. Defaults to None.

  • tokenizer_revision: the revision of the tokenizer to load. Defaults to None.

  • _aclient: the httpx.AsyncClient used to communicate with the vLLM server. Defaults to None.

"},{"location":"components-gallery/llms/clientvllm/#runtime-parameters","title":"Runtime Parameters","text":"
  • base_url: the base url of the vLLM server. Defaults to \"http://localhost:8000\".

  • max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6.

  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.

  • httpx_client_kwargs: extra kwargs that will be passed to the httpx.AsyncClient created to communicate with the vLLM server. Defaults to None.

"},{"location":"components-gallery/llms/clientvllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/clientvllm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import ClientvLLM\n\nllm = ClientvLLM(\n    base_url=\"http://localhost:8000/v1\",\n    tokenizer=\"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n)\n\nllm.load()\n\nresults = llm.generate_outputs(\n    inputs=[[{\"role\": \"user\", \"content\": \"Hello, how are you?\"}]],\n    temperature=0.7,\n    top_p=1.0,\n    max_new_tokens=256,\n)\n# [\n#     [\n#         \"I'm functioning properly, thank you for asking. How can I assist you today?\",\n#         \"I'm doing well, thank you for asking. I'm a large language model, so I don't have feelings or emotions like humans do, but I'm here to help answer any questions or provide information you might need. How can I assist you today?\",\n#         \"I'm just a computer program, so I don't have feelings like humans do, but I'm functioning properly and ready to help you with any questions or tasks you have. What's on your mind?\"\n#     ]\n# ]\n
"},{"location":"components-gallery/llms/coherellm/","title":"CohereLLM","text":"

Cohere API implementation using the async client for concurrent text generation.

"},{"location":"components-gallery/llms/coherellm/#attributes","title":"Attributes","text":"
  • model: the name of the model from the Cohere API to use for the generation.

  • base_url: the base URL to use for the Cohere API requests. Defaults to \"https://api.cohere.ai/v1\".

  • api_key: the API key to authenticate the requests to the Cohere API. Defaults to the value of the COHERE_API_KEY environment variable.

  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.

  • client_name: the name of the client to use for the API requests. Defaults to \"distilabel\".

  • structured_output: a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

  • _ChatMessage: the ChatMessage class from the cohere package.

  • _aclient: the AsyncClient client from the cohere package.

"},{"location":"components-gallery/llms/coherellm/#runtime-parameters","title":"Runtime Parameters","text":"
  • base_url: the base URL to use for the Cohere API requests. Defaults to \"https://api.cohere.ai/v1\".

  • api_key: the API key to authenticate the requests to the Cohere API. Defaults to the value of the COHERE_API_KEY environment variable.

  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.

  • client_name: the name of the client to use for the API requests. Defaults to \"distilabel\".

"},{"location":"components-gallery/llms/coherellm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/coherellm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import CohereLLM\n\nllm = CohereLLM(model=\"CohereForAI/c4ai-command-r-plus\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n
"},{"location":"components-gallery/llms/groqllm/","title":"GroqLLM","text":"

Groq API implementation using the async client for concurrent text generation.

"},{"location":"components-gallery/llms/groqllm/#attributes","title":"Attributes","text":"
  • model: the name of the model from the Groq API to use for the generation.

  • base_url: the base URL to use for the Groq API requests. Defaults to \"https://api.groq.com\".

  • api_key: the API key to authenticate the requests to the Groq API. Defaults to the value of the GROQ_API_KEY environment variable.

  • max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 2.

  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.

  • structured_output: a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

  • _api_key_env_var: the name of the environment variable to use for the API key.

  • _aclient: the AsyncGroq client from the groq package.

"},{"location":"components-gallery/llms/groqllm/#runtime-parameters","title":"Runtime Parameters","text":"
  • base_url: the base URL to use for the Groq API requests. Defaults to \"https://api.groq.com\".

  • api_key: the API key to authenticate the requests to the Groq API. Defaults to the value of the GROQ_API_KEY environment variable.

  • max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 2.

  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.

"},{"location":"components-gallery/llms/groqllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/groqllm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import GroqLLM\n\nllm = GroqLLM(model=\"llama3-70b-8192\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n
"},{"location":"components-gallery/llms/inferenceendpointsllm/","title":"InferenceEndpointsLLM","text":"

InferenceEndpoints LLM implementation running the async API client.

This LLM will internally use huggingface_hub.AsyncInferenceClient.

"},{"location":"components-gallery/llms/inferenceendpointsllm/#attributes","title":"Attributes","text":"
  • model_id: the model ID to use for the LLM as available in the Hugging Face Hub, which will be used to resolve the base URL for the serverless Inference Endpoints API requests. Defaults to None.

  • endpoint_name: the name of the Inference Endpoint to use for the LLM. Defaults to None.

  • endpoint_namespace: the namespace of the Inference Endpoint to use for the LLM. Defaults to None.

  • base_url: the base URL to use for the Inference Endpoints API requests.

  • api_key: the API key to authenticate the requests to the Inference Endpoints API.

  • tokenizer_id: the tokenizer ID to use for the LLM as available in the Hugging Face Hub. Defaults to None, but defining one is recommended to properly format the prompt.

  • model_display_name: the model display name to use for the LLM. Defaults to None.

  • use_magpie_template: a flag used to enable/disable applying the Magpie pre-query template. Defaults to False.

  • magpie_pre_query_template: the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None.

  • structured_output: a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput. Defaults to None.

"},{"location":"components-gallery/llms/inferenceendpointsllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/inferenceendpointsllm/#free-serverless-inference-api-set-the-input_batch_size-of-the-task-that-uses-this-to-avoid-model-is-overloaded","title":"Free serverless Inference API, set the input_batch_size of the Task that uses this to avoid Model is overloaded","text":"
from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/inferenceendpointsllm/#dedicated-inference-endpoints","title":"Dedicated Inference Endpoints","text":"
from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    endpoint_name=\"<ENDPOINT_NAME>\",\n    api_key=\"<HF_API_KEY>\",\n    endpoint_namespace=\"<USER|ORG>\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/inferenceendpointsllm/#dedicated-inference-endpoints-or-tgi","title":"Dedicated Inference Endpoints or TGI","text":"
from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    api_key=\"<HF_API_KEY>\",\n    base_url=\"<BASE_URL>\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/inferenceendpointsllm/#generate-structured-data","title":"Generate structured data","text":"
from pydantic import BaseModel\nfrom distilabel.models.llms import InferenceEndpointsLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    api_key=\"api.key\",\n    structured_output={\"format\": \"json\", \"schema\": User.model_json_schema()}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the Tour De France\"}]])\n
"},{"location":"components-gallery/llms/litellm/","title":"LiteLLM","text":"

LiteLLM implementation running the async API client.

"},{"location":"components-gallery/llms/litellm/#attributes","title":"Attributes","text":"
  • model: the model name to use for the LLM e.g. \"gpt-3.5-turbo\" or \"mistral/mistral-large\", etc.

  • verbose: whether to log the LiteLLM client's logs. Defaults to False.

  • structured_output: a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

"},{"location":"components-gallery/llms/litellm/#runtime-parameters","title":"Runtime Parameters","text":"
  • verbose: whether to log the LiteLLM client's logs. Defaults to False.
"},{"location":"components-gallery/llms/litellm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/litellm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import LiteLLM\n\nllm = LiteLLM(model=\"gpt-3.5-turbo\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n
"},{"location":"components-gallery/llms/mistralllm/","title":"MistralLLM","text":"

Mistral LLM implementation running the async API client.

"},{"location":"components-gallery/llms/mistralllm/#attributes","title":"Attributes","text":"
  • model: the model name to use for the LLM e.g. \"mistral-tiny\", \"mistral-large\", etc.

  • endpoint: the endpoint to use for the Mistral API. Defaults to \"https://api.mistral.ai\".

  • api_key: the API key to authenticate the requests to the Mistral API. Defaults to None which means that the value set for the environment variable MISTRAL_API_KEY will be used, or None if not set.

  • max_retries: the maximum number of retries to attempt when a request fails. Defaults to 5.

  • timeout: the maximum time in seconds to wait for a response. Defaults to 120.

  • max_concurrent_requests: the maximum number of concurrent requests to send. Defaults to 64.

  • structured_output: a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

  • _api_key_env_var: the name of the environment variable to use for the API key. It is meant to be used internally.

  • _aclient: the Mistral client to use for the Mistral API. It is meant to be used internally. Set in the load method.

"},{"location":"components-gallery/llms/mistralllm/#runtime-parameters","title":"Runtime Parameters","text":"
  • api_key: the API key to authenticate the requests to the Mistral API.

  • max_retries: the maximum number of retries to attempt when a request fails. Defaults to 5.

  • timeout: the maximum time in seconds to wait for a response. Defaults to 120.

  • max_concurrent_requests: the maximum number of concurrent requests to send. Defaults to 64.

"},{"location":"components-gallery/llms/mistralllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/mistralllm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import MistralLLM\n\nllm = MistralLLM(model=\"open-mixtral-8x22b\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n
"},{"location":"components-gallery/llms/mixtureofagentsllm/","title":"MixtureOfAgentsLLM","text":"

Mixture-of-Agents implementation.

An LLM class that leverages LLMs' collective strengths to generate a response, as described in the \"Mixture-of-Agents Enhances Large Language Model Capabilities\" paper. There is a list of LLMs proposing/generating outputs that LLMs from the next round/layer can use as auxiliary information. Finally, there is an LLM that aggregates the outputs to generate the final response.

"},{"location":"components-gallery/llms/mixtureofagentsllm/#attributes","title":"Attributes","text":"
  • aggregator_llm: The LLM that aggregates the outputs of the proposer LLMs.

  • proposers_llms: The list of LLMs that propose outputs to be aggregated.

  • rounds: The number of layers or rounds that the proposers_llms will generate outputs. Defaults to 1.

"},{"location":"components-gallery/llms/mixtureofagentsllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/mixtureofagentsllm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import MixtureOfAgentsLLM, InferenceEndpointsLLM\n\nllm = MixtureOfAgentsLLM(\n    aggregator_llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    ),\n    proposers_llms=[\n        InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        ),\n        InferenceEndpointsLLM(\n            model_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n            tokenizer_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n        ),\n        InferenceEndpointsLLM(\n            model_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n            tokenizer_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n        ),\n    ],\n    rounds=2,\n)\n\nllm.load()\n\noutput = llm.generate_outputs(\n    inputs=[\n        [\n            {\n                \"role\": \"user\",\n                \"content\": \"My favorite witty review of The Rings of Power series is this: Input:\",\n            }\n        ]\n    ]\n)\n
"},{"location":"components-gallery/llms/mixtureofagentsllm/#references","title":"References","text":"
  • Mixture-of-Agents Enhances Large Language Model Capabilities
"},{"location":"components-gallery/llms/ollamallm/","title":"OllamaLLM","text":"

Ollama LLM implementation running the Async API client.

"},{"location":"components-gallery/llms/ollamallm/#attributes","title":"Attributes","text":"
  • model: the model name to use for the LLM e.g. \"notus\".

  • host: the Ollama server host.

  • timeout: the timeout for the LLM. Defaults to 120.

  • _aclient: the AsyncClient to use for the Ollama API. It is meant to be used internally. Set in the load method.

"},{"location":"components-gallery/llms/ollamallm/#runtime-parameters","title":"Runtime Parameters","text":"
  • host: the Ollama server host.

  • timeout: the client timeout for the Ollama API. Defaults to 120.

"},{"location":"components-gallery/llms/ollamallm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/ollamallm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import OllamaLLM\n\nllm = OllamaLLM(model=\"llama3\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/vertexaillm/","title":"VertexAILLM","text":"

VertexAI LLM implementation running the async API clients for Gemini.

  • Gemini API: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini

    To use the VertexAILLM, it is necessary to have configured the Google Cloud authentication using one of these methods:

    • Setting GOOGLE_CLOUD_CREDENTIALS environment variable
    • Using gcloud auth application-default login command
    • Using vertexai.init function from the google-cloud-aiplatform library
"},{"location":"components-gallery/llms/vertexaillm/#attributes","title":"Attributes","text":"
  • model: the model name to use for the LLM e.g. \"gemini-1.0-pro\". Supported models.

  • _aclient: the GenerativeModel to use for the Vertex AI Gemini API. It is meant to be used internally. Set in the load method.

"},{"location":"components-gallery/llms/vertexaillm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/vertexaillm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import VertexAILLM\n\nllm = VertexAILLM(model=\"gemini-1.5-pro\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/transformersllm/","title":"TransformersLLM","text":"

Hugging Face transformers library LLM implementation using the text generation

pipeline.

"},{"location":"components-gallery/llms/transformersllm/#attributes","title":"Attributes","text":"
  • model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

  • revision: if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\".

  • torch_dtype: the torch dtype to use for the model e.g. \"float16\", \"float32\", etc. Defaults to \"auto\".

  • trust_remote_code: whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False.

  • model_kwargs: additional dictionary of keyword arguments that will be passed to the from_pretrained method of the model.

  • tokenizer: the tokenizer Hugging Face Hub repo id or a path to a directory containing the tokenizer config files. If not provided, the one associated to the model will be used. Defaults to None.

  • use_fast: whether to use a fast tokenizer or not. Defaults to True.

  • chat_template: a chat template that will be used to build the prompts before sending them to the model. If not provided, the chat template defined in the tokenizer config will be used. If not provided and the tokenizer doesn't have a chat template, then ChatML template will be used. Defaults to None.

  • device: the name or index of the device where the model will be loaded. Defaults to None.

  • device_map: a dictionary mapping each layer of the model to a device, or a mode like \"sequential\" or \"auto\". Defaults to None.

  • token: the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment or huggingface_hub package local configuration will be used. Defaults to None.

  • structured_output: a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput. Defaults to None.

  • use_magpie_template: a flag used to enable/disable applying the Magpie pre-query template. Defaults to False.

  • magpie_pre_query_template: the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None.

"},{"location":"components-gallery/llms/transformersllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/transformersllm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import TransformersLLM\n\nllm = TransformersLLM(model=\"microsoft/Phi-3-mini-4k-instruct\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/llamacppllm/","title":"LlamaCppLLM","text":"

llama.cpp LLM implementation running the Python bindings for the C++ code.

"},{"location":"components-gallery/llms/llamacppllm/#attributes","title":"Attributes","text":"
  • model_path: contains the path to the GGUF quantized model, compatible with the installed version of the llama.cpp Python bindings.

  • n_gpu_layers: the number of layers to use for the GPU. Defaults to -1, meaning that the available GPU device will be used.

  • chat_format: the chat format to use for the model. Defaults to None, which means the Llama format will be used.

  • n_ctx: the context size to use for the model. Defaults to 512.

  • n_batch: the prompt processing maximum batch size to use for the model. Defaults to 512.

  • seed: random seed to use for the generation. Defaults to 4294967295.

  • verbose: whether to print verbose output. Defaults to False.

  • structured_output: a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput. Defaults to None.

  • extra_kwargs: additional dictionary of keyword arguments that will be passed to the Llama class of llama_cpp library. Defaults to {}.

  • _model: the Llama model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method.

"},{"location":"components-gallery/llms/llamacppllm/#runtime-parameters","title":"Runtime Parameters","text":"
  • model_path: the path to the GGUF quantized model.

  • n_gpu_layers: the number of layers to use for the GPU. Defaults to -1.

  • chat_format: the chat format to use for the model. Defaults to None.

  • verbose: whether to print verbose output. Defaults to False.

  • extra_kwargs: additional dictionary of keyword arguments that will be passed to the Llama class of llama_cpp library. Defaults to {}.

"},{"location":"components-gallery/llms/llamacppllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/llamacppllm/#generate-text","title":"Generate text","text":"
from pathlib import Path\nfrom distilabel.models.llms import LlamaCppLLM\n\n# You can follow along this example downloading the following model running the following\n# command in the terminal, that will download the model to the `Downloads` folder:\n# curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nllm = LlamaCppLLM(\n    model_path=str(Path.home() / model_path),\n    n_gpu_layers=-1,  # To use the GPU if available\n    n_ctx=1024,       # Set the context size\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/llamacppllm/#generate-structured-data","title":"Generate structured data","text":"
from pathlib import Path\nfrom pydantic import BaseModel\nfrom distilabel.models.llms import LlamaCppLLM\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = LlamaCppLLM(\n    model_path=str(Path.home() / model_path),  # type: ignore\n    n_gpu_layers=-1,\n    n_ctx=1024,\n    structured_output={\"format\": \"json\", \"schema\": User},\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
"},{"location":"components-gallery/llms/llamacppllm/#references","title":"References","text":"
  • llama.cpp

  • llama-cpp-python

"},{"location":"components-gallery/llms/vllm/","title":"vLLM","text":"

vLLM library LLM implementation.

"},{"location":"components-gallery/llms/vllm/#attributes","title":"Attributes","text":"
  • model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

  • dtype: the data type to use for the model. Defaults to auto.

  • trust_remote_code: whether to trust the remote code when loading the model. Defaults to False.

  • quantization: the quantization mode to use for the model. Defaults to None.

  • revision: the revision of the model to load. Defaults to None.

  • tokenizer: the tokenizer Hugging Face Hub repo id or a path to a directory containing the tokenizer files. If not provided, the tokenizer will be loaded from the model directory. Defaults to None.

  • tokenizer_mode: the mode to use for the tokenizer. Defaults to auto.

  • tokenizer_revision: the revision of the tokenizer to load. Defaults to None.

  • skip_tokenizer_init: whether to skip the initialization of the tokenizer. Defaults to False.

  • chat_template: a chat template that will be used to build the prompts before sending them to the model. If not provided, the chat template defined in the tokenizer config will be used. If not provided and the tokenizer doesn't have a chat template, then ChatML template will be used. Defaults to None.

  • structured_output: a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput. Defaults to None.

  • seed: the seed to use for the random number generator. Defaults to 0.

  • extra_kwargs: additional dictionary of keyword arguments that will be passed to the LLM class of vllm library. Defaults to {}.

  • _model: the vLLM model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method.

  • _tokenizer: the tokenizer instance used to format the prompt before passing it to the LLM. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method.

  • use_magpie_template: a flag used to enable/disable applying the Magpie pre-query template. Defaults to False.

  • magpie_pre_query_template: the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None.

"},{"location":"components-gallery/llms/vllm/#runtime-parameters","title":"Runtime Parameters","text":"
  • extra_kwargs: additional dictionary of keyword arguments that will be passed to the LLM class of vllm library.
"},{"location":"components-gallery/llms/vllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/vllm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import vLLM\n\n# You can pass a custom chat_template to the model\nllm = vLLM(\n    model=\"prometheus-eval/prometheus-7b-v2.0\",\n    chat_template=\"[INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST]\",\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/vllm/#generate-structured-data","title":"Generate structured data","text":"
from pathlib import Path\nfrom pydantic import BaseModel\nfrom distilabel.models.llms import vLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = vLLM(\n    model=\"prometheus-eval/prometheus-7b-v2.0\",\n    structured_output={\"format\": \"json\", \"schema\": User},\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
"},{"location":"components-gallery/embeddings/","title":"Embeddings Gallery","text":"
  • SentenceTransformerEmbeddings

    sentence-transformers library implementation for embedding generation.

    SentenceTransformerEmbeddings

  • vLLMEmbeddings

    vllm library implementation for embedding generation.

    vLLMEmbeddings

"},{"location":"components-gallery/embeddings/sentencetransformerembeddings/","title":"SentenceTransformerEmbeddings","text":"

sentence-transformers library implementation for embedding generation.

"},{"location":"components-gallery/embeddings/sentencetransformerembeddings/#attributes","title":"Attributes","text":"
  • model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

  • device: the name of the device used to load the model e.g. \"cuda\", \"mps\", etc. Defaults to None.

  • prompts: a dictionary containing prompts to be used with the model. Defaults to None.

  • default_prompt_name: the default prompt (in prompts) that will be applied to the inputs. If not provided, then no prompt will be used. Defaults to None.

  • trust_remote_code: whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False.

  • revision: if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\".

  • token: the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment or huggingface_hub package local configuration will be used. Defaults to None.

  • truncate_dim: the dimension to truncate the sentence embeddings. Defaults to None.

  • model_kwargs: extra kwargs that will be passed to the Hugging Face transformers model class. Defaults to None.

  • tokenizer_kwargs: extra kwargs that will be passed to the Hugging Face transformers tokenizer class. Defaults to None.

  • config_kwargs: extra kwargs that will be passed to the Hugging Face transformers configuration class. Defaults to None.

  • precision: the dtype of the resulting embeddings. Defaults to \"float32\".

  • normalize_embeddings: whether to normalize the embeddings so they have a length of 1. Defaults to None.

"},{"location":"components-gallery/embeddings/sentencetransformerembeddings/#examples","title":"Examples","text":""},{"location":"components-gallery/embeddings/sentencetransformerembeddings/#generating-sentence-embeddings","title":"Generating sentence embeddings","text":"
from distilabel.models import SentenceTransformerEmbeddings\n\nembeddings = SentenceTransformerEmbeddings(model=\"mixedbread-ai/mxbai-embed-large-v1\")\n\nembeddings.load()\n\nresults = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n# [\n#   [-0.05447685346007347, -0.01623094454407692, ...],\n#   [4.4889533455716446e-05, 0.044016145169734955, ...],\n# ]\n
"},{"location":"components-gallery/embeddings/vllmembeddings/","title":"vLLMEmbeddings","text":"

vllm library implementation for embedding generation.

"},{"location":"components-gallery/embeddings/vllmembeddings/#attributes","title":"Attributes","text":"
  • model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

  • dtype: the data type to use for the model. Defaults to auto.

  • trust_remote_code: whether to trust the remote code when loading the model. Defaults to False.

  • quantization: the quantization mode to use for the model. Defaults to None.

  • revision: the revision of the model to load. Defaults to None.

  • enforce_eager: whether to enforce eager execution. Defaults to True.

  • seed: the seed to use for the random number generator. Defaults to 0.

  • extra_kwargs: additional dictionary of keyword arguments that will be passed to the LLM class of vllm library. Defaults to {}.

  • _model: the vLLM model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method.

"},{"location":"components-gallery/embeddings/vllmembeddings/#examples","title":"Examples","text":""},{"location":"components-gallery/embeddings/vllmembeddings/#generating-sentence-embeddings","title":"Generating sentence embeddings","text":"
from distilabel.models import vLLMEmbeddings\n\nembeddings = vLLMEmbeddings(model=\"intfloat/e5-mistral-7b-instruct\")\n\nembeddings.load()\n\nresults = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n# [\n#   [-0.05447685346007347, -0.01623094454407692, ...],\n#   [4.4889533455716446e-05, 0.044016145169734955, ...],\n# ]\n
"},{"location":"components-gallery/embeddings/vllmembeddings/#references","title":"References","text":"
  • Offline inference embeddings
"}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Distilabel","text":"Synthesize data for AI and add feedback on the fly!

Distilabel is the framework for synthetic data and AI feedback for engineers who need fast, reliable and scalable pipelines based on verified research papers.

  • Get started in 5 minutes!

    Install distilabel with pip and run your first Pipeline to generate and evaluate synthetic data.

    Quickstart

  • How-to guides

    Get familiar with the basics of distilabel. Learn how to define steps, tasks and llms and run your Pipeline.

    Learn more

"},{"location":"#why-use-distilabel","title":"Why use distilabel?","text":"

Distilabel can be used for generating synthetic data and AI feedback for a wide variety of projects including traditional predictive NLP (classification, extraction, etc.), or generative and large language model scenarios (instruction following, dialogue generation, judging etc.). Distilabel's programmatic approach allows you to build scalable pipelines for data generation and AI feedback. The goal of distilabel is to accelerate your AI development by quickly generating high-quality, diverse datasets based on verified research methodologies for generating and judging with AI feedback.

Improve your AI output quality through data quality

Compute is expensive and output quality is important. We help you focus on data quality, which tackles the root cause of both of these problems at once. Distilabel helps you to synthesize and judge data to let you spend your valuable time achieving and keeping high-quality standards for your synthetic data.

Take control of your data and models

Ownership of data for fine-tuning your own LLMs is not easy but distilabel can help you to get started. We integrate AI feedback from any LLM provider out there using one unified API.

Improve efficiency by quickly iterating on the right data and models

Synthesize and judge data with the latest research papers while ensuring flexibility, scalability and fault tolerance, so you can focus on improving your data and training your models.

"},{"location":"#what-do-people-build-with-distilabel","title":"What do people build with distilabel?","text":"

The Argilla community uses distilabel to create amazing datasets and models.

  • The 1M OpenHermesPreference is a dataset of ~1 million AI preferences derived from teknium/OpenHermes-2.5. It shows how we can use Distilabel to synthesize data on an immense scale.
  • Our distilabeled Intel Orca DPO dataset and the improved OpenHermes model show how we improve model performance by filtering out 50% of the original dataset through AI feedback.
  • The haiku DPO data outlines how anyone can create a dataset for a specific task and use the latest research papers to improve the quality of the dataset.
"},{"location":"api/cli/","title":"Command Line Interface (CLI)","text":"

This section contains the API reference for the CLI. For more information on how to use the CLI, see Tutorial - CLI.

"},{"location":"api/cli/#utility-functions-for-the-distilabel-pipeline-sub-commands","title":"Utility functions for the distilabel pipeline sub-commands","text":"

Here are some utility functions to help working with the pipelines in the console.

"},{"location":"api/cli/#distilabel.cli.pipeline.utils","title":"utils","text":""},{"location":"api/cli/#distilabel.cli.pipeline.utils.parse_runtime_parameters","title":"parse_runtime_parameters(params)","text":"

Parses the runtime parameters from the CLI format to the format expected by the Pipeline.run method. The CLI format is a list of tuples, where the first element is a list of keys and the second element is the value.

Parameters:

Name Type Description Default params List[Tuple[List[str], str]]

A list of tuples, where the first element is a list of keys and the second element is the value.

required

Returns:

Type Description Dict[str, Dict[str, Any]]

A dictionary with the runtime parameters in the format expected by the

Dict[str, Dict[str, Any]]

Pipeline.run method.

Source code in src/distilabel/cli/pipeline/utils.py
def parse_runtime_parameters(\n    params: List[Tuple[List[str], str]],\n) -> Dict[str, Dict[str, Any]]:\n    \"\"\"Parses the runtime parameters from the CLI format to the format expected by the\n    `Pipeline.run` method. The CLI format is a list of tuples, where the first element is\n    a list of keys and the second element is the value.\n\n    Args:\n        params: A list of tuples, where the first element is a list of keys and the\n            second element is the value.\n\n    Returns:\n        A dictionary with the runtime parameters in the format expected by the\n        `Pipeline.run` method.\n    \"\"\"\n    runtime_params = {}\n    for keys, value in params:\n        current = runtime_params\n        for i, key in enumerate(keys):\n            if i == len(keys) - 1:\n                current[key] = value\n            else:\n                current = current.setdefault(key, {})\n    return runtime_params\n
"},{"location":"api/cli/#distilabel.cli.pipeline.utils.valid_http_url","title":"valid_http_url(url)","text":"

Check if the URL is a valid HTTP URL.

Parameters:

Name Type Description Default url str

The URL to check.

required

Returns:

Type Description bool

True, if the URL is a valid HTTP URL. False, otherwise.

Source code in src/distilabel/cli/pipeline/utils.py
def valid_http_url(url: str) -> bool:\n    \"\"\"Check if the URL is a valid HTTP URL.\n\n    Args:\n        url: The URL to check.\n\n    Returns:\n        `True`, if the URL is a valid HTTP URL. `False`, otherwise.\n    \"\"\"\n    try:\n        TypeAdapter(HttpUrl).validate_python(url)  # type: ignore\n    except ValidationError:\n        return False\n\n    return True\n
"},{"location":"api/cli/#distilabel.cli.pipeline.utils.get_config_from_url","title":"get_config_from_url(url)","text":"

Loads the pipeline configuration from a URL pointing to a JSON or YAML file.

Parameters:

Name Type Description Default url str

The URL pointing to the pipeline configuration file.

required

Returns:

Type Description Dict[str, Any]

The pipeline configuration as a dictionary.

Raises:

Type Description ValueError

If the file format is not supported.

Source code in src/distilabel/cli/pipeline/utils.py
def get_config_from_url(url: str) -> Dict[str, Any]:\n    \"\"\"Loads the pipeline configuration from a URL pointing to a JSON or YAML file.\n\n    Args:\n        url: The URL pointing to the pipeline configuration file.\n\n    Returns:\n        The pipeline configuration as a dictionary.\n\n    Raises:\n        ValueError: If the file format is not supported.\n    \"\"\"\n    if not url.endswith((\".json\", \".yaml\", \".yml\")):\n        raise DistilabelUserError(\n            f\"Unsupported file format for '{url}'. Only JSON and YAML are supported\",\n            page=\"sections/how_to_guides/basic/pipeline/?h=seriali#serializing-the-pipeline\",\n        )\n    response = _download_remote_file(url)\n\n    if url.endswith((\".yaml\", \".yml\")):\n        content = response.content.decode(\"utf-8\")\n        return yaml.safe_load(content)\n\n    return response.json()\n
"},{"location":"api/cli/#distilabel.cli.pipeline.utils.get_pipeline_from_url","title":"get_pipeline_from_url(url, pipeline_name='pipeline')","text":"

Downloads the file to the current working directory and loads the pipeline object from a python script.

Parameters:

Name Type Description Default url str

The URL pointing to the python script with the pipeline definition.

required pipeline_name str

The name of the pipeline in the script. I.e: with Pipeline(...) as pipeline:....

'pipeline'

Returns:

Type Description BasePipeline

The pipeline instantiated.

Raises:

Type Description ValueError

If the file format is not supported.

Source code in src/distilabel/cli/pipeline/utils.py
def get_pipeline_from_url(url: str, pipeline_name: str = \"pipeline\") -> \"BasePipeline\":\n    \"\"\"Downloads the file to the current working directory and loads the pipeline object\n    from a python script.\n\n    Args:\n        url: The URL pointing to the python script with the pipeline definition.\n        pipeline_name: The name of the pipeline in the script.\n            I.e: `with Pipeline(...) as pipeline:...`.\n\n    Returns:\n        The pipeline instantiated.\n\n    Raises:\n        ValueError: If the file format is not supported.\n    \"\"\"\n    if not url.endswith(\".py\"):\n        raise DistilabelUserError(\n            f\"Unsupported file format for '{url}'. It must be a python file.\",\n            page=\"sections/how_to_guides/advanced/cli/#distilabel-pipeline-run\",\n        )\n    response = _download_remote_file(url)\n\n    content = response.content.decode(\"utf-8\")\n    script_local = Path.cwd() / Path(url).name\n    script_local.write_text(content)\n\n    # Add the current working directory to sys.path\n    sys.path.insert(0, os.getcwd())\n    module = importlib.import_module(str(Path(url).stem))\n    pipeline = getattr(module, pipeline_name, None)\n    if not pipeline:\n        raise ImportError(\n            f\"The script must contain an object with the pipeline named: '{pipeline_name}' that can be imported\"\n        )\n\n    return pipeline\n
"},{"location":"api/cli/#distilabel.cli.pipeline.utils.get_pipeline","title":"get_pipeline(config_or_script, pipeline_name='pipeline')","text":"

Get a pipeline from a configuration file or a remote python script.

Parameters:

Name Type Description Default config_or_script str

The path or URL to the pipeline configuration file or URL to a python script.

required pipeline_name str

The name of the pipeline in the script. I.e: with Pipeline(...) as pipeline:....

'pipeline'

Returns:

Type Description BasePipeline

The pipeline.

Raises:

Type Description ValueError

If the file format is not supported.

FileNotFoundError

If the configuration file does not exist.

Source code in src/distilabel/cli/pipeline/utils.py
def get_pipeline(\n    config_or_script: str, pipeline_name: str = \"pipeline\"\n) -> \"BasePipeline\":\n    \"\"\"Get a pipeline from a configuration file or a remote python script.\n\n    Args:\n        config_or_script: The path or URL to the pipeline configuration file\n            or URL to a python script.\n        pipeline_name: The name of the pipeline in the script.\n            I.e: `with Pipeline(...) as pipeline:...`.\n\n    Returns:\n        The pipeline.\n\n    Raises:\n        ValueError: If the file format is not supported.\n        FileNotFoundError: If the configuration file does not exist.\n    \"\"\"\n    config = script = None\n    if config_or_script.endswith((\".json\", \".yaml\", \".yml\")):\n        config = config_or_script\n    elif config_or_script.endswith(\".py\"):\n        script = config_or_script\n    else:\n        raise DistilabelUserError(\n            \"The file must be a valid config file or python script with a pipeline.\",\n            page=\"sections/how_to_guides/advanced/cli/#distilabel-pipeline-run\",\n        )\n\n    if valid_http_url(config_or_script):\n        if config:\n            data = get_config_from_url(config)\n            return Pipeline.from_dict(data)\n        return get_pipeline_from_url(script, pipeline_name=pipeline_name)\n\n    if not config:\n        raise ValueError(\n            f\"To run a pipeline from a python script, run it as `python {script}`\"\n        )\n\n    if Path(config).is_file():\n        return Pipeline.from_file(config)\n\n    raise FileNotFoundError(f\"File '{config_or_script}' does not exist.\")\n
"},{"location":"api/cli/#distilabel.cli.pipeline.utils.display_pipeline_information","title":"display_pipeline_information(pipeline)","text":"

Displays the pipeline information to the console.

Parameters:

Name Type Description Default pipeline BasePipeline

The pipeline.

required Source code in src/distilabel/cli/pipeline/utils.py
def display_pipeline_information(pipeline: \"BasePipeline\") -> None:\n    \"\"\"Displays the pipeline information to the console.\n\n    Args:\n        pipeline: The pipeline.\n    \"\"\"\n    from rich.console import Console\n\n    Console().print(_build_pipeline_panel(pipeline))\n
"},{"location":"api/distiset/","title":"Distiset","text":"

This section contains the API reference for the Distiset. For more information on how to use the Distiset, see Tutorial - Distiset.

"},{"location":"api/distiset/#distilabel.distiset.Distiset","title":"Distiset","text":"

Bases: dict

Convenient wrapper around datasets.Dataset to push to the Hugging Face Hub.

It's a dictionary where the keys correspond to the different leaf_steps from the internal DAG and the values are datasets.Dataset.

Attributes:

Name Type Description _pipeline_path Optional[Path]

Optional path to the pipeline.yaml file that generated the dataset. Defaults to None.

_artifacts_path Optional[Path]

Optional path to the directory containing the generated artifacts by the pipeline steps. Defaults to None.

_log_filename_path Optional[Path]

Optional path to the pipeline.log file that was written by the pipeline. Defaults to None.

_citations Optional[List[str]]

Optional list containing citations that will be included in the dataset card. Defaults to None.

Source code in src/distilabel/distiset.py
class Distiset(dict):\n    \"\"\"Convenient wrapper around `datasets.Dataset` to push to the Hugging Face Hub.\n\n    It's a dictionary where the keys correspond to the different leaf_steps from the internal\n    `DAG` and the values are `datasets.Dataset`.\n\n    Attributes:\n        _pipeline_path: Optional path to the `pipeline.yaml` file that generated the dataset.\n            Defaults to `None`.\n        _artifacts_path: Optional path to the directory containing the generated artifacts\n            by the pipeline steps. Defaults to `None`.\n        _log_filename_path: Optional path to the `pipeline.log` file that generated was written\n            by the pipeline. Defaults to `None`.\n        _citations: Optional list containing citations that will be included in the dataset\n            card. Defaults to `None`.\n    \"\"\"\n\n    _pipeline_path: Optional[Path] = None\n    _artifacts_path: Optional[Path] = None\n    _log_filename_path: Optional[Path] = None\n    _citations: Optional[List[str]] = None\n\n    def push_to_hub(\n        self,\n        repo_id: str,\n        private: bool = False,\n        token: Optional[str] = None,\n        generate_card: bool = True,\n        include_script: bool = False,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Pushes the `Distiset` to the Hugging Face Hub, each dataset will be pushed as a different configuration\n        corresponding to the leaf step that generated it.\n\n        Args:\n            repo_id:\n                The ID of the repository to push to in the following format: `<user>/<dataset_name>` or\n                `<org>/<dataset_name>`. Also accepts `<dataset_name>`, which will default to the namespace\n                of the logged-in user.\n            private:\n                Whether the dataset repository should be set to private or not. 
Only affects repository creation:\n                a repository that already exists will not be affected by that parameter.\n            token:\n                An optional authentication token for the Hugging Face Hub. If no token is passed, will default\n                to the token saved locally when logging in with `huggingface-cli login`. Will raise an error\n                if no token is passed and the user is not logged-in.\n            generate_card:\n                Whether to generate a dataset card or not. Defaults to True.\n            include_script:\n                Whether you want to push the pipeline script to the hugging face hub to share it.\n                If set to True, the name of the script that was run to create the distiset will be\n                automatically determined, and that will be the name of the file uploaded to your\n                repository. Take into account, this operation only makes sense for a distiset obtained\n                from calling `Pipeline.run()` method. 
Defaults to False.\n            **kwargs:\n                Additional keyword arguments to pass to the `push_to_hub` method of the `datasets.Dataset` object.\n\n        Raises:\n            ValueError: If no token is provided and couldn't be retrieved automatically.\n        \"\"\"\n        script_filename = sys.argv[0]\n        filename_py = (\n            script_filename.split(\"/\")[-1]\n            if \"/\" in script_filename\n            else script_filename\n        )\n        script_path = Path.cwd() / script_filename\n\n        if token is None:\n            token = get_hf_token(self.__class__.__name__, \"token\")\n\n        for name, dataset in self.items():\n            dataset.push_to_hub(\n                repo_id=repo_id,\n                config_name=name,\n                private=private,\n                token=token,\n                **kwargs,\n            )\n\n        if self.artifacts_path:\n            upload_folder(\n                repo_id=repo_id,\n                folder_path=self.artifacts_path,\n                path_in_repo=\"artifacts\",\n                token=token,\n                repo_type=\"dataset\",\n                commit_message=\"Include pipeline artifacts\",\n            )\n\n        if include_script and script_path.exists():\n            upload_file(\n                path_or_fileobj=script_path,\n                path_in_repo=filename_py,\n                repo_id=repo_id,\n                repo_type=\"dataset\",\n                token=token,\n                commit_message=\"Include pipeline script\",\n            )\n\n        if generate_card:\n            self._generate_card(\n                repo_id, token, include_script=include_script, filename_py=filename_py\n            )\n\n    def _get_card(\n        self,\n        repo_id: str,\n        token: Optional[str] = None,\n        include_script: bool = False,\n        filename_py: Optional[str] = None,\n    ) -> DistilabelDatasetCard:\n        \"\"\"Generates the dataset card 
for the `Distiset`.\n\n        Note:\n            If `repo_id` and `token` are provided, it will extract the metadata from the README.md file\n            on the hub.\n\n        Args:\n            repo_id: Name of the repository to push to, or the path for the distiset if saved to disk.\n            token: The token to authenticate with the Hugging Face Hub.\n                We assume that if it's provided, the dataset will be in the Hugging Face Hub,\n                so the README metadata will be extracted from there.\n            include_script: Whether to upload the script to the hugging face repository.\n            filename_py: The name of the script. If `include_script` is True, the script will\n                be uploaded to the repository using this name, otherwise it won't be used.\n\n        Returns:\n            The dataset card for the `Distiset`.\n        \"\"\"\n        sample_records = {}\n        for name, dataset in self.items():\n            sample_records[name] = (\n                dataset[0] if not isinstance(dataset, dict) else dataset[\"train\"][0]\n            )\n\n        readme_metadata = {}\n        if repo_id and token:\n            readme_metadata = self._extract_readme_metadata(repo_id, token)\n\n        metadata = {\n            **readme_metadata,\n            \"size_categories\": size_categories_parser(\n                max(len(dataset) for dataset in self.values())\n            ),\n            \"tags\": [\"synthetic\", \"distilabel\", \"rlaif\"],\n        }\n\n        card = DistilabelDatasetCard.from_template(\n            card_data=DatasetCardData(**metadata),\n            repo_id=repo_id,\n            sample_records=sample_records,\n            include_script=include_script,\n            filename_py=filename_py,\n            artifacts=self._get_artifacts_metadata(),\n            references=self.citations,\n        )\n\n        return card\n\n    def _get_artifacts_metadata(self) -> Dict[str, List[Dict[str, Any]]]:\n        
\"\"\"Gets a dictionary with the metadata of the artifacts generated by the pipeline steps.\n\n        Returns:\n            A dictionary in which the key is the name of the step and the value is a list\n            of dictionaries, each of them containing the name and metadata of the step artifact.\n        \"\"\"\n        if not self.artifacts_path:\n            return {}\n\n        def iterdir_ignore_hidden(path: Path) -> Generator[Path, None, None]:\n            return (f for f in Path(path).iterdir() if not f.name.startswith(\".\"))\n\n        artifacts_metadata = defaultdict(list)\n        for step_artifacts_dir in iterdir_ignore_hidden(self.artifacts_path):\n            step_name = step_artifacts_dir.stem\n            for artifact_dir in iterdir_ignore_hidden(step_artifacts_dir):\n                artifact_name = artifact_dir.stem\n                metadata_path = artifact_dir / \"metadata.json\"\n                metadata = json.loads(metadata_path.read_text())\n                artifacts_metadata[step_name].append(\n                    {\"name\": artifact_name, \"metadata\": metadata}\n                )\n\n        return dict(artifacts_metadata)\n\n    def _extract_readme_metadata(\n        self, repo_id: str, token: Optional[str]\n    ) -> Dict[str, Any]:\n        \"\"\"Extracts the metadata from the README.md file of the dataset repository.\n\n        We have to download the previous README.md file in the repo, extract the metadata from it,\n        and generate a dict again to be passed thorough the `DatasetCardData` object.\n\n        Args:\n            repo_id: The ID of the repository to push to, from the `push_to_hub` method.\n            token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.\n\n        Returns:\n            The metadata extracted from the README.md file of the dataset repository as a dict.\n        \"\"\"\n        readme_path = Path(\n            hf_hub_download(repo_id, \"README.md\", 
repo_type=\"dataset\", token=token)\n        )\n        # Remove the '---' from the metadata\n        metadata = re.findall(r\"---\\n(.*?)\\n---\", readme_path.read_text(), re.DOTALL)[0]\n        metadata = yaml.safe_load(metadata)\n        return metadata\n\n    def _generate_card(\n        self,\n        repo_id: str,\n        token: str,\n        include_script: bool = False,\n        filename_py: Optional[str] = None,\n    ) -> None:\n        \"\"\"Generates a dataset card and pushes it to the Hugging Face Hub, and\n        if the `pipeline.yaml` path is available in the `Distiset`, uploads that\n        to the same repository.\n\n        Args:\n            repo_id: The ID of the repository to push to, from the `push_to_hub` method.\n            token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.\n            include_script: Whether to upload the script to the hugging face repository.\n            filename_py: The name of the script. If `include_script` is True, the script will\n                be uploaded to the repository using this name, otherwise it won't be used.\n        \"\"\"\n        card = self._get_card(\n            repo_id=repo_id,\n            token=token,\n            include_script=include_script,\n            filename_py=filename_py,\n        )\n\n        card.push_to_hub(\n            repo_id,\n            repo_type=\"dataset\",\n            token=token,\n        )\n\n        if self.pipeline_path:\n            # If the pipeline.yaml is available, upload it to the Hugging Face Hub as well.\n            HfApi().upload_file(\n                path_or_fileobj=self.pipeline_path,\n                path_in_repo=PIPELINE_CONFIG_FILENAME,\n                repo_id=repo_id,\n                repo_type=\"dataset\",\n                token=token,\n            )\n\n        if self.log_filename_path:\n            # The same we had with \"pipeline.yaml\" but with the log file.\n            HfApi().upload_file(\n            
    path_or_fileobj=self.log_filename_path,\n                path_in_repo=PIPELINE_LOG_FILENAME,\n                repo_id=repo_id,\n                repo_type=\"dataset\",\n                token=token,\n            )\n\n    def train_test_split(\n        self,\n        train_size: float,\n        shuffle: bool = True,\n        seed: Optional[int] = None,\n    ) -> Self:\n        \"\"\"Return a `Distiset` whose values will be a `datasets.DatasetDict` with two random train and test subsets.\n        Splits are created from the dataset according to `train_size` and `shuffle`.\n\n        Args:\n            train_size:\n                Float between `0.0` and `1.0` representing the proportion of the dataset to include in the test split.\n                It will be applied to all the datasets in the `Distiset`.\n            shuffle: Whether or not to shuffle the data before splitting\n            seed:\n                A seed to initialize the default BitGenerator, passed to the underlying method.\n\n        Returns:\n            The `Distiset` with the train-test split applied to all the datasets.\n        \"\"\"\n        assert 0 < train_size < 1, \"train_size must be a float between 0 and 1\"\n        for name, dataset in self.items():\n            self[name] = dataset.train_test_split(\n                train_size=train_size,\n                shuffle=shuffle,\n                seed=seed,\n            )\n        return self\n\n    def save_to_disk(\n        self,\n        distiset_path: PathLike,\n        max_shard_size: Optional[Union[str, int]] = None,\n        num_shards: Optional[int] = None,\n        num_proc: Optional[int] = None,\n        storage_options: Optional[dict] = None,\n        save_card: bool = True,\n        save_pipeline_config: bool = True,\n        save_pipeline_log: bool = True,\n    ) -> None:\n        r\"\"\"\n        Saves a `Distiset` to a dataset directory, or in a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n     
   In case you want to save the `Distiset` in a remote filesystem, you can pass the `storage_options` parameter\n        as you would do with `datasets`'s `Dataset.save_to_disk` method: [see example](https://huggingface.co/docs/datasets/filesystems#saving-serialized-datasets)\n\n        Args:\n            distiset_path: Path where you want to save the `Distiset`. It can be a local path\n                (e.g. `dataset/train`) or remote URI (e.g. `s3://my-bucket/dataset/train`)\n            max_shard_size: The maximum size of the dataset shards to be uploaded to the hub.\n                If expressed as a string, needs to be digits followed by a unit (like `\"50MB\"`).\n                Defaults to `None`.\n            num_shards: Number of shards to write. By default the number of shards depends on\n                `max_shard_size` and `num_proc`. Defaults to `None`.\n            num_proc: Number of processes when downloading and generating the dataset locally.\n                Multiprocessing is disabled by default. Defaults to `None`.\n            storage_options: Key/value pairs to be passed on to the file-system backend, if any.\n                Defaults to `None`.\n            save_card: Whether to save the dataset card. 
Defaults to `True`.\n            save_pipeline_config: Whether to save the pipeline configuration file (aka the `pipeline.yaml` file).\n                Defaults to `True`.\n            save_pipeline_log: Whether to save the pipeline log file (aka the `pipeline.log` file).\n                Defaults to `True`.\n\n        Examples:\n            ```python\n            # Save your distiset in a local folder:\n            distiset.save_to_disk(distiset_path=\"my-distiset\")\n            # Save your distiset in a remote storage:\n            storage_options = {\n                \"key\": os.environ[\"S3_ACCESS_KEY\"],\n                \"secret\": os.environ[\"S3_SECRET_KEY\"],\n                \"client_kwargs\": {\n                    \"endpoint_url\": os.environ[\"S3_ENDPOINT_URL\"],\n                    \"region_name\": os.environ[\"S3_REGION\"],\n                },\n            }\n            distiset.save_to_disk(distiset_path=\"my-distiset\", storage_options=storage_options)\n            ```\n        \"\"\"\n        distiset_path = str(distiset_path)\n        for name, dataset in self.items():\n            dataset.save_to_disk(\n                f\"{distiset_path}/{name}\",\n                max_shard_size=max_shard_size,\n                num_shards=num_shards,\n                num_proc=num_proc,\n                storage_options=storage_options,\n            )\n\n        distiset_config_folder = posixpath.join(distiset_path, DISTISET_CONFIG_FOLDER)\n\n        fs: fsspec.AbstractFileSystem\n        fs, _, _ = fsspec.get_fs_token_paths(\n            distiset_config_folder, storage_options=storage_options\n        )\n        fs.makedirs(distiset_config_folder, exist_ok=True)\n\n        if self.artifacts_path:\n            distiset_artifacts_folder = posixpath.join(\n                distiset_path, DISTISET_ARTIFACTS_FOLDER\n            )\n            fs.copy(str(self.artifacts_path), distiset_artifacts_folder, recursive=True)\n\n        if save_card:\n            # 
NOTE:\u00a0Currently the card is not the same if we write to disk or push to the HF hub,\n            # as we aren't generating the README copying/updating the data from the dataset repo.\n            card = self._get_card(repo_id=Path(distiset_path).stem, token=None)\n            new_filename = posixpath.join(distiset_config_folder, \"README.md\")\n            if storage_options:\n                # Write the card the same way as DatasetCard.save does:\n                with fs.open(new_filename, \"w\", newline=\"\", encoding=\"utf-8\") as f:\n                    f.write(str(card))\n            else:\n                card.save(new_filename)\n\n        # Write our internal files to the distiset folder by copying them to the distiset folder.\n        if save_pipeline_config and self.pipeline_path:\n            new_filename = posixpath.join(\n                distiset_config_folder, PIPELINE_CONFIG_FILENAME\n            )\n            if self.pipeline_path.exists() and (not fs.isfile(new_filename)):\n                data = yaml.safe_load(self.pipeline_path.read_text())\n                with fs.open(new_filename, \"w\", encoding=\"utf-8\") as f:\n                    yaml.dump(data, f, default_flow_style=False)\n\n        if save_pipeline_log and self.log_filename_path:\n            new_filename = posixpath.join(distiset_config_folder, PIPELINE_LOG_FILENAME)\n            if self.log_filename_path.exists() and (not fs.isfile(new_filename)):\n                data = self.log_filename_path.read_text()\n                with fs.open(new_filename, \"w\", encoding=\"utf-8\") as f:\n                    f.write(data)\n\n    @classmethod\n    def load_from_disk(\n        cls,\n        distiset_path: PathLike,\n        keep_in_memory: Optional[bool] = None,\n        storage_options: Optional[Dict[str, Any]] = None,\n        download_dir: Optional[PathLike] = None,\n    ) -> Self:\n        \"\"\"Loads a dataset that was previously saved using `Distiset.save_to_disk` from a dataset\n   
     directory, or from a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n        Args:\n            distiset_path: Path (\"dataset/train\") or remote URI (\"s3://bucket/dataset/train\").\n            keep_in_memory: Whether to copy the dataset in-memory, see `datasets.Dataset.load_from_disk``\n                for more information. Defaults to `None`.\n            storage_options: Key/value pairs to be passed on to the file-system backend, if any.\n                Defaults to `None`.\n            download_dir: Optional directory to download the dataset to. Defaults to None,\n                in which case it will create a temporary directory.\n\n        Returns:\n            A `Distiset` loaded from disk, it should be a `Distiset` object created using `Distiset.save_to_disk`.\n        \"\"\"\n        original_distiset_path = str(distiset_path)\n\n        fs: fsspec.AbstractFileSystem\n        fs, _, [distiset_path] = fsspec.get_fs_token_paths(  # type: ignore\n            original_distiset_path, storage_options=storage_options\n        )\n        dest_distiset_path = distiset_path\n\n        assert fs.isdir(\n            original_distiset_path\n        ), \"`distiset_path` must be a `PathLike` object pointing to a folder or a URI of a remote filesystem.\"\n\n        has_config = False\n        has_artifacts = False\n        distiset = cls()\n\n        if is_remote_filesystem(fs):\n            src_dataset_path = distiset_path\n            if download_dir:\n                dest_distiset_path = download_dir\n            else:\n                dest_distiset_path = Dataset._build_local_temp_path(src_dataset_path)  # type: ignore\n            fs.download(src_dataset_path, dest_distiset_path.as_posix(), recursive=True)  # type: ignore\n\n        # Now we should have the distiset locally, so we can read those files\n        for folder in Path(dest_distiset_path).iterdir():\n            if folder.stem == DISTISET_CONFIG_FOLDER:\n                
has_config = True\n                continue\n            elif folder.stem == DISTISET_ARTIFACTS_FOLDER:\n                has_artifacts = True\n                continue\n            distiset[folder.stem] = load_from_disk(\n                str(folder),\n                keep_in_memory=keep_in_memory,\n            )\n\n        # From the config folder we just need to point to the files. Once downloaded we set the path to point to point to the files. Once downloaded we set the path\n        # to wherever they are.\n        if has_config:\n            distiset_config_folder = posixpath.join(\n                dest_distiset_path, DISTISET_CONFIG_FOLDER\n            )\n\n            pipeline_path = posixpath.join(\n                distiset_config_folder, PIPELINE_CONFIG_FILENAME\n            )\n            if Path(pipeline_path).exists():\n                distiset.pipeline_path = Path(pipeline_path)\n\n            log_filename_path = posixpath.join(\n                distiset_config_folder, PIPELINE_LOG_FILENAME\n            )\n            if Path(log_filename_path).exists():\n                distiset.log_filename_path = Path(log_filename_path)\n\n        if has_artifacts:\n            distiset.artifacts_path = Path(\n                posixpath.join(dest_distiset_path, DISTISET_ARTIFACTS_FOLDER)\n            )\n\n        return distiset\n\n    @property\n    def pipeline_path(self) -> Union[Path, None]:\n        \"\"\"Returns the path to the `pipeline.yaml` file that generated the `Pipeline`.\"\"\"\n        return self._pipeline_path\n\n    @pipeline_path.setter\n    def pipeline_path(self, path: PathLike) -> None:\n        self._pipeline_path = Path(path)\n\n    @property\n    def artifacts_path(self) -> Union[Path, None]:\n        \"\"\"Returns the path to the directory containing the artifacts generated by the steps\n        of the pipeline.\"\"\"\n        return self._artifacts_path\n\n    @artifacts_path.setter\n    def artifacts_path(self, path: PathLike) -> None:\n     
   self._artifacts_path = Path(path)\n\n    @property\n    def log_filename_path(self) -> Union[Path, None]:\n        \"\"\"Returns the path to the `pipeline.log` file that generated the `Pipeline`.\"\"\"\n        return self._log_filename_path\n\n    @log_filename_path.setter\n    def log_filename_path(self, path: PathLike) -> None:\n        self._log_filename_path = Path(path)\n\n    @property\n    def citations(self) -> Union[List[str], None]:\n        \"\"\"Bibtex references to be included in the README.\"\"\"\n        return self._citations\n\n    @citations.setter\n    def citations(self, citations_: List[str]) -> None:\n        self._citations = sorted(set(citations_))\n\n    def __repr__(self):\n        # Copy from `datasets.DatasetDict.__repr__`.\n        repr = \"\\n\".join([f\"{k}: {v}\" for k, v in self.items()])\n        repr = re.sub(r\"^\", \" \" * 4, repr, count=0, flags=re.M)\n        return f\"Distiset({{\\n{repr}\\n}})\"\n
"},{"location":"api/distiset/#distilabel.distiset.Distiset.pipeline_path","title":"pipeline_path: Union[Path, None] property writable","text":"

Returns the path to the pipeline.yaml file that generated the dataset.

"},{"location":"api/distiset/#distilabel.distiset.Distiset.artifacts_path","title":"artifacts_path: Union[Path, None] property writable","text":"

Returns the path to the directory containing the artifacts generated by the steps of the pipeline.

"},{"location":"api/distiset/#distilabel.distiset.Distiset.log_filename_path","title":"log_filename_path: Union[Path, None] property writable","text":"

Returns the path to the pipeline.log file that was written by the Pipeline.

"},{"location":"api/distiset/#distilabel.distiset.Distiset.citations","title":"citations: Union[List[str], None] property writable","text":"

Bibtex references to be included in the README.

"},{"location":"api/distiset/#distilabel.distiset.Distiset.push_to_hub","title":"push_to_hub(repo_id, private=False, token=None, generate_card=True, include_script=False, **kwargs)","text":"

Pushes the Distiset to the Hugging Face Hub, each dataset will be pushed as a different configuration corresponding to the leaf step that generated it.

Parameters:

Name Type Description Default repo_id str

The ID of the repository to push to in the following format: <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.

required private bool

Whether the dataset repository should be set to private or not. Only affects repository creation: a repository that already exists will not be affected by that parameter.

False token Optional[str]

An optional authentication token for the Hugging Face Hub. If no token is passed, will default to the token saved locally when logging in with huggingface-cli login. Will raise an error if no token is passed and the user is not logged-in.

None generate_card bool

Whether to generate a dataset card or not. Defaults to True.

True include_script bool

Whether you want to push the pipeline script to the hugging face hub to share it. If set to True, the name of the script that was run to create the distiset will be automatically determined, and that will be the name of the file uploaded to your repository. Take into account, this operation only makes sense for a distiset obtained from calling Pipeline.run() method. Defaults to False.

False **kwargs Any

Additional keyword arguments to pass to the push_to_hub method of the datasets.Dataset object.

{}

Raises:

Type Description ValueError

If no token is provided and couldn't be retrieved automatically.

Source code in src/distilabel/distiset.py
def push_to_hub(\n    self,\n    repo_id: str,\n    private: bool = False,\n    token: Optional[str] = None,\n    generate_card: bool = True,\n    include_script: bool = False,\n    **kwargs: Any,\n) -> None:\n    \"\"\"Pushes the `Distiset` to the Hugging Face Hub, each dataset will be pushed as a different configuration\n    corresponding to the leaf step that generated it.\n\n    Args:\n        repo_id:\n            The ID of the repository to push to in the following format: `<user>/<dataset_name>` or\n            `<org>/<dataset_name>`. Also accepts `<dataset_name>`, which will default to the namespace\n            of the logged-in user.\n        private:\n            Whether the dataset repository should be set to private or not. Only affects repository creation:\n            a repository that already exists will not be affected by that parameter.\n        token:\n            An optional authentication token for the Hugging Face Hub. If no token is passed, will default\n            to the token saved locally when logging in with `huggingface-cli login`. Will raise an error\n            if no token is passed and the user is not logged-in.\n        generate_card:\n            Whether to generate a dataset card or not. Defaults to True.\n        include_script:\n            Whether you want to push the pipeline script to the hugging face hub to share it.\n            If set to True, the name of the script that was run to create the distiset will be\n            automatically determined, and that will be the name of the file uploaded to your\n            repository. Take into account, this operation only makes sense for a distiset obtained\n            from calling `Pipeline.run()` method. 
Defaults to False.\n        **kwargs:\n            Additional keyword arguments to pass to the `push_to_hub` method of the `datasets.Dataset` object.\n\n    Raises:\n        ValueError: If no token is provided and couldn't be retrieved automatically.\n    \"\"\"\n    script_filename = sys.argv[0]\n    filename_py = (\n        script_filename.split(\"/\")[-1]\n        if \"/\" in script_filename\n        else script_filename\n    )\n    script_path = Path.cwd() / script_filename\n\n    if token is None:\n        token = get_hf_token(self.__class__.__name__, \"token\")\n\n    for name, dataset in self.items():\n        dataset.push_to_hub(\n            repo_id=repo_id,\n            config_name=name,\n            private=private,\n            token=token,\n            **kwargs,\n        )\n\n    if self.artifacts_path:\n        upload_folder(\n            repo_id=repo_id,\n            folder_path=self.artifacts_path,\n            path_in_repo=\"artifacts\",\n            token=token,\n            repo_type=\"dataset\",\n            commit_message=\"Include pipeline artifacts\",\n        )\n\n    if include_script and script_path.exists():\n        upload_file(\n            path_or_fileobj=script_path,\n            path_in_repo=filename_py,\n            repo_id=repo_id,\n            repo_type=\"dataset\",\n            token=token,\n            commit_message=\"Include pipeline script\",\n        )\n\n    if generate_card:\n        self._generate_card(\n            repo_id, token, include_script=include_script, filename_py=filename_py\n        )\n
"},{"location":"api/distiset/#distilabel.distiset.Distiset.train_test_split","title":"train_test_split(train_size, shuffle=True, seed=None)","text":"

Return a Distiset whose values will be a datasets.DatasetDict with two random train and test subsets. Splits are created from the dataset according to train_size and shuffle.

Parameters:

Name Type Description Default train_size float

Float between 0.0 and 1.0 (exclusive) representing the proportion of the dataset to include in the train split. It will be applied to all the datasets in the Distiset.

required shuffle bool

Whether or not to shuffle the data before splitting

True seed Optional[int]

A seed to initialize the default BitGenerator, passed to the underlying method.

None

Returns:

Type Description Self

The Distiset with the train-test split applied to all the datasets.

Source code in src/distilabel/distiset.py
def train_test_split(\n    self,\n    train_size: float,\n    shuffle: bool = True,\n    seed: Optional[int] = None,\n) -> Self:\n    \"\"\"Return a `Distiset` whose values will be a `datasets.DatasetDict` with two random train and test subsets.\n    Splits are created from the dataset according to `train_size` and `shuffle`.\n\n    Args:\n        train_size:\n            Float between `0.0` and `1.0` representing the proportion of the dataset to include in the test split.\n            It will be applied to all the datasets in the `Distiset`.\n        shuffle: Whether or not to shuffle the data before splitting\n        seed:\n            A seed to initialize the default BitGenerator, passed to the underlying method.\n\n    Returns:\n        The `Distiset` with the train-test split applied to all the datasets.\n    \"\"\"\n    assert 0 < train_size < 1, \"train_size must be a float between 0 and 1\"\n    for name, dataset in self.items():\n        self[name] = dataset.train_test_split(\n            train_size=train_size,\n            shuffle=shuffle,\n            seed=seed,\n        )\n    return self\n
"},{"location":"api/distiset/#distilabel.distiset.Distiset.save_to_disk","title":"save_to_disk(distiset_path, max_shard_size=None, num_shards=None, num_proc=None, storage_options=None, save_card=True, save_pipeline_config=True, save_pipeline_log=True)","text":"

Saves a Distiset to a dataset directory, or in a filesystem using any implementation of fsspec.spec.AbstractFileSystem.

In case you want to save the Distiset in a remote filesystem, you can pass the storage_options parameter as you would do with datasets's Dataset.save_to_disk method: see example

Parameters:

Name Type Description Default distiset_path PathLike

Path where you want to save the Distiset. It can be a local path (e.g. dataset/train) or remote URI (e.g. s3://my-bucket/dataset/train)

required max_shard_size Optional[Union[str, int]]

The maximum size of the dataset shards to be written to the filesystem. If expressed as a string, needs to be digits followed by a unit (like \"50MB\"). Defaults to None.

None num_shards Optional[int]

Number of shards to write. By default the number of shards depends on max_shard_size and num_proc. Defaults to None.

None num_proc Optional[int]

Number of processes when downloading and generating the dataset locally. Multiprocessing is disabled by default. Defaults to None.

None storage_options Optional[dict]

Key/value pairs to be passed on to the file-system backend, if any. Defaults to None.

None save_card bool

Whether to save the dataset card. Defaults to True.

True save_pipeline_config bool

Whether to save the pipeline configuration file (aka the pipeline.yaml file). Defaults to True.

True save_pipeline_log bool

Whether to save the pipeline log file (aka the pipeline.log file). Defaults to True.

True

Examples:

# Save your distiset in a local folder:\ndistiset.save_to_disk(distiset_path=\"my-distiset\")\n# Save your distiset in a remote storage:\nstorage_options = {\n    \"key\": os.environ[\"S3_ACCESS_KEY\"],\n    \"secret\": os.environ[\"S3_SECRET_KEY\"],\n    \"client_kwargs\": {\n        \"endpoint_url\": os.environ[\"S3_ENDPOINT_URL\"],\n        \"region_name\": os.environ[\"S3_REGION\"],\n    },\n}\ndistiset.save_to_disk(distiset_path=\"my-distiset\", storage_options=storage_options)\n
Source code in src/distilabel/distiset.py
def save_to_disk(\n    self,\n    distiset_path: PathLike,\n    max_shard_size: Optional[Union[str, int]] = None,\n    num_shards: Optional[int] = None,\n    num_proc: Optional[int] = None,\n    storage_options: Optional[dict] = None,\n    save_card: bool = True,\n    save_pipeline_config: bool = True,\n    save_pipeline_log: bool = True,\n) -> None:\n    r\"\"\"\n    Saves a `Distiset` to a dataset directory, or in a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n    In case you want to save the `Distiset` in a remote filesystem, you can pass the `storage_options` parameter\n    as you would do with `datasets`'s `Dataset.save_to_disk` method: [see example](https://huggingface.co/docs/datasets/filesystems#saving-serialized-datasets)\n\n    Args:\n        distiset_path: Path where you want to save the `Distiset`. It can be a local path\n            (e.g. `dataset/train`) or remote URI (e.g. `s3://my-bucket/dataset/train`)\n        max_shard_size: The maximum size of the dataset shards to be uploaded to the hub.\n            If expressed as a string, needs to be digits followed by a unit (like `\"50MB\"`).\n            Defaults to `None`.\n        num_shards: Number of shards to write. By default the number of shards depends on\n            `max_shard_size` and `num_proc`. Defaults to `None`.\n        num_proc: Number of processes when downloading and generating the dataset locally.\n            Multiprocessing is disabled by default. Defaults to `None`.\n        storage_options: Key/value pairs to be passed on to the file-system backend, if any.\n            Defaults to `None`.\n        save_card: Whether to save the dataset card. 
Defaults to `True`.\n        save_pipeline_config: Whether to save the pipeline configuration file (aka the `pipeline.yaml` file).\n            Defaults to `True`.\n        save_pipeline_log: Whether to save the pipeline log file (aka the `pipeline.log` file).\n            Defaults to `True`.\n\n    Examples:\n        ```python\n        # Save your distiset in a local folder:\n        distiset.save_to_disk(distiset_path=\"my-distiset\")\n        # Save your distiset in a remote storage:\n        storage_options = {\n            \"key\": os.environ[\"S3_ACCESS_KEY\"],\n            \"secret\": os.environ[\"S3_SECRET_KEY\"],\n            \"client_kwargs\": {\n                \"endpoint_url\": os.environ[\"S3_ENDPOINT_URL\"],\n                \"region_name\": os.environ[\"S3_REGION\"],\n            },\n        }\n        distiset.save_to_disk(distiset_path=\"my-distiset\", storage_options=storage_options)\n        ```\n    \"\"\"\n    distiset_path = str(distiset_path)\n    for name, dataset in self.items():\n        dataset.save_to_disk(\n            f\"{distiset_path}/{name}\",\n            max_shard_size=max_shard_size,\n            num_shards=num_shards,\n            num_proc=num_proc,\n            storage_options=storage_options,\n        )\n\n    distiset_config_folder = posixpath.join(distiset_path, DISTISET_CONFIG_FOLDER)\n\n    fs: fsspec.AbstractFileSystem\n    fs, _, _ = fsspec.get_fs_token_paths(\n        distiset_config_folder, storage_options=storage_options\n    )\n    fs.makedirs(distiset_config_folder, exist_ok=True)\n\n    if self.artifacts_path:\n        distiset_artifacts_folder = posixpath.join(\n            distiset_path, DISTISET_ARTIFACTS_FOLDER\n        )\n        fs.copy(str(self.artifacts_path), distiset_artifacts_folder, recursive=True)\n\n    if save_card:\n        # NOTE:\u00a0Currently the card is not the same if we write to disk or push to the HF hub,\n        # as we aren't generating the README copying/updating the data from the 
dataset repo.\n        card = self._get_card(repo_id=Path(distiset_path).stem, token=None)\n        new_filename = posixpath.join(distiset_config_folder, \"README.md\")\n        if storage_options:\n            # Write the card the same way as DatasetCard.save does:\n            with fs.open(new_filename, \"w\", newline=\"\", encoding=\"utf-8\") as f:\n                f.write(str(card))\n        else:\n            card.save(new_filename)\n\n    # Write our internal files to the distiset folder by copying them to the distiset folder.\n    if save_pipeline_config and self.pipeline_path:\n        new_filename = posixpath.join(\n            distiset_config_folder, PIPELINE_CONFIG_FILENAME\n        )\n        if self.pipeline_path.exists() and (not fs.isfile(new_filename)):\n            data = yaml.safe_load(self.pipeline_path.read_text())\n            with fs.open(new_filename, \"w\", encoding=\"utf-8\") as f:\n                yaml.dump(data, f, default_flow_style=False)\n\n    if save_pipeline_log and self.log_filename_path:\n        new_filename = posixpath.join(distiset_config_folder, PIPELINE_LOG_FILENAME)\n        if self.log_filename_path.exists() and (not fs.isfile(new_filename)):\n            data = self.log_filename_path.read_text()\n            with fs.open(new_filename, \"w\", encoding=\"utf-8\") as f:\n                f.write(data)\n
"},{"location":"api/distiset/#distilabel.distiset.Distiset.load_from_disk","title":"load_from_disk(distiset_path, keep_in_memory=None, storage_options=None, download_dir=None) classmethod","text":"

Loads a dataset that was previously saved using Distiset.save_to_disk from a dataset directory, or from a filesystem using any implementation of fsspec.spec.AbstractFileSystem.

Parameters:

Name Type Description Default distiset_path PathLike

Path (\"dataset/train\") or remote URI (\"s3://bucket/dataset/train\").

required keep_in_memory Optional[bool]

Whether to copy the dataset in-memory, see datasets.Dataset.load_from_disk for more information. Defaults to None.

None storage_options Optional[Dict[str, Any]]

Key/value pairs to be passed on to the file-system backend, if any. Defaults to None.

None download_dir Optional[PathLike]

Optional directory to download the dataset to. Defaults to None, in which case it will create a temporary directory.

None

Returns:

Type Description Self

A Distiset loaded from disk, it should be a Distiset object created using Distiset.save_to_disk.

Source code in src/distilabel/distiset.py
@classmethod\ndef load_from_disk(\n    cls,\n    distiset_path: PathLike,\n    keep_in_memory: Optional[bool] = None,\n    storage_options: Optional[Dict[str, Any]] = None,\n    download_dir: Optional[PathLike] = None,\n) -> Self:\n    \"\"\"Loads a dataset that was previously saved using `Distiset.save_to_disk` from a dataset\n    directory, or from a filesystem using any implementation of `fsspec.spec.AbstractFileSystem`.\n\n    Args:\n        distiset_path: Path (\"dataset/train\") or remote URI (\"s3://bucket/dataset/train\").\n        keep_in_memory: Whether to copy the dataset in-memory, see `datasets.Dataset.load_from_disk``\n            for more information. Defaults to `None`.\n        storage_options: Key/value pairs to be passed on to the file-system backend, if any.\n            Defaults to `None`.\n        download_dir: Optional directory to download the dataset to. Defaults to None,\n            in which case it will create a temporary directory.\n\n    Returns:\n        A `Distiset` loaded from disk, it should be a `Distiset` object created using `Distiset.save_to_disk`.\n    \"\"\"\n    original_distiset_path = str(distiset_path)\n\n    fs: fsspec.AbstractFileSystem\n    fs, _, [distiset_path] = fsspec.get_fs_token_paths(  # type: ignore\n        original_distiset_path, storage_options=storage_options\n    )\n    dest_distiset_path = distiset_path\n\n    assert fs.isdir(\n        original_distiset_path\n    ), \"`distiset_path` must be a `PathLike` object pointing to a folder or a URI of a remote filesystem.\"\n\n    has_config = False\n    has_artifacts = False\n    distiset = cls()\n\n    if is_remote_filesystem(fs):\n        src_dataset_path = distiset_path\n        if download_dir:\n            dest_distiset_path = download_dir\n        else:\n            dest_distiset_path = Dataset._build_local_temp_path(src_dataset_path)  # type: ignore\n        fs.download(src_dataset_path, dest_distiset_path.as_posix(), recursive=True)  # type: ignore\n\n   
 # Now we should have the distiset locally, so we can read those files\n    for folder in Path(dest_distiset_path).iterdir():\n        if folder.stem == DISTISET_CONFIG_FOLDER:\n            has_config = True\n            continue\n        elif folder.stem == DISTISET_ARTIFACTS_FOLDER:\n            has_artifacts = True\n            continue\n        distiset[folder.stem] = load_from_disk(\n            str(folder),\n            keep_in_memory=keep_in_memory,\n        )\n\n    # From the config folder we just need to point to the files. Once downloaded we set the path\n    # to wherever they are.\n    if has_config:\n        distiset_config_folder = posixpath.join(\n            dest_distiset_path, DISTISET_CONFIG_FOLDER\n        )\n\n        pipeline_path = posixpath.join(\n            distiset_config_folder, PIPELINE_CONFIG_FILENAME\n        )\n        if Path(pipeline_path).exists():\n            distiset.pipeline_path = Path(pipeline_path)\n\n        log_filename_path = posixpath.join(\n            distiset_config_folder, PIPELINE_LOG_FILENAME\n        )\n        if Path(log_filename_path).exists():\n            distiset.log_filename_path = Path(log_filename_path)\n\n    if has_artifacts:\n        distiset.artifacts_path = Path(\n            posixpath.join(dest_distiset_path, DISTISET_ARTIFACTS_FOLDER)\n        )\n\n    return distiset\n
"},{"location":"api/distiset/#distilabel.distiset.create_distiset","title":"create_distiset(data_dir, pipeline_path=None, log_filename_path=None, enable_metadata=False, dag=None)","text":"

Creates a Distiset from the buffer folder.

This function is intended to be used as a helper to create a Distiset from the folder where the cached data was written by the _WriteBuffer.

Parameters:

Name Type Description Default data_dir Path

Folder where the data buffers were written by the _WriteBuffer. It should correspond to CacheLocation.data.

required pipeline_path Optional[Path]

Optional path to the pipeline.yaml file that generated the dataset. Internally this will be passed to the Distiset object on creation to allow uploading the pipeline.yaml file to the repo upon Distiset.push_to_hub.

None log_filename_path Optional[Path]

Optional path to the pipeline.log file that was generated during the pipeline run. Internally this will be passed to the Distiset object on creation to allow uploading the pipeline.log file to the repo upon Distiset.push_to_hub.

None enable_metadata bool

Whether to include the distilabel metadata column in the dataset or not. Defaults to False.

False dag Optional[DAG]

DAG contained in a Pipeline. If provided, it will be used to extract the references/citations from it.

None

Returns:

Type Description Distiset

The dataset created from the buffer folder, where the different leaf steps will correspond to different configurations of the dataset.

Examples:

from pathlib import Path\ndistiset = create_distiset(Path.home() / \".cache/distilabel/pipelines/path-to-pipe-hashname\")\n
Source code in src/distilabel/distiset.py
def create_distiset(  # noqa: C901\n    data_dir: Path,\n    pipeline_path: Optional[Path] = None,\n    log_filename_path: Optional[Path] = None,\n    enable_metadata: bool = False,\n    dag: Optional[\"DAG\"] = None,\n) -> Distiset:\n    \"\"\"Creates a `Distiset` from the buffer folder.\n\n    This function is intended to be used as a helper to create a `Distiset` from from the folder\n    where the cached data was written by the `_WriteBuffer`.\n\n    Args:\n        data_dir: Folder where the data buffers were written by the `_WriteBuffer`.\n            It should correspond to `CacheLocation.data`.\n        pipeline_path: Optional path to the pipeline.yaml file that generated the dataset.\n            Internally this will be passed to the `Distiset` object on creation to allow\n            uploading the `pipeline.yaml` file to the repo upon `Distiset.push_to_hub`.\n        log_filename_path: Optional path to the pipeline.log file that was generated during the pipeline run.\n            Internally this will be passed to the `Distiset` object on creation to allow\n            uploading the `pipeline.log` file to the repo upon `Distiset.push_to_hub`.\n        enable_metadata: Whether to include the distilabel metadata column in the dataset or not.\n            Defaults to `False`.\n        dag: DAG contained in a `Pipeline`. 
If informed, will be used to extract the references/\n            citations from it.\n\n    Returns:\n        The dataset created from the buffer folder, where the different leaf steps will\n        correspond to different configurations of the dataset.\n\n    Examples:\n        ```python\n        from pathlib import Path\n        distiset = create_distiset(Path.home() / \".cache/distilabel/pipelines/path-to-pipe-hashname\")\n        ```\n    \"\"\"\n    from distilabel.constants import DISTILABEL_METADATA_KEY\n\n    logger = logging.getLogger(\"distilabel.distiset\")\n\n    steps_outputs_dir = data_dir / STEPS_OUTPUTS_PATH\n\n    distiset = Distiset()\n    for file in steps_outputs_dir.iterdir():\n        if file.is_file():\n            continue\n\n        files = [str(file) for file in list_files_in_dir(file)]\n        if files:\n            try:\n                ds = load_dataset(\n                    \"parquet\", name=file.stem, data_files={\"train\": files}\n                )\n                if not enable_metadata and DISTILABEL_METADATA_KEY in ds.column_names:\n                    ds = ds.remove_columns(DISTILABEL_METADATA_KEY)\n                distiset[file.stem] = ds\n            except ArrowInvalid:\n                logger.warning(f\"\u274c Failed to load the subset from '{file}' directory.\")\n                continue\n        else:\n            logger.warning(\n                f\"No output files for step '{file.stem}', can't create a dataset.\"\n                \" Did the step produce any data?\"\n            )\n\n    # If there's only one dataset i.e. 
one config, then set the config name to `default`\n    if len(distiset.keys()) == 1:\n        distiset[\"default\"] = distiset.pop(list(distiset.keys())[0])\n\n    # If there's any artifact set the `artifacts_path` so they can be uploaded\n    steps_artifacts_dir = data_dir / STEPS_ARTIFACTS_PATH\n    if any(steps_artifacts_dir.rglob(\"*\")):\n        distiset.artifacts_path = steps_artifacts_dir\n\n    # Include `pipeline.yaml` if exists\n    if pipeline_path:\n        distiset.pipeline_path = pipeline_path\n    else:\n        # If the pipeline path is not provided, try to find it in the parent directory\n        # and assume that's the wanted file.\n        pipeline_path = steps_outputs_dir.parent / \"pipeline.yaml\"\n        if pipeline_path.exists():\n            distiset.pipeline_path = pipeline_path\n\n    # Include `pipeline.log` if exists\n    if log_filename_path:\n        distiset.log_filename_path = log_filename_path\n    else:\n        log_filename_path = steps_outputs_dir.parent / \"pipeline.log\"\n        if log_filename_path.exists():\n            distiset.log_filename_path = log_filename_path\n\n    if dag:\n        distiset._citations = _grab_citations(dag)\n\n    return distiset\n
"},{"location":"api/errors/","title":"Errors","text":"

This section contains the distilabel custom errors. Unlike exceptions, errors in distilabel are used to handle unexpected situations that can't be anticipated and that can't be handled in a controlled way.

"},{"location":"api/errors/#distilabel.errors.DistilabelError","title":"DistilabelError","text":"

A mixin class for common functionality shared by all Distilabel-specific errors.

Attributes:

Name Type Description message

A message describing the error.

page

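An optional path to a page of the distilabel documentation. When provided, it is appended to the documentation URL and included in the error message as a link to further information.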

Examples:

raise DistilabelUserError(\"This is an error message.\")\nThis is an error message.\n\nraise DistilabelUserError(\"This is an error message.\", page=\"sections/getting_started/faq/\")\nThis is an error message.\nFor further information visit 'https://distilabel.argilla.io/latest/sections/getting_started/faq/'\n
Source code in src/distilabel/errors.py
class DistilabelError:\n    \"\"\"A mixin class for common functionality shared by all Distilabel-specific errors.\n\n    Attributes:\n        message: A message describing the error.\n        page: An optional error code from PydanticErrorCodes enum.\n\n    Examples:\n        ```python\n        raise DistilabelUserError(\"This is an error message.\")\n        This is an error message.\n\n        raise DistilabelUserError(\"This is an error message.\", page=\"sections/getting_started/faq/\")\n        This is an error message.\n        For further information visit 'https://distilabel.argilla.io/latest/sections/getting_started/faq/'\n        ```\n    \"\"\"\n\n    def __init__(self, message: str, *, page: Optional[str] = None) -> None:\n        self.message = message\n        self.page = page\n\n    def __str__(self) -> str:\n        if self.page is None:\n            return self.message\n        else:\n            return f\"{self.message}\\n\\nFor further information visit '{DISTILABEL_DOCS_URL}{self.page}'\"\n
"},{"location":"api/errors/#distilabel.errors.DistilabelUserError","title":"DistilabelUserError","text":"

Bases: DistilabelError, ValueError

ValueError that we can redirect to a given page in the documentation.

Source code in src/distilabel/errors.py
class DistilabelUserError(DistilabelError, ValueError):\n    \"\"\"ValueError that we can redirect to a given page in the documentation.\"\"\"\n\n    pass\n
"},{"location":"api/errors/#distilabel.errors.DistilabelTypeError","title":"DistilabelTypeError","text":"

Bases: DistilabelError, TypeError

TypeError that we can redirect to a given page in the documentation.

Source code in src/distilabel/errors.py
class DistilabelTypeError(DistilabelError, TypeError):\n    \"\"\"TypeError that we can redirect to a given page in the documentation.\"\"\"\n\n    pass\n
"},{"location":"api/errors/#distilabel.errors.DistilabelNotImplementedError","title":"DistilabelNotImplementedError","text":"

Bases: DistilabelError, NotImplementedError

NotImplementedError that we can redirect to a given page in the documentation.

Source code in src/distilabel/errors.py
class DistilabelNotImplementedError(DistilabelError, NotImplementedError):\n    \"\"\"NotImplementedError that we can redirect to a given page in the documentation.\"\"\"\n\n    pass\n
"},{"location":"api/exceptions/","title":"Exceptions","text":"

This section contains the distilabel custom exceptions. Unlike errors, exceptions in distilabel are used to handle specific situations that can be anticipated and that can be handled in a controlled way internally by the library.

"},{"location":"api/exceptions/#distilabel.exceptions.DistilabelException","title":"DistilabelException","text":"

Bases: Exception

Base exception (can be gracefully handled) for distilabel framework.

Source code in src/distilabel/exceptions.py
class DistilabelException(Exception):\n    \"\"\"Base exception (can be gracefully handled) for `distilabel` framework.\"\"\"\n\n    pass\n
"},{"location":"api/exceptions/#distilabel.exceptions.DistilabelGenerationException","title":"DistilabelGenerationException","text":"

Bases: DistilabelException

Base exception for LLM generation errors.

Source code in src/distilabel/exceptions.py
class DistilabelGenerationException(DistilabelException):\n    \"\"\"Base exception for `LLM` generation errors.\"\"\"\n\n    pass\n
"},{"location":"api/exceptions/#distilabel.exceptions.DistilabelOfflineBatchGenerationNotFinishedException","title":"DistilabelOfflineBatchGenerationNotFinishedException","text":"

Bases: DistilabelGenerationException

Exception raised when a batch generation is not finished.

Source code in src/distilabel/exceptions.py
class DistilabelOfflineBatchGenerationNotFinishedException(\n    DistilabelGenerationException\n):\n    \"\"\"Exception raised when a batch generation is not finished.\"\"\"\n\n    jobs_ids: Tuple[str, ...]\n\n    def __init__(self, jobs_ids: Tuple[str, ...]) -> None:\n        self.jobs_ids = jobs_ids\n        super().__init__(f\"Batch generation with jobs_ids={jobs_ids} is not finished\")\n
"},{"location":"api/mixins/requirements/","title":"RequirementsMixin","text":""},{"location":"api/mixins/requirements/#distilabel.mixins.requirements.RequirementsMixin","title":"RequirementsMixin","text":"

Mixin for classes that have requirements attribute.

Used to add requirements to a Step and a Pipeline.

Source code in src/distilabel/mixins/requirements.py
class RequirementsMixin:\n    \"\"\"Mixin for classes that have `requirements` attribute.\n\n    Used to add requirements to a `Step` and a `Pipeline`.\n    \"\"\"\n\n    _requirements: Union[List[Requirement], None] = []\n\n    def _gather_requirements(self) -> List[str]:\n        \"\"\"This method will be overwritten in the `BasePipeline` class to gather the requirements\n        from each step.\n        \"\"\"\n        return []\n\n    @property\n    def requirements(self) -> List[str]:\n        \"\"\"Return a list of requirements that must be installed to run the `Pipeline`.\n\n        The requirements in a Pipeline will include the requirements from all the steps (if any).\n\n        Returns:\n            List of requirements that must be installed to run the `Pipeline`, sorted alphabetically.\n        \"\"\"\n        self.requirements = self._gather_requirements()\n\n        return [str(r) for r in self._requirements]\n\n    @requirements.setter\n    def requirements(self, _requirements: List[str]) -> None:\n        requirements = []\n        if not isinstance(_requirements, list):\n            _requirements = [_requirements]\n\n        for r in _requirements:\n            try:\n                requirements.append(Requirement(r))\n            except InvalidRequirement:\n                self._logger.warning(f\"Invalid requirement: `{r}`\")\n\n        self._requirements = sorted(\n            set(self._requirements).union(set(requirements)), key=lambda x: str(x)\n        )\n\n    def requirements_to_install(self) -> List[str]:\n        \"\"\"Check if the requirements are installed in the current environment, and returns the ones that aren't.\n\n        Returns:\n            List of requirements required to run the pipeline that are not installed in the current environment.\n        \"\"\"\n\n        to_install = []\n        for req in self.requirements:\n            requirement = Requirement(req)\n            if importlib.util.find_spec(requirement.name):\n     
           if (str(requirement.specifier) != \"\") and (\n                    version(requirement.name) != str(requirement.specifier)\n                ):\n                    to_install.append(req)\n            else:\n                to_install.append(req)\n        return to_install\n
"},{"location":"api/mixins/requirements/#distilabel.mixins.requirements.RequirementsMixin.requirements","title":"requirements: List[str] property writable","text":"

Return a list of requirements that must be installed to run the Pipeline.

The requirements in a Pipeline will include the requirements from all the steps (if any).

Returns:

Type Description List[str]

List of requirements that must be installed to run the Pipeline, sorted alphabetically.

"},{"location":"api/mixins/requirements/#distilabel.mixins.requirements.RequirementsMixin.requirements_to_install","title":"requirements_to_install()","text":"

Check if the requirements are installed in the current environment, and return the ones that aren't.

Returns:

Type Description List[str]

List of requirements required to run the pipeline that are not installed in the current environment.

Source code in src/distilabel/mixins/requirements.py
def requirements_to_install(self) -> List[str]:\n    \"\"\"Check if the requirements are installed in the current environment, and returns the ones that aren't.\n\n    Returns:\n        List of requirements required to run the pipeline that are not installed in the current environment.\n    \"\"\"\n\n    to_install = []\n    for req in self.requirements:\n        requirement = Requirement(req)\n        if importlib.util.find_spec(requirement.name):\n            if (str(requirement.specifier) != \"\") and (\n                version(requirement.name) != str(requirement.specifier)\n            ):\n                to_install.append(req)\n        else:\n            to_install.append(req)\n    return to_install\n
"},{"location":"api/mixins/runtime_parameters/","title":"RuntimeParametersMixin","text":""},{"location":"api/mixins/runtime_parameters/#distilabel.mixins.runtime_parameters.RuntimeParametersMixin","title":"RuntimeParametersMixin","text":"

Bases: BaseModel

Mixin for classes that have RuntimeParameters attributes.

Attributes:

Name Type Description _runtime_parameters Dict[str, Any]

A dictionary containing the values of the runtime parameters of the class. This attribute is meant to be used internally and should not be accessed directly.

Source code in src/distilabel/mixins/runtime_parameters.py
class RuntimeParametersMixin(BaseModel):\n    \"\"\"Mixin for classes that have `RuntimeParameter`s attributes.\n\n    Attributes:\n        _runtime_parameters: A dictionary containing the values of the runtime parameters\n            of the class. This attribute is meant to be used internally and should not be\n            accessed directly.\n    \"\"\"\n\n    _runtime_parameters: Dict[str, Any] = PrivateAttr(default_factory=dict)\n\n    @property\n    def runtime_parameters_names(self) -> \"RuntimeParametersNames\":\n        \"\"\"Returns a dictionary containing the name of the runtime parameters of the class\n        as keys and whether the parameter is required or not as values.\n\n        Returns:\n            A dictionary containing the name of the runtime parameters of the class as keys\n            and whether the parameter is required or not as values.\n        \"\"\"\n\n        runtime_parameters = {}\n\n        for name, field_info in self.model_fields.items():  # type: ignore\n            # `field: RuntimeParameter[Any]` or `field: Optional[RuntimeParameter[Any]]`\n            is_runtime_param, is_optional = _is_runtime_parameter(field_info)\n            if is_runtime_param:\n                runtime_parameters[name] = is_optional\n                continue\n\n            attr = getattr(self, name)\n\n            # `field: RuntimeParametersMixin`\n            if isinstance(attr, RuntimeParametersMixin):\n                runtime_parameters[name] = attr.runtime_parameters_names\n\n            # `field: List[RuntimeParametersMixin]`\n            if (\n                isinstance(attr, list)\n                and attr\n                and isinstance(attr[0], RuntimeParametersMixin)\n            ):\n                runtime_parameters[name] = {\n                    str(i): item.runtime_parameters_names for i, item in enumerate(attr)\n                }\n\n        return runtime_parameters\n\n    def get_runtime_parameters_info(self) -> 
List[\"RuntimeParameterInfo\"]:\n        \"\"\"Gets the information of the runtime parameters of the class such as the name and\n        the description. This function is meant to include the information of the runtime\n        parameters in the serialized data of the class.\n\n        Returns:\n            A list containing the information for each runtime parameter of the class.\n        \"\"\"\n        runtime_parameters_info = []\n        for name, field_info in self.model_fields.items():  # type: ignore\n            if name not in self.runtime_parameters_names:\n                continue\n\n            attr = getattr(self, name)\n\n            # Get runtime parameters info for `RuntimeParametersMixin` field\n            if isinstance(attr, RuntimeParametersMixin):\n                runtime_parameters_info.append(\n                    {\n                        \"name\": name,\n                        \"runtime_parameters_info\": attr.get_runtime_parameters_info(),\n                    }\n                )\n                continue\n\n            # Get runtime parameters info for `List[RuntimeParametersMixin]` field\n            if isinstance(attr, list) and isinstance(attr[0], RuntimeParametersMixin):\n                runtime_parameters_info.append(\n                    {\n                        \"name\": name,\n                        \"runtime_parameters_info\": {\n                            str(i): item.get_runtime_parameters_info()\n                            for i, item in enumerate(attr)\n                        },\n                    }\n                )\n                continue\n\n            info = {\"name\": name, \"optional\": self.runtime_parameters_names[name]}\n            if field_info.description is not None:\n                info[\"description\"] = field_info.description\n            runtime_parameters_info.append(info)\n        return runtime_parameters_info\n\n    def set_runtime_parameters(self, runtime_parameters: Dict[str, Any]) -> 
None:\n        \"\"\"Sets the runtime parameters of the class using the provided values. If the attr\n        to be set is a `RuntimeParametersMixin`, it will call `set_runtime_parameters` on\n        the attr.\n\n        Args:\n            runtime_parameters: A dictionary containing the values of the runtime parameters\n                to set.\n        \"\"\"\n        runtime_parameters_names = list(self.runtime_parameters_names.keys())\n        for name, value in runtime_parameters.items():\n            if name not in self.runtime_parameters_names:\n                # Check done just to ensure the unit tests for the mixin run\n                if getattr(self, \"pipeline\", None):\n                    closest = difflib.get_close_matches(\n                        name, runtime_parameters_names, cutoff=0.5\n                    )\n                    msg = (\n                        f\"\u26a0\ufe0f  Runtime parameter '{name}' unknown in step '{self.name}'.\"  # type: ignore\n                    )\n                    if closest:\n                        msg += f\" Did you mean any of: {closest}\"\n                    else:\n                        msg += f\" Available runtime parameters for the step: {runtime_parameters_names}.\"\n                    self.pipeline._logger.warning(msg)  # type: ignore\n                continue\n\n            attr = getattr(self, name)\n\n            # Set runtime parameters for `RuntimeParametersMixin` field\n            if isinstance(attr, RuntimeParametersMixin):\n                attr.set_runtime_parameters(value)\n                self._runtime_parameters[name] = value\n                continue\n\n            # Set runtime parameters for `List[RuntimeParametersMixin]` field\n            if isinstance(attr, list) and isinstance(attr[0], RuntimeParametersMixin):\n                for i, item in enumerate(attr):\n                    item_value = value.get(str(i), {})\n                    item.set_runtime_parameters(item_value)\n          
      self._runtime_parameters[name] = value\n                continue\n\n            # Handle settings values for `_SecretField`\n            field_info = self.model_fields[name]\n            inner_type = extract_annotation_inner_type(field_info.annotation)\n            if is_type_pydantic_secret_field(inner_type):\n                value = inner_type(value)\n\n            # Set the value of the runtime parameter\n            setattr(self, name, value)\n            self._runtime_parameters[name] = value\n
"},{"location":"api/mixins/runtime_parameters/#distilabel.mixins.runtime_parameters.RuntimeParametersMixin.runtime_parameters_names","title":"runtime_parameters_names: RuntimeParametersNames property","text":"

Returns a dictionary containing the names of the runtime parameters of the class as keys and whether each parameter is optional as values.

Returns:

Type Description RuntimeParametersNames

A dictionary containing the names of the runtime parameters of the class as keys and whether each parameter is optional as values.

"},{"location":"api/mixins/runtime_parameters/#distilabel.mixins.runtime_parameters.RuntimeParametersMixin.get_runtime_parameters_info","title":"get_runtime_parameters_info()","text":"

Gets the information of the runtime parameters of the class such as the name and the description. This function is meant to include the information of the runtime parameters in the serialized data of the class.

Returns:

Type Description List[RuntimeParameterInfo]

A list containing the information for each runtime parameter of the class.

Source code in src/distilabel/mixins/runtime_parameters.py
def get_runtime_parameters_info(self) -> List[\"RuntimeParameterInfo\"]:\n    \"\"\"Gets the information of the runtime parameters of the class such as the name and\n    the description. This function is meant to include the information of the runtime\n    parameters in the serialized data of the class.\n\n    Returns:\n        A list containing the information for each runtime parameter of the class.\n    \"\"\"\n    runtime_parameters_info = []\n    for name, field_info in self.model_fields.items():  # type: ignore\n        if name not in self.runtime_parameters_names:\n            continue\n\n        attr = getattr(self, name)\n\n        # Get runtime parameters info for `RuntimeParametersMixin` field\n        if isinstance(attr, RuntimeParametersMixin):\n            runtime_parameters_info.append(\n                {\n                    \"name\": name,\n                    \"runtime_parameters_info\": attr.get_runtime_parameters_info(),\n                }\n            )\n            continue\n\n        # Get runtime parameters info for `List[RuntimeParametersMixin]` field\n        if isinstance(attr, list) and isinstance(attr[0], RuntimeParametersMixin):\n            runtime_parameters_info.append(\n                {\n                    \"name\": name,\n                    \"runtime_parameters_info\": {\n                        str(i): item.get_runtime_parameters_info()\n                        for i, item in enumerate(attr)\n                    },\n                }\n            )\n            continue\n\n        info = {\"name\": name, \"optional\": self.runtime_parameters_names[name]}\n        if field_info.description is not None:\n            info[\"description\"] = field_info.description\n        runtime_parameters_info.append(info)\n    return runtime_parameters_info\n
"},{"location":"api/mixins/runtime_parameters/#distilabel.mixins.runtime_parameters.RuntimeParametersMixin.set_runtime_parameters","title":"set_runtime_parameters(runtime_parameters)","text":"

Sets the runtime parameters of the class using the provided values. If the attr to be set is a RuntimeParametersMixin, it will call set_runtime_parameters on the attr.

Parameters:

Name Type Description Default runtime_parameters Dict[str, Any]

A dictionary containing the values of the runtime parameters to set.

required Source code in src/distilabel/mixins/runtime_parameters.py
def set_runtime_parameters(self, runtime_parameters: Dict[str, Any]) -> None:\n    \"\"\"Sets the runtime parameters of the class using the provided values. If the attr\n    to be set is a `RuntimeParametersMixin`, it will call `set_runtime_parameters` on\n    the attr.\n\n    Args:\n        runtime_parameters: A dictionary containing the values of the runtime parameters\n            to set.\n    \"\"\"\n    runtime_parameters_names = list(self.runtime_parameters_names.keys())\n    for name, value in runtime_parameters.items():\n        if name not in self.runtime_parameters_names:\n            # Check done just to ensure the unit tests for the mixin run\n            if getattr(self, \"pipeline\", None):\n                closest = difflib.get_close_matches(\n                    name, runtime_parameters_names, cutoff=0.5\n                )\n                msg = (\n                    f\"\u26a0\ufe0f  Runtime parameter '{name}' unknown in step '{self.name}'.\"  # type: ignore\n                )\n                if closest:\n                    msg += f\" Did you mean any of: {closest}\"\n                else:\n                    msg += f\" Available runtime parameters for the step: {runtime_parameters_names}.\"\n                self.pipeline._logger.warning(msg)  # type: ignore\n            continue\n\n        attr = getattr(self, name)\n\n        # Set runtime parameters for `RuntimeParametersMixin` field\n        if isinstance(attr, RuntimeParametersMixin):\n            attr.set_runtime_parameters(value)\n            self._runtime_parameters[name] = value\n            continue\n\n        # Set runtime parameters for `List[RuntimeParametersMixin]` field\n        if isinstance(attr, list) and isinstance(attr[0], RuntimeParametersMixin):\n            for i, item in enumerate(attr):\n                item_value = value.get(str(i), {})\n                item.set_runtime_parameters(item_value)\n            self._runtime_parameters[name] = value\n            continue\n\n  
      # Handle settings values for `_SecretField`\n        field_info = self.model_fields[name]\n        inner_type = extract_annotation_inner_type(field_info.annotation)\n        if is_type_pydantic_secret_field(inner_type):\n            value = inner_type(value)\n\n        # Set the value of the runtime parameter\n        setattr(self, name, value)\n        self._runtime_parameters[name] = value\n
"},{"location":"api/models/embedding/","title":"Embedding","text":"

This section contains the API reference for the distilabel embeddings.

For more information on how Embeddings work, and to see some examples, refer to the distilabel documentation.

"},{"location":"api/models/embedding/#distilabel.models.embeddings.base","title":"base","text":""},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings","title":"Embeddings","text":"

Bases: RuntimeParametersMixin, BaseModel, _Serializable, ABC

Base class for Embeddings models.

To implement an Embeddings subclass, you need to subclass this class and implement: the load method, to load the Embeddings model (don't forget to call super().load(), so the _logger attribute is initialized); the model_name property, to return the model name used for the Embeddings; and the encode method, to generate the sentence embeddings.

Attributes:

Name Type Description _logger Logger

the logger to be used for the Embeddings model. It will be initialized when the load method is called.

Source code in src/distilabel/models/embeddings/base.py
class Embeddings(RuntimeParametersMixin, BaseModel, _Serializable, ABC):\n    \"\"\"Base class for `Embeddings` models.\n\n    To implement an `Embeddings` subclass, you need to subclass this class and implement:\n        - `load` method to load the `Embeddings` model. Don't forget to call `super().load()`,\n            so the `_logger` attribute is initialized.\n        - `model_name` property to return the model name used for the `Embeddings`.\n        - `encode` method to generate the sentence embeddings.\n\n    Attributes:\n        _logger: the logger to be used for the `Embeddings` model. It will be initialized\n            when the `load` method is called.\n    \"\"\"\n\n    model_config = ConfigDict(\n        arbitrary_types_allowed=True,\n        protected_namespaces=(),\n        validate_default=True,\n        validate_assignment=True,\n        extra=\"forbid\",\n    )\n    _logger: \"Logger\" = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Method to be called to initialize the `Embeddings`\"\"\"\n        self._logger = logging.getLogger(f\"distilabel.llm.{self.model_name}\")\n\n    def unload(self) -> None:\n        \"\"\"Method to be called to unload the `Embeddings` and release any resources.\"\"\"\n        pass\n\n    @property\n    @abstractmethod\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the `Embeddings`.\"\"\"\n        pass\n\n    @abstractmethod\n    def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n        \"\"\"Generates embeddings for the provided inputs.\n\n        Args:\n            inputs: a list of texts for which an embedding has to be generated.\n\n        Returns:\n            The generated embeddings.\n        \"\"\"\n        pass\n
"},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings.model_name","title":"model_name: str abstractmethod property","text":"

Returns the model name used for the Embeddings.

"},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings.load","title":"load()","text":"

Method to be called to initialize the Embeddings

Source code in src/distilabel/models/embeddings/base.py
def load(self) -> None:\n    \"\"\"Method to be called to initialize the `Embeddings`\"\"\"\n    self._logger = logging.getLogger(f\"distilabel.llm.{self.model_name}\")\n
"},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings.unload","title":"unload()","text":"

Method to be called to unload the Embeddings and release any resources.

Source code in src/distilabel/models/embeddings/base.py
def unload(self) -> None:\n    \"\"\"Method to be called to unload the `Embeddings` and release any resources.\"\"\"\n    pass\n
"},{"location":"api/models/embedding/#distilabel.models.embeddings.base.Embeddings.encode","title":"encode(inputs) abstractmethod","text":"

Generates embeddings for the provided inputs.

Parameters:

Name Type Description Default inputs List[str]

a list of texts for which an embedding has to be generated.

required

Returns:

Type Description List[List[Union[int, float]]]

The generated embeddings.

Source code in src/distilabel/models/embeddings/base.py
@abstractmethod\ndef encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n    \"\"\"Generates embeddings for the provided inputs.\n\n    Args:\n        inputs: a list of texts for which an embedding has to be generated.\n\n    Returns:\n        The generated embeddings.\n    \"\"\"\n    pass\n
"},{"location":"api/models/embedding/embedding_gallery/","title":"Embedding Gallery","text":"

This section contains the existing Embeddings subclasses implemented in distilabel.

"},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings","title":"embeddings","text":""},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.SentenceTransformerEmbeddings","title":"SentenceTransformerEmbeddings","text":"

Bases: Embeddings, CudaDevicePlacementMixin

sentence-transformers library implementation for embedding generation.

Attributes:

Name Type Description model str

the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

device Optional[RuntimeParameter[str]]

the name of the device used to load the model e.g. \"cuda\", \"mps\", etc. Defaults to None.

prompts Optional[Dict[str, str]]

a dictionary containing prompts to be used with the model. Defaults to None.

default_prompt_name Optional[str]

the default prompt (in prompts) that will be applied to the inputs. If not provided, then no prompt will be used. Defaults to None.

trust_remote_code bool

whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False.

revision Optional[str]

if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\".

token Optional[str]

the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment or huggingface_hub package local configuration will be used. Defaults to None.

truncate_dim Optional[int]

the dimension to truncate the sentence embeddings. Defaults to None.

model_kwargs Optional[Dict[str, Any]]

extra kwargs that will be passed to the Hugging Face transformers model class. Defaults to None.

tokenizer_kwargs Optional[Dict[str, Any]]

extra kwargs that will be passed to the Hugging Face transformers tokenizer class. Defaults to None.

config_kwargs Optional[Dict[str, Any]]

extra kwargs that will be passed to the Hugging Face transformers configuration class. Defaults to None.

precision Optional[Literal['float32', 'int8', 'uint8', 'binary', 'ubinary']]

the dtype of the resulting embeddings. Defaults to \"float32\".

normalize_embeddings RuntimeParameter[bool]

whether to normalize the embeddings so they have a length of 1. Defaults to True.

Examples:

Generating sentence embeddings:

from distilabel.models import SentenceTransformerEmbeddings\n\nembeddings = SentenceTransformerEmbeddings(model=\"mixedbread-ai/mxbai-embed-large-v1\")\n\nembeddings.load()\n\nresults = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n# [\n#   [-0.05447685346007347, -0.01623094454407692, ...],\n#   [4.4889533455716446e-05, 0.044016145169734955, ...],\n# ]\n
Source code in src/distilabel/models/embeddings/sentence_transformers.py
class SentenceTransformerEmbeddings(Embeddings, CudaDevicePlacementMixin):\n    \"\"\"`sentence-transformers` library implementation for embedding generation.\n\n    Attributes:\n        model: the model Hugging Face Hub repo id or a path to a directory containing the\n            model weights and configuration files.\n        device: the name of the device used to load the model e.g. \"cuda\", \"mps\", etc.\n            Defaults to `None`.\n        prompts: a dictionary containing prompts to be used with the model. Defaults to\n            `None`.\n        default_prompt_name: the default prompt (in `prompts`) that will be applied to the\n            inputs. If not provided, then no prompt will be used. Defaults to `None`.\n        trust_remote_code: whether to allow fetching and executing remote code fetched\n            from the repository in the Hub. Defaults to `False`.\n        revision: if `model` refers to a Hugging Face Hub repository, then the revision\n            (e.g. a branch name or a commit id) to use. Defaults to `\"main\"`.\n        token: the Hugging Face Hub token that will be used to authenticate to the Hugging\n            Face Hub. If not provided, the `HF_TOKEN` environment or `huggingface_hub` package\n            local configuration will be used. Defaults to `None`.\n        truncate_dim: the dimension to truncate the sentence embeddings. Defaults to `None`.\n        model_kwargs: extra kwargs that will be passed to the Hugging Face `transformers`\n            model class. Defaults to `None`.\n        tokenizer_kwargs: extra kwargs that will be passed to the Hugging Face `transformers`\n            tokenizer class. Defaults to `None`.\n        config_kwargs: extra kwargs that will be passed to the Hugging Face `transformers`\n            configuration class. Defaults to `None`.\n        precision: the dtype that will have the resulting embeddings. 
Defaults to `\"float32\"`.\n        normalize_embeddings: whether to normalize the embeddings so they have a length\n            of 1. Defaults to `None`.\n\n    Examples:\n        Generating sentence embeddings:\n\n        ```python\n        from distilabel.models import SentenceTransformerEmbeddings\n\n        embeddings = SentenceTransformerEmbeddings(model=\"mixedbread-ai/mxbai-embed-large-v1\")\n\n        embeddings.load()\n\n        results = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n        # [\n        #   [-0.05447685346007347, -0.01623094454407692, ...],\n        #   [4.4889533455716446e-05, 0.044016145169734955, ...],\n        # ]\n        ```\n    \"\"\"\n\n    model: str\n    device: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The device to be used to load the model. If `None`, then it\"\n        \" will check if a GPU can be used.\",\n    )\n    prompts: Optional[Dict[str, str]] = None\n    default_prompt_name: Optional[str] = None\n    trust_remote_code: bool = False\n    revision: Optional[str] = None\n    token: Optional[str] = None\n    truncate_dim: Optional[int] = None\n    model_kwargs: Optional[Dict[str, Any]] = None\n    tokenizer_kwargs: Optional[Dict[str, Any]] = None\n    config_kwargs: Optional[Dict[str, Any]] = None\n    precision: Optional[Literal[\"float32\", \"int8\", \"uint8\", \"binary\", \"ubinary\"]] = (\n        \"float32\"\n    )\n    normalize_embeddings: RuntimeParameter[bool] = Field(\n        default=True,\n        description=\"Whether to normalize the embeddings so the generated vectors\"\n        \" have a length of 1 or not.\",\n    )\n\n    _model: Union[\"SentenceTransformer\", None] = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Loads the Sentence Transformer model\"\"\"\n        super().load()\n\n        if self.device == \"cuda\":\n            CudaDevicePlacementMixin.load(self)\n\n        try:\n            from 
sentence_transformers import SentenceTransformer\n        except ImportError as e:\n            raise ImportError(\n                \"`sentence-transformers` package is not installed. Please install it using\"\n                \" `pip install sentence-transformers`.\"\n            ) from e\n\n        self._model = SentenceTransformer(\n            model_name_or_path=self.model,\n            device=self.device,\n            prompts=self.prompts,\n            default_prompt_name=self.default_prompt_name,\n            trust_remote_code=self.trust_remote_code,\n            revision=self.revision,\n            token=self.token,\n            truncate_dim=self.truncate_dim,\n            model_kwargs=self.model_kwargs,\n            tokenizer_kwargs=self.tokenizer_kwargs,\n            config_kwargs=self.config_kwargs,\n        )\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the name of the model.\"\"\"\n        return self.model\n\n    def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n        \"\"\"Generates embeddings for the provided inputs.\n\n        Args:\n            inputs: a list of texts for which an embedding has to be generated.\n\n        Returns:\n            The generated embeddings.\n        \"\"\"\n        return self._model.encode(  # type: ignore\n            sentences=inputs,\n            batch_size=len(inputs),\n            convert_to_numpy=True,\n            precision=self.precision,  # type: ignore\n            normalize_embeddings=self.normalize_embeddings,  # type: ignore\n        ).tolist()  # type: ignore\n\n    def unload(self) -> None:\n        del self._model\n        if self.device == \"cuda\":\n            CudaDevicePlacementMixin.unload(self)\n        super().unload()\n
"},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.SentenceTransformerEmbeddings.model_name","title":"model_name: str property","text":"

Returns the name of the model.

"},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.SentenceTransformerEmbeddings.load","title":"load()","text":"

Loads the Sentence Transformer model

Source code in src/distilabel/models/embeddings/sentence_transformers.py
def load(self) -> None:\n    \"\"\"Loads the Sentence Transformer model\"\"\"\n    super().load()\n\n    if self.device == \"cuda\":\n        CudaDevicePlacementMixin.load(self)\n\n    try:\n        from sentence_transformers import SentenceTransformer\n    except ImportError as e:\n        raise ImportError(\n            \"`sentence-transformers` package is not installed. Please install it using\"\n            \" `pip install sentence-transformers`.\"\n        ) from e\n\n    self._model = SentenceTransformer(\n        model_name_or_path=self.model,\n        device=self.device,\n        prompts=self.prompts,\n        default_prompt_name=self.default_prompt_name,\n        trust_remote_code=self.trust_remote_code,\n        revision=self.revision,\n        token=self.token,\n        truncate_dim=self.truncate_dim,\n        model_kwargs=self.model_kwargs,\n        tokenizer_kwargs=self.tokenizer_kwargs,\n        config_kwargs=self.config_kwargs,\n    )\n
"},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.SentenceTransformerEmbeddings.encode","title":"encode(inputs)","text":"

Generates embeddings for the provided inputs.

Parameters:

Name Type Description Default inputs List[str]

a list of texts for which an embedding has to be generated.

required

Returns:

Type Description List[List[Union[int, float]]]

The generated embeddings.

Source code in src/distilabel/models/embeddings/sentence_transformers.py
def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n    \"\"\"Generates embeddings for the provided inputs.\n\n    Args:\n        inputs: a list of texts for which an embedding has to be generated.\n\n    Returns:\n        The generated embeddings.\n    \"\"\"\n    return self._model.encode(  # type: ignore\n        sentences=inputs,\n        batch_size=len(inputs),\n        convert_to_numpy=True,\n        precision=self.precision,  # type: ignore\n        normalize_embeddings=self.normalize_embeddings,  # type: ignore\n    ).tolist()  # type: ignore\n
"},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings","title":"vLLMEmbeddings","text":"

Bases: Embeddings, CudaDevicePlacementMixin

vllm library implementation for embedding generation.

Attributes:

Name Type Description model str

the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

dtype str

the data type to use for the model. Defaults to auto.

trust_remote_code bool

whether to trust the remote code when loading the model. Defaults to False.

quantization Optional[str]

the quantization mode to use for the model. Defaults to None.

revision Optional[str]

the revision of the model to load. Defaults to None.

enforce_eager bool

whether to enforce eager execution. Defaults to True.

seed int

the seed to use for the random number generator. Defaults to 0.

extra_kwargs Optional[RuntimeParameter[Dict[str, Any]]]

additional dictionary of keyword arguments that will be passed to the LLM class of vllm library. Defaults to {}.

_model LLM

the vLLM model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method.

References
  • Offline inference embeddings

Examples:

Generating sentence embeddings:

from distilabel.models import vLLMEmbeddings\n\nembeddings = vLLMEmbeddings(model=\"intfloat/e5-mistral-7b-instruct\")\n\nembeddings.load()\n\nresults = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n# [\n#   [-0.05447685346007347, -0.01623094454407692, ...],\n#   [4.4889533455716446e-05, 0.044016145169734955, ...],\n# ]\n
Source code in src/distilabel/models/embeddings/vllm.py
class vLLMEmbeddings(Embeddings, CudaDevicePlacementMixin):\n    \"\"\"`vllm` library implementation for embedding generation.\n\n    Attributes:\n        model: the model Hugging Face Hub repo id or a path to a directory containing the\n            model weights and configuration files.\n        dtype: the data type to use for the model. Defaults to `auto`.\n        trust_remote_code: whether to trust the remote code when loading the model. Defaults\n            to `False`.\n        quantization: the quantization mode to use for the model. Defaults to `None`.\n        revision: the revision of the model to load. Defaults to `None`.\n        enforce_eager: whether to enforce eager execution. Defaults to `True`.\n        seed: the seed to use for the random number generator. Defaults to `0`.\n        extra_kwargs: additional dictionary of keyword arguments that will be passed to the\n            `LLM` class of `vllm` library. Defaults to `{}`.\n        _model: the `vLLM` model instance. This attribute is meant to be used internally\n            and should not be accessed directly. 
It will be set in the `load` method.\n\n    References:\n        - [Offline inference embeddings](https://docs.vllm.ai/en/latest/getting_started/examples/offline_inference_embedding.html)\n\n    Examples:\n        Generating sentence embeddings:\n\n        ```python\n        from distilabel.models import vLLMEmbeddings\n\n        embeddings = vLLMEmbeddings(model=\"intfloat/e5-mistral-7b-instruct\")\n\n        embeddings.load()\n\n        results = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n        # [\n        #   [-0.05447685346007347, -0.01623094454407692, ...],\n        #   [4.4889533455716446e-05, 0.044016145169734955, ...],\n        # ]\n        ```\n    \"\"\"\n\n    model: str\n    dtype: str = \"auto\"\n    trust_remote_code: bool = False\n    quantization: Optional[str] = None\n    revision: Optional[str] = None\n\n    enforce_eager: bool = True\n\n    seed: int = 0\n\n    extra_kwargs: Optional[RuntimeParameter[Dict[str, Any]]] = Field(\n        default_factory=dict,\n        description=\"Additional dictionary of keyword arguments that will be passed to the\"\n        \" `vLLM` class of `vllm` library. See all the supported arguments at: \"\n        \"https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py\",\n    )\n\n    _model: \"_vLLM\" = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Loads the `vLLM` model using either the path or the Hugging Face Hub repository id.\"\"\"\n        super().load()\n\n        CudaDevicePlacementMixin.load(self)\n\n        try:\n            from vllm import LLM as _vLLM\n\n        except ImportError as ie:\n            raise ImportError(\n                \"vLLM is not installed. 
Please install it using `pip install vllm`.\"\n            ) from ie\n\n        self._model = _vLLM(\n            self.model,\n            dtype=self.dtype,\n            trust_remote_code=self.trust_remote_code,\n            quantization=self.quantization,\n            revision=self.revision,\n            enforce_eager=self.enforce_eager,\n            seed=self.seed,\n            **self.extra_kwargs,  # type: ignore\n        )\n\n    def unload(self) -> None:\n        \"\"\"Unloads the `vLLM` model.\"\"\"\n        CudaDevicePlacementMixin.unload(self)\n        super().unload()\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the name of the model.\"\"\"\n        return self.model\n\n    def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n        \"\"\"Generates embeddings for the provided inputs.\n\n        Args:\n            inputs: a list of texts for which an embedding has to be generated.\n\n        Returns:\n            The generated embeddings.\n        \"\"\"\n        return [output.outputs.embedding for output in self._model.encode(inputs)]\n
"},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings.model_name","title":"model_name: str property","text":"

Returns the name of the model.

"},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings.load","title":"load()","text":"

Loads the vLLM model using either the path or the Hugging Face Hub repository id.

Source code in src/distilabel/models/embeddings/vllm.py
def load(self) -> None:\n    \"\"\"Loads the `vLLM` model using either the path or the Hugging Face Hub repository id.\"\"\"\n    super().load()\n\n    CudaDevicePlacementMixin.load(self)\n\n    try:\n        from vllm import LLM as _vLLM\n\n    except ImportError as ie:\n        raise ImportError(\n            \"vLLM is not installed. Please install it using `pip install vllm`.\"\n        ) from ie\n\n    self._model = _vLLM(\n        self.model,\n        dtype=self.dtype,\n        trust_remote_code=self.trust_remote_code,\n        quantization=self.quantization,\n        revision=self.revision,\n        enforce_eager=self.enforce_eager,\n        seed=self.seed,\n        **self.extra_kwargs,  # type: ignore\n    )\n
"},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings.unload","title":"unload()","text":"

Unloads the vLLM model.

Source code in src/distilabel/models/embeddings/vllm.py
def unload(self) -> None:\n    \"\"\"Unloads the `vLLM` model.\"\"\"\n    CudaDevicePlacementMixin.unload(self)\n    super().unload()\n
"},{"location":"api/models/embedding/embedding_gallery/#distilabel.models.embeddings.vLLMEmbeddings.encode","title":"encode(inputs)","text":"

Generates embeddings for the provided inputs.

Parameters:

Name Type Description Default inputs List[str]

a list of texts for which an embedding has to be generated.

required

Returns:

Type Description List[List[Union[int, float]]]

The generated embeddings.

Source code in src/distilabel/models/embeddings/vllm.py
def encode(self, inputs: List[str]) -> List[List[Union[int, float]]]:\n    \"\"\"Generates embeddings for the provided inputs.\n\n    Args:\n        inputs: a list of texts for which an embedding has to be generated.\n\n    Returns:\n        The generated embeddings.\n    \"\"\"\n    return [output.outputs.embedding for output in self._model.encode(inputs)]\n
"},{"location":"api/models/llm/","title":"LLM","text":"

This section contains the API reference for the distilabel LLMs, both for the LLM synchronous implementation, and for the AsyncLLM asynchronous one.

For more information and examples on how to use existing LLMs or create custom ones, please refer to Tutorial - LLM.

"},{"location":"api/models/llm/#distilabel.models.llms.base","title":"base","text":""},{"location":"api/models/llm/#distilabel.models.llms.base.LLM","title":"LLM","text":"

Bases: RuntimeParametersMixin, BaseModel, _Serializable, ABC

Base class for LLMs to be used in distilabel framework.

To implement an LLM subclass, you need to subclass this class and implement: the load method to load the LLM if needed (don't forget to call super().load(), so the _logger attribute is initialized); the model_name property to return the model name used for the LLM; and the generate method to generate num_generations per input in inputs.

Attributes:

Name Type Description generation_kwargs Optional[RuntimeParameter[Dict[str, Any]]]

the kwargs to be propagated to either generate or agenerate methods within each LLM.

use_offline_batch_generation Optional[RuntimeParameter[bool]]

whether to use the offline_batch_generate method to generate the responses.

offline_batch_generation_block_until_done Optional[RuntimeParameter[int]]

if provided, then polling will be done until the offline_batch_generate method is able to retrieve the results. The value indicates the time in seconds to wait between polling attempts.

jobs_ids Union[Tuple[str, ...], None]

the job ids generated by the offline_batch_generate method. This attribute is used to store the job ids generated by the offline_batch_generate method so later they can be used to retrieve the results. It is not meant to be set by the user.

_logger Logger

the logger to be used for the LLM. It will be initialized when the load method is called.

Source code in src/distilabel/models/llms/base.py
class LLM(RuntimeParametersMixin, BaseModel, _Serializable, ABC):\n    \"\"\"Base class for `LLM`s to be used in `distilabel` framework.\n\n    To implement an `LLM` subclass, you need to subclass this class and implement:\n        - `load` method to load the `LLM` if needed. Don't forget to call `super().load()`,\n            so the `_logger` attribute is initialized.\n        - `model_name` property to return the model name used for the LLM.\n        - `generate` method to generate `num_generations` per input in `inputs`.\n\n    Attributes:\n        generation_kwargs: the kwargs to be propagated to either `generate` or `agenerate`\n            methods within each `LLM`.\n        use_offline_batch_generation: whether to use the `offline_batch_generate` method to\n            generate the responses.\n        offline_batch_generation_block_until_done: if provided, then polling will be done until\n            the `ofline_batch_generate` method is able to retrieve the results. The value indicate\n            the time to wait between each polling.\n        jobs_ids: the job ids generated by the `offline_batch_generate` method. This attribute\n            is used to store the job ids generated by the `offline_batch_generate` method\n            so later they can be used to retrieve the results. It is not meant to be set by\n            the user.\n        _logger: the logger to be used for the `LLM`. 
It will be initialized when the `load`\n            method is called.\n    \"\"\"\n\n    model_config = ConfigDict(\n        arbitrary_types_allowed=True,\n        protected_namespaces=(),\n        validate_default=True,\n        validate_assignment=True,\n        extra=\"forbid\",\n    )\n\n    generation_kwargs: Optional[RuntimeParameter[Dict[str, Any]]] = Field(\n        default_factory=dict,\n        description=\"The kwargs to be propagated to either `generate` or `agenerate`\"\n        \" methods within each `LLM`.\",\n    )\n    use_offline_batch_generation: Optional[RuntimeParameter[bool]] = Field(\n        default=False,\n        description=\"Whether to use the `offline_batch_generate` method to generate\"\n        \" the responses.\",\n    )\n    offline_batch_generation_block_until_done: Optional[RuntimeParameter[int]] = Field(\n        default=None,\n        description=\"If provided, then polling will be done until the `ofline_batch_generate`\"\n        \" method is able to retrieve the results. The value indicate the time to wait between\"\n        \" each polling.\",\n    )\n\n    jobs_ids: Union[Tuple[str, ...], None] = Field(default=None)\n    _logger: \"Logger\" = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Method to be called to initialize the `LLM`, its logger and optionally the\n        structured output generator.\"\"\"\n        self._logger = logging.getLogger(f\"distilabel.llm.{self.model_name}\")\n\n    def unload(self) -> None:\n        \"\"\"Method to be called to unload the `LLM` and release any resources.\"\"\"\n        pass\n\n    @property\n    @abstractmethod\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        pass\n\n    def get_generation_kwargs(self) -> Dict[str, Any]:\n        \"\"\"Returns the generation kwargs to be used for the generation. 
This method can\n        be overridden to provide a more complex logic for the generation kwargs.\n\n        Returns:\n            The kwargs to be used for the generation.\n        \"\"\"\n        return self.generation_kwargs  # type: ignore\n\n    @abstractmethod\n    def generate(\n        self,\n        inputs: List[\"FormattedInput\"],\n        num_generations: int = 1,\n        **kwargs: Any,\n    ) -> List[\"GenerateOutput\"]:\n        \"\"\"Abstract method to be implemented by each LLM to generate `num_generations`\n        per input in `inputs`.\n\n        Args:\n            inputs: the list of inputs to generate responses for which follows OpenAI's\n                API format:\n\n                ```python\n                [\n                    {\"role\": \"system\", \"content\": \"You're a helpful assistant...\"},\n                    {\"role\": \"user\", \"content\": \"Give a template email for B2B communications...\"},\n                    {\"role\": \"assistant\", \"content\": \"Sure, here's a template you can use...\"},\n                    {\"role\": \"user\", \"content\": \"Modify the second paragraph...\"}\n                ]\n                ```\n            num_generations: the number of generations to generate per input.\n            **kwargs: the additional kwargs to be used for the generation.\n        \"\"\"\n        pass\n\n    def generate_outputs(\n        self,\n        inputs: List[\"FormattedInput\"],\n        num_generations: int = 1,\n        **kwargs: Any,\n    ) -> List[\"GenerateOutput\"]:\n        \"\"\"Generates outputs for the given inputs using either `generate` method or the\n        `offine_batch_generate` method if `use_offline_\n        \"\"\"\n        if self.use_offline_batch_generation:\n            if self.offline_batch_generation_block_until_done is not None:\n                return self._offline_batch_generate_polling(\n                    inputs=inputs,\n                    num_generations=num_generations,\n         
           **kwargs,\n                )\n\n            # This will raise `DistilabelOfflineBatchGenerationNotFinishedException` right away\n            # if the batch generation is not finished.\n            return self.offline_batch_generate(\n                inputs=inputs,\n                num_generations=num_generations,\n                **kwargs,\n            )\n\n        return self.generate(inputs=inputs, num_generations=num_generations, **kwargs)\n\n    def _offline_batch_generate_polling(\n        self,\n        inputs: List[\"FormattedInput\"],\n        num_generations: int = 1,\n        **kwargs: Any,\n    ) -> List[\"GenerateOutput\"]:\n        \"\"\"Method to poll the `offline_batch_generate` method until the batch generation\n        is finished.\n\n        Args:\n            inputs: the list of inputs to generate responses for.\n            num_generations: the number of generations to generate per input.\n            **kwargs: the additional kwargs to be used for the generation.\n\n        Returns:\n            A list containing the generations for each input.\n        \"\"\"\n        while True:\n            try:\n                return self.offline_batch_generate(\n                    inputs=inputs,\n                    num_generations=num_generations,\n                    **kwargs,\n                )\n            except DistilabelOfflineBatchGenerationNotFinishedException as e:\n                self._logger.info(\n                    f\"Waiting for the offline batch generation to finish: {e}. 
Sleeping\"\n                    f\" for {self.offline_batch_generation_block_until_done} seconds before\"\n                    \" trying to get the results again.\"\n                )\n                # When running a `Step` in a child process, SIGINT is overriden so the child\n                # process doesn't stop when the parent process receives a SIGINT signal.\n                # The new handler sets an environment variable that is checked here to stop\n                # the polling.\n                if os.getenv(SIGINT_HANDLER_CALLED_ENV_NAME) is not None:\n                    self._logger.info(\n                        \"Received a KeyboardInterrupt. Stopping polling for checking if the\"\n                        \" offline batch generation is finished...\"\n                    )\n                    raise e\n                time.sleep(self.offline_batch_generation_block_until_done)  # type: ignore\n            except KeyboardInterrupt as e:\n                # This is for the case the `LLM` is being executed outside a pipeline\n                self._logger.info(\n                    \"Received a KeyboardInterrupt. 
Stopping polling for checking if the\"\n                    \" offline batch generation is finished...\"\n                )\n                raise DistilabelOfflineBatchGenerationNotFinishedException(\n                    jobs_ids=self.jobs_ids  # type: ignore\n                ) from e\n\n    @property\n    def generate_parameters(self) -> List[\"inspect.Parameter\"]:\n        \"\"\"Returns the parameters of the `generate` method.\n\n        Returns:\n            A list containing the parameters of the `generate` method.\n        \"\"\"\n        return list(inspect.signature(self.generate).parameters.values())\n\n    @property\n    def runtime_parameters_names(self) -> \"RuntimeParametersNames\":\n        \"\"\"Returns the runtime parameters of the `LLM`, which are combination of the\n        attributes of the `LLM` type hinted with `RuntimeParameter` and the parameters\n        of the `generate` method that are not `input` and `num_generations`.\n\n        Returns:\n            A dictionary with the name of the runtime parameters as keys and a boolean\n            indicating if the parameter is optional or not.\n        \"\"\"\n        runtime_parameters = super().runtime_parameters_names\n        runtime_parameters[\"generation_kwargs\"] = {}\n\n        # runtime parameters from the `generate` method\n        for param in self.generate_parameters:\n            if param.name in [\"input\", \"inputs\", \"num_generations\"]:\n                continue\n            is_optional = param.default != inspect.Parameter.empty\n            runtime_parameters[\"generation_kwargs\"][param.name] = is_optional\n\n        return runtime_parameters\n\n    def get_runtime_parameters_info(self) -> List[\"RuntimeParameterInfo\"]:\n        \"\"\"Gets the information of the runtime parameters of the `LLM` such as the name\n        and the description. 
This function is meant to include the information of the runtime\n        parameters in the serialized data of the `LLM`.\n\n        Returns:\n            A list containing the information for each runtime parameter of the `LLM`.\n        \"\"\"\n        runtime_parameters_info = super().get_runtime_parameters_info()\n\n        generation_kwargs_info = next(\n            (\n                runtime_parameter_info\n                for runtime_parameter_info in runtime_parameters_info\n                if runtime_parameter_info[\"name\"] == \"generation_kwargs\"\n            ),\n            None,\n        )\n\n        # If `generation_kwargs` attribute is present, we need to include the `generate`\n        # method arguments as the information for this attribute.\n        if generation_kwargs_info:\n            generate_docstring_args = self.generate_parsed_docstring[\"args\"]\n\n            generation_kwargs_info[\"keys\"] = []\n            for key, value in generation_kwargs_info[\"optional\"].items():\n                info = {\"name\": key, \"optional\": value}\n                if description := generate_docstring_args.get(key):\n                    info[\"description\"] = description\n                generation_kwargs_info[\"keys\"].append(info)\n\n            generation_kwargs_info.pop(\"optional\")\n\n        return runtime_parameters_info\n\n    @cached_property\n    def generate_parsed_docstring(self) -> \"Docstring\":\n        \"\"\"Returns the parsed docstring of the `generate` method.\n\n        Returns:\n            The parsed docstring of the `generate` method.\n        \"\"\"\n        return parse_google_docstring(self.generate)\n\n    def get_last_hidden_states(\n        self, inputs: List[\"StandardInput\"]\n    ) -> List[\"HiddenState\"]:\n        \"\"\"Method to get the last hidden states of the model for a list of inputs.\n\n        Args:\n            inputs: the list of inputs to get the last hidden states from.\n\n        Returns:\n            A 
list containing the last hidden state for each sequence using a NumPy array\n                with shape [num_tokens, hidden_size].\n        \"\"\"\n        # TODO: update to use `DistilabelNotImplementedError`\n        raise NotImplementedError(\n            f\"Method `get_last_hidden_states` is not implemented for `{self.__class__.__name__}`\"\n        )\n\n    def _prepare_structured_output(\n        self, structured_output: Optional[\"StructuredOutputType\"] = None\n    ) -> Union[Any, None]:\n        \"\"\"Method in charge of preparing the structured output generator.\n\n        By default will raise a `NotImplementedError`, subclasses that allow it must override this\n        method with the implementation.\n\n        Args:\n            structured_output: the config to prepare the guided generation.\n\n        Returns:\n            The structure to be used for the guided generation.\n        \"\"\"\n        # TODO: update to use `DistilabelNotImplementedError`\n        raise NotImplementedError(\n            f\"Guided generation is not implemented for `{type(self).__name__}`\"\n        )\n\n    def offline_batch_generate(\n        self,\n        inputs: Union[List[\"FormattedInput\"], None] = None,\n        num_generations: int = 1,\n        **kwargs: Any,\n    ) -> List[\"GenerateOutput\"]:\n        \"\"\"Method to generate a list of outputs for the given inputs using an offline batch\n        generation method to be implemented by each `LLM`.\n\n        This method should create jobs the first time is called and store the job ids, so\n        the second and subsequent calls can retrieve the results of the batch generation.\n        If subsequent calls are made before the batch generation is finished, then the method\n        should raise a `DistilabelOfflineBatchGenerationNotFinishedException`. 
This exception\n        will be handled automatically by the `Pipeline` which will store all the required\n        information for recovering the pipeline execution when the batch generation is finished.\n\n        Args:\n            inputs: the list of inputs to generate responses for.\n            num_generations: the number of generations to generate per input.\n            **kwargs: the additional kwargs to be used for the generation.\n\n        Returns:\n            A list containing the generations for each input.\n        \"\"\"\n        raise DistilabelNotImplementedError(\n            f\"`offline_batch_generate` is not implemented for `{self.__class__.__name__}`\",\n            page=\"sections/how_to_guides/advanced/offline-batch-generation/\",\n        )\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.model_name","title":"model_name: str abstractmethod property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.generate_parameters","title":"generate_parameters: List[inspect.Parameter] property","text":"

Returns the parameters of the generate method.

Returns:

Type Description List[Parameter]

A list containing the parameters of the generate method.

"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.runtime_parameters_names","title":"runtime_parameters_names: RuntimeParametersNames property","text":"

Returns the runtime parameters of the LLM, which are a combination of the attributes of the LLM type hinted with RuntimeParameter and the parameters of the generate method other than input, inputs and num_generations.

Returns:

Type Description RuntimeParametersNames

A dictionary with the name of the runtime parameters as keys and a boolean indicating if the parameter is optional or not.

"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.generate_parsed_docstring","title":"generate_parsed_docstring: Docstring cached property","text":"

Returns the parsed docstring of the generate method.

Returns:

Type Description Docstring

The parsed docstring of the generate method.

"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.load","title":"load()","text":"

Method to be called to initialize the LLM, its logger and optionally the structured output generator.

Source code in src/distilabel/models/llms/base.py
def load(self) -> None:\n    \"\"\"Method to be called to initialize the `LLM`, its logger and optionally the\n    structured output generator.\"\"\"\n    self._logger = logging.getLogger(f\"distilabel.llm.{self.model_name}\")\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.unload","title":"unload()","text":"

Method to be called to unload the LLM and release any resources.

Source code in src/distilabel/models/llms/base.py
def unload(self) -> None:\n    \"\"\"Method to be called to unload the `LLM` and release any resources.\"\"\"\n    pass\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.get_generation_kwargs","title":"get_generation_kwargs()","text":"

Returns the generation kwargs to be used for the generation. This method can be overridden to provide a more complex logic for the generation kwargs.

Returns:

Type Description Dict[str, Any]

The kwargs to be used for the generation.

Source code in src/distilabel/models/llms/base.py
def get_generation_kwargs(self) -> Dict[str, Any]:\n    \"\"\"Returns the generation kwargs to be used for the generation. This method can\n    be overridden to provide a more complex logic for the generation kwargs.\n\n    Returns:\n        The kwargs to be used for the generation.\n    \"\"\"\n    return self.generation_kwargs  # type: ignore\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.generate","title":"generate(inputs, num_generations=1, **kwargs) abstractmethod","text":"

Abstract method to be implemented by each LLM to generate num_generations per input in inputs.

Parameters:

Name Type Description Default inputs List[FormattedInput]

the list of inputs to generate responses for which follows OpenAI's API format:

[\n    {\"role\": \"system\", \"content\": \"You're a helpful assistant...\"},\n    {\"role\": \"user\", \"content\": \"Give a template email for B2B communications...\"},\n    {\"role\": \"assistant\", \"content\": \"Sure, here's a template you can use...\"},\n    {\"role\": \"user\", \"content\": \"Modify the second paragraph...\"}\n]\n
required num_generations int

the number of generations to generate per input.

1 **kwargs Any

the additional kwargs to be used for the generation.

{} Source code in src/distilabel/models/llms/base.py
@abstractmethod\ndef generate(\n    self,\n    inputs: List[\"FormattedInput\"],\n    num_generations: int = 1,\n    **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n    \"\"\"Abstract method to be implemented by each LLM to generate `num_generations`\n    per input in `inputs`.\n\n    Args:\n        inputs: the list of inputs to generate responses for which follows OpenAI's\n            API format:\n\n            ```python\n            [\n                {\"role\": \"system\", \"content\": \"You're a helpful assistant...\"},\n                {\"role\": \"user\", \"content\": \"Give a template email for B2B communications...\"},\n                {\"role\": \"assistant\", \"content\": \"Sure, here's a template you can use...\"},\n                {\"role\": \"user\", \"content\": \"Modify the second paragraph...\"}\n            ]\n            ```\n        num_generations: the number of generations to generate per input.\n        **kwargs: the additional kwargs to be used for the generation.\n    \"\"\"\n    pass\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.generate_outputs","title":"generate_outputs(inputs, num_generations=1, **kwargs)","text":"

Generates outputs for the given inputs using either the generate method or the offline_batch_generate method if use_offline_batch_generation is enabled.

Source code in src/distilabel/models/llms/base.py
def generate_outputs(\n    self,\n    inputs: List[\"FormattedInput\"],\n    num_generations: int = 1,\n    **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n    \"\"\"Generates outputs for the given inputs using either `generate` method or the\n    `offine_batch_generate` method if `use_offline_\n    \"\"\"\n    if self.use_offline_batch_generation:\n        if self.offline_batch_generation_block_until_done is not None:\n            return self._offline_batch_generate_polling(\n                inputs=inputs,\n                num_generations=num_generations,\n                **kwargs,\n            )\n\n        # This will raise `DistilabelOfflineBatchGenerationNotFinishedException` right away\n        # if the batch generation is not finished.\n        return self.offline_batch_generate(\n            inputs=inputs,\n            num_generations=num_generations,\n            **kwargs,\n        )\n\n    return self.generate(inputs=inputs, num_generations=num_generations, **kwargs)\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.get_runtime_parameters_info","title":"get_runtime_parameters_info()","text":"

Gets the information of the runtime parameters of the LLM such as the name and the description. This function is meant to include the information of the runtime parameters in the serialized data of the LLM.

Returns:

Type Description List[RuntimeParameterInfo]

A list containing the information for each runtime parameter of the LLM.

Source code in src/distilabel/models/llms/base.py
def get_runtime_parameters_info(self) -> List[\"RuntimeParameterInfo\"]:\n    \"\"\"Gets the information of the runtime parameters of the `LLM` such as the name\n    and the description. This function is meant to include the information of the runtime\n    parameters in the serialized data of the `LLM`.\n\n    Returns:\n        A list containing the information for each runtime parameter of the `LLM`.\n    \"\"\"\n    runtime_parameters_info = super().get_runtime_parameters_info()\n\n    generation_kwargs_info = next(\n        (\n            runtime_parameter_info\n            for runtime_parameter_info in runtime_parameters_info\n            if runtime_parameter_info[\"name\"] == \"generation_kwargs\"\n        ),\n        None,\n    )\n\n    # If `generation_kwargs` attribute is present, we need to include the `generate`\n    # method arguments as the information for this attribute.\n    if generation_kwargs_info:\n        generate_docstring_args = self.generate_parsed_docstring[\"args\"]\n\n        generation_kwargs_info[\"keys\"] = []\n        for key, value in generation_kwargs_info[\"optional\"].items():\n            info = {\"name\": key, \"optional\": value}\n            if description := generate_docstring_args.get(key):\n                info[\"description\"] = description\n            generation_kwargs_info[\"keys\"].append(info)\n\n        generation_kwargs_info.pop(\"optional\")\n\n    return runtime_parameters_info\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.get_last_hidden_states","title":"get_last_hidden_states(inputs)","text":"

Method to get the last hidden states of the model for a list of inputs.

Parameters:

Name Type Description Default inputs List[StandardInput]

the list of inputs to get the last hidden states from.

required

Returns:

Type Description List[HiddenState]

A list containing the last hidden state for each sequence using a NumPy array with shape [num_tokens, hidden_size].

Source code in src/distilabel/models/llms/base.py
def get_last_hidden_states(\n    self, inputs: List[\"StandardInput\"]\n) -> List[\"HiddenState\"]:\n    \"\"\"Method to get the last hidden states of the model for a list of inputs.\n\n    Args:\n        inputs: the list of inputs to get the last hidden states from.\n\n    Returns:\n        A list containing the last hidden state for each sequence using a NumPy array\n            with shape [num_tokens, hidden_size].\n    \"\"\"\n    # TODO: update to use `DistilabelNotImplementedError`\n    raise NotImplementedError(\n        f\"Method `get_last_hidden_states` is not implemented for `{self.__class__.__name__}`\"\n    )\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.LLM.offline_batch_generate","title":"offline_batch_generate(inputs=None, num_generations=1, **kwargs)","text":"

Method to generate a list of outputs for the given inputs using an offline batch generation method to be implemented by each LLM.

This method should create jobs the first time it is called and store the job ids, so the second and subsequent calls can retrieve the results of the batch generation. If subsequent calls are made before the batch generation is finished, then the method should raise a DistilabelOfflineBatchGenerationNotFinishedException. This exception will be handled automatically by the Pipeline, which will store all the required information for recovering the pipeline execution when the batch generation is finished.

Parameters:

Name Type Description Default inputs Union[List[FormattedInput], None]

the list of inputs to generate responses for.

None num_generations int

the number of generations to generate per input.

1 **kwargs Any

the additional kwargs to be used for the generation.

{}

Returns:

Type Description List[GenerateOutput]

A list containing the generations for each input.

Source code in src/distilabel/models/llms/base.py
def offline_batch_generate(\n    self,\n    inputs: Union[List[\"FormattedInput\"], None] = None,\n    num_generations: int = 1,\n    **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n    \"\"\"Method to generate a list of outputs for the given inputs using an offline batch\n    generation method to be implemented by each `LLM`.\n\n    This method should create jobs the first time is called and store the job ids, so\n    the second and subsequent calls can retrieve the results of the batch generation.\n    If subsequent calls are made before the batch generation is finished, then the method\n    should raise a `DistilabelOfflineBatchGenerationNotFinishedException`. This exception\n    will be handled automatically by the `Pipeline` which will store all the required\n    information for recovering the pipeline execution when the batch generation is finished.\n\n    Args:\n        inputs: the list of inputs to generate responses for.\n        num_generations: the number of generations to generate per input.\n        **kwargs: the additional kwargs to be used for the generation.\n\n    Returns:\n        A list containing the generations for each input.\n    \"\"\"\n    raise DistilabelNotImplementedError(\n        f\"`offline_batch_generate` is not implemented for `{self.__class__.__name__}`\",\n        page=\"sections/how_to_guides/advanced/offline-batch-generation/\",\n    )\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM","title":"AsyncLLM","text":"

Bases: LLM

Abstract class for asynchronous LLMs, so as to benefit from the async capabilities of each LLM implementation. This class is meant to be subclassed by each LLM, and the method agenerate needs to be implemented to provide the asynchronous generation of responses.

Attributes:

Name Type Description _event_loop AbstractEventLoop

the event loop to be used for the asynchronous generation of responses.

Source code in src/distilabel/models/llms/base.py
class AsyncLLM(LLM):\n    \"\"\"Abstract class for asynchronous LLMs, so as to benefit from the async capabilities\n    of each LLM implementation. This class is meant to be subclassed by each LLM, and the\n    method `agenerate` needs to be implemented to provide the asynchronous generation of\n    responses.\n\n    Attributes:\n        _event_loop: the event loop to be used for the asynchronous generation of responses.\n    \"\"\"\n\n    _num_generations_param_supported = True\n    _event_loop: \"asyncio.AbstractEventLoop\" = PrivateAttr(default=None)\n    _new_event_loop: bool = PrivateAttr(default=False)\n\n    @property\n    def generate_parameters(self) -> List[inspect.Parameter]:\n        \"\"\"Returns the parameters of the `agenerate` method.\n\n        Returns:\n            A list containing the parameters of the `agenerate` method.\n        \"\"\"\n        return list(inspect.signature(self.agenerate).parameters.values())\n\n    @cached_property\n    def generate_parsed_docstring(self) -> \"Docstring\":\n        \"\"\"Returns the parsed docstring of the `agenerate` method.\n\n        Returns:\n            The parsed docstring of the `agenerate` method.\n        \"\"\"\n        return parse_google_docstring(self.agenerate)\n\n    @property\n    def event_loop(self) -> \"asyncio.AbstractEventLoop\":\n        if self._event_loop is None:\n            try:\n                self._event_loop = asyncio.get_running_loop()\n                if self._event_loop.is_closed():\n                    self._event_loop = asyncio.new_event_loop()  # type: ignore\n                    self._new_event_loop = True\n            except RuntimeError:\n                self._event_loop = asyncio.new_event_loop()\n                self._new_event_loop = True\n        asyncio.set_event_loop(self._event_loop)\n        return self._event_loop\n\n    @abstractmethod\n    async def agenerate(\n        self, input: \"FormattedInput\", num_generations: int = 1, **kwargs: Any\n    ) -> 
List[Union[str, None]]:\n        \"\"\"Method to generate a `num_generations` responses for a given input asynchronously,\n        and executed concurrently in `generate` method.\n        \"\"\"\n        pass\n\n    async def _agenerate(\n        self, inputs: List[\"FormattedInput\"], num_generations: int = 1, **kwargs: Any\n    ) -> List[\"GenerateOutput\"]:\n        \"\"\"Internal function to concurrently generate responses for a list of inputs.\n\n        Args:\n            inputs: the list of inputs to generate responses for.\n            num_generations: the number of generations to generate per input.\n            **kwargs: the additional kwargs to be used for the generation.\n\n        Returns:\n            A list containing the generations for each input.\n        \"\"\"\n        if self._num_generations_param_supported:\n            tasks = [\n                asyncio.create_task(\n                    self.agenerate(\n                        input=input, num_generations=num_generations, **kwargs\n                    )\n                )\n                for input in inputs\n            ]\n            return await asyncio.gather(*tasks)\n\n        tasks = [\n            asyncio.create_task(self.agenerate(input=input, **kwargs))\n            for input in inputs\n            for _ in range(num_generations)\n        ]\n        outputs = [outputs[0] for outputs in await asyncio.gather(*tasks)]\n        return [\n            list(group)\n            for group in grouper(outputs, n=num_generations, incomplete=\"ignore\")\n        ]\n\n    def generate(\n        self,\n        inputs: List[\"FormattedInput\"],\n        num_generations: int = 1,\n        **kwargs: Any,\n    ) -> List[\"GenerateOutput\"]:\n        \"\"\"Method to generate a list of responses asynchronously, returning the output\n        synchronously awaiting for the response of each input sent to `agenerate`.\n\n        Args:\n            inputs: the list of inputs to generate responses for.\n      
      num_generations: the number of generations to generate per input.\n            **kwargs: the additional kwargs to be used for the generation.\n\n        Returns:\n            A list containing the generations for each input.\n        \"\"\"\n        return self.event_loop.run_until_complete(\n            self._agenerate(inputs=inputs, num_generations=num_generations, **kwargs)\n        )\n\n    def __del__(self) -> None:\n        \"\"\"Closes the event loop when the object is deleted.\"\"\"\n        if sys.meta_path is None:\n            return\n\n        if self._new_event_loop:\n            if self._event_loop.is_running():\n                self._event_loop.stop()\n            self._event_loop.close()\n\n    @staticmethod\n    def _prepare_structured_output(  # type: ignore\n        structured_output: \"InstructorStructuredOutputType\",\n        client: Any = None,\n        framework: Optional[str] = None,\n    ) -> Dict[str, Union[str, Any]]:\n        \"\"\"Wraps the client and updates the schema to work store it internally as a json schema.\n\n        Args:\n            structured_output: The configuration dict to prepare the structured output.\n            client: The client to wrap to generate structured output. 
Implemented to work\n                with `instructor`.\n            framework: The name of the framework.\n\n        Returns:\n            A dictionary containing the wrapped client and the schema to update the structured_output\n            variable in case it is a pydantic model.\n        \"\"\"\n        from distilabel.steps.tasks.structured_outputs.instructor import (\n            prepare_instructor,\n        )\n\n        result = {}\n        client = prepare_instructor(\n            client,\n            mode=structured_output.get(\"mode\"),\n            framework=framework,  # type: ignore\n        )\n        result[\"client\"] = client\n\n        schema = structured_output.get(\"schema\")\n        if not schema:\n            raise DistilabelUserError(\n                f\"The `structured_output` argument must contain a schema: {structured_output}\",\n                page=\"sections/how_to_guides/advanced/structured_generation/#instructor\",\n            )\n        if inspect.isclass(schema) and issubclass(schema, BaseModel):\n            # We want a json schema for the serialization, but instructor wants a pydantic BaseModel.\n            structured_output[\"schema\"] = schema.model_json_schema()  # type: ignore\n            result[\"structured_output\"] = structured_output\n\n        return result\n\n    @staticmethod\n    def _prepare_kwargs(\n        arguments: Dict[str, Any], structured_output: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"Helper method to update the kwargs with the structured output configuration,\n        used in case they are defined.\n\n        Args:\n            arguments: The arguments that would be passed to the LLM as **kwargs.\n                to update with the structured output configuration.\n            structured_outputs: The structured output configuration to update the arguments.\n\n        Returns:\n            kwargs updated with the special arguments used by `instructor`.\n        \"\"\"\n        # We can deal 
with json schema or BaseModel, but we need to convert it to a BaseModel\n        # for the Instructor client.\n        schema = structured_output.get(\"schema\", {})\n\n        # If there's already a pydantic model, we don't need to do anything,\n        # otherwise, try to obtain one.\n        if not (inspect.isclass(schema) and issubclass(schema, BaseModel)):\n            from distilabel.steps.tasks.structured_outputs.utils import (\n                json_schema_to_model,\n            )\n\n            if isinstance(schema, str):\n                # In case it was saved in the dataset as a string.\n                schema = json.loads(schema)\n\n            try:\n                schema = json_schema_to_model(schema)\n            except Exception as e:\n                raise ValueError(\n                    f\"Failed to convert the schema to a pydantic model, the model is too complex currently: {e}\"\n                ) from e\n\n        arguments.update(\n            **{\n                \"response_model\": schema,\n                \"max_retries\": structured_output.get(\"max_retries\", 1),\n            },\n        )\n        return arguments\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.generate_parameters","title":"generate_parameters: List[inspect.Parameter] property","text":"

Returns the parameters of the agenerate method.

Returns:

Type Description List[Parameter]

A list containing the parameters of the agenerate method.

"},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.generate_parsed_docstring","title":"generate_parsed_docstring: Docstring cached property","text":"

Returns the parsed docstring of the agenerate method.

Returns:

Type Description Docstring

The parsed docstring of the agenerate method.

"},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.agenerate","title":"agenerate(input, num_generations=1, **kwargs) abstractmethod async","text":"

Method to generate num_generations responses for a given input asynchronously; it is executed concurrently in the generate method.

Source code in src/distilabel/models/llms/base.py
@abstractmethod\nasync def agenerate(\n    self, input: \"FormattedInput\", num_generations: int = 1, **kwargs: Any\n) -> List[Union[str, None]]:\n    \"\"\"Method to generate a `num_generations` responses for a given input asynchronously,\n    and executed concurrently in `generate` method.\n    \"\"\"\n    pass\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.generate","title":"generate(inputs, num_generations=1, **kwargs)","text":"

Method to generate a list of responses asynchronously, returning the output synchronously after awaiting the response for each input sent to agenerate.

Parameters:

Name Type Description Default inputs List[FormattedInput]

the list of inputs to generate responses for.

required num_generations int

the number of generations to generate per input.

1 **kwargs Any

the additional kwargs to be used for the generation.

{}

Returns:

Type Description List[GenerateOutput]

A list containing the generations for each input.

Source code in src/distilabel/models/llms/base.py
def generate(\n    self,\n    inputs: List[\"FormattedInput\"],\n    num_generations: int = 1,\n    **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n    \"\"\"Method to generate a list of responses asynchronously, returning the output\n    synchronously awaiting for the response of each input sent to `agenerate`.\n\n    Args:\n        inputs: the list of inputs to generate responses for.\n        num_generations: the number of generations to generate per input.\n        **kwargs: the additional kwargs to be used for the generation.\n\n    Returns:\n        A list containing the generations for each input.\n    \"\"\"\n    return self.event_loop.run_until_complete(\n        self._agenerate(inputs=inputs, num_generations=num_generations, **kwargs)\n    )\n
"},{"location":"api/models/llm/#distilabel.models.llms.base.AsyncLLM.__del__","title":"__del__()","text":"

Closes the event loop when the object is deleted.

Source code in src/distilabel/models/llms/base.py
def __del__(self) -> None:\n    \"\"\"Closes the event loop when the object is deleted.\"\"\"\n    if sys.meta_path is None:\n        return\n\n    if self._new_event_loop:\n        if self._event_loop.is_running():\n            self._event_loop.stop()\n        self._event_loop.close()\n
"},{"location":"api/models/llm/llm_gallery/","title":"LLM Gallery","text":"

This section contains the existing LLM subclasses implemented in distilabel.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms","title":"llms","text":""},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM","title":"AnthropicLLM","text":"

Bases: AsyncLLM

Anthropic LLM implementation running the Async API client.

Attributes:

Name Type Description model str

the name of the model to use for the LLM e.g. \"claude-3-opus-20240229\", \"claude-3-sonnet-20240229\", etc. Available models can be checked here: Anthropic: Models overview.

api_key Optional[RuntimeParameter[SecretStr]]

the API key to authenticate the requests to the Anthropic API. If not provided, it will be read from ANTHROPIC_API_KEY environment variable.

base_url Optional[RuntimeParameter[str]]

the base URL to use for the Anthropic API. Defaults to None which means that https://api.anthropic.com will be used internally.

timeout RuntimeParameter[float]

the maximum time in seconds to wait for a response. Defaults to 600.0.

max_retries RuntimeParameter[int]

The maximum number of times to retry the request before failing. Defaults to 6.

http_client Optional[AsyncClient]

if provided, an alternative HTTP client to use for calling Anthropic API. Defaults to None.

structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]]

a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

_api_key_env_var str

the name of the environment variable to use for the API key. It is meant to be used internally.

_aclient Optional[AsyncAnthropic]

the AsyncAnthropic client to use for the Anthropic API. It is meant to be used internally. Set in the load method.

Runtime parameters
  • api_key: the API key to authenticate the requests to the Anthropic API. If not provided, it will be read from ANTHROPIC_API_KEY environment variable.
  • base_url: the base URL to use for the Anthropic API. Defaults to \"https://api.anthropic.com\".
  • timeout: the maximum time in seconds to wait for a response. Defaults to 600.0.
  • max_retries: the maximum number of times to retry the request before failing. Defaults to 6.

Examples:

Generate text:

from distilabel.models.llms import AnthropicLLM\n\nllm = AnthropicLLM(model=\"claude-3-opus-20240229\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pydantic import BaseModel\nfrom distilabel.models.llms import AnthropicLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = AnthropicLLM(\n    model=\"claude-3-opus-20240229\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
Source code in src/distilabel/models/llms/anthropic.py
class AnthropicLLM(AsyncLLM):\n    \"\"\"Anthropic LLM implementation running the Async API client.\n\n    Attributes:\n        model: the name of the model to use for the LLM e.g. \"claude-3-opus-20240229\",\n            \"claude-3-sonnet-20240229\", etc. Available models can be checked here:\n            [Anthropic: Models overview](https://docs.anthropic.com/claude/docs/models-overview).\n        api_key: the API key to authenticate the requests to the Anthropic API. If not provided,\n            it will be read from `ANTHROPIC_API_KEY` environment variable.\n        base_url: the base URL to use for the Anthropic API. Defaults to `None` which means\n            that `https://api.anthropic.com` will be used internally.\n        timeout: the maximum time in seconds to wait for a response. Defaults to `600.0`.\n        max_retries: The maximum number of times to retry the request before failing. Defaults\n            to `6`.\n        http_client: if provided, an alternative HTTP client to use for calling Anthropic\n            API. Defaults to `None`.\n        structured_output: a dictionary containing the structured output configuration configuration\n            using `instructor`. You can take a look at the dictionary structure in\n            `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n        _api_key_env_var: the name of the environment variable to use for the API key. It\n            is meant to be used internally.\n        _aclient: the `AsyncAnthropic` client to use for the Anthropic API. It is meant\n            to be used internally. Set in the `load` method.\n\n    Runtime parameters:\n        - `api_key`: the API key to authenticate the requests to the Anthropic API. If not\n            provided, it will be read from `ANTHROPIC_API_KEY` environment variable.\n        - `base_url`: the base URL to use for the Anthropic API. 
Defaults to `\"https://api.anthropic.com\"`.\n        - `timeout`: the maximum time in seconds to wait for a response. Defaults to `600.0`.\n        - `max_retries`: the maximum number of times to retry the request before failing.\n            Defaults to `6`.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import AnthropicLLM\n\n        llm = AnthropicLLM(model=\"claude-3-opus-20240229\", api_key=\"api.key\")\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Generate structured data:\n\n        ```python\n        from pydantic import BaseModel\n        from distilabel.models.llms import AnthropicLLM\n\n        class User(BaseModel):\n            name: str\n            last_name: str\n            id: int\n\n        llm = AnthropicLLM(\n            model=\"claude-3-opus-20240229\",\n            api_key=\"api.key\",\n            structured_output={\"schema\": User}\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n        ```\n    \"\"\"\n\n    model: str\n    base_url: Optional[RuntimeParameter[str]] = Field(\n        default_factory=lambda: os.getenv(\n            \"ANTHROPIC_BASE_URL\", \"https://api.anthropic.com\"\n        ),\n        description=\"The base URL to use for the Anthropic API.\",\n    )\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(_ANTHROPIC_API_KEY_ENV_VAR_NAME),\n        description=\"The API key to authenticate the requests to the Anthropic API.\",\n    )\n    timeout: RuntimeParameter[float] = Field(\n        default=600.0,\n        description=\"The maximum time in seconds to wait for a response from the API.\",\n    )\n    max_retries: RuntimeParameter[int] = Field(\n        default=6,\n        description=\"The 
maximum number of times to retry the request to the API before\"\n        \" failing.\",\n    )\n    http_client: Optional[AsyncClient] = Field(default=None, exclude=True)\n    structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n        Field(\n            default=None,\n            description=\"The structured output format to use across all the generations.\",\n        )\n    )\n\n    _num_generations_param_supported = False\n\n    _api_key_env_var: str = PrivateAttr(default=_ANTHROPIC_API_KEY_ENV_VAR_NAME)\n    _aclient: Optional[\"AsyncAnthropic\"] = PrivateAttr(...)\n\n    def _check_model_exists(self) -> None:\n        \"\"\"Checks if the specified model exists in the available models.\"\"\"\n        from anthropic import AsyncAnthropic\n\n        annotation = get_type_hints(AsyncAnthropic().messages.create).get(\"model\", None)\n        models = [\n            value\n            for type_ in get_args(annotation)\n            if get_origin(type_) is Literal\n            for value in get_args(type_)\n        ]\n\n        if self.model not in models:\n            raise ValueError(\n                f\"Model {self.model} does not exist among available models. \"\n                f\"The available models are {', '.join(models)}\"\n            )\n\n    def load(self) -> None:\n        \"\"\"Loads the `AsyncAnthropic` client to use the Anthropic async API.\"\"\"\n        super().load()\n\n        try:\n            from anthropic import AsyncAnthropic\n        except ImportError as ie:\n            raise ImportError(\n                \"Anthropic Python client is not installed. 
Please install it using\"\n                \" `pip install anthropic`.\"\n            ) from ie\n\n        if self.api_key is None:\n            raise ValueError(\n                f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n                f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n            )\n\n        self._check_model_exists()\n\n        self._aclient = AsyncAnthropic(\n            api_key=self.api_key.get_secret_value(),\n            base_url=self.base_url,\n            timeout=self.timeout,\n            http_client=self.http_client,\n            max_retries=self.max_retries,\n        )\n        if self.structured_output:\n            result = self._prepare_structured_output(\n                structured_output=self.structured_output,\n                client=self._aclient,\n                framework=\"anthropic\",\n            )\n            self._aclient = result.get(\"client\")\n            if structured_output := result.get(\"structured_output\"):\n                self.structured_output = structured_output\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    @validate_call\n    async def agenerate(  # type: ignore\n        self,\n        input: FormattedInput,\n        max_tokens: int = 128,\n        stop_sequences: Union[List[str], None] = None,\n        temperature: float = 1.0,\n        top_p: Union[float, None] = None,\n        top_k: Union[int, None] = None,\n    ) -> GenerateOutput:\n        \"\"\"Generates a response asynchronously, using the [Anthropic Async API definition](https://github.com/anthropics/anthropic-sdk-python).\n\n        Args:\n            input: a single input in chat format to generate responses for.\n            max_tokens: the maximum number of new tokens that the model will generate. 
Defaults to `128`.\n            stop_sequences: custom text sequences that will cause the model to stop generating. Defaults to `NOT_GIVEN`.\n            temperature: the temperature to use for the generation. Set only if top_p is None. Defaults to `1.0`.\n            top_p: the top-p value to use for the generation. Defaults to `NOT_GIVEN`.\n            top_k: the top-k value to use for the generation. Defaults to `NOT_GIVEN`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n        from anthropic._types import NOT_GIVEN\n\n        structured_output = None\n        if isinstance(input, tuple):\n            input, structured_output = input\n            result = self._prepare_structured_output(\n                structured_output=structured_output,\n                client=self._aclient,\n                framework=\"anthropic\",\n            )\n            self._aclient = result.get(\"client\")\n\n        if structured_output is None and self.structured_output is not None:\n            structured_output = self.structured_output\n\n        kwargs = {\n            \"messages\": input,  # type: ignore\n            \"model\": self.model,\n            \"system\": (\n                input.pop(0)[\"content\"]\n                if input and input[0][\"role\"] == \"system\"\n                else NOT_GIVEN\n            ),\n            \"max_tokens\": max_tokens,\n            \"stream\": False,\n            \"stop_sequences\": NOT_GIVEN if stop_sequences is None else stop_sequences,\n            \"temperature\": temperature,\n            \"top_p\": NOT_GIVEN if top_p is None else top_p,\n            \"top_k\": NOT_GIVEN if top_k is None else top_k,\n        }\n\n        if structured_output:\n            kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n        generations = []\n\n        completion = await self._aclient.messages.create(**kwargs)  # type: ignore\n        if 
structured_output:\n            generations.append(completion.model_dump_json())\n            return generations\n\n        if (content := completion.content[0].text) is None:\n            self._logger.warning(\n                f\"Received no response using Anthropic client (model: '{self.model}').\"\n                f\" Finish reason was: {completion.stop_reason}\"\n            )\n        generations.append(content)\n        return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM._check_model_exists","title":"_check_model_exists()","text":"

Checks if the specified model exists in the available models.

Source code in src/distilabel/models/llms/anthropic.py
def _check_model_exists(self) -> None:\n    \"\"\"Checks if the specified model exists in the available models.\"\"\"\n    from anthropic import AsyncAnthropic\n\n    annotation = get_type_hints(AsyncAnthropic().messages.create).get(\"model\", None)\n    models = [\n        value\n        for type_ in get_args(annotation)\n        if get_origin(type_) is Literal\n        for value in get_args(type_)\n    ]\n\n    if self.model not in models:\n        raise ValueError(\n            f\"Model {self.model} does not exist among available models. \"\n            f\"The available models are {', '.join(models)}\"\n        )\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM.load","title":"load()","text":"

Loads the AsyncAnthropic client to use the Anthropic async API.

Source code in src/distilabel/models/llms/anthropic.py
def load(self) -> None:\n    \"\"\"Loads the `AsyncAnthropic` client to use the Anthropic async API.\"\"\"\n    super().load()\n\n    try:\n        from anthropic import AsyncAnthropic\n    except ImportError as ie:\n        raise ImportError(\n            \"Anthropic Python client is not installed. Please install it using\"\n            \" `pip install anthropic`.\"\n        ) from ie\n\n    if self.api_key is None:\n        raise ValueError(\n            f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n            f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n        )\n\n    self._check_model_exists()\n\n    self._aclient = AsyncAnthropic(\n        api_key=self.api_key.get_secret_value(),\n        base_url=self.base_url,\n        timeout=self.timeout,\n        http_client=self.http_client,\n        max_retries=self.max_retries,\n    )\n    if self.structured_output:\n        result = self._prepare_structured_output(\n            structured_output=self.structured_output,\n            client=self._aclient,\n            framework=\"anthropic\",\n        )\n        self._aclient = result.get(\"client\")\n        if structured_output := result.get(\"structured_output\"):\n            self.structured_output = structured_output\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnthropicLLM.agenerate","title":"agenerate(input, max_tokens=128, stop_sequences=None, temperature=1.0, top_p=None, top_k=None) async","text":"

Generates a response asynchronously, using the Anthropic Async API definition.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate responses for.

required max_tokens int

the maximum number of new tokens that the model will generate. Defaults to 128.

128 stop_sequences Union[List[str], None]

custom text sequences that will cause the model to stop generating. Defaults to NOT_GIVEN.

None temperature float

the temperature to use for the generation. Set only if top_p is None. Defaults to 1.0.

1.0 top_p Union[float, None]

the top-p value to use for the generation. Defaults to NOT_GIVEN.

None top_k Union[int, None]

the top-k value to use for the generation. Defaults to NOT_GIVEN.

None

Returns:

Type Description GenerateOutput

A list containing the generated response for the given input.

Source code in src/distilabel/models/llms/anthropic.py
@validate_call\nasync def agenerate(  # type: ignore\n    self,\n    input: FormattedInput,\n    max_tokens: int = 128,\n    stop_sequences: Union[List[str], None] = None,\n    temperature: float = 1.0,\n    top_p: Union[float, None] = None,\n    top_k: Union[int, None] = None,\n) -> GenerateOutput:\n    \"\"\"Generates a response asynchronously, using the [Anthropic Async API definition](https://github.com/anthropics/anthropic-sdk-python).\n\n    Args:\n        input: a single input in chat format to generate responses for.\n        max_tokens: the maximum number of new tokens that the model will generate. Defaults to `128`.\n        stop_sequences: custom text sequences that will cause the model to stop generating. Defaults to `NOT_GIVEN`.\n        temperature: the temperature to use for the generation. Set only if top_p is None. Defaults to `1.0`.\n        top_p: the top-p value to use for the generation. Defaults to `NOT_GIVEN`.\n        top_k: the top-k value to use for the generation. 
Defaults to `NOT_GIVEN`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n    from anthropic._types import NOT_GIVEN\n\n    structured_output = None\n    if isinstance(input, tuple):\n        input, structured_output = input\n        result = self._prepare_structured_output(\n            structured_output=structured_output,\n            client=self._aclient,\n            framework=\"anthropic\",\n        )\n        self._aclient = result.get(\"client\")\n\n    if structured_output is None and self.structured_output is not None:\n        structured_output = self.structured_output\n\n    kwargs = {\n        \"messages\": input,  # type: ignore\n        \"model\": self.model,\n        \"system\": (\n            input.pop(0)[\"content\"]\n            if input and input[0][\"role\"] == \"system\"\n            else NOT_GIVEN\n        ),\n        \"max_tokens\": max_tokens,\n        \"stream\": False,\n        \"stop_sequences\": NOT_GIVEN if stop_sequences is None else stop_sequences,\n        \"temperature\": temperature,\n        \"top_p\": NOT_GIVEN if top_p is None else top_p,\n        \"top_k\": NOT_GIVEN if top_k is None else top_k,\n    }\n\n    if structured_output:\n        kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n    generations = []\n\n    completion = await self._aclient.messages.create(**kwargs)  # type: ignore\n    if structured_output:\n        generations.append(completion.model_dump_json())\n        return generations\n\n    if (content := completion.content[0].text) is None:\n        self._logger.warning(\n            f\"Received no response using Anthropic client (model: '{self.model}').\"\n            f\" Finish reason was: {completion.stop_reason}\"\n        )\n    generations.append(content)\n    return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AnyscaleLLM","title":"AnyscaleLLM","text":"

Bases: OpenAILLM

Anyscale LLM implementation running the async API client of OpenAI.

Attributes:

Name Type Description model

the model name to use for the LLM, e.g., google/gemma-7b-it. See the supported models under the \"Text Generation -> Supported Models\" section here.

base_url Optional[RuntimeParameter[str]]

the base URL to use for the Anyscale API requests. Defaults to None, which means that the value set for the environment variable ANYSCALE_BASE_URL will be used, or \"https://api.endpoints.anyscale.com/v1\" if not set.

api_key Optional[RuntimeParameter[SecretStr]]

the API key to authenticate the requests to the Anyscale API. Defaults to None which means that the value set for the environment variable ANYSCALE_API_KEY will be used, or None if not set.

_api_key_env_var str

the name of the environment variable to use for the API key. It is meant to be used internally.

Examples:

Generate text:

from distilabel.models.llms import AnyscaleLLM\n\nllm = AnyscaleLLM(model=\"google/gemma-7b-it\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
Source code in src/distilabel/models/llms/anyscale.py
class AnyscaleLLM(OpenAILLM):\n    \"\"\"Anyscale LLM implementation running the async API client of OpenAI.\n\n    Attributes:\n        model: the model name to use for the LLM, e.g., `google/gemma-7b-it`. See the\n            supported models under the \"Text Generation -> Supported Models\" section\n            [here](https://docs.endpoints.anyscale.com/).\n        base_url: the base URL to use for the Anyscale API requests. Defaults to `None`, which\n            means that the value set for the environment variable `ANYSCALE_BASE_URL` will be used, or\n            \"https://api.endpoints.anyscale.com/v1\" if not set.\n        api_key: the API key to authenticate the requests to the Anyscale API. Defaults to `None` which\n            means that the value set for the environment variable `ANYSCALE_API_KEY` will be used, or\n            `None` if not set.\n        _api_key_env_var: the name of the environment variable to use for the API key.\n            It is meant to be used internally.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import AnyscaleLLM\n\n        llm = AnyscaleLLM(model=\"google/gemma-7b-it\", api_key=\"api.key\")\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n    \"\"\"\n\n    base_url: Optional[RuntimeParameter[str]] = Field(\n        default_factory=lambda: os.getenv(\n            \"ANYSCALE_BASE_URL\", \"https://api.endpoints.anyscale.com/v1\"\n        ),\n        description=\"The base URL to use for the Anyscale API requests.\",\n    )\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(_ANYSCALE_API_KEY_ENV_VAR_NAME),\n        description=\"The API key to authenticate the requests to the Anyscale API.\",\n    )\n\n    _api_key_env_var: str = PrivateAttr(_ANYSCALE_API_KEY_ENV_VAR_NAME)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AzureOpenAILLM","title":"AzureOpenAILLM","text":"

Bases: OpenAILLM

Azure OpenAI LLM implementation running the async API client.

Attributes:

Name Type Description model

the model name to use for the LLM, i.e., the name of the Azure deployment.

base_url Optional[RuntimeParameter[str]]

the base URL to use for the Azure OpenAI API requests, i.e. the Azure OpenAI endpoint. Defaults to None which means that the value set for the environment variable AZURE_OPENAI_ENDPOINT will be used, or None if not set.

api_key Optional[RuntimeParameter[SecretStr]]

the API key to authenticate the requests to the Azure OpenAI API. Defaults to None which means that the value set for the environment variable AZURE_OPENAI_API_KEY will be used, or None if not set.

api_version Optional[RuntimeParameter[str]]

the API version to use for the Azure OpenAI API. Defaults to None which means that the value set for the environment variable OPENAI_API_VERSION will be used, or None if not set.

Icon

:material-microsoft-azure:

Examples:

Generate text:

from distilabel.models.llms import AzureOpenAILLM\n\nllm = AzureOpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate text from a custom endpoint following the OpenAI API:

from distilabel.models.llms import AzureOpenAILLM\n\nllm = AzureOpenAILLM(\n    model=\"prometheus-eval/prometheus-7b-v2.0\",\n    base_url=r\"http://localhost:8080/v1\"\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pydantic import BaseModel\nfrom distilabel.models.llms import AzureOpenAILLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = AzureOpenAILLM(\n    model=\"gpt-4-turbo\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
Source code in src/distilabel/models/llms/azure.py
class AzureOpenAILLM(OpenAILLM):\n    \"\"\"Azure OpenAI LLM implementation running the async API client.\n\n    Attributes:\n        model: the model name to use for the LLM i.e. the name of the Azure deployment.\n        base_url: the base URL to use for the Azure OpenAI API can be set with `AZURE_OPENAI_ENDPOINT`.\n            Defaults to `None` which means that the value set for the environment variable\n            `AZURE_OPENAI_ENDPOINT` will be used, or `None` if not set.\n        api_key: the API key to authenticate the requests to the Azure OpenAI API. Defaults to `None`\n            which means that the value set for the environment variable `AZURE_OPENAI_API_KEY` will be\n            used, or `None` if not set.\n        api_version: the API version to use for the Azure OpenAI API. Defaults to `None` which means\n            that the value set for the environment variable `OPENAI_API_VERSION` will be used, or\n            `None` if not set.\n\n    Icon:\n        `:material-microsoft-azure:`\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import AzureOpenAILLM\n\n        llm = AzureOpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Generate text from a custom endpoint following the OpenAI API:\n\n        ```python\n        from distilabel.models.llms import AzureOpenAILLM\n\n        llm = AzureOpenAILLM(\n            model=\"prometheus-eval/prometheus-7b-v2.0\",\n            base_url=r\"http://localhost:8080/v1\"\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Generate structured data:\n\n        ```python\n        from pydantic import BaseModel\n        from distilabel.models.llms import AzureOpenAILLM\n\n        class User(BaseModel):\n      
      name: str\n            last_name: str\n            id: int\n\n        llm = AzureOpenAILLM(\n            model=\"gpt-4-turbo\",\n            api_key=\"api.key\",\n            structured_output={\"schema\": User}\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n        ```\n    \"\"\"\n\n    base_url: Optional[RuntimeParameter[str]] = Field(\n        default_factory=lambda: os.getenv(_AZURE_OPENAI_ENDPOINT_ENV_VAR_NAME),\n        description=\"The base URL to use for the Azure OpenAI API requests i.e. the Azure OpenAI endpoint.\",\n    )\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(_AZURE_OPENAI_API_KEY_ENV_VAR_NAME),\n        description=\"The API key to authenticate the requests to the Azure OpenAI API.\",\n    )\n\n    api_version: Optional[RuntimeParameter[str]] = Field(\n        default_factory=lambda: os.getenv(\"OPENAI_API_VERSION\"),\n        description=\"The API version to use for the Azure OpenAI API.\",\n    )\n\n    _base_url_env_var: str = PrivateAttr(_AZURE_OPENAI_ENDPOINT_ENV_VAR_NAME)\n    _api_key_env_var: str = PrivateAttr(_AZURE_OPENAI_API_KEY_ENV_VAR_NAME)\n    _aclient: Optional[\"AsyncAzureOpenAI\"] = PrivateAttr(...)  
# type: ignore\n\n    @override\n    def load(self) -> None:\n        \"\"\"Loads the `AsyncAzureOpenAI` client to benefit from async requests.\"\"\"\n        # This is a workaround to avoid the `OpenAILLM` calling the _prepare_structured_output\n        # in the load method before we have the proper client.\n        with patch(\n            \"distilabel.models.openai.OpenAILLM._prepare_structured_output\", lambda x: x\n        ):\n            super().load()\n\n        try:\n            from openai import AsyncAzureOpenAI\n        except ImportError as ie:\n            raise ImportError(\n                \"OpenAI Python client is not installed. Please install it using\"\n                \" `pip install openai`.\"\n            ) from ie\n\n        if self.api_key is None:\n            raise ValueError(\n                f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n                f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n            )\n\n        # TODO: May be worth adding the AD auth too? Also the `organization`?\n        self._aclient = AsyncAzureOpenAI(  # type: ignore\n            azure_endpoint=self.base_url,  # type: ignore\n            azure_deployment=self.model,\n            api_version=self.api_version,\n            api_key=self.api_key.get_secret_value(),\n            max_retries=self.max_retries,  # type: ignore\n            timeout=self.timeout,\n        )\n\n        if self.structured_output:\n            self._prepare_structured_output(self.structured_output)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.AzureOpenAILLM.load","title":"load()","text":"

Loads the AsyncAzureOpenAI client to benefit from async requests.

Source code in src/distilabel/models/llms/azure.py
@override\ndef load(self) -> None:\n    \"\"\"Loads the `AsyncAzureOpenAI` client to benefit from async requests.\"\"\"\n    # This is a workaround to avoid the `OpenAILLM` calling the _prepare_structured_output\n    # in the load method before we have the proper client.\n    with patch(\n        \"distilabel.models.openai.OpenAILLM._prepare_structured_output\", lambda x: x\n    ):\n        super().load()\n\n    try:\n        from openai import AsyncAzureOpenAI\n    except ImportError as ie:\n        raise ImportError(\n            \"OpenAI Python client is not installed. Please install it using\"\n            \" `pip install openai`.\"\n        ) from ie\n\n    if self.api_key is None:\n        raise ValueError(\n            f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n            f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n        )\n\n    # TODO: May be worth adding the AD auth too? Also the `organization`?\n    self._aclient = AsyncAzureOpenAI(  # type: ignore\n        azure_endpoint=self.base_url,  # type: ignore\n        azure_deployment=self.model,\n        api_version=self.api_version,\n        api_key=self.api_key.get_secret_value(),\n        max_retries=self.max_retries,  # type: ignore\n        timeout=self.timeout,\n    )\n\n    if self.structured_output:\n        self._prepare_structured_output(self.structured_output)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM","title":"CohereLLM","text":"

Bases: AsyncLLM

Cohere API implementation using the async client for concurrent text generation.

Attributes:

Name Type Description model str

the name of the model from the Cohere API to use for the generation.

base_url Optional[RuntimeParameter[str]]

the base URL to use for the Cohere API requests. Defaults to \"https://api.cohere.ai/v1\".

api_key Optional[RuntimeParameter[SecretStr]]

the API key to authenticate the requests to the Cohere API. Defaults to the value of the COHERE_API_KEY environment variable.

timeout RuntimeParameter[int]

the maximum time in seconds to wait for a response from the API. Defaults to 120.

client_name RuntimeParameter[str]

the name of the client to use for the API requests. Defaults to \"distilabel\".

structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]]

a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

_ChatMessage Type[ChatMessage]

the ChatMessage class from the cohere package.

_aclient AsyncClient

the AsyncClient client from the cohere package.

Runtime parameters
  • base_url: the base URL to use for the Cohere API requests. Defaults to \"https://api.cohere.ai/v1\".
  • api_key: the API key to authenticate the requests to the Cohere API. Defaults to the value of the COHERE_API_KEY environment variable.
  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.
  • client_name: the name of the client to use for the API requests. Defaults to \"distilabel\".

Examples:

Generate text:

from distilabel.models.llms import CohereLLM\n\nllm = CohereLLM(model=\"CohereForAI/c4ai-command-r-plus\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pydantic import BaseModel\nfrom distilabel.models.llms import CohereLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = CohereLLM(\n    model=\"CohereForAI/c4ai-command-r-plus\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
Source code in src/distilabel/models/llms/cohere.py
class CohereLLM(AsyncLLM):\n    \"\"\"Cohere API implementation using the async client for concurrent text generation.\n\n    Attributes:\n        model: the name of the model from the Cohere API to use for the generation.\n        base_url: the base URL to use for the Cohere API requests. Defaults to\n            `\"https://api.cohere.ai/v1\"`.\n        api_key: the API key to authenticate the requests to the Cohere API. Defaults to\n            the value of the `COHERE_API_KEY` environment variable.\n        timeout: the maximum time in seconds to wait for a response from the API. Defaults\n            to `120`.\n        client_name: the name of the client to use for the API requests. Defaults to\n            `\"distilabel\"`.\n        structured_output: a dictionary containing the structured output configuration configuration\n            using `instructor`. You can take a look at the dictionary structure in\n            `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n        _ChatMessage: the `ChatMessage` class from the `cohere` package.\n        _aclient: the `AsyncClient` client from the `cohere` package.\n\n    Runtime parameters:\n        - `base_url`: the base URL to use for the Cohere API requests. Defaults to\n            `\"https://api.cohere.ai/v1\"`.\n        - `api_key`: the API key to authenticate the requests to the Cohere API. Defaults\n            to the value of the `COHERE_API_KEY` environment variable.\n        - `timeout`: the maximum time in seconds to wait for a response from the API. Defaults\n            to `120`.\n        - `client_name`: the name of the client to use for the API requests. 
Defaults to\n            `\"distilabel\"`.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import CohereLLM\n\n        llm = CohereLLM(model=\"CohereForAI/c4ai-command-r-plus\")\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\n        Generate structured data:\n\n        ```python\n        from pydantic import BaseModel\n        from distilabel.models.llms import CohereLLM\n\n        class User(BaseModel):\n            name: str\n            last_name: str\n            id: int\n\n        llm = CohereLLM(\n            model=\"CohereForAI/c4ai-command-r-plus\",\n            api_key=\"api.key\",\n            structured_output={\"schema\": User}\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n        ```\n    \"\"\"\n\n    model: str\n    base_url: Optional[RuntimeParameter[str]] = Field(\n        default_factory=lambda: os.getenv(\n            \"COHERE_BASE_URL\", \"https://api.cohere.ai/v1\"\n        ),\n        description=\"The base URL to use for the Cohere API requests.\",\n    )\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(_COHERE_API_KEY_ENV_VAR_NAME),\n        description=\"The API key to authenticate the requests to the Cohere API.\",\n    )\n    timeout: RuntimeParameter[int] = Field(\n        default=120,\n        description=\"The maximum time in seconds to wait for a response from the API.\",\n    )\n    client_name: RuntimeParameter[str] = Field(\n        default=\"distilabel\",\n        description=\"The name of the client to use for the API requests.\",\n    )\n    structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n        Field(\n            default=None,\n            
description=\"The structured output format to use across all the generations.\",\n        )\n    )\n\n    _num_generations_param_supported = False\n\n    _ChatMessage: Type[\"ChatMessage\"] = PrivateAttr(...)\n    _aclient: \"AsyncClient\" = PrivateAttr(...)\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    def load(self) -> None:\n        \"\"\"Loads the `AsyncClient` client from the `cohere` package.\"\"\"\n\n        super().load()\n\n        try:\n            from cohere import AsyncClient, ChatMessage\n        except ImportError as ie:\n            raise ImportError(\n                \"The `cohere` package is required to use the `CohereLLM` class.\"\n            ) from ie\n\n        self._ChatMessage = ChatMessage\n\n        self._aclient = AsyncClient(\n            api_key=self.api_key.get_secret_value(),  # type: ignore\n            client_name=self.client_name,\n            base_url=self.base_url,\n            timeout=self.timeout,\n        )\n\n        if self.structured_output:\n            result = self._prepare_structured_output(\n                structured_output=self.structured_output,\n                client=self._aclient,\n                framework=\"cohere\",\n            )\n            self._aclient = result.get(\"client\")  # type: ignore\n            if structured_output := result.get(\"structured_output\"):\n                self.structured_output = structured_output\n\n    def _format_chat_to_cohere(\n        self, input: \"FormattedInput\"\n    ) -> Tuple[Union[str, None], List[\"ChatMessage\"], str]:\n        \"\"\"Formats the chat input to the Cohere Chat API conversational format.\n\n        Args:\n            input: The chat input to format.\n\n        Returns:\n            A tuple containing the system, chat history, and message.\n        \"\"\"\n        system = None\n        message = None\n        chat_history = []\n        for item in 
input:\n            role = item[\"role\"]\n            content = item[\"content\"]\n            if role == \"system\":\n                system = content\n            elif role == \"user\":\n                message = content\n            elif role == \"assistant\":\n                if message is None:\n                    raise ValueError(\n                        \"An assistant message but be preceded by a user message.\"\n                    )\n                chat_history.append(self._ChatMessage(role=\"USER\", message=message))  # type: ignore\n                chat_history.append(self._ChatMessage(role=\"CHATBOT\", message=content))  # type: ignore\n                message = None\n\n        if message is None:\n            raise ValueError(\"The chat input must end with a user message.\")\n\n        return system, chat_history, message\n\n    @validate_call\n    async def agenerate(  # type: ignore\n        self,\n        input: FormattedInput,\n        temperature: Optional[float] = None,\n        max_tokens: Optional[int] = None,\n        k: Optional[int] = None,\n        p: Optional[float] = None,\n        seed: Optional[float] = None,\n        stop_sequences: Optional[Sequence[str]] = None,\n        frequency_penalty: Optional[float] = None,\n        presence_penalty: Optional[float] = None,\n        raw_prompting: Optional[bool] = None,\n    ) -> GenerateOutput:\n        \"\"\"Generates a response from the LLM given an input.\n\n        Args:\n            input: a single input in chat format to generate responses for.\n            temperature: the temperature to use for the generation. Defaults to `None`.\n            max_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `None`.\n            k: the number of highest probability vocabulary tokens to keep for the generation.\n                Defaults to `None`.\n            p: the nucleus sampling probability to use for the generation. 
Defaults to\n                `None`.\n            seed: the seed to use for the generation. Defaults to `None`.\n            stop_sequences: a list of sequences to use as stopping criteria for the generation.\n                Defaults to `None`.\n            frequency_penalty: the frequency penalty to use for the generation. Defaults\n                to `None`.\n            presence_penalty: the presence penalty to use for the generation. Defaults to\n                `None`.\n            raw_prompting: a flag to use raw prompting for the generation. Defaults to\n                `None`.\n\n        Returns:\n            The generated response from the Cohere API model.\n        \"\"\"\n        structured_output = None\n        if isinstance(input, tuple):\n            input, structured_output = input\n            result = self._prepare_structured_output(\n                structured_output=structured_output,  # type: ignore\n                client=self._aclient,\n                framework=\"cohere\",\n            )\n            self._aclient = result.get(\"client\")  # type: ignore\n\n        if structured_output is None and self.structured_output is not None:\n            structured_output = self.structured_output\n\n        system, chat_history, message = self._format_chat_to_cohere(input)\n\n        kwargs = {\n            \"message\": message,\n            \"model\": self.model,\n            \"preamble\": system,\n            \"chat_history\": chat_history,\n            \"temperature\": temperature,\n            \"max_tokens\": max_tokens,\n            \"k\": k,\n            \"p\": p,\n            \"seed\": seed,\n            \"stop_sequences\": stop_sequences,\n            \"frequency_penalty\": frequency_penalty,\n            \"presence_penalty\": presence_penalty,\n            \"raw_prompting\": raw_prompting,\n        }\n        if structured_output:\n            kwargs = self._prepare_kwargs(kwargs, structured_output)  # type: ignore\n\n        response = 
await self._aclient.chat(**kwargs)  # type: ignore\n\n        if structured_output:\n            return [response.model_dump_json()]\n\n        if (text := response.text) == \"\":\n            self._logger.warning(  # type: ignore\n                f\"Received no response using Cohere client (model: '{self.model}').\"\n                f\" Finish reason was: {response.finish_reason}\"\n            )\n            return [None]\n\n        return [text]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM.load","title":"load()","text":"

Loads the AsyncClient client from the cohere package.

Source code in src/distilabel/models/llms/cohere.py
def load(self) -> None:\n    \"\"\"Loads the `AsyncClient` client from the `cohere` package.\"\"\"\n\n    super().load()\n\n    try:\n        from cohere import AsyncClient, ChatMessage\n    except ImportError as ie:\n        raise ImportError(\n            \"The `cohere` package is required to use the `CohereLLM` class.\"\n        ) from ie\n\n    self._ChatMessage = ChatMessage\n\n    self._aclient = AsyncClient(\n        api_key=self.api_key.get_secret_value(),  # type: ignore\n        client_name=self.client_name,\n        base_url=self.base_url,\n        timeout=self.timeout,\n    )\n\n    if self.structured_output:\n        result = self._prepare_structured_output(\n            structured_output=self.structured_output,\n            client=self._aclient,\n            framework=\"cohere\",\n        )\n        self._aclient = result.get(\"client\")  # type: ignore\n        if structured_output := result.get(\"structured_output\"):\n            self.structured_output = structured_output\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM._format_chat_to_cohere","title":"_format_chat_to_cohere(input)","text":"

Formats the chat input to the Cohere Chat API conversational format.

Parameters:

Name Type Description Default input FormattedInput

The chat input to format.

required

Returns:

Type Description Tuple[Union[str, None], List[ChatMessage], str]

A tuple containing the system, chat history, and message.

Source code in src/distilabel/models/llms/cohere.py
def _format_chat_to_cohere(\n    self, input: \"FormattedInput\"\n) -> Tuple[Union[str, None], List[\"ChatMessage\"], str]:\n    \"\"\"Formats the chat input to the Cohere Chat API conversational format.\n\n    Args:\n        input: The chat input to format.\n\n    Returns:\n        A tuple containing the system, chat history, and message.\n    \"\"\"\n    system = None\n    message = None\n    chat_history = []\n    for item in input:\n        role = item[\"role\"]\n        content = item[\"content\"]\n        if role == \"system\":\n            system = content\n        elif role == \"user\":\n            message = content\n        elif role == \"assistant\":\n            if message is None:\n                raise ValueError(\n                    \"An assistant message but be preceded by a user message.\"\n                )\n            chat_history.append(self._ChatMessage(role=\"USER\", message=message))  # type: ignore\n            chat_history.append(self._ChatMessage(role=\"CHATBOT\", message=content))  # type: ignore\n            message = None\n\n    if message is None:\n        raise ValueError(\"The chat input must end with a user message.\")\n\n    return system, chat_history, message\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CohereLLM.agenerate","title":"agenerate(input, temperature=None, max_tokens=None, k=None, p=None, seed=None, stop_sequences=None, frequency_penalty=None, presence_penalty=None, raw_prompting=None) async","text":"

Generates a response from the LLM given an input.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate responses for.

required temperature Optional[float]

the temperature to use for the generation. Defaults to None.

None max_tokens Optional[int]

the maximum number of new tokens that the model will generate. Defaults to None.

None k Optional[int]

the number of highest probability vocabulary tokens to keep for the generation. Defaults to None.

None p Optional[float]

the nucleus sampling probability to use for the generation. Defaults to None.

None seed Optional[float]

the seed to use for the generation. Defaults to None.

None stop_sequences Optional[Sequence[str]]

a list of sequences to use as stopping criteria for the generation. Defaults to None.

None frequency_penalty Optional[float]

the frequency penalty to use for the generation. Defaults to None.

None presence_penalty Optional[float]

the presence penalty to use for the generation. Defaults to None.

None raw_prompting Optional[bool]

a flag to use raw prompting for the generation. Defaults to None.

None

Returns:

Type Description GenerateOutput

The generated response from the Cohere API model.

Source code in src/distilabel/models/llms/cohere.py
@validate_call\nasync def agenerate(  # type: ignore\n    self,\n    input: FormattedInput,\n    temperature: Optional[float] = None,\n    max_tokens: Optional[int] = None,\n    k: Optional[int] = None,\n    p: Optional[float] = None,\n    seed: Optional[float] = None,\n    stop_sequences: Optional[Sequence[str]] = None,\n    frequency_penalty: Optional[float] = None,\n    presence_penalty: Optional[float] = None,\n    raw_prompting: Optional[bool] = None,\n) -> GenerateOutput:\n    \"\"\"Generates a response from the LLM given an input.\n\n    Args:\n        input: a single input in chat format to generate responses for.\n        temperature: the temperature to use for the generation. Defaults to `None`.\n        max_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `None`.\n        k: the number of highest probability vocabulary tokens to keep for the generation.\n            Defaults to `None`.\n        p: the nucleus sampling probability to use for the generation. Defaults to\n            `None`.\n        seed: the seed to use for the generation. Defaults to `None`.\n        stop_sequences: a list of sequences to use as stopping criteria for the generation.\n            Defaults to `None`.\n        frequency_penalty: the frequency penalty to use for the generation. Defaults\n            to `None`.\n        presence_penalty: the presence penalty to use for the generation. Defaults to\n            `None`.\n        raw_prompting: a flag to use raw prompting for the generation. 
Defaults to\n            `None`.\n\n    Returns:\n        The generated response from the Cohere API model.\n    \"\"\"\n    structured_output = None\n    if isinstance(input, tuple):\n        input, structured_output = input\n        result = self._prepare_structured_output(\n            structured_output=structured_output,  # type: ignore\n            client=self._aclient,\n            framework=\"cohere\",\n        )\n        self._aclient = result.get(\"client\")  # type: ignore\n\n    if structured_output is None and self.structured_output is not None:\n        structured_output = self.structured_output\n\n    system, chat_history, message = self._format_chat_to_cohere(input)\n\n    kwargs = {\n        \"message\": message,\n        \"model\": self.model,\n        \"preamble\": system,\n        \"chat_history\": chat_history,\n        \"temperature\": temperature,\n        \"max_tokens\": max_tokens,\n        \"k\": k,\n        \"p\": p,\n        \"seed\": seed,\n        \"stop_sequences\": stop_sequences,\n        \"frequency_penalty\": frequency_penalty,\n        \"presence_penalty\": presence_penalty,\n        \"raw_prompting\": raw_prompting,\n    }\n    if structured_output:\n        kwargs = self._prepare_kwargs(kwargs, structured_output)  # type: ignore\n\n    response = await self._aclient.chat(**kwargs)  # type: ignore\n\n    if structured_output:\n        return [response.model_dump_json()]\n\n    if (text := response.text) == \"\":\n        self._logger.warning(  # type: ignore\n            f\"Received no response using Cohere client (model: '{self.model}').\"\n            f\" Finish reason was: {response.finish_reason}\"\n        )\n        return [None]\n\n    return [text]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.GroqLLM","title":"GroqLLM","text":"

Bases: AsyncLLM

Groq API implementation using the async client for concurrent text generation.

Attributes:

Name Type Description model str

the name of the model from the Groq API to use for the generation.

base_url Optional[RuntimeParameter[str]]

the base URL to use for the Groq API requests. Defaults to \"https://api.groq.com\".

api_key Optional[RuntimeParameter[SecretStr]]

the API key to authenticate the requests to the Groq API. Defaults to the value of the GROQ_API_KEY environment variable.

max_retries RuntimeParameter[int]

the maximum number of times to retry the request to the API before failing. Defaults to 2.

timeout RuntimeParameter[int]

the maximum time in seconds to wait for a response from the API. Defaults to 120.

structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]]

a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

_api_key_env_var str

the name of the environment variable to use for the API key.

_aclient Optional[AsyncGroq]

the AsyncGroq client from the groq package.

Runtime parameters
  • base_url: the base URL to use for the Groq API requests. Defaults to \"https://api.groq.com\".
  • api_key: the API key to authenticate the requests to the Groq API. Defaults to the value of the GROQ_API_KEY environment variable.
  • max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 2.
  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.

Examples:

Generate text:

from distilabel.models.llms import GroqLLM\n\nllm = GroqLLM(model=\"llama3-70b-8192\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pydantic import BaseModel\nfrom distilabel.models.llms import GroqLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = GroqLLM(\n    model=\"llama3-70b-8192\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
Source code in src/distilabel/models/llms/groq.py
class GroqLLM(AsyncLLM):\n    \"\"\"Groq API implementation using the async client for concurrent text generation.\n\n    Attributes:\n        model: the name of the model from the Groq API to use for the generation.\n        base_url: the base URL to use for the Groq API requests. Defaults to\n            `\"https://api.groq.com\"`.\n        api_key: the API key to authenticate the requests to the Groq API. Defaults to\n            the value of the `GROQ_API_KEY` environment variable.\n        max_retries: the maximum number of times to retry the request to the API before\n            failing. Defaults to `2`.\n        timeout: the maximum time in seconds to wait for a response from the API. Defaults\n            to `120`.\n        structured_output: a dictionary containing the structured output configuration configuration\n            using `instructor`. You can take a look at the dictionary structure in\n            `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n        _api_key_env_var: the name of the environment variable to use for the API key.\n        _aclient: the `AsyncGroq` client from the `groq` package.\n\n    Runtime parameters:\n        - `base_url`: the base URL to use for the Groq API requests. Defaults to\n            `\"https://api.groq.com\"`.\n        - `api_key`: the API key to authenticate the requests to the Groq API. Defaults to\n            the value of the `GROQ_API_KEY` environment variable.\n        - `max_retries`: the maximum number of times to retry the request to the API before\n            failing. Defaults to `2`.\n        - `timeout`: the maximum time in seconds to wait for a response from the API. 
Defaults\n            to `120`.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import GroqLLM\n\n        llm = GroqLLM(model=\"llama3-70b-8192\")\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\n        Generate structured data:\n\n        ```python\n        from pydantic import BaseModel\n        from distilabel.models.llms import GroqLLM\n\n        class User(BaseModel):\n            name: str\n            last_name: str\n            id: int\n\n        llm = GroqLLM(\n            model=\"llama3-70b-8192\",\n            api_key=\"api.key\",\n            structured_output={\"schema\": User}\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n        ```\n    \"\"\"\n\n    model: str\n\n    base_url: Optional[RuntimeParameter[str]] = Field(\n        default_factory=lambda: os.getenv(\n            _GROQ_API_BASE_URL_ENV_VAR_NAME, \"https://api.groq.com\"\n        ),\n        description=\"The base URL to use for the Groq API requests.\",\n    )\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(_GROQ_API_KEY_ENV_VAR_NAME),\n        description=\"The API key to authenticate the requests to the Groq API.\",\n    )\n    max_retries: RuntimeParameter[int] = Field(\n        default=2,\n        description=\"The maximum number of times to retry the request to the API before\"\n        \" failing.\",\n    )\n    timeout: RuntimeParameter[int] = Field(\n        default=120,\n        description=\"The maximum time in seconds to wait for a response from the API.\",\n    )\n    structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n        Field(\n            default=None,\n            description=\"The structured output 
format to use across all the generations.\",\n        )\n    )\n\n    _num_generations_param_supported = False\n\n    _api_key_env_var: str = PrivateAttr(_GROQ_API_KEY_ENV_VAR_NAME)\n    _aclient: Optional[\"AsyncGroq\"] = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"Loads the `AsyncGroq` client to benefit from async requests.\"\"\"\n        super().load()\n\n        try:\n            from groq import AsyncGroq\n        except ImportError as ie:\n            raise ImportError(\n                \"Groq Python client is not installed. Please install it using\"\n                ' `pip install groq` or from the extras as `pip install \"distilabel[groq]\"`.'\n            ) from ie\n\n        if self.api_key is None:\n            raise ValueError(\n                f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n                f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n            )\n\n        self._aclient = AsyncGroq(\n            base_url=self.base_url,\n            api_key=self.api_key.get_secret_value(),\n            max_retries=self.max_retries,  # type: ignore\n            timeout=self.timeout,\n        )\n\n        if self.structured_output:\n            result = self._prepare_structured_output(\n                structured_output=self.structured_output,\n                client=self._aclient,\n                framework=\"groq\",\n            )\n            self._aclient = result.get(\"client\")  # type: ignore\n            if structured_output := result.get(\"structured_output\"):\n                self.structured_output = structured_output\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    @validate_call\n    async def agenerate(  # type: ignore\n        self,\n        input: FormattedInput,\n        seed: Optional[int] = None,\n        max_new_tokens: int = 128,\n 
       temperature: float = 1.0,\n        top_p: float = 1.0,\n        stop: Optional[str] = None,\n    ) -> \"GenerateOutput\":\n        \"\"\"Generates `num_generations` responses for the given input using the Groq async\n        client.\n\n        Args:\n            input: a single input in chat format to generate responses for.\n            seed: the seed to use for the generation. Defaults to `None`.\n            max_new_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `128`.\n            temperature: the temperature to use for the generation. Defaults to `0.1`.\n            top_p: the top-p value to use for the generation. Defaults to `1.0`.\n            stop: the stop sequence to use for the generation. Defaults to `None`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n\n        References:\n            - https://console.groq.com/docs/text-chat\n        \"\"\"\n        structured_output = None\n        if isinstance(input, tuple):\n            input, structured_output = input\n            result = self._prepare_structured_output(\n                structured_output=structured_output,\n                client=self._aclient,\n                framework=\"groq\",\n            )\n            self._aclient = result.get(\"client\")\n\n        if structured_output is None and self.structured_output is not None:\n            structured_output = self.structured_output\n\n        kwargs = {\n            \"messages\": input,  # type: ignore\n            \"model\": self.model,\n            \"seed\": seed,\n            \"temperature\": temperature,\n            \"max_tokens\": max_new_tokens,\n            \"top_p\": top_p,\n            \"stream\": False,\n            \"stop\": stop,\n        }\n        if structured_output:\n            kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n        generations = []\n        completion = await 
self._aclient.chat.completions.create(**kwargs)  # type: ignore\n        if structured_output:\n            generations.append(completion.model_dump_json())\n            return generations\n\n        for choice in completion.choices:\n            if (content := choice.message.content) is None:\n                self._logger.warning(  # type: ignore\n                    f\"Received no response using the Groq client (model: '{self.model}').\"\n                    f\" Finish reason was: {choice.finish_reason}\"\n                )\n            generations.append(content)\n        return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.GroqLLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.GroqLLM.load","title":"load()","text":"

Loads the AsyncGroq client to benefit from async requests.

Source code in src/distilabel/models/llms/groq.py
def load(self) -> None:\n    \"\"\"Loads the `AsyncGroq` client to benefit from async requests.\"\"\"\n    super().load()\n\n    try:\n        from groq import AsyncGroq\n    except ImportError as ie:\n        raise ImportError(\n            \"Groq Python client is not installed. Please install it using\"\n            ' `pip install groq` or from the extras as `pip install \"distilabel[groq]\"`.'\n        ) from ie\n\n    if self.api_key is None:\n        raise ValueError(\n            f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n            f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n        )\n\n    self._aclient = AsyncGroq(\n        base_url=self.base_url,\n        api_key=self.api_key.get_secret_value(),\n        max_retries=self.max_retries,  # type: ignore\n        timeout=self.timeout,\n    )\n\n    if self.structured_output:\n        result = self._prepare_structured_output(\n            structured_output=self.structured_output,\n            client=self._aclient,\n            framework=\"groq\",\n        )\n        self._aclient = result.get(\"client\")  # type: ignore\n        if structured_output := result.get(\"structured_output\"):\n            self.structured_output = structured_output\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.GroqLLM.agenerate","title":"agenerate(input, seed=None, max_new_tokens=128, temperature=1.0, top_p=1.0, stop=None) async","text":"

Generates responses for the given input using the Groq async client.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate responses for.

required seed Optional[int]

the seed to use for the generation. Defaults to None.

None max_new_tokens int

the maximum number of new tokens that the model will generate. Defaults to 128.

128 temperature float

the temperature to use for the generation. Defaults to 1.0.

1.0 top_p float

the top-p value to use for the generation. Defaults to 1.0.

1.0 stop Optional[str]

the stop sequence to use for the generation. Defaults to None.

None

Returns:

Type Description GenerateOutput

A list containing the generated responses for the given input.

References
  • https://console.groq.com/docs/text-chat
Source code in src/distilabel/models/llms/groq.py
@validate_call\nasync def agenerate(  # type: ignore\n    self,\n    input: FormattedInput,\n    seed: Optional[int] = None,\n    max_new_tokens: int = 128,\n    temperature: float = 1.0,\n    top_p: float = 1.0,\n    stop: Optional[str] = None,\n) -> \"GenerateOutput\":\n    \"\"\"Generates `num_generations` responses for the given input using the Groq async\n    client.\n\n    Args:\n        input: a single input in chat format to generate responses for.\n        seed: the seed to use for the generation. Defaults to `None`.\n        max_new_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `128`.\n        temperature: the temperature to use for the generation. Defaults to `0.1`.\n        top_p: the top-p value to use for the generation. Defaults to `1.0`.\n        stop: the stop sequence to use for the generation. Defaults to `None`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n\n    References:\n        - https://console.groq.com/docs/text-chat\n    \"\"\"\n    structured_output = None\n    if isinstance(input, tuple):\n        input, structured_output = input\n        result = self._prepare_structured_output(\n            structured_output=structured_output,\n            client=self._aclient,\n            framework=\"groq\",\n        )\n        self._aclient = result.get(\"client\")\n\n    if structured_output is None and self.structured_output is not None:\n        structured_output = self.structured_output\n\n    kwargs = {\n        \"messages\": input,  # type: ignore\n        \"model\": self.model,\n        \"seed\": seed,\n        \"temperature\": temperature,\n        \"max_tokens\": max_new_tokens,\n        \"top_p\": top_p,\n        \"stream\": False,\n        \"stop\": stop,\n    }\n    if structured_output:\n        kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n    generations = []\n    completion = await 
self._aclient.chat.completions.create(**kwargs)  # type: ignore\n    if structured_output:\n        generations.append(completion.model_dump_json())\n        return generations\n\n    for choice in completion.choices:\n        if (content := choice.message.content) is None:\n            self._logger.warning(  # type: ignore\n                f\"Received no response using the Groq client (model: '{self.model}').\"\n                f\" Finish reason was: {choice.finish_reason}\"\n            )\n        generations.append(content)\n    return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM","title":"InferenceEndpointsLLM","text":"

Bases: AsyncLLM, MagpieChatTemplateMixin

InferenceEndpoints LLM implementation running the async API client.

This LLM will internally use huggingface_hub.AsyncInferenceClient.

Attributes:

Name Type Description model_id Optional[str]

the model ID to use for the LLM as available in the Hugging Face Hub, which will be used to resolve the base URL for the serverless Inference Endpoints API requests. Defaults to None.

endpoint_name Optional[RuntimeParameter[str]]

the name of the Inference Endpoint to use for the LLM. Defaults to None.

endpoint_namespace Optional[RuntimeParameter[str]]

the namespace of the Inference Endpoint to use for the LLM. Defaults to None.

base_url Optional[RuntimeParameter[str]]

the base URL to use for the Inference Endpoints API requests.

api_key Optional[RuntimeParameter[SecretStr]]

the API key to authenticate the requests to the Inference Endpoints API.

tokenizer_id Optional[str]

the tokenizer ID to use for the LLM as available in the Hugging Face Hub. Defaults to None, but defining one is recommended to properly format the prompt.

model_display_name Optional[str]

the model display name to use for the LLM. Defaults to None.

use_magpie_template Optional[str]

a flag used to enable/disable applying the Magpie pre-query template. Defaults to False.

magpie_pre_query_template Optional[str]

the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None.

structured_output Optional[RuntimeParameter[StructuredOutputType]]

a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput. Defaults to None.

Icon

:hugging:

Examples:

Free serverless Inference API (set the input_batch_size of the Task that uses this LLM to avoid 'Model is overloaded' errors):

from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Dedicated Inference Endpoints:

from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    endpoint_name=\"<ENDPOINT_NAME>\",\n    api_key=\"<HF_API_KEY>\",\n    endpoint_namespace=\"<USER|ORG>\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Dedicated Inference Endpoints or TGI:

from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    api_key=\"<HF_API_KEY>\",\n    base_url=\"<BASE_URL>\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pydantic import BaseModel\nfrom distilabel.models.llms import InferenceEndpointsLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    api_key=\"api.key\",\n    structured_output={\"format\": \"json\", \"schema\": User.model_json_schema()}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the Tour De France\"}]])\n
Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py
class InferenceEndpointsLLM(AsyncLLM, MagpieChatTemplateMixin):\n    \"\"\"InferenceEndpoints LLM implementation running the async API client.\n\n    This LLM will internally use `huggingface_hub.AsyncInferenceClient`.\n\n    Attributes:\n        model_id: the model ID to use for the LLM as available in the Hugging Face Hub, which\n            will be used to resolve the base URL for the serverless Inference Endpoints API requests.\n            Defaults to `None`.\n        endpoint_name: the name of the Inference Endpoint to use for the LLM. Defaults to `None`.\n        endpoint_namespace: the namespace of the Inference Endpoint to use for the LLM. Defaults to `None`.\n        base_url: the base URL to use for the Inference Endpoints API requests.\n        api_key: the API key to authenticate the requests to the Inference Endpoints API.\n        tokenizer_id: the tokenizer ID to use for the LLM as available in the Hugging Face Hub.\n            Defaults to `None`, but defining one is recommended to properly format the prompt.\n        model_display_name: the model display name to use for the LLM. Defaults to `None`.\n        use_magpie_template: a flag used to enable/disable applying the Magpie pre-query\n            template. Defaults to `False`.\n        magpie_pre_query_template: the pre-query template to be applied to the prompt or\n            sent to the LLM to generate an instruction or a follow up user message. Valid\n            values are \"llama3\", \"qwen2\" or another pre-query template provided. 
Defaults\n            to `None`.\n        structured_output: a dictionary containing the structured output configuration or\n            if more fine-grained control is needed, an instance of `OutlinesStructuredOutput`.\n            Defaults to None.\n\n    Icon:\n        `:hugging:`\n\n    Examples:\n        Free serverless Inference API, set the input_batch_size of the Task that uses this to avoid Model is overloaded:\n\n        ```python\n        from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n        llm = InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Dedicated Inference Endpoints:\n\n        ```python\n        from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n        llm = InferenceEndpointsLLM(\n            endpoint_name=\"<ENDPOINT_NAME>\",\n            api_key=\"<HF_API_KEY>\",\n            endpoint_namespace=\"<USER|ORG>\",\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Dedicated Inference Endpoints or TGI:\n\n        ```python\n        from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n        llm = InferenceEndpointsLLM(\n            api_key=\"<HF_API_KEY>\",\n            base_url=\"<BASE_URL>\",\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Generate structured data:\n\n        ```python\n        from pydantic import BaseModel\n        from distilabel.models.llms import InferenceEndpointsLLM\n\n        class User(BaseModel):\n            name: str\n            last_name: str\n            id: int\n\n        llm = InferenceEndpointsLLM(\n            
model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n            api_key=\"api.key\",\n            structured_output={\"format\": \"json\", \"schema\": User.model_json_schema()}\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the Tour De France\"}]])\n        ```\n    \"\"\"\n\n    model_id: Optional[str] = None\n\n    endpoint_name: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The name of the Inference Endpoint to use for the LLM.\",\n    )\n    endpoint_namespace: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The namespace of the Inference Endpoint to use for the LLM.\",\n    )\n    base_url: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The base URL to use for the Inference Endpoints API requests.\",\n    )\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(HF_TOKEN_ENV_VAR),\n        description=\"The API key to authenticate the requests to the Inference Endpoints API.\",\n    )\n\n    tokenizer_id: Optional[str] = None\n    model_display_name: Optional[str] = None\n\n    structured_output: Optional[RuntimeParameter[StructuredOutputType]] = Field(\n        default=None,\n        description=\"The structured output format to use across all the generations.\",\n    )\n\n    _num_generations_param_supported = False\n\n    _model_name: Optional[str] = PrivateAttr(default=None)\n    _tokenizer: Optional[\"PreTrainedTokenizer\"] = PrivateAttr(default=None)\n    _api_key_env_var: str = PrivateAttr(HF_TOKEN_ENV_VAR)\n    _aclient: Optional[\"AsyncInferenceClient\"] = PrivateAttr(...)\n\n    @model_validator(mode=\"after\")  # type: ignore\n    def only_one_of_model_id_endpoint_name_or_base_url_provided(\n        self,\n    ) -> 
\"InferenceEndpointsLLM\":\n        \"\"\"Validates that only one of `model_id` or `endpoint_name` is provided; and if `base_url` is also\n        provided, a warning will be shown informing the user that the provided `base_url` will be ignored in\n        favour of the dynamically calculated one..\"\"\"\n\n        if self.base_url and (self.model_id or self.endpoint_name):\n            self._logger.warning(  # type: ignore\n                f\"Since the `base_url={self.base_url}` is available and either one of `model_id`\"\n                \" or `endpoint_name` is also provided, the `base_url` will either be ignored\"\n                \" or overwritten with the one generated from either of those args, for serverless\"\n                \" or dedicated inference endpoints, respectively.\"\n            )\n\n        if self.use_magpie_template and self.tokenizer_id is None:\n            raise ValueError(\n                \"`use_magpie_template` cannot be `True` if `tokenizer_id` is `None`. Please,\"\n                \" set a `tokenizer_id` and try again.\"\n            )\n\n        if (\n            self.model_id\n            and self.tokenizer_id is None\n            and self.structured_output is not None\n        ):\n            self.tokenizer_id = self.model_id\n\n        if self.base_url and not (self.model_id or self.endpoint_name):\n            return self\n\n        if self.model_id and not self.endpoint_name:\n            return self\n\n        if self.endpoint_name and not self.model_id:\n            return self\n\n        raise ValidationError(\n            f\"Only one of `model_id` or `endpoint_name` must be provided. If `base_url` is\"\n            f\" provided too, it will be overwritten instead. 
Found `model_id`={self.model_id},\"\n            f\" `endpoint_name`={self.endpoint_name}, and `base_url`={self.base_url}.\"\n        )\n\n    def load(self) -> None:  # noqa: C901\n        \"\"\"Loads the `AsyncInferenceClient` client to connect to the Hugging Face Inference\n        Endpoint.\n\n        Raises:\n            ImportError: if the `huggingface-hub` Python client is not installed.\n            ValueError: if the model is not currently deployed or is not running the TGI framework.\n            ImportError: if the `transformers` Python client is not installed.\n        \"\"\"\n        super().load()\n\n        try:\n            from huggingface_hub import (\n                AsyncInferenceClient,\n                InferenceClient,\n                get_inference_endpoint,\n            )\n        except ImportError as ie:\n            raise ImportError(\n                \"Hugging Face Hub Python client is not installed. Please install it using\"\n                \" `pip install huggingface-hub`.\"\n            ) from ie\n\n        if self.api_key is None:\n            self.api_key = SecretStr(get_hf_token(self.__class__.__name__, \"api_key\"))\n\n        if self.model_id is not None:\n            client = InferenceClient(\n                model=self.model_id, token=self.api_key.get_secret_value()\n            )\n            status = client.get_model_status()\n\n            if (\n                status.state not in {\"Loadable\", \"Loaded\"}\n                and status.framework != \"text-generation-inference\"\n            ):\n                raise ValueError(\n                    f\"Model {self.model_id} is not currently deployed or is not running the TGI framework\"\n                )\n\n            self.base_url = client._resolve_url(\n                model=self.model_id, task=\"text-generation\"\n            )\n\n        if self.endpoint_name is not None:\n            client = get_inference_endpoint(\n                name=self.endpoint_name,\n           
     namespace=self.endpoint_namespace,\n                token=self.api_key.get_secret_value(),\n            )\n            if client.status in [\"paused\", \"scaledToZero\"]:\n                client.resume().wait(timeout=300)\n            elif client.status == \"initializing\":\n                client.wait(timeout=300)\n\n            self.base_url = client.url\n            self._model_name = client.repository\n\n        self._aclient = AsyncInferenceClient(\n            base_url=self.base_url,\n            token=self.api_key.get_secret_value(),\n        )\n\n        if self.tokenizer_id:\n            try:\n                from transformers import AutoTokenizer\n            except ImportError as ie:\n                raise ImportError(\n                    \"Transformers Python client is not installed. Please install it using\"\n                    \" `pip install transformers`.\"\n                ) from ie\n\n            self._tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_id)\n\n    @property\n    @override\n    def model_name(self) -> Union[str, None]:  # type: ignore\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return (\n            self.model_display_name\n            or self._model_name\n            or self.model_id\n            or self.endpoint_name\n            or self.base_url\n        )\n\n    def prepare_input(self, input: \"StandardInput\") -> str:\n        \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n        input.\n\n        Args:\n            input: the input list containing chat items.\n\n        Returns:\n            The prompt to send to the LLM.\n        \"\"\"\n        prompt: str = (\n            self._tokenizer.apply_chat_template(  # type: ignore\n                conversation=input,  # type: ignore\n                tokenize=False,\n                add_generation_prompt=True,\n            )\n            if input\n            else \"\"\n        )\n        return 
super().apply_magpie_pre_query_template(prompt, input)\n\n    def _get_structured_output(\n        self, input: FormattedInput\n    ) -> Union[Dict[str, Any], None]:\n        \"\"\"Gets the structured output (if any) for the given input.\n\n        Args:\n            input: a single input in chat format to generate responses for.\n\n        Returns:\n            The structured output that will be passed as `grammer` to the inference endpoint\n            or `None` if not required.\n        \"\"\"\n        structured_output = None\n\n        # Specific structured output per input\n        if isinstance(input, tuple):\n            input, structured_output = input\n            structured_output = {\n                \"type\": structured_output[\"format\"],  # type: ignore\n                \"value\": structured_output[\"schema\"],  # type: ignore\n            }\n\n        # Same structured output for all the inputs\n        if structured_output is None and self.structured_output is not None:\n            try:\n                structured_output = {\n                    \"type\": self.structured_output[\"format\"],  # type: ignore\n                    \"value\": self.structured_output[\"schema\"],  # type: ignore\n                }\n            except KeyError as e:\n                raise ValueError(\n                    \"To use the structured output you have to inform the `format` and `schema` in \"\n                    \"the `structured_output` attribute.\"\n                ) from e\n\n        if structured_output:\n            if isinstance(structured_output[\"value\"], ModelMetaclass):\n                structured_output[\"value\"] = structured_output[\n                    \"value\"\n                ].model_json_schema()\n\n        return structured_output\n\n    async def _generate_with_text_generation(\n        self,\n        input: FormattedInput,\n        max_new_tokens: int = 128,\n        repetition_penalty: Optional[float] = None,\n        frequency_penalty: 
Optional[float] = None,\n        temperature: float = 1.0,\n        do_sample: bool = False,\n        top_k: Optional[int] = None,\n        top_p: Optional[float] = None,\n        typical_p: Optional[float] = None,\n        stop_sequences: Union[List[str], None] = None,\n        return_full_text: bool = False,\n        seed: Optional[int] = None,\n        watermark: bool = False,\n    ) -> Union[str, None]:\n        structured_output = self._get_structured_output(input)\n\n        completion = None\n        try:\n            completion = await self._aclient.text_generation(  # type: ignore\n                prompt=self.prepare_input(input),  # type: ignore\n                max_new_tokens=max_new_tokens,\n                do_sample=do_sample,\n                typical_p=typical_p,\n                repetition_penalty=repetition_penalty,\n                frequency_penalty=frequency_penalty,\n                temperature=temperature,\n                top_p=top_p,\n                top_k=top_k,\n                stop_sequences=stop_sequences,\n                return_full_text=return_full_text,\n                # NOTE: here to ensure that the cache is not used and a different response is\n                # generated every time\n                seed=seed or random.randint(0, sys.maxsize),\n                watermark=watermark,\n                grammar=structured_output,  # type: ignore\n            )\n        except Exception as e:\n            self._logger.warning(  # type: ignore\n                f\"\u26a0\ufe0f Received no response using Inference Client (model: '{self.model_name}').\"\n                f\" Finish reason was: {e}\"\n            )\n        return completion\n\n    async def _generate_with_chat_completion(\n        self,\n        input: \"StandardInput\",\n        max_new_tokens: int = 128,\n        frequency_penalty: Optional[float] = None,\n        logit_bias: Optional[List[float]] = None,\n        presence_penalty: Optional[float] = None,\n        seed: 
Optional[int] = None,\n        stop_sequences: Optional[List[str]] = None,\n        temperature: float = 1.0,\n        tool_choice: Optional[Union[Dict[str, str], Literal[\"auto\"]]] = None,\n        tool_prompt: Optional[str] = None,\n        tools: Optional[List[Dict[str, Any]]] = None,\n        top_p: Optional[float] = None,\n    ) -> Union[str, None]:\n        message = None\n        try:\n            completion = await self._aclient.chat_completion(  # type: ignore\n                messages=input,  # type: ignore\n                max_tokens=max_new_tokens,\n                frequency_penalty=frequency_penalty,\n                logit_bias=logit_bias,\n                presence_penalty=presence_penalty,\n                # NOTE: here to ensure that the cache is not used and a different response is\n                # generated every time\n                seed=seed or random.randint(0, sys.maxsize),\n                stop=stop_sequences,\n                temperature=temperature,\n                tool_choice=tool_choice,  # type: ignore\n                tool_prompt=tool_prompt,\n                tools=tools,  # type: ignore\n                top_p=top_p,\n            )\n            choice = completion.choices[0]\n            if (message := choice.message.content) is None:\n                self._logger.warning(  # type: ignore\n                    f\"\u26a0\ufe0f Received no response using Inference Client (model: '{self.model_name}').\"\n                    f\" Finish reason was: {choice.finish_reason}\"\n                )\n        except Exception as e:\n            self._logger.warning(  # type: ignore\n                f\"\u26a0\ufe0f Received no response using Inference Client (model: '{self.model_name}').\"\n                f\" Finish reason was: {e}\"\n            )\n        return message\n\n    def _check_stop_sequences(\n        self,\n        stop_sequences: Optional[Union[str, List[str]]] = None,\n    ) -> Union[List[str], None]:\n        \"\"\"Checks that no 
more than 4 stop sequences are provided.\n\n        Args:\n            stop_sequences: the stop sequences to be checked.\n\n        Returns:\n            The stop sequences.\n        \"\"\"\n        if stop_sequences is not None:\n            if isinstance(stop_sequences, str):\n                stop_sequences = [stop_sequences]\n            if len(stop_sequences) > 4:\n                warnings.warn(\n                    \"Only up to 4 stop sequences are allowed, so keeping the first 4 items only.\",\n                    UserWarning,\n                    stacklevel=2,\n                )\n                stop_sequences = stop_sequences[:4]\n        return stop_sequences\n\n    @validate_call\n    async def agenerate(  # type: ignore\n        self,\n        input: FormattedInput,\n        max_new_tokens: int = 128,\n        frequency_penalty: Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] = None,\n        logit_bias: Optional[List[float]] = None,\n        presence_penalty: Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] = None,\n        seed: Optional[int] = None,\n        stop_sequences: Optional[List[str]] = None,\n        temperature: float = 1.0,\n        tool_choice: Optional[Union[Dict[str, str], Literal[\"auto\"]]] = None,\n        tool_prompt: Optional[str] = None,\n        tools: Optional[List[Dict[str, Any]]] = None,\n        top_p: Optional[float] = None,\n        do_sample: bool = False,\n        repetition_penalty: Optional[float] = None,\n        return_full_text: bool = False,\n        top_k: Optional[int] = None,\n        typical_p: Optional[float] = None,\n        watermark: bool = False,\n    ) -> GenerateOutput:\n        \"\"\"Generates completions for the given input using the async client. 
This method\n        uses two methods of the `huggingface_hub.AsyncClient`: `chat_completion` and `text_generation`.\n        `chat_completion` method will be used only if no `tokenizer_id` has been specified.\n        Some arguments of this function are specific to the `text_generation` method, while\n        some others are specific to the `chat_completion` method.\n\n        Args:\n            input: a single input in chat format to generate responses for.\n            max_new_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `128`.\n            frequency_penalty: a value between `-2.0` and `2.0`. Positive values penalize\n                new tokens based on their existing frequency in the text so far, decreasing\n                model's likelihood to repeat the same line verbatim. Defauls to `None`.\n            logit_bias: modify the likelihood of specified tokens appearing in the completion.\n                This argument is exclusive to the `chat_completion` method and will be used\n                only if `tokenizer_id` is `None`.\n                Defaults to `None`.\n            presence_penalty: a value between `-2.0` and `2.0`. Positive values penalize\n                new tokens based on whether they appear in the text so far, increasing the\n                model likelihood to talk about new topics. This argument is exclusive to\n                the `chat_completion` method and will be used only if `tokenizer_id` is\n                `None`. Defauls to `None`.\n            seed: the seed to use for the generation. Defaults to `None`.\n            stop_sequences: either a single string or a list of strings containing the sequences\n                to stop the generation at. Defaults to `None`, but will be set to the\n                `tokenizer.eos_token` if available.\n            temperature: the temperature to use for the generation. 
Defaults to `1.0`.\n            tool_choice: the name of the tool the model should call. It can be a dictionary\n                like `{\"function_name\": \"my_tool\"}` or \"auto\". If not provided, then the\n                model won't use any tool. This argument is exclusive to the `chat_completion`\n                method and will be used only if `tokenizer_id` is `None`. Defaults to `None`.\n            tool_prompt: A prompt to be appended before the tools. This argument is exclusive\n                to the `chat_completion` method and will be used only if `tokenizer_id`\n                is `None`. Defauls to `None`.\n            tools: a list of tools definitions that the LLM can use.\n                This argument is exclusive to the `chat_completion` method and will be used\n                only if `tokenizer_id` is `None`. Defaults to `None`.\n            top_p: the top-p value to use for the generation. Defaults to `1.0`.\n            do_sample: whether to use sampling for the generation. This argument is exclusive\n                of the `text_generation` method and will be only used if `tokenizer_id` is not\n                `None`. Defaults to `False`.\n            repetition_penalty: the repetition penalty to use for the generation. This argument\n                is exclusive of the `text_generation` method and will be only used if `tokenizer_id`\n                is not `None`. Defaults to `None`.\n            return_full_text: whether to return the full text of the completion or just\n                the generated text. Defaults to `False`, meaning that only the generated\n                text will be returned. This argument is exclusive of the `text_generation`\n                method and will be only used if `tokenizer_id` is not `None`.\n            top_k: the top-k value to use for the generation. This argument is exclusive\n                of the `text_generation` method and will be only used if `tokenizer_id`\n                is not `None`. 
Defaults to `0.8`, since neither `0.0` nor `1.0` are valid\n                values in TGI.\n            typical_p: the typical-p value to use for the generation. This argument is exclusive\n                of the `text_generation` method and will be only used if `tokenizer_id`\n                is not `None`. Defaults to `None`.\n            watermark: whether to add the watermark to the generated text. This argument\n                is exclusive of the `text_generation` method and will be only used if `tokenizer_id`\n                is not `None`. Defaults to `None`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n        stop_sequences = self._check_stop_sequences(stop_sequences)\n\n        if self.tokenizer_id is None:\n            return [\n                await self._generate_with_chat_completion(\n                    input=input,  # type: ignore\n                    max_new_tokens=max_new_tokens,\n                    frequency_penalty=frequency_penalty,\n                    logit_bias=logit_bias,\n                    presence_penalty=presence_penalty,\n                    seed=seed,\n                    stop_sequences=stop_sequences,\n                    temperature=temperature,\n                    tool_choice=tool_choice,\n                    tool_prompt=tool_prompt,\n                    tools=tools,\n                    top_p=top_p,\n                )\n            ]\n\n        return [\n            await self._generate_with_text_generation(\n                input=input,\n                max_new_tokens=max_new_tokens,\n                do_sample=do_sample,\n                typical_p=typical_p,\n                repetition_penalty=repetition_penalty,\n                frequency_penalty=frequency_penalty,\n                temperature=temperature,\n                top_p=top_p,\n                top_k=top_k,\n                stop_sequences=stop_sequences,\n                
return_full_text=return_full_text,\n                seed=seed,\n                watermark=watermark,\n            )\n        ]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.model_name","title":"model_name: Union[str, None] property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.only_one_of_model_id_endpoint_name_or_base_url_provided","title":"only_one_of_model_id_endpoint_name_or_base_url_provided()","text":"

Validates that only one of model_id or endpoint_name is provided; and if base_url is also provided, a warning will be shown informing the user that the provided base_url will be ignored in favour of the dynamically calculated one.

Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py
@model_validator(mode=\"after\")  # type: ignore\ndef only_one_of_model_id_endpoint_name_or_base_url_provided(\n    self,\n) -> \"InferenceEndpointsLLM\":\n    \"\"\"Validates that only one of `model_id` or `endpoint_name` is provided; and if `base_url` is also\n    provided, a warning will be shown informing the user that the provided `base_url` will be ignored in\n    favour of the dynamically calculated one..\"\"\"\n\n    if self.base_url and (self.model_id or self.endpoint_name):\n        self._logger.warning(  # type: ignore\n            f\"Since the `base_url={self.base_url}` is available and either one of `model_id`\"\n            \" or `endpoint_name` is also provided, the `base_url` will either be ignored\"\n            \" or overwritten with the one generated from either of those args, for serverless\"\n            \" or dedicated inference endpoints, respectively.\"\n        )\n\n    if self.use_magpie_template and self.tokenizer_id is None:\n        raise ValueError(\n            \"`use_magpie_template` cannot be `True` if `tokenizer_id` is `None`. Please,\"\n            \" set a `tokenizer_id` and try again.\"\n        )\n\n    if (\n        self.model_id\n        and self.tokenizer_id is None\n        and self.structured_output is not None\n    ):\n        self.tokenizer_id = self.model_id\n\n    if self.base_url and not (self.model_id or self.endpoint_name):\n        return self\n\n    if self.model_id and not self.endpoint_name:\n        return self\n\n    if self.endpoint_name and not self.model_id:\n        return self\n\n    raise ValidationError(\n        f\"Only one of `model_id` or `endpoint_name` must be provided. If `base_url` is\"\n        f\" provided too, it will be overwritten instead. Found `model_id`={self.model_id},\"\n        f\" `endpoint_name`={self.endpoint_name}, and `base_url`={self.base_url}.\"\n    )\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.load","title":"load()","text":"

Loads the AsyncInferenceClient client to connect to the Hugging Face Inference Endpoint.

Raises:

Type Description ImportError

if the huggingface-hub Python client is not installed.

ValueError

if the model is not currently deployed or is not running the TGI framework.

ImportError

if the transformers Python client is not installed.

Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py
def load(self) -> None:  # noqa: C901\n    \"\"\"Loads the `AsyncInferenceClient` client to connect to the Hugging Face Inference\n    Endpoint.\n\n    Raises:\n        ImportError: if the `huggingface-hub` Python client is not installed.\n        ValueError: if the model is not currently deployed or is not running the TGI framework.\n        ImportError: if the `transformers` Python client is not installed.\n    \"\"\"\n    super().load()\n\n    try:\n        from huggingface_hub import (\n            AsyncInferenceClient,\n            InferenceClient,\n            get_inference_endpoint,\n        )\n    except ImportError as ie:\n        raise ImportError(\n            \"Hugging Face Hub Python client is not installed. Please install it using\"\n            \" `pip install huggingface-hub`.\"\n        ) from ie\n\n    if self.api_key is None:\n        self.api_key = SecretStr(get_hf_token(self.__class__.__name__, \"api_key\"))\n\n    if self.model_id is not None:\n        client = InferenceClient(\n            model=self.model_id, token=self.api_key.get_secret_value()\n        )\n        status = client.get_model_status()\n\n        if (\n            status.state not in {\"Loadable\", \"Loaded\"}\n            and status.framework != \"text-generation-inference\"\n        ):\n            raise ValueError(\n                f\"Model {self.model_id} is not currently deployed or is not running the TGI framework\"\n            )\n\n        self.base_url = client._resolve_url(\n            model=self.model_id, task=\"text-generation\"\n        )\n\n    if self.endpoint_name is not None:\n        client = get_inference_endpoint(\n            name=self.endpoint_name,\n            namespace=self.endpoint_namespace,\n            token=self.api_key.get_secret_value(),\n        )\n        if client.status in [\"paused\", \"scaledToZero\"]:\n            client.resume().wait(timeout=300)\n        elif client.status == \"initializing\":\n            client.wait(timeout=300)\n\n  
      self.base_url = client.url\n        self._model_name = client.repository\n\n    self._aclient = AsyncInferenceClient(\n        base_url=self.base_url,\n        token=self.api_key.get_secret_value(),\n    )\n\n    if self.tokenizer_id:\n        try:\n            from transformers import AutoTokenizer\n        except ImportError as ie:\n            raise ImportError(\n                \"Transformers Python client is not installed. Please install it using\"\n                \" `pip install transformers`.\"\n            ) from ie\n\n        self._tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_id)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.prepare_input","title":"prepare_input(input)","text":"

Prepares the input (applying the chat template and tokenization) for the provided input.

Parameters:

Name Type Description Default input StandardInput

the input list containing chat items.

required

Returns:

Type Description str

The prompt to send to the LLM.

Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py
def prepare_input(self, input: \"StandardInput\") -> str:\n    \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n    input.\n\n    Args:\n        input: the input list containing chat items.\n\n    Returns:\n        The prompt to send to the LLM.\n    \"\"\"\n    prompt: str = (\n        self._tokenizer.apply_chat_template(  # type: ignore\n            conversation=input,  # type: ignore\n            tokenize=False,\n            add_generation_prompt=True,\n        )\n        if input\n        else \"\"\n    )\n    return super().apply_magpie_pre_query_template(prompt, input)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM._get_structured_output","title":"_get_structured_output(input)","text":"

Gets the structured output (if any) for the given input.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate responses for.

required

Returns:

Type Description Union[Dict[str, Any], None]

The structured output that will be passed as grammar to the inference endpoint

Union[Dict[str, Any], None]

or None if not required.

Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py
def _get_structured_output(\n    self, input: FormattedInput\n) -> Union[Dict[str, Any], None]:\n    \"\"\"Gets the structured output (if any) for the given input.\n\n    Args:\n        input: a single input in chat format to generate responses for.\n\n    Returns:\n        The structured output that will be passed as `grammer` to the inference endpoint\n        or `None` if not required.\n    \"\"\"\n    structured_output = None\n\n    # Specific structured output per input\n    if isinstance(input, tuple):\n        input, structured_output = input\n        structured_output = {\n            \"type\": structured_output[\"format\"],  # type: ignore\n            \"value\": structured_output[\"schema\"],  # type: ignore\n        }\n\n    # Same structured output for all the inputs\n    if structured_output is None and self.structured_output is not None:\n        try:\n            structured_output = {\n                \"type\": self.structured_output[\"format\"],  # type: ignore\n                \"value\": self.structured_output[\"schema\"],  # type: ignore\n            }\n        except KeyError as e:\n            raise ValueError(\n                \"To use the structured output you have to inform the `format` and `schema` in \"\n                \"the `structured_output` attribute.\"\n            ) from e\n\n    if structured_output:\n        if isinstance(structured_output[\"value\"], ModelMetaclass):\n            structured_output[\"value\"] = structured_output[\n                \"value\"\n            ].model_json_schema()\n\n    return structured_output\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM._check_stop_sequences","title":"_check_stop_sequences(stop_sequences=None)","text":"

Checks that no more than 4 stop sequences are provided.

Parameters:

Name Type Description Default stop_sequences Optional[Union[str, List[str]]]

the stop sequences to be checked.

None

Returns:

Type Description Union[List[str], None]

The stop sequences.

Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py
def _check_stop_sequences(\n    self,\n    stop_sequences: Optional[Union[str, List[str]]] = None,\n) -> Union[List[str], None]:\n    \"\"\"Checks that no more than 4 stop sequences are provided.\n\n    Args:\n        stop_sequences: the stop sequences to be checked.\n\n    Returns:\n        The stop sequences.\n    \"\"\"\n    if stop_sequences is not None:\n        if isinstance(stop_sequences, str):\n            stop_sequences = [stop_sequences]\n        if len(stop_sequences) > 4:\n            warnings.warn(\n                \"Only up to 4 stop sequences are allowed, so keeping the first 4 items only.\",\n                UserWarning,\n                stacklevel=2,\n            )\n            stop_sequences = stop_sequences[:4]\n    return stop_sequences\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.InferenceEndpointsLLM.agenerate","title":"agenerate(input, max_new_tokens=128, frequency_penalty=None, logit_bias=None, presence_penalty=None, seed=None, stop_sequences=None, temperature=1.0, tool_choice=None, tool_prompt=None, tools=None, top_p=None, do_sample=False, repetition_penalty=None, return_full_text=False, top_k=None, typical_p=None, watermark=False) async","text":"

Generates completions for the given input using the async client. This method uses two methods of the huggingface_hub.AsyncInferenceClient: chat_completion and text_generation. The chat_completion method will be used only if no tokenizer_id has been specified. Some arguments of this function are specific to the text_generation method, while some others are specific to the chat_completion method.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate responses for.

required max_new_tokens int

the maximum number of new tokens that the model will generate. Defaults to 128.

128 frequency_penalty Optional[Annotated[float, Field(ge=-2.0, le=2.0)]]

a value between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. Defaults to None.

None logit_bias Optional[List[float]]

modify the likelihood of specified tokens appearing in the completion. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None. Defaults to None.

None presence_penalty Optional[Annotated[float, Field(ge=-2.0, le=2.0)]]

a value between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None. Defaults to None.

None seed Optional[int]

the seed to use for the generation. Defaults to None.

None stop_sequences Optional[List[str]]

either a single string or a list of strings containing the sequences to stop the generation at. Defaults to None, but will be set to the tokenizer.eos_token if available.

None temperature float

the temperature to use for the generation. Defaults to 1.0.

1.0 tool_choice Optional[Union[Dict[str, str], Literal['auto']]]

the name of the tool the model should call. It can be a dictionary like {\"function_name\": \"my_tool\"} or \"auto\". If not provided, then the model won't use any tool. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None. Defaults to None.

None tool_prompt Optional[str]

A prompt to be appended before the tools. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None. Defaults to None.

None tools Optional[List[Dict[str, Any]]]

a list of tools definitions that the LLM can use. This argument is exclusive to the chat_completion method and will be used only if tokenizer_id is None. Defaults to None.

None top_p Optional[float]

the top-p value to use for the generation. Defaults to None.

None do_sample bool

whether to use sampling for the generation. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None. Defaults to False.

False repetition_penalty Optional[float]

the repetition penalty to use for the generation. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None. Defaults to None.

None return_full_text bool

whether to return the full text of the completion or just the generated text. Defaults to False, meaning that only the generated text will be returned. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None.

False top_k Optional[int]

the top-k value to use for the generation. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None. Defaults to None.

None typical_p Optional[float]

the typical-p value to use for the generation. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None. Defaults to None.

None watermark bool

whether to add the watermark to the generated text. This argument is exclusive of the text_generation method and will be only used if tokenizer_id is not None. Defaults to False.

False

Returns:

Type Description GenerateOutput

A list of lists of strings containing the generated responses for each input.

Source code in src/distilabel/models/llms/huggingface/inference_endpoints.py
@validate_call\nasync def agenerate(  # type: ignore\n    self,\n    input: FormattedInput,\n    max_new_tokens: int = 128,\n    frequency_penalty: Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] = None,\n    logit_bias: Optional[List[float]] = None,\n    presence_penalty: Optional[Annotated[float, Field(ge=-2.0, le=2.0)]] = None,\n    seed: Optional[int] = None,\n    stop_sequences: Optional[List[str]] = None,\n    temperature: float = 1.0,\n    tool_choice: Optional[Union[Dict[str, str], Literal[\"auto\"]]] = None,\n    tool_prompt: Optional[str] = None,\n    tools: Optional[List[Dict[str, Any]]] = None,\n    top_p: Optional[float] = None,\n    do_sample: bool = False,\n    repetition_penalty: Optional[float] = None,\n    return_full_text: bool = False,\n    top_k: Optional[int] = None,\n    typical_p: Optional[float] = None,\n    watermark: bool = False,\n) -> GenerateOutput:\n    \"\"\"Generates completions for the given input using the async client. This method\n    uses two methods of the `huggingface_hub.AsyncClient`: `chat_completion` and `text_generation`.\n    `chat_completion` method will be used only if no `tokenizer_id` has been specified.\n    Some arguments of this function are specific to the `text_generation` method, while\n    some others are specific to the `chat_completion` method.\n\n    Args:\n        input: a single input in chat format to generate responses for.\n        max_new_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `128`.\n        frequency_penalty: a value between `-2.0` and `2.0`. Positive values penalize\n            new tokens based on their existing frequency in the text so far, decreasing\n            model's likelihood to repeat the same line verbatim. 
Defauls to `None`.\n        logit_bias: modify the likelihood of specified tokens appearing in the completion.\n            This argument is exclusive to the `chat_completion` method and will be used\n            only if `tokenizer_id` is `None`.\n            Defaults to `None`.\n        presence_penalty: a value between `-2.0` and `2.0`. Positive values penalize\n            new tokens based on whether they appear in the text so far, increasing the\n            model likelihood to talk about new topics. This argument is exclusive to\n            the `chat_completion` method and will be used only if `tokenizer_id` is\n            `None`. Defauls to `None`.\n        seed: the seed to use for the generation. Defaults to `None`.\n        stop_sequences: either a single string or a list of strings containing the sequences\n            to stop the generation at. Defaults to `None`, but will be set to the\n            `tokenizer.eos_token` if available.\n        temperature: the temperature to use for the generation. Defaults to `1.0`.\n        tool_choice: the name of the tool the model should call. It can be a dictionary\n            like `{\"function_name\": \"my_tool\"}` or \"auto\". If not provided, then the\n            model won't use any tool. This argument is exclusive to the `chat_completion`\n            method and will be used only if `tokenizer_id` is `None`. Defaults to `None`.\n        tool_prompt: A prompt to be appended before the tools. This argument is exclusive\n            to the `chat_completion` method and will be used only if `tokenizer_id`\n            is `None`. Defauls to `None`.\n        tools: a list of tools definitions that the LLM can use.\n            This argument is exclusive to the `chat_completion` method and will be used\n            only if `tokenizer_id` is `None`. Defaults to `None`.\n        top_p: the top-p value to use for the generation. Defaults to `1.0`.\n        do_sample: whether to use sampling for the generation. 
This argument is exclusive\n            of the `text_generation` method and will be only used if `tokenizer_id` is not\n            `None`. Defaults to `False`.\n        repetition_penalty: the repetition penalty to use for the generation. This argument\n            is exclusive of the `text_generation` method and will be only used if `tokenizer_id`\n            is not `None`. Defaults to `None`.\n        return_full_text: whether to return the full text of the completion or just\n            the generated text. Defaults to `False`, meaning that only the generated\n            text will be returned. This argument is exclusive of the `text_generation`\n            method and will be only used if `tokenizer_id` is not `None`.\n        top_k: the top-k value to use for the generation. This argument is exclusive\n            of the `text_generation` method and will be only used if `tokenizer_id`\n            is not `None`. Defaults to `0.8`, since neither `0.0` nor `1.0` are valid\n            values in TGI.\n        typical_p: the typical-p value to use for the generation. This argument is exclusive\n            of the `text_generation` method and will be only used if `tokenizer_id`\n            is not `None`. Defaults to `None`.\n        watermark: whether to add the watermark to the generated text. This argument\n            is exclusive of the `text_generation` method and will be only used if `tokenizer_id`\n            is not `None`. 
Defaults to `None`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n    stop_sequences = self._check_stop_sequences(stop_sequences)\n\n    if self.tokenizer_id is None:\n        return [\n            await self._generate_with_chat_completion(\n                input=input,  # type: ignore\n                max_new_tokens=max_new_tokens,\n                frequency_penalty=frequency_penalty,\n                logit_bias=logit_bias,\n                presence_penalty=presence_penalty,\n                seed=seed,\n                stop_sequences=stop_sequences,\n                temperature=temperature,\n                tool_choice=tool_choice,\n                tool_prompt=tool_prompt,\n                tools=tools,\n                top_p=top_p,\n            )\n        ]\n\n    return [\n        await self._generate_with_text_generation(\n            input=input,\n            max_new_tokens=max_new_tokens,\n            do_sample=do_sample,\n            typical_p=typical_p,\n            repetition_penalty=repetition_penalty,\n            frequency_penalty=frequency_penalty,\n            temperature=temperature,\n            top_p=top_p,\n            top_k=top_k,\n            stop_sequences=stop_sequences,\n            return_full_text=return_full_text,\n            seed=seed,\n            watermark=watermark,\n        )\n    ]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM","title":"TransformersLLM","text":"

Bases: LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin

Hugging Face transformers library LLM implementation using the text generation pipeline.

Attributes:

Name Type Description model str

the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

revision str

if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\".

torch_dtype str

the torch dtype to use for the model e.g. \"float16\", \"float32\", etc. Defaults to \"auto\".

trust_remote_code bool

whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False.

model_kwargs Optional[Dict[str, Any]]

additional dictionary of keyword arguments that will be passed to the from_pretrained method of the model.

tokenizer Optional[str]

the tokenizer Hugging Face Hub repo id or a path to a directory containing the tokenizer config files. If not provided, the one associated to the model will be used. Defaults to None.

use_fast bool

whether to use a fast tokenizer or not. Defaults to True.

chat_template Optional[str]

a chat template that will be used to build the prompts before sending them to the model. If not provided, the chat template defined in the tokenizer config will be used. If not provided and the tokenizer doesn't have a chat template, then ChatML template will be used. Defaults to None.

device Optional[Union[str, int]]

the name or index of the device where the model will be loaded. Defaults to None.

device_map Optional[Union[str, Dict[str, Any]]]

a dictionary mapping each layer of the model to a device, or a mode like \"sequential\" or \"auto\". Defaults to None.

token Optional[SecretStr]

the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment or huggingface_hub package local configuration will be used. Defaults to None.

structured_output Optional[RuntimeParameter[OutlinesStructuredOutputType]]

a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput. Defaults to None.

use_magpie_template Optional[RuntimeParameter[OutlinesStructuredOutputType]]

a flag used to enable/disable applying the Magpie pre-query template. Defaults to False.

magpie_pre_query_template Optional[RuntimeParameter[OutlinesStructuredOutputType]]

the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None.

Icon

:hugging:

Examples:

Generate text:

from distilabel.models.llms import TransformersLLM\n\nllm = TransformersLLM(model=\"microsoft/Phi-3-mini-4k-instruct\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
Source code in src/distilabel/models/llms/huggingface/transformers.py
class TransformersLLM(LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin):\n    \"\"\"Hugging Face `transformers` library LLM implementation using the text generation\n    pipeline.\n\n    Attributes:\n        model: the model Hugging Face Hub repo id or a path to a directory containing the\n            model weights and configuration files.\n        revision: if `model` refers to a Hugging Face Hub repository, then the revision\n            (e.g. a branch name or a commit id) to use. Defaults to `\"main\"`.\n        torch_dtype: the torch dtype to use for the model e.g. \"float16\", \"float32\", etc.\n            Defaults to `\"auto\"`.\n        trust_remote_code: whether to allow fetching and executing remote code fetched\n            from the repository in the Hub. Defaults to `False`.\n        model_kwargs: additional dictionary of keyword arguments that will be passed to\n            the `from_pretrained` method of the model.\n        tokenizer: the tokenizer Hugging Face Hub repo id or a path to a directory containing\n            the tokenizer config files. If not provided, the one associated to the `model`\n            will be used. Defaults to `None`.\n        use_fast: whether to use a fast tokenizer or not. Defaults to `True`.\n        chat_template: a chat template that will be used to build the prompts before\n            sending them to the model. If not provided, the chat template defined in the\n            tokenizer config will be used. If not provided and the tokenizer doesn't have\n            a chat template, then ChatML template will be used. Defaults to `None`.\n        device: the name or index of the device where the model will be loaded. Defaults\n            to `None`.\n        device_map: a dictionary mapping each layer of the model to a device, or a mode\n            like `\"sequential\"` or `\"auto\"`. 
Defaults to `None`.\n        token: the Hugging Face Hub token that will be used to authenticate to the Hugging\n            Face Hub. If not provided, the `HF_TOKEN` environment or `huggingface_hub` package\n            local configuration will be used. Defaults to `None`.\n        structured_output: a dictionary containing the structured output configuration or if more\n            fine-grained control is needed, an instance of `OutlinesStructuredOutput`. Defaults to None.\n        use_magpie_template: a flag used to enable/disable applying the Magpie pre-query\n            template. Defaults to `False`.\n        magpie_pre_query_template: the pre-query template to be applied to the prompt or\n            sent to the LLM to generate an instruction or a follow up user message. Valid\n            values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults\n            to `None`.\n\n    Icon:\n        `:hugging:`\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import TransformersLLM\n\n        llm = TransformersLLM(model=\"microsoft/Phi-3-mini-4k-instruct\")\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n    \"\"\"\n\n    model: str\n    revision: str = \"main\"\n    torch_dtype: str = \"auto\"\n    trust_remote_code: bool = False\n    model_kwargs: Optional[Dict[str, Any]] = None\n    tokenizer: Optional[str] = None\n    use_fast: bool = True\n    chat_template: Optional[str] = None\n    device: Optional[Union[str, int]] = None\n    device_map: Optional[Union[str, Dict[str, Any]]] = None\n    token: Optional[SecretStr] = Field(\n        default_factory=lambda: os.getenv(HF_TOKEN_ENV_VAR)\n    )\n    structured_output: Optional[RuntimeParameter[OutlinesStructuredOutputType]] = Field(\n        default=None,\n        description=\"The structured output format to use across all 
the generations.\",\n    )\n\n    _pipeline: Optional[\"Pipeline\"] = PrivateAttr(...)\n    _prefix_allowed_tokens_fn: Union[Callable, None] = PrivateAttr(default=None)\n\n    def load(self) -> None:\n        \"\"\"Loads the model and tokenizer and creates the text generation pipeline. In addition,\n        it will configure the tokenizer chat template.\"\"\"\n        if self.device == \"cuda\":\n            CudaDevicePlacementMixin.load(self)\n\n        try:\n            from transformers import pipeline\n        except ImportError as ie:\n            raise ImportError(\n                \"Transformers is not installed. Please install it using `pip install transformers`.\"\n            ) from ie\n\n        token = self.token.get_secret_value() if self.token is not None else self.token\n\n        self._pipeline = pipeline(\n            \"text-generation\",\n            model=self.model,\n            revision=self.revision,\n            torch_dtype=self.torch_dtype,\n            trust_remote_code=self.trust_remote_code,\n            model_kwargs=self.model_kwargs or {},\n            tokenizer=self.tokenizer or self.model,\n            use_fast=self.use_fast,\n            device=self.device,\n            device_map=self.device_map,\n            token=token,\n            return_full_text=False,\n        )\n\n        if self.chat_template is not None:\n            self._pipeline.tokenizer.chat_template = self.chat_template  # type: ignore\n\n        if self._pipeline.tokenizer.pad_token is None:  # type: ignore\n            self._pipeline.tokenizer.pad_token = self._pipeline.tokenizer.eos_token  # type: ignore\n\n        if self.structured_output:\n            self._prefix_allowed_tokens_fn = self._prepare_structured_output(\n                self.structured_output\n            )\n\n        super().load()\n\n    def unload(self) -> None:\n        \"\"\"Unloads the `vLLM` model.\"\"\"\n        CudaDevicePlacementMixin.unload(self)\n        super().unload()\n\n    
@property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    def prepare_input(self, input: \"StandardInput\") -> str:\n        \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n        input.\n\n        Args:\n            input: the input list containing chat items.\n\n        Returns:\n            The prompt to send to the LLM.\n        \"\"\"\n        if self._pipeline.tokenizer.chat_template:  # type: ignore\n            return input[0][\"content\"]\n\n        prompt: str = (\n            self._pipeline.tokenizer.apply_chat_template(  # type: ignore\n                input,  # type: ignore\n                tokenize=False,\n                add_generation_prompt=True,\n            )\n            if input\n            else \"\"\n        )\n        return super().apply_magpie_pre_query_template(prompt, input)\n\n    @validate_call\n    def generate(  # type: ignore\n        self,\n        inputs: List[StandardInput],\n        num_generations: int = 1,\n        max_new_tokens: int = 128,\n        temperature: float = 0.1,\n        repetition_penalty: float = 1.1,\n        top_p: float = 1.0,\n        top_k: int = 0,\n        do_sample: bool = True,\n    ) -> List[GenerateOutput]:\n        \"\"\"Generates `num_generations` responses for each input using the text generation\n        pipeline.\n\n        Args:\n            inputs: a list of inputs in chat format to generate responses for.\n            num_generations: the number of generations to create per input. Defaults to\n                `1`.\n            max_new_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `128`.\n            temperature: the temperature to use for the generation. Defaults to `0.1`.\n            repetition_penalty: the repetition penalty to use for the generation. 
Defaults\n                to `1.1`.\n            top_p: the top-p value to use for the generation. Defaults to `1.0`.\n            top_k: the top-k value to use for the generation. Defaults to `0`.\n            do_sample: whether to use sampling or not. Defaults to `True`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n        prepared_inputs = [self.prepare_input(input=input) for input in inputs]\n\n        outputs: List[List[Dict[str, str]]] = self._pipeline(  # type: ignore\n            prepared_inputs,\n            max_new_tokens=max_new_tokens,\n            temperature=temperature,\n            repetition_penalty=repetition_penalty,\n            top_p=top_p,\n            top_k=top_k,\n            do_sample=do_sample,\n            num_return_sequences=num_generations,\n            prefix_allowed_tokens_fn=self._prefix_allowed_tokens_fn,\n            pad_token_id=self._pipeline.tokenizer.eos_token_id,  # type: ignore\n        )\n        return [\n            [generation[\"generated_text\"] for generation in output]\n            for output in outputs\n        ]\n\n    def get_last_hidden_states(\n        self, inputs: List[\"StandardInput\"]\n    ) -> List[\"HiddenState\"]:\n        \"\"\"Gets the last `hidden_states` of the model for the given inputs. 
It doesn't\n        execute the task head.\n\n        Args:\n            inputs: a list of inputs in chat format to generate the embeddings for.\n\n        Returns:\n            A list containing the last hidden state for each sequence using a NumPy array\n            with shape [num_tokens, hidden_size].\n        \"\"\"\n        model: \"PreTrainedModel\" = (\n            self._pipeline.model.model  # type: ignore\n            if hasattr(self._pipeline.model, \"model\")  # type: ignore\n            else next(self._pipeline.model.children())  # type: ignore\n        )\n        tokenizer: \"PreTrainedTokenizer\" = self._pipeline.tokenizer  # type: ignore\n        input_ids = tokenizer(\n            [self.prepare_input(input) for input in inputs],  # type: ignore\n            return_tensors=\"pt\",\n            padding=True,\n        ).to(model.device)\n        last_hidden_states = model(**input_ids)[\"last_hidden_state\"]\n\n        return [\n            seq_last_hidden_state[attention_mask.bool(), :].detach().cpu().numpy()\n            for seq_last_hidden_state, attention_mask in zip(\n                last_hidden_states,\n                input_ids[\"attention_mask\"],  # type: ignore\n            )\n        ]\n\n    def _prepare_structured_output(\n        self, structured_output: Optional[OutlinesStructuredOutputType] = None\n    ) -> Union[Callable, None]:\n        \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n        Args:\n            structured_output: the configuration dict to prepare the structured output.\n\n        Returns:\n            The callable that will be used to guide the generation of the model.\n        \"\"\"\n        from distilabel.steps.tasks.structured_outputs.outlines import (\n            prepare_guided_output,\n        )\n\n        result = prepare_guided_output(\n            structured_output, \"transformers\", self._pipeline\n        )\n        if schema := result.get(\"schema\"):\n           
 self.structured_output[\"schema\"] = schema\n        return result[\"processor\"]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.load","title":"load()","text":"

Loads the model and tokenizer and creates the text generation pipeline. In addition, it will configure the tokenizer chat template.

Source code in src/distilabel/models/llms/huggingface/transformers.py
def load(self) -> None:\n    \"\"\"Loads the model and tokenizer and creates the text generation pipeline. In addition,\n    it will configure the tokenizer chat template.\"\"\"\n    if self.device == \"cuda\":\n        CudaDevicePlacementMixin.load(self)\n\n    try:\n        from transformers import pipeline\n    except ImportError as ie:\n        raise ImportError(\n            \"Transformers is not installed. Please install it using `pip install transformers`.\"\n        ) from ie\n\n    token = self.token.get_secret_value() if self.token is not None else self.token\n\n    self._pipeline = pipeline(\n        \"text-generation\",\n        model=self.model,\n        revision=self.revision,\n        torch_dtype=self.torch_dtype,\n        trust_remote_code=self.trust_remote_code,\n        model_kwargs=self.model_kwargs or {},\n        tokenizer=self.tokenizer or self.model,\n        use_fast=self.use_fast,\n        device=self.device,\n        device_map=self.device_map,\n        token=token,\n        return_full_text=False,\n    )\n\n    if self.chat_template is not None:\n        self._pipeline.tokenizer.chat_template = self.chat_template  # type: ignore\n\n    if self._pipeline.tokenizer.pad_token is None:  # type: ignore\n        self._pipeline.tokenizer.pad_token = self._pipeline.tokenizer.eos_token  # type: ignore\n\n    if self.structured_output:\n        self._prefix_allowed_tokens_fn = self._prepare_structured_output(\n            self.structured_output\n        )\n\n    super().load()\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.unload","title":"unload()","text":"

Unloads the Transformers model.

Source code in src/distilabel/models/llms/huggingface/transformers.py
def unload(self) -> None:\n    \"\"\"Unloads the `vLLM` model.\"\"\"\n    CudaDevicePlacementMixin.unload(self)\n    super().unload()\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.prepare_input","title":"prepare_input(input)","text":"

Prepares the input (applying the chat template and tokenization) for the provided input.

Parameters:

Name Type Description Default input StandardInput

the input list containing chat items.

required

Returns:

Type Description str

The prompt to send to the LLM.

Source code in src/distilabel/models/llms/huggingface/transformers.py
def prepare_input(self, input: \"StandardInput\") -> str:\n    \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n    input.\n\n    Args:\n        input: the input list containing chat items.\n\n    Returns:\n        The prompt to send to the LLM.\n    \"\"\"\n    if self._pipeline.tokenizer.chat_template:  # type: ignore\n        return input[0][\"content\"]\n\n    prompt: str = (\n        self._pipeline.tokenizer.apply_chat_template(  # type: ignore\n            input,  # type: ignore\n            tokenize=False,\n            add_generation_prompt=True,\n        )\n        if input\n        else \"\"\n    )\n    return super().apply_magpie_pre_query_template(prompt, input)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.generate","title":"generate(inputs, num_generations=1, max_new_tokens=128, temperature=0.1, repetition_penalty=1.1, top_p=1.0, top_k=0, do_sample=True)","text":"

Generates num_generations responses for each input using the text generation pipeline.

Parameters:

Name Type Description Default inputs List[StandardInput]

a list of inputs in chat format to generate responses for.

required num_generations int

the number of generations to create per input. Defaults to 1.

1 max_new_tokens int

the maximum number of new tokens that the model will generate. Defaults to 128.

128 temperature float

the temperature to use for the generation. Defaults to 0.1.

0.1 repetition_penalty float

the repetition penalty to use for the generation. Defaults to 1.1.

1.1 top_p float

the top-p value to use for the generation. Defaults to 1.0.

1.0 top_k int

the top-k value to use for the generation. Defaults to 0.

0 do_sample bool

whether to use sampling or not. Defaults to True.

True

Returns:

Type Description List[GenerateOutput]

A list of lists of strings containing the generated responses for each input.

Source code in src/distilabel/models/llms/huggingface/transformers.py
@validate_call\ndef generate(  # type: ignore\n    self,\n    inputs: List[StandardInput],\n    num_generations: int = 1,\n    max_new_tokens: int = 128,\n    temperature: float = 0.1,\n    repetition_penalty: float = 1.1,\n    top_p: float = 1.0,\n    top_k: int = 0,\n    do_sample: bool = True,\n) -> List[GenerateOutput]:\n    \"\"\"Generates `num_generations` responses for each input using the text generation\n    pipeline.\n\n    Args:\n        inputs: a list of inputs in chat format to generate responses for.\n        num_generations: the number of generations to create per input. Defaults to\n            `1`.\n        max_new_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `128`.\n        temperature: the temperature to use for the generation. Defaults to `0.1`.\n        repetition_penalty: the repetition penalty to use for the generation. Defaults\n            to `1.1`.\n        top_p: the top-p value to use for the generation. Defaults to `1.0`.\n        top_k: the top-k value to use for the generation. Defaults to `0`.\n        do_sample: whether to use sampling or not. Defaults to `True`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n    prepared_inputs = [self.prepare_input(input=input) for input in inputs]\n\n    outputs: List[List[Dict[str, str]]] = self._pipeline(  # type: ignore\n        prepared_inputs,\n        max_new_tokens=max_new_tokens,\n        temperature=temperature,\n        repetition_penalty=repetition_penalty,\n        top_p=top_p,\n        top_k=top_k,\n        do_sample=do_sample,\n        num_return_sequences=num_generations,\n        prefix_allowed_tokens_fn=self._prefix_allowed_tokens_fn,\n        pad_token_id=self._pipeline.tokenizer.eos_token_id,  # type: ignore\n    )\n    return [\n        [generation[\"generated_text\"] for generation in output]\n        for output in outputs\n    ]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM.get_last_hidden_states","title":"get_last_hidden_states(inputs)","text":"

Gets the last hidden_states of the model for the given inputs. It doesn't execute the task head.

Parameters:

Name Type Description Default inputs List[StandardInput]

a list of inputs in chat format to generate the embeddings for.

required

Returns:

Type Description List[HiddenState]

A list containing the last hidden state for each sequence, as a NumPy array with shape [num_tokens, hidden_size].

Source code in src/distilabel/models/llms/huggingface/transformers.py
def get_last_hidden_states(\n    self, inputs: List[\"StandardInput\"]\n) -> List[\"HiddenState\"]:\n    \"\"\"Gets the last `hidden_states` of the model for the given inputs. It doesn't\n    execute the task head.\n\n    Args:\n        inputs: a list of inputs in chat format to generate the embeddings for.\n\n    Returns:\n        A list containing the last hidden state for each sequence using a NumPy array\n        with shape [num_tokens, hidden_size].\n    \"\"\"\n    model: \"PreTrainedModel\" = (\n        self._pipeline.model.model  # type: ignore\n        if hasattr(self._pipeline.model, \"model\")  # type: ignore\n        else next(self._pipeline.model.children())  # type: ignore\n    )\n    tokenizer: \"PreTrainedTokenizer\" = self._pipeline.tokenizer  # type: ignore\n    input_ids = tokenizer(\n        [self.prepare_input(input) for input in inputs],  # type: ignore\n        return_tensors=\"pt\",\n        padding=True,\n    ).to(model.device)\n    last_hidden_states = model(**input_ids)[\"last_hidden_state\"]\n\n    return [\n        seq_last_hidden_state[attention_mask.bool(), :].detach().cpu().numpy()\n        for seq_last_hidden_state, attention_mask in zip(\n            last_hidden_states,\n            input_ids[\"attention_mask\"],  # type: ignore\n        )\n    ]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TransformersLLM._prepare_structured_output","title":"_prepare_structured_output(structured_output=None)","text":"

Creates the appropriate function to filter tokens to generate structured outputs.

Parameters:

Name Type Description Default structured_output Optional[OutlinesStructuredOutputType]

the configuration dict to prepare the structured output.

None

Returns:

Type Description Union[Callable, None]

The callable that will be used to guide the generation of the model.

Source code in src/distilabel/models/llms/huggingface/transformers.py
def _prepare_structured_output(\n    self, structured_output: Optional[OutlinesStructuredOutputType] = None\n) -> Union[Callable, None]:\n    \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n    Args:\n        structured_output: the configuration dict to prepare the structured output.\n\n    Returns:\n        The callable that will be used to guide the generation of the model.\n    \"\"\"\n    from distilabel.steps.tasks.structured_outputs.outlines import (\n        prepare_guided_output,\n    )\n\n    result = prepare_guided_output(\n        structured_output, \"transformers\", self._pipeline\n    )\n    if schema := result.get(\"schema\"):\n        self.structured_output[\"schema\"] = schema\n    return result[\"processor\"]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LiteLLM","title":"LiteLLM","text":"

Bases: AsyncLLM

LiteLLM implementation running the async API client.

Attributes:

Name Type Description model str

the model name to use for the LLM e.g. \"gpt-3.5-turbo\" or \"mistral/mistral-large\", etc.

verbose RuntimeParameter[bool]

whether to log the LiteLLM client's logs. Defaults to False.

structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]]

a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

Runtime parameters
  • verbose: whether to log the LiteLLM client's logs. Defaults to False.

Examples:

Generate text:

from distilabel.models.llms import LiteLLM\n\nllm = LiteLLM(model=\"gpt-3.5-turbo\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pydantic import BaseModel\nfrom distilabel.models.llms import LiteLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = LiteLLM(\n    model=\"gpt-3.5-turbo\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
Source code in src/distilabel/models/llms/litellm.py
class LiteLLM(AsyncLLM):\n    \"\"\"LiteLLM implementation running the async API client.\n\n    Attributes:\n        model: the model name to use for the LLM e.g. \"gpt-3.5-turbo\" or \"mistral/mistral-large\",\n            etc.\n        verbose: whether to log the LiteLLM client's logs. Defaults to `False`.\n        structured_output: a dictionary containing the structured output configuration configuration\n            using `instructor`. You can take a look at the dictionary structure in\n            `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n\n    Runtime parameters:\n        - `verbose`: whether to log the LiteLLM client's logs. Defaults to `False`.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import LiteLLM\n\n        llm = LiteLLM(model=\"gpt-3.5-turbo\")\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\n        Generate structured data:\n\n        ```python\n        from pydantic import BaseModel\n        from distilabel.models.llms import LiteLLM\n\n        class User(BaseModel):\n            name: str\n            last_name: str\n            id: int\n\n        llm = LiteLLM(\n            model=\"gpt-3.5-turbo\",\n            api_key=\"api.key\",\n            structured_output={\"schema\": User}\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n        ```\n    \"\"\"\n\n    model: str\n    verbose: RuntimeParameter[bool] = Field(\n        default=False, description=\"Whether to log the LiteLLM client's logs.\"\n    )\n    structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n        Field(\n            default=None,\n            description=\"The structured output format to use across all the 
generations.\",\n        )\n    )\n\n    _aclient: Optional[Callable] = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"\n        Loads the `acompletion` LiteLLM client to benefit from async requests.\n        \"\"\"\n        super().load()\n\n        try:\n            import litellm\n\n            litellm.telemetry = False\n        except ImportError as e:\n            raise ImportError(\n                \"LiteLLM Python client is not installed. Please install it using\"\n                \" `pip install litellm`.\"\n            ) from e\n        self._aclient = litellm.acompletion\n\n        if not self.verbose:\n            litellm.suppress_debug_info = True\n            for key in logging.Logger.manager.loggerDict.keys():\n                if \"litellm\" not in key.lower():\n                    continue\n                logging.getLogger(key).setLevel(logging.CRITICAL)\n\n        if self.structured_output:\n            result = self._prepare_structured_output(\n                structured_output=self.structured_output,\n                client=self._aclient,\n                framework=\"litellm\",\n            )\n            self._aclient = result.get(\"client\")\n            if structured_output := result.get(\"structured_output\"):\n                self.structured_output = structured_output\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    @validate_call\n    async def agenerate(  # type: ignore # noqa: C901\n        self,\n        input: FormattedInput,\n        num_generations: int = 1,\n        functions: Optional[List] = None,\n        function_call: Optional[str] = None,\n        temperature: Optional[float] = 1.0,\n        top_p: Optional[float] = 1.0,\n        stop: Optional[Union[str, list]] = None,\n        max_tokens: Optional[int] = None,\n        presence_penalty: Optional[float] = None,\n        frequency_penalty: Optional[float] = None,\n  
      logit_bias: Optional[dict] = None,\n        user: Optional[str] = None,\n        metadata: Optional[dict] = None,\n        api_base: Optional[str] = None,\n        api_version: Optional[str] = None,\n        api_key: Optional[str] = None,\n        model_list: Optional[list] = None,\n        mock_response: Optional[str] = None,\n        force_timeout: Optional[int] = 600,\n        custom_llm_provider: Optional[str] = None,\n    ) -> GenerateOutput:\n        \"\"\"Generates `num_generations` responses for the given input using the [LiteLLM async client](https://github.com/BerriAI/litellm).\n\n        Args:\n            input: a single input in chat format to generate responses for.\n            num_generations: the number of generations to create per input. Defaults to\n                `1`.\n            functions: a list of functions to apply to the conversation messages. Defaults to\n                `None`.\n            function_call: the name of the function to call within the conversation. Defaults\n                to `None`.\n            temperature: the temperature to use for the generation. Defaults to `1.0`.\n            top_p: the top-p value to use for the generation. Defaults to `1.0`.\n            stop: Up to 4 sequences where the LLM API will stop generating further tokens.\n                Defaults to `None`.\n            max_tokens: The maximum number of tokens in the generated completion. Defaults to\n                `None`.\n            presence_penalty: It is used to penalize new tokens based on their existence in the\n                text so far. Defaults to `None`.\n            frequency_penalty: It is used to penalize new tokens based on their frequency in the\n                text so far. Defaults to `None`.\n            logit_bias: Used to modify the probability of specific tokens appearing in the\n                completion. Defaults to `None`.\n            user: A unique identifier representing your end-user. 
This can help the LLM provider\n                to monitor and detect abuse. Defaults to `None`.\n            metadata: Pass in additional metadata to tag your completion calls - eg. prompt\n                version, details, etc. Defaults to `None`.\n            api_base: Base URL for the API. Defaults to `None`.\n            api_version: API version. Defaults to `None`.\n            api_key: API key. Defaults to `None`.\n            model_list: List of api base, version, keys. Defaults to `None`.\n            mock_response: If provided, return a mock completion response for testing or debugging\n                purposes. Defaults to `None`.\n            force_timeout: The maximum execution time in seconds for the completion request.\n                Defaults to `600`.\n            custom_llm_provider: Used for Non-OpenAI LLMs, Example usage for bedrock, set(iterable)\n                model=\"amazon.titan-tg1-large\" and custom_llm_provider=\"bedrock\". Defaults to\n                `None`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n        import litellm\n\n        structured_output = None\n        if isinstance(input, tuple):\n            input, structured_output = input\n            result = self._prepare_structured_output(\n                structured_output=structured_output,\n                client=self._aclient,\n                framework=\"litellm\",\n            )\n            self._aclient = result.get(\"client\")\n\n        if structured_output is None and self.structured_output is not None:\n            structured_output = self.structured_output\n\n        kwargs = {\n            \"model\": self.model,\n            \"messages\": input,\n            \"n\": num_generations,\n            \"functions\": functions,\n            \"function_call\": function_call,\n            \"temperature\": temperature,\n            \"top_p\": top_p,\n            \"stream\": False,\n           
 \"stop\": stop,\n            \"max_tokens\": max_tokens,\n            \"presence_penalty\": presence_penalty,\n            \"frequency_penalty\": frequency_penalty,\n            \"logit_bias\": logit_bias,\n            \"user\": user,\n            \"metadata\": metadata,\n            \"api_base\": api_base,\n            \"api_version\": api_version,\n            \"api_key\": api_key,\n            \"model_list\": model_list,\n            \"mock_response\": mock_response,\n            \"force_timeout\": force_timeout,\n            \"custom_llm_provider\": custom_llm_provider,\n        }\n        if structured_output:\n            kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n        async def _call_aclient_until_n_choices() -> List[\"Choices\"]:\n            choices = []\n            while len(choices) < num_generations:\n                completion = await self._aclient(**kwargs)  # type: ignore\n                if not self.structured_output:\n                    completion = completion.choices\n                choices.extend(completion)\n            return choices\n\n        # litellm.drop_params is used to en/disable sending **kwargs parameters to the API if they cannot be used\n        try:\n            litellm.drop_params = False\n            choices = await _call_aclient_until_n_choices()\n        except litellm.exceptions.APIError as e:\n            if \"does not support parameters\" in str(e):\n                litellm.drop_params = True\n                choices = await _call_aclient_until_n_choices()\n            else:\n                raise e\n\n        generations = []\n\n        if self.structured_output:\n            generations.append([choice.model_dump_json() for choice in choices])\n            return generations\n\n        for choice in choices:\n            if (content := choice.message.content) is None:\n                self._logger.warning(  # type: ignore\n                    f\"Received no response using LiteLLM client (model: 
'{self.model}').\"\n                    f\" Finish reason was: {choice.finish_reason}\"\n                )\n            generations.append(content)\n        return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LiteLLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LiteLLM.load","title":"load()","text":"

Loads the acompletion LiteLLM client to benefit from async requests.

Source code in src/distilabel/models/llms/litellm.py
def load(self) -> None:\n    \"\"\"\n    Loads the `acompletion` LiteLLM client to benefit from async requests.\n    \"\"\"\n    super().load()\n\n    try:\n        import litellm\n\n        litellm.telemetry = False\n    except ImportError as e:\n        raise ImportError(\n            \"LiteLLM Python client is not installed. Please install it using\"\n            \" `pip install litellm`.\"\n        ) from e\n    self._aclient = litellm.acompletion\n\n    if not self.verbose:\n        litellm.suppress_debug_info = True\n        for key in logging.Logger.manager.loggerDict.keys():\n            if \"litellm\" not in key.lower():\n                continue\n            logging.getLogger(key).setLevel(logging.CRITICAL)\n\n    if self.structured_output:\n        result = self._prepare_structured_output(\n            structured_output=self.structured_output,\n            client=self._aclient,\n            framework=\"litellm\",\n        )\n        self._aclient = result.get(\"client\")\n        if structured_output := result.get(\"structured_output\"):\n            self.structured_output = structured_output\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LiteLLM.agenerate","title":"agenerate(input, num_generations=1, functions=None, function_call=None, temperature=1.0, top_p=1.0, stop=None, max_tokens=None, presence_penalty=None, frequency_penalty=None, logit_bias=None, user=None, metadata=None, api_base=None, api_version=None, api_key=None, model_list=None, mock_response=None, force_timeout=600, custom_llm_provider=None) async","text":"

Generates num_generations responses for the given input using the LiteLLM async client.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate responses for.

required num_generations int

the number of generations to create per input. Defaults to 1.

1 functions Optional[List]

a list of functions to apply to the conversation messages. Defaults to None.

None function_call Optional[str]

the name of the function to call within the conversation. Defaults to None.

None temperature Optional[float]

the temperature to use for the generation. Defaults to 1.0.

1.0 top_p Optional[float]

the top-p value to use for the generation. Defaults to 1.0.

1.0 stop Optional[Union[str, list]]

Up to 4 sequences where the LLM API will stop generating further tokens. Defaults to None.

None max_tokens Optional[int]

The maximum number of tokens in the generated completion. Defaults to None.

None presence_penalty Optional[float]

It is used to penalize new tokens based on their existence in the text so far. Defaults to None.

None frequency_penalty Optional[float]

It is used to penalize new tokens based on their frequency in the text so far. Defaults to None.

None logit_bias Optional[dict]

Used to modify the probability of specific tokens appearing in the completion. Defaults to None.

None user Optional[str]

A unique identifier representing your end-user. This can help the LLM provider to monitor and detect abuse. Defaults to None.

None metadata Optional[dict]

Pass in additional metadata to tag your completion calls, e.g. prompt version, details, etc. Defaults to None.

None api_base Optional[str]

Base URL for the API. Defaults to None.

None api_version Optional[str]

API version. Defaults to None.

None api_key Optional[str]

API key. Defaults to None.

None model_list Optional[list]

List of api base, version, keys. Defaults to None.

None mock_response Optional[str]

If provided, return a mock completion response for testing or debugging purposes. Defaults to None.

None force_timeout Optional[int]

The maximum execution time in seconds for the completion request. Defaults to 600.

600 custom_llm_provider Optional[str]

Used for non-OpenAI LLMs. Example usage for Bedrock: set model=\"amazon.titan-tg1-large\" and custom_llm_provider=\"bedrock\". Defaults to None.

None

Returns:

Type Description GenerateOutput

A list of lists of strings containing the generated responses for each input.

Source code in src/distilabel/models/llms/litellm.py
@validate_call\nasync def agenerate(  # type: ignore # noqa: C901\n    self,\n    input: FormattedInput,\n    num_generations: int = 1,\n    functions: Optional[List] = None,\n    function_call: Optional[str] = None,\n    temperature: Optional[float] = 1.0,\n    top_p: Optional[float] = 1.0,\n    stop: Optional[Union[str, list]] = None,\n    max_tokens: Optional[int] = None,\n    presence_penalty: Optional[float] = None,\n    frequency_penalty: Optional[float] = None,\n    logit_bias: Optional[dict] = None,\n    user: Optional[str] = None,\n    metadata: Optional[dict] = None,\n    api_base: Optional[str] = None,\n    api_version: Optional[str] = None,\n    api_key: Optional[str] = None,\n    model_list: Optional[list] = None,\n    mock_response: Optional[str] = None,\n    force_timeout: Optional[int] = 600,\n    custom_llm_provider: Optional[str] = None,\n) -> GenerateOutput:\n    \"\"\"Generates `num_generations` responses for the given input using the [LiteLLM async client](https://github.com/BerriAI/litellm).\n\n    Args:\n        input: a single input in chat format to generate responses for.\n        num_generations: the number of generations to create per input. Defaults to\n            `1`.\n        functions: a list of functions to apply to the conversation messages. Defaults to\n            `None`.\n        function_call: the name of the function to call within the conversation. Defaults\n            to `None`.\n        temperature: the temperature to use for the generation. Defaults to `1.0`.\n        top_p: the top-p value to use for the generation. Defaults to `1.0`.\n        stop: Up to 4 sequences where the LLM API will stop generating further tokens.\n            Defaults to `None`.\n        max_tokens: The maximum number of tokens in the generated completion. Defaults to\n            `None`.\n        presence_penalty: It is used to penalize new tokens based on their existence in the\n            text so far. 
Defaults to `None`.\n        frequency_penalty: It is used to penalize new tokens based on their frequency in the\n            text so far. Defaults to `None`.\n        logit_bias: Used to modify the probability of specific tokens appearing in the\n            completion. Defaults to `None`.\n        user: A unique identifier representing your end-user. This can help the LLM provider\n            to monitor and detect abuse. Defaults to `None`.\n        metadata: Pass in additional metadata to tag your completion calls - eg. prompt\n            version, details, etc. Defaults to `None`.\n        api_base: Base URL for the API. Defaults to `None`.\n        api_version: API version. Defaults to `None`.\n        api_key: API key. Defaults to `None`.\n        model_list: List of api base, version, keys. Defaults to `None`.\n        mock_response: If provided, return a mock completion response for testing or debugging\n            purposes. Defaults to `None`.\n        force_timeout: The maximum execution time in seconds for the completion request.\n            Defaults to `600`.\n        custom_llm_provider: Used for Non-OpenAI LLMs, Example usage for bedrock, set(iterable)\n            model=\"amazon.titan-tg1-large\" and custom_llm_provider=\"bedrock\". 
Defaults to\n            `None`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n    import litellm\n\n    structured_output = None\n    if isinstance(input, tuple):\n        input, structured_output = input\n        result = self._prepare_structured_output(\n            structured_output=structured_output,\n            client=self._aclient,\n            framework=\"litellm\",\n        )\n        self._aclient = result.get(\"client\")\n\n    if structured_output is None and self.structured_output is not None:\n        structured_output = self.structured_output\n\n    kwargs = {\n        \"model\": self.model,\n        \"messages\": input,\n        \"n\": num_generations,\n        \"functions\": functions,\n        \"function_call\": function_call,\n        \"temperature\": temperature,\n        \"top_p\": top_p,\n        \"stream\": False,\n        \"stop\": stop,\n        \"max_tokens\": max_tokens,\n        \"presence_penalty\": presence_penalty,\n        \"frequency_penalty\": frequency_penalty,\n        \"logit_bias\": logit_bias,\n        \"user\": user,\n        \"metadata\": metadata,\n        \"api_base\": api_base,\n        \"api_version\": api_version,\n        \"api_key\": api_key,\n        \"model_list\": model_list,\n        \"mock_response\": mock_response,\n        \"force_timeout\": force_timeout,\n        \"custom_llm_provider\": custom_llm_provider,\n    }\n    if structured_output:\n        kwargs = self._prepare_kwargs(kwargs, structured_output)\n\n    async def _call_aclient_until_n_choices() -> List[\"Choices\"]:\n        choices = []\n        while len(choices) < num_generations:\n            completion = await self._aclient(**kwargs)  # type: ignore\n            if not self.structured_output:\n                completion = completion.choices\n            choices.extend(completion)\n        return choices\n\n    # litellm.drop_params is used to en/disable sending **kwargs 
parameters to the API if they cannot be used\n    try:\n        litellm.drop_params = False\n        choices = await _call_aclient_until_n_choices()\n    except litellm.exceptions.APIError as e:\n        if \"does not support parameters\" in str(e):\n            litellm.drop_params = True\n            choices = await _call_aclient_until_n_choices()\n        else:\n            raise e\n\n    generations = []\n\n    if self.structured_output:\n        generations.append([choice.model_dump_json() for choice in choices])\n        return generations\n\n    for choice in choices:\n        if (content := choice.message.content) is None:\n            self._logger.warning(  # type: ignore\n                f\"Received no response using LiteLLM client (model: '{self.model}').\"\n                f\" Finish reason was: {choice.finish_reason}\"\n            )\n        generations.append(content)\n    return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM","title":"LlamaCppLLM","text":"

Bases: LLM

llama.cpp LLM implementation running the Python bindings for the C++ code.

Attributes:

Name Type Description model_path RuntimeParameter[FilePath]

contains the path to the GGUF quantized model, compatible with the installed version of the llama.cpp Python bindings.

n_gpu_layers RuntimeParameter[int]

the number of layers to use for the GPU. Defaults to -1, meaning that the available GPU device will be used.

chat_format Optional[RuntimeParameter[str]]

the chat format to use for the model. Defaults to None, which means the Llama format will be used.

n_ctx int

the context size to use for the model. Defaults to 512.

n_batch int

the prompt processing maximum batch size to use for the model. Defaults to 512.

seed int

random seed to use for the generation. Defaults to 4294967295.

verbose RuntimeParameter[bool]

whether to print verbose output. Defaults to False.

structured_output Optional[RuntimeParameter[OutlinesStructuredOutputType]]

a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput. Defaults to None.

extra_kwargs Optional[RuntimeParameter[Dict[str, Any]]]

additional dictionary of keyword arguments that will be passed to the Llama class of llama_cpp library. Defaults to {}.

_model Optional[Llama]

the Llama model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method.

Runtime parameters
  • model_path: the path to the GGUF quantized model.
  • n_gpu_layers: the number of layers to use for the GPU. Defaults to -1.
  • chat_format: the chat format to use for the model. Defaults to None.
  • verbose: whether to print verbose output. Defaults to False.
  • extra_kwargs: additional dictionary of keyword arguments that will be passed to the Llama class of llama_cpp library. Defaults to {}.
References
  • llama.cpp
  • llama-cpp-python

Examples:

Generate text:

from pathlib import Path\nfrom distilabel.models.llms import LlamaCppLLM\n\n# You can follow along this example downloading the following model running the following\n# command in the terminal, that will download the model to the `Downloads` folder:\n# curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nllm = LlamaCppLLM(\n    model_path=str(Path.home() / model_path),\n    n_gpu_layers=-1,  # To use the GPU if available\n    n_ctx=1024,       # Set the context size\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pathlib import Path\nfrom pydantic import BaseModel\nfrom distilabel.models.llms import LlamaCppLLM\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = LlamaCppLLM(\n    model_path=str(Path.home() / model_path),  # type: ignore\n    n_gpu_layers=-1,\n    n_ctx=1024,\n    structured_output={\"format\": \"json\", \"schema\": User},\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
Source code in src/distilabel/models/llms/llamacpp.py
class LlamaCppLLM(LLM):\n    \"\"\"llama.cpp LLM implementation running the Python bindings for the C++ code.\n\n    Attributes:\n        model_path: contains the path to the GGUF quantized model, compatible with the\n            installed version of the `llama.cpp` Python bindings.\n        n_gpu_layers: the number of layers to use for the GPU. Defaults to `-1`, meaning that\n            the available GPU device will be used.\n        chat_format: the chat format to use for the model. Defaults to `None`, which means the\n            Llama format will be used.\n        n_ctx: the context size to use for the model. Defaults to `512`.\n        n_batch: the prompt processing maximum batch size to use for the model. Defaults to `512`.\n        seed: random seed to use for the generation. Defaults to `4294967295`.\n        verbose: whether to print verbose output. Defaults to `False`.\n        structured_output: a dictionary containing the structured output configuration or if more\n            fine-grained control is needed, an instance of `OutlinesStructuredOutput`. Defaults to None.\n        extra_kwargs: additional dictionary of keyword arguments that will be passed to the\n            `Llama` class of `llama_cpp` library. Defaults to `{}`.\n        _model: the Llama model instance. This attribute is meant to be used internally and\n            should not be accessed directly. It will be set in the `load` method.\n\n    Runtime parameters:\n        - `model_path`: the path to the GGUF quantized model.\n        - `n_gpu_layers`: the number of layers to use for the GPU. Defaults to `-1`.\n        - `chat_format`: the chat format to use for the model. Defaults to `None`.\n        - `verbose`: whether to print verbose output. Defaults to `False`.\n        - `extra_kwargs`: additional dictionary of keyword arguments that will be passed to the\n            `Llama` class of `llama_cpp` library. 
Defaults to `{}`.\n\n    References:\n        - [`llama.cpp`](https://github.com/ggerganov/llama.cpp)\n        - [`llama-cpp-python`](https://github.com/abetlen/llama-cpp-python)\n\n    Examples:\n        Generate text:\n\n        ```python\n        from pathlib import Path\n        from distilabel.models.llms import LlamaCppLLM\n\n        # You can follow along this example downloading the following model running the following\n        # command in the terminal, that will download the model to the `Downloads` folder:\n        # curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n\n        model_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\n        llm = LlamaCppLLM(\n            model_path=str(Path.home() / model_path),\n            n_gpu_layers=-1,  # To use the GPU if available\n            n_ctx=1024,       # Set the context size\n        )\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Generate structured data:\n\n        ```python\n        from pathlib import Path\n        from distilabel.models.llms import LlamaCppLLM\n\n        model_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\n        class User(BaseModel):\n            name: str\n            last_name: str\n            id: int\n\n        llm = LlamaCppLLM(\n            model_path=str(Path.home() / model_path),  # type: ignore\n            n_gpu_layers=-1,\n            n_ctx=1024,\n            structured_output={\"format\": \"json\", \"schema\": Character},\n        )\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n        ```\n    \"\"\"\n\n    model_path: RuntimeParameter[FilePath] = 
Field(\n        default=None, description=\"The path to the GGUF quantized model.\", exclude=True\n    )\n    n_gpu_layers: RuntimeParameter[int] = Field(\n        default=-1,\n        description=\"The number of layers that will be loaded in the GPU.\",\n    )\n    chat_format: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The chat format to use for the model. Defaults to `None`, which means the Llama format will be used.\",\n    )\n\n    n_ctx: int = 512\n    n_batch: int = 512\n    seed: int = 4294967295\n    verbose: RuntimeParameter[bool] = Field(\n        default=False,\n        description=\"Whether to print verbose output from llama.cpp library.\",\n    )\n    extra_kwargs: Optional[RuntimeParameter[Dict[str, Any]]] = Field(\n        default_factory=dict,\n        description=\"Additional dictionary of keyword arguments that will be passed to the\"\n        \" `Llama` class of `llama_cpp` library. See all the supported arguments at: \"\n        \"https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__\",\n    )\n    structured_output: Optional[RuntimeParameter[OutlinesStructuredOutputType]] = Field(\n        default=None,\n        description=\"The structured output format to use across all the generations.\",\n    )\n\n    _logits_processor: Optional[\"LogitsProcessorList\"] = PrivateAttr(default=None)\n    _model: Optional[\"Llama\"] = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"Loads the `Llama` model from the `model_path`.\"\"\"\n        try:\n            from llama_cpp import Llama\n        except ImportError as ie:\n            raise ImportError(\n                \"The `llama_cpp` package is required to use the `LlamaCppLLM` class.\"\n            ) from ie\n\n        self._model = Llama(\n            model_path=self.model_path.as_posix(),  # type: ignore\n            seed=self.seed,\n            n_ctx=self.n_ctx,\n            n_batch=self.n_batch,\n           
 chat_format=self.chat_format,\n            n_gpu_layers=self.n_gpu_layers,\n            verbose=self.verbose,\n            **self.extra_kwargs,\n        )\n\n        if self.structured_output:\n            self._logits_processor = self._prepare_structured_output(\n                self.structured_output\n            )\n\n        # NOTE: Here because of the custom `logging` interface used, since it will create the logging name\n        # out of the model name, which won't be available until the `Llama` instance is created.\n        super().load()\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self._model.model_path  # type: ignore\n\n    @validate_call\n    def generate(  # type: ignore\n        self,\n        inputs: List[FormattedInput],\n        num_generations: int = 1,\n        max_new_tokens: int = 128,\n        frequency_penalty: float = 0.0,\n        presence_penalty: float = 0.0,\n        temperature: float = 1.0,\n        top_p: float = 1.0,\n        extra_generation_kwargs: Optional[Dict[str, Any]] = None,\n    ) -> List[GenerateOutput]:\n        \"\"\"Generates `num_generations` responses for the given input using the Llama model.\n\n        Args:\n            inputs: a list of inputs in chat format to generate responses for.\n            num_generations: the number of generations to create per input. Defaults to\n                `1`.\n            max_new_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `128`.\n            frequency_penalty: the repetition penalty to use for the generation. Defaults\n                to `0.0`.\n            presence_penalty: the presence penalty to use for the generation. Defaults to\n                `0.0`.\n            temperature: the temperature to use for the generation. Defaults to `0.1`.\n            top_p: the top-p value to use for the generation. 
Defaults to `1.0`.\n            extra_generation_kwargs: dictionary with additional arguments to be passed to\n                the `create_chat_completion` method. Reference at\n                https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n        structured_output = None\n        batch_outputs = []\n        for input in inputs:\n            if isinstance(input, tuple):\n                input, structured_output = input\n            elif self.structured_output:\n                structured_output = self.structured_output\n\n            outputs = []\n            for _ in range(num_generations):\n                # NOTE(plaguss): There seems to be a bug in how the logits processor\n                # is used. Basically it consumes the FSM internally, and it isn't reinitialized\n                # after each generation, so subsequent calls yield nothing. 
This is a workaround\n                # until is fixed in the `llama_cpp` or `outlines` libraries.\n                if structured_output:\n                    self._logits_processor = self._prepare_structured_output(\n                        structured_output\n                    )\n                chat_completions: \"CreateChatCompletionResponse\" = (\n                    self._model.create_chat_completion(  # type: ignore\n                        messages=input,  # type: ignore\n                        max_tokens=max_new_tokens,\n                        frequency_penalty=frequency_penalty,\n                        presence_penalty=presence_penalty,\n                        temperature=temperature,\n                        top_p=top_p,\n                        logits_processor=self._logits_processor,\n                        **(extra_generation_kwargs or {}),\n                    )\n                )\n                outputs.append(chat_completions[\"choices\"][0][\"message\"][\"content\"])\n            batch_outputs.append(outputs)\n        return batch_outputs\n\n    def _prepare_structured_output(\n        self, structured_output: Optional[OutlinesStructuredOutputType] = None\n    ) -> Union[\"LogitsProcessorList\", None]:\n        \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n        Args:\n            structured_output: the configuration dict to prepare the structured output.\n\n        Returns:\n            The callable that will be used to guide the generation of the model.\n        \"\"\"\n        from distilabel.steps.tasks.structured_outputs.outlines import (\n            prepare_guided_output,\n        )\n\n        result = prepare_guided_output(structured_output, \"llamacpp\", self._model)\n        if (schema := result.get(\"schema\")) and self.structured_output:\n            self.structured_output[\"schema\"] = schema\n        return result[\"processor\"]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM.load","title":"load()","text":"

Loads the Llama model from the model_path.

Source code in src/distilabel/models/llms/llamacpp.py
def load(self) -> None:\n    \"\"\"Loads the `Llama` model from the `model_path`.\"\"\"\n    try:\n        from llama_cpp import Llama\n    except ImportError as ie:\n        raise ImportError(\n            \"The `llama_cpp` package is required to use the `LlamaCppLLM` class.\"\n        ) from ie\n\n    self._model = Llama(\n        model_path=self.model_path.as_posix(),  # type: ignore\n        seed=self.seed,\n        n_ctx=self.n_ctx,\n        n_batch=self.n_batch,\n        chat_format=self.chat_format,\n        n_gpu_layers=self.n_gpu_layers,\n        verbose=self.verbose,\n        **self.extra_kwargs,\n    )\n\n    if self.structured_output:\n        self._logits_processor = self._prepare_structured_output(\n            self.structured_output\n        )\n\n    # NOTE: Here because of the custom `logging` interface used, since it will create the logging name\n    # out of the model name, which won't be available until the `Llama` instance is created.\n    super().load()\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM.generate","title":"generate(inputs, num_generations=1, max_new_tokens=128, frequency_penalty=0.0, presence_penalty=0.0, temperature=1.0, top_p=1.0, extra_generation_kwargs=None)","text":"

Generates num_generations responses for the given input using the Llama model.

Parameters:

Name Type Description Default inputs List[FormattedInput]

a list of inputs in chat format to generate responses for.

required num_generations int

the number of generations to create per input. Defaults to 1.

1 max_new_tokens int

the maximum number of new tokens that the model will generate. Defaults to 128.

128 frequency_penalty float

the repetition penalty to use for the generation. Defaults to 0.0.

0.0 presence_penalty float

the presence penalty to use for the generation. Defaults to 0.0.

0.0 temperature float

the temperature to use for the generation. Defaults to 1.0.

1.0 top_p float

the top-p value to use for the generation. Defaults to 1.0.

1.0 extra_generation_kwargs Optional[Dict[str, Any]]

dictionary with additional arguments to be passed to the create_chat_completion method. Reference at https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion

None

Returns:

Type Description List[GenerateOutput]

A list of lists of strings containing the generated responses for each input.

Source code in src/distilabel/models/llms/llamacpp.py
@validate_call\ndef generate(  # type: ignore\n    self,\n    inputs: List[FormattedInput],\n    num_generations: int = 1,\n    max_new_tokens: int = 128,\n    frequency_penalty: float = 0.0,\n    presence_penalty: float = 0.0,\n    temperature: float = 1.0,\n    top_p: float = 1.0,\n    extra_generation_kwargs: Optional[Dict[str, Any]] = None,\n) -> List[GenerateOutput]:\n    \"\"\"Generates `num_generations` responses for the given input using the Llama model.\n\n    Args:\n        inputs: a list of inputs in chat format to generate responses for.\n        num_generations: the number of generations to create per input. Defaults to\n            `1`.\n        max_new_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `128`.\n        frequency_penalty: the repetition penalty to use for the generation. Defaults\n            to `0.0`.\n        presence_penalty: the presence penalty to use for the generation. Defaults to\n            `0.0`.\n        temperature: the temperature to use for the generation. Defaults to `0.1`.\n        top_p: the top-p value to use for the generation. Defaults to `1.0`.\n        extra_generation_kwargs: dictionary with additional arguments to be passed to\n            the `create_chat_completion` method. Reference at\n            https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n    structured_output = None\n    batch_outputs = []\n    for input in inputs:\n        if isinstance(input, tuple):\n            input, structured_output = input\n        elif self.structured_output:\n            structured_output = self.structured_output\n\n        outputs = []\n        for _ in range(num_generations):\n            # NOTE(plaguss): There seems to be a bug in how the logits processor\n            # is used. 
Basically it consumes the FSM internally, and it isn't reinitialized\n            # after each generation, so subsequent calls yield nothing. This is a workaround\n            # until is fixed in the `llama_cpp` or `outlines` libraries.\n            if structured_output:\n                self._logits_processor = self._prepare_structured_output(\n                    structured_output\n                )\n            chat_completions: \"CreateChatCompletionResponse\" = (\n                self._model.create_chat_completion(  # type: ignore\n                    messages=input,  # type: ignore\n                    max_tokens=max_new_tokens,\n                    frequency_penalty=frequency_penalty,\n                    presence_penalty=presence_penalty,\n                    temperature=temperature,\n                    top_p=top_p,\n                    logits_processor=self._logits_processor,\n                    **(extra_generation_kwargs or {}),\n                )\n            )\n            outputs.append(chat_completions[\"choices\"][0][\"message\"][\"content\"])\n        batch_outputs.append(outputs)\n    return batch_outputs\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.LlamaCppLLM._prepare_structured_output","title":"_prepare_structured_output(structured_output=None)","text":"

Creates the appropriate function to filter tokens to generate structured outputs.

Parameters:

Name Type Description Default structured_output Optional[OutlinesStructuredOutputType]

the configuration dict to prepare the structured output.

None

Returns:

Type Description Union[LogitsProcessorList, None]

The callable that will be used to guide the generation of the model.

Source code in src/distilabel/models/llms/llamacpp.py
def _prepare_structured_output(\n    self, structured_output: Optional[OutlinesStructuredOutputType] = None\n) -> Union[\"LogitsProcessorList\", None]:\n    \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n    Args:\n        structured_output: the configuration dict to prepare the structured output.\n\n    Returns:\n        The callable that will be used to guide the generation of the model.\n    \"\"\"\n    from distilabel.steps.tasks.structured_outputs.outlines import (\n        prepare_guided_output,\n    )\n\n    result = prepare_guided_output(structured_output, \"llamacpp\", self._model)\n    if (schema := result.get(\"schema\")) and self.structured_output:\n        self.structured_output[\"schema\"] = schema\n    return result[\"processor\"]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MistralLLM","title":"MistralLLM","text":"

Bases: AsyncLLM

Mistral LLM implementation running the async API client.

Attributes:

Name Type Description model str

the model name to use for the LLM e.g. \"mistral-tiny\", \"mistral-large\", etc.

endpoint str

the endpoint to use for the Mistral API. Defaults to \"https://api.mistral.ai\".

api_key Optional[RuntimeParameter[SecretStr]]

the API key to authenticate the requests to the Mistral API. Defaults to None which means that the value set for the environment variable MISTRAL_API_KEY will be used, or None if not set.

max_retries RuntimeParameter[int]

the maximum number of retries to attempt when a request fails. Defaults to 6.

timeout RuntimeParameter[int]

the maximum time in seconds to wait for a response. Defaults to 120.

max_concurrent_requests RuntimeParameter[int]

the maximum number of concurrent requests to send. Defaults to 64.

structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]]

a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

_api_key_env_var str

the name of the environment variable to use for the API key. It is meant to be used internally.

_aclient Optional[Mistral]

the Mistral client to use for the Mistral API. It is meant to be used internally. Set in the load method.

Runtime parameters
  • api_key: the API key to authenticate the requests to the Mistral API.
  • max_retries: the maximum number of retries to attempt when a request fails. Defaults to 6.
  • timeout: the maximum time in seconds to wait for a response. Defaults to 120.
  • max_concurrent_requests: the maximum number of concurrent requests to send. Defaults to 64.

Examples:

Generate text:

from distilabel.models.llms import MistralLLM\n\nllm = MistralLLM(model=\"open-mixtral-8x22b\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pydantic import BaseModel\nfrom distilabel.models.llms import MistralLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = MistralLLM(\n    model=\"open-mixtral-8x22b\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
Source code in src/distilabel/models/llms/mistral.py
class MistralLLM(AsyncLLM):\n    \"\"\"Mistral LLM implementation running the async API client.\n\n    Attributes:\n        model: the model name to use for the LLM e.g. \"mistral-tiny\", \"mistral-large\", etc.\n        endpoint: the endpoint to use for the Mistral API. Defaults to \"https://api.mistral.ai\".\n        api_key: the API key to authenticate the requests to the Mistral API. Defaults to `None` which\n            means that the value set for the environment variable `OPENAI_API_KEY` will be used, or\n            `None` if not set.\n        max_retries: the maximum number of retries to attempt when a request fails. Defaults to `5`.\n        timeout: the maximum time in seconds to wait for a response. Defaults to `120`.\n        max_concurrent_requests: the maximum number of concurrent requests to send. Defaults\n            to `64`.\n        structured_output: a dictionary containing the structured output configuration configuration\n            using `instructor`. You can take a look at the dictionary structure in\n            `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n        _api_key_env_var: the name of the environment variable to use for the API key. It is meant to\n            be used internally.\n        _aclient: the `Mistral` to use for the Mistral API. It is meant to be used internally.\n            Set in the `load` method.\n\n    Runtime parameters:\n        - `api_key`: the API key to authenticate the requests to the Mistral API.\n        - `max_retries`: the maximum number of retries to attempt when a request fails.\n            Defaults to `5`.\n        - `timeout`: the maximum time in seconds to wait for a response. 
Defaults to `120`.\n        - `max_concurrent_requests`: the maximum number of concurrent requests to send.\n            Defaults to `64`.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import MistralLLM\n\n        llm = MistralLLM(model=\"open-mixtral-8x22b\")\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\n        Generate structured data:\n\n        ```python\n        from pydantic import BaseModel\n        from distilabel.models.llms import MistralLLM\n\n        class User(BaseModel):\n            name: str\n            last_name: str\n            id: int\n\n        llm = MistralLLM(\n            model=\"open-mixtral-8x22b\",\n            api_key=\"api.key\",\n            structured_output={\"schema\": User}\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n        ```\n    \"\"\"\n\n    model: str\n    endpoint: str = \"https://api.mistral.ai\"\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(_MISTRALAI_API_KEY_ENV_VAR_NAME),\n        description=\"The API key to authenticate the requests to the Mistral API.\",\n    )\n    max_retries: RuntimeParameter[int] = Field(\n        default=6,\n        description=\"The maximum number of times to retry the request to the API before\"\n        \" failing.\",\n    )\n    timeout: RuntimeParameter[int] = Field(\n        default=120,\n        description=\"The maximum time in seconds to wait for a response from the API.\",\n    )\n    max_concurrent_requests: RuntimeParameter[int] = Field(\n        default=64, description=\"The maximum number of concurrent requests to send.\"\n    )\n    structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n        Field(\n            
default=None,\n            description=\"The structured output format to use across all the generations.\",\n        )\n    )\n\n    _num_generations_param_supported = False\n\n    _api_key_env_var: str = PrivateAttr(_MISTRALAI_API_KEY_ENV_VAR_NAME)\n    _aclient: Optional[\"Mistral\"] = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"Loads the `Mistral` client to benefit from async requests.\"\"\"\n        super().load()\n\n        try:\n            from mistralai import Mistral\n        except ImportError as ie:\n            raise ImportError(\n                \"MistralAI Python client is not installed. Please install it using\"\n                \" `pip install mistralai`.\"\n            ) from ie\n\n        if self.api_key is None:\n            raise ValueError(\n                f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n                f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n            )\n\n        self._aclient = Mistral(\n            api_key=self.api_key.get_secret_value(),\n            endpoint=self.endpoint,\n            max_retries=self.max_retries,  # type: ignore\n            timeout=self.timeout,  # type: ignore\n            max_concurrent_requests=self.max_concurrent_requests,  # type: ignore\n        )\n\n        if self.structured_output:\n            result = self._prepare_structured_output(\n                structured_output=self.structured_output,\n                client=self._aclient,\n                framework=\"mistral\",\n            )\n            self._aclient = result.get(\"client\")  # type: ignore\n            if structured_output := result.get(\"structured_output\"):\n                self.structured_output = structured_output\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    # TODO: add `num_generations` parameter once Mistral 
client allows `n` parameter\n    @validate_call\n    async def agenerate(  # type: ignore\n        self,\n        input: FormattedInput,\n        max_new_tokens: Optional[int] = None,\n        temperature: Optional[float] = None,\n        top_p: Optional[float] = None,\n    ) -> GenerateOutput:\n        \"\"\"Generates `num_generations` responses for the given input using the MistralAI async\n        client.\n\n        Args:\n            input: a single input in chat format to generate responses for.\n            max_new_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `128`.\n            temperature: the temperature to use for the generation. Defaults to `0.1`.\n            top_p: the top-p value to use for the generation. Defaults to `1.0`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n        structured_output = None\n        if isinstance(input, tuple):\n            input, structured_output = input\n            result = self._prepare_structured_output(\n                structured_output=structured_output,\n                client=self._aclient,\n                framework=\"mistral\",\n            )\n            self._aclient = result.get(\"client\")\n\n        if structured_output is None and self.structured_output is not None:\n            structured_output = self.structured_output\n\n        kwargs = {\n            \"messages\": input,  # type: ignore\n            \"model\": self.model,\n            \"max_tokens\": max_new_tokens,\n            \"temperature\": temperature,\n            \"top_p\": top_p,\n        }\n        generations = []\n        if structured_output:\n            kwargs = self._prepare_kwargs(kwargs, structured_output)\n            # TODO:\u00a0This should work just with the _aclient.chat method, but it's not working.\n            # We need to check instructor and see if we can create a PR.\n            
completion = await self._aclient.chat.completions.create(**kwargs)  # type: ignore\n        else:\n            # completion = await self._aclient.chat(**kwargs)  # type: ignore\n            completion = await self._aclient.chat.complete_async(**kwargs)  # type: ignore\n\n        if structured_output:\n            generations.append(completion.model_dump_json())\n            return generations\n\n        for choice in completion.choices:\n            if (content := choice.message.content) is None:\n                self._logger.warning(  # type: ignore\n                    f\"Received no response using MistralAI client (model: '{self.model}').\"\n                    f\" Finish reason was: {choice.finish_reason}\"\n                )\n            generations.append(content)\n        return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MistralLLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MistralLLM.load","title":"load()","text":"

Loads the Mistral client to benefit from async requests.

Source code in src/distilabel/models/llms/mistral.py
def load(self) -> None:\n    \"\"\"Loads the `Mistral` client to benefit from async requests.\"\"\"\n    super().load()\n\n    try:\n        from mistralai import Mistral\n    except ImportError as ie:\n        raise ImportError(\n            \"MistralAI Python client is not installed. Please install it using\"\n            \" `pip install mistralai`.\"\n        ) from ie\n\n    if self.api_key is None:\n        raise ValueError(\n            f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n            f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n        )\n\n    self._aclient = Mistral(\n        api_key=self.api_key.get_secret_value(),\n        endpoint=self.endpoint,\n        max_retries=self.max_retries,  # type: ignore\n        timeout=self.timeout,  # type: ignore\n        max_concurrent_requests=self.max_concurrent_requests,  # type: ignore\n    )\n\n    if self.structured_output:\n        result = self._prepare_structured_output(\n            structured_output=self.structured_output,\n            client=self._aclient,\n            framework=\"mistral\",\n        )\n        self._aclient = result.get(\"client\")  # type: ignore\n        if structured_output := result.get(\"structured_output\"):\n            self.structured_output = structured_output\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MistralLLM.agenerate","title":"agenerate(input, max_new_tokens=None, temperature=None, top_p=None) async","text":"

Generates num_generations responses for the given input using the MistralAI async client.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate responses for.

required max_new_tokens Optional[int]

the maximum number of new tokens that the model will generate. Defaults to 128.

None temperature Optional[float]

the temperature to use for the generation. Defaults to 0.1.

None top_p Optional[float]

the top-p value to use for the generation. Defaults to 1.0.

None

Returns:

Type Description GenerateOutput

A list of lists of strings containing the generated responses for each input.

Source code in src/distilabel/models/llms/mistral.py
@validate_call\nasync def agenerate(  # type: ignore\n    self,\n    input: FormattedInput,\n    max_new_tokens: Optional[int] = None,\n    temperature: Optional[float] = None,\n    top_p: Optional[float] = None,\n) -> GenerateOutput:\n    \"\"\"Generates `num_generations` responses for the given input using the MistralAI async\n    client.\n\n    Args:\n        input: a single input in chat format to generate responses for.\n        max_new_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `128`.\n        temperature: the temperature to use for the generation. Defaults to `0.1`.\n        top_p: the top-p value to use for the generation. Defaults to `1.0`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n    structured_output = None\n    if isinstance(input, tuple):\n        input, structured_output = input\n        result = self._prepare_structured_output(\n            structured_output=structured_output,\n            client=self._aclient,\n            framework=\"mistral\",\n        )\n        self._aclient = result.get(\"client\")\n\n    if structured_output is None and self.structured_output is not None:\n        structured_output = self.structured_output\n\n    kwargs = {\n        \"messages\": input,  # type: ignore\n        \"model\": self.model,\n        \"max_tokens\": max_new_tokens,\n        \"temperature\": temperature,\n        \"top_p\": top_p,\n    }\n    generations = []\n    if structured_output:\n        kwargs = self._prepare_kwargs(kwargs, structured_output)\n        # TODO:\u00a0This should work just with the _aclient.chat method, but it's not working.\n        # We need to check instructor and see if we can create a PR.\n        completion = await self._aclient.chat.completions.create(**kwargs)  # type: ignore\n    else:\n        # completion = await self._aclient.chat(**kwargs)  # type: ignore\n        completion = await 
self._aclient.chat.complete_async(**kwargs)  # type: ignore\n\n    if structured_output:\n        generations.append(completion.model_dump_json())\n        return generations\n\n    for choice in completion.choices:\n        if (content := choice.message.content) is None:\n            self._logger.warning(  # type: ignore\n                f\"Received no response using MistralAI client (model: '{self.model}').\"\n                f\" Finish reason was: {choice.finish_reason}\"\n            )\n        generations.append(content)\n    return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM","title":"MixtureOfAgentsLLM","text":"

Bases: AsyncLLM

Mixture-of-Agents implementation.

An LLM class that leverages the collective strengths of multiple LLMs to generate a response, as described in the \"Mixture-of-Agents Enhances Large Language Model Capabilities\" paper. There is a list of LLMs proposing/generating outputs that LLMs from the next round/layer can use as auxiliary information. Finally, there is an LLM that aggregates the outputs to generate the final response.

Attributes:

Name Type Description aggregator_llm LLM

The LLM that aggregates the outputs of the proposer LLMs.

proposers_llms List[AsyncLLM]

The list of LLMs that propose outputs to be aggregated.

rounds int

The number of layers or rounds for which the proposers_llms will generate outputs. Defaults to 1.

References
  • Mixture-of-Agents Enhances Large Language Model Capabilities

Examples:

Generate text:

from distilabel.models.llms import MixtureOfAgentsLLM, InferenceEndpointsLLM\n\nllm = MixtureOfAgentsLLM(\n    aggregator_llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    ),\n    proposers_llms=[\n        InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        ),\n        InferenceEndpointsLLM(\n            model_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n            tokenizer_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n        ),\n        InferenceEndpointsLLM(\n            model_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n            tokenizer_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n        ),\n    ],\n    rounds=2,\n)\n\nllm.load()\n\noutput = llm.generate_outputs(\n    inputs=[\n        [\n            {\n                \"role\": \"user\",\n                \"content\": \"My favorite witty review of The Rings of Power series is this: Input:\",\n            }\n        ]\n    ]\n)\n
Source code in src/distilabel/models/llms/moa.py
class MixtureOfAgentsLLM(AsyncLLM):\n    \"\"\"`Mixture-of-Agents` implementation.\n\n    An `LLM` class that leverages `LLM`s collective strenghts to generate a response,\n    as described in the \"Mixture-of-Agents Enhances Large Language model Capabilities\"\n    paper. There is a list of `LLM`s proposing/generating outputs that `LLM`s from the next\n    round/layer can use as auxiliary information. Finally, there is an `LLM` that aggregates\n    the outputs to generate the final response.\n\n    Attributes:\n        aggregator_llm: The `LLM` that aggregates the outputs of the proposer `LLM`s.\n        proposers_llms: The list of `LLM`s that propose outputs to be aggregated.\n        rounds: The number of layers or rounds that the `proposers_llms` will generate\n            outputs. Defaults to `1`.\n\n    References:\n        - [Mixture-of-Agents Enhances Large Language Model Capabilities](https://arxiv.org/abs/2406.04692)\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import MixtureOfAgentsLLM, InferenceEndpointsLLM\n\n        llm = MixtureOfAgentsLLM(\n            aggregator_llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n            ),\n            proposers_llms=[\n                InferenceEndpointsLLM(\n                    model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n                    tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n                ),\n                InferenceEndpointsLLM(\n                    model_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n                    tokenizer_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n                ),\n                InferenceEndpointsLLM(\n                    model_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n                    tokenizer_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n          
      ),\n            ],\n            rounds=2,\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(\n            inputs=[\n                [\n                    {\n                        \"role\": \"user\",\n                        \"content\": \"My favorite witty review of The Rings of Power series is this: Input:\",\n                    }\n                ]\n            ]\n        )\n        ```\n    \"\"\"\n\n    aggregator_llm: LLM\n    proposers_llms: List[AsyncLLM] = Field(default_factory=list)\n    rounds: int = 1\n\n    @property\n    def runtime_parameters_names(self) -> \"RuntimeParametersNames\":\n        \"\"\"Returns the runtime parameters of the `LLM`, which are a combination of the\n        `RuntimeParameter`s of the `LLM`, the `aggregator_llm` and the `proposers_llms`.\n\n        Returns:\n            The runtime parameters of the `LLM`.\n        \"\"\"\n        runtime_parameters_names = super().runtime_parameters_names\n        del runtime_parameters_names[\"generation_kwargs\"]\n        return runtime_parameters_names\n\n    def load(self) -> None:\n        \"\"\"Loads all the `LLM`s in the `MixtureOfAgents`.\"\"\"\n        super().load()\n\n        for llm in self.proposers_llms:\n            self._logger.debug(f\"Loading proposer LLM in MoA: {llm}\")  # type: ignore\n            llm.load()\n\n        self._logger.debug(f\"Loading aggregator LLM in MoA: {self.aggregator_llm}\")  # type: ignore\n        self.aggregator_llm.load()\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the aggregated model name.\"\"\"\n        return f\"moa-{self.aggregator_llm.model_name}-{'-'.join([llm.model_name for llm in self.proposers_llms])}\"\n\n    def get_generation_kwargs(self) -> Dict[str, Any]:\n        \"\"\"Returns the generation kwargs of the `MixtureOfAgents` as a dictionary.\n\n        Returns:\n            The generation kwargs of the `MixtureOfAgents`.\n        \"\"\"\n        return {\n            
\"aggregator_llm\": self.aggregator_llm.get_generation_kwargs(),\n            \"proposers_llms\": [\n                llm.get_generation_kwargs() for llm in self.proposers_llms\n            ],\n        }\n\n    # `abstractmethod`, had to be implemented but not used\n    async def agenerate(\n        self, input: \"FormattedInput\", num_generations: int = 1, **kwargs: Any\n    ) -> List[Union[str, None]]:\n        raise NotImplementedError(\n            \"`agenerate` method is not implemented for `MixtureOfAgents`\"\n        )\n\n    def _build_moa_system_prompt(self, prev_outputs: List[str]) -> str:\n        \"\"\"Builds the Mixture-of-Agents system prompt.\n\n        Args:\n            prev_outputs: The list of previous outputs to use as references.\n\n        Returns:\n            The Mixture-of-Agents system prompt.\n        \"\"\"\n        moa_system_prompt = MOA_SYSTEM_PROMPT\n        for i, prev_output in enumerate(prev_outputs):\n            if prev_output is not None:\n                moa_system_prompt += f\"\\n{i + 1}. 
{prev_output}\"\n        return moa_system_prompt\n\n    def _inject_moa_system_prompt(\n        self, input: \"StandardInput\", prev_outputs: List[str]\n    ) -> \"StandardInput\":\n        \"\"\"Injects the Mixture-of-Agents system prompt into the input.\n\n        Args:\n            input: The input to inject the system prompt into.\n            prev_outputs: The list of previous outputs to use as references.\n\n        Returns:\n            The input with the Mixture-of-Agents system prompt injected.\n        \"\"\"\n        if len(prev_outputs) == 0:\n            return input\n\n        moa_system_prompt = self._build_moa_system_prompt(prev_outputs)\n\n        system = next((item for item in input if item[\"role\"] == \"system\"), None)\n        if system:\n            original_system_prompt = system[\"content\"]\n            system[\"content\"] = f\"{moa_system_prompt}\\n\\n{original_system_prompt}\"\n        else:\n            input.insert(0, {\"role\": \"system\", \"content\": moa_system_prompt})\n\n        return input\n\n    async def _agenerate(\n        self,\n        inputs: List[\"FormattedInput\"],\n        num_generations: int = 1,\n        **kwargs: Any,\n    ) -> List[\"GenerateOutput\"]:\n        \"\"\"Internal function to concurrently generate responses for a list of inputs.\n\n        Args:\n            inputs: the list of inputs to generate responses for.\n            num_generations: the number of generations to generate per input.\n            **kwargs: the additional kwargs to be used for the generation.\n\n        Returns:\n            A list containing the generations for each input.\n        \"\"\"\n        aggregator_llm_kwargs: Dict[str, Any] = kwargs.get(\"aggregator_llm\", {})\n        proposers_llms_kwargs: List[Dict[str, Any]] = kwargs.get(\n            \"proposers_llms\", [{}] * len(self.proposers_llms)\n        )\n\n        prev_outputs = []\n        for round in range(self.rounds):\n            self._logger.debug(f\"Generating 
round {round + 1}/{self.rounds} in MoA\")  # type: ignore\n\n            # Generate `num_generations` with each proposer LLM for each input\n            tasks = [\n                asyncio.create_task(\n                    llm._agenerate(\n                        inputs=[\n                            self._inject_moa_system_prompt(\n                                cast(\"StandardInput\", input), prev_input_outputs\n                            )\n                            for input, prev_input_outputs in itertools.zip_longest(\n                                inputs, prev_outputs, fillvalue=[]\n                            )\n                        ],\n                        num_generations=1,\n                        **generation_kwargs,\n                    )\n                )\n                for llm, generation_kwargs in zip(\n                    self.proposers_llms, proposers_llms_kwargs\n                )\n            ]\n\n            # Group generations per input\n            outputs: List[List[\"GenerateOutput\"]] = await asyncio.gather(*tasks)\n            prev_outputs = [\n                list(itertools.chain(*input_outputs)) for input_outputs in zip(*outputs)\n            ]\n\n        self._logger.debug(\"Aggregating outputs in MoA\")  # type: ignore\n        if isinstance(self.aggregator_llm, AsyncLLM):\n            return await self.aggregator_llm._agenerate(\n                inputs=[\n                    self._inject_moa_system_prompt(\n                        cast(\"StandardInput\", input), prev_input_outputs\n                    )\n                    for input, prev_input_outputs in zip(inputs, prev_outputs)\n                ],\n                num_generations=num_generations,\n                **aggregator_llm_kwargs,\n            )\n\n        return self.aggregator_llm.generate(\n            inputs=[\n                self._inject_moa_system_prompt(\n                    cast(\"StandardInput\", input), prev_input_outputs\n                )\n       
         for input, prev_input_outputs in zip(inputs, prev_outputs)\n            ],\n            num_generations=num_generations,\n            **aggregator_llm_kwargs,\n        )\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM.runtime_parameters_names","title":"runtime_parameters_names: RuntimeParametersNames property","text":"

Returns the runtime parameters of the LLM, which are a combination of the RuntimeParameters of the LLM, the aggregator_llm and the proposers_llms.

Returns:

Type Description RuntimeParametersNames

The runtime parameters of the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM.model_name","title":"model_name: str property","text":"

Returns the aggregated model name.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM.load","title":"load()","text":"

Loads all the LLMs in the MixtureOfAgents.

Source code in src/distilabel/models/llms/moa.py
def load(self) -> None:\n    \"\"\"Loads all the `LLM`s in the `MixtureOfAgents`.\"\"\"\n    super().load()\n\n    for llm in self.proposers_llms:\n        self._logger.debug(f\"Loading proposer LLM in MoA: {llm}\")  # type: ignore\n        llm.load()\n\n    self._logger.debug(f\"Loading aggregator LLM in MoA: {self.aggregator_llm}\")  # type: ignore\n    self.aggregator_llm.load()\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM.get_generation_kwargs","title":"get_generation_kwargs()","text":"

Returns the generation kwargs of the MixtureOfAgents as a dictionary.

Returns:

Type Description Dict[str, Any]

The generation kwargs of the MixtureOfAgents.

Source code in src/distilabel/models/llms/moa.py
def get_generation_kwargs(self) -> Dict[str, Any]:\n    \"\"\"Returns the generation kwargs of the `MixtureOfAgents` as a dictionary.\n\n    Returns:\n        The generation kwargs of the `MixtureOfAgents`.\n    \"\"\"\n    return {\n        \"aggregator_llm\": self.aggregator_llm.get_generation_kwargs(),\n        \"proposers_llms\": [\n            llm.get_generation_kwargs() for llm in self.proposers_llms\n        ],\n    }\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM._build_moa_system_prompt","title":"_build_moa_system_prompt(prev_outputs)","text":"

Builds the Mixture-of-Agents system prompt.

Parameters:

Name Type Description Default prev_outputs List[str]

The list of previous outputs to use as references.

required

Returns:

Type Description str

The Mixture-of-Agents system prompt.

Source code in src/distilabel/models/llms/moa.py
def _build_moa_system_prompt(self, prev_outputs: List[str]) -> str:\n    \"\"\"Builds the Mixture-of-Agents system prompt.\n\n    Args:\n        prev_outputs: The list of previous outputs to use as references.\n\n    Returns:\n        The Mixture-of-Agents system prompt.\n    \"\"\"\n    moa_system_prompt = MOA_SYSTEM_PROMPT\n    for i, prev_output in enumerate(prev_outputs):\n        if prev_output is not None:\n            moa_system_prompt += f\"\\n{i + 1}. {prev_output}\"\n    return moa_system_prompt\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM._inject_moa_system_prompt","title":"_inject_moa_system_prompt(input, prev_outputs)","text":"

Injects the Mixture-of-Agents system prompt into the input.

Parameters:

Name Type Description Default input StandardInput

The input to inject the system prompt into.

required prev_outputs List[str]

The list of previous outputs to use as references.

required

Returns:

Type Description StandardInput

The input with the Mixture-of-Agents system prompt injected.

Source code in src/distilabel/models/llms/moa.py
def _inject_moa_system_prompt(\n    self, input: \"StandardInput\", prev_outputs: List[str]\n) -> \"StandardInput\":\n    \"\"\"Injects the Mixture-of-Agents system prompt into the input.\n\n    Args:\n        input: The input to inject the system prompt into.\n        prev_outputs: The list of previous outputs to use as references.\n\n    Returns:\n        The input with the Mixture-of-Agents system prompt injected.\n    \"\"\"\n    if len(prev_outputs) == 0:\n        return input\n\n    moa_system_prompt = self._build_moa_system_prompt(prev_outputs)\n\n    system = next((item for item in input if item[\"role\"] == \"system\"), None)\n    if system:\n        original_system_prompt = system[\"content\"]\n        system[\"content\"] = f\"{moa_system_prompt}\\n\\n{original_system_prompt}\"\n    else:\n        input.insert(0, {\"role\": \"system\", \"content\": moa_system_prompt})\n\n    return input\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.MixtureOfAgentsLLM._agenerate","title":"_agenerate(inputs, num_generations=1, **kwargs) async","text":"

Internal function to concurrently generate responses for a list of inputs.

Parameters:

Name Type Description Default inputs List[FormattedInput]

the list of inputs to generate responses for.

required num_generations int

the number of generations to generate per input.

1 **kwargs Any

the additional kwargs to be used for the generation.

{}

Returns:

Type Description List[GenerateOutput]

A list containing the generations for each input.

Source code in src/distilabel/models/llms/moa.py
async def _agenerate(\n    self,\n    inputs: List[\"FormattedInput\"],\n    num_generations: int = 1,\n    **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n    \"\"\"Internal function to concurrently generate responses for a list of inputs.\n\n    Args:\n        inputs: the list of inputs to generate responses for.\n        num_generations: the number of generations to generate per input.\n        **kwargs: the additional kwargs to be used for the generation.\n\n    Returns:\n        A list containing the generations for each input.\n    \"\"\"\n    aggregator_llm_kwargs: Dict[str, Any] = kwargs.get(\"aggregator_llm\", {})\n    proposers_llms_kwargs: List[Dict[str, Any]] = kwargs.get(\n        \"proposers_llms\", [{}] * len(self.proposers_llms)\n    )\n\n    prev_outputs = []\n    for round in range(self.rounds):\n        self._logger.debug(f\"Generating round {round + 1}/{self.rounds} in MoA\")  # type: ignore\n\n        # Generate `num_generations` with each proposer LLM for each input\n        tasks = [\n            asyncio.create_task(\n                llm._agenerate(\n                    inputs=[\n                        self._inject_moa_system_prompt(\n                            cast(\"StandardInput\", input), prev_input_outputs\n                        )\n                        for input, prev_input_outputs in itertools.zip_longest(\n                            inputs, prev_outputs, fillvalue=[]\n                        )\n                    ],\n                    num_generations=1,\n                    **generation_kwargs,\n                )\n            )\n            for llm, generation_kwargs in zip(\n                self.proposers_llms, proposers_llms_kwargs\n            )\n        ]\n\n        # Group generations per input\n        outputs: List[List[\"GenerateOutput\"]] = await asyncio.gather(*tasks)\n        prev_outputs = [\n            list(itertools.chain(*input_outputs)) for input_outputs in zip(*outputs)\n        ]\n\n    
self._logger.debug(\"Aggregating outputs in MoA\")  # type: ignore\n    if isinstance(self.aggregator_llm, AsyncLLM):\n        return await self.aggregator_llm._agenerate(\n            inputs=[\n                self._inject_moa_system_prompt(\n                    cast(\"StandardInput\", input), prev_input_outputs\n                )\n                for input, prev_input_outputs in zip(inputs, prev_outputs)\n            ],\n            num_generations=num_generations,\n            **aggregator_llm_kwargs,\n        )\n\n    return self.aggregator_llm.generate(\n        inputs=[\n            self._inject_moa_system_prompt(\n                cast(\"StandardInput\", input), prev_input_outputs\n            )\n            for input, prev_input_outputs in zip(inputs, prev_outputs)\n        ],\n        num_generations=num_generations,\n        **aggregator_llm_kwargs,\n    )\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OllamaLLM","title":"OllamaLLM","text":"

Bases: AsyncLLM

Ollama LLM implementation running the Async API client.

Attributes:

Name Type Description model str

the model name to use for the LLM e.g. \"notus\".

host Optional[RuntimeParameter[str]]

the Ollama server host.

timeout RuntimeParameter[int]

the timeout for the LLM. Defaults to 120.

_aclient Optional[AsyncClient]

the AsyncClient to use for the Ollama API. It is meant to be used internally. Set in the load method.

Runtime parameters
  • host: the Ollama server host.
  • timeout: the client timeout for the Ollama API. Defaults to 120.

Examples:

Generate text:

from distilabel.models.llms import OllamaLLM\n\nllm = OllamaLLM(model=\"llama3\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
Source code in src/distilabel/models/llms/ollama.py
class OllamaLLM(AsyncLLM):\n    \"\"\"Ollama LLM implementation running the Async API client.\n\n    Attributes:\n        model: the model name to use for the LLM e.g. \"notus\".\n        host: the Ollama server host.\n        timeout: the timeout for the LLM. Defaults to `120`.\n        _aclient: the `AsyncClient` to use for the Ollama API. It is meant to be used internally.\n            Set in the `load` method.\n\n    Runtime parameters:\n        - `host`: the Ollama server host.\n        - `timeout`: the client timeout for the Ollama API. Defaults to `120`.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import OllamaLLM\n\n        llm = OllamaLLM(model=\"llama3\")\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n    \"\"\"\n\n    model: str\n    host: Optional[RuntimeParameter[str]] = Field(\n        default=None, description=\"The host of the Ollama API.\"\n    )\n    timeout: RuntimeParameter[int] = Field(\n        default=120, description=\"The timeout for the Ollama API.\"\n    )\n    follow_redirects: bool = True\n    structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n        Field(\n            default=None,\n            description=\"The structured output format to use across all the generations.\",\n        )\n    )\n\n    _num_generations_param_supported = False\n\n    _aclient: Optional[\"AsyncClient\"] = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"Loads the `AsyncClient` to use Ollama async API.\"\"\"\n        super().load()\n\n        try:\n            from ollama import AsyncClient\n\n            self._aclient = AsyncClient(\n                host=self.host,\n                timeout=self.timeout,\n                follow_redirects=self.follow_redirects,\n            )\n        except ImportError as e:\n            raise ImportError(\n        
        \"Ollama Python client is not installed. Please install it using\"\n                \" `pip install ollama`.\"\n            ) from e\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    @validate_call\n    async def agenerate(  # type: ignore\n        self,\n        input: StandardInput,\n        format: Literal[\"\", \"json\"] = \"\",\n        # TODO: include relevant options from `Options` in `agenerate` method.\n        options: Union[Options, None] = None,\n        keep_alive: Union[bool, None] = None,\n    ) -> GenerateOutput:\n        \"\"\"\n        Generates a response asynchronously, using the [Ollama Async API definition](https://github.com/ollama/ollama-python).\n\n        Args:\n            input: the input to use for the generation.\n            format: the format to use for the generation. Defaults to `\"\"`.\n            options: the options to use for the generation. Defaults to `None`.\n            keep_alive: whether to keep the connection alive. Defaults to `None`.\n\n        Returns:\n            A list of strings as completion for the given input.\n        \"\"\"\n        text = None\n        try:\n            completion: Dict[str, Any] = await self._aclient.chat(  # type: ignore\n                model=self.model,\n                messages=input,  # type: ignore\n                stream=False,\n                format=format,\n                options=options,\n                keep_alive=keep_alive,\n            )\n            text = completion[\"message\"][\"content\"]\n        except Exception as e:\n            self._logger.warning(  # type: ignore\n                f\"\u26a0\ufe0f Received no response using Ollama client (model: '{self.model_name}').\"\n                f\" Finish reason was: {e}\"\n            )\n\n        return [text]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OllamaLLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OllamaLLM.load","title":"load()","text":"

Loads the AsyncClient to use Ollama async API.

Source code in src/distilabel/models/llms/ollama.py
def load(self) -> None:\n    \"\"\"Loads the `AsyncClient` to use Ollama async API.\"\"\"\n    super().load()\n\n    try:\n        from ollama import AsyncClient\n\n        self._aclient = AsyncClient(\n            host=self.host,\n            timeout=self.timeout,\n            follow_redirects=self.follow_redirects,\n        )\n    except ImportError as e:\n        raise ImportError(\n            \"Ollama Python client is not installed. Please install it using\"\n            \" `pip install ollama`.\"\n        ) from e\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OllamaLLM.agenerate","title":"agenerate(input, format='', options=None, keep_alive=None) async","text":"

Generates a response asynchronously using the Ollama async API, as defined by the ollama-python client (https://github.com/ollama/ollama-python).

Parameters:

Name Type Description Default input StandardInput

the input to use for the generation.

required format Literal['', 'json']

the format to use for the generation. Defaults to \"\".

'' options Union[Options, None]

the options to use for the generation. Defaults to None.

None keep_alive Union[bool, None]

whether to keep the connection alive. Defaults to None.

None

Returns:

Type Description GenerateOutput

A list of strings as completion for the given input.

Source code in src/distilabel/models/llms/ollama.py
@validate_call\nasync def agenerate(  # type: ignore\n    self,\n    input: StandardInput,\n    format: Literal[\"\", \"json\"] = \"\",\n    # TODO: include relevant options from `Options` in `agenerate` method.\n    options: Union[Options, None] = None,\n    keep_alive: Union[bool, None] = None,\n) -> GenerateOutput:\n    \"\"\"\n    Generates a response asynchronously, using the [Ollama Async API definition](https://github.com/ollama/ollama-python).\n\n    Args:\n        input: the input to use for the generation.\n        format: the format to use for the generation. Defaults to `\"\"`.\n        options: the options to use for the generation. Defaults to `None`.\n        keep_alive: whether to keep the connection alive. Defaults to `None`.\n\n    Returns:\n        A list of strings as completion for the given input.\n    \"\"\"\n    text = None\n    try:\n        completion: Dict[str, Any] = await self._aclient.chat(  # type: ignore\n            model=self.model,\n            messages=input,  # type: ignore\n            stream=False,\n            format=format,\n            options=options,\n            keep_alive=keep_alive,\n        )\n        text = completion[\"message\"][\"content\"]\n    except Exception as e:\n        self._logger.warning(  # type: ignore\n            f\"\u26a0\ufe0f Received no response using Ollama client (model: '{self.model_name}').\"\n            f\" Finish reason was: {e}\"\n        )\n\n    return [text]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM","title":"OpenAILLM","text":"

Bases: AsyncLLM

OpenAI LLM implementation running the async API client.

Attributes:

Name Type Description model str

the model name to use for the LLM, e.g. \"gpt-3.5-turbo\", \"gpt-4\", etc. Supported models are listed in the OpenAI text generation guide (https://platform.openai.com/docs/guides/text-generation).

base_url Optional[RuntimeParameter[str]]

the base URL to use for the OpenAI API requests. Defaults to None, which means that the value set for the environment variable OPENAI_BASE_URL will be used, or \"https://api.openai.com/v1\" if not set.

api_key Optional[RuntimeParameter[SecretStr]]

the API key to authenticate the requests to the OpenAI API. Defaults to None which means that the value set for the environment variable OPENAI_API_KEY will be used, or None if not set.

max_retries RuntimeParameter[int]

the maximum number of times to retry the request to the API before failing. Defaults to 6.

timeout RuntimeParameter[int]

the maximum time in seconds to wait for a response from the API. Defaults to 120.

structured_output Optional[RuntimeParameter[InstructorStructuredOutputType]]

a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

Runtime parameters
  • base_url: the base URL to use for the OpenAI API requests. Defaults to None.
  • api_key: the API key to authenticate the requests to the OpenAI API. Defaults to None.
  • max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6.
  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.
Icon

:simple-openai:

Examples:

Generate text:

from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate text from a custom endpoint following the OpenAI API:

from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(\n    model=\"prometheus-eval/prometheus-7b-v2.0\",\n    base_url=r\"http://localhost:8080/v1\"\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pydantic import BaseModel\nfrom distilabel.models.llms import OpenAILLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = OpenAILLM(\n    model=\"gpt-4-turbo\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n

Generate with Batch API (offline batch generation):

from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(\n    model=\"gpt-3.5-turbo\",\n    use_offline_batch_generation=True,\n    offline_batch_generation_block_until_done=5,  # poll for results every 5 seconds\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n# [['Hello! How can I assist you today?']]\n
Source code in src/distilabel/models/llms/openai.py
class OpenAILLM(AsyncLLM):\n    \"\"\"OpenAI LLM implementation running the async API client.\n\n    Attributes:\n        model: the model name to use for the LLM e.g. \"gpt-3.5-turbo\", \"gpt-4\", etc.\n            Supported models can be found [here](https://platform.openai.com/docs/guides/text-generation).\n        base_url: the base URL to use for the OpenAI API requests. Defaults to `None`, which\n            means that the value set for the environment variable `OPENAI_BASE_URL` will\n            be used, or \"https://api.openai.com/v1\" if not set.\n        api_key: the API key to authenticate the requests to the OpenAI API. Defaults to\n            `None` which means that the value set for the environment variable `OPENAI_API_KEY`\n            will be used, or `None` if not set.\n        max_retries: the maximum number of times to retry the request to the API before\n            failing. Defaults to `6`.\n        timeout: the maximum time in seconds to wait for a response from the API. Defaults\n            to `120`.\n        structured_output: a dictionary containing the structured output configuration configuration\n            using `instructor`. You can take a look at the dictionary structure in\n            `InstructorStructuredOutputType` from `distilabel.steps.tasks.structured_outputs.instructor`.\n\n    Runtime parameters:\n        - `base_url`: the base URL to use for the OpenAI API requests. Defaults to `None`.\n        - `api_key`: the API key to authenticate the requests to the OpenAI API. Defaults\n            to `None`.\n        - `max_retries`: the maximum number of times to retry the request to the API before\n            failing. Defaults to `6`.\n        - `timeout`: the maximum time in seconds to wait for a response from the API. 
Defaults\n            to `120`.\n\n    Icon:\n        `:simple-openai:`\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import OpenAILLM\n\n        llm = OpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Generate text from a custom endpoint following the OpenAI API:\n\n        ```python\n        from distilabel.models.llms import OpenAILLM\n\n        llm = OpenAILLM(\n            model=\"prometheus-eval/prometheus-7b-v2.0\",\n            base_url=r\"http://localhost:8080/v1\"\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Generate structured data:\n\n        ```python\n        from pydantic import BaseModel\n        from distilabel.models.llms import OpenAILLM\n\n        class User(BaseModel):\n            name: str\n            last_name: str\n            id: int\n\n        llm = OpenAILLM(\n            model=\"gpt-4-turbo\",\n            api_key=\"api.key\",\n            structured_output={\"schema\": User}\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n        ```\n\n        Generate with Batch API (offline batch generation):\n\n        ```python\n        from distilabel.models.llms import OpenAILLM\n\n        load = llm = OpenAILLM(\n            model=\"gpt-3.5-turbo\",\n            use_offline_batch_generation=True,\n            offline_batch_generation_block_until_done=5,  # poll for results every 5 seconds\n        )\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        # [['Hello! 
How can I assist you today?']]\n        ```\n    \"\"\"\n\n    model: str\n    base_url: Optional[RuntimeParameter[str]] = Field(\n        default_factory=lambda: os.getenv(\n            \"OPENAI_BASE_URL\", \"https://api.openai.com/v1\"\n        ),\n        description=\"The base URL to use for the OpenAI API requests.\",\n    )\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(_OPENAI_API_KEY_ENV_VAR_NAME),\n        description=\"The API key to authenticate the requests to the OpenAI API.\",\n    )\n    max_retries: RuntimeParameter[int] = Field(\n        default=6,\n        description=\"The maximum number of times to retry the request to the API before\"\n        \" failing.\",\n    )\n    timeout: RuntimeParameter[int] = Field(\n        default=120,\n        description=\"The maximum time in seconds to wait for a response from the API.\",\n    )\n    structured_output: Optional[RuntimeParameter[InstructorStructuredOutputType]] = (\n        Field(\n            default=None,\n            description=\"The structured output format to use across all the generations.\",\n        )\n    )\n\n    _api_key_env_var: str = PrivateAttr(_OPENAI_API_KEY_ENV_VAR_NAME)\n    _client: \"OpenAI\" = PrivateAttr(None)\n    _aclient: \"AsyncOpenAI\" = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Loads the `AsyncOpenAI` client to benefit from async requests.\"\"\"\n        super().load()\n\n        try:\n            from openai import AsyncOpenAI, OpenAI\n        except ImportError as ie:\n            raise ImportError(\n                \"OpenAI Python client is not installed. 
Please install it using\"\n                \" `pip install openai`.\"\n            ) from ie\n\n        if self.api_key is None:\n            raise ValueError(\n                f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n                f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n            )\n\n        self._client = OpenAI(\n            base_url=self.base_url,\n            api_key=self.api_key.get_secret_value(),\n            max_retries=self.max_retries,  # type: ignore\n            timeout=self.timeout,\n        )\n\n        self._aclient = AsyncOpenAI(\n            base_url=self.base_url,\n            api_key=self.api_key.get_secret_value(),\n            max_retries=self.max_retries,  # type: ignore\n            timeout=self.timeout,\n        )\n\n        if self.structured_output:\n            result = self._prepare_structured_output(\n                structured_output=self.structured_output,\n                client=self._aclient,\n                framework=\"openai\",\n            )\n            self._aclient = result.get(\"client\")  # type: ignore\n            if structured_output := result.get(\"structured_output\"):\n                self.structured_output = structured_output\n\n    def unload(self) -> None:\n        \"\"\"Set clients to `None` as they both contain `thread._RLock` which cannot be pickled\n        in case an exception is raised and has to be handled in the main process\"\"\"\n\n        self._client = None  # type: ignore\n        self._aclient = None  # type: ignore\n        self.structured_output = None\n        super().unload()\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    @validate_call\n    async def agenerate(  # type: ignore\n        self,\n        input: FormattedInput,\n        num_generations: int = 1,\n        max_new_tokens: int = 128,\n 
       frequency_penalty: float = 0.0,\n        presence_penalty: float = 0.0,\n        temperature: float = 1.0,\n        top_p: float = 1.0,\n        stop: Optional[Union[str, List[str]]] = None,\n        response_format: Optional[Dict[str, str]] = None,\n    ) -> GenerateOutput:\n        \"\"\"Generates `num_generations` responses for the given input using the OpenAI async\n        client.\n\n        Args:\n            input: a single input in chat format to generate responses for.\n            num_generations: the number of generations to create per input. Defaults to\n                `1`.\n            max_new_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `128`.\n            frequency_penalty: the repetition penalty to use for the generation. Defaults\n                to `0.0`.\n            presence_penalty: the presence penalty to use for the generation. Defaults to\n                `0.0`.\n            temperature: the temperature to use for the generation. Defaults to `0.1`.\n            top_p: the top-p value to use for the generation. Defaults to `1.0`.\n            stop: a string or a list of strings to use as a stop sequence for the generation.\n                Defaults to `None`.\n            response_format: the format of the response to return. Must be one of\n                \"text\" or \"json\". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)\n                for more information on how to use the JSON model from OpenAI. Defaults to None\n                which returns text. 
To return JSON, use {\"type\": \"json_object\"}.\n\n        Note:\n            If response_format\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n\n        structured_output = None\n        if isinstance(input, tuple):\n            input, structured_output = input\n            result = self._prepare_structured_output(\n                structured_output=structured_output,  # type: ignore\n                client=self._aclient,\n                framework=\"openai\",\n            )\n            self._aclient = result.get(\"client\")  # type: ignore\n\n        if structured_output is None and self.structured_output is not None:\n            structured_output = self.structured_output\n\n        kwargs = {\n            \"messages\": input,  # type: ignore\n            \"model\": self.model,\n            \"max_tokens\": max_new_tokens,\n            \"n\": num_generations,\n            \"frequency_penalty\": frequency_penalty,\n            \"presence_penalty\": presence_penalty,\n            \"temperature\": temperature,\n            \"top_p\": top_p,\n            \"stop\": stop,\n        }\n\n        if response_format is not None:\n            kwargs[\"response_format\"] = response_format\n\n        if structured_output:\n            kwargs = self._prepare_kwargs(kwargs, structured_output)  # type: ignore\n\n        completion = await self._aclient.chat.completions.create(**kwargs)  # type: ignore\n\n        if structured_output:\n            return self._generations_from_structured_output(completion)\n\n        return self._generations_from_openai_completion(completion)\n\n    def _generations_from_structured_output(\n        self, completion: \"BaseModel\"\n    ) -> \"GenerateOutput\":\n        \"\"\"Get the generations from the structured output object.\n\n        Args:\n            completion: an instance of `pydantic.BaseModel` with the content of the structuted\n                
output.\n\n        Returns:\n            A list with the content of the structured output.\n        \"\"\"\n        return [completion.model_dump_json()]\n\n    def _generations_from_openai_completion(\n        self, completion: \"OpenAIChatCompletion\"\n    ) -> \"GenerateOutput\":\n        \"\"\"Get the generations from the OpenAI Chat Completion object.\n\n        Args:\n            completion: the completion object to get the generations from.\n\n        Returns:\n            A list of strings containing the generated responses for the input.\n        \"\"\"\n        generations = []\n        for choice in completion.choices:\n            if (content := choice.message.content) is None:\n                self._logger.warning(  # type: ignore\n                    f\"Received no response using OpenAI client (model: '{self.model}').\"\n                    f\" Finish reason was: {choice.finish_reason}\"\n                )\n            generations.append(content)\n        return generations\n\n    def offline_batch_generate(\n        self,\n        inputs: Union[List[\"FormattedInput\"], None] = None,\n        num_generations: int = 1,\n        max_new_tokens: int = 128,\n        frequency_penalty: float = 0.0,\n        presence_penalty: float = 0.0,\n        temperature: float = 1.0,\n        top_p: float = 1.0,\n        stop: Optional[Union[str, List[str]]] = None,\n        response_format: Optional[str] = None,\n        **kwargs: Any,\n    ) -> List[\"GenerateOutput\"]:\n        \"\"\"Uses the OpenAI batch API to generate `num_generations` responses for the given\n        inputs.\n\n        Args:\n            inputs: a list of inputs in chat format to generate responses for.\n            num_generations: the number of generations to create per input. 
Defaults to\n                `1`.\n            max_new_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `128`.\n            frequency_penalty: the repetition penalty to use for the generation. Defaults\n                to `0.0`.\n            presence_penalty: the presence penalty to use for the generation. Defaults to\n                `0.0`.\n            temperature: the temperature to use for the generation. Defaults to `0.1`.\n            top_p: the top-p value to use for the generation. Defaults to `1.0`.\n            stop: a string or a list of strings to use as a stop sequence for the generation.\n                Defaults to `None`.\n            response_format: the format of the response to return. Must be one of\n                \"text\" or \"json\". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)\n                for more information on how to use the JSON model from OpenAI. Defaults to `text`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input\n            in `inputs`.\n\n        Raises:\n            DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation\n                is not finished yet.\n            ValueError: if no job IDs were found to retrieve the results from.\n        \"\"\"\n        if self.jobs_ids:\n            return self._check_and_get_batch_results()\n\n        if inputs:\n            self.jobs_ids = self._create_jobs(\n                inputs=inputs,\n                **{\n                    \"model\": self.model,\n                    \"max_tokens\": max_new_tokens,\n                    \"n\": num_generations,\n                    \"frequency_penalty\": frequency_penalty,\n                    \"presence_penalty\": presence_penalty,\n                    \"temperature\": temperature,\n                    \"top_p\": top_p,\n                    \"stop\": stop,\n    
                \"response_format\": response_format,\n                },\n            )\n            raise DistilabelOfflineBatchGenerationNotFinishedException(\n                jobs_ids=self.jobs_ids\n            )\n\n        raise ValueError(\"No `inputs` were provided and no `jobs_ids` were found.\")\n\n    def _check_and_get_batch_results(self) -> List[\"GenerateOutput\"]:\n        \"\"\"Checks the status of the batch jobs and retrieves the results from the OpenAI\n        Batch API.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n\n        Raises:\n            ValueError: if no job IDs were found to retrieve the results from.\n            DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation\n                is not finished yet.\n            RuntimeError: if the only batch job found failed.\n        \"\"\"\n        if not self.jobs_ids:\n            raise ValueError(\"No job IDs were found to retrieve the results from.\")\n\n        outputs = []\n        for batch_id in self.jobs_ids:\n            batch = self._get_openai_batch(batch_id)\n\n            if batch.status in (\"validating\", \"in_progress\", \"finalizing\"):\n                raise DistilabelOfflineBatchGenerationNotFinishedException(\n                    jobs_ids=self.jobs_ids\n                )\n\n            if batch.status in (\"failed\", \"expired\", \"cancelled\", \"cancelling\"):\n                self._logger.error(  # type: ignore\n                    f\"OpenAI API batch with ID '{batch_id}' failed with status '{batch.status}'.\"\n                )\n                if len(self.jobs_ids) == 1:\n                    self.jobs_ids = None\n                    raise RuntimeError(\n                        f\"The only OpenAI API Batch that was created with ID '{batch_id}'\"\n                        f\" failed with status '{batch.status}'.\"\n                    )\n\n                continue\n\n            
outputs.extend(self._retrieve_batch_results(batch))\n\n        # sort by `custom_id` to return the results in the same order as the inputs\n        outputs = sorted(outputs, key=lambda x: int(x[\"custom_id\"]))\n        return [self._parse_output(output) for output in outputs]\n\n    def _parse_output(self, output: Dict[str, Any]) -> \"GenerateOutput\":\n        \"\"\"Parses the output from the OpenAI Batch API into a list of strings.\n\n        Args:\n            output: the output to parse.\n\n        Returns:\n            A list of strings containing the generated responses for the input.\n        \"\"\"\n        from openai.types.chat import ChatCompletion as OpenAIChatCompletion\n\n        if \"response\" not in output:\n            return []\n\n        if output[\"response\"][\"status_code\"] != 200:\n            return []\n\n        return self._generations_from_openai_completion(\n            OpenAIChatCompletion(**output[\"response\"][\"body\"])\n        )\n\n    def _get_openai_batch(self, batch_id: str) -> \"OpenAIBatch\":\n        \"\"\"Gets a batch from the OpenAI Batch API.\n\n        Args:\n            batch_id: the ID of the batch to retrieve.\n\n        Returns:\n            The batch retrieved from the OpenAI Batch API.\n\n        Raises:\n            openai.OpenAIError: if there was an error while retrieving the batch from the\n                OpenAI Batch API.\n        \"\"\"\n        import openai\n\n        try:\n            return self._client.batches.retrieve(batch_id)\n        except openai.OpenAIError as e:\n            self._logger.error(  # type: ignore\n                f\"Error while retrieving batch '{batch_id}' from OpenAI: {e}\"\n            )\n            raise e\n\n    def _retrieve_batch_results(self, batch: \"OpenAIBatch\") -> List[Dict[str, Any]]:\n        \"\"\"Retrieves the results of a batch from its output file, parsing the JSONL content\n        into a list of dictionaries.\n\n        Args:\n            batch: the batch to 
retrieve the results from.\n\n        Returns:\n            A list of dictionaries containing the results of the batch.\n\n        Raises:\n            AssertionError: if no output file ID was found in the batch.\n        \"\"\"\n        import openai\n\n        assert batch.output_file_id, \"No output file ID was found in the batch.\"\n\n        try:\n            file_response = self._client.files.content(batch.output_file_id)\n            return [orjson.loads(line) for line in file_response.text.splitlines()]\n        except openai.OpenAIError as e:\n            self._logger.error(  # type: ignore\n                f\"Error while retrieving batch results from file '{batch.output_file_id}': {e}\"\n            )\n            return []\n\n    def _create_jobs(\n        self, inputs: List[\"FormattedInput\"], **kwargs: Any\n    ) -> Tuple[str, ...]:\n        \"\"\"Creates jobs in the OpenAI Batch API to generate responses for the given inputs.\n\n        Args:\n            inputs: a list of inputs in chat format to generate responses for.\n            kwargs: the keyword arguments to use for the generation.\n\n        Returns:\n            A list of job IDs created in the OpenAI Batch API.\n        \"\"\"\n        batch_input_files = self._create_batch_files(inputs=inputs, **kwargs)\n        jobs = []\n        for batch_input_file in batch_input_files:\n            if batch := self._create_batch_api_job(batch_input_file):\n                jobs.append(batch.id)\n        return tuple(jobs)\n\n    def _create_batch_api_job(\n        self, batch_input_file: \"OpenAIFileObject\"\n    ) -> Union[\"OpenAIBatch\", None]:\n        \"\"\"Creates a job in the OpenAI Batch API to generate responses for the given input\n        file.\n\n        Args:\n            batch_input_file: the input file to generate responses for.\n\n        Returns:\n            The batch job created in the OpenAI Batch API.\n        \"\"\"\n        import openai\n\n        metadata = {\"description\": 
\"distilabel\"}\n\n        if distilabel_pipeline_name := envs.DISTILABEL_PIPELINE_NAME:\n            metadata[\"distilabel_pipeline_name\"] = distilabel_pipeline_name\n\n        if distilabel_pipeline_cache_id := envs.DISTILABEL_PIPELINE_CACHE_ID:\n            metadata[\"distilabel_pipeline_cache_id\"] = distilabel_pipeline_cache_id\n\n        batch = None\n        try:\n            batch = self._client.batches.create(\n                completion_window=\"24h\",\n                endpoint=\"/v1/chat/completions\",\n                input_file_id=batch_input_file.id,\n                metadata=metadata,\n            )\n        except openai.OpenAIError as e:\n            self._logger.error(  # type: ignore\n                f\"Error while creating OpenAI Batch API job for file with ID\"\n                f\" '{batch_input_file.id}': {e}.\"\n            )\n            raise e\n        return batch\n\n    def _create_batch_files(\n        self, inputs: List[\"FormattedInput\"], **kwargs: Any\n    ) -> List[\"OpenAIFileObject\"]:\n        \"\"\"Creates the necessary input files for the batch API to generate responses. 
The\n        maximum size of each file so the OpenAI Batch API can process it is 100MB, so we\n        need to split the inputs into multiple files if necessary.\n\n        More information: https://platform.openai.com/docs/api-reference/files/create\n\n        Args:\n            inputs: a list of inputs in chat format to generate responses for, optionally\n                including structured output.\n            kwargs: the keyword arguments to use for the generation.\n\n        Returns:\n            The list of file objects created for the OpenAI Batch API.\n\n        Raises:\n            openai.OpenAIError: if there was an error while creating the batch input file\n                in the OpenAI Batch API.\n        \"\"\"\n        import openai\n\n        files = []\n        for file_no, buffer in enumerate(\n            self._create_jsonl_buffers(inputs=inputs, **kwargs)\n        ):\n            try:\n                # TODO: add distilabel pipeline name and id\n                batch_input_file = self._client.files.create(\n                    file=(self._name_for_openai_files(file_no), buffer),\n                    purpose=\"batch\",\n                )\n                files.append(batch_input_file)\n            except openai.OpenAIError as e:\n                self._logger.error(  # type: ignore\n                    f\"Error while creating OpenAI batch input file: {e}\"\n                )\n                raise e\n        return files\n\n    def _create_jsonl_buffers(\n        self, inputs: List[\"FormattedInput\"], **kwargs: Any\n    ) -> Generator[io.BytesIO, None, None]:\n        \"\"\"Creates a generator of buffers containing the JSONL formatted inputs to be\n        used by the OpenAI Batch API. 
The buffers created are of size 100MB or less.\n\n        Args:\n            inputs: a list of inputs in chat format to generate responses for, optionally\n                including structured output.\n            kwargs: the keyword arguments to use for the generation.\n\n        Yields:\n            A buffer containing the JSONL formatted inputs to be used by the OpenAI Batch\n            API.\n        \"\"\"\n        buffer = io.BytesIO()\n        buffer_current_size = 0\n        for i, input in enumerate(inputs):\n            # We create the smallest `custom_id` so we don't  increase the size of the file\n            # to much, but we can still sort the results with the order of the inputs.\n            row = self._create_jsonl_row(input=input, custom_id=str(i), **kwargs)\n            row_size = len(row)\n            if row_size + buffer_current_size > _OPENAI_BATCH_API_MAX_FILE_SIZE:\n                buffer.seek(0)\n                yield buffer\n                buffer = io.BytesIO()\n                buffer_current_size = 0\n            buffer.write(row)\n            buffer_current_size += row_size\n\n        if buffer_current_size > 0:\n            buffer.seek(0)\n            yield buffer\n\n    def _create_jsonl_row(\n        self, input: \"FormattedInput\", custom_id: str, **kwargs: Any\n    ) -> bytes:\n        \"\"\"Creates a JSONL formatted row to be used by the OpenAI Batch API.\n\n        Args:\n            input: a list of inputs in chat format to generate responses for, optionally\n                including structured output.\n            custom_id: a custom ID to use for the row.\n            kwargs: the keyword arguments to use for the generation.\n\n        Returns:\n            A JSONL formatted row to be used by the OpenAI Batch API.\n        \"\"\"\n        # TODO: depending on the format of the input, add `response_format` to the kwargs\n        row = {\n            \"custom_id\": custom_id,\n            \"method\": \"POST\",\n            
\"url\": \"/v1/chat/completions\",\n            \"body\": {\"messages\": input, **kwargs},\n        }\n        json_row = orjson.dumps(row)\n        return json_row + b\"\\n\"\n\n    def _name_for_openai_files(self, file_no: int) -> str:\n        if (\n            envs.DISTILABEL_PIPELINE_NAME is None\n            or envs.DISTILABEL_PIPELINE_CACHE_ID is None\n        ):\n            return f\"distilabel-pipeline-fileno-{file_no}.jsonl\"\n\n        return f\"distilabel-pipeline-{envs.DISTILABEL_PIPELINE_NAME}-{envs.DISTILABEL_PIPELINE_CACHE_ID}-fileno-{file_no}.jsonl\"\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.load","title":"load()","text":"

Loads the AsyncOpenAI client to benefit from async requests.

Source code in src/distilabel/models/llms/openai.py
def load(self) -> None:\n    \"\"\"Loads the `AsyncOpenAI` client to benefit from async requests.\"\"\"\n    super().load()\n\n    try:\n        from openai import AsyncOpenAI, OpenAI\n    except ImportError as ie:\n        raise ImportError(\n            \"OpenAI Python client is not installed. Please install it using\"\n            \" `pip install openai`.\"\n        ) from ie\n\n    if self.api_key is None:\n        raise ValueError(\n            f\"To use `{self.__class__.__name__}` an API key must be provided via `api_key`\"\n            f\" attribute or runtime parameter, or set the environment variable `{self._api_key_env_var}`.\"\n        )\n\n    self._client = OpenAI(\n        base_url=self.base_url,\n        api_key=self.api_key.get_secret_value(),\n        max_retries=self.max_retries,  # type: ignore\n        timeout=self.timeout,\n    )\n\n    self._aclient = AsyncOpenAI(\n        base_url=self.base_url,\n        api_key=self.api_key.get_secret_value(),\n        max_retries=self.max_retries,  # type: ignore\n        timeout=self.timeout,\n    )\n\n    if self.structured_output:\n        result = self._prepare_structured_output(\n            structured_output=self.structured_output,\n            client=self._aclient,\n            framework=\"openai\",\n        )\n        self._aclient = result.get(\"client\")  # type: ignore\n        if structured_output := result.get(\"structured_output\"):\n            self.structured_output = structured_output\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.unload","title":"unload()","text":"

Sets both clients to None, since each of them contains a thread._RLock that cannot be pickled if an exception is raised and has to be handled in the main process.

Source code in src/distilabel/models/llms/openai.py
def unload(self) -> None:\n    \"\"\"Set clients to `None` as they both contain `thread._RLock` which cannot be pickled\n    in case an exception is raised and has to be handled in the main process\"\"\"\n\n    self._client = None  # type: ignore\n    self._aclient = None  # type: ignore\n    self.structured_output = None\n    super().unload()\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.agenerate","title":"agenerate(input, num_generations=1, max_new_tokens=128, frequency_penalty=0.0, presence_penalty=0.0, temperature=1.0, top_p=1.0, stop=None, response_format=None) async","text":"

Generates num_generations responses for the given input using the OpenAI async client.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate responses for.

required num_generations int

the number of generations to create per input. Defaults to 1.

1 max_new_tokens int

the maximum number of new tokens that the model will generate. Defaults to 128.

128 frequency_penalty float

the repetition penalty to use for the generation. Defaults to 0.0.

0.0 presence_penalty float

the presence penalty to use for the generation. Defaults to 0.0.

0.0 temperature float

the temperature to use for the generation. Defaults to 1.0.

1.0 top_p float

the top-p value to use for the generation. Defaults to 1.0.

1.0 stop Optional[Union[str, List[str]]]

a string or a list of strings to use as a stop sequence for the generation. Defaults to None.

None response_format Optional[Dict[str, str]]

the format of the response to return, given as a dictionary with a \"type\" key. Read the documentation here for more information on how to use the JSON mode from OpenAI. Defaults to None which returns text. To return JSON, use {\"type\": \"json_object\"}.

None Note

If response_format

Returns:

Type Description GenerateOutput

A list of strings containing the generated responses for the input.

Source code in src/distilabel/models/llms/openai.py
@validate_call\nasync def agenerate(  # type: ignore\n    self,\n    input: FormattedInput,\n    num_generations: int = 1,\n    max_new_tokens: int = 128,\n    frequency_penalty: float = 0.0,\n    presence_penalty: float = 0.0,\n    temperature: float = 1.0,\n    top_p: float = 1.0,\n    stop: Optional[Union[str, List[str]]] = None,\n    response_format: Optional[Dict[str, str]] = None,\n) -> GenerateOutput:\n    \"\"\"Generates `num_generations` responses for the given input using the OpenAI async\n    client.\n\n    Args:\n        input: a single input in chat format to generate responses for.\n        num_generations: the number of generations to create per input. Defaults to\n            `1`.\n        max_new_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `128`.\n        frequency_penalty: the repetition penalty to use for the generation. Defaults\n            to `0.0`.\n        presence_penalty: the presence penalty to use for the generation. Defaults to\n            `0.0`.\n        temperature: the temperature to use for the generation. Defaults to `0.1`.\n        top_p: the top-p value to use for the generation. Defaults to `1.0`.\n        stop: a string or a list of strings to use as a stop sequence for the generation.\n            Defaults to `None`.\n        response_format: the format of the response to return. Must be one of\n            \"text\" or \"json\". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)\n            for more information on how to use the JSON model from OpenAI. Defaults to None\n            which returns text. 
To return JSON, use {\"type\": \"json_object\"}.\n\n    Note:\n        If response_format\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n\n    structured_output = None\n    if isinstance(input, tuple):\n        input, structured_output = input\n        result = self._prepare_structured_output(\n            structured_output=structured_output,  # type: ignore\n            client=self._aclient,\n            framework=\"openai\",\n        )\n        self._aclient = result.get(\"client\")  # type: ignore\n\n    if structured_output is None and self.structured_output is not None:\n        structured_output = self.structured_output\n\n    kwargs = {\n        \"messages\": input,  # type: ignore\n        \"model\": self.model,\n        \"max_tokens\": max_new_tokens,\n        \"n\": num_generations,\n        \"frequency_penalty\": frequency_penalty,\n        \"presence_penalty\": presence_penalty,\n        \"temperature\": temperature,\n        \"top_p\": top_p,\n        \"stop\": stop,\n    }\n\n    if response_format is not None:\n        kwargs[\"response_format\"] = response_format\n\n    if structured_output:\n        kwargs = self._prepare_kwargs(kwargs, structured_output)  # type: ignore\n\n    completion = await self._aclient.chat.completions.create(**kwargs)  # type: ignore\n\n    if structured_output:\n        return self._generations_from_structured_output(completion)\n\n    return self._generations_from_openai_completion(completion)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._generations_from_structured_output","title":"_generations_from_structured_output(completion)","text":"

Get the generations from the structured output object.

Parameters:

Name Type Description Default completion BaseModel

an instance of pydantic.BaseModel with the content of the structured output.

required

Returns:

Type Description GenerateOutput

A list with the content of the structured output.

Source code in src/distilabel/models/llms/openai.py
def _generations_from_structured_output(\n    self, completion: \"BaseModel\"\n) -> \"GenerateOutput\":\n    \"\"\"Get the generations from the structured output object.\n\n    Args:\n        completion: an instance of `pydantic.BaseModel` with the content of the structuted\n            output.\n\n    Returns:\n        A list with the content of the structured output.\n    \"\"\"\n    return [completion.model_dump_json()]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._generations_from_openai_completion","title":"_generations_from_openai_completion(completion)","text":"

Get the generations from the OpenAI Chat Completion object.

Parameters:

Name Type Description Default completion ChatCompletion

the completion object to get the generations from.

required

Returns:

Type Description GenerateOutput

A list of strings containing the generated responses for the input.

Source code in src/distilabel/models/llms/openai.py
def _generations_from_openai_completion(\n    self, completion: \"OpenAIChatCompletion\"\n) -> \"GenerateOutput\":\n    \"\"\"Get the generations from the OpenAI Chat Completion object.\n\n    Args:\n        completion: the completion object to get the generations from.\n\n    Returns:\n        A list of strings containing the generated responses for the input.\n    \"\"\"\n    generations = []\n    for choice in completion.choices:\n        if (content := choice.message.content) is None:\n            self._logger.warning(  # type: ignore\n                f\"Received no response using OpenAI client (model: '{self.model}').\"\n                f\" Finish reason was: {choice.finish_reason}\"\n            )\n        generations.append(content)\n    return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM.offline_batch_generate","title":"offline_batch_generate(inputs=None, num_generations=1, max_new_tokens=128, frequency_penalty=0.0, presence_penalty=0.0, temperature=1.0, top_p=1.0, stop=None, response_format=None, **kwargs)","text":"

Uses the OpenAI batch API to generate num_generations responses for the given inputs.

Parameters:

Name Type Description Default inputs Union[List[FormattedInput], None]

a list of inputs in chat format to generate responses for.

None num_generations int

the number of generations to create per input. Defaults to 1.

1 max_new_tokens int

the maximum number of new tokens that the model will generate. Defaults to 128.

128 frequency_penalty float

the repetition penalty to use for the generation. Defaults to 0.0.

0.0 presence_penalty float

the presence penalty to use for the generation. Defaults to 0.0.

0.0 temperature float

the temperature to use for the generation. Defaults to 1.0.

1.0 top_p float

the top-p value to use for the generation. Defaults to 1.0.

1.0 stop Optional[Union[str, List[str]]]

a string or a list of strings to use as a stop sequence for the generation. Defaults to None.

None response_format Optional[str]

the format of the response to return. Must be one of \"text\" or \"json\". Read the documentation here for more information on how to use the JSON mode from OpenAI. Defaults to text.

None

Returns:

Type Description List[GenerateOutput]

A list of lists of strings containing the generated responses for each input

List[GenerateOutput]

in inputs.

Raises:

Type Description DistilabelOfflineBatchGenerationNotFinishedException

if the batch generation is not finished yet.

ValueError

if no job IDs were found to retrieve the results from.

Source code in src/distilabel/models/llms/openai.py
def offline_batch_generate(\n    self,\n    inputs: Union[List[\"FormattedInput\"], None] = None,\n    num_generations: int = 1,\n    max_new_tokens: int = 128,\n    frequency_penalty: float = 0.0,\n    presence_penalty: float = 0.0,\n    temperature: float = 1.0,\n    top_p: float = 1.0,\n    stop: Optional[Union[str, List[str]]] = None,\n    response_format: Optional[str] = None,\n    **kwargs: Any,\n) -> List[\"GenerateOutput\"]:\n    \"\"\"Uses the OpenAI batch API to generate `num_generations` responses for the given\n    inputs.\n\n    Args:\n        inputs: a list of inputs in chat format to generate responses for.\n        num_generations: the number of generations to create per input. Defaults to\n            `1`.\n        max_new_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `128`.\n        frequency_penalty: the repetition penalty to use for the generation. Defaults\n            to `0.0`.\n        presence_penalty: the presence penalty to use for the generation. Defaults to\n            `0.0`.\n        temperature: the temperature to use for the generation. Defaults to `0.1`.\n        top_p: the top-p value to use for the generation. Defaults to `1.0`.\n        stop: a string or a list of strings to use as a stop sequence for the generation.\n            Defaults to `None`.\n        response_format: the format of the response to return. Must be one of\n            \"text\" or \"json\". Read the documentation [here](https://platform.openai.com/docs/guides/text-generation/json-mode)\n            for more information on how to use the JSON model from OpenAI. 
Defaults to `text`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input\n        in `inputs`.\n\n    Raises:\n        DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation\n            is not finished yet.\n        ValueError: if no job IDs were found to retrieve the results from.\n    \"\"\"\n    if self.jobs_ids:\n        return self._check_and_get_batch_results()\n\n    if inputs:\n        self.jobs_ids = self._create_jobs(\n            inputs=inputs,\n            **{\n                \"model\": self.model,\n                \"max_tokens\": max_new_tokens,\n                \"n\": num_generations,\n                \"frequency_penalty\": frequency_penalty,\n                \"presence_penalty\": presence_penalty,\n                \"temperature\": temperature,\n                \"top_p\": top_p,\n                \"stop\": stop,\n                \"response_format\": response_format,\n            },\n        )\n        raise DistilabelOfflineBatchGenerationNotFinishedException(\n            jobs_ids=self.jobs_ids\n        )\n\n    raise ValueError(\"No `inputs` were provided and no `jobs_ids` were found.\")\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._check_and_get_batch_results","title":"_check_and_get_batch_results()","text":"

Checks the status of the batch jobs and retrieves the results from the OpenAI Batch API.

Returns:

Type Description List[GenerateOutput]

A list of lists of strings containing the generated responses for each input.

Raises:

Type Description ValueError

if no job IDs were found to retrieve the results from.

DistilabelOfflineBatchGenerationNotFinishedException

if the batch generation is not finished yet.

RuntimeError

if the only batch job found failed.

Source code in src/distilabel/models/llms/openai.py
def _check_and_get_batch_results(self) -> List[\"GenerateOutput\"]:\n    \"\"\"Checks the status of the batch jobs and retrieves the results from the OpenAI\n    Batch API.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n\n    Raises:\n        ValueError: if no job IDs were found to retrieve the results from.\n        DistilabelOfflineBatchGenerationNotFinishedException: if the batch generation\n            is not finished yet.\n        RuntimeError: if the only batch job found failed.\n    \"\"\"\n    if not self.jobs_ids:\n        raise ValueError(\"No job IDs were found to retrieve the results from.\")\n\n    outputs = []\n    for batch_id in self.jobs_ids:\n        batch = self._get_openai_batch(batch_id)\n\n        if batch.status in (\"validating\", \"in_progress\", \"finalizing\"):\n            raise DistilabelOfflineBatchGenerationNotFinishedException(\n                jobs_ids=self.jobs_ids\n            )\n\n        if batch.status in (\"failed\", \"expired\", \"cancelled\", \"cancelling\"):\n            self._logger.error(  # type: ignore\n                f\"OpenAI API batch with ID '{batch_id}' failed with status '{batch.status}'.\"\n            )\n            if len(self.jobs_ids) == 1:\n                self.jobs_ids = None\n                raise RuntimeError(\n                    f\"The only OpenAI API Batch that was created with ID '{batch_id}'\"\n                    f\" failed with status '{batch.status}'.\"\n                )\n\n            continue\n\n        outputs.extend(self._retrieve_batch_results(batch))\n\n    # sort by `custom_id` to return the results in the same order as the inputs\n    outputs = sorted(outputs, key=lambda x: int(x[\"custom_id\"]))\n    return [self._parse_output(output) for output in outputs]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._parse_output","title":"_parse_output(output)","text":"

Parses the output from the OpenAI Batch API into a list of strings.

Parameters:

Name Type Description Default output Dict[str, Any]

the output to parse.

required

Returns:

Type Description GenerateOutput

A list of strings containing the generated responses for the input.

Source code in src/distilabel/models/llms/openai.py
def _parse_output(self, output: Dict[str, Any]) -> \"GenerateOutput\":\n    \"\"\"Parses the output from the OpenAI Batch API into a list of strings.\n\n    Args:\n        output: the output to parse.\n\n    Returns:\n        A list of strings containing the generated responses for the input.\n    \"\"\"\n    from openai.types.chat import ChatCompletion as OpenAIChatCompletion\n\n    if \"response\" not in output:\n        return []\n\n    if output[\"response\"][\"status_code\"] != 200:\n        return []\n\n    return self._generations_from_openai_completion(\n        OpenAIChatCompletion(**output[\"response\"][\"body\"])\n    )\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._get_openai_batch","title":"_get_openai_batch(batch_id)","text":"

Gets a batch from the OpenAI Batch API.

Parameters:

Name Type Description Default batch_id str

the ID of the batch to retrieve.

required

Returns:

Type Description Batch

The batch retrieved from the OpenAI Batch API.

Raises:

Type Description OpenAIError

if there was an error while retrieving the batch from the OpenAI Batch API.

Source code in src/distilabel/models/llms/openai.py
def _get_openai_batch(self, batch_id: str) -> \"OpenAIBatch\":\n    \"\"\"Gets a batch from the OpenAI Batch API.\n\n    Args:\n        batch_id: the ID of the batch to retrieve.\n\n    Returns:\n        The batch retrieved from the OpenAI Batch API.\n\n    Raises:\n        openai.OpenAIError: if there was an error while retrieving the batch from the\n            OpenAI Batch API.\n    \"\"\"\n    import openai\n\n    try:\n        return self._client.batches.retrieve(batch_id)\n    except openai.OpenAIError as e:\n        self._logger.error(  # type: ignore\n            f\"Error while retrieving batch '{batch_id}' from OpenAI: {e}\"\n        )\n        raise e\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._retrieve_batch_results","title":"_retrieve_batch_results(batch)","text":"

Retrieves the results of a batch from its output file, parsing the JSONL content into a list of dictionaries.

Parameters:

Name Type Description Default batch Batch

the batch to retrieve the results from.

required

Returns:

Type Description List[Dict[str, Any]]

A list of dictionaries containing the results of the batch.

Raises:

Type Description AssertionError

if no output file ID was found in the batch.

Source code in src/distilabel/models/llms/openai.py
def _retrieve_batch_results(self, batch: \"OpenAIBatch\") -> List[Dict[str, Any]]:\n    \"\"\"Retrieves the results of a batch from its output file, parsing the JSONL content\n    into a list of dictionaries.\n\n    Args:\n        batch: the batch to retrieve the results from.\n\n    Returns:\n        A list of dictionaries containing the results of the batch.\n\n    Raises:\n        AssertionError: if no output file ID was found in the batch.\n    \"\"\"\n    import openai\n\n    assert batch.output_file_id, \"No output file ID was found in the batch.\"\n\n    try:\n        file_response = self._client.files.content(batch.output_file_id)\n        return [orjson.loads(line) for line in file_response.text.splitlines()]\n    except openai.OpenAIError as e:\n        self._logger.error(  # type: ignore\n            f\"Error while retrieving batch results from file '{batch.output_file_id}': {e}\"\n        )\n        return []\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_jobs","title":"_create_jobs(inputs, **kwargs)","text":"

Creates jobs in the OpenAI Batch API to generate responses for the given inputs.

Parameters:

Name Type Description Default inputs List[FormattedInput]

a list of inputs in chat format to generate responses for.

required kwargs Any

the keyword arguments to use for the generation.

{}

Returns:

Type Description Tuple[str, ...]

A tuple of job IDs created in the OpenAI Batch API.

Source code in src/distilabel/models/llms/openai.py
def _create_jobs(\n    self, inputs: List[\"FormattedInput\"], **kwargs: Any\n) -> Tuple[str, ...]:\n    \"\"\"Creates jobs in the OpenAI Batch API to generate responses for the given inputs.\n\n    Args:\n        inputs: a list of inputs in chat format to generate responses for.\n        kwargs: the keyword arguments to use for the generation.\n\n    Returns:\n        A list of job IDs created in the OpenAI Batch API.\n    \"\"\"\n    batch_input_files = self._create_batch_files(inputs=inputs, **kwargs)\n    jobs = []\n    for batch_input_file in batch_input_files:\n        if batch := self._create_batch_api_job(batch_input_file):\n            jobs.append(batch.id)\n    return tuple(jobs)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_batch_api_job","title":"_create_batch_api_job(batch_input_file)","text":"

Creates a job in the OpenAI Batch API to generate responses for the given input file.

Parameters:

Name Type Description Default batch_input_file FileObject

the input file to generate responses for.

required

Returns:

Type Description Union[Batch, None]

The batch job created in the OpenAI Batch API.

Source code in src/distilabel/models/llms/openai.py
def _create_batch_api_job(\n    self, batch_input_file: \"OpenAIFileObject\"\n) -> Union[\"OpenAIBatch\", None]:\n    \"\"\"Creates a job in the OpenAI Batch API to generate responses for the given input\n    file.\n\n    Args:\n        batch_input_file: the input file to generate responses for.\n\n    Returns:\n        The batch job created in the OpenAI Batch API.\n    \"\"\"\n    import openai\n\n    metadata = {\"description\": \"distilabel\"}\n\n    if distilabel_pipeline_name := envs.DISTILABEL_PIPELINE_NAME:\n        metadata[\"distilabel_pipeline_name\"] = distilabel_pipeline_name\n\n    if distilabel_pipeline_cache_id := envs.DISTILABEL_PIPELINE_CACHE_ID:\n        metadata[\"distilabel_pipeline_cache_id\"] = distilabel_pipeline_cache_id\n\n    batch = None\n    try:\n        batch = self._client.batches.create(\n            completion_window=\"24h\",\n            endpoint=\"/v1/chat/completions\",\n            input_file_id=batch_input_file.id,\n            metadata=metadata,\n        )\n    except openai.OpenAIError as e:\n        self._logger.error(  # type: ignore\n            f\"Error while creating OpenAI Batch API job for file with ID\"\n            f\" '{batch_input_file.id}': {e}.\"\n        )\n        raise e\n    return batch\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_batch_files","title":"_create_batch_files(inputs, **kwargs)","text":"

Creates the necessary input files for the batch API to generate responses. The maximum size of each file so the OpenAI Batch API can process it is 100MB, so we need to split the inputs into multiple files if necessary.

More information: https://platform.openai.com/docs/api-reference/files/create

Parameters:

Name Type Description Default inputs List[FormattedInput]

a list of inputs in chat format to generate responses for, optionally including structured output.

required kwargs Any

the keyword arguments to use for the generation.

{}

Returns:

Type Description List[FileObject]

The list of file objects created for the OpenAI Batch API.

Raises:

Type Description OpenAIError

if there was an error while creating the batch input file in the OpenAI Batch API.

Source code in src/distilabel/models/llms/openai.py
def _create_batch_files(\n    self, inputs: List[\"FormattedInput\"], **kwargs: Any\n) -> List[\"OpenAIFileObject\"]:\n    \"\"\"Creates the necessary input files for the batch API to generate responses. The\n    maximum size of each file so the OpenAI Batch API can process it is 100MB, so we\n    need to split the inputs into multiple files if necessary.\n\n    More information: https://platform.openai.com/docs/api-reference/files/create\n\n    Args:\n        inputs: a list of inputs in chat format to generate responses for, optionally\n            including structured output.\n        kwargs: the keyword arguments to use for the generation.\n\n    Returns:\n        The list of file objects created for the OpenAI Batch API.\n\n    Raises:\n        openai.OpenAIError: if there was an error while creating the batch input file\n            in the OpenAI Batch API.\n    \"\"\"\n    import openai\n\n    files = []\n    for file_no, buffer in enumerate(\n        self._create_jsonl_buffers(inputs=inputs, **kwargs)\n    ):\n        try:\n            # TODO: add distilabel pipeline name and id\n            batch_input_file = self._client.files.create(\n                file=(self._name_for_openai_files(file_no), buffer),\n                purpose=\"batch\",\n            )\n            files.append(batch_input_file)\n        except openai.OpenAIError as e:\n            self._logger.error(  # type: ignore\n                f\"Error while creating OpenAI batch input file: {e}\"\n            )\n            raise e\n    return files\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_jsonl_buffers","title":"_create_jsonl_buffers(inputs, **kwargs)","text":"

Creates a generator of buffers containing the JSONL formatted inputs to be used by the OpenAI Batch API. The buffers created are of size 100MB or less.

Parameters:

Name Type Description Default inputs List[FormattedInput]

a list of inputs in chat format to generate responses for, optionally including structured output.

required kwargs Any

the keyword arguments to use for the generation.

{}

Yields:

Type Description BytesIO

A buffer containing the JSONL formatted inputs to be used by the OpenAI Batch

BytesIO

API.

Source code in src/distilabel/models/llms/openai.py
def _create_jsonl_buffers(\n    self, inputs: List[\"FormattedInput\"], **kwargs: Any\n) -> Generator[io.BytesIO, None, None]:\n    \"\"\"Creates a generator of buffers containing the JSONL formatted inputs to be\n    used by the OpenAI Batch API. The buffers created are of size 100MB or less.\n\n    Args:\n        inputs: a list of inputs in chat format to generate responses for, optionally\n            including structured output.\n        kwargs: the keyword arguments to use for the generation.\n\n    Yields:\n        A buffer containing the JSONL formatted inputs to be used by the OpenAI Batch\n        API.\n    \"\"\"\n    buffer = io.BytesIO()\n    buffer_current_size = 0\n    for i, input in enumerate(inputs):\n        # We create the smallest `custom_id` so we don't  increase the size of the file\n        # to much, but we can still sort the results with the order of the inputs.\n        row = self._create_jsonl_row(input=input, custom_id=str(i), **kwargs)\n        row_size = len(row)\n        if row_size + buffer_current_size > _OPENAI_BATCH_API_MAX_FILE_SIZE:\n            buffer.seek(0)\n            yield buffer\n            buffer = io.BytesIO()\n            buffer_current_size = 0\n        buffer.write(row)\n        buffer_current_size += row_size\n\n    if buffer_current_size > 0:\n        buffer.seek(0)\n        yield buffer\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.OpenAILLM._create_jsonl_row","title":"_create_jsonl_row(input, custom_id, **kwargs)","text":"

Creates a JSONL formatted row to be used by the OpenAI Batch API.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate responses for, optionally including structured output.

required custom_id str

a custom ID to use for the row.

required kwargs Any

the keyword arguments to use for the generation.

{}

Returns:

Type Description bytes

A JSONL formatted row to be used by the OpenAI Batch API.

Source code in src/distilabel/models/llms/openai.py
def _create_jsonl_row(\n    self, input: \"FormattedInput\", custom_id: str, **kwargs: Any\n) -> bytes:\n    \"\"\"Creates a JSONL formatted row to be used by the OpenAI Batch API.\n\n    Args:\n        input: a list of inputs in chat format to generate responses for, optionally\n            including structured output.\n        custom_id: a custom ID to use for the row.\n        kwargs: the keyword arguments to use for the generation.\n\n    Returns:\n        A JSONL formatted row to be used by the OpenAI Batch API.\n    \"\"\"\n    # TODO: depending on the format of the input, add `response_format` to the kwargs\n    row = {\n        \"custom_id\": custom_id,\n        \"method\": \"POST\",\n        \"url\": \"/v1/chat/completions\",\n        \"body\": {\"messages\": input, **kwargs},\n    }\n    json_row = orjson.dumps(row)\n    return json_row + b\"\\n\"\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.TogetherLLM","title":"TogetherLLM","text":"

Bases: OpenAILLM

TogetherLLM LLM implementation running the async API client of OpenAI.

Attributes:

Name Type Description model

the model name to use for the LLM e.g. \"mistralai/Mixtral-8x7B-Instruct-v0.1\". Supported models can be found here.

base_url Optional[RuntimeParameter[str]]

the base URL to use for the Together API; it can be set with TOGETHER_BASE_URL. Defaults to None which means that the value set for the environment variable TOGETHER_BASE_URL will be used, or \"https://api.together.xyz/v1\" if not set.

api_key Optional[RuntimeParameter[SecretStr]]

the API key to authenticate the requests to the Together API. Defaults to None which means that the value set for the environment variable TOGETHER_API_KEY will be used, or None if not set.

_api_key_env_var str

the name of the environment variable to use for the API key. It is meant to be used internally.

Examples:

Generate text:

from distilabel.models.llms import TogetherLLM\n\nllm = TogetherLLM(model=\"mistralai/Mixtral-8x7B-Instruct-v0.1\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
Source code in src/distilabel/models/llms/together.py
class TogetherLLM(OpenAILLM):\n    \"\"\"TogetherLLM LLM implementation running the async API client of OpenAI.\n\n    Attributes:\n        model: the model name to use for the LLM e.g. \"mistralai/Mixtral-8x7B-Instruct-v0.1\".\n            Supported models can be found [here](https://api.together.xyz/models).\n        base_url: the base URL to use for the Together API can be set with `TOGETHER_BASE_URL`.\n            Defaults to `None` which means that the value set for the environment variable\n            `TOGETHER_BASE_URL` will be used, or \"https://api.together.xyz/v1\" if not set.\n        api_key: the API key to authenticate the requests to the Together API. Defaults to `None`\n            which means that the value set for the environment variable `TOGETHER_API_KEY` will be\n            used, or `None` if not set.\n        _api_key_env_var: the name of the environment variable to use for the API key. It\n            is meant to be used internally.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import AnyscaleLLM\n\n        llm = TogetherLLM(model=\"mistralai/Mixtral-8x7B-Instruct-v0.1\", api_key=\"api.key\")\n\n        llm.load()\n\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n    \"\"\"\n\n    base_url: Optional[RuntimeParameter[str]] = Field(\n        default_factory=lambda: os.getenv(\n            \"TOGETHER_BASE_URL\", \"https://api.together.xyz/v1\"\n        ),\n        description=\"The base URL to use for the Together API requests.\",\n    )\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(_TOGETHER_API_KEY_ENV_VAR_NAME),\n        description=\"The API key to authenticate the requests to the Together API.\",\n    )\n\n    _api_key_env_var: str = PrivateAttr(_TOGETHER_API_KEY_ENV_VAR_NAME)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM","title":"VertexAILLM","text":"

Bases: AsyncLLM

VertexAI LLM implementation running the async API clients for Gemini.

  • Gemini API: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini

To use VertexAILLM, Google Cloud authentication must be configured using one of these methods:

  • Setting GOOGLE_CLOUD_CREDENTIALS environment variable
  • Using gcloud auth application-default login command
  • Using vertexai.init function from the google-cloud-aiplatform library

Attributes:

Name Type Description model str

the model name to use for the LLM e.g. \"gemini-1.0-pro\". Supported models.

_aclient Optional[GenerativeModel]

the GenerativeModel to use for the Vertex AI Gemini API. It is meant to be used internally. Set in the load method.

Icon

:simple-googlecloud:

Examples:

Generate text:

from distilabel.models.llms import VertexAILLM\n\nllm = VertexAILLM(model=\"gemini-1.5-pro\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
Source code in src/distilabel/models/llms/vertexai.py
class VertexAILLM(AsyncLLM):\n    \"\"\"VertexAI LLM implementation running the async API clients for Gemini.\n\n    - Gemini API: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini\n\n    To use the `VertexAILLM` is necessary to have configured the Google Cloud authentication\n    using one of these methods:\n\n    - Setting `GOOGLE_CLOUD_CREDENTIALS` environment variable\n    - Using `gcloud auth application-default login` command\n    - Using `vertexai.init` function from the `google-cloud-aiplatform` library\n\n    Attributes:\n        model: the model name to use for the LLM e.g. \"gemini-1.0-pro\". [Supported models](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models).\n        _aclient: the `GenerativeModel` to use for the Vertex AI Gemini API. It is meant\n            to be used internally. Set in the `load` method.\n\n    Icon:\n        `:simple-googlecloud:`\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import VertexAILLM\n\n        llm = VertexAILLM(model=\"gemini-1.5-pro\")\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n    \"\"\"\n\n    model: str\n\n    _num_generations_param_supported = False\n\n    _aclient: Optional[\"GenerativeModel\"] = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"Loads the `GenerativeModel` class which has access to `generate_content_async` to benefit from async requests.\"\"\"\n        super().load()\n\n        try:\n            from vertexai.generative_models import GenerationConfig, GenerativeModel\n\n            self._generation_config_class = GenerationConfig\n        except ImportError as e:\n            raise ImportError(\n                \"vertexai is not installed. 
Please install it using\"\n                \" `pip install google-cloud-aiplatform`.\"\n            ) from e\n\n        if _is_gemini_model(self.model):\n            self._aclient = GenerativeModel(model_name=self.model)\n        else:\n            raise NotImplementedError(\n                \"`VertexAILLM` is only implemented for `gemini` models that allow for `ChatType` data.\"\n            )\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    def _chattype_to_content(self, input: \"StandardInput\") -> List[\"Content\"]:\n        \"\"\"Converts a chat type to a list of content items expected by the API.\n\n        Args:\n            input: the chat type to be converted.\n\n        Returns:\n            List[str]: a list of content items expected by the API.\n        \"\"\"\n        from vertexai.generative_models import Content, Part\n\n        contents = []\n        for message in input:\n            if message[\"role\"] not in [\"user\", \"model\"]:\n                raise ValueError(\n                    \"`VertexAILLM only supports the roles 'user' or 'model'.\"\n                )\n            contents.append(\n                Content(\n                    role=message[\"role\"], parts=[Part.from_text(message[\"content\"])]\n                )\n            )\n        return contents\n\n    @validate_call\n    async def agenerate(  # type: ignore\n        self,\n        input: StandardInput,\n        temperature: Optional[float] = None,\n        top_p: Optional[float] = None,\n        top_k: Optional[int] = None,\n        max_output_tokens: Optional[int] = None,\n        stop_sequences: Optional[List[str]] = None,\n        safety_settings: Optional[Dict[str, Any]] = None,\n        tools: Optional[List[Dict[str, Any]]] = None,\n    ) -> GenerateOutput:\n        \"\"\"Generates `num_generations` responses for the given input using the [VertexAI async client 
definition](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini).\n\n        Args:\n            input: a single input in chat format to generate responses for.\n            temperature: Controls the randomness of predictions. Range: [0.0, 1.0]. Defaults to `None`.\n            top_p: If specified, nucleus sampling will be used. Range: (0.0, 1.0]. Defaults to `None`.\n            top_k: If specified, top-k sampling will be used. Defaults to `None`.\n            max_output_tokens: The maximum number of output tokens to generate per message. Defaults to `None`.\n            stop_sequences: A list of stop sequences. Defaults to `None`.\n            safety_settings: Safety configuration for returned content from the API. Defaults to `None`.\n            tools: A potential list of tools that can be used by the API. Defaults to `None`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n        from vertexai.generative_models import GenerationConfig\n\n        content: \"GenerationResponse\" = await self._aclient.generate_content_async(  # type: ignore\n            contents=self._chattype_to_content(input),\n            generation_config=GenerationConfig(\n                candidate_count=1,  # only one candidate allowed per call\n                temperature=temperature,\n                top_k=top_k,\n                top_p=top_p,\n                max_output_tokens=max_output_tokens,\n                stop_sequences=stop_sequences,\n            ),\n            safety_settings=safety_settings,  # type: ignore\n            tools=tools,  # type: ignore\n            stream=False,\n        )\n\n        text = None\n        try:\n            text = content.candidates[0].text\n        except ValueError:\n            self._logger.warning(  # type: ignore\n                f\"Received no response using VertexAI client (model: '{self.model}').\"\n                f\" Finish reason was: 
'{content.candidates[0].finish_reason}'.\"\n            )\n\n        return [text]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM.load","title":"load()","text":"

Loads the GenerativeModel class which has access to generate_content_async to benefit from async requests.

Source code in src/distilabel/models/llms/vertexai.py
def load(self) -> None:\n    \"\"\"Loads the `GenerativeModel` class which has access to `generate_content_async` to benefit from async requests.\"\"\"\n    super().load()\n\n    try:\n        from vertexai.generative_models import GenerationConfig, GenerativeModel\n\n        self._generation_config_class = GenerationConfig\n    except ImportError as e:\n        raise ImportError(\n            \"vertexai is not installed. Please install it using\"\n            \" `pip install google-cloud-aiplatform`.\"\n        ) from e\n\n    if _is_gemini_model(self.model):\n        self._aclient = GenerativeModel(model_name=self.model)\n    else:\n        raise NotImplementedError(\n            \"`VertexAILLM` is only implemented for `gemini` models that allow for `ChatType` data.\"\n        )\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM._chattype_to_content","title":"_chattype_to_content(input)","text":"

Converts a chat type to a list of content items expected by the API.

Parameters:

Name Type Description Default input StandardInput

the chat type to be converted.

required

Returns:

Type Description List[Content]

List[Content]: a list of content items expected by the API.

Source code in src/distilabel/models/llms/vertexai.py
def _chattype_to_content(self, input: \"StandardInput\") -> List[\"Content\"]:\n    \"\"\"Converts a chat type to a list of content items expected by the API.\n\n    Args:\n        input: the chat type to be converted.\n\n    Returns:\n        List[str]: a list of content items expected by the API.\n    \"\"\"\n    from vertexai.generative_models import Content, Part\n\n    contents = []\n    for message in input:\n        if message[\"role\"] not in [\"user\", \"model\"]:\n            raise ValueError(\n                \"`VertexAILLM only supports the roles 'user' or 'model'.\"\n            )\n        contents.append(\n            Content(\n                role=message[\"role\"], parts=[Part.from_text(message[\"content\"])]\n            )\n        )\n    return contents\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.VertexAILLM.agenerate","title":"agenerate(input, temperature=None, top_p=None, top_k=None, max_output_tokens=None, stop_sequences=None, safety_settings=None, tools=None) async","text":"

Generates a single response (only one candidate is requested per call) for the given input using the VertexAI async client definition.

Parameters:

Name Type Description Default input StandardInput

a single input in chat format to generate responses for.

required temperature Optional[float]

Controls the randomness of predictions. Range: [0.0, 1.0]. Defaults to None.

None top_p Optional[float]

If specified, nucleus sampling will be used. Range: (0.0, 1.0]. Defaults to None.

None top_k Optional[int]

If specified, top-k sampling will be used. Defaults to None.

None max_output_tokens Optional[int]

The maximum number of output tokens to generate per message. Defaults to None.

None stop_sequences Optional[List[str]]

A list of stop sequences. Defaults to None.

None safety_settings Optional[Dict[str, Any]]

Safety configuration for returned content from the API. Defaults to None.

None tools Optional[List[Dict[str, Any]]]

A potential list of tools that can be used by the API. Defaults to None.

None

Returns:

Type Description GenerateOutput

A list containing the generated response text for the input, or None in place of the text if no response was received.

Source code in src/distilabel/models/llms/vertexai.py
@validate_call\nasync def agenerate(  # type: ignore\n    self,\n    input: StandardInput,\n    temperature: Optional[float] = None,\n    top_p: Optional[float] = None,\n    top_k: Optional[int] = None,\n    max_output_tokens: Optional[int] = None,\n    stop_sequences: Optional[List[str]] = None,\n    safety_settings: Optional[Dict[str, Any]] = None,\n    tools: Optional[List[Dict[str, Any]]] = None,\n) -> GenerateOutput:\n    \"\"\"Generates `num_generations` responses for the given input using the [VertexAI async client definition](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini).\n\n    Args:\n        input: a single input in chat format to generate responses for.\n        temperature: Controls the randomness of predictions. Range: [0.0, 1.0]. Defaults to `None`.\n        top_p: If specified, nucleus sampling will be used. Range: (0.0, 1.0]. Defaults to `None`.\n        top_k: If specified, top-k sampling will be used. Defaults to `None`.\n        max_output_tokens: The maximum number of output tokens to generate per message. Defaults to `None`.\n        stop_sequences: A list of stop sequences. Defaults to `None`.\n        safety_settings: Safety configuration for returned content from the API. Defaults to `None`.\n        tools: A potential list of tools that can be used by the API. 
Defaults to `None`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n    from vertexai.generative_models import GenerationConfig\n\n    content: \"GenerationResponse\" = await self._aclient.generate_content_async(  # type: ignore\n        contents=self._chattype_to_content(input),\n        generation_config=GenerationConfig(\n            candidate_count=1,  # only one candidate allowed per call\n            temperature=temperature,\n            top_k=top_k,\n            top_p=top_p,\n            max_output_tokens=max_output_tokens,\n            stop_sequences=stop_sequences,\n        ),\n        safety_settings=safety_settings,  # type: ignore\n        tools=tools,  # type: ignore\n        stream=False,\n    )\n\n    text = None\n    try:\n        text = content.candidates[0].text\n    except ValueError:\n        self._logger.warning(  # type: ignore\n            f\"Received no response using VertexAI client (model: '{self.model}').\"\n            f\" Finish reason was: '{content.candidates[0].finish_reason}'.\"\n        )\n\n    return [text]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM","title":"ClientvLLM","text":"

Bases: OpenAILLM, MagpieChatTemplateMixin

A client for the vLLM server implementing the OpenAI API specification.

Attributes:

Name Type Description base_url

the base URL of the vLLM server. Defaults to \"http://localhost:8000\".

max_retries

the maximum number of times to retry the request to the API before failing. Defaults to 6.

timeout

the maximum time in seconds to wait for a response from the API. Defaults to 120.

httpx_client_kwargs

extra kwargs that will be passed to the httpx.AsyncClient created to communicate with the vLLM server. Defaults to None.

tokenizer Optional[str]

the Hugging Face Hub repo id or path of the tokenizer that will be used to apply the chat template and tokenize the inputs before sending it to the server. Defaults to None.

tokenizer_revision Optional[str]

the revision of the tokenizer to load. Defaults to None.

_aclient Optional[str]

the httpx.AsyncClient used to communicate with the vLLM server. Defaults to None.

Runtime parameters
  • base_url: the base url of the vLLM server. Defaults to \"http://localhost:8000\".
  • max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6.
  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.
  • httpx_client_kwargs: extra kwargs that will be passed to the httpx.AsyncClient created to communicate with the vLLM server. Defaults to None.

Examples:

Generate text:

from distilabel.models.llms import ClientvLLM\n\nllm = ClientvLLM(\n    base_url=\"http://localhost:8000/v1\",\n    tokenizer=\"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n)\n\nllm.load()\n\nresults = llm.generate_outputs(\n    inputs=[[{\"role\": \"user\", \"content\": \"Hello, how are you?\"}]],\n    temperature=0.7,\n    top_p=1.0,\n    max_new_tokens=256,\n)\n# [\n#     [\n#         \"I'm functioning properly, thank you for asking. How can I assist you today?\",\n#         \"I'm doing well, thank you for asking. I'm a large language model, so I don't have feelings or emotions like humans do, but I'm here to help answer any questions or provide information you might need. How can I assist you today?\",\n#         \"I'm just a computer program, so I don't have feelings like humans do, but I'm functioning properly and ready to help you with any questions or tasks you have. What's on your mind?\"\n#     ]\n# ]\n
Source code in src/distilabel/models/llms/vllm.py
class ClientvLLM(OpenAILLM, MagpieChatTemplateMixin):\n    \"\"\"A client for the `vLLM` server implementing the OpenAI API specification.\n\n    Attributes:\n        base_url: the base URL of the `vLLM` server. Defaults to `\"http://localhost:8000\"`.\n        max_retries: the maximum number of times to retry the request to the API before\n            failing. Defaults to `6`.\n        timeout: the maximum time in seconds to wait for a response from the API. Defaults\n            to `120`.\n        httpx_client_kwargs: extra kwargs that will be passed to the `httpx.AsyncClient`\n            created to comunicate with the `vLLM` server. Defaults to `None`.\n        tokenizer: the Hugging Face Hub repo id or path of the tokenizer that will be used\n            to apply the chat template and tokenize the inputs before sending it to the\n            server. Defaults to `None`.\n        tokenizer_revision: the revision of the tokenizer to load. Defaults to `None`.\n        _aclient: the `httpx.AsyncClient` used to comunicate with the `vLLM` server. Defaults\n            to `None`.\n\n    Runtime parameters:\n        - `base_url`: the base url of the `vLLM` server. Defaults to `\"http://localhost:8000\"`.\n        - `max_retries`: the maximum number of times to retry the request to the API before\n            failing. Defaults to `6`.\n        - `timeout`: the maximum time in seconds to wait for a response from the API. Defaults\n            to `120`.\n        - `httpx_client_kwargs`: extra kwargs that will be passed to the `httpx.AsyncClient`\n            created to comunicate with the `vLLM` server. 
Defaults to `None`.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import ClientvLLM\n\n        llm = ClientvLLM(\n            base_url=\"http://localhost:8000/v1\",\n            tokenizer=\"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n        )\n\n        llm.load()\n\n        results = llm.generate_outputs(\n            inputs=[[{\"role\": \"user\", \"content\": \"Hello, how are you?\"}]],\n            temperature=0.7,\n            top_p=1.0,\n            max_new_tokens=256,\n        )\n        # [\n        #     [\n        #         \"I'm functioning properly, thank you for asking. How can I assist you today?\",\n        #         \"I'm doing well, thank you for asking. I'm a large language model, so I don't have feelings or emotions like humans do, but I'm here to help answer any questions or provide information you might need. How can I assist you today?\",\n        #         \"I'm just a computer program, so I don't have feelings like humans do, but I'm functioning properly and ready to help you with any questions or tasks you have. 
What's on your mind?\"\n        #     ]\n        # ]\n        ```\n    \"\"\"\n\n    model: str = \"\"  # Default value so it's not needed to `ClientvLLM(model=\"...\")`\n    tokenizer: Optional[str] = None\n    tokenizer_revision: Optional[str] = None\n\n    # We need the sync client to get the list of models\n    _client: \"OpenAI\" = PrivateAttr(None)\n    _tokenizer: \"PreTrainedTokenizer\" = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Creates an `httpx.AsyncClient` to connect to the vLLM server and a tokenizer\n        optionally.\"\"\"\n\n        self.api_key = SecretStr(\"EMPTY\")\n\n        # We need to first create the sync client to get the model name that will be used\n        # in the `super().load()` when creating the logger.\n        try:\n            from openai import OpenAI\n        except ImportError as ie:\n            raise ImportError(\n                \"OpenAI Python client is not installed. Please install it using\"\n                \" `pip install openai`.\"\n            ) from ie\n\n        self._client = OpenAI(\n            base_url=self.base_url,\n            api_key=self.api_key.get_secret_value(),  # type: ignore\n            max_retries=self.max_retries,  # type: ignore\n            timeout=self.timeout,\n        )\n\n        super().load()\n\n        try:\n            from transformers import AutoTokenizer\n        except ImportError as ie:\n            raise ImportError(\n                \"To use `ClientvLLM` you need to install `transformers`.\"\n                \"Please install it using `pip install transformers`.\"\n            ) from ie\n\n        self._tokenizer = AutoTokenizer.from_pretrained(\n            self.tokenizer, revision=self.tokenizer_revision\n        )\n\n    @cached_property\n    def model_name(self) -> str:  # type: ignore\n        \"\"\"Returns the name of the model served with vLLM server.\"\"\"\n        models = self._client.models.list()\n        return models.data[0].id\n\n    def 
_prepare_input(self, input: \"StandardInput\") -> str:\n        \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n        input.\n\n        Args:\n            input: the input list containing chat items.\n\n        Returns:\n            The prompt to send to the LLM.\n        \"\"\"\n        prompt: str = (\n            self._tokenizer.apply_chat_template(  # type: ignore\n                input,  # type: ignore\n                tokenize=False,\n                add_generation_prompt=True,  # type: ignore\n            )\n            if input\n            else \"\"\n        )\n        return super().apply_magpie_pre_query_template(prompt, input)\n\n    @validate_call\n    async def agenerate(  # type: ignore\n        self,\n        input: FormattedInput,\n        num_generations: int = 1,\n        max_new_tokens: int = 128,\n        frequency_penalty: float = 0.0,\n        logit_bias: Optional[Dict[str, int]] = None,\n        presence_penalty: float = 0.0,\n        temperature: float = 1.0,\n        top_p: float = 1.0,\n    ) -> GenerateOutput:\n        \"\"\"Generates `num_generations` responses for each input.\n\n        Args:\n            input: a single input in chat format to generate responses for.\n            num_generations: the number of generations to create per input. Defaults to\n                `1`.\n            max_new_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `128`.\n            frequency_penalty: the repetition penalty to use for the generation. Defaults\n                to `0.0`.\n            logit_bias: modify the likelihood of specified tokens appearing in the completion.\n                Defaults to ``\n            presence_penalty: the presence penalty to use for the generation. Defaults to\n                `0.0`.\n            temperature: the temperature to use for the generation. Defaults to `0.1`.\n            top_p: nucleus sampling. 
The value refers to the top-p tokens that should be\n                considered for sampling. Defaults to `1.0`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n\n        completion = await self._aclient.completions.create(\n            model=self.model_name,\n            prompt=self._prepare_input(input),  # type: ignore\n            n=num_generations,\n            max_tokens=max_new_tokens,\n            frequency_penalty=frequency_penalty,\n            logit_bias=logit_bias,\n            presence_penalty=presence_penalty,\n            temperature=temperature,\n            top_p=top_p,\n        )\n\n        generations = []\n        for choice in completion.choices:\n            if (text := choice.text) == \"\":\n                self._logger.warning(  # type: ignore\n                    f\"Received no response from vLLM server (model: '{self.model_name}').\"\n                    f\" Finish reason was: {choice.finish_reason}\"\n                )\n            generations.append(text)\n        return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM.model_name","title":"model_name: str cached property","text":"

Returns the name of the model served with vLLM server.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM.load","title":"load()","text":"

Creates an httpx.AsyncClient to connect to the vLLM server and a tokenizer optionally.

Source code in src/distilabel/models/llms/vllm.py
def load(self) -> None:\n    \"\"\"Creates an `httpx.AsyncClient` to connect to the vLLM server and a tokenizer\n    optionally.\"\"\"\n\n    self.api_key = SecretStr(\"EMPTY\")\n\n    # We need to first create the sync client to get the model name that will be used\n    # in the `super().load()` when creating the logger.\n    try:\n        from openai import OpenAI\n    except ImportError as ie:\n        raise ImportError(\n            \"OpenAI Python client is not installed. Please install it using\"\n            \" `pip install openai`.\"\n        ) from ie\n\n    self._client = OpenAI(\n        base_url=self.base_url,\n        api_key=self.api_key.get_secret_value(),  # type: ignore\n        max_retries=self.max_retries,  # type: ignore\n        timeout=self.timeout,\n    )\n\n    super().load()\n\n    try:\n        from transformers import AutoTokenizer\n    except ImportError as ie:\n        raise ImportError(\n            \"To use `ClientvLLM` you need to install `transformers`.\"\n            \"Please install it using `pip install transformers`.\"\n        ) from ie\n\n    self._tokenizer = AutoTokenizer.from_pretrained(\n        self.tokenizer, revision=self.tokenizer_revision\n    )\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM._prepare_input","title":"_prepare_input(input)","text":"

Prepares the input (applying the chat template and tokenization) for the provided input.

Parameters:

Name Type Description Default input StandardInput

the input list containing chat items.

required

Returns:

Type Description str

The prompt to send to the LLM.

Source code in src/distilabel/models/llms/vllm.py
def _prepare_input(self, input: \"StandardInput\") -> str:\n    \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n    input.\n\n    Args:\n        input: the input list containing chat items.\n\n    Returns:\n        The prompt to send to the LLM.\n    \"\"\"\n    prompt: str = (\n        self._tokenizer.apply_chat_template(  # type: ignore\n            input,  # type: ignore\n            tokenize=False,\n            add_generation_prompt=True,  # type: ignore\n        )\n        if input\n        else \"\"\n    )\n    return super().apply_magpie_pre_query_template(prompt, input)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.ClientvLLM.agenerate","title":"agenerate(input, num_generations=1, max_new_tokens=128, frequency_penalty=0.0, logit_bias=None, presence_penalty=0.0, temperature=1.0, top_p=1.0) async","text":"

Generates num_generations responses for each input.

Parameters:

Name Type Description Default input FormattedInput

a single input in chat format to generate responses for.

required num_generations int

the number of generations to create per input. Defaults to 1.

1 max_new_tokens int

the maximum number of new tokens that the model will generate. Defaults to 128.

128 frequency_penalty float

the repetition penalty to use for the generation. Defaults to 0.0.

0.0 logit_bias Optional[Dict[str, int]]

modify the likelihood of specified tokens appearing in the completion. Defaults to None.

None presence_penalty float

the presence penalty to use for the generation. Defaults to 0.0.

0.0 temperature float

the temperature to use for the generation. Defaults to 1.0.

1.0 top_p float

nucleus sampling. The value refers to the top-p tokens that should be considered for sampling. Defaults to 1.0.

1.0

Returns:

Type Description GenerateOutput

A list of strings containing the generated responses for the input.

Source code in src/distilabel/models/llms/vllm.py
@validate_call\nasync def agenerate(  # type: ignore\n    self,\n    input: FormattedInput,\n    num_generations: int = 1,\n    max_new_tokens: int = 128,\n    frequency_penalty: float = 0.0,\n    logit_bias: Optional[Dict[str, int]] = None,\n    presence_penalty: float = 0.0,\n    temperature: float = 1.0,\n    top_p: float = 1.0,\n) -> GenerateOutput:\n    \"\"\"Generates `num_generations` responses for each input.\n\n    Args:\n        input: a single input in chat format to generate responses for.\n        num_generations: the number of generations to create per input. Defaults to\n            `1`.\n        max_new_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `128`.\n        frequency_penalty: the repetition penalty to use for the generation. Defaults\n            to `0.0`.\n        logit_bias: modify the likelihood of specified tokens appearing in the completion.\n            Defaults to ``\n        presence_penalty: the presence penalty to use for the generation. Defaults to\n            `0.0`.\n        temperature: the temperature to use for the generation. Defaults to `0.1`.\n        top_p: nucleus sampling. The value refers to the top-p tokens that should be\n            considered for sampling. 
Defaults to `1.0`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n\n    completion = await self._aclient.completions.create(\n        model=self.model_name,\n        prompt=self._prepare_input(input),  # type: ignore\n        n=num_generations,\n        max_tokens=max_new_tokens,\n        frequency_penalty=frequency_penalty,\n        logit_bias=logit_bias,\n        presence_penalty=presence_penalty,\n        temperature=temperature,\n        top_p=top_p,\n    )\n\n    generations = []\n    for choice in completion.choices:\n        if (text := choice.text) == \"\":\n            self._logger.warning(  # type: ignore\n                f\"Received no response from vLLM server (model: '{self.model_name}').\"\n                f\" Finish reason was: {choice.finish_reason}\"\n            )\n        generations.append(text)\n    return generations\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM","title":"vLLM","text":"

Bases: LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin

vLLM library LLM implementation.

Attributes:

Name Type Description model str

the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

dtype str

the data type to use for the model. Defaults to auto.

trust_remote_code bool

whether to trust the remote code when loading the model. Defaults to False.

quantization Optional[str]

the quantization mode to use for the model. Defaults to None.

revision Optional[str]

the revision of the model to load. Defaults to None.

tokenizer Optional[str]

the tokenizer Hugging Face Hub repo id or a path to a directory containing the tokenizer files. If not provided, the tokenizer will be loaded from the model directory. Defaults to None.

tokenizer_mode Literal['auto', 'slow']

the mode to use for the tokenizer. Defaults to auto.

tokenizer_revision Optional[str]

the revision of the tokenizer to load. Defaults to None.

skip_tokenizer_init bool

whether to skip the initialization of the tokenizer. Defaults to False.

chat_template Optional[str]

a chat template that will be used to build the prompts before sending them to the model. If not provided, the chat template defined in the tokenizer config will be used. If not provided and the tokenizer doesn't have a chat template, then ChatML template will be used. Defaults to None.

structured_output Optional[RuntimeParameter[OutlinesStructuredOutputType]]

a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput. Defaults to None.

seed int

the seed to use for the random number generator. Defaults to 0.

extra_kwargs Optional[RuntimeParameter[Dict[str, Any]]]

additional dictionary of keyword arguments that will be passed to the LLM class of vllm library. Defaults to {}.

_model LLM

the vLLM model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method.

_tokenizer PreTrainedTokenizer

the tokenizer instance used to format the prompt before passing it to the LLM. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method.

use_magpie_template PreTrainedTokenizer

a flag used to enable/disable applying the Magpie pre-query template. Defaults to False.

magpie_pre_query_template PreTrainedTokenizer

the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None.

References
  • https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py
Runtime parameters
  • extra_kwargs: additional dictionary of keyword arguments that will be passed to the LLM class of vllm library.

Examples:

Generate text:

from distilabel.models.llms import vLLM\n\n# You can pass a custom chat_template to the model\nllm = vLLM(\n    model=\"prometheus-eval/prometheus-7b-v2.0\",\n    chat_template=\"[INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST]\",\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n

Generate structured data:

from pathlib import Path\nfrom pydantic import BaseModel\nfrom distilabel.models.llms import vLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = vLLM(\n    model=\"prometheus-eval/prometheus-7b-v2.0\",\n    structured_output={\"format\": \"json\", \"schema\": User},\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
Source code in src/distilabel/models/llms/vllm.py
class vLLM(LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin):\n    \"\"\"`vLLM` library LLM implementation.\n\n    Attributes:\n        model: the model Hugging Face Hub repo id or a path to a directory containing the\n            model weights and configuration files.\n        dtype: the data type to use for the model. Defaults to `auto`.\n        trust_remote_code: whether to trust the remote code when loading the model. Defaults\n            to `False`.\n        quantization: the quantization mode to use for the model. Defaults to `None`.\n        revision: the revision of the model to load. Defaults to `None`.\n        tokenizer: the tokenizer Hugging Face Hub repo id or a path to a directory containing\n            the tokenizer files. If not provided, the tokenizer will be loaded from the\n            model directory. Defaults to `None`.\n        tokenizer_mode: the mode to use for the tokenizer. Defaults to `auto`.\n        tokenizer_revision: the revision of the tokenizer to load. Defaults to `None`.\n        skip_tokenizer_init: whether to skip the initialization of the tokenizer. Defaults\n            to `False`.\n        chat_template: a chat template that will be used to build the prompts before\n            sending them to the model. If not provided, the chat template defined in the\n            tokenizer config will be used. If not provided and the tokenizer doesn't have\n            a chat template, then ChatML template will be used. Defaults to `None`.\n        structured_output: a dictionary containing the structured output configuration or if more\n            fine-grained control is needed, an instance of `OutlinesStructuredOutput`. Defaults to None.\n        seed: the seed to use for the random number generator. Defaults to `0`.\n        extra_kwargs: additional dictionary of keyword arguments that will be passed to the\n            `LLM` class of `vllm` library. Defaults to `{}`.\n        _model: the `vLLM` model instance. 
This attribute is meant to be used internally\n            and should not be accessed directly. It will be set in the `load` method.\n        _tokenizer: the tokenizer instance used to format the prompt before passing it to\n            the `LLM`. This attribute is meant to be used internally and should not be\n            accessed directly. It will be set in the `load` method.\n        use_magpie_template: a flag used to enable/disable applying the Magpie pre-query\n            template. Defaults to `False`.\n        magpie_pre_query_template: the pre-query template to be applied to the prompt or\n            sent to the LLM to generate an instruction or a follow up user message. Valid\n            values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults\n            to `None`.\n\n    References:\n        - https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py\n\n    Runtime parameters:\n        - `extra_kwargs`: additional dictionary of keyword arguments that will be passed to\n            the `LLM` class of `vllm` library.\n\n    Examples:\n        Generate text:\n\n        ```python\n        from distilabel.models.llms import vLLM\n\n        # You can pass a custom chat_template to the model\n        llm = vLLM(\n            model=\"prometheus-eval/prometheus-7b-v2.0\",\n            chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n        )\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n        ```\n\n        Generate structured data:\n\n        ```python\n        from pathlib import Path\n        from distilabel.models.llms import vLLM\n\n        class User(BaseModel):\n            name: str\n            last_name: str\n            id: int\n\n        llm = vLLM(\n            model=\"prometheus-eval/prometheus-7b-v2.0\"\n            
structured_output={\"format\": \"json\", \"schema\": Character},\n        )\n\n        llm.load()\n\n        # Call the model\n        output = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n        ```\n    \"\"\"\n\n    model: str\n    dtype: str = \"auto\"\n    trust_remote_code: bool = False\n    quantization: Optional[str] = None\n    revision: Optional[str] = None\n\n    tokenizer: Optional[str] = None\n    tokenizer_mode: Literal[\"auto\", \"slow\"] = \"auto\"\n    tokenizer_revision: Optional[str] = None\n    skip_tokenizer_init: bool = False\n    chat_template: Optional[str] = None\n\n    seed: int = 0\n\n    extra_kwargs: Optional[RuntimeParameter[Dict[str, Any]]] = Field(\n        default_factory=dict,\n        description=\"Additional dictionary of keyword arguments that will be passed to the\"\n        \" `vLLM` class of `vllm` library. See all the supported arguments at: \"\n        \"https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py\",\n    )\n    structured_output: Optional[RuntimeParameter[OutlinesStructuredOutputType]] = Field(\n        default=None,\n        description=\"The structured output format to use across all the generations.\",\n    )\n\n    _model: \"_vLLM\" = PrivateAttr(None)\n    _tokenizer: \"PreTrainedTokenizer\" = PrivateAttr(None)\n    _structured_output_logits_processor: Optional[Callable] = PrivateAttr(default=None)\n\n    def load(self) -> None:\n        \"\"\"Loads the `vLLM` model using either the path or the Hugging Face Hub repository id.\n        Additionally, this method also sets the `chat_template` for the tokenizer, so as to properly\n        parse the list of OpenAI formatted inputs using the expected format by the model, otherwise, the\n        default value is ChatML format, unless explicitly provided.\n        \"\"\"\n        super().load()\n\n        CudaDevicePlacementMixin.load(self)\n\n        try:\n            
from vllm import LLM as _vLLM\n        except ImportError as ie:\n            raise ImportError(\n                \"vLLM is not installed. Please install it using `pip install vllm`.\"\n            ) from ie\n\n        self._model = _vLLM(\n            self.model,\n            dtype=self.dtype,\n            trust_remote_code=self.trust_remote_code,\n            quantization=self.quantization,\n            revision=self.revision,\n            tokenizer=self.tokenizer,\n            tokenizer_mode=self.tokenizer_mode,\n            tokenizer_revision=self.tokenizer_revision,\n            skip_tokenizer_init=self.skip_tokenizer_init,\n            seed=self.seed,\n            **self.extra_kwargs,  # type: ignore\n        )\n\n        self._tokenizer = self._model.get_tokenizer()  # type: ignore\n        if self.chat_template is not None:\n            self._tokenizer.chat_template = self.chat_template  # type: ignore\n\n        if self.structured_output:\n            self._structured_output_logits_processor = self._prepare_structured_output(\n                self.structured_output\n            )\n\n    def unload(self) -> None:\n        \"\"\"Unloads the `vLLM` model.\"\"\"\n        self._model = None  # type: ignore\n        self._tokenizer = None  # type: ignore\n        CudaDevicePlacementMixin.unload(self)\n        super().unload()\n\n    @property\n    def model_name(self) -> str:\n        \"\"\"Returns the model name used for the LLM.\"\"\"\n        return self.model\n\n    def prepare_input(self, input: \"StandardInput\") -> str:\n        \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n        input.\n\n        Args:\n            input: the input list containing chat items.\n\n        Returns:\n            The prompt to send to the LLM.\n        \"\"\"\n        if self._tokenizer.chat_template is None:\n            return input[0][\"content\"]\n\n        prompt: str = (\n            self._tokenizer.apply_chat_template(\n     
           input,  # type: ignore\n                tokenize=False,\n                add_generation_prompt=True,  # type: ignore\n            )\n            if input\n            else \"\"\n        )\n        return super().apply_magpie_pre_query_template(prompt, input)\n\n    def _prepare_batches(\n        self, inputs: List[FormattedInput]\n    ) -> Tuple[List[List[FormattedInput]], List[int]]:\n        \"\"\"Prepares the inputs by grouping them by the structured output.\n\n        When we generate structured outputs with schemas obtained from a dataset, we need to\n        prepare the data to try to send batches of inputs instead of single inputs to the model\n        to take advante of the engine. So we group the inputs by the structured output to be\n        passed in the `generate` method.\n\n        Args:\n            inputs: The batch of inputs passed to the generate method. As we expect to be generating\n                structured outputs, each element will be a tuple containing the instruction and the\n                structured output.\n\n        Returns:\n            The prepared batches (sub-batches let's say) to be passed to the `generate` method.\n            Each new tuple will contain instead of the single instruction, a list of instructions\n        \"\"\"\n        instruction_order = {}\n        batches = {}\n        for i, (instruction, structured_output) in enumerate(inputs):\n            instruction = self.prepare_input(instruction)\n            instruction_order[instruction] = i\n            structured_output = json.dumps(structured_output)\n            if structured_output not in batches:\n                batches[structured_output] = [instruction]\n            else:\n                batches[structured_output].append(instruction)\n\n        # Flatten the instructions in prepared_data\n        flat_instructions = [\n            instruction for _, group in batches.items() for instruction in group\n        ]\n        # Generate the list of 
indices based on the original order\n        sorted_indices = [\n            instruction_order[instruction] for instruction in flat_instructions\n        ]\n        return [\n            (batch, json.loads(schema)) for schema, batch in batches.items()\n        ], sorted_indices\n\n    @validate_call\n    def generate(  # type: ignore\n        self,\n        inputs: List[FormattedInput],\n        num_generations: int = 1,\n        max_new_tokens: int = 128,\n        presence_penalty: float = 0.0,\n        frequency_penalty: float = 0.0,\n        repetition_penalty: float = 1.0,\n        temperature: float = 1.0,\n        top_p: float = 1.0,\n        top_k: int = -1,\n        min_p: float = 0.0,\n        stop: Optional[List[str]] = None,\n        stop_token_ids: Optional[List[int]] = None,\n        include_stop_str_in_output: bool = False,\n        logits_processors: Optional[LogitsProcessors] = None,\n        extra_sampling_params: Optional[Dict[str, Any]] = None,\n    ) -> List[GenerateOutput]:\n        \"\"\"Generates `num_generations` responses for each input.\n\n        Args:\n            inputs: a list of inputs in chat format to generate responses for.\n            num_generations: the number of generations to create per input. Defaults to\n                `1`.\n            max_new_tokens: the maximum number of new tokens that the model will generate.\n                Defaults to `128`.\n            presence_penalty: the presence penalty to use for the generation. Defaults to\n                `0.0`.\n            frequency_penalty: the repetition penalty to use for the generation. Defaults\n                to `0.0`.\n            repetition_penalty: the repetition penalty to use for the generation Defaults to\n                `1.0`.\n            temperature: the temperature to use for the generation. Defaults to `0.1`.\n            top_p: the top-p value to use for the generation. Defaults to `1.0`.\n            top_k: the top-k value to use for the generation. 
Defaults to `0`.\n            min_p: the minimum probability to use for the generation. Defaults to `0.0`.\n            stop: a list of strings that will be used to stop the generation when found.\n                Defaults to `None`.\n            stop_token_ids: a list of token ids that will be used to stop the generation\n                when found. Defaults to `None`.\n            include_stop_str_in_output: whether to include the stop string in the output.\n                Defaults to `False`.\n            logits_processors: a list of functions to process the logits before sampling.\n                Defaults to `None`.\n            extra_sampling_params: dictionary with additional arguments to be passed to\n                the `SamplingParams` class from `vllm`.\n\n        Returns:\n            A list of lists of strings containing the generated responses for each input.\n        \"\"\"\n        from vllm import SamplingParams\n\n        if not logits_processors:\n            logits_processors = []\n\n        if extra_sampling_params is None:\n            extra_sampling_params = {}\n\n        structured_output = None\n\n        if isinstance(inputs[0], tuple):\n            prepared_batches, sorted_indices = self._prepare_batches(inputs)\n        else:\n            # Simulate a batch without the structured output content\n            prepared_batches = [([self.prepare_input(input) for input in inputs], None)]\n            sorted_indices = None\n\n        # Case in which we have a single structured output for the dataset\n        if self._structured_output_logits_processor:\n            logits_processors.append(self._structured_output_logits_processor)\n\n        batched_outputs = []\n\n        for prepared_inputs, structured_output in prepared_batches:\n            if structured_output:\n                logits_processors.append(\n                    self._prepare_structured_output(structured_output)\n                )\n\n            sampling_params = 
SamplingParams(  # type: ignore\n                n=num_generations,\n                presence_penalty=presence_penalty,\n                frequency_penalty=frequency_penalty,\n                repetition_penalty=repetition_penalty,\n                temperature=temperature,\n                top_p=top_p,\n                top_k=top_k,\n                min_p=min_p,\n                max_tokens=max_new_tokens,\n                stop=stop,\n                stop_token_ids=stop_token_ids,\n                include_stop_str_in_output=include_stop_str_in_output,\n                logits_processors=logits_processors,\n                **extra_sampling_params,\n            )\n\n            batch_outputs = self._model.generate(\n                prepared_inputs,\n                sampling_params,\n                use_tqdm=False,  # type: ignore\n            )\n\n            batched_outputs += [\n                [output.text for output in outputs.outputs] for outputs in batch_outputs\n            ]\n\n        # If logits_processor is set, we need to sort the outputs back to the original order\n        # (would be needed only if we have multiple structured outputs in the dataset)\n        if sorted_indices is not None:\n            batched_outputs = _sort_batches(\n                batched_outputs, sorted_indices, num_generations=num_generations\n            )\n        return batched_outputs\n\n    def _prepare_structured_output(\n        self, structured_output: Optional[OutlinesStructuredOutputType] = None\n    ) -> Union[Callable, None]:\n        \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n        Args:\n            structured_output: the configuration dict to prepare the structured output.\n\n        Returns:\n            The callable that will be used to guide the generation of the model.\n        \"\"\"\n        from distilabel.steps.tasks.structured_outputs.outlines import (\n            prepare_guided_output,\n        )\n\n        
result = prepare_guided_output(structured_output, \"vllm\", self._model)\n        if (schema := result.get(\"schema\")) and self.structured_output:\n            self.structured_output[\"schema\"] = schema\n        return result[\"processor\"]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.model_name","title":"model_name: str property","text":"

Returns the model name used for the LLM.

"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.load","title":"load()","text":"

Loads the vLLM model using either the path or the Hugging Face Hub repository id. Additionally, this method sets the chat_template for the tokenizer, so as to properly parse the list of OpenAI-formatted inputs using the format expected by the model; otherwise, the default value is the ChatML format, unless explicitly provided.

Source code in src/distilabel/models/llms/vllm.py
def load(self) -> None:\n    \"\"\"Loads the `vLLM` model using either the path or the Hugging Face Hub repository id.\n    Additionally, this method also sets the `chat_template` for the tokenizer, so as to properly\n    parse the list of OpenAI formatted inputs using the expected format by the model, otherwise, the\n    default value is ChatML format, unless explicitly provided.\n    \"\"\"\n    super().load()\n\n    CudaDevicePlacementMixin.load(self)\n\n    try:\n        from vllm import LLM as _vLLM\n    except ImportError as ie:\n        raise ImportError(\n            \"vLLM is not installed. Please install it using `pip install vllm`.\"\n        ) from ie\n\n    self._model = _vLLM(\n        self.model,\n        dtype=self.dtype,\n        trust_remote_code=self.trust_remote_code,\n        quantization=self.quantization,\n        revision=self.revision,\n        tokenizer=self.tokenizer,\n        tokenizer_mode=self.tokenizer_mode,\n        tokenizer_revision=self.tokenizer_revision,\n        skip_tokenizer_init=self.skip_tokenizer_init,\n        seed=self.seed,\n        **self.extra_kwargs,  # type: ignore\n    )\n\n    self._tokenizer = self._model.get_tokenizer()  # type: ignore\n    if self.chat_template is not None:\n        self._tokenizer.chat_template = self.chat_template  # type: ignore\n\n    if self.structured_output:\n        self._structured_output_logits_processor = self._prepare_structured_output(\n            self.structured_output\n        )\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.unload","title":"unload()","text":"

Unloads the vLLM model.

Source code in src/distilabel/models/llms/vllm.py
def unload(self) -> None:\n    \"\"\"Unloads the `vLLM` model.\"\"\"\n    self._model = None  # type: ignore\n    self._tokenizer = None  # type: ignore\n    CudaDevicePlacementMixin.unload(self)\n    super().unload()\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.prepare_input","title":"prepare_input(input)","text":"

Prepares the input (applying the chat template and tokenization) for the provided input.

Parameters:

Name Type Description Default input StandardInput

the input list containing chat items.

required

Returns:

Type Description str

The prompt to send to the LLM.

Source code in src/distilabel/models/llms/vllm.py
def prepare_input(self, input: \"StandardInput\") -> str:\n    \"\"\"Prepares the input (applying the chat template and tokenization) for the provided\n    input.\n\n    Args:\n        input: the input list containing chat items.\n\n    Returns:\n        The prompt to send to the LLM.\n    \"\"\"\n    if self._tokenizer.chat_template is None:\n        return input[0][\"content\"]\n\n    prompt: str = (\n        self._tokenizer.apply_chat_template(\n            input,  # type: ignore\n            tokenize=False,\n            add_generation_prompt=True,  # type: ignore\n        )\n        if input\n        else \"\"\n    )\n    return super().apply_magpie_pre_query_template(prompt, input)\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM._prepare_batches","title":"_prepare_batches(inputs)","text":"

Prepares the inputs by grouping them by the structured output.

When we generate structured outputs with schemas obtained from a dataset, we need to prepare the data to try to send batches of inputs instead of single inputs to the model to take advantage of the engine. So we group the inputs by the structured output to be passed in the generate method.

Parameters:

Name Type Description Default inputs List[FormattedInput]

The batch of inputs passed to the generate method. As we expect to be generating structured outputs, each element will be a tuple containing the instruction and the structured output.

required

Returns:

Type Description List[List[FormattedInput]]

The prepared batches (sub-batches let's say) to be passed to the generate method.

List[int]

The indices of the inputs in their original order, used to sort the generated outputs back. Each new tuple will contain, instead of the single instruction, a list of instructions.

Source code in src/distilabel/models/llms/vllm.py
def _prepare_batches(\n    self, inputs: List[FormattedInput]\n) -> Tuple[List[List[FormattedInput]], List[int]]:\n    \"\"\"Prepares the inputs by grouping them by the structured output.\n\n    When we generate structured outputs with schemas obtained from a dataset, we need to\n    prepare the data to try to send batches of inputs instead of single inputs to the model\n    to take advante of the engine. So we group the inputs by the structured output to be\n    passed in the `generate` method.\n\n    Args:\n        inputs: The batch of inputs passed to the generate method. As we expect to be generating\n            structured outputs, each element will be a tuple containing the instruction and the\n            structured output.\n\n    Returns:\n        The prepared batches (sub-batches let's say) to be passed to the `generate` method.\n        Each new tuple will contain instead of the single instruction, a list of instructions\n    \"\"\"\n    instruction_order = {}\n    batches = {}\n    for i, (instruction, structured_output) in enumerate(inputs):\n        instruction = self.prepare_input(instruction)\n        instruction_order[instruction] = i\n        structured_output = json.dumps(structured_output)\n        if structured_output not in batches:\n            batches[structured_output] = [instruction]\n        else:\n            batches[structured_output].append(instruction)\n\n    # Flatten the instructions in prepared_data\n    flat_instructions = [\n        instruction for _, group in batches.items() for instruction in group\n    ]\n    # Generate the list of indices based on the original order\n    sorted_indices = [\n        instruction_order[instruction] for instruction in flat_instructions\n    ]\n    return [\n        (batch, json.loads(schema)) for schema, batch in batches.items()\n    ], sorted_indices\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM.generate","title":"generate(inputs, num_generations=1, max_new_tokens=128, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=-1, min_p=0.0, stop=None, stop_token_ids=None, include_stop_str_in_output=False, logits_processors=None, extra_sampling_params=None)","text":"

Generates num_generations responses for each input.

Parameters:

Name Type Description Default inputs List[FormattedInput]

a list of inputs in chat format to generate responses for.

required num_generations int

the number of generations to create per input. Defaults to 1.

1 max_new_tokens int

the maximum number of new tokens that the model will generate. Defaults to 128.

128 presence_penalty float

the presence penalty to use for the generation. Defaults to 0.0.

0.0 frequency_penalty float

the frequency penalty to use for the generation. Defaults to 0.0.

0.0 repetition_penalty float

the repetition penalty to use for the generation. Defaults to 1.0.

1.0 temperature float

the temperature to use for the generation. Defaults to 1.0.

1.0 top_p float

the top-p value to use for the generation. Defaults to 1.0.

1.0 top_k int

the top-k value to use for the generation. Defaults to -1.

-1 min_p float

the minimum probability to use for the generation. Defaults to 0.0.

0.0 stop Optional[List[str]]

a list of strings that will be used to stop the generation when found. Defaults to None.

None stop_token_ids Optional[List[int]]

a list of token ids that will be used to stop the generation when found. Defaults to None.

None include_stop_str_in_output bool

whether to include the stop string in the output. Defaults to False.

False logits_processors Optional[LogitsProcessors]

a list of functions to process the logits before sampling. Defaults to None.

None extra_sampling_params Optional[Dict[str, Any]]

dictionary with additional arguments to be passed to the SamplingParams class from vllm.

None

Returns:

Type Description List[GenerateOutput]

A list of lists of strings containing the generated responses for each input.

Source code in src/distilabel/models/llms/vllm.py
@validate_call\ndef generate(  # type: ignore\n    self,\n    inputs: List[FormattedInput],\n    num_generations: int = 1,\n    max_new_tokens: int = 128,\n    presence_penalty: float = 0.0,\n    frequency_penalty: float = 0.0,\n    repetition_penalty: float = 1.0,\n    temperature: float = 1.0,\n    top_p: float = 1.0,\n    top_k: int = -1,\n    min_p: float = 0.0,\n    stop: Optional[List[str]] = None,\n    stop_token_ids: Optional[List[int]] = None,\n    include_stop_str_in_output: bool = False,\n    logits_processors: Optional[LogitsProcessors] = None,\n    extra_sampling_params: Optional[Dict[str, Any]] = None,\n) -> List[GenerateOutput]:\n    \"\"\"Generates `num_generations` responses for each input.\n\n    Args:\n        inputs: a list of inputs in chat format to generate responses for.\n        num_generations: the number of generations to create per input. Defaults to\n            `1`.\n        max_new_tokens: the maximum number of new tokens that the model will generate.\n            Defaults to `128`.\n        presence_penalty: the presence penalty to use for the generation. Defaults to\n            `0.0`.\n        frequency_penalty: the repetition penalty to use for the generation. Defaults\n            to `0.0`.\n        repetition_penalty: the repetition penalty to use for the generation Defaults to\n            `1.0`.\n        temperature: the temperature to use for the generation. Defaults to `0.1`.\n        top_p: the top-p value to use for the generation. Defaults to `1.0`.\n        top_k: the top-k value to use for the generation. Defaults to `0`.\n        min_p: the minimum probability to use for the generation. Defaults to `0.0`.\n        stop: a list of strings that will be used to stop the generation when found.\n            Defaults to `None`.\n        stop_token_ids: a list of token ids that will be used to stop the generation\n            when found. 
Defaults to `None`.\n        include_stop_str_in_output: whether to include the stop string in the output.\n            Defaults to `False`.\n        logits_processors: a list of functions to process the logits before sampling.\n            Defaults to `None`.\n        extra_sampling_params: dictionary with additional arguments to be passed to\n            the `SamplingParams` class from `vllm`.\n\n    Returns:\n        A list of lists of strings containing the generated responses for each input.\n    \"\"\"\n    from vllm import SamplingParams\n\n    if not logits_processors:\n        logits_processors = []\n\n    if extra_sampling_params is None:\n        extra_sampling_params = {}\n\n    structured_output = None\n\n    if isinstance(inputs[0], tuple):\n        prepared_batches, sorted_indices = self._prepare_batches(inputs)\n    else:\n        # Simulate a batch without the structured output content\n        prepared_batches = [([self.prepare_input(input) for input in inputs], None)]\n        sorted_indices = None\n\n    # Case in which we have a single structured output for the dataset\n    if self._structured_output_logits_processor:\n        logits_processors.append(self._structured_output_logits_processor)\n\n    batched_outputs = []\n\n    for prepared_inputs, structured_output in prepared_batches:\n        if structured_output:\n            logits_processors.append(\n                self._prepare_structured_output(structured_output)\n            )\n\n        sampling_params = SamplingParams(  # type: ignore\n            n=num_generations,\n            presence_penalty=presence_penalty,\n            frequency_penalty=frequency_penalty,\n            repetition_penalty=repetition_penalty,\n            temperature=temperature,\n            top_p=top_p,\n            top_k=top_k,\n            min_p=min_p,\n            max_tokens=max_new_tokens,\n            stop=stop,\n            stop_token_ids=stop_token_ids,\n            
include_stop_str_in_output=include_stop_str_in_output,\n            logits_processors=logits_processors,\n            **extra_sampling_params,\n        )\n\n        batch_outputs = self._model.generate(\n            prepared_inputs,\n            sampling_params,\n            use_tqdm=False,  # type: ignore\n        )\n\n        batched_outputs += [\n            [output.text for output in outputs.outputs] for outputs in batch_outputs\n        ]\n\n    # If logits_processor is set, we need to sort the outputs back to the original order\n    # (would be needed only if we have multiple structured outputs in the dataset)\n    if sorted_indices is not None:\n        batched_outputs = _sort_batches(\n            batched_outputs, sorted_indices, num_generations=num_generations\n        )\n    return batched_outputs\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.vLLM._prepare_structured_output","title":"_prepare_structured_output(structured_output=None)","text":"

Creates the appropriate function to filter tokens to generate structured outputs.

Parameters:

Name Type Description Default structured_output Optional[OutlinesStructuredOutputType]

the configuration dict to prepare the structured output.

None

Returns:

Type Description Union[Callable, None]

The callable that will be used to guide the generation of the model.

Source code in src/distilabel/models/llms/vllm.py
def _prepare_structured_output(\n    self, structured_output: Optional[OutlinesStructuredOutputType] = None\n) -> Union[Callable, None]:\n    \"\"\"Creates the appropriate function to filter tokens to generate structured outputs.\n\n    Args:\n        structured_output: the configuration dict to prepare the structured output.\n\n    Returns:\n        The callable that will be used to guide the generation of the model.\n    \"\"\"\n    from distilabel.steps.tasks.structured_outputs.outlines import (\n        prepare_guided_output,\n    )\n\n    result = prepare_guided_output(structured_output, \"vllm\", self._model)\n    if (schema := result.get(\"schema\")) and self.structured_output:\n        self.structured_output[\"schema\"] = schema\n    return result[\"processor\"]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin","title":"CudaDevicePlacementMixin","text":"

Bases: BaseModel

Mixin class to assign CUDA devices to the LLM based on the cuda_devices attribute and the device placement information provided in _device_llm_placement_map. Providing the device placement information is optional, but if it is provided, it will be used to assign CUDA devices to the LLMs, trying to avoid using the same device for different LLMs.

Attributes:

Name Type Description cuda_devices RuntimeParameter[Union[List[int], Literal['auto']]]

a list with the ID of the CUDA devices to be used by the LLM. If set to \"auto\", the devices will be automatically assigned based on the device placement information provided in _device_llm_placement_map. If set to a list of devices, it will be checked if the devices are available to be used by the LLM. If not, a warning will be logged.

disable_cuda_device_placement RuntimeParameter[bool]

Whether to disable the CUDA device placement logic or not. Defaults to False.

_llm_identifier Union[str, None]

the identifier of the LLM to be used as key in _device_llm_placement_map.

_device_llm_placement_map Generator[Dict[str, List[int]], None, None]

a dictionary with the device placement information for each LLM.

Source code in src/distilabel/models/mixins/cuda_device_placement.py
class CudaDevicePlacementMixin(BaseModel):\n    \"\"\"Mixin class to assign CUDA devices to the `LLM` based on the `cuda_devices` attribute\n    and the device placement information provided in `_device_llm_placement_map`. Providing\n    the device placement information is optional, but if it is provided, it will be used to\n    assign CUDA devices to the `LLM`s, trying to avoid using the same device for different\n    `LLM`s.\n\n    Attributes:\n        cuda_devices: a list with the ID of the CUDA devices to be used by the `LLM`. If set\n            to \"auto\", the devices will be automatically assigned based on the device\n            placement information provided in `_device_llm_placement_map`. If set to a list\n            of devices, it will be checked if the devices are available to be used by the\n            `LLM`. If not, a warning will be logged.\n        disable_cuda_device_placement: Whether to disable the CUDA device placement logic\n            or not. Defaults to `False`.\n        _llm_identifier: the identifier of the `LLM` to be used as key in `_device_llm_placement_map`.\n        _device_llm_placement_map: a dictionary with the device placement information for each\n            `LLM`.\n    \"\"\"\n\n    cuda_devices: RuntimeParameter[Union[List[int], Literal[\"auto\"]]] = Field(\n        default=\"auto\", description=\"A list with the ID of the CUDA devices to be used.\"\n    )\n    disable_cuda_device_placement: RuntimeParameter[bool] = Field(\n        default=False,\n        description=\"Whether to disable the CUDA device placement logic or not.\",\n    )\n\n    _llm_identifier: Union[str, None] = PrivateAttr(default=None)\n    _desired_num_gpus: PositiveInt = PrivateAttr(default=1)\n    _available_cuda_devices: List[int] = PrivateAttr(default_factory=list)\n    _can_check_cuda_devices: bool = PrivateAttr(default=False)\n\n    _logger: \"Logger\" = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Assign CUDA devices to the LLM 
based on the device placement information provided\n        in `_device_llm_placement_map`.\"\"\"\n\n        if self.disable_cuda_device_placement:\n            return\n\n        try:\n            import pynvml\n\n            pynvml.nvmlInit()\n            device_count = pynvml.nvmlDeviceGetCount()\n            self._available_cuda_devices = list(range(device_count))\n            self._can_check_cuda_devices = True\n        except ImportError as ie:\n            if self.cuda_devices == \"auto\":\n                raise ImportError(\n                    \"The 'pynvml' library is not installed. It is required to automatically\"\n                    \" assign CUDA devices to the `LLM`s. Please, install it and try again.\"\n                ) from ie\n\n            if self.cuda_devices:\n                self._logger.warning(  # type: ignore\n                    \"The 'pynvml' library is not installed. It is recommended to install it\"\n                    \" to check if the CUDA devices assigned to the LLM are available.\"\n                )\n\n        self._assign_cuda_devices()\n\n    def unload(self) -> None:\n        \"\"\"Unloads the LLM and removes the CUDA devices assigned to it from the device\n        placement information provided in `_device_llm_placement_map`.\"\"\"\n        if self.disable_cuda_device_placement:\n            return\n\n        with self._device_llm_placement_map() as device_map:\n            if self._llm_identifier in device_map:\n                self._logger.debug(  # type: ignore\n                    f\"Removing '{self._llm_identifier}' from the CUDA device map file\"\n                    f\" '{_CUDA_DEVICE_PLACEMENT_MIXIN_FILE}'.\"\n                )\n                del device_map[self._llm_identifier]\n\n    @contextmanager\n    def _device_llm_placement_map(self) -> Generator[Dict[str, List[int]], None, None]:\n        \"\"\"Reads the content of the device placement file of the node with a lock, yields\n        the content, and writes 
the content back to the file after the context manager is\n        closed. If the file doesn't exist, an empty dictionary will be yielded.\n\n        Yields:\n            The content of the device placement file.\n        \"\"\"\n        _CUDA_DEVICE_PLACEMENT_MIXIN_FILE.parent.mkdir(parents=True, exist_ok=True)\n        _CUDA_DEVICE_PLACEMENT_MIXIN_FILE.touch()\n        with portalocker.Lock(\n            _CUDA_DEVICE_PLACEMENT_MIXIN_FILE,\n            \"r+\",\n            flags=portalocker.LockFlags.EXCLUSIVE,\n        ) as f:\n            try:\n                content = json.load(f)\n            except json.JSONDecodeError:\n                content = {}\n            yield content\n            f.seek(0)\n            f.truncate()\n            f.write(json.dumps(content))\n\n    def _assign_cuda_devices(self) -> None:\n        \"\"\"Assigns CUDA devices to the LLM based on the device placement information provided\n        in `_device_llm_placement_map`. If the `cuda_devices` attribute is set to \"auto\", it\n        will be set to the first available CUDA device that is not going to be used by any\n        other LLM. If the `cuda_devices` attribute is set to a list of devices, it will be\n        checked if the devices are available to be used by the LLM. 
If not, a warning will be\n        logged.\"\"\"\n\n        # Take the lock and read the device placement information for each LLM.\n        with self._device_llm_placement_map() as device_map:\n            if self.cuda_devices == \"auto\":\n                self.cuda_devices = []\n                for _ in range(self._desired_num_gpus):\n                    if (device_id := self._get_cuda_device(device_map)) is not None:\n                        self.cuda_devices.append(device_id)\n                        device_map[self._llm_identifier] = self.cuda_devices  # type: ignore\n                if len(self.cuda_devices) != self._desired_num_gpus:\n                    self._logger.warning(  # type: ignore\n                        f\"Could not assign the desired number of GPUs {self._desired_num_gpus}\"\n                        f\" for LLM with identifier '{self._llm_identifier}'.\"\n                    )\n            else:\n                self._check_cuda_devices(device_map)\n\n            device_map[self._llm_identifier] = self.cuda_devices  # type: ignore\n\n        # `_device_llm_placement_map` was not provided and user didn't set the `cuda_devices`\n        # attribute. 
In this case, the `cuda_devices` attribute will be set to an empty list.\n        if self.cuda_devices == \"auto\":\n            self.cuda_devices = []\n\n        self._set_cuda_visible_devices()\n\n    def _check_cuda_devices(self, device_map: Dict[str, List[int]]) -> None:\n        \"\"\"Checks if the CUDA devices assigned to the LLM are also assigned to other LLMs.\n\n        Args:\n            device_map: a dictionary with the device placement information for each LLM.\n        \"\"\"\n        for device in self.cuda_devices:  # type: ignore\n            for llm, devices in device_map.items():\n                if device in devices:\n                    self._logger.warning(  # type: ignore\n                        f\"LLM with identifier '{llm}' is also going to use CUDA device \"\n                        f\"'{device}'. This may lead to performance issues or running out\"\n                        \" of memory depending on the device capabilities and the loaded\"\n                        \" models.\"\n                    )\n\n    def _get_cuda_device(self, device_map: Dict[str, List[int]]) -> Union[int, None]:\n        \"\"\"Returns the first available CUDA device to be used by the LLM that is not going\n        to be used by any other LLM.\n\n        Args:\n            device_map: a dictionary with the device placement information for each LLM.\n\n        Returns:\n            The first available CUDA device to be used by the LLM.\n\n        Raises:\n            RuntimeError: if there is no available CUDA device to be used by the LLM.\n        \"\"\"\n        for device in self._available_cuda_devices:\n            if all(device not in devices for devices in device_map.values()):\n                return device\n\n        return None\n\n    def _set_cuda_visible_devices(self) -> None:\n        \"\"\"Sets the `CUDA_VISIBLE_DEVICES` environment variable to the list of CUDA devices\n        to be used by the LLM.\n        \"\"\"\n        if not self.cuda_devices:\n  
          return\n\n        if self._can_check_cuda_devices and not all(\n            device in self._available_cuda_devices for device in self.cuda_devices\n        ):\n            raise RuntimeError(\n                f\"Invalid CUDA devices for LLM '{self._llm_identifier}': {self.cuda_devices}.\"\n                f\" The available devices are: {self._available_cuda_devices}. Please, review\"\n                \" the 'cuda_devices' attribute and try again.\"\n            )\n\n        cuda_devices = \",\".join([str(device) for device in self.cuda_devices])\n        self._logger.info(  # type: ignore\n            f\"\ud83c\udfae LLM '{self._llm_identifier}' is going to use the following CUDA devices:\"\n            f\" {self.cuda_devices}.\"\n        )\n        os.environ[\"CUDA_VISIBLE_DEVICES\"] = cuda_devices\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin.load","title":"load()","text":"

Assign CUDA devices to the LLM based on the device placement information provided in _device_llm_placement_map.

Source code in src/distilabel/models/mixins/cuda_device_placement.py
def load(self) -> None:\n    \"\"\"Assign CUDA devices to the LLM based on the device placement information provided\n    in `_device_llm_placement_map`.\"\"\"\n\n    if self.disable_cuda_device_placement:\n        return\n\n    try:\n        import pynvml\n\n        pynvml.nvmlInit()\n        device_count = pynvml.nvmlDeviceGetCount()\n        self._available_cuda_devices = list(range(device_count))\n        self._can_check_cuda_devices = True\n    except ImportError as ie:\n        if self.cuda_devices == \"auto\":\n            raise ImportError(\n                \"The 'pynvml' library is not installed. It is required to automatically\"\n                \" assign CUDA devices to the `LLM`s. Please, install it and try again.\"\n            ) from ie\n\n        if self.cuda_devices:\n            self._logger.warning(  # type: ignore\n                \"The 'pynvml' library is not installed. It is recommended to install it\"\n                \" to check if the CUDA devices assigned to the LLM are available.\"\n            )\n\n    self._assign_cuda_devices()\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin.unload","title":"unload()","text":"

Unloads the LLM and removes the CUDA devices assigned to it from the device placement information provided in _device_llm_placement_map.

Source code in src/distilabel/models/mixins/cuda_device_placement.py
def unload(self) -> None:\n    \"\"\"Unloads the LLM and removes the CUDA devices assigned to it from the device\n    placement information provided in `_device_llm_placement_map`.\"\"\"\n    if self.disable_cuda_device_placement:\n        return\n\n    with self._device_llm_placement_map() as device_map:\n        if self._llm_identifier in device_map:\n            self._logger.debug(  # type: ignore\n                f\"Removing '{self._llm_identifier}' from the CUDA device map file\"\n                f\" '{_CUDA_DEVICE_PLACEMENT_MIXIN_FILE}'.\"\n            )\n            del device_map[self._llm_identifier]\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._device_llm_placement_map","title":"_device_llm_placement_map()","text":"

Reads the content of the device placement file of the node with a lock, yields the content, and writes the content back to the file after the context manager is closed. If the file doesn't exist, an empty dictionary will be yielded.

Yields:

Type Description Dict[str, List[int]]

The content of the device placement file.

Source code in src/distilabel/models/mixins/cuda_device_placement.py
@contextmanager\ndef _device_llm_placement_map(self) -> Generator[Dict[str, List[int]], None, None]:\n    \"\"\"Reads the content of the device placement file of the node with a lock, yields\n    the content, and writes the content back to the file after the context manager is\n    closed. If the file doesn't exist, an empty dictionary will be yielded.\n\n    Yields:\n        The content of the device placement file.\n    \"\"\"\n    _CUDA_DEVICE_PLACEMENT_MIXIN_FILE.parent.mkdir(parents=True, exist_ok=True)\n    _CUDA_DEVICE_PLACEMENT_MIXIN_FILE.touch()\n    with portalocker.Lock(\n        _CUDA_DEVICE_PLACEMENT_MIXIN_FILE,\n        \"r+\",\n        flags=portalocker.LockFlags.EXCLUSIVE,\n    ) as f:\n        try:\n            content = json.load(f)\n        except json.JSONDecodeError:\n            content = {}\n        yield content\n        f.seek(0)\n        f.truncate()\n        f.write(json.dumps(content))\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._assign_cuda_devices","title":"_assign_cuda_devices()","text":"

Assigns CUDA devices to the LLM based on the device placement information provided in _device_llm_placement_map. If the cuda_devices attribute is set to \"auto\", it will be set to the first available CUDA device that is not going to be used by any other LLM. If the cuda_devices attribute is set to a list of devices, it will be checked if the devices are available to be used by the LLM. If not, a warning will be logged.

Source code in src/distilabel/models/mixins/cuda_device_placement.py
def _assign_cuda_devices(self) -> None:\n    \"\"\"Assigns CUDA devices to the LLM based on the device placement information provided\n    in `_device_llm_placement_map`. If the `cuda_devices` attribute is set to \"auto\", it\n    will be set to the first available CUDA device that is not going to be used by any\n    other LLM. If the `cuda_devices` attribute is set to a list of devices, it will be\n    checked if the devices are available to be used by the LLM. If not, a warning will be\n    logged.\"\"\"\n\n    # Take the lock and read the device placement information for each LLM.\n    with self._device_llm_placement_map() as device_map:\n        if self.cuda_devices == \"auto\":\n            self.cuda_devices = []\n            for _ in range(self._desired_num_gpus):\n                if (device_id := self._get_cuda_device(device_map)) is not None:\n                    self.cuda_devices.append(device_id)\n                    device_map[self._llm_identifier] = self.cuda_devices  # type: ignore\n            if len(self.cuda_devices) != self._desired_num_gpus:\n                self._logger.warning(  # type: ignore\n                    f\"Could not assign the desired number of GPUs {self._desired_num_gpus}\"\n                    f\" for LLM with identifier '{self._llm_identifier}'.\"\n                )\n        else:\n            self._check_cuda_devices(device_map)\n\n        device_map[self._llm_identifier] = self.cuda_devices  # type: ignore\n\n    # `_device_llm_placement_map` was not provided and user didn't set the `cuda_devices`\n    # attribute. In this case, the `cuda_devices` attribute will be set to an empty list.\n    if self.cuda_devices == \"auto\":\n        self.cuda_devices = []\n\n    self._set_cuda_visible_devices()\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._check_cuda_devices","title":"_check_cuda_devices(device_map)","text":"

Checks if the CUDA devices assigned to the LLM are also assigned to other LLMs.

Parameters:

Name Type Description Default device_map Dict[str, List[int]]

a dictionary with the device placement information for each LLM.

required Source code in src/distilabel/models/mixins/cuda_device_placement.py
def _check_cuda_devices(self, device_map: Dict[str, List[int]]) -> None:\n    \"\"\"Checks if the CUDA devices assigned to the LLM are also assigned to other LLMs.\n\n    Args:\n        device_map: a dictionary with the device placement information for each LLM.\n    \"\"\"\n    for device in self.cuda_devices:  # type: ignore\n        for llm, devices in device_map.items():\n            if device in devices:\n                self._logger.warning(  # type: ignore\n                    f\"LLM with identifier '{llm}' is also going to use CUDA device \"\n                    f\"'{device}'. This may lead to performance issues or running out\"\n                    \" of memory depending on the device capabilities and the loaded\"\n                    \" models.\"\n                )\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._get_cuda_device","title":"_get_cuda_device(device_map)","text":"

Returns the first available CUDA device to be used by the LLM that is not going to be used by any other LLM.

Parameters:

Name Type Description Default device_map Dict[str, List[int]]

a dictionary with the device placement information for each LLM.

required

Returns:

Type Description Union[int, None]

The first available CUDA device to be used by the LLM, or None if every available device is already assigned to another LLM.

Raises:

Type Description RuntimeError

if there is no available CUDA device to be used by the LLM.

Source code in src/distilabel/models/mixins/cuda_device_placement.py
def _get_cuda_device(self, device_map: Dict[str, List[int]]) -> Union[int, None]:\n    \"\"\"Returns the first available CUDA device to be used by the LLM that is not going\n    to be used by any other LLM.\n\n    Args:\n        device_map: a dictionary with the device placement information for each LLM.\n\n    Returns:\n        The first available CUDA device to be used by the LLM.\n\n    Raises:\n        RuntimeError: if there is no available CUDA device to be used by the LLM.\n    \"\"\"\n    for device in self._available_cuda_devices:\n        if all(device not in devices for devices in device_map.values()):\n            return device\n\n    return None\n
"},{"location":"api/models/llm/llm_gallery/#distilabel.models.llms.CudaDevicePlacementMixin._set_cuda_visible_devices","title":"_set_cuda_visible_devices()","text":"

Sets the CUDA_VISIBLE_DEVICES environment variable to the list of CUDA devices to be used by the LLM.

Source code in src/distilabel/models/mixins/cuda_device_placement.py
def _set_cuda_visible_devices(self) -> None:\n    \"\"\"Sets the `CUDA_VISIBLE_DEVICES` environment variable to the list of CUDA devices\n    to be used by the LLM.\n    \"\"\"\n    if not self.cuda_devices:\n        return\n\n    if self._can_check_cuda_devices and not all(\n        device in self._available_cuda_devices for device in self.cuda_devices\n    ):\n        raise RuntimeError(\n            f\"Invalid CUDA devices for LLM '{self._llm_identifier}': {self.cuda_devices}.\"\n            f\" The available devices are: {self._available_cuda_devices}. Please, review\"\n            \" the 'cuda_devices' attribute and try again.\"\n        )\n\n    cuda_devices = \",\".join([str(device) for device in self.cuda_devices])\n    self._logger.info(  # type: ignore\n        f\"\ud83c\udfae LLM '{self._llm_identifier}' is going to use the following CUDA devices:\"\n        f\" {self.cuda_devices}.\"\n    )\n    os.environ[\"CUDA_VISIBLE_DEVICES\"] = cuda_devices\n
"},{"location":"api/pipeline/","title":"Pipeline","text":"

This section contains the API reference for the distilabel pipelines. For an example on how to use the pipelines, see the Tutorial - Pipeline.

"},{"location":"api/pipeline/#distilabel.pipeline.base","title":"base","text":""},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline","title":"BasePipeline","text":"

Bases: ABC, RequirementsMixin, _Serializable

Base class for a distilabel pipeline.

Attributes:

Name Type Description name

The name of the pipeline.

description

A description of the pipeline.

dag

The DAG instance that represents the pipeline.

_cache_dir

The directory where the pipeline will be cached.

_logger

The logger instance that will be used by the pipeline.

_batch_manager Optional[_BatchManager]

The batch manager that will manage the batches received from the steps while running the pipeline. It will be created when the pipeline is run, from scratch or from cache. Defaults to None.

_write_buffer Optional[_WriteBuffer]

The buffer that will store the data of the leaf steps of the pipeline while running, so the Distiset can be created at the end. It will be created when the pipeline is run. Defaults to None.

_fs Optional[AbstractFileSystem]

The fsspec filesystem to be used to store the data of the _Batches passed between the steps. It will be set when the pipeline is run. Defaults to None.

_storage_base_path Optional[str]

The base path where the data of the _Batches passed between the steps will be stored. It will be set when the pipeline is run. Defaults to None.

_use_fs_to_pass_data bool

Whether to use the file system to pass the data of the _Batches between the steps. Even if this parameter is False, the Batches received by GlobalSteps will always use the file system to pass the data. Defaults to False.

_dry_run

A flag to indicate if the pipeline is running in dry run mode. Defaults to False.

output_queue

A queue to store the output of the steps while running the pipeline.

load_queue

A queue used by each Step to notify the main process it has finished loading or if the step has been unloaded.

Source code in src/distilabel/pipeline/base.py
class BasePipeline(ABC, RequirementsMixin, _Serializable):\n    \"\"\"Base class for a `distilabel` pipeline.\n\n    Attributes:\n        name: The name of the pipeline.\n        description: A description of the pipeline.\n        dag: The `DAG` instance that represents the pipeline.\n        _cache_dir: The directory where the pipeline will be cached.\n        _logger: The logger instance that will be used by the pipeline.\n        _batch_manager: The batch manager that will manage the batches received from the\n            steps while running the pipeline. It will be created when the pipeline is run,\n            from scratch or from cache. Defaults to `None`.\n        _write_buffer: The buffer that will store the data of the leaf steps of the pipeline\n            while running, so the `Distiset` can be created at the end. It will be created\n            when the pipeline is run. Defaults to `None`.\n        _fs: The `fsspec` filesystem to be used to store the data of the `_Batch`es passed\n            between the steps. It will be set when the pipeline is run. Defaults to `None`.\n        _storage_base_path: The base path where the data of the `_Batch`es passed between\n            the steps will be stored. It will be set then the pipeline is run. Defaults\n            to `None`.\n        _use_fs_to_pass_data: Whether to use the file system to pass the data of the\n            `_Batch`es between the steps. Even if this parameter is `False`, the `Batch`es\n            received by `GlobalStep`s will always use the file system to pass the data.\n            Defaults to `False`.\n        _dry_run: A flag to indicate if the pipeline is running in dry run mode. 
Defaults\n            to `False`.\n        output_queue: A queue to store the output of the steps while running the pipeline.\n        load_queue: A queue used by each `Step` to notify the main process it has finished\n            loading or it the step has been unloaded.\n    \"\"\"\n\n    _output_queue: \"Queue[Any]\"\n    _load_queue: \"Queue[Union[StepLoadStatus, None]]\"\n\n    def __init__(\n        self,\n        name: Optional[str] = None,\n        description: Optional[str] = None,\n        cache_dir: Optional[Union[str, \"PathLike\"]] = None,\n        enable_metadata: bool = False,\n        requirements: Optional[List[str]] = None,\n    ) -> None:\n        \"\"\"Initialize the `BasePipeline` instance.\n\n        Args:\n            name: The name of the pipeline. If not generated, a random one will be generated by default.\n            description: A description of the pipeline. Defaults to `None`.\n            cache_dir: A directory where the pipeline will be cached. Defaults to `None`.\n            enable_metadata: Whether to include the distilabel metadata column for the pipeline\n                in the final `Distiset`. It contains metadata used by distilabel, for example\n                the raw outputs of the `LLM` without processing would be here, inside `raw_output_...`\n                field. 
Defaults to `False`.\n            requirements: List of requirements that must be installed to run the pipeline.\n                Defaults to `None`, but can be helpful to inform in a pipeline to be shared\n                that this requirements must be installed.\n        \"\"\"\n        self.name = name or _PIPELINE_DEFAULT_NAME\n        self.description = description\n        self._enable_metadata = enable_metadata\n        self.dag = DAG()\n\n        if cache_dir:\n            self._cache_dir = Path(cache_dir)\n        elif env_cache_dir := envs.DISTILABEL_CACHE_DIR:\n            self._cache_dir = Path(env_cache_dir)\n        else:\n            self._cache_dir = constants.PIPELINES_CACHE_DIR\n\n        self._logger = logging.getLogger(\"distilabel.pipeline\")\n\n        self._batch_manager: Optional[\"_BatchManager\"] = None\n        self._write_buffer: Optional[\"_WriteBuffer\"] = None\n\n        self._steps_load_status: Dict[str, int] = {}\n        self._steps_load_status_lock = threading.Lock()\n\n        self._stop_called = False\n        self._stop_called_lock = threading.Lock()\n        self._stop_calls = 0\n\n        self._recover_offline_batch_generate_for_step: Union[\n            Tuple[str, List[List[Dict[str, Any]]]], None\n        ] = None\n\n        self._fs: Optional[fsspec.AbstractFileSystem] = None\n        self._storage_base_path: Optional[str] = None\n        self._use_fs_to_pass_data: bool = False\n        self._dry_run = False\n\n        self._current_stage = 0\n        self._stages_last_batch: List[List[str]] = []\n\n        self.requirements = requirements or []\n\n        self._exception: Union[Exception, None] = None\n\n        self._log_queue: Union[\"Queue[Any]\", None] = None\n\n    def __enter__(self) -> Self:\n        \"\"\"Set the global pipeline instance when entering a pipeline context.\"\"\"\n        _GlobalPipelineManager.set_pipeline(self)\n        return self\n\n    def __exit__(self, exc_type, exc_value, traceback) -> 
None:\n        \"\"\"Unset the global pipeline instance when exiting a pipeline context.\"\"\"\n        _GlobalPipelineManager.set_pipeline(None)\n        self._set_pipeline_name()\n\n    def _set_pipeline_name(self) -> None:\n        \"\"\"Creates a name for the pipeline if it's the default one (if hasn't been set).\"\"\"\n        if self.name == _PIPELINE_DEFAULT_NAME:\n            self.name = f\"pipeline_{'_'.join(self.dag)}\"\n\n    @property\n    def signature(self) -> str:\n        \"\"\"Makes a signature (hash) of a pipeline, using the step ids and the adjacency between them.\n\n        The main use is to find the pipeline in the cache folder.\n\n        Returns:\n            Signature of the pipeline.\n        \"\"\"\n\n        pipeline_dump = self.dump()[\"pipeline\"]\n        steps_names = list(self.dag)\n        connections_info = [\n            f\"{c['from']}-{'-'.join(c['to'])}\" for c in pipeline_dump[\"connections\"]\n        ]\n\n        routing_batch_functions_info = []\n        for function in pipeline_dump[\"routing_batch_functions\"]:\n            step = function[\"step\"]\n            routing_batch_function: \"RoutingBatchFunction\" = self.dag.get_step(step)[\n                constants.ROUTING_BATCH_FUNCTION_ATTR_NAME\n            ]\n            if type_info := routing_batch_function._get_type_info():\n                step += f\"-{type_info}\"\n            routing_batch_functions_info.append(step)\n\n        return hashlib.sha1(\n            \",\".join(\n                steps_names + connections_info + routing_batch_functions_info\n            ).encode()\n        ).hexdigest()\n\n    def run(\n        self,\n        parameters: Optional[Dict[str, Dict[str, Any]]] = None,\n        use_cache: bool = True,\n        storage_parameters: Optional[Dict[str, Any]] = None,\n        use_fs_to_pass_data: bool = False,\n        dataset: Optional[\"InputDataset\"] = None,\n        dataset_batch_size: int = 50,\n        logging_handlers: 
Optional[List[logging.Handler]] = None,\n    ) -> \"Distiset\":  # type: ignore\n        \"\"\"Run the pipeline. It will set the runtime parameters for the steps and validate\n        the pipeline.\n\n        This method should be extended by the specific pipeline implementation,\n        adding the logic to run the pipeline.\n\n        Args:\n            parameters: A dictionary with the step name as the key and a dictionary with\n                the runtime parameters for the step as the value. Defaults to `None`.\n            use_cache: Whether to use the cache from previous pipeline runs. Defaults to\n                `True`.\n            storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n                that will be used to store the data of the `_Batch`es passed between the\n                steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n                `GlobalStep` it will be always used). It must have at least the \"path\" key,\n                and it can contain additional keys depending on the protocol. By default,\n                it will use the local file system and a directory in the cache directory.\n                Defaults to `None`.\n            use_fs_to_pass_data: Whether to use the file system to pass the data of\n                the `_Batch`es between the steps. Even if this parameter is `False`, the\n                `Batch`es received by `GlobalStep`s will always use the file system to\n                pass the data. Defaults to `False`.\n            dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n                root step. Convenient method when you have already processed the dataset in\n                your script and just want to pass it already processed. Defaults to `None`.\n            dataset_batch_size: if `dataset` is given, this will be the size of the batches\n                yield by the `GeneratorStep` created using the `dataset`. 
Defaults to `50`.\n            logging_handlers: A list of logging handlers that will be used to log the\n                output of the pipeline. This argument can be useful so the logging messages\n                can be extracted and used in a different context. Defaults to `None`.\n\n        Returns:\n            The `Distiset` created by the pipeline.\n        \"\"\"\n\n        self._exception: Union[Exception, None] = None\n\n        # Set the runtime parameters that will be used during the pipeline execution.\n        # They are used to generate the signature of the pipeline that is used to hit the\n        # cache when the pipeline is run, so it's important to do it first.\n        self._set_runtime_parameters(parameters or {})\n\n        self._refresh_pipeline_from_cache()\n\n        if dataset is not None:\n            self._add_dataset_generator_step(dataset, dataset_batch_size)\n\n        setup_logging(\n            log_queue=self._log_queue,\n            filename=str(self._cache_location[\"log_file\"]),\n            logging_handlers=logging_handlers,\n        )\n\n        # Set the name of the pipeline if it's the default one. This should be called\n        # if the pipeline is defined within the context manager, and the run is called\n        # outside of it. 
Is here in the following case:\n        # with Pipeline() as pipeline:\n        #    pipeline.run()\n        self._set_pipeline_name()\n\n        # Validate the pipeline DAG to check that all the steps are chainable, there are\n        # no missing runtime parameters, batch sizes are correct, etc.\n        self.dag.validate()\n\n        self._set_pipeline_artifacts_path_in_steps()\n\n        # Set the initial load status for all the steps\n        self._init_steps_load_status()\n\n        # Load the stages status or initialize it\n        self._load_stages_status(use_cache)\n\n        # Load the `_BatchManager` from cache or create one from scratch\n        self._load_batch_manager(use_cache)\n\n        # Check pipeline requirements are installed\n        self._check_requirements()\n\n        # Setup the filesystem that will be used to pass the data of the `_Batch`es\n        self._setup_fsspec(storage_parameters)\n        self._use_fs_to_pass_data = use_fs_to_pass_data\n\n        if self._dry_run:\n            self._logger.info(\"\ud83c\udf35 Dry run mode\")\n\n        # If the batch manager is not able to generate batches, that means that the loaded\n        # `_BatchManager` from cache didn't have any remaining batches to process i.e.\n        # the previous pipeline execution was completed successfully.\n        if not self._batch_manager.can_generate():  # type: ignore\n            self._logger.info(\n                \"\ud83d\udcbe Loaded batch manager from cache doesn't contain any remaining data.\"\n                \" Returning `Distiset` from cache data...\"\n            )\n            distiset = create_distiset(\n                data_dir=self._cache_location[\"data\"],\n                pipeline_path=self._cache_location[\"pipeline\"],\n                log_filename_path=self._cache_location[\"log_file\"],\n                enable_metadata=self._enable_metadata,\n                dag=self.dag,\n            )\n            stop_logging()\n            return 
distiset\n\n        self._setup_write_buffer(use_cache)\n\n        self._print_load_stages_info()\n\n    def dry_run(\n        self,\n        parameters: Optional[Dict[str, Dict[str, Any]]] = None,\n        batch_size: int = 1,\n        dataset: Optional[\"InputDataset\"] = None,\n    ) -> \"Distiset\":\n        \"\"\"Do a dry run to test the pipeline runs as expected.\n\n        Running a `Pipeline` in dry run mode will set all the `batch_size` of generator steps\n        to the specified `batch_size`, and run just with a single batch, effectively\n        running the whole pipeline with a single example. The cache will be set to `False`.\n\n        Args:\n            parameters: A dictionary with the step name as the key and a dictionary with\n                the runtime parameters for the step as the value. Defaults to `None`.\n            batch_size: The batch size of the unique batch generated by the generators\n                steps of the pipeline. Defaults to `1`.\n            dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n                root step. Convenient method when you have already processed the dataset in\n                your script and just want to pass it already processed. 
Defaults to `None`.\n\n        Returns:\n            Will return the `Distiset` as the main run method would do.\n        \"\"\"\n        self._dry_run = True\n\n        for step_name in self.dag:\n            step = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n\n            if step.is_generator:\n                if not parameters:\n                    parameters = {}\n                parameters[step_name] = {\"batch_size\": batch_size}\n\n        distiset = self.run(parameters=parameters, use_cache=False, dataset=dataset)\n\n        self._dry_run = False\n        return distiset\n\n    def _add_dataset_generator_step(\n        self, dataset: \"InputDataset\", batch_size: int = 50\n    ) -> None:\n        \"\"\"Create a root step to work as the `GeneratorStep` for the pipeline using a\n        dataset.\n\n        Args:\n            dataset: A dataset that will be used to create a `GeneratorStep` and\n                placed in the DAG as the root step.\n            batch_size: The size of the batches generated by the `GeneratorStep`.\n\n        Raises:\n            ValueError: If there's already a `GeneratorStep` in the pipeline.\n        \"\"\"\n        for step_name in self.dag:\n            step = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n            if isinstance(step_name, GeneratorStep):\n                raise DistilabelUserError(\n                    \"There is already a `GeneratorStep` in the pipeline, you can either\"\n                    \" pass a `dataset` to the run method, or create a `GeneratorStep` explictly.\"\n                    f\" `GeneratorStep`: {step}\",\n                    page=\"sections/how_to_guides/basic/step/#types-of-steps\",\n                )\n        loader = make_generator_step(\n            dataset=dataset,\n            pipeline=self,\n            batch_size=batch_size,\n        )\n        self.dag.add_root_step(loader)\n\n    def get_runtime_parameters_info(self) -> \"PipelineRuntimeParametersInfo\":\n    
    \"\"\"Get the runtime parameters for the steps in the pipeline.\n\n        Returns:\n            A dictionary with the step name as the key and a list of dictionaries with\n            the parameter name and the parameter info as the value.\n        \"\"\"\n        runtime_parameters = {}\n        for step_name in self.dag:\n            step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n            runtime_parameters[step_name] = step.get_runtime_parameters_info()\n        return runtime_parameters\n\n    def _init_steps_load_status(self) -> None:\n        \"\"\"Initialize the `_steps_load_status` dictionary assigning 0 to every step of\n        the pipeline.\"\"\"\n        for step_name in self.dag:\n            self._steps_load_status[step_name] = _STEP_NOT_LOADED_CODE\n\n    def _set_pipeline_artifacts_path_in_steps(self) -> None:\n        \"\"\"Sets the attribute `_pipeline_artifacts_path` in all the `Step`s of the pipeline,\n        so steps can use it to get the path to save the generated artifacts.\"\"\"\n        artifacts_path = self._cache_location[\"data\"] / constants.STEPS_ARTIFACTS_PATH\n        for name in self.dag:\n            step: \"_Step\" = self.dag.get_step(name)[constants.STEP_ATTR_NAME]\n            step.set_pipeline_artifacts_path(path=artifacts_path)\n\n    def _check_requirements(self) -> None:\n        \"\"\"Checks if the dependencies required to run the pipeline are installed.\n\n        Raises:\n            ModuleNotFoundError: if one or more requirements are missing.\n        \"\"\"\n        if to_install := self.requirements_to_install():\n            # Print the list of requirements like they would appear in a requirements.txt\n            to_install_list = \"\\n\" + \"\\n\".join(to_install)\n            msg = f\"Please install the following requirements to run the pipeline: {to_install_list}\"\n            self._logger.error(msg)\n            raise ModuleNotFoundError(msg)\n\n    def _setup_fsspec(\n       
 self, storage_parameters: Optional[Dict[str, Any]] = None\n    ) -> None:\n        \"\"\"Setups the `fsspec` filesystem to be used to store the data of the `_Batch`es\n        passed between the steps.\n\n        Args:\n            storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n                that will be used to store the data of the `_Batch`es passed between the\n                steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n                `GlobalStep` it will be always used). It must have at least the \"path\" key,\n                and it can contain additional keys depending on the protocol. By default,\n                it will use the local file system and a directory in the cache directory.\n                Defaults to `None`.\n        \"\"\"\n        if not storage_parameters:\n            self._fs = fsspec.filesystem(\"file\")\n            self._storage_base_path = (\n                f\"file://{self._cache_location['batch_input_data']}\"\n            )\n            return\n\n        if \"path\" not in storage_parameters:\n            raise DistilabelUserError(\n                \"The 'path' key must be present in the `storage_parameters` dictionary\"\n                \" if it's not `None`.\",\n                page=\"sections/how_to_guides/advanced/fs_to_pass_data/\",\n            )\n\n        path = storage_parameters.pop(\"path\")\n        protocol = UPath(path).protocol\n\n        self._fs = fsspec.filesystem(protocol, **storage_parameters)\n        self._storage_base_path = path\n\n    def _add_step(self, step: \"_Step\") -> None:\n        \"\"\"Add a step to the pipeline.\n\n        Args:\n            step: The step to be added to the pipeline.\n        \"\"\"\n        self.dag.add_step(step)\n\n    def _add_edge(self, from_step: str, to_step: str) -> None:\n        \"\"\"Add an edge between two steps in the pipeline.\n\n        Args:\n            from_step: The name of the step that will 
generate the input for `to_step`.\n            to_step: The name of the step that will receive the input from `from_step`.\n        \"\"\"\n        self.dag.add_edge(from_step, to_step)\n\n        # Check if `from_step` has a `routing_batch_function`. If it does, then mark\n        # `to_step` as a step that will receive a routed batch.\n        node = self.dag.get_step(from_step)  # type: ignore\n        routing_batch_function = node.get(\n            constants.ROUTING_BATCH_FUNCTION_ATTR_NAME, None\n        )\n        self.dag.set_step_attr(\n            name=to_step,\n            attr=constants.RECEIVES_ROUTED_BATCHES_ATTR_NAME,\n            value=routing_batch_function is not None,\n        )\n\n    def _is_convergence_step(self, step_name: str) -> None:\n        \"\"\"Checks if a step is a convergence step.\n\n        Args:\n            step_name: The name of the step.\n        \"\"\"\n        return self.dag.get_step(step_name).get(constants.CONVERGENCE_STEP_ATTR_NAME)\n\n    def _add_routing_batch_function(\n        self, step_name: str, routing_batch_function: \"RoutingBatchFunction\"\n    ) -> None:\n        \"\"\"Add a routing batch function to a step.\n\n        Args:\n            step_name: The name of the step that will receive the routed batch.\n            routing_batch_function: The function that will route the batch to the step.\n        \"\"\"\n        self.dag.set_step_attr(\n            name=step_name,\n            attr=constants.ROUTING_BATCH_FUNCTION_ATTR_NAME,\n            value=routing_batch_function,\n        )\n\n    def _set_runtime_parameters(self, parameters: Dict[str, Dict[str, Any]]) -> None:\n        \"\"\"Set the runtime parameters for the steps in the pipeline.\n\n        Args:\n            parameters: A dictionary with the step name as the key and a dictionary with\n            the parameter name as the key and the parameter value as the value.\n        \"\"\"\n        step_names = set(self.dag.G)\n        for step_name, 
step_parameters in parameters.items():\n            if step_name not in step_names:\n                self._logger.warning(\n                    f\"\u2753 Step '{step_name}' provided in `Pipeline.run(parameters={{...}})` not found in the pipeline.\"\n                    f\" Available steps are: {step_names}.\"\n                )\n            else:\n                step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n                step.set_runtime_parameters(step_parameters)\n\n    def _model_dump(self, obj: Any, **kwargs: Any) -> Dict[str, Any]:\n        \"\"\"Dumps the DAG content to a dict.\n\n        Args:\n            obj (Any): Unused, just kept to match the signature of the parent method.\n            kwargs (Any): Unused, just kept to match the signature of the parent method.\n\n        Returns:\n            Dict[str, Any]: Internal representation of the DAG from networkx in a serializable format.\n        \"\"\"\n        return self.dag.dump()\n\n    def draw(\n        self,\n        path: Optional[Union[str, Path]] = \"pipeline.png\",\n        top_to_bottom: bool = False,\n        show_edge_labels: bool = True,\n    ) -> str:\n        \"\"\"\n        Draws the pipeline.\n\n        Parameters:\n            path: The path to save the image to.\n            top_to_bottom: Whether to draw the DAG top to bottom. Defaults to `False`.\n            show_edge_labels: Whether to show the edge labels. 
Defaults to `True`.\n\n        Returns:\n            The path to the saved image.\n        \"\"\"\n        png = self.dag.draw(\n            top_to_bottom=top_to_bottom, show_edge_labels=show_edge_labels\n        )\n        with open(path, \"wb\") as f:\n            f.write(png)\n        return path\n\n    def __repr__(self) -> str:\n        \"\"\"\n        If running in a Jupyter notebook, display an image representing this `Pipeline`.\n        \"\"\"\n        if in_notebook():\n            try:\n                from IPython.display import Image, display\n\n                image_data = self.dag.draw()\n\n                display(Image(image_data))\n            except Exception:\n                pass\n        return super().__repr__()\n\n    def dump(self, **kwargs: Any) -> Dict[str, Any]:\n        return {\n            \"distilabel\": {\"version\": __version__},\n            \"pipeline\": {\n                \"name\": self.name,\n                \"description\": self.description,\n                **super().dump(),\n            },\n            \"requirements\": self.requirements,\n        }\n\n    @classmethod\n    def from_dict(cls, data: Dict[str, Any]) -> Self:\n        \"\"\"Create a Pipeline from a dict containing the serialized data.\n\n        Note:\n            It's intended for internal use.\n\n        Args:\n            data (Dict[str, Any]): Dictionary containing the serialized data from a Pipeline.\n\n        Returns:\n            BasePipeline: Pipeline recreated from the dictionary info.\n        \"\"\"\n        name = data[\"pipeline\"][\"name\"]\n        description = data[\"pipeline\"].get(\"description\")\n        requirements = data.get(\"requirements\", [])\n        with cls(name=name, description=description, requirements=requirements) as pipe:\n            pipe.dag = DAG.from_dict(data[\"pipeline\"])\n        return pipe\n\n    @property\n    def _cache_location(self) -> \"_CacheLocation\":\n        \"\"\"Dictionary containing the object that 
will stored and the location,\n        whether it is a filename or a folder.\n\n        Returns:\n            Path: Filenames where the pipeline content will be serialized.\n        \"\"\"\n        folder = self._cache_dir / self.name / self.signature\n        pipeline_execution_dir = folder / \"executions\" / self.aggregated_steps_signature\n        return {\n            \"pipeline\": pipeline_execution_dir / \"pipeline.yaml\",\n            \"batch_manager\": pipeline_execution_dir / \"batch_manager.json\",\n            \"steps_data\": self._cache_dir / self.name / \"steps_data\",\n            \"data\": pipeline_execution_dir / \"data\",\n            \"batch_input_data\": pipeline_execution_dir / \"batch_input_data\",\n            \"log_file\": pipeline_execution_dir / \"pipeline.log\",\n            \"stages_file\": pipeline_execution_dir / \"stages.json\",\n        }\n\n    @property\n    def aggregated_steps_signature(self) -> str:\n        \"\"\"Creates an aggregated signature using `Step`s signature that will be used for\n        the `_BatchManager`.\n\n        Returns:\n            The aggregated signature.\n        \"\"\"\n        signatures = []\n        for step_name in self.dag:\n            step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n            signatures.append(step.signature)\n        return hashlib.sha1(\"\".join(signatures).encode()).hexdigest()\n\n    def _cache(self) -> None:\n        \"\"\"Saves the `BasePipeline` using the `_cache_filename`.\"\"\"\n        if self._dry_run:\n            return\n\n        self.save(\n            path=self._cache_location[\"pipeline\"],\n            format=self._cache_location[\"pipeline\"].suffix.replace(\".\", \"\"),  # type: ignore\n        )\n\n        if self._batch_manager is not None:\n            self._batch_manager.cache(\n                path=self._cache_location[\"batch_manager\"],\n                steps_data_path=self._cache_location[\"steps_data\"],\n            )\n\n     
   self._save_stages_status()\n\n        self._logger.debug(\"Pipeline and batch manager saved to cache.\")\n\n    def _save_stages_status(self) -> None:\n        \"\"\"Saves the stages status to cache.\"\"\"\n        self.save(\n            path=self._cache_location[\"stages_file\"],\n            format=\"json\",\n            dump={\n                \"current_stage\": self._current_stage,\n                \"stages_last_batch\": self._stages_last_batch,\n            },\n        )\n\n    def _load_stages_status(self, use_cache: bool = True) -> None:\n        \"\"\"Try to load the stages status from cache, or initialize it if cache file doesn't\n        exist or cache is not going to be used.\"\"\"\n        if use_cache and self._cache_location[\"stages_file\"].exists():\n            stages_status = read_json(self._cache_location[\"stages_file\"])\n            self._current_stage = stages_status[\"current_stage\"]\n            self._stages_last_batch = stages_status[\"stages_last_batch\"]\n        else:\n            self._current_stage = 0\n            self._stages_last_batch = [\n                [] for _ in range(len(self.dag.get_steps_load_stages()[0]))\n            ]\n\n    def _refresh_pipeline_from_cache(self) -> None:\n        \"\"\"Refresh the DAG (and its steps) from the cache file. 
This is useful as some\n        `Step`s can update and change their state during the pipeline execution, and this\n        method will make sure the pipeline is up-to-date with the latest changes when\n        the pipeline is reloaded from cache.\n        \"\"\"\n\n        def recursively_handle_secrets_and_excluded_attributes(\n            cached_model: \"BaseModel\", model: \"BaseModel\"\n        ) -> None:\n            \"\"\"Recursively handle the secrets and excluded attributes of a `BaseModel`,\n            setting the values of the cached model to the values of the model.\n\n            Args:\n                cached_model: The cached model that will be updated as it doesn't contain\n                    the secrets and excluded attributes (not serialized).\n                model: The model that contains the secrets and excluded attributes because\n                    it comes from pipeline instantiation.\n            \"\"\"\n            for field_name, field_info in cached_model.model_fields.items():\n                if field_name in (\"pipeline\"):\n                    continue\n\n                inner_type = extract_annotation_inner_type(field_info.annotation)\n                if is_type_pydantic_secret_field(inner_type) or field_info.exclude:\n                    setattr(cached_model, field_name, getattr(model, field_name))\n                elif isclass(inner_type) and issubclass(inner_type, BaseModel):\n                    recursively_handle_secrets_and_excluded_attributes(\n                        getattr(cached_model, field_name),\n                        getattr(model, field_name),\n                    )\n\n        if self._cache_location[\"pipeline\"].exists():\n            cached_dag = self.from_yaml(self._cache_location[\"pipeline\"]).dag\n\n            for step_name in cached_dag:\n                step_cached: \"_Step\" = cached_dag.get_step(step_name)[\n                    constants.STEP_ATTR_NAME\n                ]\n                step: \"_Step\" 
= self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n                recursively_handle_secrets_and_excluded_attributes(step_cached, step)\n\n            self.dag = cached_dag\n\n    def _load_batch_manager(self, use_cache: bool = True) -> None:\n        \"\"\"Will try to load the `_BatchManager` from the cache dir if found. Otherwise,\n        it will create one from scratch.\n\n        If the `_BatchManager` is loaded from cache, we check for invalid steps (those that\n        may have a different signature than the original in the pipeline folder), and\n        restart them, as well as their successors.\n\n        Args:\n            use_cache: whether the cache should be used or not.\n        \"\"\"\n        batch_manager_cache_loc = self._cache_location[\"batch_manager\"]\n\n        # This first condition handles the case in which the pipeline is exactly the same\n        # no steps have been added, removed or changed.\n        if use_cache and batch_manager_cache_loc.exists():\n            self._logger.info(\n                f\"\ud83d\udcbe Loading `_BatchManager` from cache: '{batch_manager_cache_loc}'\"\n            )\n            self._batch_manager = _BatchManager.load_from_cache(\n                dag=self.dag,\n                batch_manager_path=batch_manager_cache_loc,\n                steps_data_path=self._cache_location[\"steps_data\"],\n            )\n            self._invalidate_steps_cache_if_required()\n        # In this other case, the pipeline has been changed. 
We need to create a new batch\n        # manager and if `use_cache==True` then check which outputs have we computed and\n        # cached for steps that haven't changed but that were executed in another pipeline,\n        # and therefore we can reuse\n        else:\n            self._batch_manager = _BatchManager.from_dag(\n                dag=self.dag,\n                use_cache=use_cache,\n                steps_data_path=self._cache_location[\"steps_data\"],\n            )\n\n    def _invalidate_steps_cache_if_required(self) -> None:\n        \"\"\"Iterates over the steps of the pipeline and invalidates their cache if required.\"\"\"\n        for step_name in self.dag:\n            # `GeneratorStep`s doesn't receive input data so no need to check their\n            # `_BatchManagerStep`\n            if self.dag.get_step(step_name)[constants.STEP_ATTR_NAME].is_generator:\n                continue\n\n            step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n            if not step.use_cache:\n                self._batch_manager.invalidate_cache_for(\n                    step_name=step.name,\n                    dag=self.dag,\n                    steps_data_path=self._cache_location[\"steps_data\"],\n                )  # type: ignore\n                self._logger.info(\n                    f\"\u267b\ufe0f Step '{step.name}' won't use cache (`use_cache=False`). 
The cache of this step and their successors won't be\"\n                    \" reused and the results will have to be recomputed.\"\n                )\n                break\n\n    def _setup_write_buffer(self, use_cache: bool = True) -> None:\n        \"\"\"Setups the `_WriteBuffer` that will store the data of the leaf steps of the\n        pipeline while running, so the `Distiset` can be created at the end.\n        \"\"\"\n        if not use_cache and self._cache_location[\"data\"].exists():\n            shutil.rmtree(self._cache_location[\"data\"])\n        buffer_data_path = self._cache_location[\"data\"] / constants.STEPS_OUTPUTS_PATH\n        self._logger.info(f\"\ud83d\udcdd Pipeline data will be written to '{buffer_data_path}'\")\n        self._write_buffer = _WriteBuffer(\n            buffer_data_path,\n            self.dag.leaf_steps,\n            steps_cached={\n                step_name: self.dag.get_step(step_name)[\n                    constants.STEP_ATTR_NAME\n                ].use_cache\n                for step_name in self.dag\n            },\n        )\n\n    def _print_load_stages_info(self) -> None:\n        \"\"\"Prints the information about the load stages.\"\"\"\n        stages, _ = self.dag.get_steps_load_stages()\n        msg = \"\"\n        for stage, steps in enumerate(stages):\n            steps_to_be_loaded = self._steps_to_be_loaded_in_stage(stage)\n            msg += f\"\\n * Stage {stage}:\"\n            for step in steps:\n                msg += f\"\\n   - '{step}'\"\n                if step not in steps_to_be_loaded:\n                    msg += \" (results cached, won't be loaded and executed)\"\n        self._logger.info(\n            f\"\u231b The steps of the pipeline will be loaded in stages:{msg}\"\n        )\n\n    def _run_output_queue_loop_in_thread(self) -> threading.Thread:\n        \"\"\"Runs the output queue loop in a separate thread to receive the output batches\n        from the steps. 
This is done to avoid the signal handler to block the loop, which\n        would prevent the pipeline from stopping correctly.\"\"\"\n        thread = threading.Thread(target=self._output_queue_loop)\n        thread.start()\n        return thread\n\n    def _output_queue_loop(self) -> None:\n        \"\"\"Loop to receive the output batches from the steps and manage the flow of the\n        batches through the pipeline.\"\"\"\n        if not self._initialize_pipeline_execution():\n            return\n\n        while self._should_continue_processing():  # type: ignore\n            self._logger.debug(\"Waiting for output batch from step...\")\n            if (batch := self._output_queue.get()) is None:\n                self._logger.debug(\"Received `None` from output queue. Breaking loop.\")\n                break\n\n            self._logger.debug(\n                f\"Received batch with seq_no {batch.seq_no} from step '{batch.step_name}'\"\n                f\" from output queue: {batch}\"\n            )\n\n            self._process_batch(batch)\n\n            # If `_stop_called` was set to `True` while waiting for the output queue, then\n            # we need to handle the stop of the pipeline and break the loop to avoid\n            # propagating the batches through the pipeline and making the stop process\n            # slower.\n            with self._stop_called_lock:\n                if self._stop_called:\n                    self._handle_batch_on_stop(batch)\n                    break\n\n            # If there is another load stage and all the `last_batch`es from the stage\n            # have been received, then load the next stage.\n            if self._should_load_next_stage():\n                if not self._update_stage():\n                    break\n\n            self._manage_batch_flow(batch)\n\n        self._finalize_pipeline_execution()\n\n    def _initialize_pipeline_execution(self) -> bool:\n        \"\"\"Load the steps of the required stage to 
initialize the pipeline execution,\n        and requests the initial batches to trigger the batch flowing in the pipeline.\n\n        Returns:\n            `True` if initialization went OK, `False` otherwise.\n        \"\"\"\n        # Wait for all the steps to be loaded correctly\n        if not self._run_stage_steps_and_wait(stage=self._current_stage):\n            self._set_steps_not_loaded_exception()\n            return False\n\n        # Send the \"first\" batches to the steps so the batches starts flowing through\n        # the input queues and output queue\n        self._request_initial_batches()\n\n        return True\n\n    def _should_continue_processing(self) -> bool:\n        \"\"\"Condition for the consume batches from the `output_queue` loop.\n\n        Returns:\n            `True` if should continue consuming batches, `False` otherwise and the pipeline\n            should stop.\n        \"\"\"\n        with self._stop_called_lock:\n            return self._batch_manager.can_generate() and not self._stop_called  # type: ignore\n\n    def _process_batch(\n        self, batch: \"_Batch\", send_last_batch_flag: bool = True\n    ) -> None:\n        \"\"\"Process a batch consumed from the `output_queue`.\n\n        Args:\n            batch: the batch to be processed.\n        \"\"\"\n        if batch.data_path:\n            self._logger.debug(\n                f\"Reading {batch.seq_no} batch data from '{batch.step_name}': '{batch.data_path}'\"\n            )\n            batch.read_batch_data_from_fs()\n\n        if batch.step_name in self.dag.leaf_steps:\n            self._write_buffer.add_batch(batch)  # type: ignore\n\n        if batch.last_batch:\n            self._register_stages_last_batch(batch)\n\n            # Make sure to send the `LAST_BATCH_SENT_FLAG` to the predecessors of the step\n            # if the batch is the last one, so they stop their processing loop even if they\n            # haven't received the last batch because of the routing 
function.\n            if send_last_batch_flag:\n                for step_name in self.dag.get_step_predecessors(batch.step_name):\n                    if self._is_step_running(step_name):\n                        self._send_last_batch_flag_to_step(step_name)\n\n    def _set_step_for_recovering_offline_batch_generation(\n        self, step: \"_Step\", data: List[List[Dict[str, Any]]]\n    ) -> None:\n        \"\"\"Sets the required information to recover a pipeline execution from a `_Step`\n        that used an `LLM` with offline batch generation.\n\n        Args:\n            step: The `_Step` that used an `LLM` with offline batch generation.\n            data: The data that was used to generate the batches for the step.\n        \"\"\"\n        # Replace step so the attribute `jobs_ids` of the `LLM` is not lost, as it was\n        # updated in the child process but not in the main process.\n        step_name: str = step.name  # type: ignore\n        self.dag.set_step_attr(\n            name=step_name, attr=constants.STEP_ATTR_NAME, value=step\n        )\n        self._recover_offline_batch_generate_for_step = (step_name, data)\n\n    def _add_batch_for_recovering_offline_batch_generation(self) -> None:\n        \"\"\"Adds a dummy `_Batch` to the specified step name (it's a `Task` that used an\n        `LLM` with offline batch generation) to recover the pipeline state for offline\n        batch generation in next pipeline executions.\"\"\"\n        assert self._batch_manager, \"Batch manager is not set\"\n\n        if self._recover_offline_batch_generate_for_step is None:\n            return\n\n        step_name, data = self._recover_offline_batch_generate_for_step\n        self._logger.debug(\n            f\"Adding batch to '{step_name}' step to recover pipeline execution for offline\"\n            \" batch generation...\"\n        )\n        self._batch_manager.add_batch_to_recover_offline_batch_generation(\n            to_step=step_name,\n            
data=data,\n        )\n\n    def _register_stages_last_batch(self, batch: \"_Batch\") -> None:\n        \"\"\"Registers the last batch received from a step in the `_stages_last_batch`\n        dictionary.\n\n        Args:\n            batch: The last batch received from a step.\n        \"\"\"\n        _, stages_last_steps = self.dag.get_steps_load_stages()\n        stage_last_steps = stages_last_steps[self._current_stage]\n        if batch.step_name in stage_last_steps:\n            self._stages_last_batch[self._current_stage].append(batch.step_name)\n            self._stages_last_batch[self._current_stage].sort()\n\n    def _update_stage(self) -> bool:\n        \"\"\"Checks if the steps of next stage should be loaded and updates `_current_stage`\n        attribute.\n\n        Returns:\n            `True` if updating the stage went OK, `False` otherwise.\n        \"\"\"\n        self._current_stage += 1\n        if not self._run_stage_steps_and_wait(stage=self._current_stage):\n            self._set_steps_not_loaded_exception()\n            return False\n\n        return True\n\n    def _should_load_next_stage(self) -> bool:\n        \"\"\"Returns if the next stage should be loaded.\n\n        Returns:\n            `True` if the next stage should be loaded, `False` otherwise.\n        \"\"\"\n        _, stage_last_steps = self.dag.get_steps_load_stages()\n        there_is_next_stage = self._current_stage + 1 < len(stage_last_steps)\n        stage_last_batches_received = (\n            self._stages_last_batch[self._current_stage]\n            == stage_last_steps[self._current_stage]\n        )\n        return there_is_next_stage and stage_last_batches_received\n\n    def _finalize_pipeline_execution(self) -> None:\n        \"\"\"Finalizes the pipeline execution handling the prematurely stop of the pipeline\n        if required, caching the data and ensuring that all the steps finish its execution.\"\"\"\n\n        # Send `None` to steps `input_queue`s just in case 
some step is still waiting\n        self._notify_steps_to_stop()\n\n        for step_name in self.dag:\n            while self._is_step_running(step_name):\n                self._logger.debug(f\"Waiting for step '{step_name}' to finish...\")\n                time.sleep(0.5)\n\n        with self._stop_called_lock:\n            if self._stop_called:\n                self._handle_stop()\n\n            # Reset flag state\n            self._stop_called = False\n\n        self._add_batch_for_recovering_offline_batch_generation()\n\n        self._cache()\n\n    def _run_load_queue_loop_in_thread(self) -> threading.Thread:\n        \"\"\"Runs a background thread that reads from the `load_queue` to update the status\n        of the number of replicas loaded for each step.\n\n        Returns:\n            The thread that was started.\n        \"\"\"\n        thread = threading.Thread(target=self._run_load_queue_loop)\n        thread.start()\n        return thread\n\n    def _run_load_queue_loop(self) -> None:\n        \"\"\"Runs a loop that reads from the `load_queue` to update the status of the number\n        of replicas loaded for each step.\"\"\"\n\n        while True:\n            if (load_info := self._load_queue.get()) is None:\n                self._logger.debug(\"Received `None` from load queue. 
Breaking loop.\")\n                break\n\n            with self._steps_load_status_lock:\n                step_name, status = load_info[\"name\"], load_info[\"status\"]\n                if status == \"loaded\":\n                    if self._steps_load_status[step_name] == _STEP_NOT_LOADED_CODE:\n                        self._steps_load_status[step_name] = 1\n                    else:\n                        self._steps_load_status[step_name] += 1\n                elif status == \"unloaded\":\n                    self._steps_load_status[step_name] -= 1\n                else:\n                    # load failed\n                    self._steps_load_status[step_name] = _STEP_LOAD_FAILED_CODE\n\n                self._logger.debug(\n                    f\"Step '{step_name}' loaded replicas: {self._steps_load_status[step_name]}\"\n                )\n\n    def _is_step_running(self, step_name: str) -> bool:\n        \"\"\"Checks if the step is running (at least one replica is running).\n\n        Args:\n            step_name: The step to be check if running.\n\n        Returns:\n            `True` if the step is running, `False` otherwise.\n        \"\"\"\n        with self._steps_load_status_lock:\n            return self._steps_load_status[step_name] >= 1\n\n    def _steps_to_be_loaded_in_stage(self, stage: int) -> List[str]:\n        \"\"\"Returns the list of steps of the provided stage that should be loaded taking\n        into account if they have finished.\n\n        Args:\n            stage: the stage number\n\n        Returns:\n            A list containing the name of the steps that should be loaded in this stage.\n        \"\"\"\n        assert self._batch_manager, \"Batch manager is not set\"\n\n        steps_stages, _ = self.dag.get_steps_load_stages()\n\n        return [\n            step\n            for step in steps_stages[stage]\n            if not self._batch_manager.step_has_finished(step)\n        ]\n\n    def _run_stage_steps_and_wait(self, stage: 
int) -> bool:\n        \"\"\"Runs the steps of the specified stage and waits for them to be ready.\n\n        Args:\n            stage: the stage from which the steps have to be loaded.\n\n        Returns:\n            `True` if all the steps have been loaded correctly, `False` otherwise.\n        \"\"\"\n        assert self._batch_manager, \"Batch manager is not set\"\n\n        steps = self._steps_to_be_loaded_in_stage(stage)\n        self._logger.debug(f\"Steps to be loaded in stage {stage}: {steps}\")\n\n        # Run the steps of the stage\n        self._run_steps(steps=steps)\n\n        # Wait for them to be ready\n        self._logger.info(f\"\u23f3 Waiting for all the steps of stage {stage} to load...\")\n        previous_message = None\n        with self._stop_called_lock:\n            while not self._stop_called:\n                with self._steps_load_status_lock:\n                    filtered_steps_load_status = {\n                        step_name: replicas\n                        for step_name, replicas in self._steps_load_status.items()\n                        if step_name in steps\n                    }\n                    self._logger.debug(\n                        f\"Steps from stage {stage} loaded: {filtered_steps_load_status}\"\n                    )\n\n                    if any(\n                        replicas_loaded == _STEP_LOAD_FAILED_CODE\n                        for replicas_loaded in filtered_steps_load_status.values()\n                    ):\n                        self._logger.error(\n                            f\"\u274c Failed to load all the steps of stage {stage}\"\n                        )\n                        return False\n\n                    num_steps_loaded = 0\n                    replicas_message = \"\"\n                    for step_name, replicas in filtered_steps_load_status.items():\n                        step_replica_count = self.dag.get_step_replica_count(step_name)\n                        if replicas == 
step_replica_count:\n                            num_steps_loaded += 1\n                        replicas_message += f\"\\n * '{step_name}' replicas: {max(0, replicas)}/{step_replica_count}\"\n\n                    message = f\"\u23f3 Steps from stage {stage} loaded: {num_steps_loaded}/{len(filtered_steps_load_status)}{replicas_message}\"\n                    if num_steps_loaded > 0 and message != previous_message:\n                        self._logger.info(message)\n                        previous_message = message\n\n                    if num_steps_loaded == len(filtered_steps_load_status):\n                        self._logger.info(\n                            f\"\u2705 All the steps from stage {stage} have been loaded!\"\n                        )\n                        return True\n\n                time.sleep(2.5)\n\n        return not self._stop_called\n\n    def _handle_stop(self) -> None:\n        \"\"\"Handles the stop of the pipeline execution, which will stop the steps from\n        processing more batches and wait for the output queue to be empty, to not lose\n        any data that was already processed by the steps before the stop was called.\"\"\"\n        self._logger.debug(\"Handling stop of the pipeline execution...\")\n\n        self._add_batches_back_to_batch_manager()\n\n        # Wait for the input queue to be empty, which means that all the steps finished\n        # processing the batches that were sent before the stop flag.\n        for step_name in self.dag:\n            self._wait_step_input_queue_empty(step_name)\n\n        self._consume_output_queue()\n\n        if self._should_load_next_stage():\n            self._current_stage += 1\n\n    def _wait_step_input_queue_empty(self, step_name: str) -> Union[\"Queue[Any]\", None]:\n        \"\"\"Waits for the input queue of a step to be empty.\n\n        Args:\n            step_name: The name of the step.\n\n        Returns:\n            The input queue of the step if it's not loaded or 
finished, `None` otherwise.\n        \"\"\"\n        if self._check_step_not_loaded_or_finished(step_name):\n            return None\n\n        if input_queue := self.dag.get_step(step_name).get(\n            constants.INPUT_QUEUE_ATTR_NAME\n        ):\n            while input_queue.qsize() != 0:\n                pass\n            return input_queue\n\n    def _check_step_not_loaded_or_finished(self, step_name: str) -> bool:\n        \"\"\"Checks if a step is not loaded or already finished.\n\n        Args:\n            step_name: The name of the step.\n\n        Returns:\n            `True` if the step is not loaded or already finished, `False` otherwise.\n        \"\"\"\n        with self._steps_load_status_lock:\n            num_replicas = self._steps_load_status[step_name]\n\n            # The step has finished (replicas = 0) or it has failed to load\n            if num_replicas in [0, _STEP_LOAD_FAILED_CODE]:\n                return True\n\n        return False\n\n    @property\n    @abstractmethod\n    def QueueClass(self) -> Callable:\n        \"\"\"The class of the queue to use in the pipeline.\"\"\"\n        pass\n\n    def _create_step_input_queue(self, step_name: str) -> \"Queue[Any]\":\n        \"\"\"Creates an input queue for a step.\n\n        Args:\n            step_name: The name of the step.\n\n        Returns:\n            The input queue created.\n        \"\"\"\n        input_queue = self.QueueClass()\n        self.dag.set_step_attr(step_name, constants.INPUT_QUEUE_ATTR_NAME, input_queue)\n        return input_queue\n\n    @abstractmethod\n    def _run_step(self, step: \"_Step\", input_queue: \"Queue[Any]\", replica: int) -> None:\n        \"\"\"Runs the `Step` instance.\n\n        Args:\n            step: The `Step` instance to run.\n            input_queue: The input queue where the step will receive the batches.\n            replica: The replica ID assigned.\n        \"\"\"\n        pass\n\n    def _run_steps(self, steps: Iterable[str]) -> 
None:\n        \"\"\"Runs the `Step`s of the pipeline, creating first an input queue for each step\n        that will be used to send the batches.\n\n        Args:\n            steps:\n        \"\"\"\n        for step_name in steps:\n            step: \"Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n            input_queue = self._create_step_input_queue(step_name=step_name)\n\n            # Set `pipeline` to `None` as in some Python environments the pipeline is not\n            # picklable and it will raise an error when trying to send the step to the process.\n            # `TypeError: cannot pickle 'code' object`\n            step.pipeline = None\n\n            if not step.is_normal and step.resources.replicas > 1:  # type: ignore\n                self._logger.warning(\n                    f\"Step '{step_name}' is a `GeneratorStep` or `GlobalStep` and has more\"\n                    \" than 1 replica. Only `Step` instances can have more than 1 replica.\"\n                    \" The number of replicas for the step will be set to 1.\"\n                )\n\n            step_num_replicas: int = step.resources.replicas if step.is_normal else 1  # type: ignore\n            for replica in range(step_num_replicas):\n                self._logger.debug(\n                    f\"Running 1 replica of step '{step.name}' with ID {replica}...\"\n                )\n                self._run_step(\n                    step=step.model_copy(deep=True),\n                    input_queue=input_queue,\n                    replica=replica,\n                )\n\n    def _add_batches_back_to_batch_manager(self) -> None:\n        \"\"\"Add the `Batch`es that were sent to a `Step` back to the `_BatchManager`. 
This\n        method should be used when the pipeline has been stopped prematurely.\"\"\"\n        for step_name in self.dag:\n            node = self.dag.get_step(step_name)\n            step: \"_Step\" = node[constants.STEP_ATTR_NAME]\n            if step.is_generator:\n                continue\n            if input_queue := node.get(constants.INPUT_QUEUE_ATTR_NAME):\n                while not input_queue.empty():\n                    batch = input_queue.get()\n                    if not isinstance(batch, _Batch):\n                        continue\n                    self._batch_manager.add_batch(  # type: ignore\n                        to_step=step_name,\n                        batch=batch,\n                        prepend=True,\n                    )\n                    self._logger.debug(\n                        f\"Adding batch back to the batch manager: {batch}\"\n                    )\n                input_queue.put(None)\n\n    def _consume_output_queue(self) -> None:\n        \"\"\"Consumes the `Batch`es from the output queue until it's empty. This method should\n        be used when the pipeline has been stopped prematurely to consume and to not lose\n        the `Batch`es that were processed by the leaf `Step`s before stopping the pipeline.\"\"\"\n        while not self._output_queue.empty():\n            batch = self._output_queue.get()\n            if batch is None:\n                continue\n            self._process_batch(batch, send_last_batch_flag=False)\n            self._handle_batch_on_stop(batch)\n\n    def _manage_batch_flow(self, batch: \"_Batch\") -> None:\n        \"\"\"Checks if the step that generated the batch has more data in its buffer to\n        generate a new batch. If there's data, then a new batch is sent to the step. 
If\n        the step has no data in its buffer, then the predecessors generator steps are\n        requested to send a new batch.\n\n        Args:\n            batch: The batch that was processed.\n        \"\"\"\n        assert self._batch_manager, \"Batch manager is not set\"\n\n        route_to, do_not_route_to, routed = self._get_successors(batch)\n\n        self._register_batch(batch)\n\n        # Keep track of the steps that the batch was routed to\n        if routed:\n            batch.batch_routed_to = route_to\n\n        self._set_next_expected_seq_no(\n            steps=do_not_route_to,\n            from_step=batch.step_name,\n            next_expected_seq_no=batch.seq_no + 1,\n        )\n\n        step = self._get_step_from_batch(batch)\n\n        # Add the batch to the successors input buffers\n        for successor in route_to:\n            # Copy batch to avoid modifying the same reference in the batch manager\n            batch_to_add = batch.copy() if len(route_to) > 1 else batch\n\n            self._batch_manager.add_batch(successor, batch_to_add)\n\n            # Check if the step is a generator and if there are successors that need data\n            # from this step. 
This usually happens when the generator `batch_size` is smaller\n            # than the `input_batch_size` of the successor steps.\n            if (\n                step.is_generator\n                and step.name in self._batch_manager.step_empty_buffers(successor)\n            ):\n                last_batch_sent = self._batch_manager.get_last_batch_sent(step.name)\n                self._send_batch_to_step(last_batch_sent.next_batch())  # type: ignore\n\n            # If successor step has enough data in its buffer to create a new batch, then\n            # send the batch to the step.\n            while new_batch := self._batch_manager.get_batch(successor):\n                self._send_batch_to_step(new_batch)\n\n        if not step.is_generator:\n            # Step (\"this\", the one from which the batch was received) has enough data on its\n            # buffers to create a new batch\n            while new_batch := self._batch_manager.get_batch(step.name):  # type: ignore\n                # if new_batch := self._batch_manager.get_batch(step.name):  # type: ignore\n                self._send_batch_to_step(new_batch)\n\n            else:\n                self._request_more_batches_if_needed(step)\n        else:\n            if len(self.dag) == 1:\n                self._request_batch_from_generator(step.name)  # type: ignore\n\n        self._cache()\n\n    def _send_to_step(self, step_name: str, to_send: Any) -> None:\n        \"\"\"Sends something to the input queue of a step.\n\n        Args:\n            step_name: The name of the step.\n            to_send: The object to send.\n        \"\"\"\n        input_queue = self.dag.get_step(step_name)[constants.INPUT_QUEUE_ATTR_NAME]\n        input_queue.put(to_send)\n\n    def _send_batch_to_step(self, batch: \"_Batch\") -> None:\n        \"\"\"Sends a batch to the input queue of a step, writing the data of the batch\n        to the filesystem and setting `batch.data_path` with the path where the data\n        was 
written (if requiered i.e. the step is a global step or `use_fs_to_pass_data`)\n\n        This method should be extended by the specific pipeline implementation, adding\n        the logic to send the batch to the step.\n\n        Args:\n            batch: The batch to send.\n        \"\"\"\n        self._logger.debug(\n            f\"Setting batch {batch.seq_no} as last batch sent to '{batch.step_name}': {batch}\"\n        )\n        self._batch_manager.set_last_batch_sent(batch)  # type: ignore\n\n        step: \"_Step\" = self.dag.get_step(batch.step_name)[constants.STEP_ATTR_NAME]\n        if not step.is_generator and (step.is_global or self._use_fs_to_pass_data):\n            base_path = UPath(self._storage_base_path) / step.name  # type: ignore\n            self._logger.debug(\n                f\"Writing {batch.seq_no} batch for '{batch.step_name}' step to filesystem: {base_path}\"\n            )\n            batch.write_batch_data_to_fs(self._fs, base_path)  # type: ignore\n\n        self._logger.debug(\n            f\"Sending batch {batch.seq_no} to step '{batch.step_name}': {batch}\"\n        )\n        self._send_to_step(batch.step_name, batch)\n\n    def _gather_requirements(self) -> List[str]:\n        \"\"\"Extracts the requirements from the steps to be used in the pipeline.\n\n        Returns:\n            List of requirements gathered from the steps.\n        \"\"\"\n        steps_requirements = []\n        for step in self.dag:\n            step_req = self.dag.get_step(step)[constants.STEP_ATTR_NAME].requirements\n            steps_requirements.extend(step_req)\n\n        return steps_requirements\n\n    def _register_batch(self, batch: \"_Batch\") -> None:\n        \"\"\"Registers a batch in the batch manager.\n\n        Args:\n            batch: The batch to register.\n        \"\"\"\n        assert self._batch_manager, \"Batch manager is not set\"\n        self._batch_manager.register_batch(\n            batch, 
steps_data_path=self._cache_location[\"steps_data\"]\n        )  # type: ignore\n        self._logger.debug(\n            f\"Batch {batch.seq_no} from step '{batch.step_name}' registered in batch\"\n            \" manager\"\n        )\n\n    def _send_last_batch_flag_to_step(self, step_name: str) -> None:\n        \"\"\"Sends the `LAST_BATCH_SENT_FLAG` to a step to stop processing batches.\n\n        Args:\n            step_name: The name of the step.\n        \"\"\"\n        self._logger.debug(\n            f\"Sending `LAST_BATCH_SENT_FLAG` to '{step_name}' step to stop processing\"\n            \" batches...\"\n        )\n\n        for _ in range(self.dag.get_step_replica_count(step_name)):\n            self._send_to_step(step_name, constants.LAST_BATCH_SENT_FLAG)\n        self._batch_manager.set_last_batch_flag_sent_to(step_name)  # type: ignore\n\n    def _request_initial_batches(self) -> None:\n        \"\"\"Requests the initial batches to the generator steps.\"\"\"\n        assert self._batch_manager, \"Batch manager is not set\"\n        for step in self._batch_manager._steps.values():\n            if not self._is_step_running(step.step_name):\n                continue\n            if batch := step.get_batch():\n                self._logger.debug(\n                    f\"Sending initial batch to '{step.step_name}' step: {batch}\"\n                )\n                self._send_batch_to_step(batch)\n\n        for step_name in self.dag.root_steps:\n            if not self._is_step_running(step_name):\n                continue\n            seq_no = 0\n            if last_batch := self._batch_manager.get_last_batch(step_name):\n                seq_no = last_batch.seq_no + 1\n            batch = _Batch(seq_no=seq_no, step_name=step_name, last_batch=self._dry_run)\n            self._logger.debug(\n                f\"Requesting initial batch to '{step_name}' generator step: {batch}\"\n            )\n            self._send_batch_to_step(batch)\n\n    def 
_request_batch_from_generator(self, step_name: str) -> None:\n        \"\"\"Request a new batch to a `GeneratorStep`.\n\n        Args:\n            step_name: the name of the `GeneratorStep` to which a batch has to be requested.\n        \"\"\"\n        # Get the last batch that the previous step sent to generate the next batch\n        # (next `seq_no`).\n        last_batch = self._batch_manager.get_last_batch_sent(step_name)  # type: ignore\n        if last_batch is None:\n            return\n        self._send_batch_to_step(last_batch.next_batch())\n\n    def _request_more_batches_if_needed(self, step: \"Step\") -> None:\n        \"\"\"Request more batches to the predecessors steps of `step` if needed.\n\n        Args:\n            step: The step of which it has to be checked if more batches are needed from\n                its predecessors.\n        \"\"\"\n        empty_buffers = self._batch_manager.step_empty_buffers(step.name)  # type: ignore\n        for previous_step_name in empty_buffers:\n            # Only more batches can be requested to the `GeneratorStep`s as they are the\n            # only kind of steps that lazily generate batches.\n            if previous_step_name not in self.dag.root_steps:\n                continue\n\n            self._request_batch_from_generator(previous_step_name)\n\n    def _handle_batch_on_stop(self, batch: \"_Batch\") -> None:\n        \"\"\"Handles a batch that was received from the output queue when the pipeline was\n        stopped. 
It will add and register the batch in the batch manager.\n\n        Args:\n            batch: The batch to handle.\n        \"\"\"\n        assert self._batch_manager, \"Batch manager is not set\"\n\n        self._batch_manager.register_batch(\n            batch, steps_data_path=self._cache_location[\"steps_data\"]\n        )\n        step: \"Step\" = self.dag.get_step(batch.step_name)[constants.STEP_ATTR_NAME]\n        for successor in self.dag.get_step_successors(step.name):  # type: ignore\n            self._batch_manager.add_batch(successor, batch)\n\n    def _get_step_from_batch(self, batch: \"_Batch\") -> \"Step\":\n        \"\"\"Gets the `Step` instance from a batch.\n\n        Args:\n            batch: The batch to get the step from.\n\n        Returns:\n            The `Step` instance.\n        \"\"\"\n        return self.dag.get_step(batch.step_name)[constants.STEP_ATTR_NAME]\n\n    def _notify_steps_to_stop(self) -> None:\n        \"\"\"Notifies the steps to stop their infinite running loop by sending `None` to\n        their input queues.\"\"\"\n        with self._steps_load_status_lock:\n            for step_name, replicas in self._steps_load_status.items():\n                if replicas > 0:\n                    for _ in range(replicas):\n                        self._send_to_step(step_name, None)\n\n    def _get_successors(self, batch: \"_Batch\") -> Tuple[List[str], List[str], bool]:\n        \"\"\"Gets the successors and the successors to which the batch has to be routed.\n\n        Args:\n            batch: The batch to which the successors will be determined.\n\n        Returns:\n            The successors to route the batch to and whether the batch was routed using\n            a routing function.\n        \"\"\"\n        node = self.dag.get_step(batch.step_name)\n        step: \"Step\" = node[constants.STEP_ATTR_NAME]\n        successors = list(self.dag.get_step_successors(step.name))  # type: ignore\n        route_to = successors\n\n        # 
Check if the step has a routing function to send the batch to specific steps\n        if routing_batch_function := node.get(\n            constants.ROUTING_BATCH_FUNCTION_ATTR_NAME\n        ):\n            route_to = routing_batch_function(batch, successors)\n            successors_str = \", \".join(f\"'{successor}'\" for successor in route_to)\n            self._logger.info(\n                f\"\ud83d\ude8f Using '{step.name}' routing function to send batch {batch.seq_no} to steps: {successors_str}\"\n            )\n\n        return route_to, list(set(successors) - set(route_to)), route_to != successors\n\n    def _set_next_expected_seq_no(\n        self, steps: List[str], from_step: str, next_expected_seq_no: int\n    ) -> None:\n        \"\"\"Sets the next expected sequence number of a `_Batch` received by `step` from\n        `from_step`. This is necessary as some `Step`s might not receive all the batches\n        comming from the previous steps because there is a routing batch function.\n\n        Args:\n            steps: list of steps to which the next expected sequence number of a `_Batch`\n                from `from_step` has to be updated in the `_BatchManager`.\n            from_step: the name of the step from which the next expected sequence number\n                of a `_Batch` has to be updated in `steps`.\n            next_expected_seq_no: the number of the next expected sequence number of a `Batch`\n                from `from_step`.\n        \"\"\"\n        assert self._batch_manager, \"Batch manager is not set\"\n\n        for step in steps:\n            self._batch_manager.set_next_expected_seq_no(\n                step_name=step,\n                from_step=from_step,\n                next_expected_seq_no=next_expected_seq_no,\n            )\n\n    @abstractmethod\n    def _teardown(self) -> None:\n        \"\"\"Clean/release/stop resources reserved to run the pipeline.\"\"\"\n        pass\n\n    @abstractmethod\n    def 
_set_steps_not_loaded_exception(self) -> None:\n        \"\"\"Used to raise `RuntimeError` when the load of the steps failed.\n\n        Raises:\n            RuntimeError: containing the information and why a step failed to be loaded.\n        \"\"\"\n        pass\n\n    @abstractmethod\n    def _stop(self) -> None:\n        \"\"\"Stops the pipeline in a controlled way.\"\"\"\n        pass\n\n    def _stop_load_queue_loop(self) -> None:\n        \"\"\"Stops the `_load_queue` loop sending a `None`.\"\"\"\n        self._logger.debug(\"Sending `None` to the load queue to notify stop...\")\n        self._load_queue.put(None)\n\n    def _stop_output_queue_loop(self) -> None:\n        \"\"\"Stops the `_output_queue` loop sending a `None`.\"\"\"\n        self._logger.debug(\"Sending `None` to the output queue to notify stop...\")\n        self._output_queue.put(None)\n\n    def _handle_keyboard_interrupt(self) -> Any:\n        \"\"\"Handles KeyboardInterrupt signal sent during the Pipeline.run method.\n\n        It will try to call self._stop (if the pipeline didn't started yet, it won't\n        have any effect), and if the pool is already started, will close it before exiting\n        the program.\n\n        Returns:\n            The original `signal.SIGINT` handler.\n        \"\"\"\n\n        def signal_handler(signumber: int, frame: Any) -> None:\n            self._stop()\n\n        return signal.signal(signal.SIGINT, signal_handler)\n
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.signature","title":"signature: str property","text":"

Makes a signature (hash) of a pipeline, using the step ids and the adjacency between them.

The main use is to find the pipeline in the cache folder.

Returns:

Type Description str

Signature of the pipeline.

"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.aggregated_steps_signature","title":"aggregated_steps_signature: str property","text":"

Creates an aggregated signature using Steps signature that will be used for the _BatchManager.

Returns:

Type Description str

The aggregated signature.

"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.QueueClass","title":"QueueClass: Callable abstractmethod property","text":"

The class of the queue to use in the pipeline.

"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.__init__","title":"__init__(name=None, description=None, cache_dir=None, enable_metadata=False, requirements=None)","text":"

Initialize the BasePipeline instance.

Parameters:

Name Type Description Default name Optional[str]

The name of the pipeline. If not provided, a random one will be generated by default.

None description Optional[str]

A description of the pipeline. Defaults to None.

None cache_dir Optional[Union[str, PathLike]]

A directory where the pipeline will be cached. Defaults to None.

None enable_metadata bool

Whether to include the distilabel metadata column for the pipeline in the final Distiset. It contains metadata used by distilabel, for example the raw outputs of the LLM without processing would be here, inside raw_output_... field. Defaults to False.

False requirements Optional[List[str]]

List of requirements that must be installed to run the pipeline. Defaults to None, but it can be helpful when sharing a pipeline to inform users that these requirements must be installed.

None Source code in src/distilabel/pipeline/base.py
def __init__(\n    self,\n    name: Optional[str] = None,\n    description: Optional[str] = None,\n    cache_dir: Optional[Union[str, \"PathLike\"]] = None,\n    enable_metadata: bool = False,\n    requirements: Optional[List[str]] = None,\n) -> None:\n    \"\"\"Initialize the `BasePipeline` instance.\n\n    Args:\n        name: The name of the pipeline. If not generated, a random one will be generated by default.\n        description: A description of the pipeline. Defaults to `None`.\n        cache_dir: A directory where the pipeline will be cached. Defaults to `None`.\n        enable_metadata: Whether to include the distilabel metadata column for the pipeline\n            in the final `Distiset`. It contains metadata used by distilabel, for example\n            the raw outputs of the `LLM` without processing would be here, inside `raw_output_...`\n            field. Defaults to `False`.\n        requirements: List of requirements that must be installed to run the pipeline.\n            Defaults to `None`, but can be helpful to inform in a pipeline to be shared\n            that this requirements must be installed.\n    \"\"\"\n    self.name = name or _PIPELINE_DEFAULT_NAME\n    self.description = description\n    self._enable_metadata = enable_metadata\n    self.dag = DAG()\n\n    if cache_dir:\n        self._cache_dir = Path(cache_dir)\n    elif env_cache_dir := envs.DISTILABEL_CACHE_DIR:\n        self._cache_dir = Path(env_cache_dir)\n    else:\n        self._cache_dir = constants.PIPELINES_CACHE_DIR\n\n    self._logger = logging.getLogger(\"distilabel.pipeline\")\n\n    self._batch_manager: Optional[\"_BatchManager\"] = None\n    self._write_buffer: Optional[\"_WriteBuffer\"] = None\n\n    self._steps_load_status: Dict[str, int] = {}\n    self._steps_load_status_lock = threading.Lock()\n\n    self._stop_called = False\n    self._stop_called_lock = threading.Lock()\n    self._stop_calls = 0\n\n    self._recover_offline_batch_generate_for_step: Union[\n        
Tuple[str, List[List[Dict[str, Any]]]], None\n    ] = None\n\n    self._fs: Optional[fsspec.AbstractFileSystem] = None\n    self._storage_base_path: Optional[str] = None\n    self._use_fs_to_pass_data: bool = False\n    self._dry_run = False\n\n    self._current_stage = 0\n    self._stages_last_batch: List[List[str]] = []\n\n    self.requirements = requirements or []\n\n    self._exception: Union[Exception, None] = None\n\n    self._log_queue: Union[\"Queue[Any]\", None] = None\n
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.__enter__","title":"__enter__()","text":"

Set the global pipeline instance when entering a pipeline context.

Source code in src/distilabel/pipeline/base.py
def __enter__(self) -> Self:\n    \"\"\"Set the global pipeline instance when entering a pipeline context.\"\"\"\n    _GlobalPipelineManager.set_pipeline(self)\n    return self\n
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.__exit__","title":"__exit__(exc_type, exc_value, traceback)","text":"

Unset the global pipeline instance when exiting a pipeline context.

Source code in src/distilabel/pipeline/base.py
def __exit__(self, exc_type, exc_value, traceback) -> None:\n    \"\"\"Unset the global pipeline instance when exiting a pipeline context.\"\"\"\n    _GlobalPipelineManager.set_pipeline(None)\n    self._set_pipeline_name()\n
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.run","title":"run(parameters=None, use_cache=True, storage_parameters=None, use_fs_to_pass_data=False, dataset=None, dataset_batch_size=50, logging_handlers=None)","text":"

Run the pipeline. It will set the runtime parameters for the steps and validate the pipeline.

This method should be extended by the specific pipeline implementation, adding the logic to run the pipeline.

Parameters:

Name Type Description Default parameters Optional[Dict[str, Dict[str, Any]]]

A dictionary with the step name as the key and a dictionary with the runtime parameters for the step as the value. Defaults to None.

None use_cache bool

Whether to use the cache from previous pipeline runs. Defaults to True.

True storage_parameters Optional[Dict[str, Any]]

A dictionary with the storage parameters (fsspec and path) that will be used to store the data of the _Batches passed between the steps if use_fs_to_pass_data is True (for the batches received by a GlobalStep it will always be used). It must have at least the \"path\" key, and it can contain additional keys depending on the protocol. By default, it will use the local file system and a directory in the cache directory. Defaults to None.

None use_fs_to_pass_data bool

Whether to use the file system to pass the data of the _Batches between the steps. Even if this parameter is False, the _Batches received by GlobalSteps will always use the file system to pass the data. Defaults to False.

False dataset Optional[InputDataset]

If given, it will be used to create a GeneratorStep and put it as the root step. Convenient method when you have already processed the dataset in your script and just want to pass it already processed. Defaults to None.

None dataset_batch_size int

If dataset is given, this will be the size of the batches yielded by the GeneratorStep created using the dataset. Defaults to 50.

50 logging_handlers Optional[List[Handler]]

A list of logging handlers that will be used to log the output of the pipeline. This argument can be useful so the logging messages can be extracted and used in a different context. Defaults to None.

None

Returns:

Type Description Distiset

The Distiset created by the pipeline.

Source code in src/distilabel/pipeline/base.py
def run(\n    self,\n    parameters: Optional[Dict[str, Dict[str, Any]]] = None,\n    use_cache: bool = True,\n    storage_parameters: Optional[Dict[str, Any]] = None,\n    use_fs_to_pass_data: bool = False,\n    dataset: Optional[\"InputDataset\"] = None,\n    dataset_batch_size: int = 50,\n    logging_handlers: Optional[List[logging.Handler]] = None,\n) -> \"Distiset\":  # type: ignore\n    \"\"\"Run the pipeline. It will set the runtime parameters for the steps and validate\n    the pipeline.\n\n    This method should be extended by the specific pipeline implementation,\n    adding the logic to run the pipeline.\n\n    Args:\n        parameters: A dictionary with the step name as the key and a dictionary with\n            the runtime parameters for the step as the value. Defaults to `None`.\n        use_cache: Whether to use the cache from previous pipeline runs. Defaults to\n            `True`.\n        storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n            that will be used to store the data of the `_Batch`es passed between the\n            steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n            `GlobalStep` it will be always used). It must have at least the \"path\" key,\n            and it can contain additional keys depending on the protocol. By default,\n            it will use the local file system and a directory in the cache directory.\n            Defaults to `None`.\n        use_fs_to_pass_data: Whether to use the file system to pass the data of\n            the `_Batch`es between the steps. Even if this parameter is `False`, the\n            `Batch`es received by `GlobalStep`s will always use the file system to\n            pass the data. Defaults to `False`.\n        dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n            root step. 
Convenient method when you have already processed the dataset in\n            your script and just want to pass it already processed. Defaults to `None`.\n        dataset_batch_size: if `dataset` is given, this will be the size of the batches\n            yield by the `GeneratorStep` created using the `dataset`. Defaults to `50`.\n        logging_handlers: A list of logging handlers that will be used to log the\n            output of the pipeline. This argument can be useful so the logging messages\n            can be extracted and used in a different context. Defaults to `None`.\n\n    Returns:\n        The `Distiset` created by the pipeline.\n    \"\"\"\n\n    self._exception: Union[Exception, None] = None\n\n    # Set the runtime parameters that will be used during the pipeline execution.\n    # They are used to generate the signature of the pipeline that is used to hit the\n    # cache when the pipeline is run, so it's important to do it first.\n    self._set_runtime_parameters(parameters or {})\n\n    self._refresh_pipeline_from_cache()\n\n    if dataset is not None:\n        self._add_dataset_generator_step(dataset, dataset_batch_size)\n\n    setup_logging(\n        log_queue=self._log_queue,\n        filename=str(self._cache_location[\"log_file\"]),\n        logging_handlers=logging_handlers,\n    )\n\n    # Set the name of the pipeline if it's the default one. This should be called\n    # if the pipeline is defined within the context manager, and the run is called\n    # outside of it. 
Is here in the following case:\n    # with Pipeline() as pipeline:\n    #    pipeline.run()\n    self._set_pipeline_name()\n\n    # Validate the pipeline DAG to check that all the steps are chainable, there are\n    # no missing runtime parameters, batch sizes are correct, etc.\n    self.dag.validate()\n\n    self._set_pipeline_artifacts_path_in_steps()\n\n    # Set the initial load status for all the steps\n    self._init_steps_load_status()\n\n    # Load the stages status or initialize it\n    self._load_stages_status(use_cache)\n\n    # Load the `_BatchManager` from cache or create one from scratch\n    self._load_batch_manager(use_cache)\n\n    # Check pipeline requirements are installed\n    self._check_requirements()\n\n    # Setup the filesystem that will be used to pass the data of the `_Batch`es\n    self._setup_fsspec(storage_parameters)\n    self._use_fs_to_pass_data = use_fs_to_pass_data\n\n    if self._dry_run:\n        self._logger.info(\"\ud83c\udf35 Dry run mode\")\n\n    # If the batch manager is not able to generate batches, that means that the loaded\n    # `_BatchManager` from cache didn't have any remaining batches to process i.e.\n    # the previous pipeline execution was completed successfully.\n    if not self._batch_manager.can_generate():  # type: ignore\n        self._logger.info(\n            \"\ud83d\udcbe Loaded batch manager from cache doesn't contain any remaining data.\"\n            \" Returning `Distiset` from cache data...\"\n        )\n        distiset = create_distiset(\n            data_dir=self._cache_location[\"data\"],\n            pipeline_path=self._cache_location[\"pipeline\"],\n            log_filename_path=self._cache_location[\"log_file\"],\n            enable_metadata=self._enable_metadata,\n            dag=self.dag,\n        )\n        stop_logging()\n        return distiset\n\n    self._setup_write_buffer(use_cache)\n\n    self._print_load_stages_info()\n
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.dry_run","title":"dry_run(parameters=None, batch_size=1, dataset=None)","text":"

Do a dry run to test the pipeline runs as expected.

Running a Pipeline in dry run mode will set the batch_size of all the generator steps to the specified batch_size, and run with just a single batch, effectively running the whole pipeline with a single example when batch_size is 1. The cache will be disabled (use_cache is set to False).

Parameters:

Name Type Description Default parameters Optional[Dict[str, Dict[str, Any]]]

A dictionary with the step name as the key and a dictionary with the runtime parameters for the step as the value. Defaults to None.

None batch_size int

The batch size of the single batch generated by the generator steps of the pipeline. Defaults to 1.

1 dataset Optional[InputDataset]

If given, it will be used to create a GeneratorStep and put it as the root step. Convenient method when you have already processed the dataset in your script and just want to pass it already processed. Defaults to None.

None

Returns:

Type Description Distiset

Will return the Distiset as the main run method would do.

Source code in src/distilabel/pipeline/base.py
def dry_run(\n    self,\n    parameters: Optional[Dict[str, Dict[str, Any]]] = None,\n    batch_size: int = 1,\n    dataset: Optional[\"InputDataset\"] = None,\n) -> \"Distiset\":\n    \"\"\"Do a dry run to test the pipeline runs as expected.\n\n    Running a `Pipeline` in dry run mode will set all the `batch_size` of generator steps\n    to the specified `batch_size`, and run just with a single batch, effectively\n    running the whole pipeline with a single example. The cache will be set to `False`.\n\n    Args:\n        parameters: A dictionary with the step name as the key and a dictionary with\n            the runtime parameters for the step as the value. Defaults to `None`.\n        batch_size: The batch size of the unique batch generated by the generators\n            steps of the pipeline. Defaults to `1`.\n        dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n            root step. Convenient method when you have already processed the dataset in\n            your script and just want to pass it already processed. Defaults to `None`.\n\n    Returns:\n        Will return the `Distiset` as the main run method would do.\n    \"\"\"\n    self._dry_run = True\n\n    for step_name in self.dag:\n        step = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n\n        if step.is_generator:\n            if not parameters:\n                parameters = {}\n            parameters[step_name] = {\"batch_size\": batch_size}\n\n    distiset = self.run(parameters=parameters, use_cache=False, dataset=dataset)\n\n    self._dry_run = False\n    return distiset\n
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.get_runtime_parameters_info","title":"get_runtime_parameters_info()","text":"

Get the runtime parameters for the steps in the pipeline.

Returns:

Type Description PipelineRuntimeParametersInfo

A dictionary with the step name as the key and a list of dictionaries with the parameter name and the parameter info as the value.

Source code in src/distilabel/pipeline/base.py
def get_runtime_parameters_info(self) -> \"PipelineRuntimeParametersInfo\":\n    \"\"\"Get the runtime parameters for the steps in the pipeline.\n\n    Returns:\n        A dictionary with the step name as the key and a list of dictionaries with\n        the parameter name and the parameter info as the value.\n    \"\"\"\n    runtime_parameters = {}\n    for step_name in self.dag:\n        step: \"_Step\" = self.dag.get_step(step_name)[constants.STEP_ATTR_NAME]\n        runtime_parameters[step_name] = step.get_runtime_parameters_info()\n    return runtime_parameters\n
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.draw","title":"draw(path='pipeline.png', top_to_bottom=False, show_edge_labels=True)","text":"

Draws the pipeline.

Parameters:

Name Type Description Default path Optional[Union[str, Path]]

The path to save the image to.

'pipeline.png' top_to_bottom bool

Whether to draw the DAG top to bottom. Defaults to False.

False show_edge_labels bool

Whether to show the edge labels. Defaults to True.

True

Returns:

Type Description str

The path to the saved image.

Source code in src/distilabel/pipeline/base.py
def draw(\n    self,\n    path: Optional[Union[str, Path]] = \"pipeline.png\",\n    top_to_bottom: bool = False,\n    show_edge_labels: bool = True,\n) -> str:\n    \"\"\"\n    Draws the pipeline.\n\n    Parameters:\n        path: The path to save the image to.\n        top_to_bottom: Whether to draw the DAG top to bottom. Defaults to `False`.\n        show_edge_labels: Whether to show the edge labels. Defaults to `True`.\n\n    Returns:\n        The path to the saved image.\n    \"\"\"\n    png = self.dag.draw(\n        top_to_bottom=top_to_bottom, show_edge_labels=show_edge_labels\n    )\n    with open(path, \"wb\") as f:\n        f.write(png)\n    return path\n
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.__repr__","title":"__repr__()","text":"

If running in a Jupyter notebook, display an image representing this Pipeline.

Source code in src/distilabel/pipeline/base.py
def __repr__(self) -> str:\n    \"\"\"\n    If running in a Jupyter notebook, display an image representing this `Pipeline`.\n    \"\"\"\n    if in_notebook():\n        try:\n            from IPython.display import Image, display\n\n            image_data = self.dag.draw()\n\n            display(Image(image_data))\n        except Exception:\n            pass\n    return super().__repr__()\n
"},{"location":"api/pipeline/#distilabel.pipeline.base.BasePipeline.from_dict","title":"from_dict(data) classmethod","text":"

Create a Pipeline from a dict containing the serialized data.

Note

It's intended for internal use.

Parameters:

Name Type Description Default data Dict[str, Any]

Dictionary containing the serialized data from a Pipeline.

required

Returns:

Name Type Description BasePipeline Self

Pipeline recreated from the dictionary info.

Source code in src/distilabel/pipeline/base.py
@classmethod\ndef from_dict(cls, data: Dict[str, Any]) -> Self:\n    \"\"\"Create a Pipeline from a dict containing the serialized data.\n\n    Note:\n        It's intended for internal use.\n\n    Args:\n        data (Dict[str, Any]): Dictionary containing the serialized data from a Pipeline.\n\n    Returns:\n        BasePipeline: Pipeline recreated from the dictionary info.\n    \"\"\"\n    name = data[\"pipeline\"][\"name\"]\n    description = data[\"pipeline\"].get(\"description\")\n    requirements = data.get(\"requirements\", [])\n    with cls(name=name, description=description, requirements=requirements) as pipe:\n        pipe.dag = DAG.from_dict(data[\"pipeline\"])\n    return pipe\n
"},{"location":"api/pipeline/#distilabel.pipeline.local","title":"local","text":""},{"location":"api/pipeline/#distilabel.pipeline.local.Pipeline","title":"Pipeline","text":"

Bases: BasePipeline

Local pipeline implementation using multiprocessing.

Source code in src/distilabel/pipeline/local.py
class Pipeline(BasePipeline):\n    \"\"\"Local pipeline implementation using `multiprocessing`.\"\"\"\n\n    def ray(\n        self,\n        ray_head_node_url: Optional[str] = None,\n        ray_init_kwargs: Optional[Dict[str, Any]] = None,\n    ) -> RayPipeline:\n        \"\"\"Creates a `RayPipeline` using the init parameters of this pipeline. This is a\n        convenient method that can be used to \"transform\" one common `Pipeline` to a `RayPipeline`\n        and it's mainly used by the CLI.\n\n        Args:\n            ray_head_node_url: The URL that can be used to connect to the head node of\n                the Ray cluster. Normally, you won't want to use this argument as the\n                recommended way to submit a job to a Ray cluster is using the [Ray Jobs\n                CLI](https://docs.ray.io/en/latest/cluster/running-applications/job-submission/index.html#ray-jobs-overview).\n                Defaults to `None`.\n            ray_init_kwargs: kwargs that will be passed to the `ray.init` method. 
Defaults\n                to `None`.\n\n        Returns:\n            A `RayPipeline` instance.\n        \"\"\"\n        pipeline = RayPipeline(\n            name=self.name,\n            description=self.description,\n            cache_dir=self._cache_dir,\n            enable_metadata=self._enable_metadata,\n            requirements=self.requirements,\n            ray_head_node_url=ray_head_node_url,\n            ray_init_kwargs=ray_init_kwargs,\n        )\n        pipeline.dag = self.dag\n        return pipeline\n\n    def run(\n        self,\n        parameters: Optional[Dict[Any, Dict[str, Any]]] = None,\n        use_cache: bool = True,\n        storage_parameters: Optional[Dict[str, Any]] = None,\n        use_fs_to_pass_data: bool = False,\n        dataset: Optional[\"InputDataset\"] = None,\n        dataset_batch_size: int = 50,\n        logging_handlers: Optional[List[\"logging.Handler\"]] = None,\n    ) -> \"Distiset\":\n        \"\"\"Runs the pipeline.\n\n        Args:\n            parameters: A dictionary with the step name as the key and a dictionary with\n                the runtime parameters for the step as the value. Defaults to `None`.\n            use_cache: Whether to use the cache from previous pipeline runs. Defaults to\n                `True`.\n            storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n                that will be used to store the data of the `_Batch`es passed between the\n                steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n                `GlobalStep` it will be always used). It must have at least the \"path\" key,\n                and it can contain additional keys depending on the protocol. 
By default,\n                it will use the local file system and a directory in the cache directory.\n                Defaults to `None`.\n            use_fs_to_pass_data: Whether to use the file system to pass the data of\n                the `_Batch`es between the steps. Even if this parameter is `False`, the\n                `Batch`es received by `GlobalStep`s will always use the file system to\n                pass the data. Defaults to `False`.\n            dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n                root step. Convenient method when you have already processed the dataset in\n                your script and just want to pass it already processed. Defaults to `None`.\n            dataset_batch_size: if `dataset` is given, this will be the size of the batches\n                yield by the `GeneratorStep` created using the `dataset`. Defaults to `50`.\n            logging_handlers: A list of logging handlers that will be used to log the\n                output of the pipeline. This argument can be useful so the logging messages\n                can be extracted and used in a different context. Defaults to `None`.\n\n        Returns:\n            The `Distiset` created by the pipeline.\n\n        Raises:\n            RuntimeError: If the pipeline fails to load all the steps.\n        \"\"\"\n        if script_executed_in_ray_cluster():\n            print(\"Script running in Ray cluster... 
Using `RayPipeline`...\")\n            return self.ray().run(\n                parameters=parameters,\n                use_cache=use_cache,\n                storage_parameters=storage_parameters,\n                use_fs_to_pass_data=use_fs_to_pass_data,\n                dataset=dataset,\n                dataset_batch_size=dataset_batch_size,\n            )\n\n        self._log_queue = cast(\"Queue[Any]\", mp.Queue())\n\n        if distiset := super().run(\n            parameters=parameters,\n            use_cache=use_cache,\n            storage_parameters=storage_parameters,\n            use_fs_to_pass_data=use_fs_to_pass_data,\n            dataset=dataset,\n            dataset_batch_size=dataset_batch_size,\n            logging_handlers=logging_handlers,\n        ):\n            return distiset\n\n        num_processes = self.dag.get_total_replica_count()\n        with (\n            mp.Manager() as manager,\n            _NoDaemonPool(\n                num_processes,\n                initializer=_init_worker,\n                initargs=(\n                    self._log_queue,\n                    self.name,\n                    self.signature,\n                ),\n            ) as pool,\n        ):\n            self._manager = manager\n            self._pool = pool\n            self._output_queue = self.QueueClass()\n            self._load_queue = self.QueueClass()\n            self._handle_keyboard_interrupt()\n\n            # Run the loop for receiving the load status of each step\n            self._load_steps_thread = self._run_load_queue_loop_in_thread()\n\n            # Start a loop to receive the output batches from the steps\n            self._output_queue_thread = self._run_output_queue_loop_in_thread()\n            self._output_queue_thread.join()\n\n            self._teardown()\n\n            if self._exception:\n                raise self._exception\n\n        distiset = create_distiset(\n            self._cache_location[\"data\"],\n            
pipeline_path=self._cache_location[\"pipeline\"],\n            log_filename_path=self._cache_location[\"log_file\"],\n            enable_metadata=self._enable_metadata,\n            dag=self.dag,\n        )\n\n        stop_logging()\n\n        return distiset\n\n    @property\n    def QueueClass(self) -> Callable:\n        \"\"\"The callable used to create the input and output queues.\n\n        Returns:\n            The callable to create a `Queue`.\n        \"\"\"\n        assert self._manager, \"Manager is not initialized\"\n        return self._manager.Queue\n\n    def _run_step(self, step: \"_Step\", input_queue: \"Queue[Any]\", replica: int) -> None:\n        \"\"\"Runs the `Step` wrapped in a `_ProcessWrapper` in a separate process of the\n        `Pool`.\n\n        Args:\n            step: The step to run.\n            input_queue: The input queue to send the data to the step.\n            replica: The replica ID assigned.\n        \"\"\"\n        assert self._pool, \"Pool is not initialized\"\n\n        step_wrapper = _StepWrapper(\n            step=step,  # type: ignore\n            replica=replica,\n            input_queue=input_queue,\n            output_queue=self._output_queue,\n            load_queue=self._load_queue,\n            dry_run=self._dry_run,\n            ray_pipeline=False,\n        )\n\n        self._pool.apply_async(step_wrapper.run, error_callback=self._error_callback)\n\n    def _error_callback(self, e: BaseException) -> None:\n        \"\"\"Error callback that will be called when an error occurs in a `Step` process.\n\n        Args:\n            e: The exception raised by the process.\n        \"\"\"\n        global _SUBPROCESS_EXCEPTION\n\n        # First we check that the exception is a `_StepWrapperException`, otherwise, we\n        # print it out and stop the pipeline, since some errors may be unhandled\n        if not isinstance(e, _StepWrapperException):\n            self._logger.error(f\"\u274c Failed with an unhandled 
exception: {e}\")\n            self._stop()\n            return\n\n        if e.is_load_error:\n            self._logger.error(f\"\u274c Failed to load step '{e.step.name}': {e.message}\")\n            _SUBPROCESS_EXCEPTION = e.subprocess_exception\n            _SUBPROCESS_EXCEPTION.__traceback__ = tblib.Traceback.from_string(  # type: ignore\n                e.formatted_traceback\n            ).as_traceback()\n            return\n\n        # If the step is global, is not in the last trophic level and has no successors,\n        # then we can ignore the error and continue executing the pipeline\n        step_name: str = e.step.name  # type: ignore\n        if (\n            e.step.is_global\n            and not self.dag.step_in_last_trophic_level(step_name)\n            and list(self.dag.get_step_successors(step_name)) == []\n        ):\n            self._logger.error(\n                f\"\u270b An error occurred when running global step '{step_name}' with no\"\n                \" successors and not in the last trophic level. Pipeline execution can\"\n                f\" continue. Error will be ignored.\"\n            )\n            self._logger.error(f\"Subprocess traceback:\\n\\n{e.formatted_traceback}\")\n            return\n\n        # Handle tasks using an `LLM` using offline batch generation\n        if isinstance(\n            e.subprocess_exception, DistilabelOfflineBatchGenerationNotFinishedException\n        ):\n            self._logger.info(\n                f\"\u23f9\ufe0f '{e.step.name}' task stopped pipeline execution: LLM offline batch\"\n                \" generation in progress. 
Rerun pipeline with cache to check results and\"\n                \" continue execution.\"\n            )\n            self._set_step_for_recovering_offline_batch_generation(e.step, e.data)  # type: ignore\n            with self._stop_called_lock:\n                if not self._stop_called:\n                    self._stop(acquire_lock=False)\n            return\n\n        # Global step with successors failed\n        self._logger.error(f\"An error occurred in global step '{step_name}'\")\n        self._logger.error(f\"Subprocess traceback:\\n\\n{e.formatted_traceback}\")\n\n        self._stop()\n\n    def _teardown(self) -> None:\n        \"\"\"Clean/release/stop resources reserved to run the pipeline.\"\"\"\n        if self._write_buffer:\n            self._write_buffer.close()\n\n        if self._batch_manager:\n            self._batch_manager = None\n\n        self._stop_load_queue_loop()\n        self._load_steps_thread.join()\n\n        if self._pool:\n            self._pool.terminate()\n            self._pool.join()\n\n        if self._manager:\n            self._manager.shutdown()\n            self._manager.join()\n\n    def _set_steps_not_loaded_exception(self) -> None:\n        \"\"\"Raises a `RuntimeError` notifying that the steps load has failed.\n\n        Raises:\n            RuntimeError: containing the information and why a step failed to be loaded.\n        \"\"\"\n        self._exception = RuntimeError(\n            \"Failed to load all the steps. Could not run pipeline.\"\n        )\n        self._exception.__cause__ = _SUBPROCESS_EXCEPTION\n\n    def _stop(self, acquire_lock: bool = True) -> None:\n        \"\"\"Stops the pipeline execution. It will first send `None` to the input queues\n        of all the steps and then wait until the output queue is empty i.e. all the steps\n        finished processing the batches that were sent before the stop flag. 
Then it will\n        send `None` to the output queue to notify the pipeline to stop.\n\n        Args:\n            acquire_lock: Whether to acquire the lock to access the `_stop_called` attribute.\n        \"\"\"\n\n        if acquire_lock:\n            self._stop_called_lock.acquire()\n\n        if self._stop_called:\n            self._stop_calls += 1\n            if self._stop_calls == 1:\n                self._logger.warning(\"\ud83d\uded1 Press again to force the pipeline to stop.\")\n            elif self._stop_calls > 1:\n                self._logger.warning(\"\ud83d\uded1 Forcing pipeline interruption.\")\n\n                if self._pool:\n                    self._pool.terminate()\n                    self._pool.join()\n                    self._pool = None\n\n                if self._manager:\n                    self._manager.shutdown()\n                    self._manager.join()\n                    self._manager = None\n\n                stop_logging()\n\n                sys.exit(1)\n\n            return\n        self._stop_called = True\n\n        if acquire_lock:\n            self._stop_called_lock.release()\n\n        self._logger.debug(\n            f\"Steps loaded before calling `stop`: {self._steps_load_status}\"\n        )\n        self._logger.info(\n            \"\ud83d\uded1 Stopping pipeline. Waiting for steps to finish processing batches...\"\n        )\n\n        self._stop_output_queue_loop()\n
"},{"location":"api/pipeline/#distilabel.pipeline.local.Pipeline.QueueClass","title":"QueueClass: Callable property","text":"

The callable used to create the input and output queues.

Returns:

Type Description Callable

The callable to create a Queue.

"},{"location":"api/pipeline/#distilabel.pipeline.local.Pipeline.ray","title":"ray(ray_head_node_url=None, ray_init_kwargs=None)","text":"

Creates a RayPipeline using the init parameters of this pipeline. This is a convenience method that can be used to \"transform\" a regular Pipeline into a RayPipeline, and it's mainly used by the CLI.

Parameters:

Name Type Description Default ray_head_node_url Optional[str]

The URL that can be used to connect to the head node of the Ray cluster. Normally, you won't want to use this argument as the recommended way to submit a job to a Ray cluster is using the Ray Jobs CLI. Defaults to None.

None ray_init_kwargs Optional[Dict[str, Any]]

kwargs that will be passed to the ray.init method. Defaults to None.

None

Returns:

Type Description RayPipeline

A RayPipeline instance.

Source code in src/distilabel/pipeline/local.py
def ray(\n    self,\n    ray_head_node_url: Optional[str] = None,\n    ray_init_kwargs: Optional[Dict[str, Any]] = None,\n) -> RayPipeline:\n    \"\"\"Creates a `RayPipeline` using the init parameters of this pipeline. This is a\n    convenient method that can be used to \"transform\" one common `Pipeline` to a `RayPipeline`\n    and it's mainly used by the CLI.\n\n    Args:\n        ray_head_node_url: The URL that can be used to connect to the head node of\n            the Ray cluster. Normally, you won't want to use this argument as the\n            recommended way to submit a job to a Ray cluster is using the [Ray Jobs\n            CLI](https://docs.ray.io/en/latest/cluster/running-applications/job-submission/index.html#ray-jobs-overview).\n            Defaults to `None`.\n        ray_init_kwargs: kwargs that will be passed to the `ray.init` method. Defaults\n            to `None`.\n\n    Returns:\n        A `RayPipeline` instance.\n    \"\"\"\n    pipeline = RayPipeline(\n        name=self.name,\n        description=self.description,\n        cache_dir=self._cache_dir,\n        enable_metadata=self._enable_metadata,\n        requirements=self.requirements,\n        ray_head_node_url=ray_head_node_url,\n        ray_init_kwargs=ray_init_kwargs,\n    )\n    pipeline.dag = self.dag\n    return pipeline\n
"},{"location":"api/pipeline/#distilabel.pipeline.local.Pipeline.run","title":"run(parameters=None, use_cache=True, storage_parameters=None, use_fs_to_pass_data=False, dataset=None, dataset_batch_size=50, logging_handlers=None)","text":"

Runs the pipeline.

Parameters:

Name Type Description Default parameters Optional[Dict[Any, Dict[str, Any]]]

A dictionary with the step name as the key and a dictionary with the runtime parameters for the step as the value. Defaults to None.

None use_cache bool

Whether to use the cache from previous pipeline runs. Defaults to True.

True storage_parameters Optional[Dict[str, Any]]

A dictionary with the storage parameters (fsspec and path) that will be used to store the data of the _Batches passed between the steps if use_fs_to_pass_data is True (for the batches received by a GlobalStep it will always be used). It must have at least the \"path\" key, and it can contain additional keys depending on the protocol. By default, it will use the local file system and a directory in the cache directory. Defaults to None.

None use_fs_to_pass_data bool

Whether to use the file system to pass the data of the _Batches between the steps. Even if this parameter is False, the _Batches received by GlobalSteps will always use the file system to pass the data. Defaults to False.

False dataset Optional[InputDataset]

If given, it will be used to create a GeneratorStep and put it as the root step. Convenient method when you have already processed the dataset in your script and just want to pass it already processed. Defaults to None.

None dataset_batch_size int

If dataset is given, this will be the size of the batches yielded by the GeneratorStep created using the dataset. Defaults to 50.

50 logging_handlers Optional[List[Handler]]

A list of logging handlers that will be used to log the output of the pipeline. This argument can be useful so the logging messages can be extracted and used in a different context. Defaults to None.

None

Returns:

Type Description Distiset

The Distiset created by the pipeline.

Raises:

Type Description RuntimeError

If the pipeline fails to load all the steps.

Source code in src/distilabel/pipeline/local.py
def run(\n    self,\n    parameters: Optional[Dict[Any, Dict[str, Any]]] = None,\n    use_cache: bool = True,\n    storage_parameters: Optional[Dict[str, Any]] = None,\n    use_fs_to_pass_data: bool = False,\n    dataset: Optional[\"InputDataset\"] = None,\n    dataset_batch_size: int = 50,\n    logging_handlers: Optional[List[\"logging.Handler\"]] = None,\n) -> \"Distiset\":\n    \"\"\"Runs the pipeline.\n\n    Args:\n        parameters: A dictionary with the step name as the key and a dictionary with\n            the runtime parameters for the step as the value. Defaults to `None`.\n        use_cache: Whether to use the cache from previous pipeline runs. Defaults to\n            `True`.\n        storage_parameters: A dictionary with the storage parameters (`fsspec` and path)\n            that will be used to store the data of the `_Batch`es passed between the\n            steps if `use_fs_to_pass_data` is `True` (for the batches received by a\n            `GlobalStep` it will be always used). It must have at least the \"path\" key,\n            and it can contain additional keys depending on the protocol. By default,\n            it will use the local file system and a directory in the cache directory.\n            Defaults to `None`.\n        use_fs_to_pass_data: Whether to use the file system to pass the data of\n            the `_Batch`es between the steps. Even if this parameter is `False`, the\n            `Batch`es received by `GlobalStep`s will always use the file system to\n            pass the data. Defaults to `False`.\n        dataset: If given, it will be used to create a `GeneratorStep` and put it as the\n            root step. Convenient method when you have already processed the dataset in\n            your script and just want to pass it already processed. Defaults to `None`.\n        dataset_batch_size: if `dataset` is given, this will be the size of the batches\n            yield by the `GeneratorStep` created using the `dataset`. 
Defaults to `50`.\n        logging_handlers: A list of logging handlers that will be used to log the\n            output of the pipeline. This argument can be useful so the logging messages\n            can be extracted and used in a different context. Defaults to `None`.\n\n    Returns:\n        The `Distiset` created by the pipeline.\n\n    Raises:\n        RuntimeError: If the pipeline fails to load all the steps.\n    \"\"\"\n    if script_executed_in_ray_cluster():\n        print(\"Script running in Ray cluster... Using `RayPipeline`...\")\n        return self.ray().run(\n            parameters=parameters,\n            use_cache=use_cache,\n            storage_parameters=storage_parameters,\n            use_fs_to_pass_data=use_fs_to_pass_data,\n            dataset=dataset,\n            dataset_batch_size=dataset_batch_size,\n        )\n\n    self._log_queue = cast(\"Queue[Any]\", mp.Queue())\n\n    if distiset := super().run(\n        parameters=parameters,\n        use_cache=use_cache,\n        storage_parameters=storage_parameters,\n        use_fs_to_pass_data=use_fs_to_pass_data,\n        dataset=dataset,\n        dataset_batch_size=dataset_batch_size,\n        logging_handlers=logging_handlers,\n    ):\n        return distiset\n\n    num_processes = self.dag.get_total_replica_count()\n    with (\n        mp.Manager() as manager,\n        _NoDaemonPool(\n            num_processes,\n            initializer=_init_worker,\n            initargs=(\n                self._log_queue,\n                self.name,\n                self.signature,\n            ),\n        ) as pool,\n    ):\n        self._manager = manager\n        self._pool = pool\n        self._output_queue = self.QueueClass()\n        self._load_queue = self.QueueClass()\n        self._handle_keyboard_interrupt()\n\n        # Run the loop for receiving the load status of each step\n        self._load_steps_thread = self._run_load_queue_loop_in_thread()\n\n        # Start a loop to receive the 
output batches from the steps\n        self._output_queue_thread = self._run_output_queue_loop_in_thread()\n        self._output_queue_thread.join()\n\n        self._teardown()\n\n        if self._exception:\n            raise self._exception\n\n    distiset = create_distiset(\n        self._cache_location[\"data\"],\n        pipeline_path=self._cache_location[\"pipeline\"],\n        log_filename_path=self._cache_location[\"log_file\"],\n        enable_metadata=self._enable_metadata,\n        dag=self.dag,\n    )\n\n    stop_logging()\n\n    return distiset\n
"},{"location":"api/pipeline/routing_batch_function/","title":"Routing batch function","text":""},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function","title":"routing_batch_function","text":""},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunc","title":"RoutingBatchFunc = Callable[[List[str]], List[str]] module-attribute","text":"

Type alias for a routing batch function. It takes a list of all the downstream steps and returns a list with the names of the steps that should receive the batch.

"},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction","title":"RoutingBatchFunction","text":"

Bases: BaseModel, _Serializable

A thin wrapper around a routing batch function that can be used to route batches from one upstream step to specific downstream steps.

Attributes:

Name Type Description routing_function RoutingBatchFunc

The routing function that takes a list of all the downstream steps and returns a list with the names of the steps that should receive the batch.

_step Union[_Step, None]

The upstream step that is connected to the routing batch function.

_routed_batch_registry Dict[str, Dict[int, List[str]]]

A dictionary that keeps track of the batches that have been routed to specific downstream steps.

Source code in src/distilabel/pipeline/routing_batch_function.py
class RoutingBatchFunction(BaseModel, _Serializable):\n    \"\"\"A thin wrapper around a routing batch function that can be used to route batches\n    from one upstream step to specific downstream steps.\n\n    Attributes:\n        routing_function: The routing function that takes a list of all the downstream steps\n            and returns a list with the names of the steps that should receive the batch.\n        _step: The upstream step that is connected to the routing batch function.\n        _routed_batch_registry: A dictionary that keeps track of the batches that have been\n            routed to specific downstream steps.\n    \"\"\"\n\n    routing_function: RoutingBatchFunc\n    description: Optional[str] = None\n\n    _step: Union[\"_Step\", None] = PrivateAttr(default=None)\n    _routed_batch_registry: Dict[str, Dict[int, List[str]]] = PrivateAttr(\n        default_factory=dict\n    )\n    _factory_function_module: Union[str, None] = PrivateAttr(default=None)\n    _factory_function_name: Union[str, None] = PrivateAttr(default=None)\n    _factory_function_kwargs: Union[Dict[str, Any], None] = PrivateAttr(default=None)\n\n    def route_batch(self, batch: \"_Batch\", steps: List[str]) -> List[str]:\n        \"\"\"Returns a list of selected downstream steps from `steps` to which the `batch`\n        should be routed.\n\n        Args:\n            batch: The batch that should be routed.\n            steps: A list of all the downstream steps that can receive the batch.\n\n        Returns:\n            A list with the names of the steps that should receive the batch.\n        \"\"\"\n        routed_steps = self.routing_function(steps)\n        self._register_routed_batch(batch, routed_steps)\n        return routed_steps\n\n    def set_factory_function(\n        self,\n        factory_function_module: str,\n        factory_function_name: str,\n        factory_function_kwargs: Dict[str, Any],\n    ) -> None:\n        \"\"\"Sets the factory function that was used to 
create the `routing_batch_function`.\n\n        Args:\n            factory_function_module: The module name where the factory function is defined.\n            factory_function_name: The name of the factory function that was used to create\n                the `routing_batch_function`.\n            factory_function_kwargs: The keyword arguments that were used when calling the\n                factory function.\n        \"\"\"\n        self._factory_function_module = factory_function_module\n        self._factory_function_name = factory_function_name\n        self._factory_function_kwargs = factory_function_kwargs\n\n    def __call__(self, batch: \"_Batch\", steps: List[str]) -> List[str]:\n        \"\"\"Returns a list of selected downstream steps from `steps` to which the `batch`\n        should be routed.\n\n        Args:\n            batch: The batch that should be routed.\n            steps: A list of all the downstream steps that can receive the batch.\n\n        Returns:\n            A list with the names of the steps that should receive the batch.\n        \"\"\"\n        return self.route_batch(batch, steps)\n\n    def _register_routed_batch(self, batch: \"_Batch\", routed_steps: List[str]) -> None:\n        \"\"\"Registers a batch that has been routed to specific downstream steps.\n\n        Args:\n            batch: The batch that has been routed.\n            routed_steps: The list of downstream steps that have been selected to receive\n                the batch.\n        \"\"\"\n        upstream_step = batch.step_name\n        batch_seq_no = batch.seq_no\n        self._routed_batch_registry.setdefault(upstream_step, {}).setdefault(\n            batch_seq_no, routed_steps\n        )\n\n    def __rshift__(\n        self, other: List[\"DownstreamConnectableSteps\"]\n    ) -> List[\"DownstreamConnectableSteps\"]:\n        \"\"\"Connects a list of dowstream steps to the upstream step of the routing batch\n        function.\n\n        Args:\n            other: 
A list of downstream steps that should be connected to the upstream step\n                of the routing batch function.\n\n        Returns:\n            The list of downstream steps that have been connected to the upstream step of the\n            routing batch function.\n        \"\"\"\n        if not isinstance(other, list):\n            raise DistilabelUserError(\n                f\"Can only set a `routing_batch_function` for a list of steps. Got: {other}.\"\n                \" Please, review the right-hand side of the `routing_batch_function >> other`\"\n                \" expression. It should be\"\n                \" `upstream_step >> routing_batch_function >> [downstream_step_1, dowstream_step_2, ...]`.\",\n                page=\"sections/how_to_guides/basic/pipeline/?h=routing#routing-batches-to-specific-downstream-steps\",\n            )\n\n        if not self._step:\n            raise DistilabelUserError(\n                \"Routing batch function doesn't have an upstream step. Cannot connect downstream\"\n                \" steps before connecting the upstream step. Connect this routing batch\"\n                \" function to an upstream step using the `>>` operator. 
For example:\"\n                \" `upstream_step >> routing_batch_function >> [downstream_step_1, downstream_step_2, ...]`.\",\n                page=\"sections/how_to_guides/basic/pipeline/?h=routing#routing-batches-to-specific-downstream-steps\",\n            )\n\n        for step in other:\n            self._step.connect(step)\n        return other\n\n    def dump(self, **kwargs: Any) -> Dict[str, Any]:\n        \"\"\"Dumps the routing batch function to a dictionary, and the information of the\n        factory function used to create this routing batch function.\n\n        Args:\n            **kwargs: Additional keyword arguments that should be included in the dump.\n\n        Returns:\n            A dictionary with the routing batch function information and the factory function\n            information.\n        \"\"\"\n        dump_info: Dict[str, Any] = {\"step\": self._step.name}  # type: ignore\n\n        if self.description:\n            dump_info[\"description\"] = self.description\n\n        if type_info := self._get_type_info():\n            dump_info[TYPE_INFO_KEY] = type_info\n\n        return dump_info\n\n    def _get_type_info(self) -> Dict[str, Any]:\n        \"\"\"Returns the information of the factory function used to create the routing batch\n        function.\n\n        Returns:\n            A dictionary with the factory function information.\n        \"\"\"\n\n        type_info = {}\n\n        if self._factory_function_module:\n            type_info[\"module\"] = self._factory_function_module\n\n        if self._factory_function_name:\n            type_info[\"name\"] = self._factory_function_name\n\n        if self._factory_function_kwargs:\n            type_info[\"kwargs\"] = self._factory_function_kwargs\n\n        return type_info\n\n    @classmethod\n    def from_dict(cls, data: Dict[str, Any]) -> Self:\n        \"\"\"Loads a routing batch function from a dictionary. 
It must contain the information\n        of the factory function used to create the routing batch function.\n\n        Args:\n            data: A dictionary with the routing batch function information and the factory\n                function information.\n        \"\"\"\n        type_info = data.get(TYPE_INFO_KEY)\n        if not type_info:\n            step = data.get(\"step\")\n            raise ValueError(\n                f\"The routing batch function for step '{step}' was created without a factory\"\n                \" function, and it cannot be reconstructed.\"\n            )\n\n        module = type_info.get(\"module\")\n        name = type_info.get(\"name\")\n        kwargs = type_info.get(\"kwargs\")\n\n        if not module or not name or not kwargs:\n            raise ValueError(\n                \"The routing batch function was created with a factory function, but the\"\n                \" information is incomplete. Cannot reconstruct the routing batch function.\"\n            )\n\n        routing_batch_function = _get_module_attr(module=module, name=name)(**kwargs)\n        routing_batch_function.description = data.get(\"description\")\n        routing_batch_function.set_factory_function(\n            factory_function_module=module,\n            factory_function_name=name,\n            factory_function_kwargs=kwargs,\n        )\n\n        return routing_batch_function\n
"},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.route_batch","title":"route_batch(batch, steps)","text":"

Returns a list of selected downstream steps from steps to which the batch should be routed.

Parameters:

Name Type Description Default batch _Batch

The batch that should be routed.

required steps List[str]

A list of all the downstream steps that can receive the batch.

required

Returns:

Type Description List[str]

A list with the names of the steps that should receive the batch.

Source code in src/distilabel/pipeline/routing_batch_function.py
def route_batch(self, batch: \"_Batch\", steps: List[str]) -> List[str]:\n    \"\"\"Returns a list of selected downstream steps from `steps` to which the `batch`\n    should be routed.\n\n    Args:\n        batch: The batch that should be routed.\n        steps: A list of all the downstream steps that can receive the batch.\n\n    Returns:\n        A list with the names of the steps that should receive the batch.\n    \"\"\"\n    routed_steps = self.routing_function(steps)\n    self._register_routed_batch(batch, routed_steps)\n    return routed_steps\n
"},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.set_factory_function","title":"set_factory_function(factory_function_module, factory_function_name, factory_function_kwargs)","text":"

Sets the factory function that was used to create the routing_batch_function.

Parameters:

Name Type Description Default factory_function_module str

The module name where the factory function is defined.

required factory_function_name str

The name of the factory function that was used to create the routing_batch_function.

required factory_function_kwargs Dict[str, Any]

The keyword arguments that were used when calling the factory function.

required Source code in src/distilabel/pipeline/routing_batch_function.py
def set_factory_function(\n    self,\n    factory_function_module: str,\n    factory_function_name: str,\n    factory_function_kwargs: Dict[str, Any],\n) -> None:\n    \"\"\"Sets the factory function that was used to create the `routing_batch_function`.\n\n    Args:\n        factory_function_module: The module name where the factory function is defined.\n        factory_function_name: The name of the factory function that was used to create\n            the `routing_batch_function`.\n        factory_function_kwargs: The keyword arguments that were used when calling the\n            factory function.\n    \"\"\"\n    self._factory_function_module = factory_function_module\n    self._factory_function_name = factory_function_name\n    self._factory_function_kwargs = factory_function_kwargs\n
"},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.__call__","title":"__call__(batch, steps)","text":"

Returns a list of selected downstream steps from steps to which the batch should be routed.

Parameters:

Name Type Description Default batch _Batch

The batch that should be routed.

required steps List[str]

A list of all the downstream steps that can receive the batch.

required

Returns:

Type Description List[str]

A list with the names of the steps that should receive the batch.

Source code in src/distilabel/pipeline/routing_batch_function.py
def __call__(self, batch: \"_Batch\", steps: List[str]) -> List[str]:\n    \"\"\"Returns a list of selected downstream steps from `steps` to which the `batch`\n    should be routed.\n\n    Args:\n        batch: The batch that should be routed.\n        steps: A list of all the downstream steps that can receive the batch.\n\n    Returns:\n        A list with the names of the steps that should receive the batch.\n    \"\"\"\n    return self.route_batch(batch, steps)\n
"},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.__rshift__","title":"__rshift__(other)","text":"

Connects a list of downstream steps to the upstream step of the routing batch function.

Parameters:

Name Type Description Default other List[DownstreamConnectableSteps]

A list of downstream steps that should be connected to the upstream step of the routing batch function.

required

Returns:

Type Description List[DownstreamConnectableSteps]

The list of downstream steps that have been connected to the upstream step of the routing batch function.

Source code in src/distilabel/pipeline/routing_batch_function.py
def __rshift__(\n    self, other: List[\"DownstreamConnectableSteps\"]\n) -> List[\"DownstreamConnectableSteps\"]:\n    \"\"\"Connects a list of dowstream steps to the upstream step of the routing batch\n    function.\n\n    Args:\n        other: A list of downstream steps that should be connected to the upstream step\n            of the routing batch function.\n\n    Returns:\n        The list of downstream steps that have been connected to the upstream step of the\n        routing batch function.\n    \"\"\"\n    if not isinstance(other, list):\n        raise DistilabelUserError(\n            f\"Can only set a `routing_batch_function` for a list of steps. Got: {other}.\"\n            \" Please, review the right-hand side of the `routing_batch_function >> other`\"\n            \" expression. It should be\"\n            \" `upstream_step >> routing_batch_function >> [downstream_step_1, dowstream_step_2, ...]`.\",\n            page=\"sections/how_to_guides/basic/pipeline/?h=routing#routing-batches-to-specific-downstream-steps\",\n        )\n\n    if not self._step:\n        raise DistilabelUserError(\n            \"Routing batch function doesn't have an upstream step. Cannot connect downstream\"\n            \" steps before connecting the upstream step. Connect this routing batch\"\n            \" function to an upstream step using the `>>` operator. For example:\"\n            \" `upstream_step >> routing_batch_function >> [downstream_step_1, downstream_step_2, ...]`.\",\n            page=\"sections/how_to_guides/basic/pipeline/?h=routing#routing-batches-to-specific-downstream-steps\",\n        )\n\n    for step in other:\n        self._step.connect(step)\n    return other\n
"},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.dump","title":"dump(**kwargs)","text":"

Dumps the routing batch function to a dictionary, and the information of the factory function used to create this routing batch function.

Parameters:

Name Type Description Default **kwargs Any

Additional keyword arguments that should be included in the dump.

{}

Returns:

Type Description Dict[str, Any]

A dictionary with the routing batch function information and the factory function information.

Source code in src/distilabel/pipeline/routing_batch_function.py
def dump(self, **kwargs: Any) -> Dict[str, Any]:\n    \"\"\"Dumps the routing batch function to a dictionary, and the information of the\n    factory function used to create this routing batch function.\n\n    Args:\n        **kwargs: Additional keyword arguments that should be included in the dump.\n\n    Returns:\n        A dictionary with the routing batch function information and the factory function\n        information.\n    \"\"\"\n    dump_info: Dict[str, Any] = {\"step\": self._step.name}  # type: ignore\n\n    if self.description:\n        dump_info[\"description\"] = self.description\n\n    if type_info := self._get_type_info():\n        dump_info[TYPE_INFO_KEY] = type_info\n\n    return dump_info\n
"},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.RoutingBatchFunction.from_dict","title":"from_dict(data) classmethod","text":"

Loads a routing batch function from a dictionary. It must contain the information of the factory function used to create the routing batch function.

Parameters:

Name Type Description Default data Dict[str, Any]

A dictionary with the routing batch function information and the factory function information.

required Source code in src/distilabel/pipeline/routing_batch_function.py
@classmethod\ndef from_dict(cls, data: Dict[str, Any]) -> Self:\n    \"\"\"Loads a routing batch function from a dictionary. It must contain the information\n    of the factory function used to create the routing batch function.\n\n    Args:\n        data: A dictionary with the routing batch function information and the factory\n            function information.\n    \"\"\"\n    type_info = data.get(TYPE_INFO_KEY)\n    if not type_info:\n        step = data.get(\"step\")\n        raise ValueError(\n            f\"The routing batch function for step '{step}' was created without a factory\"\n            \" function, and it cannot be reconstructed.\"\n        )\n\n    module = type_info.get(\"module\")\n    name = type_info.get(\"name\")\n    kwargs = type_info.get(\"kwargs\")\n\n    if not module or not name or not kwargs:\n        raise ValueError(\n            \"The routing batch function was created with a factory function, but the\"\n            \" information is incomplete. Cannot reconstruct the routing batch function.\"\n        )\n\n    routing_batch_function = _get_module_attr(module=module, name=name)(**kwargs)\n    routing_batch_function.description = data.get(\"description\")\n    routing_batch_function.set_factory_function(\n        factory_function_module=module,\n        factory_function_name=name,\n        factory_function_kwargs=kwargs,\n    )\n\n    return routing_batch_function\n
"},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.routing_batch_function","title":"routing_batch_function(description=None)","text":"

Creates a routing batch function that can be used to route batches from one upstream step to specific downstream steps.

Parameters:

Name Type Description Default description Optional[str]

An optional description for the routing batch function.

None

Returns:

Type Description Callable[[RoutingBatchFunc], RoutingBatchFunction]

A decorator that wraps a RoutingBatchFunc into a RoutingBatchFunction instance that can be used with the >> operators and with the Pipeline.connect method when defining the pipeline.

Example:

from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline, routing_batch_function\nfrom distilabel.steps import LoadDataFromHub, GroupColumns\n\n\n@routing_batch_function()\ndef random_routing_batch(steps: List[str]) -> List[str]:\n    return random.sample(steps, 2)\n\n\nwith Pipeline(name=\"routing-batch-function\") as pipeline:\n    load_data = LoadDataFromHub()\n\n    generations = []\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.5-pro\"),\n    ):\n        task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n        generations.append(task)\n\n    combine_columns = GroupColumns(columns=[\"generation\", \"model_name\"])\n\n    load_data >> random_routing_batch >> generations >> combine_columns\n
Source code in src/distilabel/pipeline/routing_batch_function.py
def routing_batch_function(\n    description: Optional[str] = None,\n) -> Callable[[RoutingBatchFunc], RoutingBatchFunction]:\n    \"\"\"Creates a routing batch function that can be used to route batches from one upstream\n    step to specific downstream steps.\n\n    Args:\n        description: An optional description for the routing batch function.\n\n    Returns:\n        A `RoutingBatchFunction` instance that can be used with the `>>` operators and with\n        the `Pipeline.connect` method when defining the pipeline.\n\n    Example:\n\n    ```python\n    from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\n    from distilabel.pipeline import Pipeline, routing_batch_function\n    from distilabel.steps import LoadDataFromHub, GroupColumns\n\n\n    @routing_batch_function\n    def random_routing_batch(steps: List[str]) -> List[str]:\n        return random.sample(steps, 2)\n\n\n    with Pipeline(name=\"routing-batch-function\") as pipeline:\n        load_data = LoadDataFromHub()\n\n        generations = []\n        for llm in (\n            OpenAILLM(model=\"gpt-4-0125-preview\"),\n            MistralLLM(model=\"mistral-large-2402\"),\n            VertexAILLM(model=\"gemini-1.5-pro\"),\n        ):\n            task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n            generations.append(task)\n\n        combine_columns = GroupColumns(columns=[\"generation\", \"model_name\"])\n\n        load_data >> random_routing_batch >> generations >> combine_columns\n    ```\n    \"\"\"\n\n    def decorator(func: RoutingBatchFunc) -> RoutingBatchFunction:\n        factory_function_name, factory_function_module, factory_function_kwargs = (\n            None,\n            None,\n            None,\n        )\n\n        # Check if `routing_batch_function` was created using a factory function from an installed package\n        stack = inspect.stack()\n        if len(stack) > 2:\n            factory_function_frame_info = stack[1]\n\n    
        # Function factory path\n            if factory_function_frame_info.function != \"<module>\":\n                factory_function_name = factory_function_frame_info.function\n                factory_function_module = inspect.getmodule(\n                    factory_function_frame_info.frame\n                ).__name__  # type: ignore\n\n                # Function factory kwargs\n                factory_function_kwargs = factory_function_frame_info.frame.f_locals\n\n        routing_batch_function = RoutingBatchFunction(\n            routing_function=func,\n            description=description,\n        )\n\n        if (\n            factory_function_module\n            and factory_function_name\n            and factory_function_kwargs\n        ):\n            routing_batch_function.set_factory_function(\n                factory_function_module=factory_function_module,\n                factory_function_name=factory_function_name,\n                factory_function_kwargs=factory_function_kwargs,\n            )\n\n        return routing_batch_function\n\n    return decorator\n
"},{"location":"api/pipeline/routing_batch_function/#distilabel.pipeline.routing_batch_function.sample_n_steps","title":"sample_n_steps(n)","text":"

A simple function that creates a routing batch function that samples n steps from the list of all the downstream steps.

Parameters:

Name Type Description Default n int

The number of steps to sample from the list of all the downstream steps.

required

Returns:

Type Description RoutingBatchFunction

A RoutingBatchFunction instance that can be used with the >> operators and with the Pipeline.connect method when defining the pipeline.

Example:

from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline, sample_n_steps\nfrom distilabel.steps import LoadDataFromHub, GroupColumns\n\nrandom_routing_batch = sample_n_steps(2)\n\n\nwith Pipeline(name=\"routing-batch-function\") as pipeline:\n    load_data = LoadDataFromHub()\n\n    generations = []\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.5-pro\"),\n    ):\n        task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n        generations.append(task)\n\n    combine_columns = GroupColumns(columns=[\"generation\", \"model_name\"])\n\n    load_data >> random_routing_batch >> generations >> combine_columns\n
Source code in src/distilabel/pipeline/routing_batch_function.py
def sample_n_steps(n: int) -> RoutingBatchFunction:\n    \"\"\"A simple function that creates a routing batch function that samples `n` steps from\n    the list of all the downstream steps.\n\n    Args:\n        n: The number of steps to sample from the list of all the downstream steps.\n\n    Returns:\n        A `RoutingBatchFunction` instance that can be used with the `>>` operators and with\n        the `Pipeline.connect` method when defining the pipeline.\n\n    Example:\n\n    ```python\n    from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\n    from distilabel.pipeline import Pipeline, sample_n_steps\n    from distilabel.steps import LoadDataFromHub, GroupColumns\n\n    random_routing_batch = sample_n_steps(2)\n\n\n    with Pipeline(name=\"routing-batch-function\") as pipeline:\n        load_data = LoadDataFromHub()\n\n        generations = []\n        for llm in (\n            OpenAILLM(model=\"gpt-4-0125-preview\"),\n            MistralLLM(model=\"mistral-large-2402\"),\n            VertexAILLM(model=\"gemini-1.5-pro\"),\n        ):\n            task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n            generations.append(task)\n\n        combine_columns = GroupColumns(columns=[\"generation\", \"model_name\"])\n\n        load_data >> random_routing_batch >> generations >> combine_columns\n    ```\n    \"\"\"\n\n    @routing_batch_function(\n        description=f\"Sample {n} steps from the list of downstream steps.\"\n    )\n    def sample_n(steps: List[str]) -> List[str]:\n        return random.sample(steps, n)\n\n    return sample_n\n
"},{"location":"api/pipeline/step_wrapper/","title":"Step Wrapper","text":""},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapper","title":"_StepWrapper","text":"

Wrapper to run the Step.

Attributes:

Name Type Description step

The step to run.

replica

The replica ID assigned.

input_queue

The queue to receive the input data.

output_queue

The queue to send the output data.

load_queue

The queue used to notify the main process that the step has been loaded, has been unloaded or has failed to load.

Source code in src/distilabel/pipeline/step_wrapper.py
class _StepWrapper:\n    \"\"\"Wrapper to run the `Step`.\n\n    Attributes:\n        step: The step to run.\n        replica: The replica ID assigned.\n        input_queue: The queue to receive the input data.\n        output_queue: The queue to send the output data.\n        load_queue: The queue used to notify the main process that the step has been loaded,\n            has been unloaded or has failed to load.\n    \"\"\"\n\n    def __init__(\n        self,\n        step: Union[\"Step\", \"GeneratorStep\"],\n        replica: int,\n        input_queue: \"Queue[_Batch]\",\n        output_queue: \"Queue[_Batch]\",\n        load_queue: \"Queue[Union[StepLoadStatus, None]]\",\n        dry_run: bool = False,\n        ray_pipeline: bool = False,\n    ) -> None:\n        \"\"\"Initializes the `_ProcessWrapper`.\n\n        Args:\n            step: The step to run.\n            input_queue: The queue to receive the input data.\n            output_queue: The queue to send the output data.\n            load_queue: The queue used to notify the main process that the step has been\n                loaded, has been unloaded or has failed to load.\n            dry_run: Flag to ensure we are forcing to run the last batch.\n            ray_pipeline: Whether the step is running a `RayPipeline` or not.\n        \"\"\"\n        self.step = step\n        self.replica = replica\n        self.input_queue = input_queue\n        self.output_queue = output_queue\n        self.load_queue = load_queue\n        self.dry_run = dry_run\n        self.ray_pipeline = ray_pipeline\n\n        self._init_cuda_device_placement()\n\n    def _init_cuda_device_placement(self) -> None:\n        \"\"\"Sets the LLM identifier and the number of desired GPUs of the `CudaDevicePlacementMixin`\"\"\"\n\n        def _init_cuda_device_placement_mixin(attr: CudaDevicePlacementMixin) -> None:\n            if self.ray_pipeline:\n                attr.disable_cuda_device_placement = True\n            else:\n            
    desired_num_gpus = self.step.resources.gpus or 1\n                attr._llm_identifier = f\"{self.step.name}-replica-{self.replica}\"\n                attr._desired_num_gpus = desired_num_gpus\n\n        for field_name in self.step.model_fields_set:\n            attr = getattr(self.step, field_name)\n            if isinstance(attr, CudaDevicePlacementMixin):\n                _init_cuda_device_placement_mixin(attr)\n\n        if isinstance(self.step, CudaDevicePlacementMixin):\n            _init_cuda_device_placement_mixin(self.step)\n\n    def run(self) -> str:\n        \"\"\"The target function executed by the process. This function will also handle\n        the step lifecycle, executing first the `load` function of the `Step` and then\n        waiting to receive a batch from the `input_queue` that will be handled by the\n        `process` method of the `Step`.\n\n        Returns:\n            The name of the step that was executed.\n        \"\"\"\n\n        try:\n            self.step.load()\n            self.step._logger.debug(f\"Step '{self.step.name}' loaded!\")\n        except Exception as e:\n            self.step.unload()\n            self._notify_load_failed()\n            raise _StepWrapperException.create_load_error(\n                message=f\"Step load failed: {e}\",\n                step=self.step,\n                subprocess_exception=e,\n            ) from e\n\n        self._notify_load()\n\n        if self.step.is_generator:\n            self._generator_step_process_loop()\n        else:\n            self._non_generator_process_loop()\n\n        # Just in case `None` sentinel was sent\n        try:\n            self.input_queue.get(block=False)\n        except Exception:\n            pass\n\n        self.step.unload()\n\n        self._notify_unload()\n\n        self.step._logger.info(\n            f\"\ud83c\udfc1 Finished running step '{self.step.name}' (replica ID: {self.replica})\"\n        )\n\n        return self.step.name  # type: 
ignore\n\n    def _notify_load(self) -> None:\n        \"\"\"Notifies that the step has finished executing its `load` function successfully.\"\"\"\n        self.step._logger.debug(\n            f\"Notifying load of step '{self.step.name}' (replica ID {self.replica})...\"\n        )\n        self.load_queue.put({\"name\": self.step.name, \"status\": \"loaded\"})  # type: ignore\n\n    def _notify_unload(self) -> None:\n        \"\"\"Notifies that the step has been unloaded.\"\"\"\n        self.step._logger.debug(\n            f\"Notifying unload of step '{self.step.name}' (replica ID {self.replica})...\"\n        )\n        self.load_queue.put({\"name\": self.step.name, \"status\": \"unloaded\"})  # type: ignore\n\n    def _notify_load_failed(self) -> None:\n        \"\"\"Notifies that the step failed to load.\"\"\"\n        self.step._logger.debug(\n            f\"Notifying load failed of step '{self.step.name}' (replica ID {self.replica})...\"\n        )\n        self.load_queue.put({\"name\": self.step.name, \"status\": \"load_failed\"})  # type: ignore\n\n    def _generator_step_process_loop(self) -> None:\n        \"\"\"Runs the process loop for a generator step. It will call the `process` method\n        of the step and send the output data to the `output_queue` and block until the next\n        batch request is received (i.e. 
receiving an empty batch from the `input_queue`).\n\n        If the `last_batch` attribute of the batch is `True`, the loop will stop and the\n        process will finish.\n\n        Raises:\n            _StepWrapperException: If an error occurs during the execution of the\n                `process` method.\n        \"\"\"\n        step = cast(\"GeneratorStep\", self.step)\n\n        try:\n            if (batch := self.input_queue.get()) is None:\n                self.step._logger.info(\n                    f\"\ud83d\uded1 Stopping yielding batches from step '{self.step.name}'\"\n                )\n                return\n\n            offset = batch.seq_no * step.batch_size  # type: ignore\n\n            self.step._logger.info(\n                f\"\ud83e\uddec Starting yielding batches from generator step '{self.step.name}'.\"\n                f\" Offset: {offset}\"\n            )\n\n            for data, last_batch in step.process_applying_mappings(offset=offset):\n                batch.set_data([data])\n                batch.last_batch = self.dry_run or last_batch\n                self._send_batch(batch)\n\n                if batch.last_batch:\n                    return\n\n                self.step._logger.debug(\n                    f\"Step '{self.step.name}' waiting for next batch request...\"\n                )\n                if (batch := self.input_queue.get()) is None:\n                    self.step._logger.info(\n                        f\"\ud83d\uded1 Stopping yielding batches from step '{self.step.name}'\"\n                    )\n                    return\n        except Exception as e:\n            raise _StepWrapperException(str(e), self.step, 2, e) from e\n\n    def _non_generator_process_loop(self) -> None:\n        \"\"\"Runs the process loop for a non-generator step. It will call the `process`\n        method of the step and send the output data to the `output_queue` and block until\n        the next batch is received from the `input_queue`. 
If the `last_batch` attribute\n        of the batch is `True`, the loop will stop and the process will finish.\n\n        If an error occurs during the execution of the `process` method and the step is\n        global, the process will raise a `_StepWrapperException`. If the step is not\n        global, the process will log the error and send an empty batch to the `output_queue`.\n\n        Raises:\n            _StepWrapperException: If an error occurs during the execution of the\n                `process` method and the step is global.\n        \"\"\"\n        step = cast(\"Step\", self.step)\n        while True:\n            if (batch := self.input_queue.get()) is None:\n                self.step._logger.info(\n                    f\"\ud83d\uded1 Stopping processing batches from step '{self.step.name}'\"\n                )\n                break\n\n            if batch == LAST_BATCH_SENT_FLAG:\n                self.step._logger.debug(\"Received `LAST_BATCH_SENT_FLAG`. Stopping...\")\n                break\n\n            self.step._logger.info(\n                f\"\ud83d\udce6 Processing batch {batch.seq_no} in '{batch.step_name}' (replica ID: {self.replica})\"\n            )\n\n            if batch.data_path is not None:\n                self.step._logger.debug(f\"Reading batch data from '{batch.data_path}'\")\n                batch.read_batch_data_from_fs()\n\n            result = []\n            try:\n                if self.step.has_multiple_inputs:\n                    result = next(step.process_applying_mappings(*batch.data))\n                else:\n                    result = next(step.process_applying_mappings(batch.data[0]))\n            except Exception as e:\n                if self.step.is_global:\n                    self.step.unload()\n                    self._notify_unload()\n                    data = (\n                        batch.data\n                        if isinstance(\n                            e, 
DistilabelOfflineBatchGenerationNotFinishedException\n                        )\n                        else None\n                    )\n                    raise _StepWrapperException(str(e), self.step, 2, e, data) from e\n\n                # Impute step outputs columns with `None`\n                result = self._impute_step_outputs(batch)\n\n                # if the step is not global then we can skip the batch which means sending\n                # an empty batch to the output queue\n                self.step._logger.warning(\n                    f\"\u26a0\ufe0f Processing batch {batch.seq_no} with step '{self.step.name}' failed.\"\n                    \" Sending empty batch filled with `None`s...\"\n                )\n                self.step._logger.warning(\n                    f\"Subprocess traceback:\\n\\n{traceback.format_exc()}\"\n                )\n            finally:\n                batch.set_data([result])\n                self._send_batch(batch)\n\n            if batch.last_batch:\n                break\n\n    def _impute_step_outputs(self, batch: \"_Batch\") -> List[Dict[str, Any]]:\n        \"\"\"Imputes the step outputs columns with `None` in the batch data.\n\n        Args:\n            batch: The batch to impute.\n        \"\"\"\n        return self.step.impute_step_outputs(batch.data[0])\n\n    def _send_batch(self, batch: _Batch) -> None:\n        \"\"\"Sends a batch to the `output_queue`.\"\"\"\n        if batch.data_path is not None:\n            self.step._logger.debug(f\"Writing batch data to '{batch.data_path}'\")\n            batch.write_batch_data_to_fs()\n\n        self.step._logger.info(\n            f\"\ud83d\udce8 Step '{batch.step_name}' sending batch {batch.seq_no} to output queue\"\n        )\n        self.output_queue.put(batch)\n
"},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapper.__init__","title":"__init__(step, replica, input_queue, output_queue, load_queue, dry_run=False, ray_pipeline=False)","text":"

Initializes the _StepWrapper.

Parameters:

Name Type Description Default step Union[Step, GeneratorStep]

The step to run.

required input_queue Queue[_Batch]

The queue to receive the input data.

required output_queue Queue[_Batch]

The queue to send the output data.

required load_queue Queue[Union[StepLoadStatus, None]]

The queue used to notify the main process that the step has been loaded, has been unloaded or has failed to load.

required dry_run bool

Flag to ensure we are forcing to run the last batch.

False ray_pipeline bool

Whether the step is running a RayPipeline or not.

False Source code in src/distilabel/pipeline/step_wrapper.py
def __init__(\n    self,\n    step: Union[\"Step\", \"GeneratorStep\"],\n    replica: int,\n    input_queue: \"Queue[_Batch]\",\n    output_queue: \"Queue[_Batch]\",\n    load_queue: \"Queue[Union[StepLoadStatus, None]]\",\n    dry_run: bool = False,\n    ray_pipeline: bool = False,\n) -> None:\n    \"\"\"Initializes the `_ProcessWrapper`.\n\n    Args:\n        step: The step to run.\n        input_queue: The queue to receive the input data.\n        output_queue: The queue to send the output data.\n        load_queue: The queue used to notify the main process that the step has been\n            loaded, has been unloaded or has failed to load.\n        dry_run: Flag to ensure we are forcing to run the last batch.\n        ray_pipeline: Whether the step is running a `RayPipeline` or not.\n    \"\"\"\n    self.step = step\n    self.replica = replica\n    self.input_queue = input_queue\n    self.output_queue = output_queue\n    self.load_queue = load_queue\n    self.dry_run = dry_run\n    self.ray_pipeline = ray_pipeline\n\n    self._init_cuda_device_placement()\n
"},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapper.run","title":"run()","text":"

The target function executed by the process. This function will also handle the step lifecycle, executing first the load function of the Step and then waiting to receive a batch from the input_queue that will be handled by the process method of the Step.

Returns:

Type Description str

The name of the step that was executed.

Source code in src/distilabel/pipeline/step_wrapper.py
def run(self) -> str:\n    \"\"\"The target function executed by the process. This function will also handle\n    the step lifecycle, executing first the `load` function of the `Step` and then\n    waiting to receive a batch from the `input_queue` that will be handled by the\n    `process` method of the `Step`.\n\n    Returns:\n        The name of the step that was executed.\n    \"\"\"\n\n    try:\n        self.step.load()\n        self.step._logger.debug(f\"Step '{self.step.name}' loaded!\")\n    except Exception as e:\n        self.step.unload()\n        self._notify_load_failed()\n        raise _StepWrapperException.create_load_error(\n            message=f\"Step load failed: {e}\",\n            step=self.step,\n            subprocess_exception=e,\n        ) from e\n\n    self._notify_load()\n\n    if self.step.is_generator:\n        self._generator_step_process_loop()\n    else:\n        self._non_generator_process_loop()\n\n    # Just in case `None` sentinel was sent\n    try:\n        self.input_queue.get(block=False)\n    except Exception:\n        pass\n\n    self.step.unload()\n\n    self._notify_unload()\n\n    self.step._logger.info(\n        f\"\ud83c\udfc1 Finished running step '{self.step.name}' (replica ID: {self.replica})\"\n    )\n\n    return self.step.name  # type: ignore\n
"},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapperException","title":"_StepWrapperException","text":"

Bases: Exception

Exception to be raised when an error occurs in the _StepWrapper class.

Attributes:

Name Type Description message

The error message.

step

The Step that raised the error.

code

The error code.

subprocess_exception

The exception raised by the subprocess.

data

The data that caused the error. Defaults to None.

Source code in src/distilabel/pipeline/step_wrapper.py
class _StepWrapperException(Exception):\n    \"\"\"Exception to be raised when an error occurs in the `_StepWrapper` class.\n\n    Attributes:\n        message: The error message.\n        step: The `Step` that raised the error.\n        code: The error code.\n        subprocess_exception: The exception raised by the subprocess.\n        data: The data that caused the error. Defaults to `None`.\n    \"\"\"\n\n    def __init__(\n        self,\n        message: str,\n        step: \"_Step\",\n        code: int,\n        subprocess_exception: Exception,\n        data: Optional[List[List[Dict[str, Any]]]] = None,\n    ) -> None:\n        self.message = f\"{message}\\n\\nFor further information visit '{DISTILABEL_DOCS_URL}api/pipeline/step_wrapper'\"\n        self.step = step\n        self.code = code\n        self.subprocess_exception = subprocess_exception\n        self.formatted_traceback = \"\".join(\n            traceback.format_exception(\n                type(subprocess_exception),\n                subprocess_exception,\n                subprocess_exception.__traceback__,\n            )\n        )\n        self.data = data\n\n    @classmethod\n    def create_load_error(\n        cls,\n        message: str,\n        step: \"_Step\",\n        subprocess_exception: Optional[Exception] = None,\n    ) -> \"_StepWrapperException\":\n        \"\"\"Creates a `_StepWrapperException` for a load error.\n\n        Args:\n            message: The error message.\n            step: The `Step` that raised the error.\n            subprocess_exception: The exception raised by the subprocess. 
Defaults to `None`.\n\n        Returns:\n            The `_StepWrapperException` instance.\n        \"\"\"\n        return cls(message, step, 1, subprocess_exception, None)\n\n    @property\n    def is_load_error(self) -> bool:\n        \"\"\"Whether the error is a load error.\n\n        Returns:\n            `True` if the error is a load error, `False` otherwise.\n        \"\"\"\n        return self.code == 1\n
"},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapperException.is_load_error","title":"is_load_error: bool property","text":"

Whether the error is a load error.

Returns:

Type Description bool

True if the error is a load error, False otherwise.

"},{"location":"api/pipeline/step_wrapper/#distilabel.pipeline.step_wrapper._StepWrapperException.create_load_error","title":"create_load_error(message, step, subprocess_exception=None) classmethod","text":"

Creates a _StepWrapperException for a load error.

Parameters:

Name Type Description Default message str

The error message.

required step _Step

The Step that raised the error.

required subprocess_exception Optional[Exception]

The exception raised by the subprocess. Defaults to None.

None

Returns:

Type Description _StepWrapperException

The _StepWrapperException instance.

Source code in src/distilabel/pipeline/step_wrapper.py
@classmethod\ndef create_load_error(\n    cls,\n    message: str,\n    step: \"_Step\",\n    subprocess_exception: Optional[Exception] = None,\n) -> \"_StepWrapperException\":\n    \"\"\"Creates a `_StepWrapperException` for a load error.\n\n    Args:\n        message: The error message.\n        step: The `Step` that raised the error.\n        subprocess_exception: The exception raised by the subprocess. Defaults to `None`.\n\n    Returns:\n        The `_StepWrapperException` instance.\n    \"\"\"\n    return cls(message, step, 1, subprocess_exception, None)\n
"},{"location":"api/pipeline/typing/","title":"Pipeline Typing","text":""},{"location":"api/pipeline/typing/#distilabel.pipeline.typing","title":"typing","text":""},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.DownstreamConnectable","title":"DownstreamConnectable = Union['Step', 'GlobalStep'] module-attribute","text":"

Alias for the Step types that can be connected as downstream steps.

"},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.UpstreamConnectableSteps","title":"UpstreamConnectableSteps = TypeVar('UpstreamConnectableSteps', bound=Union['Step', 'GlobalStep', 'GeneratorStep']) module-attribute","text":"

Type for the Step types that can be connected as upstream steps.

"},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.DownstreamConnectableSteps","title":"DownstreamConnectableSteps = TypeVar('DownstreamConnectableSteps', bound=DownstreamConnectable, covariant=True) module-attribute","text":"

Type for the Step types that can be connected as downstream steps.

"},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.PipelineRuntimeParametersInfo","title":"PipelineRuntimeParametersInfo = Dict[str, Union[List['RuntimeParameterInfo'], Dict[str, 'RuntimeParameterInfo']]] module-attribute","text":"

Alias for the information of the runtime parameters of a Pipeline.

"},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.InputDataset","title":"InputDataset = Union['Dataset', 'pd.DataFrame', List[Dict[str, str]]] module-attribute","text":"

Alias for the types we can process as input dataset.

"},{"location":"api/pipeline/typing/#distilabel.pipeline.typing.StepLoadStatus","title":"StepLoadStatus","text":"

Bases: TypedDict

Dict containing information about whether a step was loaded or unloaded, or whether its load failed.

Source code in src/distilabel/pipeline/typing.py
class StepLoadStatus(TypedDict):\n    \"\"\"Dict containing information about if one step was loaded/unloaded or if it's load\n    failed\"\"\"\n\n    name: str\n    status: Literal[\"loaded\", \"unloaded\", \"load_failed\"]\n
"},{"location":"api/step/","title":"Step","text":"

This section contains the API reference for the distilabel step, both for the _Step base class and the Step class.

For more information and examples on how to use existing steps or create custom ones, please refer to Tutorial - Step.

"},{"location":"api/step/#distilabel.steps.base","title":"base","text":""},{"location":"api/step/#distilabel.steps.base.StepInput","title":"StepInput = Annotated[List[Dict[str, Any]], _STEP_INPUT_ANNOTATION] module-attribute","text":"

StepInput is just an Annotated alias of the type List[Dict[str, Any]] with extra metadata that allows distilabel to perform validations over the process step method defined in each Step.

"},{"location":"api/step/#distilabel.steps.base._Step","title":"_Step","text":"

Bases: RuntimeParametersMixin, RequirementsMixin, SignatureMixin, BaseModel, _Serializable, ABC

Base class for the steps that can be included in a Pipeline.

A Step is a class defining some processing logic. The input and outputs for this processing logic are lists of dictionaries with the same keys:

```python\n[\n    {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n    {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n    {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n]\n```\n

The processing logic is defined in the process method, which, depending on the number of previous steps, can receive more than one list of dictionaries, each with the output of the previous steps. In order to let distilabel know where the outputs from the previous steps are, the process method of each Step must have an argument or a variadic positional argument (e.g. *inputs) annotated with StepInput.

```python\nclass StepWithOnePreviousStep(Step):\n    def process(self, inputs: StepInput) -> StepOutput:\n        yield [...]\n\nclass StepWithSeveralPreviousStep(Step):\n    # mind the * to indicate that the argument is a list of StepInput\n    def process(self, *inputs: StepInput) -> StepOutput:\n        yield [...]\n```\n

In order to perform static validations and to check that the chaining of the steps in the pipeline is valid, a Step must also define the inputs and outputs properties:

  • inputs: a list of strings with the names of the columns that the step needs as input. It can be an empty list if the step is a generator step.
  • outputs: a list of strings with the names of the columns that the step will produce as output.

Optionally, a Step can override the load method to perform any initialization logic before the process method is called. For example, to load an LLM, establish a connection to a database, etc.

Finally, the Step class inherits from pydantic.BaseModel, so attributes can be easily defined, validated, serialized and included in the __init__ method of the step.

Source code in src/distilabel/steps/base.py
class _Step(\n    RuntimeParametersMixin,\n    RequirementsMixin,\n    SignatureMixin,\n    BaseModel,\n    _Serializable,\n    ABC,\n):\n    \"\"\"Base class for the steps that can be included in a `Pipeline`.\n\n    A `Step` is a class defining some processing logic. The input and outputs for this\n    processing logic are lists of dictionaries with the same keys:\n\n        ```python\n        [\n            {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n            {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n            {\"column1\": \"value1\", \"column2\": \"value2\", ...},\n        ]\n        ```\n\n    The processing logic is defined in the `process` method, which depending on the\n    number of previous steps, can receive more than one list of dictionaries, each with\n    the output of the previous steps. In order to make `distilabel` know where the outputs\n    from the previous steps are, the `process` function from each `Step` must have an argument\n    or positional argument annotated with `StepInput`.\n\n        ```python\n        class StepWithOnePreviousStep(Step):\n            def process(self, inputs: StepInput) -> StepOutput:\n                yield [...]\n\n        class StepWithSeveralPreviousStep(Step):\n            # mind the * to indicate that the argument is a list of StepInput\n            def process(self, *inputs: StepInput) -> StepOutput:\n                yield [...]\n        ```\n\n    In order to perform static validations and to check that the chaining of the steps\n    in the pipeline is valid, a `Step` must also define the `inputs` and `outputs`\n    properties:\n\n    - `inputs`: a list of strings with the names of the columns that the step needs as\n        input. 
It can be an empty list if the step is a generator step.\n    - `outputs`: a list of strings with the names of the columns that the step will\n        produce as output.\n\n    Optionally, a `Step` can override the `load` method to perform any initialization\n    logic before the `process` method is called. For example, to load an LLM, stablish a\n    connection to a database, etc.\n\n    Finally, the `Step` class inherits from `pydantic.BaseModel`, so attributes can be easily\n    defined, validated, serialized and included in the `__init__` method of the step.\n    \"\"\"\n\n    model_config = ConfigDict(\n        arbitrary_types_allowed=True,\n        validate_default=True,\n        validate_assignment=True,\n        extra=\"forbid\",\n    )\n\n    name: Optional[str] = Field(default=None, pattern=r\"^[a-zA-Z0-9_-]+$\")\n    resources: StepResources = StepResources()\n    pipeline: Any = Field(default=None, exclude=True, repr=False)\n    input_mappings: Dict[str, str] = {}\n    output_mappings: Dict[str, str] = {}\n    use_cache: bool = True\n\n    _pipeline_artifacts_path: Path = PrivateAttr(None)\n    _built_from_decorator: bool = PrivateAttr(default=False)\n    _logger: \"Logger\" = PrivateAttr(None)\n\n    def model_post_init(self, __context: Any) -> None:\n        from distilabel.pipeline.base import _GlobalPipelineManager\n\n        super().model_post_init(__context)\n\n        if self.pipeline is None:\n            self.pipeline = _GlobalPipelineManager.get_pipeline()\n\n        if self.pipeline is None:\n            _logger = logging.getLogger(f\"distilabel.step.{self.name}\")\n            _logger.warning(\n                f\"Step '{self.name}' hasn't received a pipeline, and it hasn't been\"\n                \" created within a `Pipeline` context. 
Please, use\"\n                \" `with Pipeline() as pipeline:` and create the step within the context.\"\n            )\n\n        if not self.name:\n            # This must be done before the check for repeated names, but assuming\n            # we are passing the pipeline from the _GlobalPipelineManager, should\n            # be done after that.\n            self.name = _infer_step_name(type(self).__name__, self.pipeline)\n\n        if self.pipeline is not None:\n            # If not set an error will be raised in `Pipeline.run` parent\n            self.pipeline._add_step(self)\n\n    def connect(\n        self,\n        *steps: \"_Step\",\n        routing_batch_function: Optional[\"RoutingBatchFunction\"] = None,\n    ) -> None:\n        \"\"\"Connects the current step to another step in the pipeline, which means that\n        the output of this step will be the input of the other step.\n\n        Args:\n            steps: The steps to connect to the current step.\n            routing_batch_function: A function that receives a list of steps and returns\n                a list of steps to which the output batch generated by this step should be\n                routed. It should be used to define the routing logic of the pipeline. 
If\n                not provided, the output batch will be routed to all the connected steps.\n                Defaults to `None`.\n        \"\"\"\n        assert self.pipeline is not None\n\n        if routing_batch_function:\n            self._set_routing_batch_function(routing_batch_function)\n\n        for step in steps:\n            self.pipeline._add_edge(from_step=self.name, to_step=step.name)  # type: ignore\n\n    def _set_routing_batch_function(\n        self, routing_batch_function: \"RoutingBatchFunction\"\n    ) -> None:\n        \"\"\"Sets a routing batch function for the batches generated by this step, so they\n        get routed to specific downstream steps.\n\n        Args:\n            routing_batch_function: The routing batch function that will be used to route\n                the batches generated by this step.\n        \"\"\"\n        self.pipeline._add_routing_batch_function(\n            step_name=self.name,  # type: ignore\n            routing_batch_function=routing_batch_function,\n        )\n        routing_batch_function._step = self\n\n    @overload\n    def __rshift__(self, other: \"RoutingBatchFunction\") -> \"RoutingBatchFunction\": ...\n\n    @overload\n    def __rshift__(\n        self, other: List[\"DownstreamConnectableSteps\"]\n    ) -> List[\"DownstreamConnectableSteps\"]: ...\n\n    @overload\n    def __rshift__(self, other: \"DownstreamConnectable\") -> \"DownstreamConnectable\": ...\n\n    def __rshift__(\n        self,\n        other: Union[\n            \"DownstreamConnectable\",\n            \"RoutingBatchFunction\",\n            List[\"DownstreamConnectableSteps\"],\n        ],\n    ) -> Union[\n        \"DownstreamConnectable\",\n        \"RoutingBatchFunction\",\n        List[\"DownstreamConnectableSteps\"],\n    ]:\n        \"\"\"Allows using the `>>` operator to connect steps in the pipeline.\n\n        Args:\n            other: The step to connect, a list of steps to connect to or a routing batch\n                
function to be set for the step.\n\n        Returns:\n            The connected step, the list of connected steps or the routing batch function.\n\n        Example:\n            ```python\n            step1 >> step2\n            # Would be equivalent to:\n            step1.connect(step2)\n\n            # It also allows to connect a list of steps\n            step1 >> [step2, step3]\n            ```\n        \"\"\"\n        # Here to avoid circular imports\n        from distilabel.pipeline.routing_batch_function import RoutingBatchFunction\n\n        if isinstance(other, list):\n            self.connect(*other)\n            return other\n\n        if isinstance(other, RoutingBatchFunction):\n            self._set_routing_batch_function(other)\n            return other\n\n        self.connect(other)\n        return other\n\n    def __rrshift__(self, other: List[\"UpstreamConnectableSteps\"]) -> Self:\n        \"\"\"Allows using the [step1, step2] >> step3 operator to connect a list of steps in the pipeline\n        to a single step, as the list doesn't have the __rshift__ operator.\n\n        Args:\n            other: The step to connect to.\n\n        Returns:\n            The connected step\n\n        Example:\n            ```python\n            [step2, step3] >> step1\n            # Would be equivalent to:\n            step2.connect(step1)\n            step3.connect(step1)\n            ```\n        \"\"\"\n        for o in other:\n            o.connect(self)\n        return self\n\n    def load(self) -> None:\n        \"\"\"Method to perform any initialization logic before the `process` method is\n        called. For example, to load an LLM, stablish a connection to a database, etc.\n        \"\"\"\n        self._logger = logging.getLogger(f\"distilabel.step.{self.name}\")\n\n    def unload(self) -> None:\n        \"\"\"Method to perform any cleanup logic after the `process` method is called. 
For\n        example, to close a connection to a database, etc.\n        \"\"\"\n        self._logger.debug(\"Executing step unload logic.\")\n\n    @property\n    def is_generator(self) -> bool:\n        \"\"\"Whether the step is a generator step or not.\n\n        Returns:\n            `True` if the step is a generator step, `False` otherwise.\n        \"\"\"\n        return isinstance(self, GeneratorStep)\n\n    @property\n    def is_global(self) -> bool:\n        \"\"\"Whether the step is a global step or not.\n\n        Returns:\n            `True` if the step is a global step, `False` otherwise.\n        \"\"\"\n        return isinstance(self, GlobalStep)\n\n    @property\n    def is_normal(self) -> bool:\n        \"\"\"Whether the step is a normal step or not.\n\n        Returns:\n            `True` if the step is a normal step, `False` otherwise.\n        \"\"\"\n        return not self.is_generator and not self.is_global\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"List of strings with the names of the mandatory columns that the step needs as\n        input or dictionary in which the keys are the input columns of the step and the\n        values are booleans indicating whether the column is optional or not.\n\n        Returns:\n            List of strings with the names of the columns that the step needs as input.\n        \"\"\"\n        return []\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"List of strings with the names of the columns that the step will produce as\n        output or dictionary in which the keys are the output columns of the step and the\n        values are booleans indicating whether the column is optional or not.\n\n        Returns:\n            List of strings with the names of the columns that the step will produce as\n            output.\n        \"\"\"\n        return []\n\n    @cached_property\n    def process_parameters(self) -> List[inspect.Parameter]:\n        
\"\"\"Returns the parameters of the `process` method of the step.\n\n        Returns:\n            The parameters of the `process` method of the step.\n        \"\"\"\n        return list(inspect.signature(self.process).parameters.values())  # type: ignore\n\n    def has_multiple_inputs(self) -> bool:\n        \"\"\"Whether the `process` method of the step receives more than one input or not\n        i.e. has a `*` argument annotated with `StepInput`.\n\n        Returns:\n            `True` if the `process` method of the step receives more than one input,\n            `False` otherwise.\n        \"\"\"\n        return any(\n            param.kind == param.VAR_POSITIONAL for param in self.process_parameters\n        )\n\n    def get_process_step_input(self) -> Union[inspect.Parameter, None]:\n        \"\"\"Returns the parameter of the `process` method of the step annotated with\n        `StepInput`.\n\n        Returns:\n            The parameter of the `process` method of the step annotated with `StepInput`,\n            or `None` if there is no parameter annotated with `StepInput`.\n\n        Raises:\n            TypeError: If the step has more than one parameter annotated with `StepInput`.\n        \"\"\"\n        step_input_parameter = None\n        for parameter in self.process_parameters:\n            if is_parameter_annotated_with(parameter, _STEP_INPUT_ANNOTATION):\n                if step_input_parameter is not None:\n                    raise DistilabelTypeError(\n                        f\"Step '{self.name}' should have only one parameter with type\"\n                        \" hint `StepInput`.\",\n                        page=\"sections/how_to_guides/basic/step/#defining-custom-steps\",\n                    )\n                step_input_parameter = parameter\n        return step_input_parameter\n\n    def verify_inputs_mappings(self) -> None:\n        \"\"\"Verifies that the `inputs_mappings` of the step are valid i.e. 
the input\n        columns exist in the inputs of the step.\n\n        Raises:\n            ValueError: If the `inputs_mappings` of the step are not valid.\n        \"\"\"\n        if not self.input_mappings:\n            return\n\n        for input in self.input_mappings:\n            if input not in self.inputs:\n                raise DistilabelUserError(\n                    f\"The input column '{input}' doesn't exist in the inputs of the\"\n                    f\" step '{self.name}'. Inputs of the step are: {self.inputs}.\"\n                    \" Please, review the `inputs_mappings` argument of the step.\",\n                    page=\"sections/how_to_guides/basic/step/#arguments\",\n                )\n\n    def verify_outputs_mappings(self) -> None:\n        \"\"\"Verifies that the `outputs_mappings` of the step are valid i.e. the output\n        columns exist in the outputs of the step.\n\n        Raises:\n            ValueError: If the `outputs_mappings` of the step are not valid.\n        \"\"\"\n        if not self.output_mappings:\n            return\n\n        for output in self.output_mappings:\n            if output not in self.outputs:\n                raise DistilabelUserError(\n                    f\"The output column '{output}' doesn't exist in the outputs of the\"\n                    f\" step '{self.name}'. Outputs of the step are: {self.outputs}.\"\n                    \" Please, review the `outputs_mappings` argument of the step.\",\n                    page=\"sections/how_to_guides/basic/step/#arguments\",\n                )\n\n    def get_inputs(self) -> Dict[str, bool]:\n        \"\"\"Gets the inputs of the step after the `input_mappings`. 
This method is meant\n        to be used to run validations on the inputs of the step.\n\n        Returns:\n            The inputs of the step after the `input_mappings` and if they are required or\n            not.\n        \"\"\"\n        if isinstance(self.inputs, list):\n            return {\n                self.input_mappings.get(input, input): True for input in self.inputs\n            }\n\n        return {\n            self.input_mappings.get(input, input): required\n            for input, required in self.inputs.items()\n        }\n\n    def get_outputs(self) -> Dict[str, bool]:\n        \"\"\"Gets the outputs of the step after the `outputs_mappings`. This method is\n        meant to be used to run validations on the outputs of the step.\n\n        Returns:\n            The outputs of the step after the `outputs_mappings` and if they are required\n            or not.\n        \"\"\"\n        if isinstance(self.outputs, list):\n            return {\n                self.output_mappings.get(output, output): True\n                for output in self.outputs\n            }\n\n        return {\n            self.output_mappings.get(output, output): required\n            for output, required in self.outputs.items()\n        }\n\n    def set_pipeline_artifacts_path(self, path: Path) -> None:\n        \"\"\"Sets the `_pipeline_artifacts_path` attribute. 
This method is meant to be used\n        by the `Pipeline` once the cache location is known.\n\n        Args:\n            path: the path where the artifacts generated by the pipeline steps should be\n                saved.\n        \"\"\"\n        self._pipeline_artifacts_path = path\n\n    @property\n    def artifacts_directory(self) -> Union[Path, None]:\n        \"\"\"Gets the path of the directory where the step should save its generated artifacts.\n\n        Returns:\n            The path of the directory where the step should save the generated artifacts,\n                or `None` if `_pipeline_artifacts_path` is not set.\n        \"\"\"\n        if self._pipeline_artifacts_path is None:\n            return None\n        return self._pipeline_artifacts_path / self.name  # type: ignore\n\n    def save_artifact(\n        self,\n        name: str,\n        write_function: Callable[[Path], None],\n        metadata: Optional[Dict[str, Any]] = None,\n    ) -> None:\n        \"\"\"Saves an artifact generated by the `Step`.\n\n        Args:\n            name: the name of the artifact.\n            write_function: a function that will receive the path where the artifact should\n                be saved.\n            metadata: the artifact metadata. Defaults to `None`.\n        \"\"\"\n        if self.artifacts_directory is None:\n            self._logger.warning(\n                f\"Cannot save artifact with '{name}' as `_pipeline_artifacts_path` is not\"\n                \" set. 
This is normal if the `Step` is being executed as a standalone component.\"\n            )\n            return\n\n        artifact_directory_path = self.artifacts_directory / name\n        artifact_directory_path.mkdir(parents=True, exist_ok=True)\n\n        self._logger.info(f\"\ud83c\udffa Storing '{name}' generated artifact...\")\n\n        self._logger.debug(\n            f\"Calling `write_function` to write artifact in '{artifact_directory_path}'...\"\n        )\n        write_function(artifact_directory_path)\n\n        metadata_path = artifact_directory_path / \"metadata.json\"\n        self._logger.debug(\n            f\"Calling `write_json` to write artifact metadata in '{metadata_path}'...\"\n        )\n        write_json(filename=metadata_path, data=metadata or {})\n\n    def impute_step_outputs(\n        self, step_output: List[Dict[str, Any]]\n    ) -> List[Dict[str, Any]]:\n        \"\"\"\n        Imputes the output columns of the step that are not present in the step output.\n        \"\"\"\n        result = []\n        for row in step_output:\n            data = row.copy()\n            for output in self.get_outputs().keys():\n                data[output] = None\n            result.append(data)\n        return result\n\n    def _model_dump(self, obj: Any, **kwargs: Any) -> Dict[str, Any]:\n        dump = super()._model_dump(obj, **kwargs)\n        dump[\"runtime_parameters_info\"] = self.get_runtime_parameters_info()\n        return dump\n
"},{"location":"api/step/#distilabel.steps.base._Step.is_generator","title":"is_generator: bool property","text":"

Whether the step is a generator step or not.

Returns:

Type Description bool

True if the step is a generator step, False otherwise.

"},{"location":"api/step/#distilabel.steps.base._Step.is_global","title":"is_global: bool property","text":"

Whether the step is a global step or not.

Returns:

Type Description bool

True if the step is a global step, False otherwise.

"},{"location":"api/step/#distilabel.steps.base._Step.is_normal","title":"is_normal: bool property","text":"

Whether the step is a normal step or not.

Returns:

Type Description bool

True if the step is a normal step, False otherwise.

"},{"location":"api/step/#distilabel.steps.base._Step.inputs","title":"inputs: StepColumns property","text":"

List of strings with the names of the mandatory columns that the step needs as input, or a dictionary in which the keys are the input columns of the step and the values are booleans indicating whether the column is required or not.

Returns:

Type Description StepColumns

List of strings with the names of the columns that the step needs as input.

"},{"location":"api/step/#distilabel.steps.base._Step.outputs","title":"outputs: StepColumns property","text":"

List of strings with the names of the columns that the step will produce as output, or a dictionary in which the keys are the output columns of the step and the values are booleans indicating whether the column is required or not.

Returns:

Type Description StepColumns

List of strings with the names of the columns that the step will produce as output.

"},{"location":"api/step/#distilabel.steps.base._Step.process_parameters","title":"process_parameters: List[inspect.Parameter] cached property","text":"

Returns the parameters of the process method of the step.

Returns:

Type Description List[Parameter]

The parameters of the process method of the step.

"},{"location":"api/step/#distilabel.steps.base._Step.artifacts_directory","title":"artifacts_directory: Union[Path, None] property","text":"

Gets the path of the directory where the step should save its generated artifacts.

Returns:

Type Description Union[Path, None]

The path of the directory where the step should save the generated artifacts, or None if _pipeline_artifacts_path is not set.

"},{"location":"api/step/#distilabel.steps.base._Step.connect","title":"connect(*steps, routing_batch_function=None)","text":"

Connects the current step to another step in the pipeline, which means that the output of this step will be the input of the other step.

Parameters:

Name Type Description Default steps _Step

The steps to connect to the current step.

() routing_batch_function Optional[RoutingBatchFunction]

A function that receives a list of steps and returns a list of steps to which the output batch generated by this step should be routed. It should be used to define the routing logic of the pipeline. If not provided, the output batch will be routed to all the connected steps. Defaults to None.

None Source code in src/distilabel/steps/base.py
def connect(\n    self,\n    *steps: \"_Step\",\n    routing_batch_function: Optional[\"RoutingBatchFunction\"] = None,\n) -> None:\n    \"\"\"Connects the current step to another step in the pipeline, which means that\n    the output of this step will be the input of the other step.\n\n    Args:\n        steps: The steps to connect to the current step.\n        routing_batch_function: A function that receives a list of steps and returns\n            a list of steps to which the output batch generated by this step should be\n            routed. It should be used to define the routing logic of the pipeline. If\n            not provided, the output batch will be routed to all the connected steps.\n            Defaults to `None`.\n    \"\"\"\n    assert self.pipeline is not None\n\n    if routing_batch_function:\n        self._set_routing_batch_function(routing_batch_function)\n\n    for step in steps:\n        self.pipeline._add_edge(from_step=self.name, to_step=step.name)  # type: ignore\n
"},{"location":"api/step/#distilabel.steps.base._Step.__rshift__","title":"__rshift__(other)","text":"
__rshift__(other: RoutingBatchFunction) -> RoutingBatchFunction\n
__rshift__(other: List[DownstreamConnectableSteps]) -> List[DownstreamConnectableSteps]\n
__rshift__(other: DownstreamConnectable) -> DownstreamConnectable\n

Allows using the >> operator to connect steps in the pipeline.

Parameters:

Name Type Description Default other Union[DownstreamConnectable, RoutingBatchFunction, List[DownstreamConnectableSteps]]

The step to connect, a list of steps to connect to or a routing batch function to be set for the step.

required

Returns:

Type Description Union[DownstreamConnectable, RoutingBatchFunction, List[DownstreamConnectableSteps]]

The connected step, the list of connected steps or the routing batch function.

Example
step1 >> step2\n# Would be equivalent to:\nstep1.connect(step2)\n\n# It also allows to connect a list of steps\nstep1 >> [step2, step3]\n
Source code in src/distilabel/steps/base.py
def __rshift__(\n    self,\n    other: Union[\n        \"DownstreamConnectable\",\n        \"RoutingBatchFunction\",\n        List[\"DownstreamConnectableSteps\"],\n    ],\n) -> Union[\n    \"DownstreamConnectable\",\n    \"RoutingBatchFunction\",\n    List[\"DownstreamConnectableSteps\"],\n]:\n    \"\"\"Allows using the `>>` operator to connect steps in the pipeline.\n\n    Args:\n        other: The step to connect, a list of steps to connect to or a routing batch\n            function to be set for the step.\n\n    Returns:\n        The connected step, the list of connected steps or the routing batch function.\n\n    Example:\n        ```python\n        step1 >> step2\n        # Would be equivalent to:\n        step1.connect(step2)\n\n        # It also allows to connect a list of steps\n        step1 >> [step2, step3]\n        ```\n    \"\"\"\n    # Here to avoid circular imports\n    from distilabel.pipeline.routing_batch_function import RoutingBatchFunction\n\n    if isinstance(other, list):\n        self.connect(*other)\n        return other\n\n    if isinstance(other, RoutingBatchFunction):\n        self._set_routing_batch_function(other)\n        return other\n\n    self.connect(other)\n    return other\n
"},{"location":"api/step/#distilabel.steps.base._Step.__rrshift__","title":"__rrshift__(other)","text":"

Allows using the [step1, step2] >> step3 operator to connect a list of steps in the pipeline to a single step, as the list doesn't have the __rshift__ operator.

Parameters:

Name Type Description Default other List[UpstreamConnectableSteps]

The list of upstream steps to connect to this step.

required

Returns:

Type Description Self

The connected step

Example
[step2, step3] >> step1\n# Would be equivalent to:\nstep2.connect(step1)\nstep3.connect(step1)\n
Source code in src/distilabel/steps/base.py
def __rrshift__(self, other: List[\"UpstreamConnectableSteps\"]) -> Self:\n    \"\"\"Allows using the [step1, step2] >> step3 operator to connect a list of steps in the pipeline\n    to a single step, as the list doesn't have the __rshift__ operator.\n\n    Args:\n        other: The step to connect to.\n\n    Returns:\n        The connected step\n\n    Example:\n        ```python\n        [step2, step3] >> step1\n        # Would be equivalent to:\n        step2.connect(step1)\n        step3.connect(step1)\n        ```\n    \"\"\"\n    for o in other:\n        o.connect(self)\n    return self\n
"},{"location":"api/step/#distilabel.steps.base._Step.load","title":"load()","text":"

Method to perform any initialization logic before the process method is called. For example, to load an LLM, establish a connection to a database, etc.

Source code in src/distilabel/steps/base.py
def load(self) -> None:\n    \"\"\"Method to perform any initialization logic before the `process` method is\n    called. For example, to load an LLM, stablish a connection to a database, etc.\n    \"\"\"\n    self._logger = logging.getLogger(f\"distilabel.step.{self.name}\")\n
"},{"location":"api/step/#distilabel.steps.base._Step.unload","title":"unload()","text":"

Method to perform any cleanup logic after the process method is called. For example, to close a connection to a database, etc.

Source code in src/distilabel/steps/base.py
def unload(self) -> None:\n    \"\"\"Method to perform any cleanup logic after the `process` method is called. For\n    example, to close a connection to a database, etc.\n    \"\"\"\n    self._logger.debug(\"Executing step unload logic.\")\n
"},{"location":"api/step/#distilabel.steps.base._Step.has_multiple_inputs","title":"has_multiple_inputs()","text":"

Whether the process method of the step receives more than one input or not i.e. has a * argument annotated with StepInput.

Returns:

Type Description bool

True if the process method of the step receives more than one input, False otherwise.

Source code in src/distilabel/steps/base.py
def has_multiple_inputs(self) -> bool:\n    \"\"\"Whether the `process` method of the step receives more than one input or not\n    i.e. has a `*` argument annotated with `StepInput`.\n\n    Returns:\n        `True` if the `process` method of the step receives more than one input,\n        `False` otherwise.\n    \"\"\"\n    return any(\n        param.kind == param.VAR_POSITIONAL for param in self.process_parameters\n    )\n
"},{"location":"api/step/#distilabel.steps.base._Step.get_process_step_input","title":"get_process_step_input()","text":"

Returns the parameter of the process method of the step annotated with StepInput.

Returns:

Type Description Union[Parameter, None]

The parameter of the process method of the step annotated with StepInput, or None if there is no parameter annotated with StepInput.

Raises:

Type Description TypeError

If the step has more than one parameter annotated with StepInput.

Source code in src/distilabel/steps/base.py
def get_process_step_input(self) -> Union[inspect.Parameter, None]:\n    \"\"\"Returns the parameter of the `process` method of the step annotated with\n    `StepInput`.\n\n    Returns:\n        The parameter of the `process` method of the step annotated with `StepInput`,\n        or `None` if there is no parameter annotated with `StepInput`.\n\n    Raises:\n        TypeError: If the step has more than one parameter annotated with `StepInput`.\n    \"\"\"\n    step_input_parameter = None\n    for parameter in self.process_parameters:\n        if is_parameter_annotated_with(parameter, _STEP_INPUT_ANNOTATION):\n            if step_input_parameter is not None:\n                raise DistilabelTypeError(\n                    f\"Step '{self.name}' should have only one parameter with type\"\n                    \" hint `StepInput`.\",\n                    page=\"sections/how_to_guides/basic/step/#defining-custom-steps\",\n                )\n            step_input_parameter = parameter\n    return step_input_parameter\n
"},{"location":"api/step/#distilabel.steps.base._Step.verify_inputs_mappings","title":"verify_inputs_mappings()","text":"

Verifies that the input_mappings of the step are valid, i.e. the input columns exist in the inputs of the step.

Raises:

Type Description ValueError

If the input_mappings of the step are not valid.

Source code in src/distilabel/steps/base.py
def verify_inputs_mappings(self) -> None:\n    \"\"\"Verifies that the `inputs_mappings` of the step are valid i.e. the input\n    columns exist in the inputs of the step.\n\n    Raises:\n        ValueError: If the `inputs_mappings` of the step are not valid.\n    \"\"\"\n    if not self.input_mappings:\n        return\n\n    for input in self.input_mappings:\n        if input not in self.inputs:\n            raise DistilabelUserError(\n                f\"The input column '{input}' doesn't exist in the inputs of the\"\n                f\" step '{self.name}'. Inputs of the step are: {self.inputs}.\"\n                \" Please, review the `inputs_mappings` argument of the step.\",\n                page=\"sections/how_to_guides/basic/step/#arguments\",\n            )\n
"},{"location":"api/step/#distilabel.steps.base._Step.verify_outputs_mappings","title":"verify_outputs_mappings()","text":"

Verifies that the output_mappings of the step are valid, i.e. the output columns exist in the outputs of the step.

Raises:

Type Description ValueError

If the output_mappings of the step are not valid.

Source code in src/distilabel/steps/base.py
def verify_outputs_mappings(self) -> None:\n    \"\"\"Verifies that the `outputs_mappings` of the step are valid i.e. the output\n    columns exist in the outputs of the step.\n\n    Raises:\n        ValueError: If the `outputs_mappings` of the step are not valid.\n    \"\"\"\n    if not self.output_mappings:\n        return\n\n    for output in self.output_mappings:\n        if output not in self.outputs:\n            raise DistilabelUserError(\n                f\"The output column '{output}' doesn't exist in the outputs of the\"\n                f\" step '{self.name}'. Outputs of the step are: {self.outputs}.\"\n                \" Please, review the `outputs_mappings` argument of the step.\",\n                page=\"sections/how_to_guides/basic/step/#arguments\",\n            )\n
"},{"location":"api/step/#distilabel.steps.base._Step.get_inputs","title":"get_inputs()","text":"

Gets the inputs of the step after the input_mappings. This method is meant to be used to run validations on the inputs of the step.

Returns:

Type Description Dict[str, bool]

The inputs of the step after applying the input_mappings, and whether each one is required or not.

Source code in src/distilabel/steps/base.py
def get_inputs(self) -> Dict[str, bool]:\n    \"\"\"Gets the inputs of the step after the `input_mappings`. This method is meant\n    to be used to run validations on the inputs of the step.\n\n    Returns:\n        The inputs of the step after the `input_mappings` and if they are required or\n        not.\n    \"\"\"\n    if isinstance(self.inputs, list):\n        return {\n            self.input_mappings.get(input, input): True for input in self.inputs\n        }\n\n    return {\n        self.input_mappings.get(input, input): required\n        for input, required in self.inputs.items()\n    }\n
"},{"location":"api/step/#distilabel.steps.base._Step.get_outputs","title":"get_outputs()","text":"

Gets the outputs of the step after applying the output_mappings. This method is meant to be used to run validations on the outputs of the step.

Returns:

Type Description Dict[str, bool]

The outputs of the step after applying the output_mappings, and whether each one is required or not.

Source code in src/distilabel/steps/base.py
def get_outputs(self) -> Dict[str, bool]:\n    \"\"\"Gets the outputs of the step after the `outputs_mappings`. This method is\n    meant to be used to run validations on the outputs of the step.\n\n    Returns:\n        The outputs of the step after the `outputs_mappings` and if they are required\n        or not.\n    \"\"\"\n    if isinstance(self.outputs, list):\n        return {\n            self.output_mappings.get(output, output): True\n            for output in self.outputs\n        }\n\n    return {\n        self.output_mappings.get(output, output): required\n        for output, required in self.outputs.items()\n    }\n
"},{"location":"api/step/#distilabel.steps.base._Step.set_pipeline_artifacts_path","title":"set_pipeline_artifacts_path(path)","text":"

Sets the _pipeline_artifacts_path attribute. This method is meant to be used by the Pipeline once the cache location is known.

Parameters:

Name Type Description Default path Path

the path where the artifacts generated by the pipeline steps should be saved.

required Source code in src/distilabel/steps/base.py
def set_pipeline_artifacts_path(self, path: Path) -> None:\n    \"\"\"Sets the `_pipeline_artifacts_path` attribute. This method is meant to be used\n    by the `Pipeline` once the cache location is known.\n\n    Args:\n        path: the path where the artifacts generated by the pipeline steps should be\n            saved.\n    \"\"\"\n    self._pipeline_artifacts_path = path\n
"},{"location":"api/step/#distilabel.steps.base._Step.save_artifact","title":"save_artifact(name, write_function, metadata=None)","text":"

Saves an artifact generated by the Step.

Parameters:

Name Type Description Default name str

the name of the artifact.

required write_function Callable[[Path], None]

a function that will receive the path where the artifact should be saved.

required metadata Optional[Dict[str, Any]]

the artifact metadata. Defaults to None.

None Source code in src/distilabel/steps/base.py
def save_artifact(\n    self,\n    name: str,\n    write_function: Callable[[Path], None],\n    metadata: Optional[Dict[str, Any]] = None,\n) -> None:\n    \"\"\"Saves an artifact generated by the `Step`.\n\n    Args:\n        name: the name of the artifact.\n        write_function: a function that will receive the path where the artifact should\n            be saved.\n        metadata: the artifact metadata. Defaults to `None`.\n    \"\"\"\n    if self.artifacts_directory is None:\n        self._logger.warning(\n            f\"Cannot save artifact with '{name}' as `_pipeline_artifacts_path` is not\"\n            \" set. This is normal if the `Step` is being executed as a standalone component.\"\n        )\n        return\n\n    artifact_directory_path = self.artifacts_directory / name\n    artifact_directory_path.mkdir(parents=True, exist_ok=True)\n\n    self._logger.info(f\"\ud83c\udffa Storing '{name}' generated artifact...\")\n\n    self._logger.debug(\n        f\"Calling `write_function` to write artifact in '{artifact_directory_path}'...\"\n    )\n    write_function(artifact_directory_path)\n\n    metadata_path = artifact_directory_path / \"metadata.json\"\n    self._logger.debug(\n        f\"Calling `write_json` to write artifact metadata in '{metadata_path}'...\"\n    )\n    write_json(filename=metadata_path, data=metadata or {})\n
"},{"location":"api/step/#distilabel.steps.base._Step.impute_step_outputs","title":"impute_step_outputs(step_output)","text":"

Imputes the output columns of the step that are not present in the step output.

Source code in src/distilabel/steps/base.py
def impute_step_outputs(\n    self, step_output: List[Dict[str, Any]]\n) -> List[Dict[str, Any]]:\n    \"\"\"\n    Imputes the output columns of the step that are not present in the step output.\n    \"\"\"\n    result = []\n    for row in step_output:\n        data = row.copy()\n        for output in self.get_outputs().keys():\n            data[output] = None\n        result.append(data)\n    return result\n
"},{"location":"api/step/#distilabel.steps.base.Step","title":"Step","text":"

Bases: _Step, ABC

Base class for the steps that can be included in a Pipeline.

Attributes:

Name Type Description input_batch_size RuntimeParameter[PositiveInt]

The number of rows that each batch processed by the step will contain. Defaults to 50.

Runtime parameters
  • input_batch_size: The number of rows that each batch processed by the step will contain. Defaults to 50.
Source code in src/distilabel/steps/base.py
class Step(_Step, ABC):\n    \"\"\"Base class for the steps that can be included in a `Pipeline`.\n\n    Attributes:\n        input_batch_size: The number of rows that will contain the batches processed by\n            the step. Defaults to `50`.\n\n    Runtime parameters:\n        - `input_batch_size`: The number of rows that will contain the batches processed\n            by the step. Defaults to `50`.\n    \"\"\"\n\n    input_batch_size: RuntimeParameter[PositiveInt] = Field(\n        default=DEFAULT_INPUT_BATCH_SIZE,\n        description=\"The number of rows that will contain the batches processed by the\"\n        \" step.\",\n    )\n\n    @abstractmethod\n    def process(self, *inputs: StepInput) -> \"StepOutput\":\n        \"\"\"Method that defines the processing logic of the step. It should yield the\n        output rows.\n\n        Args:\n            *inputs: An argument used to receive the outputs of the previous steps. The\n                number of arguments depends on the number of previous steps. It doesn't\n                need to be an `*args` argument, it can be a regular argument annotated\n                with `StepInput` if the step has only one previous step.\n        \"\"\"\n        pass\n\n    def process_applying_mappings(self, *args: List[Dict[str, Any]]) -> \"StepOutput\":\n        \"\"\"Runs the `process` method of the step applying the `input_mappings` to the input\n        rows and the `outputs_mappings` to the output rows. 
This is the function that\n        should be used to run the processing logic of the step.\n\n        Yields:\n            The output rows.\n        \"\"\"\n\n        inputs, overriden_inputs = (\n            self._apply_input_mappings(args)\n            if self.input_mappings\n            else (args, [{} for _ in range(len(args[0]))])\n        )\n\n        # If the `Step` was built using the `@step` decorator, then we need to pass\n        # the runtime parameters as kwargs, so they can be used within the processing\n        # function\n        generator = (\n            self.process(*inputs)\n            if not self._built_from_decorator\n            else self.process(*inputs, **self._runtime_parameters)\n        )\n\n        for output_rows in generator:\n            restored = []\n            for i, row in enumerate(output_rows):\n                # Correct the index here because we don't know the num_generations from the llm\n                # ahead of time. For example, if we have `len(overriden_inputs)==5` and `len(row)==10`,\n                # from `num_generations==2` and `group_generations=False` in the LLM:\n                # The loop will use indices 0, 1, 2, 3, 4, 0, 1, 2, 3, 4\n                ntimes_i = i % len(overriden_inputs)\n                restored.append(\n                    self._apply_mappings_and_restore_overriden(\n                        row, overriden_inputs[ntimes_i]\n                    )\n                )\n            yield restored\n\n    def _apply_input_mappings(\n        self, inputs: Tuple[List[Dict[str, Any]], ...]\n    ) -> Tuple[Tuple[List[Dict[str, Any]], ...], List[Dict[str, Any]]]:\n        \"\"\"Applies the `input_mappings` to the input rows.\n\n        Args:\n            inputs: The input rows.\n\n        Returns:\n            The input rows with the `input_mappings` applied and the overriden values\n                that were replaced by the `input_mappings`.\n        \"\"\"\n        reverted_input_mappings = {v: k for 
k, v in self.input_mappings.items()}\n\n        renamed_inputs = []\n        overriden_inputs = []\n        for i, row_inputs in enumerate(inputs):\n            renamed_row_inputs = []\n            for row in row_inputs:\n                overriden_keys = {}\n                renamed_row = {}\n                for k, v in row.items():\n                    renamed_key = reverted_input_mappings.get(k, k)\n\n                    if renamed_key not in renamed_row or k != renamed_key:\n                        renamed_row[renamed_key] = v\n\n                        if k != renamed_key and renamed_key in row and len(inputs) == 1:\n                            overriden_keys[renamed_key] = row[renamed_key]\n\n                if i == 0:\n                    overriden_inputs.append(overriden_keys)\n                renamed_row_inputs.append(renamed_row)\n            renamed_inputs.append(renamed_row_inputs)\n        return tuple(renamed_inputs), overriden_inputs\n\n    def _apply_mappings_and_restore_overriden(\n        self, row: Dict[str, Any], overriden: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"Reverts the `input_mappings` applied to the input rows and applies the `output_mappings`\n        to the output rows. 
In addition, it restores the overriden values that were replaced\n        by the `input_mappings`.\n\n        Args:\n            row: The output row.\n            overriden: The overriden values that were replaced by the `input_mappings`.\n\n        Returns:\n            The output row with the `output_mappings` applied and the overriden values\n            restored.\n        \"\"\"\n        result = {}\n        for k, v in row.items():\n            mapped_key = (\n                self.output_mappings.get(k, None)\n                or self.input_mappings.get(k, None)\n                or k\n            )\n            result[mapped_key] = v\n\n        # Restore overriden values\n        for k, v in overriden.items():\n            if k not in result:\n                result[k] = v\n\n        return result\n
"},{"location":"api/step/#distilabel.steps.base.Step.process","title":"process(*inputs) abstractmethod","text":"

Method that defines the processing logic of the step. It should yield the output rows.

Parameters:

Name Type Description Default *inputs StepInput

An argument used to receive the outputs of the previous steps. The number of arguments depends on the number of previous steps. It doesn't need to be an *args argument, it can be a regular argument annotated with StepInput if the step has only one previous step.

() Source code in src/distilabel/steps/base.py
@abstractmethod\ndef process(self, *inputs: StepInput) -> \"StepOutput\":\n    \"\"\"Method that defines the processing logic of the step. It should yield the\n    output rows.\n\n    Args:\n        *inputs: An argument used to receive the outputs of the previous steps. The\n            number of arguments depends on the number of previous steps. It doesn't\n            need to be an `*args` argument, it can be a regular argument annotated\n            with `StepInput` if the step has only one previous step.\n    \"\"\"\n    pass\n
"},{"location":"api/step/#distilabel.steps.base.Step.process_applying_mappings","title":"process_applying_mappings(*args)","text":"

Runs the process method of the step applying the input_mappings to the input rows and the output_mappings to the output rows. This is the function that should be used to run the processing logic of the step.

Yields:

Type Description StepOutput

The output rows.

Source code in src/distilabel/steps/base.py
def process_applying_mappings(self, *args: List[Dict[str, Any]]) -> \"StepOutput\":\n    \"\"\"Runs the `process` method of the step applying the `input_mappings` to the input\n    rows and the `outputs_mappings` to the output rows. This is the function that\n    should be used to run the processing logic of the step.\n\n    Yields:\n        The output rows.\n    \"\"\"\n\n    inputs, overriden_inputs = (\n        self._apply_input_mappings(args)\n        if self.input_mappings\n        else (args, [{} for _ in range(len(args[0]))])\n    )\n\n    # If the `Step` was built using the `@step` decorator, then we need to pass\n    # the runtime parameters as kwargs, so they can be used within the processing\n    # function\n    generator = (\n        self.process(*inputs)\n        if not self._built_from_decorator\n        else self.process(*inputs, **self._runtime_parameters)\n    )\n\n    for output_rows in generator:\n        restored = []\n        for i, row in enumerate(output_rows):\n            # Correct the index here because we don't know the num_generations from the llm\n            # ahead of time. For example, if we have `len(overriden_inputs)==5` and `len(row)==10`,\n            # from `num_generations==2` and `group_generations=False` in the LLM:\n            # The loop will use indices 0, 1, 2, 3, 4, 0, 1, 2, 3, 4\n            ntimes_i = i % len(overriden_inputs)\n            restored.append(\n                self._apply_mappings_and_restore_overriden(\n                    row, overriden_inputs[ntimes_i]\n                )\n            )\n        yield restored\n
"},{"location":"api/step/decorator/","title":"@step","text":"

This section contains the reference for the @step decorator, used to create new Step subclasses without having to manually define the class.

For more information check the Tutorial - Step page.

"},{"location":"api/step/decorator/#distilabel.steps.decorator","title":"decorator","text":""},{"location":"api/step/decorator/#distilabel.steps.decorator.step","title":"step(inputs=None, outputs=None, step_type='normal')","text":"
step(inputs: Union[StepColumns, None] = None, outputs: Union[StepColumns, None] = None, step_type: Literal['normal'] = 'normal') -> Callable[..., Type[Step]]\n
step(inputs: Union[StepColumns, None] = None, outputs: Union[StepColumns, None] = None, step_type: Literal['global'] = 'global') -> Callable[..., Type[GlobalStep]]\n
step(inputs: None = None, outputs: Union[StepColumns, None] = None, step_type: Literal['generator'] = 'generator') -> Callable[..., Type[GeneratorStep]]\n

Creates a Step from a processing function.

Parameters:

Name Type Description Default inputs Union[StepColumns, None]

a list containing the names of the input columns/keys required by the step, or a dictionary where the keys are the columns and the values are booleans indicating whether each column is required or not. If not provided the default will be an empty list [] and it will be assumed that the step doesn't need any specific columns. Defaults to None.

None outputs Union[StepColumns, None]

a list containing the name of the outputs columns/keys or a dictionary where the keys are the columns and the values are booleans indicating whether the column will be generated or not. If not provided the default will be an empty list [] and it will be assumed that the step doesn't need any specific columns. Defaults to None.

None step_type Literal['normal', 'global', 'generator']

the kind of step to create. Valid choices are: \"normal\" (Step), \"global\" (GlobalStep) or \"generator\" (GeneratorStep). Defaults to \"normal\".

'normal'

Returns:

Type Description Callable[..., Type[_Step]]

A callable that will generate the type given the processing function.

Example:

# Normal step\n@step(inputs=[\"instruction\"], outputs=[\"generation\"])\ndef GenerationStep(inputs: StepInput, dummy_generation: RuntimeParameter[str]) -> StepOutput:\n    for input in inputs:\n        input[\"generation\"] = dummy_generation\n    yield inputs\n\n# Global step\n@step(inputs=[\"instruction\"], step_type=\"global\")\ndef FilteringStep(inputs: StepInput, max_length: RuntimeParameter[int] = 256) -> StepOutput:\n    yield [\n        input\n        for input in inputs\n        if len(input[\"instruction\"]) <= max_length\n    ]\n\n# Generator step\n@step(outputs=[\"num\"], step_type=\"generator\")\ndef RowGenerator(num_rows: RuntimeParameter[int] = 500) -> GeneratorStepOutput:\n    data = list(range(num_rows))\n    for i in range(0, len(data), 100):\n        last_batch = i + 100 >= len(data)\n        yield [{\"num\": num} for num in data[i : i + 100]], last_batch\n
Source code in src/distilabel/steps/decorator.py
def step(\n    inputs: Union[\"StepColumns\", None] = None,\n    outputs: Union[\"StepColumns\", None] = None,\n    step_type: Literal[\"normal\", \"global\", \"generator\"] = \"normal\",\n) -> Callable[..., Type[\"_Step\"]]:\n    \"\"\"Creates an `Step` from a processing function.\n\n    Args:\n        inputs: a list containing the name of the inputs columns/keys or a dictionary\n            where the keys are the columns and the values are booleans indicating whether\n            the column is required or not, that are required by the step. If not provided\n            the default will be an empty list `[]` and it will be assumed that the step\n            doesn't need any specific columns. Defaults to `None`.\n        outputs: a list containing the name of the outputs columns/keys or a dictionary\n            where the keys are the columns and the values are booleans indicating whether\n            the column will be generated or not. If not provided the default will be an\n            empty list `[]` and it will be assumed that the step doesn't need any specific\n            columns. Defaults to `None`.\n        step_type: the kind of step to create. Valid choices are: \"normal\" (`Step`),\n            \"global\" (`GlobalStep`) or \"generator\" (`GeneratorStep`). 
Defaults to\n            `\"normal\"`.\n\n    Returns:\n        A callable that will generate the type given the processing function.\n\n    Example:\n\n    ```python\n    # Normal step\n    @step(inputs=[\"instruction\"], outputs=[\"generation\"])\n    def GenerationStep(inputs: StepInput, dummy_generation: RuntimeParameter[str]) -> StepOutput:\n        for input in inputs:\n            input[\"generation\"] = dummy_generation\n        yield inputs\n\n    # Global step\n    @step(inputs=[\"instruction\"], step_type=\"global\")\n    def FilteringStep(inputs: StepInput, max_length: RuntimeParameter[int] = 256) -> StepOutput:\n        yield [\n            input\n            for input in inputs\n            if len(input[\"instruction\"]) <= max_length\n        ]\n\n    # Generator step\n    @step(outputs=[\"num\"], step_type=\"generator\")\n    def RowGenerator(num_rows: RuntimeParameter[int] = 500) -> GeneratorStepOutput:\n        data = list(range(num_rows))\n        for i in range(0, len(data), 100):\n            last_batch = i + 100 >= len(data)\n            yield [{\"num\": num} for num in data[i : i + 100]], last_batch\n    ```\n    \"\"\"\n\n    inputs = inputs or []\n    outputs = outputs or []\n\n    def decorator(func: ProcessingFunc) -> Type[\"_Step\"]:\n        if step_type not in _STEP_MAPPING:\n            raise ValueError(\n                f\"Invalid step type '{step_type}'. Please, review the '{func.__name__}'\"\n                \" function decorated with the `@step` decorator and provide a valid\"\n                \" `step_type`. 
Valid choices are: 'normal', 'global' or 'generator'.\"\n            )\n\n        BaseClass = _STEP_MAPPING[step_type]\n\n        signature = inspect.signature(func)\n\n        runtime_parameters = {\n            name: (\n                param.annotation,\n                param.default if param.default != param.empty else None,\n            )\n            for name, param in signature.parameters.items()\n        }\n\n        runtime_parameters = {}\n        step_input_parameter = None\n        for name, param in signature.parameters.items():\n            if is_parameter_annotated_with(param, _RUNTIME_PARAMETER_ANNOTATION):\n                runtime_parameters[name] = (\n                    param.annotation,\n                    param.default if param.default != param.empty else None,\n                )\n\n            if not step_type == \"generator\" and is_parameter_annotated_with(\n                param, _STEP_INPUT_ANNOTATION\n            ):\n                if step_input_parameter is not None:\n                    raise ValueError(\n                        f\"Function '{func.__name__}' has more than one parameter annotated\"\n                        f\" with `StepInput`. 
Please, review the '{func.__name__}' function\"\n                        \" decorated with the `@step` decorator and provide only one\"\n                        \" argument annotated with `StepInput`.\"\n                    )\n                step_input_parameter = param\n\n        RuntimeParametersModel = create_model(  # type: ignore\n            \"RuntimeParametersModel\",\n            **runtime_parameters,  # type: ignore\n        )\n\n        def inputs_property(self) -> \"StepColumns\":\n            return inputs\n\n        def outputs_property(self) -> \"StepColumns\":\n            return outputs\n\n        def process(\n            self, *args: Any, **kwargs: Any\n        ) -> Union[\"StepOutput\", \"GeneratorStepOutput\"]:\n            return func(*args, **kwargs)\n\n        return type(  # type: ignore\n            func.__name__,\n            (\n                BaseClass,\n                RuntimeParametersModel,\n            ),\n            {\n                \"process\": process,\n                \"inputs\": property(inputs_property),\n                \"outputs\": property(outputs_property),\n                \"__module__\": func.__module__,\n                \"__doc__\": func.__doc__,\n                \"_built_from_decorator\": True,\n                # Override the `get_process_step_input` method to return the parameter\n                # of the original function annotated with `StepInput`.\n                \"get_process_step_input\": lambda self: step_input_parameter,\n            },\n        )\n\n    return decorator\n
"},{"location":"api/step/generator_step/","title":"GeneratorStep","text":"

This section contains the API reference for the GeneratorStep class.

For more information and examples on how to use existing generator steps or create custom ones, please refer to Tutorial - Step - GeneratorStep.

"},{"location":"api/step/generator_step/#distilabel.steps.base.GeneratorStep","title":"GeneratorStep","text":"

Bases: _Step, ABC

A special kind of Step that is able to generate data i.e. it doesn't receive any input from the previous steps.

Attributes:

Name Type Description batch_size RuntimeParameter[int]

The number of rows that each batch generated by the step will contain. Defaults to 50.

Runtime parameters
  • batch_size: The number of rows that each batch generated by the step will contain. Defaults to 50.
Source code in src/distilabel/steps/base.py
class GeneratorStep(_Step, ABC):\n    \"\"\"A special kind of `Step` that is able to generate data i.e. it doesn't receive\n    any input from the previous steps.\n\n    Attributes:\n        batch_size: The number of rows that will contain the batches generated by the\n            step. Defaults to `50`.\n\n    Runtime parameters:\n        - `batch_size`: The number of rows that will contain the batches generated by\n            the step. Defaults to `50`.\n    \"\"\"\n\n    batch_size: RuntimeParameter[int] = Field(\n        default=50,\n        description=\"The number of rows that will contain the batches generated by the\"\n        \" step.\",\n    )\n\n    @abstractmethod\n    def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n        \"\"\"Method that defines the generation logic of the step. It should yield the\n        output rows and a boolean indicating if it's the last batch or not.\n\n        Args:\n            offset: The offset to start the generation from. Defaults to 0.\n\n        Yields:\n            The output rows and a boolean indicating if it's the last batch or not.\n        \"\"\"\n        pass\n\n    def process_applying_mappings(self, offset: int = 0) -> \"GeneratorStepOutput\":\n        \"\"\"Runs the `process` method of the step applying the `outputs_mappings` to the\n        output rows. This is the function that should be used to run the generation logic\n        of the step.\n\n        Args:\n            offset: The offset to start the generation from. 
Defaults to 0.\n\n        Yields:\n            The output rows and a boolean indicating if it's the last batch or not.\n        \"\"\"\n\n        # If the `Step` was built using the `@step` decorator, then we need to pass\n        # the runtime parameters as `kwargs`, so they can be used within the processing\n        # function\n        generator = (\n            self.process(offset=offset)\n            if not self._built_from_decorator\n            else self.process(offset=offset, **self._runtime_parameters)\n        )\n\n        for output_rows, last_batch in generator:\n            yield (\n                [\n                    {self.output_mappings.get(k, k): v for k, v in row.items()}\n                    for row in output_rows\n                ],\n                last_batch,\n            )\n
"},{"location":"api/step/generator_step/#distilabel.steps.base.GeneratorStep.process","title":"process(offset=0) abstractmethod","text":"

Method that defines the generation logic of the step. It should yield the output rows and a boolean indicating if it's the last batch or not.

Parameters:

Name Type Description Default offset int

The offset to start the generation from. Defaults to 0.

0

Yields:

Type Description GeneratorStepOutput

The output rows and a boolean indicating if it's the last batch or not.

Source code in src/distilabel/steps/base.py
@abstractmethod\ndef process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n    \"\"\"Method that defines the generation logic of the step. It should yield the\n    output rows and a boolean indicating if it's the last batch or not.\n\n    Args:\n        offset: The offset to start the generation from. Defaults to 0.\n\n    Yields:\n        The output rows and a boolean indicating if it's the last batch or not.\n    \"\"\"\n    pass\n
"},{"location":"api/step/generator_step/#distilabel.steps.base.GeneratorStep.process_applying_mappings","title":"process_applying_mappings(offset=0)","text":"

Runs the process method of the step applying the output_mappings to the output rows. This is the function that should be used to run the generation logic of the step.

Parameters:

Name Type Description Default offset int

The offset to start the generation from. Defaults to 0.

0

Yields:

Type Description GeneratorStepOutput

The output rows and a boolean indicating if it's the last batch or not.

Source code in src/distilabel/steps/base.py
def process_applying_mappings(self, offset: int = 0) -> \"GeneratorStepOutput\":\n    \"\"\"Runs the `process` method of the step applying the `outputs_mappings` to the\n    output rows. This is the function that should be used to run the generation logic\n    of the step.\n\n    Args:\n        offset: The offset to start the generation from. Defaults to 0.\n\n    Yields:\n        The output rows and a boolean indicating if it's the last batch or not.\n    \"\"\"\n\n    # If the `Step` was built using the `@step` decorator, then we need to pass\n    # the runtime parameters as `kwargs`, so they can be used within the processing\n    # function\n    generator = (\n        self.process(offset=offset)\n        if not self._built_from_decorator\n        else self.process(offset=offset, **self._runtime_parameters)\n    )\n\n    for output_rows, last_batch in generator:\n        yield (\n            [\n                {self.output_mappings.get(k, k): v for k, v in row.items()}\n                for row in output_rows\n            ],\n            last_batch,\n        )\n
"},{"location":"api/step/generator_step/#distilabel.steps.generators.utils.make_generator_step","title":"make_generator_step(dataset, pipeline=None, batch_size=50, input_mappings=None, output_mappings=None, resources=StepResources(), repo_id='default_name')","text":"

Helper method that simplifies creating a GeneratorStep from a dataset.

Parameters:

Name Type Description Default dataset Union[Dataset, DataFrame, List[Dict[str, str]]]

The dataset to use in the Pipeline.

required batch_size int

The batch_size, will default to the same used by the GeneratorSteps. Defaults to 50.

50 input_mappings Optional[Dict[str, str]]

Applies the same as any other step. Defaults to None.

None output_mappings Optional[Dict[str, str]]

Applies the same as any other step. Defaults to None.

None resources StepResources

Applies the same as any other step. Defaults to StepResources().

StepResources() repo_id Optional[str]

The repository ID to use in the LoadDataFromHub step. This shouldn't be necessary, but in case of error, an attempt will be made to load the dataset using load_dataset internally, in which case the repo_id will be used.

'default_name'

Raises:

Type Description ValueError

If the format is different from the ones supported.

Returns:

Type Description GeneratorStep

A LoadDataFromDicts if the input is a list of dicts, or LoadDataFromHub instance

GeneratorStep

if the input is a pd.DataFrame or a Dataset.

Source code in src/distilabel/steps/generators/utils.py
def make_generator_step(\n    dataset: Union[Dataset, pd.DataFrame, List[Dict[str, str]]],\n    pipeline: Union[\"BasePipeline\", None] = None,\n    batch_size: int = 50,\n    input_mappings: Optional[Dict[str, str]] = None,\n    output_mappings: Optional[Dict[str, str]] = None,\n    resources: StepResources = StepResources(),\n    repo_id: Optional[str] = \"default_name\",\n) -> \"GeneratorStep\":\n    \"\"\"Helper method to create a `GeneratorStep` from a dataset, to simplify\n\n    Args:\n        dataset: The dataset to use in the `Pipeline`.\n        batch_size: The batch_size, will default to the same used by the `GeneratorStep`s.\n            Defaults to `50`.\n        input_mappings: Applies the same as any other step. Defaults to `None`.\n        output_mappings: Applies the same as any other step. Defaults to `None`.\n        resources: Applies the same as any other step. Defaults to `StepResources()`.\n        repo_id: The repository ID to use in the `LoadDataFromHub` step.\n            This shouldn't be necessary, but in case of error, the dataset will try to be loaded\n            using `load_dataset` internally. 
If that case happens, the `repo_id` will be used.\n\n    Raises:\n        ValueError: If the format is different from the ones supported.\n\n    Returns:\n        A `LoadDataFromDicts` if the input is a list of dicts, or `LoadDataFromHub` instance\n        if the input is a `pd.DataFrame` or a `Dataset`.\n    \"\"\"\n    from distilabel.steps import LoadDataFromDicts, LoadDataFromHub\n\n    if isinstance(dataset, list):\n        return LoadDataFromDicts(\n            pipeline=pipeline,\n            data=dataset,\n            batch_size=batch_size,\n            input_mappings=input_mappings or {},\n            output_mappings=output_mappings or {},\n            resources=resources,\n        )\n\n    if isinstance(dataset, pd.DataFrame):\n        dataset = Dataset.from_pandas(dataset, preserve_index=False)\n\n    if not isinstance(dataset, Dataset):\n        raise DistilabelUserError(\n            f\"Dataset type not allowed: {type(dataset)}, must be one of: \"\n            \"`datasets.Dataset`, `pd.DataFrame`, `List[Dict[str, str]]`\",\n            page=\"sections/how_to_guides/basic/pipeline/?h=make_#__tabbed_1_2\",\n        )\n\n    loader = LoadDataFromHub(\n        pipeline=pipeline,\n        repo_id=repo_id,\n        batch_size=batch_size,\n        input_mappings=input_mappings or {},\n        output_mappings=output_mappings or {},\n        resources=resources,\n    )\n    super(loader.__class__, loader).load()  # Ensure the logger is loaded\n    loader._dataset = dataset\n    loader.num_examples = len(dataset)\n    loader._dataset_info = {\"default\": dataset.info}\n    return loader\n
"},{"location":"api/step/global_step/","title":"GlobalStep","text":"

This section contains the API reference for the GlobalStep class.

For more information and examples on how to use existing global steps or create custom ones, please refer to Tutorial - Step - GlobalStep.

"},{"location":"api/step/global_step/#distilabel.steps.base.GlobalStep","title":"GlobalStep","text":"

Bases: Step, ABC

A special kind of Step whose process method receives all the data processed by its previous steps at once, instead of receiving it in batches. This kind of step is useful when the processing logic requires having all the data at once, for example to train a model, to perform a global aggregation, etc.

Source code in src/distilabel/steps/base.py
class GlobalStep(Step, ABC):\n    \"\"\"A special kind of `Step` which it's `process` method receives all the data processed\n    by their previous steps at once, instead of receiving it in batches. This kind of steps\n    are useful when the processing logic requires to have all the data at once, for example\n    to train a model, to perform a global aggregation, etc.\n    \"\"\"\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        return []\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        return []\n
"},{"location":"api/step/resources/","title":"StepResources","text":""},{"location":"api/step/resources/#distilabel.steps.base.StepResources","title":"StepResources","text":"

Bases: RuntimeParametersMixin, BaseModel

A class to define the resources assigned to a _Step.

Attributes:

Name Type Description replicas RuntimeParameter[PositiveInt]

The number of replicas for the step.

cpus Optional[RuntimeParameter[PositiveInt]]

The number of CPUs assigned to each step replica.

gpus Optional[RuntimeParameter[PositiveInt]]

The number of GPUs assigned to each step replica.

memory Optional[RuntimeParameter[PositiveInt]]

The memory in bytes required for each step replica.

resources Optional[RuntimeParameter[Dict[str, int]]]

A dictionary containing the number of custom resources required for each step replica.

Source code in src/distilabel/steps/base.py
class StepResources(RuntimeParametersMixin, BaseModel):\n    \"\"\"A class to define the resources assigned to a `_Step`.\n\n    Attributes:\n        replicas: The number of replicas for the step.\n        cpus: The number of CPUs assigned to each step replica.\n        gpus: The number of GPUs assigned to each step replica.\n        memory: The memory in bytes required for each step replica.\n        resources: A dictionary containing the number of custom resources required for\n            each step replica.\n    \"\"\"\n\n    replicas: RuntimeParameter[PositiveInt] = Field(\n        default=1, description=\"The number of replicas for the step.\"\n    )\n    cpus: Optional[RuntimeParameter[PositiveInt]] = Field(\n        default=None, description=\"The number of CPUs assigned to each step replica.\"\n    )\n    gpus: Optional[RuntimeParameter[PositiveInt]] = Field(\n        default=None, description=\"The number of GPUs assigned to each step replica.\"\n    )\n    memory: Optional[RuntimeParameter[PositiveInt]] = Field(\n        default=None, description=\"The memory in bytes required for each step replica.\"\n    )\n    resources: Optional[RuntimeParameter[Dict[str, int]]] = Field(\n        default=None,\n        description=\"A dictionary containing names of custom resources and the\"\n        \" number of those resources required for each step replica.\",\n    )\n
"},{"location":"api/step/typing/","title":"Step Typing","text":""},{"location":"api/step/typing/#distilabel.steps.typing","title":"typing","text":""},{"location":"api/step/typing/#distilabel.steps.typing.StepOutput","title":"StepOutput = Iterator[List[Dict[str, Any]]] module-attribute","text":"

StepOutput is an alias of the typing Iterator[List[Dict[str, Any]]]

"},{"location":"api/step/typing/#distilabel.steps.typing.GeneratorStepOutput","title":"GeneratorStepOutput = Iterator[Tuple[List[Dict[str, Any]], bool]] module-attribute","text":"

GeneratorStepOutput is an alias of the typing Iterator[Tuple[List[Dict[str, Any]], bool]]

"},{"location":"api/step/typing/#distilabel.steps.typing.StepColumns","title":"StepColumns = Union[List[str], Dict[str, bool]] module-attribute","text":"

StepColumns is an alias of the typing Union[List[str], Dict[str, bool]] used by the inputs and outputs properties of a Step. In the case of a List[str], it is a list with the required columns. In the case of a Dict[str, bool], it is a dictionary where the keys are the columns and the values are booleans indicating whether the column is required or not.

"},{"location":"api/step_gallery/argilla/","title":"Argilla","text":"

This section contains the existing steps integrated with Argilla so as to easily push the generated datasets to Argilla.

"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base","title":"base","text":""},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base.ArgillaBase","title":"ArgillaBase","text":"

Bases: Step, ABC

Abstract step that provides a class to subclass from, that contains the boilerplate code required to interact with Argilla, as well as some extra validations on top of it. It also defines the abstract methods that need to be implemented in order to add a new dataset type as a step.

Note

This class is not intended to be instantiated directly, but via a subclass.

Attributes:

Name Type Description dataset_name RuntimeParameter[str]

The name of the dataset in Argilla where the records will be added.

dataset_workspace Optional[RuntimeParameter[str]]

The workspace where the dataset will be created in Argilla. Defaults to None, which means it will be created in the default workspace.

api_url Optional[RuntimeParameter[str]]

The URL of the Argilla API. Defaults to None, which means it will be read from the ARGILLA_API_URL environment variable.

api_key Optional[RuntimeParameter[SecretStr]]

The API key to authenticate with Argilla. Defaults to None, which means it will be read from the ARGILLA_API_KEY environment variable.

Runtime parameters
  • dataset_name: The name of the dataset in Argilla where the records will be added.
  • dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to None, which means it will be created in the default workspace.
  • api_url: The base URL to use for the Argilla API requests.
  • api_key: The API key to authenticate the requests to the Argilla API.
Input columns
  • dynamic, based on the inputs value provided
Source code in src/distilabel/steps/argilla/base.py
class ArgillaBase(Step, ABC):\n    \"\"\"Abstract step that provides a class to subclass from, that contains the boilerplate code\n    required to interact with Argilla, as well as some extra validations on top of it. It also defines\n    the abstract methods that need to be implemented in order to add a new dataset type as a step.\n\n    Note:\n        This class is not intended to be instanced directly, but via subclass.\n\n    Attributes:\n        dataset_name: The name of the dataset in Argilla where the records will be added.\n        dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to\n            `None`, which means it will be created in the default workspace.\n        api_url: The URL of the Argilla API. Defaults to `None`, which means it will be read from\n            the `ARGILLA_API_URL` environment variable.\n        api_key: The API key to authenticate with Argilla. Defaults to `None`, which means it will\n            be read from the `ARGILLA_API_KEY` environment variable.\n\n    Runtime parameters:\n        - `dataset_name`: The name of the dataset in Argilla where the records will be\n            added.\n        - `dataset_workspace`: The workspace where the dataset will be created in Argilla.\n            Defaults to `None`, which means it will be created in the default workspace.\n        - `api_url`: The base URL to use for the Argilla API requests.\n        - `api_key`: The API key to authenticate the requests to the Argilla API.\n\n    Input columns:\n        - dynamic, based on the `inputs` value provided\n    \"\"\"\n\n    dataset_name: RuntimeParameter[str] = Field(\n        default=None, description=\"The name of the dataset in Argilla.\"\n    )\n    dataset_workspace: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The workspace where the dataset will be created in Argilla. 
Defaults \"\n        \"to `None` which means it will be created in the default workspace.\",\n    )\n\n    api_url: Optional[RuntimeParameter[str]] = Field(\n        default_factory=lambda: os.getenv(_ARGILLA_API_URL_ENV_VAR_NAME),\n        description=\"The base URL to use for the Argilla API requests.\",\n    )\n    api_key: Optional[RuntimeParameter[SecretStr]] = Field(\n        default_factory=lambda: os.getenv(_ARGILLA_API_KEY_ENV_VAR_NAME),\n        description=\"The API key to authenticate the requests to the Argilla API.\",\n    )\n\n    _client: Optional[\"Argilla\"] = PrivateAttr(...)\n    _dataset: Optional[\"Dataset\"] = PrivateAttr(...)\n\n    def model_post_init(self, __context: Any) -> None:\n        \"\"\"Checks that the Argilla Python SDK is installed, and then filters the Argilla warnings.\"\"\"\n        super().model_post_init(__context)\n\n        if importlib.util.find_spec(\"argilla\") is None:\n            raise ImportError(\n                \"Argilla is not installed. 
Please install it using `pip install argilla\"\n                \" --upgrade`.\"\n            )\n\n    def _client_init(self) -> None:\n        \"\"\"Initializes the Argilla API client with the provided `api_url` and `api_key`.\"\"\"\n        try:\n            self._client = rg.Argilla(  # type: ignore\n                api_url=self.api_url,\n                api_key=self.api_key.get_secret_value(),  # type: ignore\n                headers={\"Authorization\": f\"Bearer {os.environ['HF_TOKEN']}\"}\n                if isinstance(self.api_url, str)\n                and \"hf.space\" in self.api_url\n                and \"HF_TOKEN\" in os.environ\n                else {},\n            )\n        except Exception as e:\n            raise DistilabelUserError(\n                f\"Failed to initialize the Argilla API: {e}\",\n                page=\"sections/how_to_guides/advanced/argilla/\",\n            ) from e\n\n    @property\n    def _dataset_exists_in_workspace(self) -> bool:\n        \"\"\"Checks if the dataset already exists in Argilla in the provided workspace if any.\n\n        Returns:\n            `True` if the dataset exists, `False` otherwise.\n        \"\"\"\n        return (\n            self._client.datasets(  # type: ignore\n                name=self.dataset_name,  # type: ignore\n                workspace=self.dataset_workspace,\n            )\n            is not None\n        )\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The outputs of the step is an empty list, since the steps subclassing from this one, will\n        always be leaf nodes and won't propagate the inputs neither generate any outputs.\n        \"\"\"\n        return []\n\n    def load(self) -> None:\n        \"\"\"Method to perform any initialization logic before the `process` method is\n        called. 
For example, to load an LLM, stablish a connection to a database, etc.\n        \"\"\"\n        super().load()\n\n        if self.api_url is None or self.api_key is None:\n            raise DistilabelUserError(\n                \"`Argilla` step requires the `api_url` and `api_key` to be provided. Please,\"\n                \" provide those at step instantiation, via environment variables `ARGILLA_API_URL`\"\n                \" and `ARGILLA_API_KEY`, or as `Step` runtime parameters via `pipeline.run(parameters={...})`.\",\n                page=\"sections/how_to_guides/advanced/argilla/\",\n            )\n\n        self._client_init()\n\n    @property\n    @abstractmethod\n    def inputs(self) -> \"StepColumns\": ...\n\n    @abstractmethod\n    def process(self, *inputs: StepInput) -> \"StepOutput\": ...\n
"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base.ArgillaBase.outputs","title":"outputs: StepColumns property","text":"

The outputs of the step are an empty list, since the steps subclassing from this one will always be leaf nodes and will neither propagate the inputs nor generate any outputs.

"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base.ArgillaBase.model_post_init","title":"model_post_init(__context)","text":"

Checks that the Argilla Python SDK is installed, and then filters the Argilla warnings.

Source code in src/distilabel/steps/argilla/base.py
def model_post_init(self, __context: Any) -> None:\n    \"\"\"Checks that the Argilla Python SDK is installed, and then filters the Argilla warnings.\"\"\"\n    super().model_post_init(__context)\n\n    if importlib.util.find_spec(\"argilla\") is None:\n        raise ImportError(\n            \"Argilla is not installed. Please install it using `pip install argilla\"\n            \" --upgrade`.\"\n        )\n
"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.base.ArgillaBase.load","title":"load()","text":"

Method to perform any initialization logic before the process method is called. For example, to load an LLM, establish a connection to a database, etc.

Source code in src/distilabel/steps/argilla/base.py
def load(self) -> None:\n    \"\"\"Method to perform any initialization logic before the `process` method is\n    called. For example, to load an LLM, stablish a connection to a database, etc.\n    \"\"\"\n    super().load()\n\n    if self.api_url is None or self.api_key is None:\n        raise DistilabelUserError(\n            \"`Argilla` step requires the `api_url` and `api_key` to be provided. Please,\"\n            \" provide those at step instantiation, via environment variables `ARGILLA_API_URL`\"\n            \" and `ARGILLA_API_KEY`, or as `Step` runtime parameters via `pipeline.run(parameters={...})`.\",\n            page=\"sections/how_to_guides/advanced/argilla/\",\n        )\n\n    self._client_init()\n
"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference","title":"preference","text":""},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla","title":"PreferenceToArgilla","text":"

Bases: ArgillaBase

Creates a preference dataset in Argilla.

Step that creates a dataset in Argilla during the load phase, and then pushes the input batches into it as records. This dataset is a preference dataset, where there's one field for the instruction and one extra field for each generation within the same record, and then a rating question for each of the generation fields. The rating question asks the annotator to set a rating from 1 to 5 for each of the provided generations.

Note

This step is meant to be used in conjunction with the UltraFeedback step, or any other step generating both ratings and responses for a given instruction and its set of generations. Alternatively, it can also be used with any other task or step generating only the instruction and generations, as the ratings and rationales are optional.

Attributes:

Name Type Description num_generations int

The number of generations to include in the dataset.

dataset_name RuntimeParameter[str]

The name of the dataset in Argilla.

dataset_workspace Optional[RuntimeParameter[str]]

The workspace where the dataset will be created in Argilla. Defaults to None, which means it will be created in the default workspace.

api_url Optional[RuntimeParameter[str]]

The URL of the Argilla API. Defaults to None, which means it will be read from the ARGILLA_API_URL environment variable.

api_key Optional[RuntimeParameter[SecretStr]]

The API key to authenticate with Argilla. Defaults to None, which means it will be read from the ARGILLA_API_KEY environment variable.

Runtime parameters
  • api_url: The base URL to use for the Argilla API requests.
  • api_key: The API key to authenticate the requests to the Argilla API.
Input columns
  • instruction (str): The instruction that was used to generate the completion.
  • generations (List[str]): The completions that were generated based on the input instruction.
  • ratings (List[str], optional): The ratings for the generations. If not provided, the generated ratings won't be pushed to Argilla.
  • rationales (List[str], optional): The rationales for the ratings. If not provided, the generated rationales won't be pushed to Argilla.

Examples:

Push a preference dataset to an Argilla instance:

from distilabel.steps import PreferenceToArgilla\n\nto_argilla = PreferenceToArgilla(\n    num_generations=2,\n    api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n    api_key=\"api.key\",\n    dataset_name=\"argilla_dataset\",\n    dataset_workspace=\"my_workspace\",\n)\nto_argilla.load()\n\nresult = next(\n    to_argilla.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"generations\": [\"first_generation\", \"second_generation\"],\n            }\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'instruction', 'generations': ['first_generation', 'second_generation']}]\n

It can also include ratings and rationales:

result = next(\n    to_argilla.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"generations\": [\"first_generation\", \"second_generation\"],\n                \"ratings\": [\"4\", \"5\"],\n                \"rationales\": [\"rationale for 4\", \"rationale for 5\"],\n            }\n        ],\n    )\n)\n# >>> result\n# [\n#     {\n#         'instruction': 'instruction',\n#         'generations': ['first_generation', 'second_generation'],\n#         'ratings': ['4', '5'],\n#         'rationales': ['rationale for 4', 'rationale for 5']\n#     }\n# ]\n
Source code in src/distilabel/steps/argilla/preference.py
class PreferenceToArgilla(ArgillaBase):\n    \"\"\"Creates a preference dataset in Argilla.\n\n    Step that creates a dataset in Argilla during the load phase, and then pushes the input\n    batches into it as records. This dataset is a preference dataset, where there's one field\n    for the instruction and one extra field per each generation within the same record, and then\n    a rating question per each of the generation fields. The rating question asks the annotator to\n    set a rating from 1 to 5 for each of the provided generations.\n\n    Note:\n        This step is meant to be used in conjunction with the `UltraFeedback` step, or any other step\n        generating both ratings and responses for a given set of instruction and generations for the\n        given instruction. But alternatively, it can also be used with any other task or step generating\n        only the `instruction` and `generations`, as the `ratings` and `rationales` are optional.\n\n    Attributes:\n        num_generations: The number of generations to include in the dataset.\n        dataset_name: The name of the dataset in Argilla.\n        dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to\n            `None`, which means it will be created in the default workspace.\n        api_url: The URL of the Argilla API. Defaults to `None`, which means it will be read from\n            the `ARGILLA_API_URL` environment variable.\n        api_key: The API key to authenticate with Argilla. 
Defaults to `None`, which means it will\n            be read from the `ARGILLA_API_KEY` environment variable.\n\n    Runtime parameters:\n        - `api_url`: The base URL to use for the Argilla API requests.\n        - `api_key`: The API key to authenticate the requests to the Argilla API.\n\n    Input columns:\n        - instruction (`str`): The instruction that was used to generate the completion.\n        - generations (`List[str]`): The completion that was generated based on the input instruction.\n        - ratings (`List[str]`, optional): The ratings for the generations. If not provided, the\n            generated ratings won't be pushed to Argilla.\n        - rationales (`List[str]`, optional): The rationales for the ratings. If not provided, the\n            generated rationales won't be pushed to Argilla.\n\n    Examples:\n        Push a preference dataset to an Argilla instance:\n\n        ```python\n        from distilabel.steps import PreferenceToArgilla\n\n        to_argilla = PreferenceToArgilla(\n            num_generations=2,\n            api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n            api_key=\"api.key\",\n            dataset_name=\"argilla_dataset\",\n            dataset_workspace=\"my_workspace\",\n        )\n        to_argilla.load()\n\n        result = next(\n            to_argilla.process(\n                [\n                    {\n                        \"instruction\": \"instruction\",\n                        \"generations\": [\"first_generation\", \"second_generation\"],\n                    }\n                ],\n            )\n        )\n        # >>> result\n        # [{'instruction': 'instruction', 'generations': ['first_generation', 'second_generation']}]\n        ```\n\n        It can also include ratings and rationales:\n\n        ```python\n        result = next(\n            to_argilla.process(\n                [\n                    {\n                        \"instruction\": \"instruction\",\n               
         \"generations\": [\"first_generation\", \"second_generation\"],\n                        \"ratings\": [\"4\", \"5\"],\n                        \"rationales\": [\"rationale for 4\", \"rationale for 5\"],\n                    }\n                ],\n            )\n        )\n        # >>> result\n        # [\n        #     {\n        #         'instruction': 'instruction',\n        #         'generations': ['first_generation', 'second_generation'],\n        #         'ratings': ['4', '5'],\n        #         'rationales': ['rationale for 4', 'rationale for 5']\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    num_generations: int\n\n    _id: str = PrivateAttr(default=\"id\")\n    _instruction: str = PrivateAttr(...)\n    _generations: str = PrivateAttr(...)\n    _ratings: str = PrivateAttr(...)\n    _rationales: str = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"Sets the `_instruction` and `_generations` attributes based on the `inputs_mapping`, otherwise\n        uses the default values; and then uses those values to create a `FeedbackDataset` suited for\n        the text-generation scenario. 
And then it pushes it to Argilla.\n        \"\"\"\n        super().load()\n\n        # Both `instruction` and `generations` will be used as the fields of the dataset\n        self._instruction = self.input_mappings.get(\"instruction\", \"instruction\")\n        self._generations = self.input_mappings.get(\"generations\", \"generations\")\n        # Both `ratings` and `rationales` will be used as suggestions to the default questions of the dataset\n        self._ratings = self.input_mappings.get(\"ratings\", \"ratings\")\n        self._rationales = self.input_mappings.get(\"rationales\", \"rationales\")\n\n        if self._dataset_exists_in_workspace:\n            _dataset = self._client.datasets(  # type: ignore\n                name=self.dataset_name,  # type: ignore\n                workspace=self.dataset_workspace,  # type: ignore\n            )\n\n            for field in _dataset.fields:\n                if not isinstance(field, rg.TextField):\n                    continue\n                if (\n                    field.name\n                    not in [self._id, self._instruction]  # type: ignore\n                    + [\n                        f\"{self._generations}-{idx}\"\n                        for idx in range(self.num_generations)\n                    ]\n                    and field.required\n                ):\n                    raise DistilabelUserError(\n                        f\"The dataset '{self.dataset_name}' in the workspace '{self.dataset_workspace}'\"\n                        f\" already exists, but contains at least a required field that is\"\n                        f\" neither `{self._id}`, `{self._instruction}`, nor `{self._generations}`\"\n                        f\" (one per generation starting from 0 up to {self.num_generations - 1}).\",\n                        page=\"components-gallery/steps/preferencetoargilla/\",\n                    )\n\n            self._dataset = _dataset\n        else:\n            _settings = 
rg.Settings(  # type: ignore\n                fields=[\n                    rg.TextField(name=self._id, title=self._id),  # type: ignore\n                    rg.TextField(name=self._instruction, title=self._instruction),  # type: ignore\n                    *self._generation_fields(),  # type: ignore\n                ],\n                questions=self._rating_rationale_pairs(),  # type: ignore\n            )\n            _dataset = rg.Dataset(  # type: ignore\n                name=self.dataset_name,\n                workspace=self.dataset_workspace,\n                settings=_settings,\n                client=self._client,\n            )\n            self._dataset = _dataset.create()\n\n    def _generation_fields(self) -> List[\"TextField\"]:\n        \"\"\"Method to generate the fields for each of the generations.\n\n        Returns:\n            A list containing `TextField`s for each text generation.\n        \"\"\"\n        return [\n            rg.TextField(  # type: ignore\n                name=f\"{self._generations}-{idx}\",\n                title=f\"{self._generations}-{idx}\",\n                required=True if idx == 0 else False,\n            )\n            for idx in range(self.num_generations)\n        ]\n\n    def _rating_rationale_pairs(\n        self,\n    ) -> List[Union[\"RatingQuestion\", \"TextQuestion\"]]:\n        \"\"\"Method to generate the rating and rationale questions for each of the generations.\n\n        Returns:\n            A list of questions containing a `RatingQuestion` and `TextQuestion` pair for\n            each text generation.\n        \"\"\"\n        questions = []\n        for idx in range(self.num_generations):\n            questions.extend(\n                [\n                    rg.RatingQuestion(  # type: ignore\n                        name=f\"{self._generations}-{idx}-rating\",\n                        title=f\"Rate {self._generations}-{idx} given {self._instruction}.\",\n                        description=f\"Ignore 
this question if the corresponding `{self._generations}-{idx}` field is not available.\"\n                        if idx != 0\n                        else None,\n                        values=[1, 2, 3, 4, 5],\n                        required=True if idx == 0 else False,\n                    ),\n                    rg.TextQuestion(  # type: ignore\n                        name=f\"{self._generations}-{idx}-rationale\",\n                        title=f\"Specify the rationale for {self._generations}-{idx}'s rating.\",\n                        description=f\"Ignore this question if the corresponding `{self._generations}-{idx}` field is not available.\"\n                        if idx != 0\n                        else None,\n                        required=False,\n                    ),\n                ]\n            )\n        return questions\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The inputs for the step are the `instruction` and the `generations`. Optionally, one could also\n        provide the `ratings` and the `rationales` for the generations.\"\"\"\n        return [\"instruction\", \"generations\"]\n\n    @property\n    def optional_inputs(self) -> List[str]:\n        \"\"\"The optional inputs for the step are the `ratings` and the `rationales` for the generations.\"\"\"\n        return [\"ratings\", \"rationales\"]\n\n    def _add_suggestions_if_any(self, input: Dict[str, Any]) -> List[\"Suggestion\"]:\n        \"\"\"Method to generate the suggestions for the `rg.Record` based on the input.\n\n        Returns:\n            A list of `Suggestion`s for the rating and rationales questions.\n        \"\"\"\n        # Since the `suggestions` i.e. 
answers to the `questions` are optional, will default to {}\n        suggestions = []\n        # If `ratings` is in `input`, then add those as suggestions\n        if self._ratings in input:\n            suggestions.extend(\n                [\n                    rg.Suggestion(  # type: ignore\n                        value=rating,\n                        question_name=f\"{self._generations}-{idx}-rating\",\n                    )\n                    for idx, rating in enumerate(input[self._ratings])\n                    if rating is not None\n                    and isinstance(rating, int)\n                    and rating in [1, 2, 3, 4, 5]\n                ],\n            )\n        # If `rationales` is in `input`, then add those as suggestions\n        if self._rationales in input:\n            suggestions.extend(\n                [\n                    rg.Suggestion(  # type: ignore\n                        value=rationale,\n                        question_name=f\"{self._generations}-{idx}-rationale\",\n                    )\n                    for idx, rationale in enumerate(input[self._rationales])\n                    if rationale is not None and isinstance(rationale, str)\n                ],\n            )\n        return suggestions\n\n    @override\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Creates and pushes the records as `rg.Record`s to the Argilla dataset.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Returns:\n            A list of Python dictionaries with the outputs of the task.\n        \"\"\"\n        records = []\n        for input in inputs:\n            # Generate the SHA-256 hash of the instruction to use it as the metadata\n            instruction_id = hashlib.sha256(\n                input[\"instruction\"].encode(\"utf-8\")  # type: ignore\n            ).hexdigest()\n\n            generations = {\n                
f\"{self._generations}-{idx}\": generation\n                for idx, generation in enumerate(input[\"generations\"])  # type: ignore\n            }\n\n            records.append(  # type: ignore\n                rg.Record(  # type: ignore\n                    fields={\n                        \"id\": instruction_id,\n                        \"instruction\": input[\"instruction\"],  # type: ignore\n                        **generations,\n                    },\n                    suggestions=self._add_suggestions_if_any(input),  # type: ignore\n                )\n            )\n        self._dataset.records.log(records)  # type: ignore\n        yield inputs\n
"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla.inputs","title":"inputs: List[str] property","text":"

The inputs for the step are the instruction and the generations. Optionally, one could also provide the ratings and the rationales for the generations.

"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla.optional_inputs","title":"optional_inputs: List[str] property","text":"

The optional inputs for the step are the ratings and the rationales for the generations.

"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla.load","title":"load()","text":"

Sets the _instruction and _generations attributes based on the input_mappings, otherwise uses the default values; and then uses those values to create an Argilla Dataset suited for the text-generation scenario. And then it pushes it to Argilla.

Source code in src/distilabel/steps/argilla/preference.py
def load(self) -> None:\n    \"\"\"Sets the `_instruction` and `_generations` attributes based on the `inputs_mapping`, otherwise\n    uses the default values; and then uses those values to create a `FeedbackDataset` suited for\n    the text-generation scenario. And then it pushes it to Argilla.\n    \"\"\"\n    super().load()\n\n    # Both `instruction` and `generations` will be used as the fields of the dataset\n    self._instruction = self.input_mappings.get(\"instruction\", \"instruction\")\n    self._generations = self.input_mappings.get(\"generations\", \"generations\")\n    # Both `ratings` and `rationales` will be used as suggestions to the default questions of the dataset\n    self._ratings = self.input_mappings.get(\"ratings\", \"ratings\")\n    self._rationales = self.input_mappings.get(\"rationales\", \"rationales\")\n\n    if self._dataset_exists_in_workspace:\n        _dataset = self._client.datasets(  # type: ignore\n            name=self.dataset_name,  # type: ignore\n            workspace=self.dataset_workspace,  # type: ignore\n        )\n\n        for field in _dataset.fields:\n            if not isinstance(field, rg.TextField):\n                continue\n            if (\n                field.name\n                not in [self._id, self._instruction]  # type: ignore\n                + [\n                    f\"{self._generations}-{idx}\"\n                    for idx in range(self.num_generations)\n                ]\n                and field.required\n            ):\n                raise DistilabelUserError(\n                    f\"The dataset '{self.dataset_name}' in the workspace '{self.dataset_workspace}'\"\n                    f\" already exists, but contains at least a required field that is\"\n                    f\" neither `{self._id}`, `{self._instruction}`, nor `{self._generations}`\"\n                    f\" (one per generation starting from 0 up to {self.num_generations - 1}).\",\n                    
page=\"components-gallery/steps/preferencetoargilla/\",\n                )\n\n        self._dataset = _dataset\n    else:\n        _settings = rg.Settings(  # type: ignore\n            fields=[\n                rg.TextField(name=self._id, title=self._id),  # type: ignore\n                rg.TextField(name=self._instruction, title=self._instruction),  # type: ignore\n                *self._generation_fields(),  # type: ignore\n            ],\n            questions=self._rating_rationale_pairs(),  # type: ignore\n        )\n        _dataset = rg.Dataset(  # type: ignore\n            name=self.dataset_name,\n            workspace=self.dataset_workspace,\n            settings=_settings,\n            client=self._client,\n        )\n        self._dataset = _dataset.create()\n
"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.preference.PreferenceToArgilla.process","title":"process(inputs)","text":"

Creates and pushes the records as rg.Records to the Argilla dataset.

Parameters:

Name Type Description Default inputs StepInput

A list of Python dictionaries with the inputs of the task.

required

Returns:

Type Description StepOutput

A list of Python dictionaries with the outputs of the task.

Source code in src/distilabel/steps/argilla/preference.py
@override\ndef process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Creates and pushes the records as `rg.Record`s to the Argilla dataset.\n\n    Args:\n        inputs: A list of Python dictionaries with the inputs of the task.\n\n    Returns:\n        A list of Python dictionaries with the outputs of the task.\n    \"\"\"\n    records = []\n    for input in inputs:\n        # Generate the SHA-256 hash of the instruction to use it as the metadata\n        instruction_id = hashlib.sha256(\n            input[\"instruction\"].encode(\"utf-8\")  # type: ignore\n        ).hexdigest()\n\n        generations = {\n            f\"{self._generations}-{idx}\": generation\n            for idx, generation in enumerate(input[\"generations\"])  # type: ignore\n        }\n\n        records.append(  # type: ignore\n            rg.Record(  # type: ignore\n                fields={\n                    \"id\": instruction_id,\n                    \"instruction\": input[\"instruction\"],  # type: ignore\n                    **generations,\n                },\n                suggestions=self._add_suggestions_if_any(input),  # type: ignore\n            )\n        )\n    self._dataset.records.log(records)  # type: ignore\n    yield inputs\n
"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation","title":"text_generation","text":""},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation.TextGenerationToArgilla","title":"TextGenerationToArgilla","text":"

Bases: ArgillaBase

Creates a text generation dataset in Argilla.

Step that creates a dataset in Argilla during the load phase, and then pushes the input batches into it as records. This dataset is a text-generation dataset, where there's one field per input, plus a label question to rate the quality of the completion as either bad (represented with \ud83d\udc4e) or good (represented with \ud83d\udc4d).

Note

This step is meant to be used in conjunction with a TextGeneration step and no column mapping is needed, as it will use the default values for the instruction and generation columns.

Attributes:

Name Type Description dataset_name

The name of the dataset in Argilla.

dataset_workspace

The workspace where the dataset will be created in Argilla. Defaults to None, which means it will be created in the default workspace.

api_url

The URL of the Argilla API. Defaults to None, which means it will be read from the ARGILLA_API_URL environment variable.

api_key

The API key to authenticate with Argilla. Defaults to None, which means it will be read from the ARGILLA_API_KEY environment variable.

Runtime parameters
  • api_url: The base URL to use for the Argilla API requests.
  • api_key: The API key to authenticate the requests to the Argilla API.
Input columns
  • instruction (str): The instruction that was used to generate the completion.
  • generation (str or List[str]): The completions that were generated based on the input instruction.

Examples:

Push a text generation dataset to an Argilla instance:

from distilabel.steps import TextGenerationToArgilla\n\nto_argilla = TextGenerationToArgilla(\n    num_generations=2,\n    api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n    api_key=\"api.key\",\n    dataset_name=\"argilla_dataset\",\n    dataset_workspace=\"my_workspace\",\n)\nto_argilla.load()\n\nresult = next(\n    to_argilla.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"generation\": \"generation\",\n            }\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'instruction', 'generation': 'generation'}]\n
Source code in src/distilabel/steps/argilla/text_generation.py
class TextGenerationToArgilla(ArgillaBase):\n    \"\"\"Creates a text generation dataset in Argilla.\n\n    `Step` that creates a dataset in Argilla during the load phase, and then pushes the input\n    batches into it as records. This dataset is a text-generation dataset, where there's one field\n    per each input, and then a label question to rate the quality of the completion in either bad\n    (represented with \ud83d\udc4e) or good (represented with \ud83d\udc4d).\n\n    Note:\n        This step is meant to be used in conjunction with a `TextGeneration` step and no column mapping\n        is needed, as it will use the default values for the `instruction` and `generation` columns.\n\n    Attributes:\n        dataset_name: The name of the dataset in Argilla.\n        dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to\n            `None`, which means it will be created in the default workspace.\n        api_url: The URL of the Argilla API. Defaults to `None`, which means it will be read from\n            the `ARGILLA_API_URL` environment variable.\n        api_key: The API key to authenticate with Argilla. 
Defaults to `None`, which means it will\n            be read from the `ARGILLA_API_KEY` environment variable.\n\n    Runtime parameters:\n        - `api_url`: The base URL to use for the Argilla API requests.\n        - `api_key`: The API key to authenticate the requests to the Argilla API.\n\n    Input columns:\n        - instruction (`str`): The instruction that was used to generate the completion.\n        - generation (`str` or `List[str]`): The completions that were generated based on the input instruction.\n\n    Examples:\n        Push a text generation dataset to an Argilla instance:\n\n        ```python\n        from distilabel.steps import PreferenceToArgilla\n\n        to_argilla = TextGenerationToArgilla(\n            num_generations=2,\n            api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n            api_key=\"api.key\",\n            dataset_name=\"argilla_dataset\",\n            dataset_workspace=\"my_workspace\",\n        )\n        to_argilla.load()\n\n        result = next(\n            to_argilla.process(\n                [\n                    {\n                        \"instruction\": \"instruction\",\n                        \"generation\": \"generation\",\n                    }\n                ],\n            )\n        )\n        # >>> result\n        # [{'instruction': 'instruction', 'generation': 'generation'}]\n        ```\n    \"\"\"\n\n    _id: str = PrivateAttr(default=\"id\")\n    _instruction: str = PrivateAttr(...)\n    _generation: str = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"Sets the `_instruction` and `_generation` attributes based on the `inputs_mapping`, otherwise\n        uses the default values; and then uses those values to create a `FeedbackDataset` suited for\n        the text-generation scenario. 
And then it pushes it to Argilla.\n        \"\"\"\n        super().load()\n\n        self._instruction = self.input_mappings.get(\"instruction\", \"instruction\")\n        self._generation = self.input_mappings.get(\"generation\", \"generation\")\n\n        if self._dataset_exists_in_workspace:\n            _dataset = self._client.datasets(  # type: ignore\n                name=self.dataset_name,  # type: ignore\n                workspace=self.dataset_workspace,  # type: ignore\n            )\n\n            for field in _dataset.fields:\n                if not isinstance(field, rg.TextField):  # type: ignore\n                    continue\n                if (\n                    field.name not in [self._id, self._instruction, self._generation]\n                    and field.required\n                ):\n                    raise DistilabelUserError(\n                        f\"The dataset '{self.dataset_name}' in the workspace '{self.dataset_workspace}'\"\n                        f\" already exists, but contains at least a required field that is\"\n                        f\" neither `{self._id}`, `{self._instruction}`, nor `{self._generation}`,\"\n                        \" so it cannot be reused for this dataset.\",\n                        page=\"components-gallery/steps/textgenerationtoargilla/\",\n                    )\n\n            self._dataset = _dataset\n        else:\n            _settings = rg.Settings(  # type: ignore\n                fields=[\n                    rg.TextField(name=self._id, title=self._id),  # type: ignore\n                    rg.TextField(name=self._instruction, title=self._instruction),  # type: ignore\n                    rg.TextField(name=self._generation, title=self._generation),  # type: ignore\n                ],\n                questions=[\n                    rg.LabelQuestion(  # type: ignore\n                        name=\"quality\",\n                        title=f\"What's the quality of the {self._generation} for the 
given {self._instruction}?\",\n                        labels={\"bad\": \"\ud83d\udc4e\", \"good\": \"\ud83d\udc4d\"},  # type: ignore\n                    )\n                ],\n            )\n            _dataset = rg.Dataset(  # type: ignore\n                name=self.dataset_name,\n                workspace=self.dataset_workspace,\n                settings=_settings,\n                client=self._client,\n            )\n            self._dataset = _dataset.create()\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The inputs for the step are the `instruction` and the `generation`.\"\"\"\n        return [\"instruction\", \"generation\"]\n\n    @override\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Creates and pushes the records as FeedbackRecords to the Argilla dataset.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Returns:\n            A list of Python dictionaries with the outputs of the task.\n        \"\"\"\n        records = []\n        for input in inputs:\n            # Generate the SHA-256 hash of the instruction to use it as the metadata\n            instruction_id = hashlib.sha256(\n                input[\"instruction\"].encode(\"utf-8\")\n            ).hexdigest()\n\n            generations = input[\"generation\"]\n\n            # If the `generation` is not a list, then convert it into a list\n            if not isinstance(generations, list):\n                generations = [generations]\n\n            # Create a `generations_set` to avoid adding duplicates\n            generations_set = set()\n\n            for generation in generations:\n                # If the generation is already in the set, then skip it\n                if generation in generations_set:\n                    continue\n                # Otherwise, add it to the set\n                generations_set.add(generation)\n\n                records.append(\n        
            rg.Record(  # type: ignore\n                        fields={\n                            self._id: instruction_id,\n                            self._instruction: input[\"instruction\"],\n                            self._generation: generation,\n                        },\n                    ),\n                )\n        self._dataset.records.log(records)  # type: ignore\n        yield inputs\n
"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation.TextGenerationToArgilla.inputs","title":"inputs: List[str] property","text":"

The inputs for the step are the instruction and the generation.

"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation.TextGenerationToArgilla.load","title":"load()","text":"

Sets the _instruction and _generation attributes based on the input_mappings, falling back to the default column names otherwise. If the dataset already exists in the workspace, its required fields are validated and it is reused; otherwise a new dataset suited for the text-generation scenario is created in Argilla.

Source code in src/distilabel/steps/argilla/text_generation.py
def load(self) -> None:\n    \"\"\"Sets the `_instruction` and `_generation` attributes based on the `inputs_mapping`, otherwise\n    uses the default values; and then uses those values to create a `FeedbackDataset` suited for\n    the text-generation scenario. And then it pushes it to Argilla.\n    \"\"\"\n    super().load()\n\n    self._instruction = self.input_mappings.get(\"instruction\", \"instruction\")\n    self._generation = self.input_mappings.get(\"generation\", \"generation\")\n\n    if self._dataset_exists_in_workspace:\n        _dataset = self._client.datasets(  # type: ignore\n            name=self.dataset_name,  # type: ignore\n            workspace=self.dataset_workspace,  # type: ignore\n        )\n\n        for field in _dataset.fields:\n            if not isinstance(field, rg.TextField):  # type: ignore\n                continue\n            if (\n                field.name not in [self._id, self._instruction, self._generation]\n                and field.required\n            ):\n                raise DistilabelUserError(\n                    f\"The dataset '{self.dataset_name}' in the workspace '{self.dataset_workspace}'\"\n                    f\" already exists, but contains at least a required field that is\"\n                    f\" neither `{self._id}`, `{self._instruction}`, nor `{self._generation}`,\"\n                    \" so it cannot be reused for this dataset.\",\n                    page=\"components-gallery/steps/textgenerationtoargilla/\",\n                )\n\n        self._dataset = _dataset\n    else:\n        _settings = rg.Settings(  # type: ignore\n            fields=[\n                rg.TextField(name=self._id, title=self._id),  # type: ignore\n                rg.TextField(name=self._instruction, title=self._instruction),  # type: ignore\n                rg.TextField(name=self._generation, title=self._generation),  # type: ignore\n            ],\n            questions=[\n                rg.LabelQuestion(  # type: ignore\n   
                 name=\"quality\",\n                    title=f\"What's the quality of the {self._generation} for the given {self._instruction}?\",\n                    labels={\"bad\": \"\ud83d\udc4e\", \"good\": \"\ud83d\udc4d\"},  # type: ignore\n                )\n            ],\n        )\n        _dataset = rg.Dataset(  # type: ignore\n            name=self.dataset_name,\n            workspace=self.dataset_workspace,\n            settings=_settings,\n            client=self._client,\n        )\n        self._dataset = _dataset.create()\n
"},{"location":"api/step_gallery/argilla/#distilabel.steps.argilla.text_generation.TextGenerationToArgilla.process","title":"process(inputs)","text":"

Creates and pushes the records as rg.Records to the Argilla dataset.

Parameters:

Name Type Description Default inputs StepInput

A list of Python dictionaries with the inputs of the task.

required

Returns:

Type Description StepOutput

A list of Python dictionaries with the outputs of the task.

Source code in src/distilabel/steps/argilla/text_generation.py
@override\ndef process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Creates and pushes the records as FeedbackRecords to the Argilla dataset.\n\n    Args:\n        inputs: A list of Python dictionaries with the inputs of the task.\n\n    Returns:\n        A list of Python dictionaries with the outputs of the task.\n    \"\"\"\n    records = []\n    for input in inputs:\n        # Generate the SHA-256 hash of the instruction to use it as the metadata\n        instruction_id = hashlib.sha256(\n            input[\"instruction\"].encode(\"utf-8\")\n        ).hexdigest()\n\n        generations = input[\"generation\"]\n\n        # If the `generation` is not a list, then convert it into a list\n        if not isinstance(generations, list):\n            generations = [generations]\n\n        # Create a `generations_set` to avoid adding duplicates\n        generations_set = set()\n\n        for generation in generations:\n            # If the generation is already in the set, then skip it\n            if generation in generations_set:\n                continue\n            # Otherwise, add it to the set\n            generations_set.add(generation)\n\n            records.append(\n                rg.Record(  # type: ignore\n                    fields={\n                        self._id: instruction_id,\n                        self._instruction: input[\"instruction\"],\n                        self._generation: generation,\n                    },\n                ),\n            )\n    self._dataset.records.log(records)  # type: ignore\n    yield inputs\n
"},{"location":"api/step_gallery/columns/","title":"Columns","text":"

This section contains the existing steps intended to be used for common column operations to apply to the batches.

"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand","title":"expand","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns","title":"ExpandColumns","text":"

Bases: Step

Expand columns that contain lists into multiple rows.

ExpandColumns is a Step that takes a list of columns and expands them into multiple rows. The new rows will have the same data as the original row, except for the expanded column, which will contain a single item from the original list.

Attributes:

Name Type Description columns Union[Dict[str, str], List[str]]

A dictionary that maps the column to be expanded to the new column name or a list of columns to be expanded. If a list is provided, the new column name will be the same as the column name.

Input columns
  • dynamic (determined by columns attribute): The columns to be expanded into multiple rows.
Output columns
  • dynamic (determined by columns attribute): The expanded columns.
Categories
  • columns

Examples:

Expand the selected columns into multiple rows:

from distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(\n    columns=[\"generation\"],\n)\nexpand_columns.load()\n\nresult = next(\n    expand_columns.process(\n        [\n            {\n                \"instruction\": \"instruction 1\",\n                \"generation\": [\"generation 1\", \"generation 2\"]}\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]\n
Source code in src/distilabel/steps/columns/expand.py
class ExpandColumns(Step):\n    \"\"\"Expand columns that contain lists into multiple rows.\n\n    `ExpandColumns` is a `Step` that takes a list of columns and expands them into multiple\n    rows. The new rows will have the same data as the original row, except for the expanded\n    column, which will contain a single item from the original list.\n\n    Attributes:\n        columns: A dictionary that maps the column to be expanded to the new column name\n            or a list of columns to be expanded. If a list is provided, the new column name\n            will be the same as the column name.\n\n    Input columns:\n        - dynamic (determined by `columns` attribute): The columns to be expanded into\n            multiple rows.\n\n    Output columns:\n        - dynamic (determined by `columns` attribute):  The expanded columns.\n\n    Categories:\n        - columns\n\n    Examples:\n        Expand the selected columns into multiple rows:\n\n        ```python\n        from distilabel.steps import ExpandColumns\n\n        expand_columns = ExpandColumns(\n            columns=[\"generation\"],\n        )\n        expand_columns.load()\n\n        result = next(\n            expand_columns.process(\n                [\n                    {\n                        \"instruction\": \"instruction 1\",\n                        \"generation\": [\"generation 1\", \"generation 2\"]}\n                ],\n            )\n        )\n        # >>> result\n        # [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]\n        ```\n    \"\"\"\n\n    columns: Union[Dict[str, str], List[str]]\n\n    @field_validator(\"columns\")\n    @classmethod\n    def always_dict(cls, value: Union[Dict[str, str], List[str]]) -> Dict[str, str]:\n        \"\"\"Ensure that the columns are always a dictionary.\n\n        Args:\n            value: The columns to be expanded.\n\n        Returns:\n            The columns to be 
expanded as a dictionary.\n        \"\"\"\n        if isinstance(value, list):\n            return {col: col for col in value}\n\n        return value\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The columns to be expanded.\"\"\"\n        return list(self.columns.keys())\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The expanded columns.\"\"\"\n        return [\n            new_column if new_column else expand_column\n            for expand_column, new_column in self.columns.items()\n        ]\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Expand the columns in the input data.\n\n        Args:\n            inputs: The input data.\n\n        Yields:\n            The expanded rows.\n        \"\"\"\n        yield [row for input in inputs for row in self._expand_columns(input)]\n\n    def _expand_columns(self, input: Dict[str, Any]) -> List[Dict[str, Any]]:\n        \"\"\"Expand the columns in the input data.\n\n        Args:\n            input: The input data.\n\n        Returns:\n            The expanded rows.\n        \"\"\"\n        expanded_rows = []\n        for expand_column, new_column in self.columns.items():  # type: ignore\n            data = input.get(expand_column)\n            rows = []\n            for item, expanded in zip_longest(*[data, expanded_rows], fillvalue=input):\n                rows.append({**expanded, new_column: item})\n            expanded_rows = rows\n        return expanded_rows\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns.inputs","title":"inputs: StepColumns property","text":"

The columns to be expanded.

"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns.outputs","title":"outputs: StepColumns property","text":"

The expanded columns.

"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns.always_dict","title":"always_dict(value) classmethod","text":"

Ensure that the columns are always a dictionary.

Parameters:

Name Type Description Default value Union[Dict[str, str], List[str]]

The columns to be expanded.

required

Returns:

Type Description Dict[str, str]

The columns to be expanded as a dictionary.

Source code in src/distilabel/steps/columns/expand.py
@field_validator(\"columns\")\n@classmethod\ndef always_dict(cls, value: Union[Dict[str, str], List[str]]) -> Dict[str, str]:\n    \"\"\"Ensure that the columns are always a dictionary.\n\n    Args:\n        value: The columns to be expanded.\n\n    Returns:\n        The columns to be expanded as a dictionary.\n    \"\"\"\n    if isinstance(value, list):\n        return {col: col for col in value}\n\n    return value\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.expand.ExpandColumns.process","title":"process(inputs)","text":"

Expand the columns in the input data.

Parameters:

Name Type Description Default inputs StepInput

The input data.

required

Yields:

Type Description StepOutput

The expanded rows.

Source code in src/distilabel/steps/columns/expand.py
def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Expand the columns in the input data.\n\n    Args:\n        inputs: The input data.\n\n    Yields:\n        The expanded rows.\n    \"\"\"\n    yield [row for input in inputs for row in self._expand_columns(input)]\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep","title":"keep","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep.KeepColumns","title":"KeepColumns","text":"

Bases: Step

Keeps selected columns in the dataset.

KeepColumns is a Step whose process method keeps only the columns specified in the columns attribute. The columns attribute also determines the values returned by the inputs and outputs properties.

Note

The order in which the columns are provided is important, as the output will be sorted using the provided order, which is useful before pushing either a datasets.Dataset via the PushToHub step or a distilabel.Distiset via the Pipeline.run output variable.

Attributes:

Name Type Description columns List[str]

List of strings with the names of the columns to keep.

Input columns
  • dynamic (determined by columns attribute): The columns to keep.
Output columns
  • dynamic (determined by columns attribute): The columns that were kept.
Categories
  • columns

Examples:

Select the columns to keep:

from distilabel.steps import KeepColumns\n\nkeep_columns = KeepColumns(\n    columns=[\"instruction\", \"generation\"],\n)\nkeep_columns.load()\n\nresult = next(\n    keep_columns.process(\n        [{\"instruction\": \"What's the brightest color?\", \"generation\": \"white\", \"model_name\": \"my_model\"}],\n    )\n)\n# >>> result\n# [{'instruction': \"What's the brightest color?\", 'generation': 'white'}]\n
Source code in src/distilabel/steps/columns/keep.py
class KeepColumns(Step):\n    \"\"\"Keeps selected columns in the dataset.\n\n    `KeepColumns` is a `Step` that implements the `process` method that keeps only the columns\n    specified in the `columns` attribute. Also `KeepColumns` provides an attribute `columns` to\n    specify the columns to keep which will override the default value for the properties `inputs`\n    and `outputs`.\n\n    Note:\n        The order in which the columns are provided is important, as the output will be sorted\n        using the provided order, which is useful before pushing either a `dataset.Dataset` via\n        the `PushToHub` step or a `distilabel.Distiset` via the `Pipeline.run` output variable.\n\n    Attributes:\n        columns: List of strings with the names of the columns to keep.\n\n    Input columns:\n        - dynamic (determined by `columns` attribute): The columns to keep.\n\n    Output columns:\n        - dynamic (determined by `columns` attribute): The columns that were kept.\n\n    Categories:\n        - columns\n\n    Examples:\n        Select the columns to keep:\n\n        ```python\n        from distilabel.steps import KeepColumns\n\n        keep_columns = KeepColumns(\n            columns=[\"instruction\", \"generation\"],\n        )\n        keep_columns.load()\n\n        result = next(\n            keep_columns.process(\n                [{\"instruction\": \"What's the brightest color?\", \"generation\": \"white\", \"model_name\": \"my_model\"}],\n            )\n        )\n        # >>> result\n        # [{'instruction': \"What's the brightest color?\", 'generation': 'white'}]\n        ```\n    \"\"\"\n\n    columns: List[str]\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The inputs for the task are the column names in `columns`.\"\"\"\n        return self.columns\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The outputs for the task are the column names in `columns`.\"\"\"\n        return 
self.columns\n\n    @override\n    def process(self, *inputs: StepInput) -> \"StepOutput\":\n        \"\"\"The `process` method keeps only the columns specified in the `columns` attribute.\n\n        Args:\n            *inputs: A list of dictionaries with the input data.\n\n        Yields:\n            A list of dictionaries with the output data.\n        \"\"\"\n        for input in inputs:\n            outputs = []\n            for item in input:\n                outputs.append({col: item[col] for col in self.columns})\n            yield outputs\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep.KeepColumns.inputs","title":"inputs: StepColumns property","text":"

The inputs for the task are the column names in columns.

"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep.KeepColumns.outputs","title":"outputs: StepColumns property","text":"

The outputs for the task are the column names in columns.

"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.keep.KeepColumns.process","title":"process(*inputs)","text":"

The process method keeps only the columns specified in the columns attribute.

Parameters:

Name Type Description Default *inputs StepInput

A list of dictionaries with the input data.

()

Yields:

Type Description StepOutput

A list of dictionaries with the output data.

Source code in src/distilabel/steps/columns/keep.py
@override\ndef process(self, *inputs: StepInput) -> \"StepOutput\":\n    \"\"\"The `process` method keeps only the columns specified in the `columns` attribute.\n\n    Args:\n        *inputs: A list of dictionaries with the input data.\n\n    Yields:\n        A list of dictionaries with the output data.\n    \"\"\"\n    for input in inputs:\n        outputs = []\n        for item in input:\n            outputs.append({col: item[col] for col in self.columns})\n        yield outputs\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.merge","title":"merge","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.merge.MergeColumns","title":"MergeColumns","text":"

Bases: Step

Merge columns from a row.

MergeColumns is a Step that implements the process method that calls the merge_columns function to handle and combine columns in a StepInput. MergeColumns provides two attributes columns and output_column to specify the columns to merge and the resulting output column.

This step can be useful if, for example, you have a Task that generates instructions and you want to have more examples of those. In such a case, you could use another Task to multiply your instructions synthetically, which would yield two separate columns. Using MergeColumns you can merge them and use them as a single column in your dataset for further processing.

Attributes:

Name Type Description columns List[str]

List of strings with the names of the columns to merge.

output_column Optional[str]

Name of the output column. Defaults to None, in which case the merged column is named merged_column.

Input columns
  • dynamic (determined by columns attribute): The columns to merge.
Output columns
  • dynamic (determined by columns and output_column attributes): The columns that were merged.
Categories
  • columns

Examples:

Combine columns in rows of a dataset:

from distilabel.steps import MergeColumns\n\ncombiner = MergeColumns(\n    columns=[\"queries\", \"multiple_queries\"],\n    output_column=\"queries\",\n)\ncombiner.load()\n\nresult = next(\n    combiner.process(\n        [\n            {\n                \"queries\": \"How are you?\",\n                \"multiple_queries\": [\"What's up?\", \"Everything ok?\"]\n            }\n        ],\n    )\n)\n# >>> result\n# [{'queries': ['How are you?', \"What's up?\", 'Everything ok?']}]\n
Source code in src/distilabel/steps/columns/merge.py
class MergeColumns(Step):\n    \"\"\"Merge columns from a row.\n\n    `MergeColumns` is a `Step` that implements the `process` method that calls the `merge_columns`\n    function to handle and combine columns in a `StepInput`. `MergeColumns` provides two attributes\n    `columns` and `output_column` to specify the columns to merge and the resulting output column.\n\n    This step can be useful if you have a `Task` that generates instructions for example, and you\n    want to have more examples of those. In such a case, you could for example use another `Task`\n    to multiply your instructions synthetically, what would yield two different columns splitted.\n    Using `MergeColumns` you can merge them and use them as a single column in your dataset for\n    further processing.\n\n    Attributes:\n        columns: List of strings with the names of the columns to merge.\n        output_column: str name of the output column\n\n    Input columns:\n        - dynamic (determined by `columns` attribute): The columns to merge.\n\n    Output columns:\n        - dynamic (determined by `columns` and `output_column` attributes): The columns\n            that were merged.\n\n    Categories:\n        - columns\n\n    Examples:\n        Combine columns in rows of a dataset:\n\n        ```python\n        from distilabel.steps import MergeColumns\n\n        combiner = MergeColumns(\n            columns=[\"queries\", \"multiple_queries\"],\n            output_column=\"queries\",\n        )\n        combiner.load()\n\n        result = next(\n            combiner.process(\n                [\n                    {\n                        \"queries\": \"How are you?\",\n                        \"multiple_queries\": [\"What's up?\", \"Everything ok?\"]\n                    }\n                ],\n            )\n        )\n        # >>> result\n        # [{'queries': ['How are you?', \"What's up?\", 'Everything ok?']}]\n        ```\n    \"\"\"\n\n    columns: List[str]\n    output_column: 
Optional[str] = None\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        return self.columns\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        return [self.output_column] if self.output_column else [\"merged_column\"]\n\n    @override\n    def process(self, inputs: StepInput) -> \"StepOutput\":\n        combined = []\n        for input in inputs:\n            combined.append(\n                merge_columns(\n                    input,\n                    columns=self.columns,\n                    new_column=self.outputs[0],\n                )\n            )\n        yield combined\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group","title":"group","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.GroupColumns","title":"GroupColumns","text":"

Bases: Step

Combines columns from a list of StepInput.

GroupColumns is a Step that implements the process method that calls the group_dicts function to handle and combine a list of StepInput. Also GroupColumns provides two attributes columns and output_columns to specify the columns to group and the output columns which will override the default value for the properties inputs and outputs, respectively.

Attributes:

Name Type Description columns List[str]

List of strings with the names of the columns to group.

output_columns Optional[List[str]]

Optional list of strings with the names of the output columns.

Input columns
  • dynamic (determined by columns attribute): The columns to group.
Output columns
  • dynamic (determined by columns and output_columns attributes): The columns that were grouped.
Categories
  • columns

Examples:

Group columns of a dataset:\n\n```python\nfrom distilabel.steps import GroupColumns\n\ngroup_columns = GroupColumns(\n    name=\"group_columns\",\n    columns=[\"generation\", \"model_name\"],\n)\ngroup_columns.load()\n\nresult = next(\n    group_columns.process(\n        [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n        [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n    )\n)\n# >>> result\n# [{'grouped_generation': ['AI generated text', 'Other generated text'], 'grouped_model_name': ['my_model']}]\n```\n\nSpecify the name of the output columns:\n\n```python\nfrom distilabel.steps import GroupColumns\n\ngroup_columns = GroupColumns(\n    name=\"group_columns\",\n    columns=[\"generation\", \"model_name\"],\n    output_columns=[\"generations\", \"generation_models\"]\n)\ngroup_columns.load()\n\nresult = next(\n    group_columns.process(\n        [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n        [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n    )\n)\n# >>> result\n# [{'generations': ['AI generated text', 'Other generated text'], 'generation_models': ['my_model']}]\n```\n
Source code in src/distilabel/steps/columns/group.py
class GroupColumns(Step):\n    \"\"\"Combines columns from a list of `StepInput`.\n\n    `GroupColumns` is a `Step` that implements the `process` method that calls the `group_dicts`\n    function to handle and combine a list of `StepInput`. Also `GroupColumns` provides two attributes\n    `columns` and `output_columns` to specify the columns to group and the output columns\n    which will override the default value for the properties `inputs` and `outputs`, respectively.\n\n    Attributes:\n        columns: List of strings with the names of the columns to group.\n        output_columns: Optional list of strings with the names of the output columns.\n\n    Input columns:\n        - dynamic (determined by `columns` attribute): The columns to group.\n\n    Output columns:\n        - dynamic (determined by `columns` and `output_columns` attributes): The columns\n            that were grouped.\n\n    Categories:\n        - columns\n\n    Examples:\n\n        Group columns of a dataset:\n\n        ```python\n        from distilabel.steps import GroupColumns\n\n        group_columns = GroupColumns(\n            name=\"group_columns\",\n            columns=[\"generation\", \"model_name\"],\n        )\n        group_columns.load()\n\n        result = next(\n            group_columns.process(\n                [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n                [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n            )\n        )\n        # >>> result\n        # [{'merged_generation': ['AI generated text', 'Other generated text'], 'merged_model_name': ['my_model']}]\n        ```\n\n        Specify the name of the output columns:\n\n        ```python\n        from distilabel.steps import GroupColumns\n\n        group_columns = GroupColumns(\n            name=\"group_columns\",\n            columns=[\"generation\", \"model_name\"],\n            output_columns=[\"generations\", \"generation_models\"]\n        
)\n        group_columns.load()\n\n        result = next(\n            group_columns.process(\n                [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n                [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n            )\n        )\n        # >>> result\n        #[{'generations': ['AI generated text', 'Other generated text'], 'generation_models': ['my_model']}]\n        ```\n    \"\"\"\n\n    columns: List[str]\n    output_columns: Optional[List[str]] = None\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The inputs for the task are the column names in `columns`.\"\"\"\n        return self.columns\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The outputs for the task are the column names in `output_columns` or\n        `grouped_{column}` for each column in `columns`.\"\"\"\n        return (\n            self.output_columns\n            if self.output_columns is not None\n            else [f\"grouped_{column}\" for column in self.columns]\n        )\n\n    @override\n    def process(self, *inputs: StepInput) -> \"StepOutput\":\n        \"\"\"The `process` method calls the `group_dicts` function to handle and combine a list of `StepInput`.\n\n        Args:\n            *inputs: A list of `StepInput` to be combined.\n\n        Yields:\n            A `StepOutput` with the combined `StepInput` using the `group_dicts` function.\n        \"\"\"\n        yield group_columns(\n            *inputs,\n            group_columns=self.inputs,\n            output_group_columns=self.outputs,\n        )\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.GroupColumns.inputs","title":"inputs: StepColumns property","text":"

The inputs for the task are the column names in columns.

"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.GroupColumns.outputs","title":"outputs: StepColumns property","text":"

The outputs for the task are the column names in output_columns or grouped_{column} for each column in columns.

"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.GroupColumns.process","title":"process(*inputs)","text":"

The process method calls the group_dicts function to handle and combine a list of StepInput.

Parameters:

Name Type Description Default *inputs StepInput

A list of StepInput to be combined.

()

Yields:

Type Description StepOutput

A StepOutput with the combined StepInput using the group_dicts function.

Source code in src/distilabel/steps/columns/group.py
@override\ndef process(self, *inputs: StepInput) -> \"StepOutput\":\n    \"\"\"The `process` method calls the `group_dicts` function to handle and combine a list of `StepInput`.\n\n    Args:\n        *inputs: A list of `StepInput` to be combined.\n\n    Yields:\n        A `StepOutput` with the combined `StepInput` using the `group_dicts` function.\n    \"\"\"\n    yield group_columns(\n        *inputs,\n        group_columns=self.inputs,\n        output_group_columns=self.outputs,\n    )\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.group.CombineColumns","title":"CombineColumns","text":"

Bases: GroupColumns

CombineColumns is deprecated and will be removed in version 1.5.0, use GroupColumns instead.

Source code in src/distilabel/steps/columns/group.py
class CombineColumns(GroupColumns):\n    \"\"\"`CombineColumns` is deprecated and will be removed in version 1.5.0, use `GroupColumns` instead.\"\"\"\n\n    def __init__(self, **data: Any) -> None:\n        warnings.warn(\n            \"`CombineColumns` is deprecated and will be removed in version 1.5.0, use `GroupColumns` instead.\",\n            DeprecationWarning,\n            stacklevel=2,\n        )\n        return super().__init__(**data)\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.utils","title":"utils","text":""},{"location":"api/step_gallery/columns/#distilabel.steps.columns.utils.merge_distilabel_metadata","title":"merge_distilabel_metadata(*output_dicts)","text":"

Merge the DISTILABEL_METADATA_KEY from multiple output dictionaries.

Parameters:

Name Type Description Default *output_dicts Dict[str, Any]

Variable number of dictionaries containing distilabel metadata.

()

Returns:

Type Description Dict[str, Any]

A merged dictionary containing all the distilabel metadata from the input dictionaries.

Source code in src/distilabel/steps/columns/utils.py
def merge_distilabel_metadata(*output_dicts: Dict[str, Any]) -> Dict[str, Any]:\n    \"\"\"\n    Merge the `DISTILABEL_METADATA_KEY` from multiple output dictionaries.\n\n    Args:\n        *output_dicts: Variable number of dictionaries containing distilabel metadata.\n\n    Returns:\n        A merged dictionary containing all the distilabel metadata from the input dictionaries.\n    \"\"\"\n    merged_metadata = defaultdict(list)\n\n    for output_dict in output_dicts:\n        metadata = output_dict.get(DISTILABEL_METADATA_KEY, {})\n        for key, value in metadata.items():\n            merged_metadata[key].append(value)\n\n    final_metadata = {}\n    for key, value_list in merged_metadata.items():\n        if len(value_list) == 1:\n            final_metadata[key] = value_list[0]\n        else:\n            final_metadata[key] = value_list\n\n    return final_metadata\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.utils.group_columns","title":"group_columns(*inputs, group_columns, output_group_columns=None)","text":"

Groups multiple lists of dictionaries into a single list of dictionaries on the specified group_columns. If output_group_columns are provided, the grouped columns are renamed to them; otherwise each grouped column is renamed to grouped_{column}.

Parameters:

Name Type Description Default inputs StepInput

list of dictionaries to combine.

() group_columns List[str]

list of keys to merge on.

required output_group_columns Optional[List[str]]

list of keys to rename the merge keys to. Defaults to None.

None

Returns:

Type Description StepInput

A list of dictionaries where the values of the group_columns are combined into a list and renamed to output_group_columns.

Source code in src/distilabel/steps/columns/utils.py
def group_columns(\n    *inputs: \"StepInput\",\n    group_columns: List[str],\n    output_group_columns: Optional[List[str]] = None,\n) -> \"StepInput\":\n    \"\"\"Groups multiple list of dictionaries into a single list of dictionaries on the\n    specified `group_columns`. If `group_columns` are provided, then it will also rename\n    `group_columns`.\n\n    Args:\n        inputs: list of dictionaries to combine.\n        group_columns: list of keys to merge on.\n        output_group_columns: list of keys to rename the merge keys to. Defaults to `None`.\n\n    Returns:\n        A list of dictionaries where the values of the `group_columns` are combined into a\n        list and renamed to `output_group_columns`.\n    \"\"\"\n    if output_group_columns is not None and len(output_group_columns) != len(\n        group_columns\n    ):\n        raise ValueError(\n            \"The length of `output_group_columns` must be the same as the length of `group_columns`.\"\n        )\n    if output_group_columns is None:\n        output_group_columns = [f\"grouped_{key}\" for key in group_columns]\n    group_columns_dict = dict(zip(group_columns, output_group_columns))\n\n    result = []\n    # Use zip to iterate over lists based on their index\n    for dicts_at_index in zip(*inputs):\n        combined_dict = {}\n        metadata_dicts = []\n        # Iterate over dicts at the same index\n        for d in dicts_at_index:\n            # Extract metadata for merging\n            if DISTILABEL_METADATA_KEY in d:\n                metadata_dicts.append(\n                    {DISTILABEL_METADATA_KEY: d[DISTILABEL_METADATA_KEY]}\n                )\n            # Iterate over key-value pairs in each dict\n            for key, value in d.items():\n                if key == DISTILABEL_METADATA_KEY:\n                    continue\n                # If the key is in the merge_keys, append the value to the existing list\n                if key in group_columns_dict.keys():\n               
     combined_dict.setdefault(group_columns_dict[key], []).append(value)\n                # If the key is not in the merge_keys, create a new key-value pair\n                else:\n                    combined_dict[key] = value\n\n        if metadata_dicts:\n            combined_dict[DISTILABEL_METADATA_KEY] = merge_distilabel_metadata(\n                *metadata_dicts\n            )\n\n        result.append(combined_dict)\n    return result\n
"},{"location":"api/step_gallery/columns/#distilabel.steps.columns.utils.merge_columns","title":"merge_columns(row, columns, new_column='combined_key')","text":"

Merge columns in a dictionary into a single column on the specified new_column.

Parameters:

Name Type Description Default row Dict[str, Any]

Dictionary corresponding to a row in a dataset.

required columns List[str]

List of keys to merge.

required new_column str

Name of the new key created.

'combined_key'

Returns:

Type Description Dict[str, Any]

Dictionary with the new merged key.

Source code in src/distilabel/steps/columns/utils.py
def merge_columns(\n    row: Dict[str, Any], columns: List[str], new_column: str = \"combined_key\"\n) -> Dict[str, Any]:\n    \"\"\"Merge columns in a dictionary into a single column on the specified `new_column`.\n\n    Args:\n        row: Dictionary corresponding to a row in a dataset.\n        columns: List of keys to merge.\n        new_column: Name of the new key created.\n\n    Returns:\n        Dictionary with the new merged key.\n    \"\"\"\n    result = row.copy()  # preserve the original dictionary\n    combined = []\n    for key in columns:\n        to_combine = result.pop(key)\n        if not isinstance(to_combine, list):\n            to_combine = [to_combine]\n        combined += to_combine\n    result[new_column] = combined\n    return result\n
"},{"location":"api/step_gallery/extra/","title":"Extra","text":""},{"location":"api/step_gallery/extra/#distilabel.steps","title":"steps","text":""},{"location":"api/step_gallery/extra/#distilabel.steps.DBSCAN","title":"DBSCAN","text":"

Bases: GlobalStep

DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core samples in regions of high density and expands clusters from them. This algorithm is good for data which contains clusters of similar density.

This is a GlobalStep that clusters the embeddings using the DBSCAN algorithm from sklearn. Visit TextClustering step for an example of use. The trained model is saved as an artifact when creating a distiset and pushing it to the Hugging Face Hub.

Input columns
  • projection (List[float]): Vector representation of the text to cluster, normally the output from the UMAP step.
Output columns
  • cluster_label (int): Integer representing the label of a given cluster. -1 means it wasn't clustered.
Categories
  • clustering
  • text-classification
References
  • DBSCAN demo of sklearn
  • sklearn dbscan

Attributes:

Name Type Description - eps

The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.

- min_samples

The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. If min_samples is set to a higher value, DBSCAN will find denser clusters, whereas if it is set to a lower value, the found clusters will be more sparse.

- metric

The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by sklearn.metrics.pairwise_distances for its metric parameter.

- n_jobs

The number of parallel jobs to run.

Runtime parameters
  • eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.
  • min_samples: The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. If min_samples is set to a higher value, DBSCAN will find denser clusters, whereas if it is set to a lower value, the found clusters will be more sparse.
  • metric: The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by sklearn.metrics.pairwise_distances for its metric parameter.
  • n_jobs: The number of parallel jobs to run.
Source code in src/distilabel/steps/clustering/dbscan.py
class DBSCAN(GlobalStep):\n    r\"\"\"DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core\n    samples in regions of high density and expands clusters from them. This algorithm\n    is good for data which contains clusters of similar density.\n\n    This is a `GlobalStep` that clusters the embeddings using the DBSCAN algorithm\n    from `sklearn`. Visit `TextClustering` step for an example of use.\n    The trained model is saved as an artifact when creating a distiset\n    and pushing it to the Hugging Face Hub.\n\n    Input columns:\n        - projection (`List[float]`): Vector representation of the text to cluster,\n            normally the output from the `UMAP` step.\n\n    Output columns:\n        - cluster_label (`int`): Integer representing the label of a given cluster. -1\n            means it wasn't clustered.\n\n    Categories:\n        - clustering\n        - text-classification\n\n    References:\n        - [`DBSCAN demo of sklearn`](https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#demo-of-dbscan-clustering-algorithm)\n        - [`sklearn dbscan`](https://scikit-learn.org/stable/modules/clustering.html#dbscan)\n\n    Attributes:\n        - eps: The maximum distance between two samples for one to be considered as in the\n            neighborhood of the other. This is not a maximum bound on the distances of\n            points within a cluster. This is the most important DBSCAN parameter to\n            choose appropriately for your data set and distance function.\n        - min_samples: The number of samples (or total weight) in a neighborhood for a point\n            to be considered as a core point. This includes the point itself. 
If `min_samples`\n            is set to a higher value, DBSCAN will find denser clusters, whereas if it is set\n            to a lower value, the found clusters will be more sparse.\n        - metric: The metric to use when calculating distance between instances in a feature\n            array. If metric is a string or callable, it must be one of the options allowed\n            by `sklearn.metrics.pairwise_distances` for its metric parameter.\n        - n_jobs: The number of parallel jobs to run.\n\n    Runtime parameters:\n        - `eps`: The maximum distance between two samples for one to be considered as in the\n            neighborhood of the other. This is not a maximum bound on the distances of\n            points within a cluster. This is the most important DBSCAN parameter to\n            choose appropriately for your data set and distance function.\n        - `min_samples`: The number of samples (or total weight) in a neighborhood for a point\n            to be considered as a core point. This includes the point itself. If `min_samples`\n            is set to a higher value, DBSCAN will find denser clusters, whereas if it is set\n            to a lower value, the found clusters will be more sparse.\n        - `metric`: The metric to use when calculating distance between instances in a feature\n            array. If metric is a string or callable, it must be one of the options allowed\n            by `sklearn.metrics.pairwise_distances` for its metric parameter.\n        - `n_jobs`: The number of parallel jobs to run.\n    \"\"\"\n\n    eps: Optional[RuntimeParameter[float]] = Field(\n        default=0.3,\n        description=(\n            \"The maximum distance between two samples for one to be considered \"\n            \"as in the neighborhood of the other. This is not a maximum bound \"\n            \"on the distances of points within a cluster. 
This is the most \"\n            \"important DBSCAN parameter to choose appropriately for your data set \"\n            \"and distance function.\"\n        ),\n    )\n    min_samples: Optional[RuntimeParameter[int]] = Field(\n        default=30,\n        description=(\n            \"The number of samples (or total weight) in a neighborhood for a point to \"\n            \"be considered as a core point. This includes the point itself. If \"\n            \"`min_samples` is set to a higher value, DBSCAN will find denser clusters, \"\n            \"whereas if it is set to a lower value, the found clusters will be more \"\n            \"sparse.\"\n        ),\n    )\n    metric: Optional[RuntimeParameter[str]] = Field(\n        default=\"euclidean\",\n        description=(\n            \"The metric to use when calculating distance between instances in a \"\n            \"feature array. If metric is a string or callable, it must be one of \"\n            \"the options allowed by `sklearn.metrics.pairwise_distances` for \"\n            \"its metric parameter.\"\n        ),\n    )\n    n_jobs: Optional[RuntimeParameter[int]] = Field(\n        default=8, description=\"The number of parallel jobs to run.\"\n    )\n\n    _clusterer: Optional[\"_DBSCAN\"] = PrivateAttr(None)\n\n    def load(self) -> None:\n        super().load()\n        if importlib.util.find_spec(\"sklearn\") is None:\n            raise ImportError(\n                \"`sklearn` package is not installed. 
Please install it using `pip install scikit-learn`.\"\n            )\n        from sklearn.cluster import DBSCAN as _DBSCAN\n\n        self._clusterer = _DBSCAN(\n            eps=self.eps,\n            min_samples=self.min_samples,\n            metric=self.metric,\n            n_jobs=self.n_jobs,\n        )\n\n    def unload(self) -> None:\n        self._clusterer = None\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"projection\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"cluster_label\"]\n\n    def _save_model(self, model: Any) -> None:\n        import joblib\n\n        def save_model(path):\n            with open(str(path / \"DBSCAN.joblib\"), \"wb\") as f:\n                joblib.dump(model, f)\n\n        self.save_artifact(\n            name=\"DBSCAN_model\",\n            write_function=lambda path: save_model(path),\n            metadata={\n                \"eps\": self.eps,\n                \"min_samples\": self.min_samples,\n                \"metric\": self.metric,\n            },\n        )\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        projections = np.array([input[\"projection\"] for input in inputs])\n\n        self._logger.info(\"\ud83c\udfcb\ufe0f\u200d\u2640\ufe0f Start training DBSCAN...\")\n        fitted_clusterer = self._clusterer.fit(projections)\n        cluster_labels = fitted_clusterer.labels_\n        # Sets the cluster labels for each input, -1 means it wasn't clustered\n        for input, cluster_label in zip(inputs, cluster_labels):\n            input[\"cluster_label\"] = cluster_label\n        self._logger.info(f\"DBSCAN labels assigned: {len(set(cluster_labels))}\")\n        self._save_model(fitted_clusterer)\n        yield inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering","title":"TextClustering","text":"

Bases: TextClassification, GlobalTask

Task that clusters a set of texts and generates summary labels for each cluster.

This is a GlobalTask that inherits from TextClassification, which means that all the attributes from that class are available here. Also, in this case we deal with all the inputs at once, instead of using batches. The input_batch_size is used here to send the examples to the LLM in batches (a subtle difference with the more common Task definitions). The task looks in each cluster for a given number of representative examples (the number is set by the samples_per_cluster attribute), and sends them to the LLM to get one or more labels that represent the cluster. The labels are then assigned to each text in the cluster. The clusters and projections used in the step are assumed to be obtained from the UMAP + DBSCAN steps, but could be generated by similar steps, as long as they represent the same concepts. This step runs a pipeline like the one in this repository: https://github.com/huggingface/text-clustering

Input columns
  • text (str): The reference text we want to obtain labels for.
  • projection (List[float]): Vector representation of the text to cluster, normally the output from the UMAP step.
  • cluster_label (int): Integer representing the label of a given cluster. -1 means it wasn't clustered.
Output columns
  • summary_label (str): The label or list of labels for the text.
  • model_name (str): The name of the model used to generate the label/s.
Categories
  • clustering
  • text-classification
References
  • text-clustering repository

Attributes:

Name Type Description - savefig

Whether to generate and save a figure with the clustering of the texts.

- samples_per_cluster

The number of examples to use in the LLM as a sample of the cluster.

Examples:

Generate labels for a set of texts using clustering:

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps import UMAP, DBSCAN, TextClustering\nfrom distilabel.pipeline import Pipeline\n\nds_name = \"argilla-warehouse/personahub-fineweb-edu-4-clustering-100k\"\n\nwith Pipeline(name=\"Text clustering dataset\") as pipeline:\n    batch_size = 500\n\n    ds = load_dataset(ds_name, split=\"train\").select(range(10000))\n    loader = make_generator_step(ds, batch_size=batch_size, repo_id=ds_name)\n\n    umap = UMAP(n_components=2, metric=\"cosine\")\n    dbscan = DBSCAN(eps=0.3, min_samples=30)\n\n    text_clustering = TextClustering(\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        ),\n        n=3,  # 3 labels per example\n        query_title=\"Examples of Personas\",\n        samples_per_cluster=10,\n        context=(\n            \"Describe the main themes, topics, or categories that could describe the \"\n            \"following types of personas. All the examples of personas must share \"\n            \"the same set of labels.\"\n        ),\n        default_label=\"None\",\n        savefig=True,\n        input_batch_size=8,\n        input_mappings={\"text\": \"persona\"},\n        use_default_structured_output=True,\n    )\n\n    loader >> umap >> dbscan >> text_clustering\n
Source code in src/distilabel/steps/clustering/text_clustering.py
class TextClustering(TextClassification, GlobalTask):\n    \"\"\"Task that clusters a set of texts and generates summary labels for each cluster.\n\n    This is a `GlobalTask` that inherits from `TextClassification`, this means that all\n    the attributes from that class are available here. Also, in this case we deal\n    with all the inputs at once, instead of using batches. The `input_batch_size` is\n    used here to send the examples to the LLM in batches (a subtle difference with the\n    more common `Task` definitions).\n    The task looks in each cluster for a given number of representative examples (the number\n    is set by the `samples_per_cluster` attribute), and sends them to the LLM to get a label/s\n    that represent the cluster. The labels are then assigned to each text in the cluster.\n    The clusters and projections used in the step, are assumed to be obtained from the `UMAP`\n    + `DBSCAN` steps, but could be generated for similar steps, as long as they represent the\n    same concepts.\n    This step runs a pipeline like the one in this repository:\n    https://github.com/huggingface/text-clustering\n\n    Input columns:\n        - text (`str`): The reference text we want to obtain labels for.\n        - projection (`List[float]`): Vector representation of the text to cluster,\n            normally the output from the `UMAP` step.\n        - cluster_label (`int`): Integer representing the label of a given cluster. 
-1\n            means it wasn't clustered.\n\n    Output columns:\n        - summary_label (`str`): The label or list of labels for the text.\n        - model_name (`str`): The name of the model used to generate the label/s.\n\n    Categories:\n        - clustering\n        - text-classification\n\n    References:\n        - [`text-clustering repository`](https://github.com/huggingface/text-clustering)\n\n    Attributes:\n        - savefig: Whether to generate and save a figure with the clustering of the texts.\n        - samples_per_cluster: The number of examples to use in the LLM as a sample of the cluster.\n\n    Examples:\n        Generate labels for a set of texts using clustering:\n\n        ```python\n        from distilabel.models import InferenceEndpointsLLM\n        from distilabel.steps import UMAP, DBSCAN, TextClustering\n        from distilabel.pipeline import Pipeline\n\n        ds_name = \"argilla-warehouse/personahub-fineweb-edu-4-clustering-100k\"\n\n        with Pipeline(name=\"Text clustering dataset\") as pipeline:\n            batch_size = 500\n\n            ds = load_dataset(ds_name, split=\"train\").select(range(10000))\n            loader = make_generator_step(ds, batch_size=batch_size, repo_id=ds_name)\n\n            umap = UMAP(n_components=2, metric=\"cosine\")\n            dbscan = DBSCAN(eps=0.3, min_samples=30)\n\n            text_clustering = TextClustering(\n                llm=InferenceEndpointsLLM(\n                    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                    tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                ),\n                n=3,  # 3 labels per example\n                query_title=\"Examples of Personas\",\n                samples_per_cluster=10,\n                context=(\n                    \"Describe the main themes, topics, or categories that could describe the \"\n                    \"following types of personas. 
All the examples of personas must share \"\n                    \"the same set of labels.\"\n                ),\n                default_label=\"None\",\n                savefig=True,\n                input_batch_size=8,\n                input_mappings={\"text\": \"persona\"},\n                use_default_structured_output=True,\n            )\n\n            loader >> umap >> dbscan >> text_clustering\n        ```\n    \"\"\"\n\n    savefig: Optional[RuntimeParameter[bool]] = Field(\n        default=True,\n        description=\"Whether to generate and save a figure with the clustering of the texts.\",\n    )\n    samples_per_cluster: int = Field(\n        default=10,\n        description=\"The number of examples to use in the LLM as a sample of the cluster.\",\n    )\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task are the same as those for `TextClassification` plus\n        the `projection` and `cluster_label` columns (which can be obtained from\n        UMAP + DBSCAN steps).\n        \"\"\"\n        return super().inputs + [\"projection\", \"cluster_label\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task is the `summary_label` and the `model_name`.\"\"\"\n        return [\"summary_label\", \"model_name\"]\n\n    def load(self) -> None:\n        super().load()\n        if self.savefig and (importlib.util.find_spec(\"matplotlib\") is None):\n            raise ImportError(\n                \"`matplotlib` package is not installed. 
Please install it using `pip install matplotlib`.\"\n            )\n\n    def _save_figure(\n        self,\n        data: pd.DataFrame,\n        cluster_centers: Dict[str, Tuple[float, float]],\n        cluster_summaries: Dict[int, str],\n    ) -> None:\n        \"\"\"Saves the figure starting from the dataframe, using matplotlib.\n\n        Args:\n            data: pd.DataFrame with the columns 'X', 'Y' and 'labels' representing\n                the projections and the label of each text respectively.\n            cluster_centers: Dictionary mapping from each label the center of a cluster,\n                to help with the placement of the annotations.\n            cluster_summaries: The summaries of the clusters, obtained from the LLM.\n        \"\"\"\n        import matplotlib.pyplot as plt\n\n        fig, ax = plt.subplots(figsize=(12, 8), dpi=300)\n        unique_labels = data[\"labels\"].unique()\n        # Map of colors for each label (-1 is black)\n        colormap = dict(\n            zip(unique_labels, plt.cm.Spectral(np.linspace(0, 1, len(unique_labels))))\n        )\n        colormap[-1] = np.array([0, 0, 0, 0])\n        data[\"color\"] = data[\"labels\"].map(colormap)\n\n        data.plot(\n            kind=\"scatter\",\n            x=\"X\",\n            y=\"Y\",\n            c=\"color\",\n            s=0.75,\n            alpha=0.8,\n            linewidth=0.4,\n            ax=ax,\n            colorbar=False,\n        )\n\n        for label in cluster_summaries.keys():\n            if label == -1:\n                continue\n            summary = str(cluster_summaries[label])  # These are obtained from the LLM\n            position = cluster_centers[label]\n            t = ax.text(\n                position[0],\n                position[1],\n                summary,\n                horizontalalignment=\"center\",\n                verticalalignment=\"center\",\n                fontsize=4,\n            )\n            t.set_bbox(\n                {\n       
             \"facecolor\": \"white\",\n                    \"alpha\": 0.9,\n                    \"linewidth\": 0,\n                    \"boxstyle\": \"square,pad=0.1\",\n                }\n            )\n\n        ax.set_axis_off()\n        # Save the plot as an artifact of the step\n        self.save_artifact(\n            name=\"Text clusters\",\n            write_function=lambda path: fig.savefig(path / \"figure_clustering.png\"),\n            metadata={\"type\": \"image\", \"library\": \"matplotlib\"},\n        )\n        plt.close()\n\n    def _create_figure(\n        self,\n        inputs: StepInput,\n        label2docs: Dict[int, List[str]],\n        cluster_summaries: Dict[int, str],\n    ) -> None:\n        \"\"\"Creates a figure of the clustered texts and save it as an artifact.\n\n        Args:\n            inputs: The inputs of the step, as we will extract information from them again.\n            label2docs: Map from each label to the list of documents (texts) that belong to that cluster.\n            cluster_summaries: The summaries of the clusters, obtained from the LLM.\n        \"\"\"\n        self._logger.info(\"\ud83d\uddbc\ufe0f Creating figure for the clusters...\")\n\n        labels = []\n        projections = []\n        id2cluster = {}\n        for i, input in enumerate(inputs):\n            label = input[\"cluster_label\"]\n            id2cluster[i] = label\n            labels.append(label)\n            projections.append(input[\"projection\"])\n\n        projections = np.array(projections)\n\n        # Contains the placement of the cluster centers in the figure\n        cluster_centers: Dict[str, Tuple[float, float]] = {}\n        for label in label2docs.keys():\n            x = np.mean([projections[doc, 0] for doc in label2docs[label]])\n            y = np.mean([projections[doc, 1] for doc in label2docs[label]])\n            cluster_centers[label] = (x, y)\n\n        df = pd.DataFrame(\n            data={\n                \"X\": 
projections[:, 0],\n                \"Y\": projections[:, 1],\n                \"labels\": labels,\n            }\n        )\n\n        self._save_figure(\n            df, cluster_centers=cluster_centers, cluster_summaries=cluster_summaries\n        )\n\n    def _prepare_input_texts(\n        self,\n        inputs: StepInput,\n        label2docs: Dict[int, List[int]],\n        unique_labels: List[int],\n    ) -> List[Dict[str, Union[str, int]]]:\n        \"\"\"Prepares a batch of inputs to send to the LLM, with the examples of each cluster.\n\n        Args:\n            inputs: Inputs from the step.\n            label2docs: Map from each label to the list of documents (texts) that\n                belong to that cluster.\n            unique_labels: The unique labels of the clusters.\n\n        Returns:\n            The input texts to send to the LLM, with the examples of each cluster\n            prepared to be used in the prompt, and an additional key to store the\n            labels (that will be needed to find the data after the batches are\n            returned from the LLM).\n        \"\"\"\n        input_texts = []\n        for label in range(unique_labels):  # The label -1 is implicitly excluded\n            # Get the ids but remove possible duplicates, which could happen with bigger probability\n            # the bigger the number of examples requested, and the smaller the subset of examples\n            ids = set(\n                np.random.choice(label2docs[label], size=self.samples_per_cluster)\n            )  # Grab the number of examples\n            examples = [inputs[i][\"text\"] for i in ids]\n            input_text = {\n                \"text\": \"\\n\\n\".join(\n                    [f\"Example {i}:\\n{t}\" for i, t in enumerate(examples, start=1)]\n                ),\n                \"__LABEL\": label,\n            }\n            input_texts.append(input_text)\n        return input_texts\n\n    def process(self, inputs: StepInput) -> 
\"StepOutput\":\n        labels = [input[\"cluster_label\"] for input in inputs]\n        # -1 because -1 is the label for the unclassified\n        unique_labels = len(set(labels)) - 1\n        # This will be the output of the LLM, the set of labels for each cluster\n        cluster_summaries: Dict[int, str] = {-1: self.default_label}\n\n        # Map from label to list of documents, will use them to select examples from each cluster\n        label2docs = defaultdict(list)\n        for i, label in enumerate(labels):\n            label2docs[label].append(i)\n\n        input_texts = self._prepare_input_texts(inputs, label2docs, unique_labels)\n\n        # Send the texts in batches to the LLM, and get the labels for each cluster\n        for i, batched_inputs in enumerate(batched(input_texts, self.input_batch_size)):\n            self._logger.info(f\"\ud83d\udce6 Processing internal batch of inputs {i}...\")\n            results = super().process(batched_inputs)\n            for result in next(results):  # Extract the elements from the generator\n                cluster_summaries[result[\"__LABEL\"]] = result[\"labels\"]\n\n        # Assign the labels to each text\n        for input in inputs:\n            input[\"summary_label\"] = json.dumps(\n                cluster_summaries[input[\"cluster_label\"]]\n            )\n\n        if self.savefig:\n            self._create_figure(inputs, label2docs, cluster_summaries)\n\n        yield inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering.inputs","title":"inputs: List[str] property","text":"

The inputs for the task are the same as those for TextClassification, plus the projection and cluster_label columns (which can be obtained from the UMAP + DBSCAN steps).

"},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering.outputs","title":"outputs: List[str] property","text":"

The outputs for the task are the summary_label and the model_name.

"},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering._save_figure","title":"_save_figure(data, cluster_centers, cluster_summaries)","text":"

Saves the figure starting from the dataframe, using matplotlib.

Parameters:

Name Type Description Default data DataFrame

pd.DataFrame with the columns 'X', 'Y' and 'labels' representing the projections and the label of each text respectively.

required cluster_centers Dict[str, Tuple[float, float]]

Dictionary mapping each label to the center of its cluster, to help with the placement of the annotations.

required cluster_summaries Dict[int, str]

The summaries of the clusters, obtained from the LLM.

required Source code in src/distilabel/steps/clustering/text_clustering.py
def _save_figure(\n    self,\n    data: pd.DataFrame,\n    cluster_centers: Dict[str, Tuple[float, float]],\n    cluster_summaries: Dict[int, str],\n) -> None:\n    \"\"\"Saves the figure starting from the dataframe, using matplotlib.\n\n    Args:\n        data: pd.DataFrame with the columns 'X', 'Y' and 'labels' representing\n            the projections and the label of each text respectively.\n        cluster_centers: Dictionary mapping from each label the center of a cluster,\n            to help with the placement of the annotations.\n        cluster_summaries: The summaries of the clusters, obtained from the LLM.\n    \"\"\"\n    import matplotlib.pyplot as plt\n\n    fig, ax = plt.subplots(figsize=(12, 8), dpi=300)\n    unique_labels = data[\"labels\"].unique()\n    # Map of colors for each label (-1 is black)\n    colormap = dict(\n        zip(unique_labels, plt.cm.Spectral(np.linspace(0, 1, len(unique_labels))))\n    )\n    colormap[-1] = np.array([0, 0, 0, 0])\n    data[\"color\"] = data[\"labels\"].map(colormap)\n\n    data.plot(\n        kind=\"scatter\",\n        x=\"X\",\n        y=\"Y\",\n        c=\"color\",\n        s=0.75,\n        alpha=0.8,\n        linewidth=0.4,\n        ax=ax,\n        colorbar=False,\n    )\n\n    for label in cluster_summaries.keys():\n        if label == -1:\n            continue\n        summary = str(cluster_summaries[label])  # These are obtained from the LLM\n        position = cluster_centers[label]\n        t = ax.text(\n            position[0],\n            position[1],\n            summary,\n            horizontalalignment=\"center\",\n            verticalalignment=\"center\",\n            fontsize=4,\n        )\n        t.set_bbox(\n            {\n                \"facecolor\": \"white\",\n                \"alpha\": 0.9,\n                \"linewidth\": 0,\n                \"boxstyle\": \"square,pad=0.1\",\n            }\n        )\n\n    ax.set_axis_off()\n    # Save the plot as an artifact of the step\n    
self.save_artifact(\n        name=\"Text clusters\",\n        write_function=lambda path: fig.savefig(path / \"figure_clustering.png\"),\n        metadata={\"type\": \"image\", \"library\": \"matplotlib\"},\n    )\n    plt.close()\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering._create_figure","title":"_create_figure(inputs, label2docs, cluster_summaries)","text":"

Creates a figure of the clustered texts and saves it as an artifact.

Parameters:

Name Type Description Default inputs StepInput

The inputs of the step, as we will extract information from them again.

required label2docs Dict[int, List[str]]

Map from each label to the list of documents (texts) that belong to that cluster.

required cluster_summaries Dict[int, str]

The summaries of the clusters, obtained from the LLM.

required Source code in src/distilabel/steps/clustering/text_clustering.py
def _create_figure(\n    self,\n    inputs: StepInput,\n    label2docs: Dict[int, List[str]],\n    cluster_summaries: Dict[int, str],\n) -> None:\n    \"\"\"Creates a figure of the clustered texts and save it as an artifact.\n\n    Args:\n        inputs: The inputs of the step, as we will extract information from them again.\n        label2docs: Map from each label to the list of documents (texts) that belong to that cluster.\n        cluster_summaries: The summaries of the clusters, obtained from the LLM.\n    \"\"\"\n    self._logger.info(\"\ud83d\uddbc\ufe0f Creating figure for the clusters...\")\n\n    labels = []\n    projections = []\n    id2cluster = {}\n    for i, input in enumerate(inputs):\n        label = input[\"cluster_label\"]\n        id2cluster[i] = label\n        labels.append(label)\n        projections.append(input[\"projection\"])\n\n    projections = np.array(projections)\n\n    # Contains the placement of the cluster centers in the figure\n    cluster_centers: Dict[str, Tuple[float, float]] = {}\n    for label in label2docs.keys():\n        x = np.mean([projections[doc, 0] for doc in label2docs[label]])\n        y = np.mean([projections[doc, 1] for doc in label2docs[label]])\n        cluster_centers[label] = (x, y)\n\n    df = pd.DataFrame(\n        data={\n            \"X\": projections[:, 0],\n            \"Y\": projections[:, 1],\n            \"labels\": labels,\n        }\n    )\n\n    self._save_figure(\n        df, cluster_centers=cluster_centers, cluster_summaries=cluster_summaries\n    )\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.TextClustering._prepare_input_texts","title":"_prepare_input_texts(inputs, label2docs, unique_labels)","text":"

Prepares a batch of inputs to send to the LLM, with the examples of each cluster.

Parameters:

Name Type Description Default inputs StepInput

Inputs from the step.

required label2docs Dict[int, List[int]]

Map from each label to the list of documents (texts) that belong to that cluster.

required unique_labels List[int]

The unique labels of the clusters.

required

Returns:

Type Description List[Dict[str, Union[str, int]]]

The input texts to send to the LLM, with the examples of each cluster

List[Dict[str, Union[str, int]]]

prepared to be used in the prompt, and an additional key to store the

List[Dict[str, Union[str, int]]]

labels (that will be needed to find the data after the batches are

List[Dict[str, Union[str, int]]]

returned from the LLM).

Source code in src/distilabel/steps/clustering/text_clustering.py
def _prepare_input_texts(\n    self,\n    inputs: StepInput,\n    label2docs: Dict[int, List[int]],\n    unique_labels: List[int],\n) -> List[Dict[str, Union[str, int]]]:\n    \"\"\"Prepares a batch of inputs to send to the LLM, with the examples of each cluster.\n\n    Args:\n        inputs: Inputs from the step.\n        label2docs: Map from each label to the list of documents (texts) that\n            belong to that cluster.\n        unique_labels: The unique labels of the clusters.\n\n    Returns:\n        The input texts to send to the LLM, with the examples of each cluster\n        prepared to be used in the prompt, and an additional key to store the\n        labels (that will be needed to find the data after the batches are\n        returned from the LLM).\n    \"\"\"\n    input_texts = []\n    for label in range(unique_labels):  # The label -1 is implicitly excluded\n        # Get the ids but remove possible duplicates, which could happen with bigger probability\n        # the bigger the number of examples requested, and the smaller the subset of examples\n        ids = set(\n            np.random.choice(label2docs[label], size=self.samples_per_cluster)\n        )  # Grab the number of examples\n        examples = [inputs[i][\"text\"] for i in ids]\n        input_text = {\n            \"text\": \"\\n\\n\".join(\n                [f\"Example {i}:\\n{t}\" for i, t in enumerate(examples, start=1)]\n            ),\n            \"__LABEL\": label,\n        }\n        input_texts.append(input_text)\n    return input_texts\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.UMAP","title":"UMAP","text":"

Bases: GlobalStep

UMAP is a general purpose manifold learning and dimension reduction algorithm.

This is a GlobalStep that reduces the dimensionality of the embeddings using UMAP. Visit the TextClustering step for an example of use. The trained model is saved as an artifact when creating a distiset and pushing it to the Hugging Face Hub.

Input columns
  • embedding (List[float]): The original embeddings whose dimensionality we want to reduce.
Output columns
  • projection (List[float]): Embedding reduced to the number of components specified; the size of the new embeddings is determined by n_components.
Categories
  • clustering
  • text-classification
References
  • UMAP repository
  • UMAP documentation

Attributes:

Name Type Description - n_components

The dimension of the space to embed into. This defaults to 2 to provide easy visualization (that's probably what you want), but can reasonably be set to any integer value in the range 2 to 100.

- metric

The metric to use to compute distances in high dimensional space. Visit UMAP's documentation for more information. Defaults to euclidean.

- n_jobs

The number of parallel jobs to run. Defaults to 8.

- random_state

The random state to use for the UMAP algorithm.

Runtime parameters
  • n_components: The dimension of the space to embed into. This defaults to 2 to provide easy visualization (that's probably what you want), but can reasonably be set to any integer value in the range 2 to 100.
  • metric: The metric to use to compute distances in high dimensional space. Visit UMAP's documentation for more information. Defaults to euclidean.
  • n_jobs: The number of parallel jobs to run. Defaults to 8.
  • random_state: The random state to use for the UMAP algorithm.
Citations
@misc{mcinnes2020umapuniformmanifoldapproximation,\n    title={UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction},\n    author={Leland McInnes and John Healy and James Melville},\n    year={2020},\n    eprint={1802.03426},\n    archivePrefix={arXiv},\n    primaryClass={stat.ML},\n    url={https://arxiv.org/abs/1802.03426},\n}\n
Source code in src/distilabel/steps/clustering/umap.py
class UMAP(GlobalStep):\n    r\"\"\"UMAP is a general purpose manifold learning and dimension reduction algorithm.\n\n    This is a `GlobalStep` that reduces the dimensionality of the embeddings using. Visit\n    the `TextClustering` step for an example of use. The trained model is saved as an artifact\n    when creating a distiset and pushing it to the Hugging Face Hub.\n\n    Input columns:\n        - embedding (`List[float]`): The original embeddings we want to reduce the dimension.\n\n    Output columns:\n        - projection (`List[float]`): Embedding reduced to the number of components specified,\n            the size of the new embeddings will be determined by the `n_components`.\n\n    Categories:\n        - clustering\n        - text-classification\n\n    References:\n        - [`UMAP repository`](https://github.com/lmcinnes/umap/tree/master)\n        - [`UMAP documentation`](https://umap-learn.readthedocs.io/en/latest/)\n\n    Attributes:\n        - n_components: The dimension of the space to embed into. This defaults to 2 to\n            provide easy visualization (that's probably what you want), but can\n            reasonably be set to any integer value in the range 2 to 100.\n        - metric: The metric to use to compute distances in high dimensional space.\n            Visit UMAP's documentation for more information. Defaults to `euclidean`.\n        - n_jobs: The number of parallel jobs to run. Defaults to `8`.\n        - random_state: The random state to use for the UMAP algorithm.\n\n    Runtime parameters:\n        - `n_components`: The dimension of the space to embed into. This defaults to 2 to\n            provide easy visualization (that's probably what you want), but can\n            reasonably be set to any integer value in the range 2 to 100.\n        - `metric`: The metric to use to compute distances in high dimensional space.\n            Visit UMAP's documentation for more information. 
Defaults to `euclidean`.\n        - `n_jobs`: The number of parallel jobs to run. Defaults to `8`.\n        - `random_state`: The random state to use for the UMAP algorithm.\n\n    Citations:\n        ```\n        @misc{mcinnes2020umapuniformmanifoldapproximation,\n            title={UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction},\n            author={Leland McInnes and John Healy and James Melville},\n            year={2020},\n            eprint={1802.03426},\n            archivePrefix={arXiv},\n            primaryClass={stat.ML},\n            url={https://arxiv.org/abs/1802.03426},\n        }\n        ```\n    \"\"\"\n\n    n_components: Optional[RuntimeParameter[int]] = Field(\n        default=2,\n        description=(\n            \"The dimension of the space to embed into. This defaults to 2 to \"\n            \"provide easy visualization, but can reasonably be set to any \"\n            \"integer value in the range 2 to 100.\"\n        ),\n    )\n    metric: Optional[RuntimeParameter[str]] = Field(\n        default=\"euclidean\",\n        description=(\n            \"The metric to use to compute distances in high dimensional space. \"\n            \"Visit UMAP's documentation for more information.\"\n        ),\n    )\n    n_jobs: Optional[RuntimeParameter[int]] = Field(\n        default=8, description=\"The number of parallel jobs to run.\"\n    )\n    random_state: Optional[RuntimeParameter[int]] = Field(\n        default=None, description=\"The random state to use for the UMAP algorithm.\"\n    )\n\n    _umap: Optional[\"_UMAP\"] = PrivateAttr(None)\n\n    def load(self) -> None:\n        super().load()\n        if importlib.util.find_spec(\"umap\") is None:\n            raise ImportError(\n                \"`umap` package is not installed. 
Please install it using `pip install umap-learn`.\"\n            )\n        from umap import UMAP as _UMAP\n\n        self._umap = _UMAP(\n            n_components=self.n_components,\n            metric=self.metric,\n            n_jobs=self.n_jobs,\n            random_state=self.random_state,\n        )\n\n    def unload(self) -> None:\n        self._umap = None\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"embedding\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"projection\"]\n\n    def _save_model(self, model: Any) -> None:\n        import joblib\n\n        def save_model(path):\n            with open(str(path / \"UMAP.joblib\"), \"wb\") as f:\n                joblib.dump(model, f)\n\n        self.save_artifact(\n            name=\"UMAP_model\",\n            write_function=lambda path: save_model(path),\n            metadata={\n                \"n_components\": self.n_components,\n                \"metric\": self.metric,\n            },\n        )\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        # Shape of the embeddings is (n_samples, n_features)\n        embeddings = np.array([input[\"embedding\"] for input in inputs])\n\n        self._logger.info(\"\ud83c\udfcb\ufe0f\u200d\u2640\ufe0f Start UMAP training...\")\n        mapper = self._umap.fit(embeddings)\n        # Shape of the projection will be (n_samples, n_components)\n        for input, projection in zip(inputs, mapper.embedding_):\n            input[\"projection\"] = projection\n\n        self._save_model(mapper)\n        yield inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.CombineOutputs","title":"CombineOutputs","text":"

Bases: Step

Combine the outputs of several upstream steps.

CombineOutputs is a Step that takes the outputs of several upstream steps and combines them to generate a new dictionary with all keys/columns of the upstream steps outputs.

Input columns
  • dynamic (based on the upstream Steps): All the columns of the upstream steps outputs.
Output columns
  • dynamic (based on the upstream Steps): All the columns of the upstream steps outputs.
Categories
  • columns

Examples:

Combine dictionaries of a dataset:\n\n```python\nfrom distilabel.steps import CombineOutputs\n\ncombine_outputs = CombineOutputs()\ncombine_outputs.load()\n\nresult = next(\n    combine_outputs.process(\n        [{\"a\": 1, \"b\": 2}, {\"a\": 3, \"b\": 4}],\n        [{\"c\": 5, \"d\": 6}, {\"c\": 7, \"d\": 8}],\n    )\n)\n# [\n#   {\"a\": 1, \"b\": 2, \"c\": 5, \"d\": 6},\n#   {\"a\": 3, \"b\": 4, \"c\": 7, \"d\": 8},\n# ]\n```\n\nCombine upstream steps outputs in a pipeline:\n\n```python\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import CombineOutputs\n\nwith Pipeline() as pipeline:\n    step_1 = ...\n    step_2 = ...\n    step_3 = ...\n    combine = CombineOutputs()\n\n    [step_1, step_2, step_3] >> combine\n```\n
Source code in src/distilabel/steps/columns/combine.py
class CombineOutputs(Step):\n    \"\"\"Combine the outputs of several upstream steps.\n\n    `CombineOutputs` is a `Step` that takes the outputs of several upstream steps and combines\n    them to generate a new dictionary with all keys/columns of the upstream steps outputs.\n\n    Input columns:\n        - dynamic (based on the upstream `Step`s): All the columns of the upstream steps outputs.\n\n    Output columns:\n        - dynamic (based on the upstream `Step`s): All the columns of the upstream steps outputs.\n\n    Categories:\n        - columns\n\n    Examples:\n\n        Combine dictionaries of a dataset:\n\n        ```python\n        from distilabel.steps import CombineOutputs\n\n        combine_outputs = CombineOutputs()\n        combine_outputs.load()\n\n        result = next(\n            combine_outputs.process(\n                [{\"a\": 1, \"b\": 2}, {\"a\": 3, \"b\": 4}],\n                [{\"c\": 5, \"d\": 6}, {\"c\": 7, \"d\": 8}],\n            )\n        )\n        # [\n        #   {\"a\": 1, \"b\": 2, \"c\": 5, \"d\": 6},\n        #   {\"a\": 3, \"b\": 4, \"c\": 7, \"d\": 8},\n        # ]\n        ```\n\n        Combine upstream steps outputs in a pipeline:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps import CombineOutputs\n\n        with Pipeline() as pipeline:\n            step_1 = ...\n            step_2 = ...\n            step_3 = ...\n            combine = CombineOutputs()\n\n            [step_1, step_2, step_3] >> combine\n        ```\n    \"\"\"\n\n    def process(self, *inputs: StepInput) -> \"StepOutput\":\n        combined_outputs = []\n        for output_dicts in zip(*inputs):\n            combined_dict = {}\n            for output_dict in output_dicts:\n                combined_dict.update(\n                    {\n                        k: v\n                        for k, v in output_dict.items()\n                        if k != DISTILABEL_METADATA_KEY\n                    }\n  
              )\n\n            if any(\n                DISTILABEL_METADATA_KEY in output_dict for output_dict in output_dicts\n            ):\n                combined_dict[DISTILABEL_METADATA_KEY] = merge_distilabel_metadata(\n                    *output_dicts\n                )\n            combined_outputs.append(combined_dict)\n\n        yield combined_outputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering","title":"DeitaFiltering","text":"

Bases: GlobalStep

Filter dataset rows using DEITA filtering strategy.

Filter the dataset based on the DEITA score and the cosine distance between the embeddings. It's an implementation of the filtering step from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.

Attributes:

Name Type Description data_budget RuntimeParameter[int]

The desired size of the dataset after filtering.

diversity_threshold RuntimeParameter[float]

If a row has a cosine distance with respect to its nearest neighbor greater than this value, it will be included in the filtered dataset. Defaults to 0.9.

normalize_embeddings RuntimeParameter[bool]

Whether to normalize the embeddings before computing the cosine distance. Defaults to True.

Runtime parameters
  • data_budget: The desired size of the dataset after filtering.
  • diversity_threshold: If a row has a cosine distance with respect to its nearest neighbor greater than this value, it will be included in the filtered dataset.
Input columns
  • evol_instruction_score (float): The score of the instruction generated by ComplexityScorer step.
  • evol_response_score (float): The score of the response generated by QualityScorer step.
  • embedding (List[float]): The embedding generated for the conversation of the instruction-response pair using GenerateEmbeddings step.
Output columns
  • deita_score (float): The DEITA score for the instruction-response pair.
  • deita_score_computed_with (List[str]): The scores used to compute the DEITA score.
  • nearest_neighbor_distance (float): The cosine distance between the embeddings of the instruction-response pair.
Categories
  • filtering
References
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning

Examples:

Filter the dataset based on the DEITA score and the cosine distance between the embeddings:

from distilabel.steps import DeitaFiltering\n\ndeita_filtering = DeitaFiltering(data_budget=1)\n\ndeita_filtering.load()\n\nresult = next(\n    deita_filtering.process(\n        [\n            {\n                \"evol_instruction_score\": 0.5,\n                \"evol_response_score\": 0.5,\n                \"embedding\": [-8.12729941, -5.24642847, -6.34003029],\n            },\n            {\n                \"evol_instruction_score\": 0.6,\n                \"evol_response_score\": 0.6,\n                \"embedding\": [2.99329242, 0.7800932, 0.7799726],\n            },\n            {\n                \"evol_instruction_score\": 0.7,\n                \"evol_response_score\": 0.7,\n                \"embedding\": [10.29041806, 14.33088073, 13.00557506],\n            },\n        ],\n    )\n)\n# >>> result\n# [{'evol_instruction_score': 0.5, 'evol_response_score': 0.5, 'embedding': [-8.12729941, -5.24642847, -6.34003029], 'deita_score': 0.25, 'deita_score_computed_with': ['evol_instruction_score', 'evol_response_score'], 'nearest_neighbor_distance': 1.9042812683723933}]\n
Citations
@misc{liu2024makesgooddataalignment,\n    title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n    author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n    year={2024},\n    eprint={2312.15685},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2312.15685},\n}\n
Source code in src/distilabel/steps/deita.py
class DeitaFiltering(GlobalStep):\n    \"\"\"Filter dataset rows using DEITA filtering strategy.\n\n    Filter the dataset based on the DEITA score and the cosine distance between the embeddings.\n    It's an implementation of the filtering step from the paper 'What Makes Good Data\n    for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.\n\n    Attributes:\n        data_budget: The desired size of the dataset after filtering.\n        diversity_threshold: If a row has a cosine distance with respect to it's nearest\n            neighbor greater than this value, it will be included in the filtered dataset.\n            Defaults to `0.9`.\n        normalize_embeddings: Whether to normalize the embeddings before computing the cosine\n            distance. Defaults to `True`.\n\n    Runtime parameters:\n        - `data_budget`: The desired size of the dataset after filtering.\n        - `diversity_threshold`: If a row has a cosine distance with respect to it's nearest\n            neighbor greater than this value, it will be included in the filtered dataset.\n\n    Input columns:\n        - evol_instruction_score (`float`): The score of the instruction generated by\n            `ComplexityScorer` step.\n        - evol_response_score (`float`): The score of the response generated by\n            `QualityScorer` step.\n        - embedding (`List[float]`): The embedding generated for the conversation of the\n            instruction-response pair using `GenerateEmbeddings` step.\n\n    Output columns:\n        - deita_score (`float`): The DEITA score for the instruction-response pair.\n        - deita_score_computed_with (`List[str]`): The scores used to compute the DEITA\n            score.\n        - nearest_neighbor_distance (`float`): The cosine distance between the embeddings\n            of the instruction-response pair.\n\n    Categories:\n        - filtering\n\n    References:\n        - [`What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685)\n\n    Examples:\n        Filter the dataset based on the DEITA score and the cosine distance between the embeddings:\n\n        ```python\n        from distilabel.steps import DeitaFiltering\n\n        deita_filtering = DeitaFiltering(data_budget=1)\n\n        deita_filtering.load()\n\n        result = next(\n            deita_filtering.process(\n                [\n                    {\n                        \"evol_instruction_score\": 0.5,\n                        \"evol_response_score\": 0.5,\n                        \"embedding\": [-8.12729941, -5.24642847, -6.34003029],\n                    },\n                    {\n                        \"evol_instruction_score\": 0.6,\n                        \"evol_response_score\": 0.6,\n                        \"embedding\": [2.99329242, 0.7800932, 0.7799726],\n                    },\n                    {\n                        \"evol_instruction_score\": 0.7,\n                        \"evol_response_score\": 0.7,\n                        \"embedding\": [10.29041806, 14.33088073, 13.00557506],\n                    },\n                ],\n            )\n        )\n        # >>> result\n        # [{'evol_instruction_score': 0.5, 'evol_response_score': 0.5, 'embedding': [-8.12729941, -5.24642847, -6.34003029], 'deita_score': 0.25, 'deita_score_computed_with': ['evol_instruction_score', 'evol_response_score'], 'nearest_neighbor_distance': 1.9042812683723933}]\n        ```\n\n    Citations:\n        ```\n        @misc{liu2024makesgooddataalignment,\n            title={What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n            author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n            year={2024},\n            eprint={2312.15685},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2312.15685},\n        }\n        ```\n    \"\"\"\n\n    data_budget: RuntimeParameter[int] = Field(\n        default=None, description=\"The desired size of the dataset after filtering.\"\n    )\n    diversity_threshold: RuntimeParameter[float] = Field(\n        default=0.9,\n        description=\"If a row has a cosine distance with respect to it's nearest neighbor\"\n        \" greater than this value, it will be included in the filtered dataset.\",\n    )\n    normalize_embeddings: RuntimeParameter[bool] = Field(\n        default=True,\n        description=\"Whether to normalize the embeddings before computing the cosine distance.\",\n    )\n    distance_metric: RuntimeParameter[Literal[\"cosine\", \"manhattan\"]] = Field(\n        default=\"cosine\",\n        description=\"The distance metric to use. 
Currently only 'cosine' is supported.\",\n    )\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"evol_instruction_score\", \"evol_response_score\", \"embedding\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"deita_score\", \"nearest_neighbor_distance\", \"deita_score_computed_with\"]\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Filter the dataset based on the DEITA score and the cosine distance between the\n        embeddings.\n\n        Args:\n            inputs: The input data.\n\n        Returns:\n            The filtered dataset.\n        \"\"\"\n        inputs = self._compute_deita_score(inputs)\n        inputs = self._compute_nearest_neighbor(inputs)\n        inputs.sort(key=lambda x: x[\"deita_score\"], reverse=True)\n\n        selected_rows = []\n        for input in inputs:\n            if len(selected_rows) >= self.data_budget:  # type: ignore\n                break\n            if input[\"nearest_neighbor_distance\"] >= self.diversity_threshold:\n                selected_rows.append(input)\n        yield selected_rows\n\n    def _compute_deita_score(self, inputs: StepInput) -> StepInput:\n        \"\"\"Computes the DEITA score for each instruction-response pair. 
The DEITA score is\n        the product of the instruction score and the response score.\n\n        Args:\n            inputs: The input data.\n\n        Returns:\n            The input data with the DEITA score computed.\n        \"\"\"\n        for input_ in inputs:\n            evol_instruction_score = input_.get(\"evol_instruction_score\")\n            evol_response_score = input_.get(\"evol_response_score\")\n\n            if evol_instruction_score and evol_response_score:\n                deita_score = evol_instruction_score * evol_response_score\n                score_computed_with = [\"evol_instruction_score\", \"evol_response_score\"]\n            elif evol_instruction_score:\n                self._logger.warning(\n                    \"Response score is missing for the instruction-response pair. Using\"\n                    \" instruction score as DEITA score.\"\n                )\n                deita_score = evol_instruction_score\n                score_computed_with = [\"evol_instruction_score\"]\n            elif evol_response_score:\n                self._logger.warning(\n                    \"Instruction score is missing for the instruction-response pair. Using\"\n                    \" response score as DEITA score.\"\n                )\n                deita_score = evol_response_score\n                score_computed_with = [\"evol_response_score\"]\n            else:\n                self._logger.warning(\n                    \"Instruction and response scores are missing for the instruction-response\"\n                    \" pair. 
Setting DEITA score to 0.\"\n                )\n                deita_score = 0\n                score_computed_with = []\n\n            input_.update(\n                {\n                    \"deita_score\": deita_score,\n                    \"deita_score_computed_with\": score_computed_with,\n                }\n            )\n        return inputs\n\n    def _compute_nearest_neighbor(self, inputs: StepInput) -> StepInput:\n        \"\"\"Computes the cosine distance between the embeddings of the instruction-response\n        pairs and the nearest neighbor.\n\n        Args:\n            inputs: The input data.\n\n        Returns:\n            The input data with the cosine distance computed.\n        \"\"\"\n        embeddings = np.array([input[\"embedding\"] for input in inputs])\n        if self.normalize_embeddings:\n            embeddings = self._normalize_embeddings(embeddings)\n        self._logger.info(\"\ud83d\udccf Computing nearest neighbor distance...\")\n\n        if self.distance_metric == \"cosine\":\n            self._logger.info(\"\ud83d\udccf Using cosine distance.\")\n            distances = self._cosine_distance(embeddings)\n        else:\n            self._logger.info(\"\ud83d\udccf Using manhattan distance.\")\n            distances = self._manhattan_distance(embeddings)\n\n        for distance, input in zip(distances, inputs):\n            input[\"nearest_neighbor_distance\"] = distance\n        return inputs\n\n    def _normalize_embeddings(self, embeddings: np.ndarray) -> np.ndarray:\n        \"\"\"Normalize the embeddings.\n\n        Args:\n            embeddings: The embeddings to normalize.\n\n        Returns:\n            The normalized embeddings.\n        \"\"\"\n        self._logger.info(\"\u2696\ufe0f Normalizing embeddings...\")\n        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)\n        return embeddings / norms\n\n    def _cosine_distance(self, embeddings: np.array) -> np.array:  # type: ignore\n        
\"\"\"Computes the cosine distance between the embeddings.\n\n        Args:\n            embeddings: The embeddings.\n\n        Returns:\n            The cosine distance between the embeddings.\n        \"\"\"\n        cosine_similarity = np.dot(embeddings, embeddings.T)\n        cosine_distance = 1 - cosine_similarity\n        # Ignore self-distance\n        np.fill_diagonal(cosine_distance, np.inf)\n        return np.min(cosine_distance, axis=1)\n\n    def _manhattan_distance(self, embeddings: np.array) -> np.array:  # type: ignore\n        \"\"\"Computes the manhattan distance between the embeddings.\n\n        Args:\n            embeddings: The embeddings.\n\n        Returns:\n            The manhattan distance between the embeddings.\n        \"\"\"\n        manhattan_distance = np.abs(embeddings[:, None] - embeddings).sum(-1)\n        # Ignore self-distance\n        np.fill_diagonal(manhattan_distance, np.inf)\n        return np.min(manhattan_distance, axis=1)\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering.process","title":"process(inputs)","text":"

Filter the dataset based on the DEITA score and the cosine distance between the embeddings.

Parameters:

Name Type Description Default inputs StepInput

The input data.

required

Returns:

Type Description StepOutput

The filtered dataset.

Source code in src/distilabel/steps/deita.py
def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Filter the dataset based on the DEITA score and the cosine distance between the\n    embeddings.\n\n    Args:\n        inputs: The input data.\n\n    Returns:\n        The filtered dataset.\n    \"\"\"\n    inputs = self._compute_deita_score(inputs)\n    inputs = self._compute_nearest_neighbor(inputs)\n    inputs.sort(key=lambda x: x[\"deita_score\"], reverse=True)\n\n    selected_rows = []\n    for input in inputs:\n        if len(selected_rows) >= self.data_budget:  # type: ignore\n            break\n        if input[\"nearest_neighbor_distance\"] >= self.diversity_threshold:\n            selected_rows.append(input)\n    yield selected_rows\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._compute_deita_score","title":"_compute_deita_score(inputs)","text":"

Computes the DEITA score for each instruction-response pair. The DEITA score is the product of the instruction score and the response score.

Parameters:

Name Type Description Default inputs StepInput

The input data.

required

Returns:

Type Description StepInput

The input data with the DEITA score computed.

Source code in src/distilabel/steps/deita.py
def _compute_deita_score(self, inputs: StepInput) -> StepInput:\n    \"\"\"Computes the DEITA score for each instruction-response pair. The DEITA score is\n    the product of the instruction score and the response score.\n\n    Args:\n        inputs: The input data.\n\n    Returns:\n        The input data with the DEITA score computed.\n    \"\"\"\n    for input_ in inputs:\n        evol_instruction_score = input_.get(\"evol_instruction_score\")\n        evol_response_score = input_.get(\"evol_response_score\")\n\n        if evol_instruction_score and evol_response_score:\n            deita_score = evol_instruction_score * evol_response_score\n            score_computed_with = [\"evol_instruction_score\", \"evol_response_score\"]\n        elif evol_instruction_score:\n            self._logger.warning(\n                \"Response score is missing for the instruction-response pair. Using\"\n                \" instruction score as DEITA score.\"\n            )\n            deita_score = evol_instruction_score\n            score_computed_with = [\"evol_instruction_score\"]\n        elif evol_response_score:\n            self._logger.warning(\n                \"Instruction score is missing for the instruction-response pair. Using\"\n                \" response score as DEITA score.\"\n            )\n            deita_score = evol_response_score\n            score_computed_with = [\"evol_response_score\"]\n        else:\n            self._logger.warning(\n                \"Instruction and response scores are missing for the instruction-response\"\n                \" pair. Setting DEITA score to 0.\"\n            )\n            deita_score = 0\n            score_computed_with = []\n\n        input_.update(\n            {\n                \"deita_score\": deita_score,\n                \"deita_score_computed_with\": score_computed_with,\n            }\n        )\n    return inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._compute_nearest_neighbor","title":"_compute_nearest_neighbor(inputs)","text":"

Computes the distance between the embedding of each instruction-response pair and its nearest neighbor, using the configured distance metric (cosine or Manhattan).

Parameters:

Name Type Description Default inputs StepInput

The input data.

required

Returns:

Type Description StepInput

The input data with the cosine distance computed.

Source code in src/distilabel/steps/deita.py
def _compute_nearest_neighbor(self, inputs: StepInput) -> StepInput:\n    \"\"\"Computes the cosine distance between the embeddings of the instruction-response\n    pairs and the nearest neighbor.\n\n    Args:\n        inputs: The input data.\n\n    Returns:\n        The input data with the cosine distance computed.\n    \"\"\"\n    embeddings = np.array([input[\"embedding\"] for input in inputs])\n    if self.normalize_embeddings:\n        embeddings = self._normalize_embeddings(embeddings)\n    self._logger.info(\"\ud83d\udccf Computing nearest neighbor distance...\")\n\n    if self.distance_metric == \"cosine\":\n        self._logger.info(\"\ud83d\udccf Using cosine distance.\")\n        distances = self._cosine_distance(embeddings)\n    else:\n        self._logger.info(\"\ud83d\udccf Using manhattan distance.\")\n        distances = self._manhattan_distance(embeddings)\n\n    for distance, input in zip(distances, inputs):\n        input[\"nearest_neighbor_distance\"] = distance\n    return inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._normalize_embeddings","title":"_normalize_embeddings(embeddings)","text":"

Normalize the embeddings.

Parameters:

Name Type Description Default embeddings ndarray

The embeddings to normalize.

required

Returns:

Type Description ndarray

The normalized embeddings.

Source code in src/distilabel/steps/deita.py
def _normalize_embeddings(self, embeddings: np.ndarray) -> np.ndarray:\n    \"\"\"Normalize the embeddings.\n\n    Args:\n        embeddings: The embeddings to normalize.\n\n    Returns:\n        The normalized embeddings.\n    \"\"\"\n    self._logger.info(\"\u2696\ufe0f Normalizing embeddings...\")\n    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)\n    return embeddings / norms\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._cosine_distance","title":"_cosine_distance(embeddings)","text":"

Computes the cosine distance between the embeddings.

Parameters:

Name Type Description Default embeddings array

The embeddings.

required

Returns:

Type Description array

The cosine distance between the embeddings.

Source code in src/distilabel/steps/deita.py
def _cosine_distance(self, embeddings: np.array) -> np.array:  # type: ignore\n    \"\"\"Computes the cosine distance between the embeddings.\n\n    Args:\n        embeddings: The embeddings.\n\n    Returns:\n        The cosine distance between the embeddings.\n    \"\"\"\n    cosine_similarity = np.dot(embeddings, embeddings.T)\n    cosine_distance = 1 - cosine_similarity\n    # Ignore self-distance\n    np.fill_diagonal(cosine_distance, np.inf)\n    return np.min(cosine_distance, axis=1)\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.DeitaFiltering._manhattan_distance","title":"_manhattan_distance(embeddings)","text":"

Computes the manhattan distance between the embeddings.

Parameters:

Name Type Description Default embeddings array

The embeddings.

required

Returns:

Type Description array

The manhattan distance between the embeddings.

Source code in src/distilabel/steps/deita.py
def _manhattan_distance(self, embeddings: np.array) -> np.array:  # type: ignore\n    \"\"\"Computes the manhattan distance between the embeddings.\n\n    Args:\n        embeddings: The embeddings.\n\n    Returns:\n        The manhattan distance between the embeddings.\n    \"\"\"\n    manhattan_distance = np.abs(embeddings[:, None] - embeddings).sum(-1)\n    # Ignore self-distance\n    np.fill_diagonal(manhattan_distance, np.inf)\n    return np.min(manhattan_distance, axis=1)\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.EmbeddingGeneration","title":"EmbeddingGeneration","text":"

Bases: Step

Generate embeddings using an Embeddings model.

EmbeddingGeneration is a Step that uses an Embeddings model to generate sentence embeddings for the provided input texts.

Attributes:

Name Type Description embeddings Embeddings

the Embeddings model used to generate the sentence embeddings.

Input columns
  • text (str): The text for which the sentence embedding has to be generated.
Output columns
  • embedding (List[Union[float, int]]): the generated sentence embedding.
Categories
  • embedding

Examples:

Generate sentence embeddings with Sentence Transformers:

from distilabel.models import SentenceTransformerEmbeddings\nfrom distilabel.steps import EmbeddingGeneration\n\nembedding_generation = EmbeddingGeneration(\n    embeddings=SentenceTransformerEmbeddings(\n        model=\"mixedbread-ai/mxbai-embed-large-v1\",\n    )\n)\n\nembedding_generation.load()\n\nresult = next(embedding_generation.process([{\"text\": \"Hello, how are you?\"}]))\n# [{'text': 'Hello, how are you?', 'embedding': [0.06209656596183777, -0.015797119587659836, ...]}]\n
Source code in src/distilabel/steps/embeddings/embedding_generation.py
class EmbeddingGeneration(Step):\n    \"\"\"Generate embeddings using an `Embeddings` model.\n\n    `EmbeddingGeneration` is a `Step` that using an `Embeddings` model generates sentence\n    embeddings for the provided input texts.\n\n    Attributes:\n        embeddings: the `Embeddings` model used to generate the sentence embeddings.\n\n    Input columns:\n        - text (`str`): The text for which the sentence embedding has to be generated.\n\n    Output columns:\n        - embedding (`List[Union[float, int]]`): the generated sentence embedding.\n\n    Categories:\n        - embedding\n\n    Examples:\n        Generate sentence embeddings with Sentence Transformers:\n\n        ```python\n        from distilabel.models import SentenceTransformerEmbeddings\n        from distilabel.steps import EmbeddingGeneration\n\n        embedding_generation = EmbeddingGeneration(\n            embeddings=SentenceTransformerEmbeddings(\n                model=\"mixedbread-ai/mxbai-embed-large-v1\",\n            )\n        )\n\n        embedding_generation.load()\n\n        result = next(embedding_generation.process([{\"text\": \"Hello, how are you?\"}]))\n        # [{'text': 'Hello, how are you?', 'embedding': [0.06209656596183777, -0.015797119587659836, ...]}]\n        ```\n\n    \"\"\"\n\n    embeddings: Embeddings\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        return [\"text\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        return [\"embedding\", \"model_name\"]\n\n    def load(self) -> None:\n        \"\"\"Loads the `Embeddings` model.\"\"\"\n        super().load()\n\n        self.embeddings.load()\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        embeddings = self.embeddings.encode(inputs=[input[\"text\"] for input in inputs])\n        for input, embedding in zip(inputs, embeddings):\n            input[\"embedding\"] = embedding\n            input[\"model_name\"] = self.embeddings.model_name\n    
    yield inputs\n\n    def unload(self) -> None:\n        super().unload()\n        self.embeddings.unload()\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.EmbeddingGeneration.load","title":"load()","text":"

Loads the Embeddings model.

Source code in src/distilabel/steps/embeddings/embedding_generation.py
def load(self) -> None:\n    \"\"\"Loads the `Embeddings` model.\"\"\"\n    super().load()\n\n    self.embeddings.load()\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FaissNearestNeighbour","title":"FaissNearestNeighbour","text":"

Bases: GlobalStep

Create a faiss index to get the nearest neighbours.

FaissNearestNeighbour is a GlobalStep that creates a faiss index using the Hugging Face datasets library integration, and then gets the nearest neighbours and the scores or distance of the nearest neighbours for each input row.

Attributes:

Name Type Description device Optional[RuntimeParameter[Union[int, List[int]]]]

the CUDA device ID or a list of IDs to be used. If a negative integer is given, it will use all the available GPUs. Defaults to None.

string_factory Optional[RuntimeParameter[str]]

the name of the factory to be used to build the faiss index. Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes. Defaults to None.

metric_type Optional[RuntimeParameter[int]]

the metric to be used to measure the distance between the points. It's an integer, and the recommended way to pass it is to import faiss and then pass one of the faiss.METRIC_x variables. Defaults to None.

k Optional[RuntimeParameter[int]]

the number of nearest neighbours to search for each input row. Defaults to 1.

search_batch_size Optional[RuntimeParameter[int]]

the number of rows to include in a search batch. The value can be adjusted to maximize the resources usage or to avoid OOM issues. Defaults to 50.

train_size Optional[RuntimeParameter[int]]

If the index needs a training step, specifies how many vectors will be used to train the index.

Runtime parameters
  • device: the CUDA device ID or a list of IDs to be used. If a negative integer is given, it will use all the available GPUs. Defaults to None.
  • string_factory: the name of the factory to be used to build the faiss index. Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes. Defaults to None.
  • metric_type: the metric to be used to measure the distance between the points. It's an integer, and the recommended way to pass it is to import faiss and then pass one of the faiss.METRIC_x variables. Defaults to None.
  • k: the number of nearest neighbours to search for each input row. Defaults to 1.
  • search_batch_size: the number of rows to include in a search batch. The value can be adjusted to maximize the resources usage or to avoid OOM issues. Defaults to 50.
  • train_size: If the index needs a training step, specifies how many vectors will be used to train the index.
Input columns
  • embedding (List[Union[float, int]]): a sentence embedding.
Output columns
  • nn_indices (List[int]): a list containing the indices of the k nearest neighbours in the inputs for the row.
  • nn_scores (List[float]): a list containing the score or distance to each k nearest neighbour in the inputs.
Categories
  • embedding
References
  • The Faiss library

Examples:

Generating embeddings and getting the nearest neighbours:

from distilabel.models import SentenceTransformerEmbeddings\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import EmbeddingGeneration, FaissNearestNeighbour, LoadDataFromHub\n\nwith Pipeline(name=\"hello\") as pipeline:\n    load_data = LoadDataFromHub(output_mappings={\"prompt\": \"text\"})\n\n    embeddings = EmbeddingGeneration(\n        embeddings=SentenceTransformerEmbeddings(\n            model=\"mixedbread-ai/mxbai-embed-large-v1\"\n        )\n    )\n\n    nearest_neighbours = FaissNearestNeighbour()\n\n    load_data >> embeddings >> nearest_neighbours\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        parameters={\n            load_data.name: {\n                \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n                \"split\": \"test\",\n            },\n        },\n        use_cache=False,\n    )\n
Citations
@misc{douze2024faisslibrary,\n    title={The Faiss library},\n    author={Matthijs Douze and Alexandr Guzhva and Chengqi Deng and Jeff Johnson and Gergely Szilvasy and Pierre-Emmanuel Mazar\u00e9 and Maria Lomeli and Lucas Hosseini and Herv\u00e9 J\u00e9gou},\n    year={2024},\n    eprint={2401.08281},\n    archivePrefix={arXiv},\n    primaryClass={cs.LG},\n    url={https://arxiv.org/abs/2401.08281},\n}\n
Source code in src/distilabel/steps/embeddings/nearest_neighbour.py
class FaissNearestNeighbour(GlobalStep):\n    \"\"\"Create a `faiss` index to get the nearest neighbours.\n\n    `FaissNearestNeighbour` is a `GlobalStep` that creates a `faiss` index using the Hugging\n    Face `datasets` library integration, and then gets the nearest neighbours and the scores\n    or distance of the nearest neighbours for each input row.\n\n    Attributes:\n        device: the CUDA device ID or a list of IDs to be used. If negative integer, it\n            will use all the available GPUs. Defaults to `None`.\n        string_factory: the name of the factory to be used to build the `faiss` index.\n            Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes.\n            Defaults to `None`.\n        metric_type: the metric to be used to measure the distance between the points. It's\n            an integer and the recommend way to pass it is importing `faiss` and then passing\n            one of `faiss.METRIC_x` variables. Defaults to `None`.\n        k: the number of nearest neighbours to search for each input row. Defaults to `1`.\n        search_batch_size: the number of rows to include in a search batch. The value can\n            be adjusted to maximize the resources usage or to avoid OOM issues. Defaults\n            to `50`.\n        train_size: If the index needs a training step, specifies how many vectors will be\n            used to train the index.\n\n    Runtime parameters:\n        - `device`: the CUDA device ID or a list of IDs to be used. If negative integer,\n            it will use all the available GPUs. 
Defaults to `None`.\n        - `string_factory`: the name of the factory to be used to build the `faiss` index.\n            Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes.\n            Defaults to `None`.\n        - `metric_type`: the metric to be used to measure the distance between the points.\n            It's an integer and the recommend way to pass it is importing `faiss` and then\n            passing one of `faiss.METRIC_x` variables. Defaults to `None`.\n        - `k`: the number of nearest neighbours to search for each input row. Defaults to `1`.\n        - `search_batch_size`: the number of rows to include in a search batch. The value\n            can be adjusted to maximize the resources usage or to avoid OOM issues. Defaults\n            to `50`.\n        - `train_size`: If the index needs a training step, specifies how many vectors will\n            be used to train the index.\n\n    Input columns:\n        - embedding (`List[Union[float, int]]`): a sentence embedding.\n\n    Output columns:\n        - nn_indices (`List[int]`): a list containing the indices of the `k` nearest neighbours\n            in the inputs for the row.\n        - nn_scores (`List[float]`): a list containing the score or distance to each `k`\n            nearest neighbour in the inputs.\n\n    Categories:\n        - embedding\n\n    References:\n        - [`The Faiss library`](https://arxiv.org/abs/2401.08281)\n\n    Examples:\n        Generating embeddings and getting the nearest neighbours:\n\n        ```python\n        from distilabel.models import SentenceTransformerEmbeddings\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps import EmbeddingGeneration, FaissNearestNeighbour, LoadDataFromHub\n\n        with Pipeline(name=\"hello\") as pipeline:\n            load_data = LoadDataFromHub(output_mappings={\"prompt\": \"text\"})\n\n            embeddings = EmbeddingGeneration(\n            
    embeddings=SentenceTransformerEmbeddings(\n                    model=\"mixedbread-ai/mxbai-embed-large-v1\"\n                )\n            )\n\n            nearest_neighbours = FaissNearestNeighbour()\n\n            load_data >> embeddings >> nearest_neighbours\n\n        if __name__ == \"__main__\":\n            distiset = pipeline.run(\n                parameters={\n                    load_data.name: {\n                        \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n                        \"split\": \"test\",\n                    },\n                },\n                use_cache=False,\n            )\n        ```\n\n    Citations:\n        ```\n        @misc{douze2024faisslibrary,\n            title={The Faiss library},\n            author={Matthijs Douze and Alexandr Guzhva and Chengqi Deng and Jeff Johnson and Gergely Szilvasy and Pierre-Emmanuel Mazar\u00e9 and Maria Lomeli and Lucas Hosseini and Herv\u00e9 J\u00e9gou},\n            year={2024},\n            eprint={2401.08281},\n            archivePrefix={arXiv},\n            primaryClass={cs.LG},\n            url={https://arxiv.org/abs/2401.08281},\n        }\n        ```\n    \"\"\"\n\n    device: Optional[RuntimeParameter[Union[int, List[int]]]] = Field(\n        default=None,\n        description=\"The CUDA device ID or a list of IDs to be used. If negative integer,\"\n        \" it will use all the available GPUs.\",\n    )\n    string_factory: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The name of the factory to be used to build the `faiss` index.\"\n        \"Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes.\",\n    )\n    metric_type: Optional[RuntimeParameter[int]] = Field(\n        default=None,\n        description=\"The metric to be used to measure the distance between the points. 
It's\"\n        \" an integer and the recommend way to pass it is importing `faiss` and thenpassing\"\n        \" one of `faiss.METRIC_x` variables.\",\n    )\n    k: Optional[RuntimeParameter[int]] = Field(\n        default=1,\n        description=\"The number of nearest neighbours to search for each input row.\",\n    )\n    search_batch_size: Optional[RuntimeParameter[int]] = Field(\n        default=50,\n        description=\"The number of rows to include in a search batch. The value can be adjusted\"\n        \" to maximize the resources usage or to avoid OOM issues.\",\n    )\n    train_size: Optional[RuntimeParameter[int]] = Field(\n        default=None,\n        description=\"If the index needs a training step, specifies how many vectors will be used to train the index.\",\n    )\n\n    def load(self) -> None:\n        super().load()\n\n        if importlib.util.find_spec(\"faiss\") is None:\n            raise ImportError(\n                \"`faiss` package is not installed. Please install it using `pip install\"\n                \" faiss-cpu` or `pip install faiss-gpu`.\"\n            )\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"embedding\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"nn_indices\", \"nn_scores\"]\n\n    def _build_index(self, inputs: List[Dict[str, Any]]) -> Dataset:\n        \"\"\"Builds a `faiss` index using `datasets` integration.\n\n        Args:\n            inputs: a list of dictionaries.\n\n        Returns:\n            The build `datasets.Dataset` with its `faiss` index.\n        \"\"\"\n        dataset = Dataset.from_list(inputs)\n        if self.train_size is not None and self.string_factory:\n            self._logger.info(\"\ud83c\udfcb\ufe0f\u200d\u2640\ufe0f Starting Faiss index training...\")\n        dataset.add_faiss_index(\n            column=\"embedding\",\n            device=self.device,  # type: ignore\n            string_factory=self.string_factory,\n            
metric_type=self.metric_type,\n            train_size=self.train_size,\n        )\n        return dataset\n\n    def _save_index(self, dataset: Dataset) -> None:\n        \"\"\"Save the generated Faiss index as an artifact of the step.\n\n        Args:\n            dataset: the dataset with the `faiss` index built.\n        \"\"\"\n        self.save_artifact(\n            name=\"faiss_index\",\n            write_function=lambda path: dataset.save_faiss_index(\n                index_name=\"embedding\", file=path / \"index.faiss\"\n            ),\n            metadata={\n                \"num_rows\": len(dataset),\n                \"embedding_dim\": len(dataset[0][\"embedding\"]),\n            },\n        )\n\n    def _search(self, dataset: Dataset) -> Dataset:\n        \"\"\"Search the top `k` nearest neighbours for each row in the dataset.\n\n        Args:\n            dataset: the dataset with the `faiss` index built.\n\n        Returns:\n            The updated dataset containing the top `k` nearest neighbours for each row,\n            as well as the score or distance.\n        \"\"\"\n\n        def add_search_results(examples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:\n            queries = np.array(examples[\"embedding\"])\n            results = dataset.search_batch(\n                index_name=\"embedding\",\n                queries=queries,\n                k=self.k + 1,  # type: ignore\n            )\n            examples[\"nn_indices\"] = [indices[1:] for indices in results.total_indices]\n            examples[\"nn_scores\"] = [scores[1:] for scores in results.total_scores]\n            return examples\n\n        return dataset.map(\n            add_search_results, batched=True, batch_size=self.search_batch_size\n        )\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        dataset = self._build_index(inputs)\n        dataset_with_search_results = self._search(dataset)\n        self._save_index(dataset)\n        
yield dataset_with_search_results.to_list()\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FaissNearestNeighbour._build_index","title":"_build_index(inputs)","text":"

Builds a faiss index using datasets integration.

Parameters:

Name Type Description Default inputs List[Dict[str, Any]]

a list of dictionaries.

required

Returns:

Type Description Dataset

The built datasets.Dataset with its faiss index.

Source code in src/distilabel/steps/embeddings/nearest_neighbour.py
def _build_index(self, inputs: List[Dict[str, Any]]) -> Dataset:\n    \"\"\"Builds a `faiss` index using `datasets` integration.\n\n    Args:\n        inputs: a list of dictionaries.\n\n    Returns:\n        The build `datasets.Dataset` with its `faiss` index.\n    \"\"\"\n    dataset = Dataset.from_list(inputs)\n    if self.train_size is not None and self.string_factory:\n        self._logger.info(\"\ud83c\udfcb\ufe0f\u200d\u2640\ufe0f Starting Faiss index training...\")\n    dataset.add_faiss_index(\n        column=\"embedding\",\n        device=self.device,  # type: ignore\n        string_factory=self.string_factory,\n        metric_type=self.metric_type,\n        train_size=self.train_size,\n    )\n    return dataset\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FaissNearestNeighbour._save_index","title":"_save_index(dataset)","text":"

Save the generated Faiss index as an artifact of the step.

Parameters:

Name Type Description Default dataset Dataset

the dataset with the faiss index built.

required Source code in src/distilabel/steps/embeddings/nearest_neighbour.py
def _save_index(self, dataset: Dataset) -> None:\n    \"\"\"Save the generated Faiss index as an artifact of the step.\n\n    Args:\n        dataset: the dataset with the `faiss` index built.\n    \"\"\"\n    self.save_artifact(\n        name=\"faiss_index\",\n        write_function=lambda path: dataset.save_faiss_index(\n            index_name=\"embedding\", file=path / \"index.faiss\"\n        ),\n        metadata={\n            \"num_rows\": len(dataset),\n            \"embedding_dim\": len(dataset[0][\"embedding\"]),\n        },\n    )\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FaissNearestNeighbour._search","title":"_search(dataset)","text":"

Search the top k nearest neighbours for each row in the dataset.

Parameters:

Name Type Description Default dataset Dataset

the dataset with the faiss index built.

required

Returns:

Type Description Dataset

The updated dataset containing the top k nearest neighbours for each row,

Dataset

as well as the score or distance.

Source code in src/distilabel/steps/embeddings/nearest_neighbour.py
def _search(self, dataset: Dataset) -> Dataset:\n    \"\"\"Search the top `k` nearest neighbours for each row in the dataset.\n\n    Args:\n        dataset: the dataset with the `faiss` index built.\n\n    Returns:\n        The updated dataset containing the top `k` nearest neighbours for each row,\n        as well as the score or distance.\n    \"\"\"\n\n    def add_search_results(examples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:\n        queries = np.array(examples[\"embedding\"])\n        results = dataset.search_batch(\n            index_name=\"embedding\",\n            queries=queries,\n            k=self.k + 1,  # type: ignore\n        )\n        examples[\"nn_indices\"] = [indices[1:] for indices in results.total_indices]\n        examples[\"nn_scores\"] = [scores[1:] for scores in results.total_scores]\n        return examples\n\n    return dataset.map(\n        add_search_results, batched=True, batch_size=self.search_batch_size\n    )\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.EmbeddingDedup","title":"EmbeddingDedup","text":"

Bases: GlobalStep

Deduplicates text using embeddings.

EmbeddingDedup is a Step that detects near-duplicates in datasets, using embeddings to compare the similarity between the texts. The typical workflow with this step would include having a dataset with embeddings precomputed, and then (possibly using the FaissNearestNeighbour step) using the nn_indices and nn_scores to determine which texts are duplicates.

Attributes:

Name Type Description threshold Optional[RuntimeParameter[float]]

the threshold to consider 2 examples as duplicates. It's dependent on the type of index that was used to generate the embeddings. For example, if the embeddings were generated using cosine similarity, a threshold of 0.9 would make all the texts with a cosine similarity above the value duplicates. Higher values detect fewer duplicates in such an index, but that should be taken into account when building it. Defaults to 0.9.

Runtime Parameters
  • threshold: the threshold to consider 2 examples as duplicates.
Input columns
  • nn_indices (List[int]): a list containing the indices of the k nearest neighbours in the inputs for the row.
  • nn_scores (List[float]): a list containing the score or distance to each k nearest neighbour in the inputs.
Output columns
  • keep_row_after_embedding_filtering (bool): boolean indicating whether the text is not a duplicate, i.e. whether this text should be kept.
Categories
  • filtering

Examples:

Deduplicate a list of texts using embedding information:\n\n```python\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import EmbeddingDedup\nfrom distilabel.steps import LoadDataFromDicts\n\nwith Pipeline() as pipeline:\n    data = LoadDataFromDicts(\n        data=[\n            {\n                \"persona\": \"A chemistry student or academic researcher interested in inorganic or physical chemistry, likely at an advanced undergraduate or graduate level, studying acid-base interactions and chemical bonding.\",\n                \"embedding\": [\n                    0.018477669046149742,\n                    -0.03748236608841726,\n                    0.001919870620352492,\n                    0.024918478063770535,\n                    0.02348063521315178,\n                    0.0038251285566308375,\n                    -0.01723884983037716,\n                    0.02881971942372201,\n                ],\n                \"nn_indices\": [0, 1],\n                \"nn_scores\": [\n                    0.9164746999740601,\n                    0.782106876373291,\n                ],\n            },\n            {\n                \"persona\": \"A music teacher or instructor focused on theoretical and practical piano lessons.\",\n                \"embedding\": [\n                    -0.0023464179614082125,\n                    -0.07325472251663565,\n                    -0.06058678419516501,\n                    -0.02100326928586996,\n                    -0.013462744792362657,\n                    0.027368447064244242,\n                    -0.003916070100455717,\n                    0.01243614518480423,\n                ],\n                \"nn_indices\": [0, 2],\n                \"nn_scores\": [\n                    0.7552462220191956,\n                    0.7261884808540344,\n                ],\n            },\n            {\n                \"persona\": \"A classical guitar teacher or instructor, likely with experience teaching beginners, who focuses on 
breaking down complex music notation into understandable steps for their students.\",\n                \"embedding\": [\n                    -0.01630817942328242,\n                    -0.023760151552345232,\n                    -0.014249650090627883,\n                    -0.005713686451446624,\n                    -0.016033059279131567,\n                    0.0071440908501058786,\n                    -0.05691099643425161,\n                    0.01597412704817784,\n                ],\n                \"nn_indices\": [1, 2],\n                \"nn_scores\": [\n                    0.8107735514640808,\n                    0.7172299027442932,\n                ],\n            },\n        ],\n        batch_size=batch_size,\n    )\n    # In general you should do something like this before the deduplication step, to obtain the\n    # `nn_indices` and `nn_scores`. In this case the embeddings are already normalized, so there's\n    # no need for it.\n    # nn = FaissNearestNeighbour(\n    #     k=30,\n    #     metric_type=faiss.METRIC_INNER_PRODUCT,\n    #     search_batch_size=50,\n    #     train_size=len(dataset),              # The number of embeddings to use for training\n    #     string_factory=\"IVF300_HNSW32,Flat\"   # To use an index (optional, maybe required for big datasets)\n    # )\n    # Read more about the `string_factory` here:\n    # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index\n\n    embedding_dedup = EmbeddingDedup(\n        threshold=0.8,\n        input_batch_size=batch_size,\n    )\n\n    data >> embedding_dedup\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(use_cache=False)\n    ds = distiset[\"default\"][\"train\"]\n    # Filter out the duplicates\n    ds_dedup = ds.filter(lambda x: x[\"keep_row_after_embedding_filtering\"])\n```\n
Source code in src/distilabel/steps/filtering/embedding.py
class EmbeddingDedup(GlobalStep):\n    \"\"\"Deduplicates text using embeddings.\n\n    `EmbeddingDedup` is a Step that detects near-duplicates in datasets, using\n    embeddings to compare the similarity between the texts. The typical workflow with this step\n    would include having a dataset with embeddings precomputed, and then (possibly using the\n    `FaissNearestNeighbour`) using the `nn_indices` and `nn_scores`, determine the texts that\n    are duplicate.\n\n    Attributes:\n        threshold: the threshold to consider 2 examples as duplicates.\n            It's dependent on the type of index that was used to generate the embeddings.\n            For example, if the embeddings were generated using cosine similarity, a threshold\n            of `0.9` would make all the texts with a cosine similarity above the value\n            duplicates. Higher values detect less duplicates in such an index, but that should\n            be taken into account when building it. Defaults to `0.9`.\n\n    Runtime Parameters:\n        - `threshold`: the threshold to consider 2 examples as duplicates.\n\n    Input columns:\n        - nn_indices (`List[int]`): a list containing the indices of the `k` nearest neighbours\n            in the inputs for the row.\n        - nn_scores (`List[float]`): a list containing the score or distance to each `k`\n            nearest neighbour in the inputs.\n\n    Output columns:\n        - keep_row_after_embedding_filtering (`bool`): boolean indicating if the piece `text` is\n            not a duplicate i.e. 
this text should be kept.\n\n    Categories:\n        - filtering\n\n    Examples:\n\n        Deduplicate a list of texts using embedding information:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps import EmbeddingDedup\n        from distilabel.steps import LoadDataFromDicts\n\n        with Pipeline() as pipeline:\n            data = LoadDataFromDicts(\n                data=[\n                    {\n                        \"persona\": \"A chemistry student or academic researcher interested in inorganic or physical chemistry, likely at an advanced undergraduate or graduate level, studying acid-base interactions and chemical bonding.\",\n                        \"embedding\": [\n                            0.018477669046149742,\n                            -0.03748236608841726,\n                            0.001919870620352492,\n                            0.024918478063770535,\n                            0.02348063521315178,\n                            0.0038251285566308375,\n                            -0.01723884983037716,\n                            0.02881971942372201,\n                        ],\n                        \"nn_indices\": [0, 1],\n                        \"nn_scores\": [\n                            0.9164746999740601,\n                            0.782106876373291,\n                        ],\n                    },\n                    {\n                        \"persona\": \"A music teacher or instructor focused on theoretical and practical piano lessons.\",\n                        \"embedding\": [\n                            -0.0023464179614082125,\n                            -0.07325472251663565,\n                            -0.06058678419516501,\n                            -0.02100326928586996,\n                            -0.013462744792362657,\n                            0.027368447064244242,\n                            -0.003916070100455717,\n                            
0.01243614518480423,\n                        ],\n                        \"nn_indices\": [0, 2],\n                        \"nn_scores\": [\n                            0.7552462220191956,\n                            0.7261884808540344,\n                        ],\n                    },\n                    {\n                        \"persona\": \"A classical guitar teacher or instructor, likely with experience teaching beginners, who focuses on breaking down complex music notation into understandable steps for their students.\",\n                        \"embedding\": [\n                            -0.01630817942328242,\n                            -0.023760151552345232,\n                            -0.014249650090627883,\n                            -0.005713686451446624,\n                            -0.016033059279131567,\n                            0.0071440908501058786,\n                            -0.05691099643425161,\n                            0.01597412704817784,\n                        ],\n                        \"nn_indices\": [1, 2],\n                        \"nn_scores\": [\n                            0.8107735514640808,\n                            0.7172299027442932,\n                        ],\n                    },\n                ],\n                batch_size=batch_size,\n            )\n            # In general you should do something like this before the deduplication step, to obtain the\n            # `nn_indices` and `nn_scores`. 
In this case the embeddings are already normalized, so there's\n            # no need for it.\n            # nn = FaissNearestNeighbour(\n            #     k=30,\n            #     metric_type=faiss.METRIC_INNER_PRODUCT,\n            #     search_batch_size=50,\n            #     train_size=len(dataset),              # The number of embeddings to use for training\n            #     string_factory=\"IVF300_HNSW32,Flat\"   # To use an index (optional, maybe required for big datasets)\n            # )\n            # Read more about the `string_factory` here:\n            # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index\n\n            embedding_dedup = EmbeddingDedup(\n                threshold=0.8,\n                input_batch_size=batch_size,\n            )\n\n            data >> embedding_dedup\n\n        if __name__ == \"__main__\":\n            distiset = pipeline.run(use_cache=False)\n            ds = distiset[\"default\"][\"train\"]\n            # Filter out the duplicates\n            ds_dedup = ds.filter(lambda x: x[\"keep_row_after_embedding_filtering\"])\n        ```\n    \"\"\"\n\n    threshold: Optional[RuntimeParameter[float]] = Field(\n        default=0.9,\n        description=\"The threshold to consider 2 examples as duplicates. It's dependent \"\n        \"on the type of index that was used to generate the embeddings. For example, if \"\n        \"the embeddings were generated using cosine similarity, a threshold of `0.9` \"\n        \"would make all the texts with a cosine similarity above the value duplicates. 
\"\n        \"Higher values detect less duplicates in such an index, but that should be \"\n        \"taken into account when building it.\",\n    )\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"nn_scores\", \"nn_indices\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"keep_row_after_embedding_filtering\"]\n\n    @override\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        rows_to_remove = set()\n\n        for input in track(inputs, description=\"Running Embedding deduplication...\"):\n            input[\"keep_row_after_embedding_filtering\"] = True\n            indices_scores = np.array(input[\"nn_scores\"]) > self.threshold\n            indices = np.array(input[\"nn_indices\"])[indices_scores]\n            if len(indices) > 0:  # If there are any rows found over the threshold\n                rows_to_remove.update(list(indices))\n\n        # Remove duplicates and get the list of rows to remove\n        for idx in rows_to_remove:\n            inputs[idx][\"keep_row_after_embedding_filtering\"] = False\n\n        yield inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.MinHashDedup","title":"MinHashDedup","text":"

Bases: Step

Deduplicates text using MinHash and MinHashLSH.

MinHashDedup is a Step that detects near-duplicates in datasets. The idea roughly translates to the following steps: 1. Tokenize the text into words or ngrams. 2. Create a MinHash for each text. 3. Store the MinHashes in a MinHashLSH. 4. Check if the MinHash is already in the LSH, if so, it is a duplicate.

Attributes:

Name Type Description num_perm int

the number of permutations to use. Defaults to 128.

seed int

the seed to use for the MinHash. This seed must be the same as the one used for MinHash; keep this in mind when both steps are created. Defaults to 1.

tokenizer Literal['words', 'ngrams']

the tokenizer to use. Available ones are words or ngrams. If words is selected, it tokenizes the text into words using nltk's word tokenizer. If ngrams is selected, it splits the text into ngrams of size n. Defaults to words.

n Optional[int]

the size of the ngrams to use. Only relevant if tokenizer=\"ngrams\". Defaults to 5.

threshold float

the threshold to consider two MinHashes as duplicates. Values closer to 0 detect more duplicates. Defaults to 0.9.

storage Literal['dict', 'disk']

the storage to use for the LSH. Can be dict to store the index in memory, or disk. Keep in mind, disk is an experimental feature not defined in datasketch, that is based on DiskCache's Index class. It should work as a dict, but backed by disk, though depending on the system it can be slower. Note that disk is an experimental feature that may cause issues. Defaults to dict.

Input columns
  • text (str): the texts to be filtered.
Output columns
  • keep_row_after_minhash_filtering (bool): boolean indicating whether the text is not a duplicate, i.e. whether this text should be kept.
Categories
  • filtering
References
  • datasketch documentation
  • Identifying and Filtering Near-Duplicate Documents
  • Diskcache's Index

Examples:

Deduplicate a list of texts using MinHash and MinHashLSH:\n\n```python\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import MinHashDedup\nfrom distilabel.steps import LoadDataFromDicts\n\nwith Pipeline() as pipeline:\n    ds_size = 1000\n    batch_size = 500  # Bigger batch sizes work better for this step\n    data = LoadDataFromDicts(\n        data=[\n            {\"text\": \"This is a test document.\"},\n            {\"text\": \"This document is a test.\"},\n            {\"text\": \"Test document for duplication.\"},\n            {\"text\": \"Document for duplication test.\"},\n            {\"text\": \"This is another unique document.\"},\n        ]\n        * (ds_size // 5),\n        batch_size=batch_size,\n    )\n    minhash_dedup = MinHashDedup(\n        tokenizer=\"words\",\n        threshold=0.9,      # lower values will increase the number of duplicates\n        storage=\"dict\",     # or \"disk\" for bigger datasets\n    )\n\n    data >> minhash_dedup\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(use_cache=False)\n    ds = distiset[\"default\"][\"train\"]\n    # Filter out the duplicates\n    ds_dedup = ds.filter(lambda x: x[\"keep_row_after_minhash_filtering\"])\n```\n
Source code in src/distilabel/steps/filtering/minhash.py
class MinHashDedup(Step):\n    \"\"\"Deduplicates text using `MinHash` and `MinHashLSH`.\n\n    `MinHashDedup` is a Step that detects near-duplicates in datasets. The idea roughly translates\n    to the following steps:\n    1. Tokenize the text into words or ngrams.\n    2. Create a `MinHash` for each text.\n    3. Store the `MinHashes` in a `MinHashLSH`.\n    4. Check if the `MinHash` is already in the `LSH`, if so, it is a duplicate.\n\n    Attributes:\n        num_perm: the number of permutations to use. Defaults to `128`.\n        seed: the seed to use for the MinHash. This seed must be the same\n            used for `MinHash`, keep in mind when both steps are created. Defaults to `1`.\n        tokenizer: the tokenizer to use. Available ones are `words` or `ngrams`.\n            If `words` is selected, it tokenize the text into words using nltk's\n            word tokenizer. `ngram` estimates the ngrams (together with the size\n            `n`) using. Defaults to `words`.\n        n: the size of the ngrams to use. Only relevant if `tokenizer=\"ngrams\"`. Defaults to `5`.\n        threshold: the threshold to consider two MinHashes as duplicates.\n            Values closer to 0 detect more duplicates. Defaults to `0.9`.\n        storage: the storage to use for the LSH. Can be `dict` to store the index\n            in memory, or `disk`. Keep in mind, `disk` is an experimental feature\n            not defined in `datasketch`, that is based on DiskCache's `Index` class.\n            It should work as a `dict`, but backed by disk, but depending on the system\n            it can be slower. Defaults to `dict`.\n            which uses a custom `shelve` backend. Note the `disk`\n            is an experimetal feature that may cause issues. 
Defaults to `dict`.\n\n    Input columns:\n        - text (`str`): the texts to be filtered.\n\n    Output columns:\n        - keep_row_after_minhash_filtering (`bool`): boolean indicating if the piece `text` is\n            not a duplicate i.e. this text should be kept.\n\n    Categories:\n        - filtering\n\n    References:\n        - [`datasketch documentation`](https://ekzhu.github.io/datasketch/lsh.html)\n        - [Identifying and Filtering Near-Duplicate Documents](https://cs.brown.edu/courses/cs253/papers/nearduplicate.pdf)\n        - [Diskcache's Index](https://grantjenks.com/docs/diskcache/api.html#diskcache.Index)\n\n    Examples:\n\n        Deduplicate a list of texts using MinHash and MinHashLSH:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps import MinHashDedup\n        from distilabel.steps import LoadDataFromDicts\n\n        with Pipeline() as pipeline:\n            ds_size = 1000\n            batch_size = 500  # Bigger batch sizes work better for this step\n            data = LoadDataFromDicts(\n                data=[\n                    {\"text\": \"This is a test document.\"},\n                    {\"text\": \"This document is a test.\"},\n                    {\"text\": \"Test document for duplication.\"},\n                    {\"text\": \"Document for duplication test.\"},\n                    {\"text\": \"This is another unique document.\"},\n                ]\n                * (ds_size // 5),\n                batch_size=batch_size,\n            )\n            minhash_dedup = MinHashDedup(\n                tokenizer=\"words\",\n                threshold=0.9,      # lower values will increase the number of duplicates\n                storage=\"dict\",     # or \"disk\" for bigger datasets\n            )\n\n            data >> minhash_dedup\n\n        if __name__ == \"__main__\":\n            distiset = pipeline.run(use_cache=False)\n            ds = distiset[\"default\"][\"train\"]\n    
        # Filter out the duplicates\n            ds_dedup = ds.filter(lambda x: x[\"keep_row_after_minhash_filtering\"])\n        ```\n    \"\"\"\n\n    num_perm: int = 128\n    seed: int = 1\n    tokenizer: Literal[\"words\", \"ngrams\"] = \"words\"\n    n: Optional[int] = 5\n    threshold: float = 0.9\n    storage: Literal[\"dict\", \"disk\"] = \"dict\"\n\n    _hasher: Union[\"MinHash\", None] = PrivateAttr(None)\n    _tokenizer: Union[Callable, None] = PrivateAttr(None)\n    _lhs: Union[\"MinHashLSH\", None] = PrivateAttr(None)\n\n    def load(self) -> None:\n        super().load()\n        if not importlib.import_module(\"datasketch\"):\n            raise ImportError(\n                \"`datasketch` is needed to deduplicate with MinHash, but is not installed. \"\n                \"Please install it using `pip install datasketch`.\"\n            )\n        from datasketch import MinHash\n\n        from distilabel.steps.filtering._datasketch import MinHashLSH\n\n        self._hasher = MinHash.bulk\n        self._lsh = MinHashLSH(\n            num_perm=self.num_perm,\n            threshold=self.threshold,\n            storage_config={\"type\": self.storage},\n        )\n\n        if self.tokenizer == \"words\":\n            if not importlib.import_module(\"nltk\"):\n                raise ImportError(\n                    \"`nltk` is needed to tokenize based on words, but is not installed. \"\n                    \"Please install it using `pip install nltk`. 
Then run `nltk.download('punkt_tab')`.\"\n                )\n            self._tokenizer = tokenized_on_words\n        else:\n            self._tokenizer = partial(tokenize_on_ngrams, n=self.n)\n\n    def unload(self) -> None:\n        super().unload()\n        # In case of LSH being stored in disk, we need to close the file.\n        if self.storage == \"disk\":\n            self._lsh.close()\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"text\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"keep_row_after_minhash_filtering\"]\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":\n        tokenized_texts = []\n        for input in inputs:\n            tokenized_texts.append(self._tokenizer([input[self.inputs[0]]])[0])\n\n        minhashes = self._hasher(\n            tokenized_texts, num_perm=self.num_perm, seed=self.seed\n        )\n\n        for input, minhash in zip(inputs, minhashes):\n            # Check if the text is already in the LSH index\n            if self._lsh.query(minhash):\n                input[\"keep_row_after_minhash_filtering\"] = False\n            else:\n                self._lsh.insert(str(uuid.uuid4()), minhash)\n                input[\"keep_row_after_minhash_filtering\"] = True\n\n        yield inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.ConversationTemplate","title":"ConversationTemplate","text":"

Bases: Step

Generate a conversation template from an instruction and a response.

Input columns
  • instruction (str): The instruction to be used in the conversation.
  • response (str): The response to be used in the conversation.
Output columns
  • conversation (ChatType): The conversation template.
Categories
  • format
  • chat
  • template

Examples:

Create a conversation from an instruction and a response:

from distilabel.steps import ConversationTemplate\n\nconv_template = ConversationTemplate()\nconv_template.load()\n\nresult = next(\n    conv_template.process(\n        [\n            {\n                \"instruction\": \"Hello\",\n                \"response\": \"Hi\",\n            }\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'Hello', 'response': 'Hi', 'conversation': [{'role': 'user', 'content': 'Hello'}, {'role': 'assistant', 'content': 'Hi'}]}]\n
Source code in src/distilabel/steps/formatting/conversation.py
class ConversationTemplate(Step):\n    \"\"\"Generate a conversation template from an instruction and a response.\n\n    Input columns:\n        - instruction (`str`): The instruction to be used in the conversation.\n        - response (`str`): The response to be used in the conversation.\n\n    Output columns:\n        - conversation (`ChatType`): The conversation template.\n\n    Categories:\n        - format\n        - chat\n        - template\n\n    Examples:\n        Create a conversation from an instruction and a response:\n\n        ```python\n        from distilabel.steps import ConversationTemplate\n\n        conv_template = ConversationTemplate()\n        conv_template.load()\n\n        result = next(\n            conv_template.process(\n                [\n                    {\n                        \"instruction\": \"Hello\",\n                        \"response\": \"Hi\",\n                    }\n                ],\n            )\n        )\n        # >>> result\n        # [{'instruction': 'Hello', 'response': 'Hi', 'conversation': [{'role': 'user', 'content': 'Hello'}, {'role': 'assistant', 'content': 'Hi'}]}]\n        ```\n    \"\"\"\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The instruction and response.\"\"\"\n        return [\"instruction\", \"response\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The conversation template.\"\"\"\n        return [\"conversation\"]\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Generate a conversation template from an instruction and a response.\n\n        Args:\n            inputs: The input data.\n\n        Yields:\n            The input data with the conversation template.\n        \"\"\"\n        for input in inputs:\n            input[\"conversation\"] = [\n                {\"role\": \"user\", \"content\": input[\"instruction\"]},\n                {\"role\": \"assistant\", \"content\": 
input[\"response\"]},\n            ]\n        yield inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.ConversationTemplate.inputs","title":"inputs: StepColumns property","text":"

The instruction and response.

"},{"location":"api/step_gallery/extra/#distilabel.steps.ConversationTemplate.outputs","title":"outputs: StepColumns property","text":"

The conversation template.

"},{"location":"api/step_gallery/extra/#distilabel.steps.ConversationTemplate.process","title":"process(inputs)","text":"

Generate a conversation template from an instruction and a response.

Parameters:

Name Type Description Default inputs StepInput

The input data.

required

Yields:

Type Description StepOutput

The input data with the conversation template.

Source code in src/distilabel/steps/formatting/conversation.py
def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Generate a conversation template from an instruction and a response.\n\n    Args:\n        inputs: The input data.\n\n    Yields:\n        The input data with the conversation template.\n    \"\"\"\n    for input in inputs:\n        input[\"conversation\"] = [\n            {\"role\": \"user\", \"content\": input[\"instruction\"]},\n            {\"role\": \"assistant\", \"content\": input[\"response\"]},\n        ]\n    yield inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO","title":"FormatChatGenerationDPO","text":"

Bases: Step

Format the output of a combination of a ChatGeneration + a preference task for Direct Preference Optimization (DPO).

FormatChatGenerationDPO is a Step that formats the output of the combination of a ChatGeneration task with a preference Task, i.e. a task generating ratings such as UltraFeedback, following the standard formatting from frameworks such as axolotl or alignment-handbook, so that those are used to rank the existing generations and provide the chosen and rejected generations based on the ratings.

Note

The messages column should contain at least one message from the user, the generations column should contain at least two generations, and the ratings column should contain the same number of ratings as generations.

Input columns
  • messages (List[Dict[str, str]]): The conversation messages.
  • generations (List[str]): The generations produced by the LLM.
  • generation_models (List[str], optional): The model names used to generate the generations, only available if the model_name from the ChatGeneration task/s is combined into a single column named this way, otherwise, it will be ignored.
  • ratings (List[float]): The ratings for each of the generations, produced by a preference task such as UltraFeedback.
Output columns
  • prompt (str): The user message used to generate the generations with the LLM.
  • prompt_id (str): The SHA256 hash of the prompt.
  • chosen (List[Dict[str, str]]): The chosen generation based on the ratings.
  • chosen_model (str, optional): The model name used to generate the chosen generation, if the generation_models are available.
  • chosen_rating (float): The rating of the chosen generation.
  • rejected (List[Dict[str, str]]): The rejected generation based on the ratings.
  • rejected_model (str, optional): The model name used to generate the rejected generation, if the generation_models are available.
  • rejected_rating (float): The rating of the rejected generation.
Categories
  • format
  • chat-generation
  • preference
  • messages
  • generations

Examples:

Format your dataset for DPO fine tuning:

from distilabel.steps import FormatChatGenerationDPO\n\nformat_dpo = FormatChatGenerationDPO()\nformat_dpo.load()\n\n# NOTE: \"generation_models\" can be added optionally.\nresult = next(\n    format_dpo.process(\n        [\n            {\n                \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n                \"generations\": [\"4\", \"5\", \"6\"],\n                \"ratings\": [1, 0, -1],\n            }\n        ]\n    )\n)\n# >>> result\n# [\n#     {\n#         'messages': [{'role': 'user', 'content': \"What's 2+2?\"}],\n#         'generations': ['4', '5', '6'],\n#         'ratings': [1, 0, -1],\n#         'prompt': \"What's 2+2?\",\n#         'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n#         'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n#         'chosen_rating': 1,\n#         'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n#         'rejected_rating': -1\n#     }\n# ]\n
Source code in src/distilabel/steps/formatting/dpo.py
class FormatChatGenerationDPO(Step):\n    \"\"\"Format the output of a combination of a `ChatGeneration` + a preference task for Direct Preference Optimization (DPO).\n\n    `FormatChatGenerationDPO` is a `Step` that formats the output of the combination of a `ChatGeneration`\n    task with a preference `Task` i.e. a task generating `ratings` such as `UltraFeedback` following the standard\n    formatting from frameworks such as `axolotl` or `alignment-handbook`., so that those are used to rank the\n    existing generations and provide the `chosen` and `rejected` generations based on the `ratings`.\n\n    Note:\n        The `messages` column should contain at least one message from the user, the `generations`\n        column should contain at least two generations, the `ratings` column should contain the same\n        number of ratings as generations.\n\n    Input columns:\n        - messages (`List[Dict[str, str]]`): The conversation messages.\n        - generations (`List[str]`): The generations produced by the `LLM`.\n        - generation_models (`List[str]`, optional): The model names used to generate the `generations`,\n            only available if the `model_name` from the `ChatGeneration` task/s is combined into a single\n            column named this way, otherwise, it will be ignored.\n        - ratings (`List[float]`): The ratings for each of the `generations`, produced by a preference\n            task such as `UltraFeedback`.\n\n    Output columns:\n        - prompt (`str`): The user message used to generate the `generations` with the `LLM`.\n        - prompt_id (`str`): The `SHA256` hash of the `prompt`.\n        - chosen (`List[Dict[str, str]]`): The `chosen` generation based on the `ratings`.\n        - chosen_model (`str`, optional): The model name used to generate the `chosen` generation,\n            if the `generation_models` are available.\n        - chosen_rating (`float`): The rating of the `chosen` generation.\n        - rejected 
(`List[Dict[str, str]]`): The `rejected` generation based on the `ratings`.\n        - rejected_model (`str`, optional): The model name used to generate the `rejected` generation,\n            if the `generation_models` are available.\n        - rejected_rating (`float`): The rating of the `rejected` generation.\n\n    Categories:\n        - format\n        - chat-generation\n        - preference\n        - messages\n        - generations\n\n    Examples:\n        Format your dataset for DPO fine tuning:\n\n        ```python\n        from distilabel.steps import FormatChatGenerationDPO\n\n        format_dpo = FormatChatGenerationDPO()\n        format_dpo.load()\n\n        # NOTE: \"generation_models\" can be added optionally.\n        result = next(\n            format_dpo.process(\n                [\n                    {\n                        \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n                        \"generations\": [\"4\", \"5\", \"6\"],\n                        \"ratings\": [1, 0, -1],\n                    }\n                ]\n            )\n        )\n        # >>> result\n        # [\n        #     {\n        #         'messages': [{'role': 'user', 'content': \"What's 2+2?\"}],\n        #         'generations': ['4', '5', '6'],\n        #         'ratings': [1, 0, -1],\n        #         'prompt': \"What's 2+2?\",\n        #         'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n        #         'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n        #         'chosen_rating': 1,\n        #         'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n        #         'rejected_rating': -1\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"List of inputs required by the `Step`, which in this case are: `messages`, 
`generations`,\n        and `ratings`.\"\"\"\n        return [\"messages\", \"generations\", \"ratings\"]\n\n    @property\n    def optional_inputs(self) -> List[str]:\n        \"\"\"List of optional inputs, which are not required by the `Step` but used if available,\n        which in this case is: `generation_models`.\"\"\"\n        return [\"generation_models\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"List of outputs generated by the `Step`, which are: `prompt`, `prompt_id`, `chosen`,\n        `chosen_model`, `chosen_rating`, `rejected`, `rejected_model`, `rejected_rating`. Both\n        the `chosen_model` and `rejected_model` being optional and only used if `generation_models`\n        is available.\n\n        Reference:\n            - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n        \"\"\"\n        return [\n            \"prompt\",\n            \"prompt_id\",\n            \"chosen\",\n            \"chosen_model\",\n            \"chosen_rating\",\n            \"rejected\",\n            \"rejected_model\",\n            \"rejected_rating\",\n        ]\n\n    def process(self, *inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n        according to the DPO formatting standard.\n\n        Args:\n            *inputs: A list of `StepInput` to be combined.\n\n        Yields:\n            A `StepOutput` with batches of formatted `StepInput` following the DPO standard.\n        \"\"\"\n        for input in inputs:\n            for item in input:\n                item[\"prompt\"] = next(\n                    (\n                        turn[\"content\"]\n                        for turn in item[\"messages\"]\n                        if turn[\"role\"] == \"user\"\n                    ),\n                    None,\n                )\n                item[\"prompt_id\"] = hashlib.sha256(\n                    
item[\"prompt\"].encode(\"utf-8\")  # type: ignore\n                ).hexdigest()\n\n                chosen_idx = max(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n                item[\"chosen\"] = item[\"messages\"] + [\n                    {\n                        \"role\": \"assistant\",\n                        \"content\": item[\"generations\"][chosen_idx],\n                    }\n                ]\n                if \"generation_models\" in item:\n                    item[\"chosen_model\"] = item[\"generation_models\"][chosen_idx]\n                item[\"chosen_rating\"] = item[\"ratings\"][chosen_idx]\n\n                rejected_idx = min(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n                item[\"rejected\"] = item[\"messages\"] + [\n                    {\n                        \"role\": \"assistant\",\n                        \"content\": item[\"generations\"][rejected_idx],\n                    }\n                ]\n                if \"generation_models\" in item:\n                    item[\"rejected_model\"] = item[\"generation_models\"][rejected_idx]\n                item[\"rejected_rating\"] = item[\"ratings\"][rejected_idx]\n\n            yield input\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO.inputs","title":"inputs: StepColumns property","text":"

List of inputs required by the Step, which in this case are: messages, generations, and ratings.

"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO.optional_inputs","title":"optional_inputs: List[str] property","text":"

List of optional inputs, which are not required by the Step but used if available, which in this case is: generation_models.

"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO.outputs","title":"outputs: StepColumns property","text":"

List of outputs generated by the Step, which are: prompt, prompt_id, chosen, chosen_model, chosen_rating, rejected, rejected_model, rejected_rating. Both chosen_model and rejected_model are optional and only used if generation_models is available.

Reference
  • Format inspired by https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationDPO.process","title":"process(*inputs)","text":"

The process method formats the received StepInput or list of StepInput according to the DPO formatting standard.

Parameters:

Name Type Description Default *inputs StepInput

A list of StepInput to be combined.

()

Yields:

Type Description StepOutput

A StepOutput with batches of formatted StepInput following the DPO standard.

Source code in src/distilabel/steps/formatting/dpo.py
def process(self, *inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n    according to the DPO formatting standard.\n\n    Args:\n        *inputs: A list of `StepInput` to be combined.\n\n    Yields:\n        A `StepOutput` with batches of formatted `StepInput` following the DPO standard.\n    \"\"\"\n    for input in inputs:\n        for item in input:\n            item[\"prompt\"] = next(\n                (\n                    turn[\"content\"]\n                    for turn in item[\"messages\"]\n                    if turn[\"role\"] == \"user\"\n                ),\n                None,\n            )\n            item[\"prompt_id\"] = hashlib.sha256(\n                item[\"prompt\"].encode(\"utf-8\")  # type: ignore\n            ).hexdigest()\n\n            chosen_idx = max(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n            item[\"chosen\"] = item[\"messages\"] + [\n                {\n                    \"role\": \"assistant\",\n                    \"content\": item[\"generations\"][chosen_idx],\n                }\n            ]\n            if \"generation_models\" in item:\n                item[\"chosen_model\"] = item[\"generation_models\"][chosen_idx]\n            item[\"chosen_rating\"] = item[\"ratings\"][chosen_idx]\n\n            rejected_idx = min(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n            item[\"rejected\"] = item[\"messages\"] + [\n                {\n                    \"role\": \"assistant\",\n                    \"content\": item[\"generations\"][rejected_idx],\n                }\n            ]\n            if \"generation_models\" in item:\n                item[\"rejected_model\"] = item[\"generation_models\"][rejected_idx]\n            item[\"rejected_rating\"] = item[\"ratings\"][rejected_idx]\n\n        yield input\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO","title":"FormatTextGenerationDPO","text":"

Bases: Step

Format the output of your LLMs for Direct Preference Optimization (DPO).

FormatTextGenerationDPO is a Step that formats the output of the combination of a TextGeneration task with a preference Task i.e. a task generating ratings, so that those are used to rank the existing generations and provide the chosen and rejected generations based on the ratings. Use this step to transform the output of a combination of a TextGeneration + a preference task such as UltraFeedback following the standard formatting from frameworks such as axolotl or alignment-handbook.

Note

The generations column should contain at least two generations, and the ratings column should contain the same number of ratings as generations.

Input columns
  • system_prompt (str, optional): The system prompt used within the LLM to generate the generations, if available.
  • instruction (str): The instruction used to generate the generations with the LLM.
  • generations (List[str]): The generations produced by the LLM.
  • generation_models (List[str], optional): The model names used to generate the generations, only available if the model_name from the TextGeneration task/s is combined into a single column named this way, otherwise, it will be ignored.
  • ratings (List[float]): The ratings for each of the generations, produced by a preference task such as UltraFeedback.
Output columns
  • prompt (str): The instruction used to generate the generations with the LLM.
  • prompt_id (str): The SHA256 hash of the prompt.
  • chosen (List[Dict[str, str]]): The chosen generation based on the ratings.
  • chosen_model (str, optional): The model name used to generate the chosen generation, if the generation_models are available.
  • chosen_rating (float): The rating of the chosen generation.
  • rejected (List[Dict[str, str]]): The rejected generation based on the ratings.
  • rejected_model (str, optional): The model name used to generate the rejected generation, if the generation_models are available.
  • rejected_rating (float): The rating of the rejected generation.
Categories
  • format
  • text-generation
  • preference
  • instruction
  • generations

Examples:

Format your dataset for DPO fine tuning:

from distilabel.steps import FormatTextGenerationDPO\n\nformat_dpo = FormatTextGenerationDPO()\nformat_dpo.load()\n\n# NOTE: Both \"system_prompt\" and \"generation_models\" can be added optionally.\nresult = next(\n    format_dpo.process(\n        [\n            {\n                \"instruction\": \"What's 2+2?\",\n                \"generations\": [\"4\", \"5\", \"6\"],\n                \"ratings\": [1, 0, -1],\n            }\n        ]\n    )\n)\n# >>> result\n# [\n#    {   'instruction': \"What's 2+2?\",\n#        'generations': ['4', '5', '6'],\n#        'ratings': [1, 0, -1],\n#        'prompt': \"What's 2+2?\",\n#        'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n#        'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n#        'chosen_rating': 1,\n#        'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n#        'rejected_rating': -1\n#    }\n# ]\n
Source code in src/distilabel/steps/formatting/dpo.py
class FormatTextGenerationDPO(Step):\n    \"\"\"Format the output of your LLMs for Direct Preference Optimization (DPO).\n\n    `FormatTextGenerationDPO` is a `Step` that formats the output of the combination of a `TextGeneration`\n    task with a preference `Task` i.e. a task generating `ratings`, so that those are used to rank the\n    existing generations and provide the `chosen` and `rejected` generations based on the `ratings`.\n    Use this step to transform the output of a combination of a `TextGeneration` + a preference task such as\n    `UltraFeedback` following the standard formatting from frameworks such as `axolotl` or `alignment-handbook`.\n\n    Note:\n        The `generations` column should contain at least two generations, the `ratings` column should\n        contain the same number of ratings as generations.\n\n    Input columns:\n        - system_prompt (`str`, optional): The system prompt used within the `LLM` to generate the\n            `generations`, if available.\n        - instruction (`str`): The instruction used to generate the `generations` with the `LLM`.\n        - generations (`List[str]`): The generations produced by the `LLM`.\n        - generation_models (`List[str]`, optional): The model names used to generate the `generations`,\n            only available if the `model_name` from the `TextGeneration` task/s is combined into a single\n            column named this way, otherwise, it will be ignored.\n        - ratings (`List[float]`): The ratings for each of the `generations`, produced by a preference\n            task such as `UltraFeedback`.\n\n    Output columns:\n        - prompt (`str`): The instruction used to generate the `generations` with the `LLM`.\n        - prompt_id (`str`): The `SHA256` hash of the `prompt`.\n        - chosen (`List[Dict[str, str]]`): The `chosen` generation based on the `ratings`.\n        - chosen_model (`str`, optional): The model name used to generate the `chosen` generation,\n            if the 
`generation_models` are available.\n        - chosen_rating (`float`): The rating of the `chosen` generation.\n        - rejected (`List[Dict[str, str]]`): The `rejected` generation based on the `ratings`.\n        - rejected_model (`str`, optional): The model name used to generate the `rejected` generation,\n            if the `generation_models` are available.\n        - rejected_rating (`float`): The rating of the `rejected` generation.\n\n    Categories:\n        - format\n        - text-generation\n        - preference\n        - instruction\n        - generations\n\n    Examples:\n        Format your dataset for DPO fine tuning:\n\n        ```python\n        from distilabel.steps import FormatTextGenerationDPO\n\n        format_dpo = FormatTextGenerationDPO()\n        format_dpo.load()\n\n        # NOTE: Both \"system_prompt\" and \"generation_models\" can be added optionally.\n        result = next(\n            format_dpo.process(\n                [\n                    {\n                        \"instruction\": \"What's 2+2?\",\n                        \"generations\": [\"4\", \"5\", \"6\"],\n                        \"ratings\": [1, 0, -1],\n                    }\n                ]\n            )\n        )\n        # >>> result\n        # [\n        #    {   'instruction': \"What's 2+2?\",\n        #        'generations': ['4', '5', '6'],\n        #        'ratings': [1, 0, -1],\n        #        'prompt': \"What's 2+2?\",\n        #        'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n        #        'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n        #        'chosen_rating': 1,\n        #        'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n        #        'rejected_rating': -1\n        #    }\n        # ]\n        ```\n    \"\"\"\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"List 
of inputs required by the `Step`, which in this case are: `instruction`, `generations`,\n        and `ratings`.\"\"\"\n        return {\n            \"system_prompt\": False,\n            \"instruction\": True,\n            \"generations\": True,\n            \"generation_models\": False,\n            \"ratings\": True,\n        }\n\n    @property\n    def optional_inputs(self) -> List[str]:\n        \"\"\"List of optional inputs, which are not required by the `Step` but used if available,\n        which in this case are: `system_prompt`, and `generation_models`.\"\"\"\n        return [\"system_prompt\", \"generation_models\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"List of outputs generated by the `Step`, which are: `prompt`, `prompt_id`, `chosen`,\n        `chosen_model`, `chosen_rating`, `rejected`, `rejected_model`, `rejected_rating`. Both\n        the `chosen_model` and `rejected_model` being optional and only used if `generation_models`\n        is available.\n\n        Reference:\n            - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n        \"\"\"\n        return [\n            \"prompt\",\n            \"prompt_id\",\n            \"chosen\",\n            \"chosen_model\",\n            \"chosen_rating\",\n            \"rejected\",\n            \"rejected_model\",\n            \"rejected_rating\",\n        ]\n\n    def process(self, *inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n        according to the DPO formatting standard.\n\n        Args:\n            *inputs: A list of `StepInput` to be combined.\n\n        Yields:\n            A `StepOutput` with batches of formatted `StepInput` following the DPO standard.\n        \"\"\"\n        for input in inputs:\n            for item in input:\n                messages = [\n                    {\"role\": \"user\", \"content\": 
item[\"instruction\"]},  # type: ignore\n                ]\n                if (\n                    \"system_prompt\" in item\n                    and isinstance(item[\"system_prompt\"], str)  # type: ignore\n                    and len(item[\"system_prompt\"]) > 0  # type: ignore\n                ):\n                    messages.insert(\n                        0,\n                        {\"role\": \"system\", \"content\": item[\"system_prompt\"]},  # type: ignore\n                    )\n\n                item[\"prompt\"] = item[\"instruction\"]\n                item[\"prompt_id\"] = hashlib.sha256(\n                    item[\"prompt\"].encode(\"utf-8\")  # type: ignore\n                ).hexdigest()\n\n                chosen_idx = max(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n                item[\"chosen\"] = messages + [\n                    {\n                        \"role\": \"assistant\",\n                        \"content\": item[\"generations\"][chosen_idx],\n                    }\n                ]\n                if \"generation_models\" in item:\n                    item[\"chosen_model\"] = item[\"generation_models\"][chosen_idx]\n                item[\"chosen_rating\"] = item[\"ratings\"][chosen_idx]\n\n                rejected_idx = min(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n                item[\"rejected\"] = messages + [\n                    {\n                        \"role\": \"assistant\",\n                        \"content\": item[\"generations\"][rejected_idx],\n                    }\n                ]\n                if \"generation_models\" in item:\n                    item[\"rejected_model\"] = item[\"generation_models\"][rejected_idx]\n                item[\"rejected_rating\"] = item[\"ratings\"][rejected_idx]\n\n            yield input\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO.inputs","title":"inputs: StepColumns property","text":"

List of inputs required by the Step, which in this case are: instruction, generations, and ratings.

"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO.optional_inputs","title":"optional_inputs: List[str] property","text":"

List of optional inputs, which are not required by the Step but used if available, which in this case are: system_prompt, and generation_models.

"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO.outputs","title":"outputs: StepColumns property","text":"

List of outputs generated by the Step, which are: prompt, prompt_id, chosen, chosen_model, chosen_rating, rejected, rejected_model, rejected_rating. Both the chosen_model and rejected_model being optional and only used if generation_models is available.

Reference
  • Format inspired by https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationDPO.process","title":"process(*inputs)","text":"

The process method formats the received StepInput or list of StepInput according to the DPO formatting standard.

Parameters:

Name Type Description Default *inputs StepInput

A list of StepInput batches to be formatted.

()

Yields:

Type Description StepOutput

A StepOutput with batches of formatted StepInput following the DPO standard.

Source code in src/distilabel/steps/formatting/dpo.py
def process(self, *inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n    according to the DPO formatting standard.\n\n    Args:\n        *inputs: A list of `StepInput` to be combined.\n\n    Yields:\n        A `StepOutput` with batches of formatted `StepInput` following the DPO standard.\n    \"\"\"\n    for input in inputs:\n        for item in input:\n            messages = [\n                {\"role\": \"user\", \"content\": item[\"instruction\"]},  # type: ignore\n            ]\n            if (\n                \"system_prompt\" in item\n                and isinstance(item[\"system_prompt\"], str)  # type: ignore\n                and len(item[\"system_prompt\"]) > 0  # type: ignore\n            ):\n                messages.insert(\n                    0,\n                    {\"role\": \"system\", \"content\": item[\"system_prompt\"]},  # type: ignore\n                )\n\n            item[\"prompt\"] = item[\"instruction\"]\n            item[\"prompt_id\"] = hashlib.sha256(\n                item[\"prompt\"].encode(\"utf-8\")  # type: ignore\n            ).hexdigest()\n\n            chosen_idx = max(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n            item[\"chosen\"] = messages + [\n                {\n                    \"role\": \"assistant\",\n                    \"content\": item[\"generations\"][chosen_idx],\n                }\n            ]\n            if \"generation_models\" in item:\n                item[\"chosen_model\"] = item[\"generation_models\"][chosen_idx]\n            item[\"chosen_rating\"] = item[\"ratings\"][chosen_idx]\n\n            rejected_idx = min(enumerate(item[\"ratings\"]), key=lambda x: x[1])[0]\n            item[\"rejected\"] = messages + [\n                {\n                    \"role\": \"assistant\",\n                    \"content\": item[\"generations\"][rejected_idx],\n                }\n            ]\n            if 
\"generation_models\" in item:\n                item[\"rejected_model\"] = item[\"generation_models\"][rejected_idx]\n            item[\"rejected_rating\"] = item[\"ratings\"][rejected_idx]\n\n        yield input\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationSFT","title":"FormatChatGenerationSFT","text":"

Bases: Step

Format the output of a ChatGeneration task for Supervised Fine-Tuning (SFT).

FormatChatGenerationSFT is a Step that formats the output of a ChatGeneration task for Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as axolotl or alignment-handbook. The output of the ChatGeneration task is formatted into a chat-like conversation with the instruction as the user message and the generation as the assistant message. Optionally, if the system_prompt is available, it is included as the first message in the conversation.

Input columns
  • system_prompt (str, optional): The system prompt used within the LLM to generate the generation, if available.
  • messages (List[Dict[str, str]]): The chat-like conversation (a list of role/content messages) used to generate the generation with the LLM.
  • generation (str): The generation produced by the LLM.
Output columns
  • prompt (str): The content of the first user message found in messages.
  • prompt_id (str): The SHA256 hash of the prompt.
  • messages (List[Dict[str, str]]): The input chat-like conversation with the generation appended as the final assistant message.
Categories
  • format
  • chat-generation
  • instruction
  • generation

Examples:

Format your dataset for SFT:

from distilabel.steps import FormatChatGenerationSFT\n\nformat_sft = FormatChatGenerationSFT()\nformat_sft.load()\n\n# NOTE: \"system_prompt\" can be added optionally.\nresult = next(\n    format_sft.process(\n        [\n            {\n                \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n                \"generation\": \"4\"\n            }\n        ]\n    )\n)\n# >>> result\n# [\n#     {\n#         'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n#         'generation': '4',\n#         'prompt': 'What's 2+2?',\n#         'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n#     }\n# ]\n
Source code in src/distilabel/steps/formatting/sft.py
class FormatChatGenerationSFT(Step):\n    \"\"\"Format the output of a `ChatGeneration` task for Supervised Fine-Tuning (SFT).\n\n    `FormatChatGenerationSFT` is a `Step` that formats the output of a `ChatGeneration` task for\n    Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as `axolotl`\n    or `alignment-handbook`. The output of the `ChatGeneration` task is formatted into a chat-like\n    conversation with the `instruction` as the user message and the `generation` as the assistant\n    message. Optionally, if the `system_prompt` is available, it is included as the first message\n    in the conversation.\n\n    Input columns:\n        - system_prompt (`str`, optional): The system prompt used within the `LLM` to generate the\n            `generation`, if available.\n        - instruction (`str`): The instruction used to generate the `generation` with the `LLM`.\n        - generation (`str`): The generation produced by the `LLM`.\n\n    Output columns:\n        - prompt (`str`): The instruction used to generate the `generation` with the `LLM`.\n        - prompt_id (`str`): The `SHA256` hash of the `prompt`.\n        - messages (`List[Dict[str, str]]`): The chat-like conversation with the `instruction` as\n            the user message and the `generation` as the assistant message.\n\n    Categories:\n        - format\n        - chat-generation\n        - instruction\n        - generation\n\n    Examples:\n        Format your dataset for SFT:\n\n        ```python\n        from distilabel.steps import FormatChatGenerationSFT\n\n        format_sft = FormatChatGenerationSFT()\n        format_sft.load()\n\n        # NOTE: \"system_prompt\" can be added optionally.\n        result = next(\n            format_sft.process(\n                [\n                    {\n                        \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n                        \"generation\": \"4\"\n                    }\n           
     ]\n            )\n        )\n        # >>> result\n        # [\n        #     {\n        #         'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n        #         'generation': '4',\n        #         'prompt': 'What's 2+2?',\n        #         'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"List of inputs required by the `Step`, which in this case are: `instruction`, and `generation`.\"\"\"\n        return [\"messages\", \"generation\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"List of outputs generated by the `Step`, which are: `prompt`, `prompt_id`, `messages`.\n\n        Reference:\n            - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n        \"\"\"\n        return [\"prompt\", \"prompt_id\", \"messages\"]\n\n    def process(self, *inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n        according to the SFT formatting standard.\n\n        Args:\n            *inputs: A list of `StepInput` to be combined.\n\n        Yields:\n            A `StepOutput` with batches of formatted `StepInput` following the SFT standard.\n        \"\"\"\n        for input in inputs:\n            for item in input:\n                item[\"prompt\"] = next(\n                    (\n                        turn[\"content\"]\n                        for turn in item[\"messages\"]\n                        if turn[\"role\"] == \"user\"\n                    ),\n                    None,\n                )\n\n                item[\"prompt_id\"] = hashlib.sha256(\n                    item[\"prompt\"].encode(\"utf-8\")  # type: ignore\n                ).hexdigest()\n\n                item[\"messages\"] = 
item[\"messages\"] + [\n                    {\"role\": \"assistant\", \"content\": item[\"generation\"]},  # type: ignore\n                ]\n            yield input\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationSFT.inputs","title":"inputs: StepColumns property","text":"

List of inputs required by the Step, which in this case are: messages, and generation.

"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationSFT.outputs","title":"outputs: StepColumns property","text":"

List of outputs generated by the Step, which are: prompt, prompt_id, messages.

Reference
  • Format inspired by https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatChatGenerationSFT.process","title":"process(*inputs)","text":"

The process method formats the received StepInput or list of StepInput according to the SFT formatting standard.

Parameters:

Name Type Description Default *inputs StepInput

A list of StepInput batches to be formatted.

()

Yields:

Type Description StepOutput

A StepOutput with batches of formatted StepInput following the SFT standard.

Source code in src/distilabel/steps/formatting/sft.py
def process(self, *inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n    according to the SFT formatting standard.\n\n    Args:\n        *inputs: A list of `StepInput` to be combined.\n\n    Yields:\n        A `StepOutput` with batches of formatted `StepInput` following the SFT standard.\n    \"\"\"\n    for input in inputs:\n        for item in input:\n            item[\"prompt\"] = next(\n                (\n                    turn[\"content\"]\n                    for turn in item[\"messages\"]\n                    if turn[\"role\"] == \"user\"\n                ),\n                None,\n            )\n\n            item[\"prompt_id\"] = hashlib.sha256(\n                item[\"prompt\"].encode(\"utf-8\")  # type: ignore\n            ).hexdigest()\n\n            item[\"messages\"] = item[\"messages\"] + [\n                {\"role\": \"assistant\", \"content\": item[\"generation\"]},  # type: ignore\n            ]\n        yield input\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT","title":"FormatTextGenerationSFT","text":"

Bases: Step

Format the output of a TextGeneration task for Supervised Fine-Tuning (SFT).

FormatTextGenerationSFT is a Step that formats the output of a TextGeneration task for Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as axolotl or alignment-handbook. The output of the TextGeneration task is formatted into a chat-like conversation with the instruction as the user message and the generation as the assistant message. Optionally, if the system_prompt is available, it is included as the first message in the conversation.

Input columns
  • system_prompt (str, optional): The system prompt used within the LLM to generate the generation, if available.
  • instruction (str): The instruction used to generate the generation with the LLM.
  • generation (str): The generation produced by the LLM.
Output columns
  • prompt (str): The instruction used to generate the generation with the LLM.
  • prompt_id (str): The SHA256 hash of the prompt.
  • messages (List[Dict[str, str]]): The chat-like conversation with the instruction as the user message and the generation as the assistant message.
Categories
  • format
  • text-generation
  • instruction
  • generation

Examples:

Format your dataset for SFT fine tuning:

from distilabel.steps import FormatTextGenerationSFT\n\nformat_sft = FormatTextGenerationSFT()\nformat_sft.load()\n\n# NOTE: \"system_prompt\" can be added optionally.\nresult = next(\n    format_sft.process(\n        [\n            {\n                \"instruction\": \"What's 2+2?\",\n                \"generation\": \"4\"\n            }\n        ]\n    )\n)\n# >>> result\n# [\n#     {\n#         'instruction': 'What's 2+2?',\n#         'generation': '4',\n#         'prompt': 'What's 2+2?',\n#         'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n#         'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}]\n#     }\n# ]\n
Source code in src/distilabel/steps/formatting/sft.py
class FormatTextGenerationSFT(Step):\n    \"\"\"Format the output of a `TextGeneration` task for Supervised Fine-Tuning (SFT).\n\n    `FormatTextGenerationSFT` is a `Step` that formats the output of a `TextGeneration` task for\n    Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as `axolotl`\n    or `alignment-handbook`. The output of the `TextGeneration` task is formatted into a chat-like\n    conversation with the `instruction` as the user message and the `generation` as the assistant\n    message. Optionally, if the `system_prompt` is available, it is included as the first message\n    in the conversation.\n\n    Input columns:\n        - system_prompt (`str`, optional): The system prompt used within the `LLM` to generate the\n            `generation`, if available.\n        - instruction (`str`): The instruction used to generate the `generation` with the `LLM`.\n        - generation (`str`): The generation produced by the `LLM`.\n\n    Output columns:\n        - prompt (`str`): The instruction used to generate the `generation` with the `LLM`.\n        - prompt_id (`str`): The `SHA256` hash of the `prompt`.\n        - messages (`List[Dict[str, str]]`): The chat-like conversation with the `instruction` as\n            the user message and the `generation` as the assistant message.\n\n    Categories:\n        - format\n        - text-generation\n        - instruction\n        - generation\n\n    Examples:\n        Format your dataset for SFT fine tuning:\n\n        ```python\n        from distilabel.steps import FormatTextGenerationSFT\n\n        format_sft = FormatTextGenerationSFT()\n        format_sft.load()\n\n        # NOTE: \"system_prompt\" can be added optionally.\n        result = next(\n            format_sft.process(\n                [\n                    {\n                        \"instruction\": \"What's 2+2?\",\n                        \"generation\": \"4\"\n                    }\n                ]\n            
)\n        )\n        # >>> result\n        # [\n        #     {\n        #         'instruction': 'What's 2+2?',\n        #         'generation': '4',\n        #         'prompt': 'What's 2+2?',\n        #         'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n        #         'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}]\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"List of inputs required by the `Step`, which in this case are: `instruction`, and `generation`.\"\"\"\n        return {\n            \"system_prompt\": False,\n            \"instruction\": True,\n            \"generation\": True,\n        }\n\n    @property\n    def optional_inputs(self) -> List[str]:\n        \"\"\"List of optional inputs, which are not required by the `Step` but used if available,\n        which in this case is: `system_prompt`.\"\"\"\n        return [\"system_prompt\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"List of outputs generated by the `Step`, which are: `prompt`, `prompt_id`, `messages`.\n\n        Reference:\n            - Format inspired in https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k\n        \"\"\"\n        return [\"prompt\", \"prompt_id\", \"messages\"]\n\n    def process(self, *inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n        according to the SFT formatting standard.\n\n        Args:\n            *inputs: A list of `StepInput` to be combined.\n\n        Yields:\n            A `StepOutput` with batches of formatted `StepInput` following the SFT standard.\n        \"\"\"\n        for input in inputs:\n            for item in input:\n                item[\"prompt\"] = item[\"instruction\"]\n\n                item[\"prompt_id\"] = hashlib.sha256(\n    
                item[\"prompt\"].encode(\"utf-8\")  # type: ignore\n                ).hexdigest()\n\n                item[\"messages\"] = [\n                    {\"role\": \"user\", \"content\": item[\"instruction\"]},  # type: ignore\n                    {\"role\": \"assistant\", \"content\": item[\"generation\"]},  # type: ignore\n                ]\n                if (\n                    \"system_prompt\" in item\n                    and isinstance(item[\"system_prompt\"], str)  # type: ignore\n                    and len(item[\"system_prompt\"]) > 0  # type: ignore\n                ):\n                    item[\"messages\"].insert(\n                        0,\n                        {\"role\": \"system\", \"content\": item[\"system_prompt\"]},  # type: ignore\n                    )\n\n            yield input\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT.inputs","title":"inputs: StepColumns property","text":"

List of inputs required by the Step, which in this case are: instruction, and generation.

"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT.optional_inputs","title":"optional_inputs: List[str] property","text":"

List of optional inputs, which are not required by the Step but used if available, which in this case is: system_prompt.

"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT.outputs","title":"outputs: StepColumns property","text":"

List of outputs generated by the Step, which are: prompt, prompt_id, messages.

Reference
  • Format inspired by https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
"},{"location":"api/step_gallery/extra/#distilabel.steps.FormatTextGenerationSFT.process","title":"process(*inputs)","text":"

The process method formats the received StepInput or list of StepInput according to the SFT formatting standard.

Parameters:

Name Type Description Default *inputs StepInput

A list of StepInput batches to be formatted.

()

Yields:

Type Description StepOutput

A StepOutput with batches of formatted StepInput following the SFT standard.

Source code in src/distilabel/steps/formatting/sft.py
def process(self, *inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"The `process` method formats the received `StepInput` or list of `StepInput`\n    according to the SFT formatting standard.\n\n    Args:\n        *inputs: A list of `StepInput` to be combined.\n\n    Yields:\n        A `StepOutput` with batches of formatted `StepInput` following the SFT standard.\n    \"\"\"\n    for input in inputs:\n        for item in input:\n            item[\"prompt\"] = item[\"instruction\"]\n\n            item[\"prompt_id\"] = hashlib.sha256(\n                item[\"prompt\"].encode(\"utf-8\")  # type: ignore\n            ).hexdigest()\n\n            item[\"messages\"] = [\n                {\"role\": \"user\", \"content\": item[\"instruction\"]},  # type: ignore\n                {\"role\": \"assistant\", \"content\": item[\"generation\"]},  # type: ignore\n            ]\n            if (\n                \"system_prompt\" in item\n                and isinstance(item[\"system_prompt\"], str)  # type: ignore\n                and len(item[\"system_prompt\"]) > 0  # type: ignore\n            ):\n                item[\"messages\"].insert(\n                    0,\n                    {\"role\": \"system\", \"content\": item[\"system_prompt\"]},  # type: ignore\n                )\n\n        yield input\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.LoadDataFromDicts","title":"LoadDataFromDicts","text":"

Bases: GeneratorStep

Loads a dataset from a list of dictionaries.

GeneratorStep that loads a dataset from a list of dictionaries and yields it in batches.

Attributes:

Name Type Description data List[Dict[str, Any]]

The list of dictionaries to load the data from.

Runtime parameters
  • batch_size: The batch size to use when processing the data.
Output columns
  • dynamic (based on the keys found in the first dictionary of the list): The columns of the dataset.
Categories
  • load

Examples:

Load data from a list of dictionaries:

from distilabel.steps import LoadDataFromDicts\n\nloader = LoadDataFromDicts(\n    data=[{\"instruction\": \"What are 2+2?\"}] * 5,\n    batch_size=2\n)\nloader.load()\n\nresult = next(loader.process())\n# >>> result\n# ([{'instruction': 'What are 2+2?'}, {'instruction': 'What are 2+2?'}], False)\n
Source code in src/distilabel/steps/generators/data.py
class LoadDataFromDicts(GeneratorStep):\n    \"\"\"Loads a dataset from a list of dictionaries.\n\n    `GeneratorStep` that loads a dataset from a list of dictionaries and yields it in\n    batches.\n\n    Attributes:\n        data: The list of dictionaries to load the data from.\n\n    Runtime parameters:\n        - `batch_size`: The batch size to use when processing the data.\n\n    Output columns:\n        - dynamic (based on the keys found on the first dictionary of the list): The columns\n            of the dataset.\n\n    Categories:\n        - load\n\n    Examples:\n        Load data from a list of dictionaries:\n\n        ```python\n        from distilabel.steps import LoadDataFromDicts\n\n        loader = LoadDataFromDicts(\n            data=[{\"instruction\": \"What are 2+2?\"}] * 5,\n            batch_size=2\n        )\n        loader.load()\n\n        result = next(loader.process())\n        # >>> result\n        # ([{'instruction': 'What are 2+2?'}, {'instruction': 'What are 2+2?'}], False)\n        ```\n    \"\"\"\n\n    data: List[Dict[str, Any]] = Field(default_factory=list, exclude=True)\n\n    @override\n    def process(self, offset: int = 0) -> \"GeneratorStepOutput\":  # type: ignore\n        \"\"\"Yields batches from a list of dictionaries.\n\n        Args:\n            offset: The offset to start the generation from. 
Defaults to `0`.\n\n        Yields:\n            A list of Python dictionaries as read from the inputs (propagated in batches)\n            and a flag indicating whether the yield batch is the last one.\n        \"\"\"\n        if offset:\n            self.data = self.data[offset:]\n\n        while self.data:\n            batch = self.data[: self.batch_size]\n            self.data = self.data[self.batch_size :]\n            yield (\n                batch,\n                True if len(self.data) == 0 else False,\n            )\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"Returns a list of strings with the names of the columns that the step will generate.\"\"\"\n        return list(self.data[0].keys())\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.LoadDataFromDicts.outputs","title":"outputs: List[str] property","text":"

Returns a list of strings with the names of the columns that the step will generate.

"},{"location":"api/step_gallery/extra/#distilabel.steps.LoadDataFromDicts.process","title":"process(offset=0)","text":"

Yields batches from a list of dictionaries.

Parameters:

Name Type Description Default offset int

The offset to start the generation from. Defaults to 0.

0

Yields:

Type Description GeneratorStepOutput

A list of Python dictionaries as read from the inputs (propagated in batches)

GeneratorStepOutput

and a flag indicating whether the yield batch is the last one.

Source code in src/distilabel/steps/generators/data.py
@override\ndef process(self, offset: int = 0) -> \"GeneratorStepOutput\":  # type: ignore\n    \"\"\"Yields batches from a list of dictionaries.\n\n    Args:\n        offset: The offset to start the generation from. Defaults to `0`.\n\n    Yields:\n        A list of Python dictionaries as read from the inputs (propagated in batches)\n        and a flag indicating whether the yield batch is the last one.\n    \"\"\"\n    if offset:\n        self.data = self.data[offset:]\n\n    while self.data:\n        batch = self.data[: self.batch_size]\n        self.data = self.data[self.batch_size :]\n        yield (\n            batch,\n            True if len(self.data) == 0 else False,\n        )\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.DataSampler","title":"DataSampler","text":"

Bases: GeneratorStep

Step to sample from a dataset.

GeneratorStep that samples from a dataset and yields it in batches. This step is useful when you have a pipeline that can benefit from using examples in the prompts, for example as few-shot learning, where the examples can change on each row. For example, you can pass a list of dictionaries with N examples and generate M samples from it (assuming you have another step loading data, this M should have the same size as the data being loaded in that step). The size S argument is the number of samples per row generated, so each generated row would contain S samples to be used as examples.

Attributes:

Name Type Description data List[Dict[str, Any]]

The list of dictionaries to sample from.

size int

Number of samples per example. For example in a few-shot learning scenario, the number of few-shot examples that will be generated per example. Defaults to 2.

samples int

Number of examples that will be generated by the step in total. If used with another loader step, this should be the same as the number of samples in the loader step. Defaults to 100.

Output columns
  • dynamic (based on the keys found on the first dictionary of the list): The columns of the dataset.
Categories
  • load

Examples:

Sample data from a list of dictionaries:

from distilabel.steps import DataSampler\n\nsampler = DataSampler(\n    data=[{\"sample\": f\"sample {i}\"} for i in range(30)],\n    samples=10,\n    size=2,\n    batch_size=4\n)\nsampler.load()\n\nresult = next(sampler.process())\n# >>> result\n# ([{'sample': ['sample 7', 'sample 0']}, {'sample': ['sample 2', 'sample 21']}, {'sample': ['sample 17', 'sample 12']}, {'sample': ['sample 2', 'sample 14']}], False)\n

Pipeline with a loader and a sampler combined in a single stream:

from datasets import load_dataset\n\nfrom distilabel.steps import LoadDataFromDicts, DataSampler\nfrom distilabel.steps.tasks.apigen.utils import PrepareExamples\nfrom distilabel.pipeline import Pipeline\n\nds = (\n    load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n    .shuffle(seed=42)\n    .select(range(500))\n    .to_list()\n)\ndata = [\n    {\n        \"func_name\": \"final_velocity\",\n        \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n    },\n    {\n        \"func_name\": \"permutation_count\",\n        \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n    },\n    {\n        \"func_name\": \"getdivision\",\n        \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n    },\n]\nwith Pipeline(name=\"APIGenPipeline\") as pipeline:\n    loader_seeds = LoadDataFromDicts(data=data)\n    sampler = DataSampler(\n        data=ds,\n        size=2,\n        samples=len(data),\n        batch_size=8,\n    )\n    prep_examples = PrepareExamples()\n\n    sampler >> prep_examples\n    (\n        [loader_seeds, prep_examples]\n        >> combine_steps\n    )\n# Now we have a single stream of data with the loader and the sampler data\n
Source code in src/distilabel/steps/generators/data_sampler.py
class DataSampler(GeneratorStep):\n    \"\"\"Step to sample from a dataset.\n\n    `GeneratorStep` that samples from a dataset and yields it in batches.\n    This step is useful when you have a pipeline that can benefit from using examples\n    in the prompts for example as few-shot learning, that can be changing on each row.\n    For example, you can pass a list of dictionaries with N examples and generate M samples\n    from it (assuming you have another step loading data, this M should have the same size\n    as the data being loaded in that step). The size S argument is the number of samples per\n    row generated, so each example would contain S examples to be used as examples.\n\n    Attributes:\n        data: The list of dictionaries to sample from.\n        size: Number of samples per example. For example in a few-shot learning scenario,\n            the number of few-shot examples that will be generated per example. Defaults to 2.\n        samples: Number of examples that will be generated by the step in total.\n            If used with another loader step, this should be the same as the number\n            of samples in the loader step. 
Defaults to 100.\n\n    Output columns:\n        - dynamic (based on the keys found on the first dictionary of the list): The columns\n            of the dataset.\n\n    Categories:\n        - load\n\n    Examples:\n        Sample data from a list of dictionaries:\n\n        ```python\n        from distilabel.steps import DataSampler\n\n        sampler = DataSampler(\n            data=[{\"sample\": f\"sample {i}\"} for i in range(30)],\n            samples=10,\n            size=2,\n            batch_size=4\n        )\n        sampler.load()\n\n        result = next(sampler.process())\n        # >>> result\n        # ([{'sample': ['sample 7', 'sample 0']}, {'sample': ['sample 2', 'sample 21']}, {'sample': ['sample 17', 'sample 12']}, {'sample': ['sample 2', 'sample 14']}], False)\n        ```\n\n        Pipeline with a loader and a sampler combined in a single stream:\n\n        ```python\n        from datasets import load_dataset\n\n        from distilabel.steps import LoadDataFromDicts, DataSampler\n        from distilabel.steps.tasks.apigen.utils import PrepareExamples\n        from distilabel.pipeline import Pipeline\n\n        ds = (\n            load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n            .shuffle(seed=42)\n            .select(range(500))\n            .to_list()\n        )\n        data = [\n            {\n                \"func_name\": \"final_velocity\",\n                \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n            },\n            {\n                \"func_name\": \"permutation_count\",\n                \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n            },\n            {\n                \"func_name\": \"getdivision\",\n                \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n            },\n        ]\n        with 
Pipeline(name=\"APIGenPipeline\") as pipeline:\n            loader_seeds = LoadDataFromDicts(data=data)\n            sampler = DataSampler(\n                data=ds,\n                size=2,\n                samples=len(data),\n                batch_size=8,\n            )\n            prep_examples = PrepareExamples()\n\n            sampler >> prep_examples\n            (\n                [loader_seeds, prep_examples]\n                >> combine_steps\n            )\n        # Now we have a single stream of data with the loader and the sampler data\n        ```\n    \"\"\"\n\n    data: List[Dict[str, Any]] = Field(default_factory=list, exclude=True)\n    size: int = Field(\n        default=2,\n        description=(\n            \"Number of samples per example. For example in a few-shot learning scenario, the number \"\n            \"of few-shot examples that will be generated per example.\"\n        ),\n    )\n    samples: int = Field(\n        default=100,\n        description=(\n            \"Number of examples that will be generated by the step in total. \"\n            \"If used with another loader step, this should be the same as the number of \"\n            \"samples in the loader step.\"\n        ),\n    )\n\n    @override\n    def process(self, offset: int = 0) -> \"GeneratorStepOutput\":  # type: ignore\n        \"\"\"Yields batches from a list of dictionaries.\n\n        Args:\n            offset: The offset to start the generation from. 
Defaults to `0`.\n\n        Yields:\n            A list of Python dictionaries as read from the inputs (propagated in batches)\n            and a flag indicating whether the yield batch is the last one.\n        \"\"\"\n\n        total_samples = 0\n\n        while total_samples < self.samples:\n            batch = []\n            bs = min(self.batch_size, self.samples - total_samples)\n            for _ in range(self.batch_size):\n                choices = random.choices(self.data, k=self.size)\n                choices = self._transform_data(choices)\n                batch.extend(choices)\n            total_samples += bs\n            batch = list(islice(batch, bs))\n            yield (batch, True if total_samples >= self.samples else False)\n            batch = []\n\n    @staticmethod\n    def _transform_data(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:\n        if not data:\n            return []\n\n        result = {key: [] for key in data[0].keys()}\n\n        for item in data:\n            for key, value in item.items():\n                result[key].append(value)\n\n        return [result]\n\n    @property\n    def outputs(self) -> List[str]:\n        return list(self.data[0].keys())\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.DataSampler.process","title":"process(offset=0)","text":"

Yields batches from a list of dictionaries.

Parameters:

Name Type Description Default offset int

The offset to start the generation from. Defaults to 0.

0

Yields:

Type Description GeneratorStepOutput

A list of Python dictionaries as read from the inputs (propagated in batches)

GeneratorStepOutput

and a flag indicating whether the yield batch is the last one.

Source code in src/distilabel/steps/generators/data_sampler.py
@override\ndef process(self, offset: int = 0) -> \"GeneratorStepOutput\":  # type: ignore\n    \"\"\"Yields batches from a list of dictionaries.\n\n    Args:\n        offset: The offset to start the generation from. Defaults to `0`.\n\n    Yields:\n        A list of Python dictionaries as read from the inputs (propagated in batches)\n        and a flag indicating whether the yield batch is the last one.\n    \"\"\"\n\n    total_samples = 0\n\n    while total_samples < self.samples:\n        batch = []\n        bs = min(self.batch_size, self.samples - total_samples)\n        for _ in range(self.batch_size):\n            choices = random.choices(self.data, k=self.size)\n            choices = self._transform_data(choices)\n            batch.extend(choices)\n        total_samples += bs\n        batch = list(islice(batch, bs))\n        yield (batch, True if total_samples >= self.samples else False)\n        batch = []\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.RewardModelScore","title":"RewardModelScore","text":"

Bases: Step, CudaDevicePlacementMixin

Assign a score to a response using a Reward Model.

RewardModelScore is a Step that, using a Reward Model (RM) loaded using transformers, assigns a score to a response generated for an instruction, or a score to a multi-turn conversation.

Attributes:

Name Type Description model str

the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

revision str

if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\".

torch_dtype str

the torch dtype to use for the model e.g. \"float16\", \"float32\", etc. Defaults to \"auto\".

trust_remote_code bool

whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False.

device_map Union[str, Dict[str, Any], None]

a dictionary mapping each layer of the model to a device, or a mode like \"sequential\" or \"auto\". Defaults to None.

token Union[SecretStr, None]

the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment variable or the huggingface_hub package local configuration will be used. Defaults to None.

truncation bool

whether to truncate sequences at the maximum length. Defaults to False.

max_length Union[int, None]

maximum length to use for padding or truncation. Defaults to None.

Input columns
  • instruction (str, optional): the instruction used to generate a response. If provided, then response must be provided too.
  • response (str, optional): the response generated for instruction. If provided, then instruction must be provided too.
  • conversation (ChatType, optional): a multi-turn conversation. If not provided, then instruction and response columns must be provided.
Output columns
  • score (float): the score given by the reward model for the instruction-response pair or the conversation.
Categories
  • scorer

Examples:

Assigning a score for an instruction-response pair:

from distilabel.steps import RewardModelScore\n\nstep = RewardModelScore(\n    model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n)\n\nstep.load()\n\nresult = next(\n    step.process(\n        inputs=[\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"response\": \"The output of 2+2 is 4\",\n            },\n            {\"instruction\": \"How much is 2+2?\", \"response\": \"4\"},\n        ]\n    )\n)\n# [\n#   {'instruction': 'How much is 2+2?', 'response': 'The output of 2+2 is 4', 'score': 0.11690367758274078},\n#   {'instruction': 'How much is 2+2?', 'response': '4', 'score': 0.10300665348768234}\n# ]\n

Assigning a score for a multi-turn conversation:

from distilabel.steps import RewardModelScore\n\nstep = RewardModelScore(\n    model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n)\n\nstep.load()\n\nresult = next(\n    step.process(\n        inputs=[\n            {\n                \"conversation\": [\n                    {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n                    {\"role\": \"assistant\", \"content\": \"The output of 2+2 is 4\"},\n                ],\n            },\n            {\n                \"conversation\": [\n                    {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n                    {\"role\": \"assistant\", \"content\": \"4\"},\n                ],\n            },\n        ]\n    )\n)\n# [\n#   {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': 'The output of 2+2 is 4'}], 'score': 0.11690367758274078},\n#   {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': '4'}], 'score': 0.10300665348768234}\n# ]\n
Source code in src/distilabel/steps/reward_model.py
class RewardModelScore(Step, CudaDevicePlacementMixin):\n    \"\"\"Assign a score to a response using a Reward Model.\n\n    `RewardModelScore` is a `Step` that using a Reward Model (RM) loaded using `transformers`,\n    assigns an score to a response generated for an instruction, or a score to a multi-turn\n    conversation.\n\n    Attributes:\n        model: the model Hugging Face Hub repo id or a path to a directory containing the\n            model weights and configuration files.\n        revision: if `model` refers to a Hugging Face Hub repository, then the revision\n            (e.g. a branch name or a commit id) to use. Defaults to `\"main\"`.\n        torch_dtype: the torch dtype to use for the model e.g. \"float16\", \"float32\", etc.\n            Defaults to `\"auto\"`.\n        trust_remote_code: whether to allow fetching and executing remote code fetched\n            from the repository in the Hub. Defaults to `False`.\n        device_map: a dictionary mapping each layer of the model to a device, or a mode like `\"sequential\"` or `\"auto\"`. Defaults to `None`.\n        token: the Hugging Face Hub token that will be used to authenticate to the Hugging\n            Face Hub. If not provided, the `HF_TOKEN` environment or `huggingface_hub` package\n            local configuration will be used. Defaults to `None`.\n        truncation: whether to truncate sequences at the maximum length. Defaults to `False`.\n        max_length: maximun length to use for padding or truncation. Defaults to `None`.\n\n    Input columns:\n        - instruction (`str`, optional): the instruction used to generate a `response`.\n            If provided, then `response` must be provided too.\n        - response (`str`, optional): the response generated for `instruction`. If provided,\n            then `instruction` must be provide too.\n        - conversation (`ChatType`, optional): a multi-turn conversation. 
If not provided,\n            then `instruction` and `response` columns must be provided.\n\n    Output columns:\n        - score (`float`): the score given by the reward model for the instruction-response\n            pair or the conversation.\n\n    Categories:\n        - scorer\n\n    Examples:\n        Assigning an score for an instruction-response pair:\n\n        ```python\n        from distilabel.steps import RewardModelScore\n\n        step = RewardModelScore(\n            model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n        )\n\n        step.load()\n\n        result = next(\n            step.process(\n                inputs=[\n                    {\n                        \"instruction\": \"How much is 2+2?\",\n                        \"response\": \"The output of 2+2 is 4\",\n                    },\n                    {\"instruction\": \"How much is 2+2?\", \"response\": \"4\"},\n                ]\n            )\n        )\n        # [\n        #   {'instruction': 'How much is 2+2?', 'response': 'The output of 2+2 is 4', 'score': 0.11690367758274078},\n        #   {'instruction': 'How much is 2+2?', 'response': '4', 'score': 0.10300665348768234}\n        # ]\n        ```\n\n        Assigning an score for a multi-turn conversation:\n\n        ```python\n        from distilabel.steps import RewardModelScore\n\n        step = RewardModelScore(\n            model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n        )\n\n        step.load()\n\n        result = next(\n            step.process(\n                inputs=[\n                    {\n                        \"conversation\": [\n                            {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n                            {\"role\": \"assistant\", \"content\": \"The output of 2+2 is 4\"},\n                        ],\n                    },\n                    {\n                        \"conversation\": [\n 
                           {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n                            {\"role\": \"assistant\", \"content\": \"4\"},\n                        ],\n                    },\n                ]\n            )\n        )\n        # [\n        #   {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': 'The output of 2+2 is 4'}], 'score': 0.11690367758274078},\n        #   {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': '4'}], 'score': 0.10300665348768234}\n        # ]\n        ```\n    \"\"\"\n\n    model: str\n    revision: str = \"main\"\n    torch_dtype: str = \"auto\"\n    trust_remote_code: bool = False\n    device_map: Union[str, Dict[str, Any], None] = None\n    token: Union[SecretStr, None] = Field(\n        default_factory=lambda: os.getenv(HF_TOKEN_ENV_VAR), description=\"\"\n    )\n    truncation: bool = False\n    max_length: Union[int, None] = None\n\n    _model: Union[\"PreTrainedModel\", None] = PrivateAttr(None)\n    _tokenizer: Union[\"PreTrainedTokenizer\", None] = PrivateAttr(None)\n\n    def load(self) -> None:\n        super().load()\n\n        if self.device_map in [\"cuda\", \"auto\"]:\n            CudaDevicePlacementMixin.load(self)\n\n        try:\n            from transformers import AutoModelForSequenceClassification, AutoTokenizer\n        except ImportError as e:\n            raise ImportError(\n                \"`transformers` is not installed. 
Please install it using `pip install transformers`.\"\n            ) from e\n\n        token = self.token.get_secret_value() if self.token is not None else self.token\n\n        self._model = AutoModelForSequenceClassification.from_pretrained(\n            self.model,\n            revision=self.revision,\n            torch_dtype=self.torch_dtype,\n            trust_remote_code=self.trust_remote_code,\n            device_map=self.device_map,\n            token=token,\n        )\n        self._tokenizer = AutoTokenizer.from_pretrained(\n            self.model,\n            revision=self.revision,\n            torch_dtype=self.torch_dtype,\n            trust_remote_code=self.trust_remote_code,\n            token=token,\n        )\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"Either `response` and `instruction`, or a `conversation` columns.\"\"\"\n        return {\n            \"response\": False,\n            \"instruction\": False,\n            \"conversation\": False,\n        }\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The `score` given by the reward model.\"\"\"\n        return [\"score\"]\n\n    def _prepare_conversation(self, input: Dict[str, Any]) -> \"ChatType\":\n        if \"instruction\" in input and \"response\" in input:\n            return [\n                {\"role\": \"user\", \"content\": input[\"instruction\"]},\n                {\"role\": \"assistant\", \"content\": input[\"response\"]},\n            ]\n\n        return input[\"conversation\"]\n\n    def _prepare_inputs(self, inputs: List[Dict[str, Any]]) -> \"torch.Tensor\":\n        return self._tokenizer.apply_chat_template(  # type: ignore\n            [self._prepare_conversation(input) for input in inputs],  # type: ignore\n            return_tensors=\"pt\",\n            padding=True,\n            truncation=self.truncation,\n            max_length=self.max_length,\n        ).to(self._model.device)  # type: ignore\n\n    def 
_inference(self, inputs: List[Dict[str, Any]]) -> List[float]:\n        import torch\n\n        input_ids = self._prepare_inputs(inputs)\n        with torch.no_grad():\n            output = self._model(input_ids)  # type: ignore\n            logits = output.logits\n            if logits.shape == (2, 1):\n                logits = logits.squeeze(-1)\n            return logits.tolist()\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        scores = self._inference(inputs)\n        for input, score in zip(inputs, scores):\n            input[\"score\"] = score\n        yield inputs\n\n    def unload(self) -> None:\n        if self.device_map in [\"cuda\", \"auto\"]:\n            CudaDevicePlacementMixin.unload(self)\n        super().unload()\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.RewardModelScore.inputs","title":"inputs: StepColumns property","text":"

Either the response and instruction columns, or a conversation column.

"},{"location":"api/step_gallery/extra/#distilabel.steps.RewardModelScore.outputs","title":"outputs: StepColumns property","text":"

The score given by the reward model.

"},{"location":"api/step_gallery/extra/#distilabel.steps.TruncateTextColumn","title":"TruncateTextColumn","text":"

Bases: Step

Truncate a row using a tokenizer or the number of characters.

TruncateTextColumn is a Step that truncates a row according to the max length. If the tokenizer is provided, then the row will be truncated using the tokenizer, and the max_length will be used as the maximum number of tokens, otherwise it will be used as the maximum number of characters. The TruncateTextColumn step is useful when one wants to truncate a row to a certain length, to avoid posterior errors in the model due to the length.

Attributes:

Name Type Description column str

the column to truncate. Defaults to \"text\".

max_length int

the maximum length to use for truncation. If a tokenizer is given, corresponds to the number of tokens, otherwise corresponds to the number of characters. Defaults to 8192.

tokenizer Optional[str]

the name of the tokenizer to use. If provided, the row will be truncated using the tokenizer. Defaults to None.

Input columns
  • dynamic (determined by column attribute): The columns to be truncated, defaults to \"text\".
Output columns
  • dynamic (determined by column attribute): The truncated column.
Categories
  • text-manipulation

Examples:

Truncating a row to a given number of tokens:

from distilabel.steps import TruncateTextColumn\n\ntrunc = TruncateTextColumn(\n    tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    max_length=4,\n    column=\"text\"\n)\n\ntrunc.load()\n\nresult = next(\n    trunc.process(\n        [\n            {\"text\": \"This is a sample text that is longer than 10 characters\"}\n        ]\n    )\n)\n# result\n# [{'text': 'This is a sample'}]\n

Truncating a row to a given number of characters:

from distilabel.steps import TruncateTextColumn\n\ntrunc = TruncateTextColumn(max_length=10)\n\ntrunc.load()\n\nresult = next(\n    trunc.process(\n        [\n            {\"text\": \"This is a sample text that is longer than 10 characters\"}\n        ]\n    )\n)\n# result\n# [{'text': 'This is a '}]\n
Source code in src/distilabel/steps/truncate.py
class TruncateTextColumn(Step):\n    \"\"\"Truncate a row using a tokenizer or the number of characters.\n\n    `TruncateTextColumn` is a `Step` that truncates a row according to the max length. If\n    the `tokenizer` is provided, then the row will be truncated using the tokenizer,\n    and the `max_length` will be used as the maximum number of tokens, otherwise it will\n    be used as the maximum number of characters. The `TruncateTextColumn` step is useful when one\n    wants to truncate a row to a certain length, to avoid posterior errors in the model due\n    to the length.\n\n    Attributes:\n        column: the column to truncate. Defaults to `\"text\"`.\n        max_length: the maximum length to use for truncation.\n            If a `tokenizer` is given, corresponds to the number of tokens,\n            otherwise corresponds to the number of characters. Defaults to `8192`.\n        tokenizer: the name of the tokenizer to use. If provided, the row will be\n            truncated using the tokenizer. 
Defaults to `None`.\n\n    Input columns:\n        - dynamic (determined by `column` attribute): The columns to be truncated, defaults to \"text\".\n\n    Output columns:\n        - dynamic (determined by `column` attribute): The truncated column.\n\n    Categories:\n        - text-manipulation\n\n    Examples:\n        Truncating a row to a given number of tokens:\n\n        ```python\n        from distilabel.steps import TruncateTextColumn\n\n        trunc = TruncateTextColumn(\n            tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            max_length=4,\n            column=\"text\"\n        )\n\n        trunc.load()\n\n        result = next(\n            trunc.process(\n                [\n                    {\"text\": \"This is a sample text that is longer than 10 characters\"}\n                ]\n            )\n        )\n        # result\n        # [{'text': 'This is a sample'}]\n        ```\n\n        Truncating a row to a given number of characters:\n\n        ```python\n        from distilabel.steps import TruncateTextColumn\n\n        trunc = TruncateTextColumn(max_length=10)\n\n        trunc.load()\n\n        result = next(\n            trunc.process(\n                [\n                    {\"text\": \"This is a sample text that is longer than 10 characters\"}\n                ]\n            )\n        )\n        # result\n        # [{'text': 'This is a '}]\n        ```\n    \"\"\"\n\n    column: str = \"text\"\n    max_length: int = 8192\n    tokenizer: Optional[str] = None\n    _truncator: Optional[Callable[[str], str]] = None\n    _tokenizer: Optional[Any] = None\n\n    def load(self):\n        super().load()\n        if self.tokenizer:\n            if not importlib.util.find_spec(\"transformers\"):\n                raise ImportError(\n                    \"`transformers` is needed to tokenize, but is not installed. 
\"\n                    \"Please install it using `pip install transformers`.\"\n                )\n\n            from transformers import AutoTokenizer\n\n            self._tokenizer = AutoTokenizer.from_pretrained(self.tokenizer)\n            self._truncator = self._truncate_with_tokenizer\n        else:\n            self._truncator = self._truncate_with_length\n\n    @property\n    def inputs(self) -> List[str]:\n        return [self.column]\n\n    @property\n    def outputs(self) -> List[str]:\n        return self.inputs\n\n    def _truncate_with_length(self, text: str) -> str:\n        \"\"\"Truncates the text according to the number of characters.\"\"\"\n        return text[: self.max_length]\n\n    def _truncate_with_tokenizer(self, text: str) -> str:\n        \"\"\"Truncates the text according to the number of characters using the tokenizer.\"\"\"\n        return self._tokenizer.decode(\n            self._tokenizer.encode(\n                text,\n                add_special_tokens=False,\n                max_length=self.max_length,\n                truncation=True,\n            )\n        )\n\n    @override\n    def process(self, inputs: StepInput) -> \"StepOutput\":\n        for input in inputs:\n            input[self.column] = self._truncator(input[self.column])\n        yield inputs\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.TruncateTextColumn._truncate_with_length","title":"_truncate_with_length(text)","text":"

Truncates the text according to the number of characters.

Source code in src/distilabel/steps/truncate.py
def _truncate_with_length(self, text: str) -> str:\n    \"\"\"Truncates the text according to the number of characters.\"\"\"\n    return text[: self.max_length]\n
"},{"location":"api/step_gallery/extra/#distilabel.steps.TruncateTextColumn._truncate_with_tokenizer","title":"_truncate_with_tokenizer(text)","text":"

Truncates the text according to the number of tokens using the tokenizer.

Source code in src/distilabel/steps/truncate.py
def _truncate_with_tokenizer(self, text: str) -> str:\n    \"\"\"Truncates the text according to the number of characters using the tokenizer.\"\"\"\n    return self._tokenizer.decode(\n        self._tokenizer.encode(\n            text,\n            add_special_tokens=False,\n            max_length=self.max_length,\n            truncation=True,\n        )\n    )\n
"},{"location":"api/step_gallery/hugging_face/","title":"Hugging Face","text":"

This section contains the existing steps integrated with Hugging Face so as to easily push the generated datasets to Hugging Face.

"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromDisk","title":"LoadDataFromDisk","text":"

Bases: LoadDataFromHub

Load a dataset that was previously saved to disk.

If you previously saved your dataset using the save_to_disk method or Distiset.save_to_disk, you can load it again to build a new pipeline using this class.

Attributes:

Name Type Description dataset_path RuntimeParameter[Union[str, Path]]

The path to the dataset or distiset.

split Optional[RuntimeParameter[str]]

The split of the dataset to load (typically will be train, test or validation).

config Optional[RuntimeParameter[str]]

The configuration of the dataset to load. Defaults to default; if there are multiple configurations in the dataset, this must be supplied or an error is raised.

Runtime parameters
  • batch_size: The batch size to use when processing the data.
  • dataset_path: The path to the dataset or distiset.
  • is_distiset: Whether the dataset to load is a Distiset or not. Defaults to False.
  • split: The split of the dataset to load. By default will load the whole Dataset/Distiset.
  • config: The configuration of the dataset to load. Defaults to default; if there are multiple configurations in the dataset, this must be supplied or an error is raised.
  • num_examples: The number of examples to load from the dataset. By default will load all examples.
  • storage_options: Key/value pairs to be passed on to the file-system backend, if any. Defaults to None.
Output columns
  • dynamic (all): The columns that will be generated by this step, based on the dataset loaded from disk.
Categories
  • load

Examples:

Load data from a Hugging Face Dataset:

from distilabel.steps import LoadDataFromDisk\n\nloader = LoadDataFromDisk(dataset_path=\"path/to/dataset\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n

Load data from a distilabel Distiset:

from distilabel.steps import LoadDataFromDisk\n\n# Specify the configuration to load.\nloader = LoadDataFromDisk(\n    dataset_path=\"path/to/dataset\",\n    is_distiset=True,\n    config=\"leaf_step_1\"\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'a': 1}, {'a': 2}, {'a': 3}], True)\n

Load data from a Hugging Face Dataset or Distiset in your cloud provider:

from distilabel.steps import LoadDataFromDisk\n\nloader = LoadDataFromDisk(\n    dataset_path=\"gcs://path/to/dataset\",\n    storage_options={\"project\": \"experiments-0001\"}\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n
Source code in src/distilabel/steps/generators/huggingface.py
class LoadDataFromDisk(LoadDataFromHub):\n    \"\"\"Load a dataset that was previously saved to disk.\n\n    If you previously saved your dataset using the `save_to_disk` method, or\n    `Distiset.save_to_disk` you can load it again to build a new pipeline using this class.\n\n    Attributes:\n        dataset_path: The path to the dataset or distiset.\n        split: The split of the dataset to load (typically will be `train`, `test` or `validation`).\n        config: The configuration of the dataset to load. Defaults to `default`, if there are\n            multiple configurations in the dataset this must be suplied or an error is raised.\n\n    Runtime parameters:\n        - `batch_size`: The batch size to use when processing the data.\n        - `dataset_path`: The path to the dataset or distiset.\n        - `is_distiset`: Whether the dataset to load is a `Distiset` or not. Defaults to False.\n        - `split`: The split of the dataset to load. Defaults to 'train'.\n        - `config`: The configuration of the dataset to load. 
Defaults to `default`, if there are\n            multiple configurations in the dataset this must be suplied or an error is raised.\n        - `num_examples`: The number of examples to load from the dataset.\n            By default will load all examples.\n        - `storage_options`: Key/value pairs to be passed on to the file-system backend, if any.\n            Defaults to `None`.\n\n    Output columns:\n        - dynamic (`all`): The columns that will be generated by this step, based on the\n            datasets loaded from the Hugging Face Hub.\n\n    Categories:\n        - load\n\n    Examples:\n        Load data from a Hugging Face Dataset:\n\n        ```python\n        from distilabel.steps import LoadDataFromDisk\n\n        loader = LoadDataFromDisk(dataset_path=\"path/to/dataset\")\n        loader.load()\n\n        # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n        result = next(loader.process())\n        # >>> result\n        # ([{'type': 'function', 'function':...', False)\n        ```\n\n        Load data from a distilabel Distiset:\n\n        ```python\n        from distilabel.steps import LoadDataFromDisk\n\n        # Specify the configuration to load.\n        loader = LoadDataFromDisk(\n            dataset_path=\"path/to/dataset\",\n            is_distiset=True,\n            config=\"leaf_step_1\"\n        )\n        loader.load()\n\n        # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n        result = next(loader.process())\n        # >>> result\n        # ([{'a': 1}, {'a': 2}, {'a': 3}], True)\n        ```\n\n        Load data from a Hugging Face Dataset or Distiset in your cloud provider:\n\n        ```python\n        from distilabel.steps import LoadDataFromDisk\n\n        loader = LoadDataFromDisk(\n            dataset_path=\"gcs://path/to/dataset\",\n            storage_options={\"project\": \"experiments-0001\"}\n        )\n        loader.load()\n\n        # 
Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n        result = next(loader.process())\n        # >>> result\n        # ([{'type': 'function', 'function':...', False)\n        ```\n    \"\"\"\n\n    dataset_path: RuntimeParameter[Union[str, Path]] = Field(\n        default=None,\n        description=\"Path to the dataset or distiset.\",\n    )\n    config: Optional[RuntimeParameter[str]] = Field(\n        default=\"default\",\n        description=(\n            \"The configuration of the dataset to load. Will default to 'default'\",\n            \" which corresponds to a distiset with a single configuration.\",\n        ),\n    )\n    is_distiset: Optional[RuntimeParameter[bool]] = Field(\n        default=False,\n        description=\"Whether the dataset to load is a `Distiset` or not. Defaults to False.\",\n    )\n    keep_in_memory: Optional[RuntimeParameter[bool]] = Field(\n        default=None,\n        description=\"Whether to copy the dataset in-memory, see `datasets.Dataset.load_from_disk` \"\n        \" for more information. Defaults to `None`.\",\n    )\n    split: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The split of the dataset to load. By default will load the whole Dataset/Distiset.\",\n    )\n    repo_id: ExcludedField[Union[str, None]] = None\n\n    def load(self) -> None:\n        \"\"\"Load the dataset from the file/s in disk.\"\"\"\n        super(GeneratorStep, self).load()\n        if self.is_distiset:\n            ds = Distiset.load_from_disk(\n                self.dataset_path,\n                keep_in_memory=self.keep_in_memory,\n                storage_options=self.storage_options,\n            )\n            if self.config not in ds.keys():\n                raise DistilabelUserError(\n                    f\"Configuration '{self.config}' not found in the Distiset, available ones\"\n                    f\" are: {list(ds.keys())}. 
Please try changing the `config` parameter to one \"\n                    \"of the available configurations.\\n\\n\",\n                    page=\"sections/how_to_guides/advanced/distiset/#using-the-distiset-dataset-object\",\n                )\n            ds = ds[self.config]\n\n        else:\n            ds = load_from_disk(\n                self.dataset_path,\n                keep_in_memory=self.keep_in_memory,\n                storage_options=self.storage_options,\n            )\n\n        if self.split:\n            ds = ds[self.split]\n\n        self._dataset = ds\n\n        if self.num_examples:\n            self._dataset = self._dataset.select(range(self.num_examples))\n        else:\n            self.num_examples = len(self._dataset)\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The columns that will be generated by this step, based on the datasets from a file\n        in disk.\n\n        Returns:\n            The columns that will be generated by this step.\n        \"\"\"\n        # We assume there are Dataset/IterableDataset, not it's ...Dict counterparts\n        if self._dataset is None:\n            self.load()\n\n        return self._dataset.column_names\n
"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromDisk.outputs","title":"outputs: List[str] property","text":"

The columns that will be generated by this step, based on the datasets from a file on disk.

Returns:

Type Description List[str]

The columns that will be generated by this step.

"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromDisk.load","title":"load()","text":"

Load the dataset from the file(s) on disk.

Source code in src/distilabel/steps/generators/huggingface.py
def load(self) -> None:\n    \"\"\"Load the dataset from the file/s in disk.\"\"\"\n    super(GeneratorStep, self).load()\n    if self.is_distiset:\n        ds = Distiset.load_from_disk(\n            self.dataset_path,\n            keep_in_memory=self.keep_in_memory,\n            storage_options=self.storage_options,\n        )\n        if self.config not in ds.keys():\n            raise DistilabelUserError(\n                f\"Configuration '{self.config}' not found in the Distiset, available ones\"\n                f\" are: {list(ds.keys())}. Please try changing the `config` parameter to one \"\n                \"of the available configurations.\\n\\n\",\n                page=\"sections/how_to_guides/advanced/distiset/#using-the-distiset-dataset-object\",\n            )\n        ds = ds[self.config]\n\n    else:\n        ds = load_from_disk(\n            self.dataset_path,\n            keep_in_memory=self.keep_in_memory,\n            storage_options=self.storage_options,\n        )\n\n    if self.split:\n        ds = ds[self.split]\n\n    self._dataset = ds\n\n    if self.num_examples:\n        self._dataset = self._dataset.select(range(self.num_examples))\n    else:\n        self.num_examples = len(self._dataset)\n
"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromFileSystem","title":"LoadDataFromFileSystem","text":"

Bases: LoadDataFromHub

Loads a dataset from a file in your filesystem.

GeneratorStep that creates a dataset from a file in the filesystem using the Hugging Face datasets library. Take a look at Hugging Face Datasets for more information on the supported file types.

Attributes:

Name Type Description data_files RuntimeParameter[Union[str, Path]]

The path to the file, or directory containing the files that make up the dataset.

split RuntimeParameter[Union[str, Path]]

The split of the dataset to load (typically will be train, test or validation).

Runtime parameters
  • batch_size: The batch size to use when processing the data.
  • data_files: The path to the file, or directory containing the files that make up the dataset.
  • split: The split of the dataset to load. Defaults to 'train'.
  • streaming: Whether to load the dataset in streaming mode or not. Defaults to False.
  • num_examples: The number of examples to load from the dataset. By default will load all examples.
  • storage_options: Key/value pairs to be passed on to the file-system backend, if any. Defaults to None.
  • filetype: The expected filetype. If not provided, it will be inferred from the file extension. For more than one file, it will be inferred from the first file.
Output columns
  • dynamic (all): The columns that will be generated by this step, based on the dataset loaded from the file system.
Categories
  • load

Examples:

Load data from a Hugging Face dataset in your file system:

from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(data_files=\"path/to/dataset.jsonl\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n

Specify a filetype if the file extension is not expected:

from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(filetype=\"csv\", data_files=\"path/to/dataset.txtr\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n

Load data from a file in your cloud provider:

from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(\n    data_files=\"gcs://path/to/dataset\",\n    storage_options={\"project\": \"experiments-0001\"}\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n

Load data passing a glob pattern:

from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(\n    data_files=\"path/to/dataset/*.jsonl\",\n    streaming=True\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n
Source code in src/distilabel/steps/generators/huggingface.py
class LoadDataFromFileSystem(LoadDataFromHub):\n    \"\"\"Loads a dataset from a file in your filesystem.\n\n    `GeneratorStep` that creates a dataset from a file in the filesystem, uses Hugging Face `datasets`\n    library. Take a look at [Hugging Face Datasets](https://huggingface.co/docs/datasets/loading)\n    for more information of the supported file types.\n\n    Attributes:\n        data_files: The path to the file, or directory containing the files that conform\n            the dataset.\n        split: The split of the dataset to load (typically will be `train`, `test` or `validation`).\n\n    Runtime parameters:\n        - `batch_size`: The batch size to use when processing the data.\n        - `data_files`: The path to the file, or directory containing the files that conform\n            the dataset.\n        - `split`: The split of the dataset to load. Defaults to 'train'.\n        - `streaming`: Whether to load the dataset in streaming mode or not. Defaults to\n            `False`.\n        - `num_examples`: The number of examples to load from the dataset.\n            By default will load all examples.\n        - `storage_options`: Key/value pairs to be passed on to the file-system backend, if any.\n            Defaults to `None`.\n        - `filetype`: The expected filetype. 
If not provided, it will be inferred from the file extension.\n            For more than one file, it will be inferred from the first file.\n\n    Output columns:\n        - dynamic (`all`): The columns that will be generated by this step, based on the\n            datasets loaded from the Hugging Face Hub.\n\n    Categories:\n        - load\n\n    Examples:\n        Load data from a Hugging Face dataset in your file system:\n\n        ```python\n        from distilabel.steps import LoadDataFromFileSystem\n\n        loader = LoadDataFromFileSystem(data_files=\"path/to/dataset.jsonl\")\n        loader.load()\n\n        # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n        result = next(loader.process())\n        # >>> result\n        # ([{'type': 'function', 'function':...', False)\n        ```\n\n        Specify a filetype if the file extension is not expected:\n\n        ```python\n        from distilabel.steps import LoadDataFromFileSystem\n\n        loader = LoadDataFromFileSystem(filetype=\"csv\", data_files=\"path/to/dataset.txtr\")\n        loader.load()\n\n        # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n        result = next(loader.process())\n        # >>> result\n        # ([{'type': 'function', 'function':...', False)\n        ```\n\n        Load data from a file in your cloud provider:\n\n        ```python\n        from distilabel.steps import LoadDataFromFileSystem\n\n        loader = LoadDataFromFileSystem(\n            data_files=\"gcs://path/to/dataset\",\n            storage_options={\"project\": \"experiments-0001\"}\n        )\n        loader.load()\n\n        # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n        result = next(loader.process())\n        # >>> result\n        # ([{'type': 'function', 'function':...', False)\n        ```\n\n        Load data passing a glob pattern:\n\n        ```python\n        from 
distilabel.steps import LoadDataFromFileSystem\n\n        loader = LoadDataFromFileSystem(\n            data_files=\"path/to/dataset/*.jsonl\",\n            streaming=True\n        )\n        loader.load()\n\n        # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n        result = next(loader.process())\n        # >>> result\n        # ([{'type': 'function', 'function':...', False)\n        ```\n    \"\"\"\n\n    data_files: RuntimeParameter[Union[str, Path]] = Field(\n        default=None,\n        description=\"The data files, or directory containing the data files, to generate the dataset from.\",\n    )\n    filetype: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The expected filetype. If not provided, it will be inferred from the file extension.\",\n    )\n    repo_id: ExcludedField[Union[str, None]] = None\n\n    def load(self) -> None:\n        \"\"\"Load the dataset from the file/s in disk.\"\"\"\n        GeneratorStep.load(self)\n\n        data_path = UPath(self.data_files, storage_options=self.storage_options)\n\n        (data_files, self.filetype) = self._prepare_data_files(data_path)\n\n        self._dataset = load_dataset(\n            self.filetype,\n            data_files=data_files,\n            split=self.split,\n            streaming=self.streaming,\n            storage_options=self.storage_options,\n        )\n\n        if not self.streaming and self.num_examples:\n            self._dataset = self._dataset.select(range(self.num_examples))\n        if not self.num_examples:\n            if self.streaming:\n                # There's no better way to get the number of examples in a streaming dataset,\n                # load it again for the moment.\n                self.num_examples = len(\n                    load_dataset(\n                        self.filetype, data_files=self.data_files, split=self.split\n                    )\n                )\n            else:\n     
           self.num_examples = len(self._dataset)\n\n    @staticmethod\n    def _prepare_data_files(  # noqa: C901\n        data_path: UPath,\n    ) -> Tuple[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]], str]:\n        \"\"\"Prepare the loading process by setting the `data_files` attribute.\n\n        Args:\n            data_path: The path to the data files, or directory containing the data files.\n\n        Returns:\n            Tuple with the data files and the filetype.\n        \"\"\"\n\n        def get_filetype(data_path: UPath) -> str:\n            filetype = data_path.suffix.lstrip(\".\")\n            if filetype == \"jsonl\":\n                filetype = \"json\"\n            return filetype\n\n        if data_path.is_file() or (\n            len(str(data_path.parent.glob(data_path.name))) >= 1\n        ):\n            filetype = get_filetype(data_path)\n            data_files = str(data_path)\n\n        elif data_path.is_dir():\n            file_sequence = []\n            file_map = defaultdict(list)\n            for file_or_folder in data_path.iterdir():\n                if file_or_folder.is_file():\n                    file_sequence.append(str(file_or_folder))\n                elif file_or_folder.is_dir():\n                    for file in file_or_folder.iterdir():\n                        file_sequence.append(str(file))\n                        file_map[str(file_or_folder)].append(str(file))\n\n            data_files = file_sequence or file_map\n            # Try to obtain the filetype from any of the files, assuming all files have the same type.\n            if file_sequence:\n                filetype = get_filetype(UPath(file_sequence[0]))\n            else:\n                filetype = get_filetype(UPath(file_map[list(file_map.keys())[0]][0]))\n        return data_files, filetype\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The columns that will be generated by this step, based on the datasets from a file\n  
      in disk.\n\n        Returns:\n            The columns that will be generated by this step.\n        \"\"\"\n        # We assume there are Dataset/IterableDataset, not it's ...Dict counterparts\n        if self._dataset is None:\n            self.load()\n\n        return self._dataset.column_names\n
"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromFileSystem.outputs","title":"outputs: List[str] property","text":"

The columns that will be generated by this step, based on the datasets from a file on disk.

Returns:

Type Description List[str]

The columns that will be generated by this step.

"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromFileSystem.load","title":"load()","text":"

Load the dataset from the file(s) on disk.

Source code in src/distilabel/steps/generators/huggingface.py
def load(self) -> None:\n    \"\"\"Load the dataset from the file/s in disk.\"\"\"\n    GeneratorStep.load(self)\n\n    data_path = UPath(self.data_files, storage_options=self.storage_options)\n\n    (data_files, self.filetype) = self._prepare_data_files(data_path)\n\n    self._dataset = load_dataset(\n        self.filetype,\n        data_files=data_files,\n        split=self.split,\n        streaming=self.streaming,\n        storage_options=self.storage_options,\n    )\n\n    if not self.streaming and self.num_examples:\n        self._dataset = self._dataset.select(range(self.num_examples))\n    if not self.num_examples:\n        if self.streaming:\n            # There's no better way to get the number of examples in a streaming dataset,\n            # load it again for the moment.\n            self.num_examples = len(\n                load_dataset(\n                    self.filetype, data_files=self.data_files, split=self.split\n                )\n            )\n        else:\n            self.num_examples = len(self._dataset)\n
"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromHub","title":"LoadDataFromHub","text":"

Bases: GeneratorStep

Loads a dataset from the Hugging Face Hub.

GeneratorStep that loads a dataset from the Hugging Face Hub using the datasets library.

Attributes:

Name Type Description repo_id RuntimeParameter[str]

The Hugging Face Hub repository ID of the dataset to load.

split RuntimeParameter[str]

The split of the dataset to load.

config Optional[RuntimeParameter[str]]

The configuration of the dataset to load. This is optional and only needed if the dataset has multiple configurations.

Runtime parameters
  • batch_size: The batch size to use when processing the data.
  • repo_id: The Hugging Face Hub repository ID of the dataset to load.
  • split: The split of the dataset to load. Defaults to 'train'.
  • config: The configuration of the dataset to load. This is optional and only needed if the dataset has multiple configurations.
  • revision: The revision of the dataset to load. Defaults to the latest revision.
  • streaming: Whether to load the dataset in streaming mode or not. Defaults to False.
  • num_examples: The number of examples to load from the dataset. By default will load all examples.
  • storage_options: Key/value pairs to be passed on to the file-system backend, if any. Defaults to None.
Output columns
  • dynamic (all): The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub.
Categories
  • load

Examples:

Load data from a dataset in Hugging Face Hub:

from distilabel.steps import LoadDataFromHub\n\nloader = LoadDataFromHub(\n    repo_id=\"distilabel-internal-testing/instruction-dataset-mini\",\n    split=\"test\",\n    batch_size=2\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'prompt': 'Arianna has 12...', False)\n
Source code in src/distilabel/steps/generators/huggingface.py
class LoadDataFromHub(GeneratorStep):\n    \"\"\"Loads a dataset from the Hugging Face Hub.\n\n    `GeneratorStep` that loads a dataset from the Hugging Face Hub using the `datasets`\n    library.\n\n    Attributes:\n        repo_id: The Hugging Face Hub repository ID of the dataset to load.\n        split: The split of the dataset to load.\n        config: The configuration of the dataset to load. This is optional and only needed\n            if the dataset has multiple configurations.\n\n    Runtime parameters:\n        - `batch_size`: The batch size to use when processing the data.\n        - `repo_id`: The Hugging Face Hub repository ID of the dataset to load.\n        - `split`: The split of the dataset to load. Defaults to 'train'.\n        - `config`: The configuration of the dataset to load. This is optional and only\n            needed if the dataset has multiple configurations.\n        - `revision`: The revision of the dataset to load. Defaults to the latest revision.\n        - `streaming`: Whether to load the dataset in streaming mode or not. 
Defaults to\n            `False`.\n        - `num_examples`: The number of examples to load from the dataset.\n            By default will load all examples.\n        - `storage_options`: Key/value pairs to be passed on to the file-system backend, if any.\n            Defaults to `None`.\n\n    Output columns:\n        - dynamic (`all`): The columns that will be generated by this step, based on the\n            datasets loaded from the Hugging Face Hub.\n\n    Categories:\n        - load\n\n    Examples:\n        Load data from a dataset in Hugging Face Hub:\n\n        ```python\n        from distilabel.steps import LoadDataFromHub\n\n        loader = LoadDataFromHub(\n            repo_id=\"distilabel-internal-testing/instruction-dataset-mini\",\n            split=\"test\",\n            batch_size=2\n        )\n        loader.load()\n\n        # Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\n        result = next(loader.process())\n        # >>> result\n        # ([{'prompt': 'Arianna has 12...', False)\n        ```\n    \"\"\"\n\n    repo_id: RuntimeParameter[str] = Field(\n        default=None,\n        description=\"The Hugging Face Hub repository ID of the dataset to load.\",\n    )\n    split: RuntimeParameter[str] = Field(\n        default=\"train\",\n        description=\"The split of the dataset to load. Defaults to 'train'.\",\n    )\n    config: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The configuration of the dataset to load. This is optional and only\"\n        \" needed if the dataset has multiple configurations.\",\n    )\n    revision: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The revision of the dataset to load. Defaults to the latest revision.\",\n    )\n    streaming: RuntimeParameter[bool] = Field(\n        default=False,\n        description=\"Whether to load the dataset in streaming mode or not. 
Defaults to False.\",\n    )\n    num_examples: Optional[RuntimeParameter[int]] = Field(\n        default=None,\n        description=\"The number of examples to load from the dataset. By default will load all examples.\",\n    )\n    storage_options: Optional[Dict[str, Any]] = Field(\n        default=None,\n        description=\"The storage options to use when loading the dataset.\",\n    )\n\n    _dataset: Union[IterableDataset, Dataset, None] = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Load the dataset from the Hugging Face Hub\"\"\"\n        super().load()\n\n        if self._dataset is not None:\n            # Here to simplify the functionality of\u00a0distilabel.steps.generators.util.make_generator_step\n            return\n\n        self._dataset = load_dataset(\n            self.repo_id,  # type: ignore\n            self.config,\n            split=self.split,\n            revision=self.revision,\n            streaming=self.streaming,\n        )\n        num_examples = self._get_dataset_num_examples()\n        self.num_examples = (\n            min(self.num_examples, num_examples) if self.num_examples else num_examples\n        )\n\n        if not self.streaming:\n            self._dataset = self._dataset.select(range(self.num_examples))\n\n    def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n        \"\"\"Yields batches from the loaded dataset from the Hugging Face Hub.\n\n        Args:\n            offset: The offset to start yielding the data from. 
Will be used during the caching\n                process to help skipping already processed data.\n\n        Yields:\n            A tuple containing a batch of rows and a boolean indicating if the batch is\n            the last one.\n        \"\"\"\n        num_returned_rows = 0\n        for batch_num, batch in enumerate(\n            self._dataset.iter(batch_size=self.batch_size)  # type: ignore\n        ):\n            if batch_num * self.batch_size < offset:\n                continue\n            transformed_batch = self._transform_batch(batch)\n            batch_size = len(transformed_batch)\n            num_returned_rows += batch_size\n            yield transformed_batch, num_returned_rows >= self.num_examples\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The columns that will be generated by this step, based on the datasets loaded\n        from the Hugging Face Hub.\n\n        Returns:\n            The columns that will be generated by this step.\n        \"\"\"\n        return self._get_dataset_columns()\n\n    def _transform_batch(self, batch: Dict[str, Any]) -> List[Dict[str, Any]]:\n        \"\"\"Transform a batch of data from the Hugging Face Hub into a list of rows.\n\n        Args:\n            batch: The batch of data from the Hugging Face Hub.\n\n        Returns:\n            A list of rows, where each row is a dictionary of column names and values.\n        \"\"\"\n        length = len(next(iter(batch.values())))\n        rows = []\n        for i in range(length):\n            rows.append({col: values[i] for col, values in batch.items()})\n        return rows\n\n    def _get_dataset_num_examples(self) -> int:\n        \"\"\"Get the number of examples in the dataset, based on the `split` and `config`\n        runtime parameters provided.\n\n        Returns:\n            The number of examples in the dataset.\n        \"\"\"\n        default_config = self.config\n        if not default_config:\n            default_config = 
list(self._dataset_info.keys())[0]\n\n        return self._dataset_info[default_config].splits[self.split].num_examples\n\n    def _get_dataset_columns(self) -> List[str]:\n        \"\"\"Get the columns of the dataset, based on the `config` runtime parameter provided.\n\n        Returns:\n            The columns of the dataset.\n        \"\"\"\n        return list(\n            self._dataset_info[\n                self.config if self.config else \"default\"\n            ].features.keys()\n        )\n\n    @cached_property\n    def _dataset_info(self) -> Dict[str, DatasetInfo]:\n        \"\"\"Calls the Datasets Server API from Hugging Face to obtain the dataset information.\n\n        Returns:\n            The dataset information.\n        \"\"\"\n\n        try:\n            return get_dataset_infos(self.repo_id)\n        except Exception as e:\n            warnings.warn(\n                f\"Failed to get dataset info from Hugging Face Hub, trying to get it loading the dataset. Error: {e}\",\n                UserWarning,\n                stacklevel=2,\n            )\n            ds = load_dataset(self.repo_id, config=self.config, split=self.split)\n            if self.config:\n                return ds[self.config].info\n            return ds.info\n
"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromHub.outputs","title":"outputs: List[str] property","text":"

The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub.

Returns:

Type Description List[str]

The columns that will be generated by this step.

"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromHub.load","title":"load()","text":"

Load the dataset from the Hugging Face Hub

Source code in src/distilabel/steps/generators/huggingface.py
def load(self) -> None:\n    \"\"\"Load the dataset from the Hugging Face Hub\"\"\"\n    super().load()\n\n    if self._dataset is not None:\n        # Here to simplify the functionality of\u00a0distilabel.steps.generators.util.make_generator_step\n        return\n\n    self._dataset = load_dataset(\n        self.repo_id,  # type: ignore\n        self.config,\n        split=self.split,\n        revision=self.revision,\n        streaming=self.streaming,\n    )\n    num_examples = self._get_dataset_num_examples()\n    self.num_examples = (\n        min(self.num_examples, num_examples) if self.num_examples else num_examples\n    )\n\n    if not self.streaming:\n        self._dataset = self._dataset.select(range(self.num_examples))\n
"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.LoadDataFromHub.process","title":"process(offset=0)","text":"

Yields batches from the loaded dataset from the Hugging Face Hub.

Parameters:

Name Type Description Default offset int

The offset to start yielding the data from. Will be used during the caching process to help skipping already processed data.

0

Yields:

Type Description GeneratorStepOutput

A tuple containing a batch of rows and a boolean indicating if the batch is

GeneratorStepOutput

the last one.

Source code in src/distilabel/steps/generators/huggingface.py
def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n    \"\"\"Yields batches from the loaded dataset from the Hugging Face Hub.\n\n    Args:\n        offset: The offset to start yielding the data from. Will be used during the caching\n            process to help skipping already processed data.\n\n    Yields:\n        A tuple containing a batch of rows and a boolean indicating if the batch is\n        the last one.\n    \"\"\"\n    num_returned_rows = 0\n    for batch_num, batch in enumerate(\n        self._dataset.iter(batch_size=self.batch_size)  # type: ignore\n    ):\n        if batch_num * self.batch_size < offset:\n            continue\n        transformed_batch = self._transform_batch(batch)\n        batch_size = len(transformed_batch)\n        num_returned_rows += batch_size\n        yield transformed_batch, num_returned_rows >= self.num_examples\n
"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.PushToHub","title":"PushToHub","text":"

Bases: GlobalStep

Push data to a Hugging Face Hub dataset.

A GlobalStep which creates a datasets.Dataset with the input data and pushes it to the Hugging Face Hub.

Attributes:

Name Type Description repo_id RuntimeParameter[str]

The Hugging Face Hub repository ID where the dataset will be uploaded.

split RuntimeParameter[str]

The split of the dataset that will be pushed. Defaults to \"train\".

private RuntimeParameter[bool]

Whether the dataset to be pushed should be private or not. Defaults to False.

token Optional[RuntimeParameter[str]]

The token that will be used to authenticate in the Hub. If not provided, the step will try to obtain the token from the environment variable HF_TOKEN. If it is not provided using either of the previous methods, the huggingface_hub library will try to use the token from the local Hugging Face CLI configuration. Defaults to None.

Runtime parameters
  • repo_id: The Hugging Face Hub repository ID where the dataset will be uploaded.
  • split: The split of the dataset that will be pushed.
  • private: Whether the dataset to be pushed should be private or not.
  • token: The token that will be used to authenticate in the Hub.
Input columns
  • dynamic (all): all columns from the input will be used to create the dataset.
Categories
  • save
  • dataset
  • huggingface

Examples:

Push batches of your dataset to the Hugging Face Hub repository:

from distilabel.steps import PushToHub\n\npush = PushToHub(repo_id=\"path_to/repo\")\npush.load()\n\nresult = next(\n    push.process(\n        [\n            {\n                \"instruction\": \"instruction \",\n                \"generation\": \"generation\"\n            }\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'instruction ', 'generation': 'generation'}]\n
Source code in src/distilabel/steps/globals/huggingface.py
class PushToHub(GlobalStep):\n    \"\"\"Push data to a Hugging Face Hub dataset.\n\n    A `GlobalStep` which creates a `datasets.Dataset` with the input data and pushes\n    it to the Hugging Face Hub.\n\n    Attributes:\n        repo_id: The Hugging Face Hub repository ID where the dataset will be uploaded.\n        split: The split of the dataset that will be pushed. Defaults to `\"train\"`.\n        private: Whether the dataset to be pushed should be private or not. Defaults to\n            `False`.\n        token: The token that will be used to authenticate in the Hub. If not provided, the\n            token will be tried to be obtained from the environment variable `HF_TOKEN`.\n            If not provided using one of the previous methods, then `huggingface_hub` library\n            will try to use the token from the local Hugging Face CLI configuration. Defaults\n            to `None`.\n\n    Runtime parameters:\n        - `repo_id`: The Hugging Face Hub repository ID where the dataset will be uploaded.\n        - `split`: The split of the dataset that will be pushed.\n        - `private`: Whether the dataset to be pushed should be private or not.\n        - `token`: The token that will be used to authenticate in the Hub.\n\n    Input columns:\n        - dynamic (`all`): all columns from the input will be used to create the dataset.\n\n    Categories:\n        - save\n        - dataset\n        - huggingface\n\n    Examples:\n        Push batches of your dataset to the Hugging Face Hub repository:\n\n        ```python\n        from distilabel.steps import PushToHub\n\n        push = PushToHub(repo_id=\"path_to/repo\")\n        push.load()\n\n        result = next(\n            push.process(\n                [\n                    {\n                        \"instruction\": \"instruction \",\n                        \"generation\": \"generation\"\n                    }\n                ],\n            )\n        )\n        # >>> result\n        # 
[{'instruction': 'instruction ', 'generation': 'generation'}]\n        ```\n    \"\"\"\n\n    repo_id: RuntimeParameter[str] = Field(\n        default=None,\n        description=\"The Hugging Face Hub repository ID where the dataset will be uploaded.\",\n    )\n    split: RuntimeParameter[str] = Field(\n        default=\"train\",\n        description=\"The split of the dataset that will be pushed. Defaults to 'train'.\",\n    )\n    private: RuntimeParameter[bool] = Field(\n        default=False,\n        description=\"Whether the dataset to be pushed should be private or not. Defaults\"\n        \" to `False`.\",\n    )\n    token: Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The token that will be used to authenticate in the Hub. If not provided,\"\n        \" the token will be tried to be obtained from the environment variable `HF_TOKEN`.\"\n        \" If not provided using one of the previous methods, then `huggingface_hub` library\"\n        \" will try to use the token from the local Hugging Face CLI configuration. 
Defaults\"\n        \" to `None`\",\n    )\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Method that processes the input data, respecting the `datasets.Dataset` formatting,\n        and pushes it to the Hugging Face Hub based on the `RuntimeParameter`s attributes.\n\n        Args:\n            inputs: that input data within a single object (as it's a GlobalStep) that\n                will be transformed into a `datasets.Dataset`.\n\n        Yields:\n            Propagates the received inputs so that the `Distiset` can be generated if this is\n            the last step of the `Pipeline`, or if this is not a leaf step and has follow up\n            steps.\n        \"\"\"\n        dataset_dict = defaultdict(list)\n        for input in inputs:\n            for key, value in input.items():\n                dataset_dict[key].append(value)\n        dataset_dict = dict(dataset_dict)\n        dataset = Dataset.from_dict(dataset_dict)\n        dataset.push_to_hub(\n            self.repo_id,  # type: ignore\n            split=self.split,\n            private=self.private,\n            token=self.token or os.getenv(\"HF_TOKEN\"),\n        )\n        yield inputs\n
"},{"location":"api/step_gallery/hugging_face/#distilabel.steps.PushToHub.process","title":"process(inputs)","text":"

Method that processes the input data, respecting the datasets.Dataset formatting, and pushes it to the Hugging Face Hub based on the RuntimeParameters attributes.

Parameters:

Name Type Description Default inputs StepInput

the input data within a single object (as it's a GlobalStep) that will be transformed into a datasets.Dataset.

required

Yields:

Type Description StepOutput

Propagates the received inputs so that the Distiset can be generated if this is

StepOutput

the last step of the Pipeline, or if this is not a leaf step and has follow up

StepOutput

steps.

Source code in src/distilabel/steps/globals/huggingface.py
def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Method that processes the input data, respecting the `datasets.Dataset` formatting,\n    and pushes it to the Hugging Face Hub based on the `RuntimeParameter`s attributes.\n\n    Args:\n        inputs: that input data within a single object (as it's a GlobalStep) that\n            will be transformed into a `datasets.Dataset`.\n\n    Yields:\n        Propagates the received inputs so that the `Distiset` can be generated if this is\n        the last step of the `Pipeline`, or if this is not a leaf step and has follow up\n        steps.\n    \"\"\"\n    dataset_dict = defaultdict(list)\n    for input in inputs:\n        for key, value in input.items():\n            dataset_dict[key].append(value)\n    dataset_dict = dict(dataset_dict)\n    dataset = Dataset.from_dict(dataset_dict)\n    dataset.push_to_hub(\n        self.repo_id,  # type: ignore\n        split=self.split,\n        private=self.private,\n        token=self.token or os.getenv(\"HF_TOKEN\"),\n    )\n    yield inputs\n
"},{"location":"api/task/","title":"Task","text":"

This section contains the API reference for the distilabel tasks.

For more information on how the Task works and to see some examples, check the Tutorial - Task page.

"},{"location":"api/task/#distilabel.steps.tasks.base","title":"base","text":""},{"location":"api/task/#distilabel.steps.tasks.base._Task","title":"_Task","text":"

Bases: _Step, ABC

_Task is an abstract class that implements the _Step interface and adds the format_input and format_output methods to format the inputs and outputs of the task. It also adds a llm attribute to be used as the LLM to generate the outputs.

Attributes:

Name Type Description llm LLM

the LLM to be used to generate the outputs of the task.

group_generations bool

whether to group the num_generations generated per input in a list or create a row per generation. Defaults to False.

add_raw_output RuntimeParameter[bool]

whether to include a field with the raw output of the LLM in the distilabel_metadata field of the output. Can be helpful to not lose data with Tasks that need to format the output of the LLM. Defaults to False.

num_generations RuntimeParameter[int]

The number of generations to be produced per input.

Source code in src/distilabel/steps/tasks/base.py
class _Task(_Step, ABC):\n    \"\"\"_Task is an abstract class that implements the `_Step` interface and adds the\n    `format_input` and `format_output` methods to format the inputs and outputs of the\n    task. It also adds a `llm` attribute to be used as the LLM to generate the outputs.\n\n    Attributes:\n        llm: the `LLM` to be used to generate the outputs of the task.\n        group_generations: whether to group the `num_generations` generated per input in\n            a list or create a row per generation. Defaults to `False`.\n        add_raw_output: whether to include a field with the raw output of the LLM in the\n            `distilabel_metadata` field of the output. Can be helpful to not loose data\n            with `Tasks` that need to format the output of the `LLM`. Defaults to `False`.\n        num_generations: The number of generations to be produced per input.\n    \"\"\"\n\n    llm: LLM\n\n    group_generations: bool = False\n    add_raw_output: RuntimeParameter[bool] = Field(\n        default=True,\n        description=(\n            \"Whether to include the raw output of the LLM in the key `raw_output_<TASK_NAME>`\"\n            \" of the `distilabel_metadata` dictionary output column\"\n        ),\n    )\n    add_raw_input: RuntimeParameter[bool] = Field(\n        default=True,\n        description=(\n            \"Whether to include the raw input of the LLM in the key `raw_input_<TASK_NAME>`\"\n            \" of the `distilabel_metadata` dictionary column\"\n        ),\n    )\n    num_generations: RuntimeParameter[int] = Field(\n        default=1, description=\"The number of generations to be produced per input.\"\n    )\n    use_default_structured_output: bool = False\n\n    _can_be_used_with_offline_batch_generation: bool = PrivateAttr(False)\n\n    def model_post_init(self, __context: Any) -> None:\n        if (\n            self.llm.use_offline_batch_generation\n            and not self._can_be_used_with_offline_batch_generation\n      
  ):\n            raise DistilabelUserError(\n                f\"`{self.__class__.__name__}` task cannot be used with offline batch generation\"\n                \" feature.\",\n                page=\"sections/how_to_guides/advanced/offline-batch-generation\",\n            )\n\n        super().model_post_init(__context)\n\n    @property\n    def is_global(self) -> bool:\n        \"\"\"Extends the `is_global` property to return `True` if the task is using the\n        offline batch generation feature, otherwise it returns the value of the parent\n        class property. `offline_batch_generation` requires to receive all the inputs\n        at once, so for the `_BatchManager` this is a global step.\n\n        Returns:\n            Whether the task is a global step or not.\n        \"\"\"\n        if self.llm.use_offline_batch_generation:\n            return True\n\n        return super().is_global\n\n    def load(self) -> None:\n        \"\"\"Loads the LLM via the `LLM.load()` method.\"\"\"\n        super().load()\n        self._set_default_structured_output()\n        self.llm.load()\n\n    @override\n    def unload(self) -> None:\n        \"\"\"Unloads the LLM.\"\"\"\n        self._logger.debug(\"Executing task unload logic.\")\n        self.llm.unload()\n\n    @override\n    def impute_step_outputs(\n        self, step_output: List[Dict[str, Any]]\n    ) -> List[Dict[str, Any]]:\n        \"\"\"\n        Imputes the outputs of the task in case the LLM failed to generate a response.\n        \"\"\"\n        result = []\n        for row in step_output:\n            data = row.copy()\n            for output in self.get_outputs().keys():\n                data[output] = None\n            data = self._maybe_add_raw_input_output(\n                data,\n                None,\n                None,\n                add_raw_output=self.add_raw_output,\n                add_raw_input=self.add_raw_input,\n            )\n            result.append(data)\n        return 
result\n\n    @abstractmethod\n    def format_output(\n        self,\n        output: Union[str, None],\n        input: Union[Dict[str, Any], None] = None,\n    ) -> Dict[str, Any]:\n        \"\"\"Abstract method to format the outputs of the task. It needs to receive an output\n        as a string, and generates a Python dictionary with the outputs of the task. In\n        addition the `input` used to generate the output is also received just in case it's\n        needed to be able to parse the output correctly.\n        \"\"\"\n        pass\n\n    def _format_outputs(\n        self,\n        outputs: \"GenerateOutput\",\n        input: Union[Dict[str, Any], None] = None,\n    ) -> List[Dict[str, Any]]:\n        \"\"\"Formats the outputs of the task using the `format_output` method. If the output\n        is `None` (i.e. the LLM failed to generate a response), then the outputs will be\n        set to `None` as well.\n\n        Args:\n            outputs: The outputs (`n` generations) for the provided `input`.\n            input: The input used to generate the output.\n\n        Returns:\n            A list containing a dictionary with the outputs of the task for each input.\n        \"\"\"\n        inputs = [None] if input is None else [input]\n\n        formatted_outputs = []\n        for output, input in zip(outputs, inputs * len(outputs)):  # type: ignore\n            try:\n                formatted_output = self.format_output(output, input)\n                formatted_output = self._maybe_add_raw_input_output(\n                    formatted_output,\n                    output,\n                    input,\n                    add_raw_output=self.add_raw_output,  # type: ignore\n                    add_raw_input=self.add_raw_input,  # type: ignore\n                )\n                formatted_outputs.append(formatted_output)\n            except Exception as e:\n                self._logger.warning(  # type: ignore\n                    f\"Task '{self.name}' failed 
to format output: {e}. Saving raw response.\"  # type: ignore\n                )\n                formatted_outputs.append(self._output_on_failure(output, input))\n        return formatted_outputs\n\n    def _output_on_failure(\n        self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n    ) -> Dict[str, Any]:\n        \"\"\"In case of failure to format the output, this method will return a dictionary including\n        a new field `distilabel_meta` with the raw output of the LLM.\n        \"\"\"\n        # Create a dictionary with the outputs of the task (every output set to None)\n        outputs = {output: None for output in self.outputs}\n        outputs[\"model_name\"] = self.llm.model_name  # type: ignore\n        outputs = self._maybe_add_raw_input_output(\n            outputs,\n            output,\n            input,\n            add_raw_output=self.add_raw_output,  # type: ignore\n            add_raw_input=self.add_raw_input,  # type: ignore\n        )\n        return outputs\n\n    def _maybe_add_raw_input_output(\n        self,\n        output: Dict[str, Any],\n        raw_output: Union[str, None],\n        input: Union[str, None],\n        add_raw_output: bool = True,\n        add_raw_input: bool = True,\n    ):\n        \"\"\"Adds the raw output and or the formatted input of the LLM to the output dictionary\n        if `add_raw_output` is True or `add_raw_input` is True.\n        \"\"\"\n        meta = output.get(DISTILABEL_METADATA_KEY, {})\n\n        if add_raw_output:\n            meta[f\"raw_output_{self.name}\"] = raw_output\n        if add_raw_input:\n            meta[f\"raw_input_{self.name}\"] = self.format_input(input) if input else None\n        if meta:\n            output[DISTILABEL_METADATA_KEY] = meta\n\n        return output\n\n    def _set_default_structured_output(self) -> None:\n        \"\"\"Prepares the structured output to be set in the selected `LLM`.\n\n        If the method `get_structured_output` 
returns None (the default), there's no need\n        to set anything, as it doesn't apply.\n        If the `use_default_structured_output` and there's no previous structured output\n        set by hand, then decide the type of structured output to select depending on the\n        `LLM` provider.\n        \"\"\"\n        schema = self.get_structured_output()\n        if not schema:\n            return\n\n        if self.use_default_structured_output and not self.llm.structured_output:\n            # In case the default structured output is required, we have to set it before\n            # the LLM is loaded\n            from distilabel.models.llms import InferenceEndpointsLLM\n            from distilabel.models.llms.base import AsyncLLM\n\n            def check_dependency(module_name: str) -> None:\n                if not importlib.util.find_spec(module_name):\n                    raise ImportError(\n                        f\"`{module_name}` is not installed and is needed for the structured generation with this LLM.\"\n                        f\" Please install it using `pip install {module_name}`.\"\n                    )\n\n            dependency = \"outlines\"\n            structured_output = {\"schema\": schema}\n            if isinstance(self.llm, InferenceEndpointsLLM):\n                structured_output.update({\"format\": \"json\"})\n            # To determine instructor or outlines format\n            elif isinstance(self.llm, AsyncLLM) and not isinstance(\n                self.llm, InferenceEndpointsLLM\n            ):\n                dependency = \"instructor\"\n                structured_output.update({\"format\": \"json\"})\n\n            check_dependency(dependency)\n            self.llm.structured_output = structured_output\n\n    def get_structured_output(self) -> Union[Dict[str, Any], None]:\n        \"\"\"Returns the structured output for a task that implements one by default,\n        must be overriden by subclasses of `Task`. 
When implemented, should be a json\n        schema that enforces the response from the LLM so that it's easier to parse.\n        \"\"\"\n        return None\n\n    def _sample_input(self) -> \"ChatType\":\n        \"\"\"Returns a sample input to be used in the `print` method.\n        Tasks that don't adhere to a format input that returns a map of the type\n        str -> str should override this method to return a sample input.\n        \"\"\"\n        return self.format_input(\n            {input: f\"<PLACEHOLDER_{input.upper()}>\" for input in self.inputs}\n        )\n\n    def print(self, sample_input: Optional[\"ChatType\"] = None) -> None:\n        \"\"\"Prints a sample input to the console using the `rich` library.\n        Helper method to visualize the prompt of the task.\n\n        Args:\n            sample_input: A sample input to be printed. If not provided, a default will be\n                generated using the `_sample_input` method, which can be overriden by\n                subclasses. 
This should correspond to the same example you could pass to\n                the `format_input` method.\n                The variables be named <PLACEHOLDER_VARIABLE_NAME> by default.\n\n        Examples:\n            Print the URIAL prompt:\n\n            ```python\n            from distilabel.steps.tasks import URIAL\n            from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n            # Consider this as a placeholder for your actual LLM.\n            urial = URIAL(\n                llm=InferenceEndpointsLLM(\n                    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                ),\n            )\n            urial.load()\n            urial.print()\n            \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 Prompt: URIAL  \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n            \u2502 \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 User Message \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e \u2502\n            \u2502 \u2502 # Instruction                                                                             \u2502 \u2502\n            \u2502 \u2502                                                                                           \u2502 
\u2502\n            \u2502 \u2502 Below is a list of conversations between a human and an AI assistant (you).               \u2502 \u2502\n            \u2502 \u2502 Users place their queries under \"# User:\", and your responses are under  \"# Assistant:\".  \u2502 \u2502\n            \u2502 \u2502 You are a helpful, respectful, and honest assistant.                                      \u2502 \u2502\n            \u2502 \u2502 You should always answer as helpfully as possible while ensuring safety.                  \u2502 \u2502\n            \u2502 \u2502 Your answers should be well-structured and provide detailed information. They should also \u2502 \u2502\n            \u2502 \u2502 have an engaging tone.                                                                    \u2502 \u2502\n            \u2502 \u2502 Your responses must not contain any fake, harmful, unethical, racist, sexist, toxic,      \u2502 \u2502\n            \u2502 \u2502 dangerous, or illegal content, even if it may be helpful.                                 \u2502 \u2502\n            \u2502 \u2502 Your response must be socially responsible, and thus you can refuse to answer some        \u2502 \u2502\n            \u2502 \u2502 controversial topics.                                                                     
\u2502 \u2502\n            \u2502 \u2502                                                                                           \u2502 \u2502\n            \u2502 \u2502                                                                                           \u2502 \u2502\n            \u2502 \u2502 # User:                                                                                   \u2502 \u2502\n            \u2502 \u2502                                                                                           \u2502 \u2502\n            \u2502 \u2502 <PLACEHOLDER_INSTRUCTION>                                                                 \u2502 \u2502\n            \u2502 \u2502                                                                                           \u2502 \u2502\n            \u2502 \u2502 # Assistant:                                                                              \u2502 \u2502\n            \u2502 \u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f \u2502\n            
\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n            ```\n        \"\"\"\n        from rich.console import Console, Group\n        from rich.panel import Panel\n        from rich.text import Text\n\n        console = Console()\n        sample_input = sample_input or self._sample_input()\n\n        panels = []\n        for item in sample_input:\n            content = Text.assemble((item.get(\"content\", \"\"),))\n            panel = Panel(\n                content,\n                title=f\"[bold][magenta]{item.get('role', '').capitalize()} Message[/magenta][/bold]\",\n                border_style=\"light_cyan3\",\n            )\n            panels.append(panel)\n\n        # Create a group of panels\n        # Wrap the group in an outer panel\n        outer_panel = Panel(\n            Group(*panels),\n            title=f\"[bold][magenta]Prompt: {type(self).__name__} [/magenta][/bold]\",\n            border_style=\"light_cyan3\",\n            expand=False,\n        )\n        console.print(outer_panel)\n
"},{"location":"api/task/#distilabel.steps.tasks.base._Task.is_global","title":"is_global: bool property","text":"

Extends the is_global property to return True if the task is using the offline batch generation feature, otherwise it returns the value of the parent class property. offline_batch_generation requires receiving all the inputs at once, so for the _BatchManager this is a global step.

Returns:

Type Description bool

Whether the task is a global step or not.

"},{"location":"api/task/#distilabel.steps.tasks.base._Task.load","title":"load()","text":"

Loads the LLM via the LLM.load() method.

Source code in src/distilabel/steps/tasks/base.py
def load(self) -> None:\n    \"\"\"Loads the LLM via the `LLM.load()` method.\"\"\"\n    super().load()\n    self._set_default_structured_output()\n    self.llm.load()\n
"},{"location":"api/task/#distilabel.steps.tasks.base._Task.unload","title":"unload()","text":"

Unloads the LLM.

Source code in src/distilabel/steps/tasks/base.py
@override\ndef unload(self) -> None:\n    \"\"\"Unloads the LLM.\"\"\"\n    self._logger.debug(\"Executing task unload logic.\")\n    self.llm.unload()\n
"},{"location":"api/task/#distilabel.steps.tasks.base._Task.impute_step_outputs","title":"impute_step_outputs(step_output)","text":"

Imputes the outputs of the task in case the LLM failed to generate a response.

Source code in src/distilabel/steps/tasks/base.py
@override\ndef impute_step_outputs(\n    self, step_output: List[Dict[str, Any]]\n) -> List[Dict[str, Any]]:\n    \"\"\"\n    Imputes the outputs of the task in case the LLM failed to generate a response.\n    \"\"\"\n    result = []\n    for row in step_output:\n        data = row.copy()\n        for output in self.get_outputs().keys():\n            data[output] = None\n        data = self._maybe_add_raw_input_output(\n            data,\n            None,\n            None,\n            add_raw_output=self.add_raw_output,\n            add_raw_input=self.add_raw_input,\n        )\n        result.append(data)\n    return result\n
"},{"location":"api/task/#distilabel.steps.tasks.base._Task.format_output","title":"format_output(output, input=None) abstractmethod","text":"

Abstract method to format the outputs of the task. It needs to receive an output as a string, and generates a Python dictionary with the outputs of the task. In addition the input used to generate the output is also received just in case it's needed to be able to parse the output correctly.

Source code in src/distilabel/steps/tasks/base.py
@abstractmethod\ndef format_output(\n    self,\n    output: Union[str, None],\n    input: Union[Dict[str, Any], None] = None,\n) -> Dict[str, Any]:\n    \"\"\"Abstract method to format the outputs of the task. It needs to receive an output\n    as a string, and generates a Python dictionary with the outputs of the task. In\n    addition the `input` used to generate the output is also received just in case it's\n    needed to be able to parse the output correctly.\n    \"\"\"\n    pass\n
"},{"location":"api/task/#distilabel.steps.tasks.base._Task.get_structured_output","title":"get_structured_output()","text":"

Returns the structured output for a task that implements one by default, must be overridden by subclasses of Task. When implemented, should be a json schema that enforces the response from the LLM so that it's easier to parse.

Source code in src/distilabel/steps/tasks/base.py
def get_structured_output(self) -> Union[Dict[str, Any], None]:\n    \"\"\"Returns the structured output for a task that implements one by default,\n    must be overriden by subclasses of `Task`. When implemented, should be a json\n    schema that enforces the response from the LLM so that it's easier to parse.\n    \"\"\"\n    return None\n
"},{"location":"api/task/#distilabel.steps.tasks.base._Task.print","title":"print(sample_input=None)","text":"

Prints a sample input to the console using the rich library. Helper method to visualize the prompt of the task.

Parameters:

Name Type Description Default sample_input Optional[ChatType]

A sample input to be printed. If not provided, a default will be generated using the _sample_input method, which can be overridden by subclasses. This should correspond to the same example you could pass to the format_input method. The variables will be named <PLACEHOLDER_VARIABLE_NAME> by default. None

Examples:

Print the URIAL prompt:

from distilabel.steps.tasks import URIAL\nfrom distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nurial = URIAL(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n)\nurial.load()\nurial.print()\n\u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 Prompt: URIAL  \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 User Message \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e \u2502\n\u2502 \u2502 # Instruction                                                                             \u2502 \u2502\n\u2502 \u2502                                                                                           \u2502 \u2502\n\u2502 \u2502 Below is a list of conversations between a human and an AI assistant (you).               \u2502 \u2502\n\u2502 \u2502 Users place their queries under \"# User:\", and your responses are under  \"# Assistant:\".  \u2502 \u2502\n\u2502 \u2502 You are a helpful, respectful, and honest assistant.                                      
\u2502 \u2502\n\u2502 \u2502 You should always answer as helpfully as possible while ensuring safety.                  \u2502 \u2502\n\u2502 \u2502 Your answers should be well-structured and provide detailed information. They should also \u2502 \u2502\n\u2502 \u2502 have an engaging tone.                                                                    \u2502 \u2502\n\u2502 \u2502 Your responses must not contain any fake, harmful, unethical, racist, sexist, toxic,      \u2502 \u2502\n\u2502 \u2502 dangerous, or illegal content, even if it may be helpful.                                 \u2502 \u2502\n\u2502 \u2502 Your response must be socially responsible, and thus you can refuse to answer some        \u2502 \u2502\n\u2502 \u2502 controversial topics.                                                                     \u2502 \u2502\n\u2502 \u2502                                                                                           \u2502 \u2502\n\u2502 \u2502                                                                                           \u2502 \u2502\n\u2502 \u2502 # User:                                                                                   \u2502 \u2502\n\u2502 \u2502                                                                                           \u2502 \u2502\n\u2502 \u2502 <PLACEHOLDER_INSTRUCTION>                                                                 \u2502 \u2502\n\u2502 \u2502                                                                                           \u2502 \u2502\n\u2502 \u2502 # Assistant:                                                                              \u2502 \u2502\n\u2502 
\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f \u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n
Source code in src/distilabel/steps/tasks/base.py
def print(self, sample_input: Optional[\"ChatType\"] = None) -> None:\n    \"\"\"Prints a sample input to the console using the `rich` library.\n    Helper method to visualize the prompt of the task.\n\n    Args:\n        sample_input: A sample input to be printed. If not provided, a default will be\n            generated using the `_sample_input` method, which can be overriden by\n            subclasses. This should correspond to the same example you could pass to\n            the `format_input` method.\n            The variables be named <PLACEHOLDER_VARIABLE_NAME> by default.\n\n    Examples:\n        Print the URIAL prompt:\n\n        ```python\n        from distilabel.steps.tasks import URIAL\n        from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        urial = URIAL(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n        )\n        urial.load()\n        urial.print()\n        \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 Prompt: URIAL  \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n        \u2502 \u256d\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500 User Message 
\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e \u2502\n        \u2502 \u2502 # Instruction                                                                             \u2502 \u2502\n        \u2502 \u2502                                                                                           \u2502 \u2502\n        \u2502 \u2502 Below is a list of conversations between a human and an AI assistant (you).               \u2502 \u2502\n        \u2502 \u2502 Users place their queries under \"# User:\", and your responses are under  \"# Assistant:\".  \u2502 \u2502\n        \u2502 \u2502 You are a helpful, respectful, and honest assistant.                                      \u2502 \u2502\n        \u2502 \u2502 You should always answer as helpfully as possible while ensuring safety.                  \u2502 \u2502\n        \u2502 \u2502 Your answers should be well-structured and provide detailed information. They should also \u2502 \u2502\n        \u2502 \u2502 have an engaging tone.                                                                    \u2502 \u2502\n        \u2502 \u2502 Your responses must not contain any fake, harmful, unethical, racist, sexist, toxic,      \u2502 \u2502\n        \u2502 \u2502 dangerous, or illegal content, even if it may be helpful.                                 \u2502 \u2502\n        \u2502 \u2502 Your response must be socially responsible, and thus you can refuse to answer some        \u2502 \u2502\n        \u2502 \u2502 controversial topics.                                                                     
\u2502 \u2502\n        \u2502 \u2502                                                                                           \u2502 \u2502\n        \u2502 \u2502                                                                                           \u2502 \u2502\n        \u2502 \u2502 # User:                                                                                   \u2502 \u2502\n        \u2502 \u2502                                                                                           \u2502 \u2502\n        \u2502 \u2502 <PLACEHOLDER_INSTRUCTION>                                                                 \u2502 \u2502\n        \u2502 \u2502                                                                                           \u2502 \u2502\n        \u2502 \u2502 # Assistant:                                                                              \u2502 \u2502\n        \u2502 \u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f \u2502\n        
\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n        ```\n    \"\"\"\n    from rich.console import Console, Group\n    from rich.panel import Panel\n    from rich.text import Text\n\n    console = Console()\n    sample_input = sample_input or self._sample_input()\n\n    panels = []\n    for item in sample_input:\n        content = Text.assemble((item.get(\"content\", \"\"),))\n        panel = Panel(\n            content,\n            title=f\"[bold][magenta]{item.get('role', '').capitalize()} Message[/magenta][/bold]\",\n            border_style=\"light_cyan3\",\n        )\n        panels.append(panel)\n\n    # Create a group of panels\n    # Wrap the group in an outer panel\n    outer_panel = Panel(\n        Group(*panels),\n        title=f\"[bold][magenta]Prompt: {type(self).__name__} [/magenta][/bold]\",\n        border_style=\"light_cyan3\",\n        expand=False,\n    )\n    console.print(outer_panel)\n
"},{"location":"api/task/#distilabel.steps.tasks.base.Task","title":"Task","text":"

Bases: _Task, Step

Task is a class that implements the _Task abstract class and adds the Step interface to be used as a step in the pipeline.

Attributes:

Name Type Description llm

the LLM to be used to generate the outputs of the task.

group_generations

whether to group the num_generations generated per input in a list or create a row per generation. Defaults to False.

num_generations

The number of generations to be produced per input.

Source code in src/distilabel/steps/tasks/base.py
class Task(_Task, Step):\n    \"\"\"Task is a class that implements the `_Task` abstract class and adds the `Step`\n    interface to be used as a step in the pipeline.\n\n    Attributes:\n        llm: the `LLM` to be used to generate the outputs of the task.\n        group_generations: whether to group the `num_generations` generated per input in\n            a list or create a row per generation. Defaults to `False`.\n        num_generations: The number of generations to be produced per input.\n    \"\"\"\n\n    @abstractmethod\n    def format_input(self, input: Dict[str, Any]) -> \"FormattedInput\":\n        \"\"\"Abstract method to format the inputs of the task. It needs to receive an input\n        as a Python dictionary, and generates an OpenAI chat-like list of dicts.\"\"\"\n        pass\n\n    def _format_inputs(self, inputs: List[Dict[str, Any]]) -> List[\"FormattedInput\"]:\n        \"\"\"Formats the inputs of the task using the `format_input` method.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Returns:\n            A list containing the formatted inputs, which are `ChatType`-like following\n            the OpenAI formatting.\n        \"\"\"\n        return [self.format_input(input) for input in inputs]\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Yields:\n            A list of Python dictionaries with the outputs of the task.\n        \"\"\"\n\n        formatted_inputs = self._format_inputs(inputs)\n\n        # `outputs` is a list containing a list of generations per input\n        outputs = self.llm.generate_outputs(\n            inputs=formatted_inputs,\n            num_generations=self.num_generations,  # type: ignore\n            
**self.llm.get_generation_kwargs(),  # type: ignore\n        )\n\n        task_outputs = []\n        for input, input_outputs in zip(inputs, outputs):\n            formatted_outputs = self._format_outputs(input_outputs, input)\n\n            if self.group_generations:\n                combined = group_dicts(*formatted_outputs)\n                task_outputs.append(\n                    {**input, **combined, \"model_name\": self.llm.model_name}\n                )\n                continue\n\n            # Create a row per generation\n            for formatted_output in formatted_outputs:\n                task_outputs.append(\n                    {**input, **formatted_output, \"model_name\": self.llm.model_name}\n                )\n\n        yield task_outputs\n
"},{"location":"api/task/#distilabel.steps.tasks.base.Task.format_input","title":"format_input(input) abstractmethod","text":"

Abstract method to format the inputs of the task. It needs to receive an input as a Python dictionary, and generates an OpenAI chat-like list of dicts.

Source code in src/distilabel/steps/tasks/base.py
@abstractmethod\ndef format_input(self, input: Dict[str, Any]) -> \"FormattedInput\":\n    \"\"\"Abstract method to format the inputs of the task. It needs to receive an input\n    as a Python dictionary, and generates an OpenAI chat-like list of dicts.\"\"\"\n    pass\n
"},{"location":"api/task/#distilabel.steps.tasks.base.Task.process","title":"process(inputs)","text":"

Processes the inputs of the task and generates the outputs using the LLM.

Parameters:

Name Type Description Default inputs StepInput

A list of Python dictionaries with the inputs of the task.

required

Yields:

Type Description StepOutput

A list of Python dictionaries with the outputs of the task.

Source code in src/distilabel/steps/tasks/base.py
def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n    Args:\n        inputs: A list of Python dictionaries with the inputs of the task.\n\n    Yields:\n        A list of Python dictionaries with the outputs of the task.\n    \"\"\"\n\n    formatted_inputs = self._format_inputs(inputs)\n\n    # `outputs` is a list containing a list of generations per input\n    outputs = self.llm.generate_outputs(\n        inputs=formatted_inputs,\n        num_generations=self.num_generations,  # type: ignore\n        **self.llm.get_generation_kwargs(),  # type: ignore\n    )\n\n    task_outputs = []\n    for input, input_outputs in zip(inputs, outputs):\n        formatted_outputs = self._format_outputs(input_outputs, input)\n\n        if self.group_generations:\n            combined = group_dicts(*formatted_outputs)\n            task_outputs.append(\n                {**input, **combined, \"model_name\": self.llm.model_name}\n            )\n            continue\n\n        # Create a row per generation\n        for formatted_output in formatted_outputs:\n            task_outputs.append(\n                {**input, **formatted_output, \"model_name\": self.llm.model_name}\n            )\n\n    yield task_outputs\n
"},{"location":"api/task/generator_task/","title":"GeneratorTask","text":"

This section contains the API reference for the distilabel generator tasks.

For more information on how the GeneratorTask works and to see some examples, check the Tutorial - Task - GeneratorTask page.

"},{"location":"api/task/generator_task/#distilabel.steps.tasks.base.GeneratorTask","title":"GeneratorTask","text":"

Bases: _Task, GeneratorStep

GeneratorTask is a class that implements the _Task abstract class and adds the GeneratorStep interface to be used as a step in the pipeline.

Attributes:

Name Type Description llm

the LLM to be used to generate the outputs of the task.

group_generations

whether to group the num_generations generated per input in a list or create a row per generation. Defaults to False.

num_generations

The number of generations to be produced per input.

Source code in src/distilabel/steps/tasks/base.py
class GeneratorTask(_Task, GeneratorStep):\n    \"\"\"`GeneratorTask` is a class that implements the `_Task` abstract class and adds the\n    `GeneratorStep` interface to be used as a step in the pipeline.\n\n    Attributes:\n        llm: the `LLM` to be used to generate the outputs of the task.\n        group_generations: whether to group the `num_generations` generated per input in\n            a list or create a row per generation. Defaults to `False`.\n        num_generations: The number of generations to be produced per input.\n    \"\"\"\n\n    pass\n
"},{"location":"api/task/task_gallery/","title":"Task Gallery","text":"

This section contains the existing Task subclasses implemented in distilabel.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks","title":"tasks","text":""},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker","title":"APIGenExecutionChecker","text":"

Bases: Step

Executes the generated function calls.

This step checks if a given answer from a model as generated by APIGenGenerator can be executed against the given library (specified by libpath, which is a string pointing to a Python .py file with functions).

Attributes:

Name Type Description libpath str

The path to the library where we will retrieve the functions. It can also point to a folder with the functions. In this case, the folder layout should be a folder with .py files, each containing a single function, the name of the function being the same as the filename.

check_is_dangerous bool

Bool to exclude some potentially dangerous functions; it contains some heuristics found while testing. These functions can run subprocesses, deal with the OS, or have other potentially dangerous operations. Defaults to True.

Input columns
  • answers (str): List with arguments to be passed to the function, dumped as a string from a list of dictionaries. Should be loaded using json.loads.
Output columns
  • keep_row_after_execution_check (bool): Whether the function should be kept or not.
  • execution_result (str): The result from executing the function.
Categories
  • filtering
  • execution
References
  • APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets
  • Salesforce/xlam-function-calling-60k

Examples:

Execute a function from a given library with the answer from an LLM:

from distilabel.steps.tasks import APIGenExecutionChecker\n\n# For the libpath you can use as an example the file at the tests folder:\n# ../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\ntask = APIGenExecutionChecker(\n    libpath=\"../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\",\n)\ntask.load()\n\nres = next(\n    task.process(\n        [\n            {\n                \"answers\": [\n                    {\n                        \"arguments\": {\n                            \"initial_velocity\": 0.2,\n                            \"acceleration\": 0.1,\n                            \"time\": 0.5,\n                        },\n                        \"name\": \"final_velocity\",\n                    }\n                ],\n            }\n        ]\n    )\n)\nres\n#[{'answers': [{'arguments': {'initial_velocity': 0.2, 'acceleration': 0.1, 'time': 0.5}, 'name': 'final_velocity'}], 'keep_row_after_execution_check': True, 'execution_result': ['0.25']}]\n
Source code in src/distilabel/steps/tasks/apigen/execution_checker.py
class APIGenExecutionChecker(Step):\n    \"\"\"Executes the generated function calls.\n\n    This step checks if a given answer from a model as generated by `APIGenGenerator`\n    can be executed against the given library (given by `libpath`, which is a string\n    pointing to a python .py file with functions).\n\n    Attributes:\n        libpath: The path to the library where we will retrieve the functions.\n            It can also point to a folder with the functions. In this case, the folder\n            layout should be a folder with .py files, each containing a single function,\n            the name of the function being the same as the filename.\n        check_is_dangerous: Bool to exclude some potentially dangerous functions, it contains\n            some heuristics found while testing. This functions can run subprocesses, deal with\n            the OS, or have other potentially dangerous operations. Defaults to True.\n\n    Input columns:\n        - answers (`str`): List with arguments to be passed to the function,\n            dumped as a string from a list of dictionaries. 
Should be loaded using\n            `json.loads`.\n\n    Output columns:\n        - keep_row_after_execution_check (`bool`): Whether the function should be kept or not.\n        - execution_result (`str`): The result from executing the function.\n\n    Categories:\n        - filtering\n        - execution\n\n    References:\n        - [APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets](https://arxiv.org/abs/2406.18518)\n        - [Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)\n\n    Examples:\n        Execute a function from a given library with the answer from an LLM:\n\n        ```python\n        from distilabel.steps.tasks import APIGenExecutionChecker\n\n        # For the libpath you can use as an example the file at the tests folder:\n        # ../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\n        task = APIGenExecutionChecker(\n            libpath=\"../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\",\n        )\n        task.load()\n\n        res = next(\n            task.process(\n                [\n                    {\n                        \"answers\": [\n                            {\n                                \"arguments\": {\n                                    \"initial_velocity\": 0.2,\n                                    \"acceleration\": 0.1,\n                                    \"time\": 0.5,\n                                },\n                                \"name\": \"final_velocity\",\n                            }\n                        ],\n                    }\n                ]\n            )\n        )\n        res\n        #[{'answers': [{'arguments': {'initial_velocity': 0.2, 'acceleration': 0.1, 'time': 0.5}, 'name': 'final_velocity'}], 'keep_row_after_execution_check': True, 'execution_result': ['0.25']}]\n        ```\n    \"\"\"\n\n    libpath: str = Field(\n        default=...,\n        
description=(\n            \"The path to the library where we will retrieve the functions, \"\n            \"or a folder with python files named the same as the functions they contain.\",\n        ),\n    )\n    check_is_dangerous: bool = Field(\n        default=True,\n        description=(\n            \"Bool to exclude some potentially dangerous functions, it contains \"\n            \"some heuristics found while testing. This functions can run subprocesses, \"\n            \"deal with the OS, or have other potentially dangerous operations.\",\n        ),\n    )\n\n    _toolbox: Union[\"ModuleType\", None] = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Loads the library where the functions will be extracted from.\"\"\"\n        super().load()\n        if Path(self.libpath).suffix == \".py\":\n            self._toolbox = load_module_from_path(self.libpath)\n\n    def unload(self) -> None:\n        self._toolbox = None\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The inputs for the task are those found in the original dataset.\"\"\"\n        return [\"answers\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The outputs are the columns required by `APIGenGenerator` task.\"\"\"\n        return [\"keep_row_after_execution_check\", \"execution_result\"]\n\n    def _get_function(self, function_name: str) -> Callable:\n        \"\"\"Retrieves the function from the toolbox.\n\n        Args:\n            function_name: The name of the function to retrieve.\n\n        Returns:\n            Callable: The function to be executed.\n        \"\"\"\n        if self._toolbox:\n            return getattr(self._toolbox, function_name, None)\n        try:\n            toolbox = load_module_from_path(\n                str(Path(self.libpath) / f\"{function_name}.py\")\n            )\n            return getattr(toolbox, function_name, None)\n        except FileNotFoundError:\n            return None\n        
except Exception as e:\n            self._logger.warning(f\"Error loading function '{function_name}': {e}\")\n            return None\n\n    def _is_dangerous(self, function: Callable) -> bool:\n        \"\"\"Checks if a function is dangerous to remove it.\n        Contains a list of heuristics to avoid executing possibly dangerous functions.\n        \"\"\"\n        source_code = inspect.getsource(function)\n        # We don't want to execute functions that use subprocess\n        if (\n            (\"subprocess.\" in source_code)\n            or (\"os.system(\" in source_code)\n            or (\"input(\" in source_code)\n            # Avoiding threading\n            or (\"threading.Thread(\" in source_code)\n            or (\"exec(\" in source_code)\n            # Avoiding argparse (not sure why)\n            or (\"argparse.ArgumentParser(\" in source_code)\n            # Avoiding logging changing the levels to not mess with the logs\n            or (\".setLevel(\" in source_code)\n            # Don't run a test battery\n            or (\"unittest.main(\" in source_code)\n            # Avoid exiting the program\n            or (\"sys.exit(\" in source_code)\n            or (\"exit(\" in source_code)\n            or (\"raise SystemExit(\" in source_code)\n            or (\"multiprocessing.Pool(\" in source_code)\n        ):\n            return True\n        return False\n\n    @override\n    def process(self, inputs: StepInput) -> \"StepOutput\":\n        \"\"\"Checks the answer to see if it can be executed.\n        Captures the possible errors and returns them.\n\n        If a single example is provided, it is copied to avoid raising an error.\n\n        Args:\n            inputs: A list of dictionaries with the input data.\n\n        Yields:\n            A list of dictionaries with the output data.\n        \"\"\"\n        for input in inputs:\n            output = []\n            if input[\"answers\"]:\n                answers = 
json.loads(input[\"answers\"])\n            else:\n                input.update(\n                    **{\n                        \"keep_row_after_execution_check\": False,\n                        \"execution_result\": [\"No answers were provided.\"],\n                    }\n                )\n                continue\n            for answer in answers:\n                if answer is None:\n                    output.append(\n                        {\n                            \"keep\": False,\n                            \"execution_result\": \"Nothing was generated for this answer.\",\n                        }\n                    )\n                    continue\n\n                function_name = answer.get(\"name\", None)\n                arguments = answer.get(\"arguments\", None)\n\n                self._logger.debug(\n                    f\"Executing function '{function_name}' with arguments: {arguments}\"\n                )\n                function = self._get_function(function_name)\n\n                if self.check_is_dangerous:\n                    if function and self._is_dangerous(function):\n                        function = None\n\n                if function is None:\n                    output.append(\n                        {\n                            \"keep\": False,\n                            \"execution_result\": f\"Function '{function_name}' not found.\",\n                        }\n                    )\n                else:\n                    execution = execute_from_response(function, arguments)\n                    output.append(\n                        {\n                            \"keep\": execution[\"keep\"],\n                            \"execution_result\": execution[\"execution_result\"],\n                        }\n                    )\n            # We only consider a good response if all the answers were executed successfully,\n            # but keep the reasons for further review if needed.\n            
input.update(\n                **{\n                    \"keep_row_after_execution_check\": all(\n                        o[\"keep\"] is True for o in output\n                    ),\n                    \"execution_result\": [o[\"execution_result\"] for o in output],\n                }\n            )\n\n        yield inputs\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker.inputs","title":"inputs: StepColumns property","text":"

The inputs for the task are those found in the original dataset.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker.outputs","title":"outputs: StepColumns property","text":"

The outputs are the columns required by APIGenGenerator task.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker.load","title":"load()","text":"

Loads the library where the functions will be extracted from.

Source code in src/distilabel/steps/tasks/apigen/execution_checker.py
def load(self) -> None:\n    \"\"\"Loads the library where the functions will be extracted from.\"\"\"\n    super().load()\n    if Path(self.libpath).suffix == \".py\":\n        self._toolbox = load_module_from_path(self.libpath)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker._get_function","title":"_get_function(function_name)","text":"

Retrieves the function from the toolbox.

Parameters:

Name Type Description Default function_name str

The name of the function to retrieve.

required

Returns:

Name Type Description Callable Callable

The function to be executed.

Source code in src/distilabel/steps/tasks/apigen/execution_checker.py
def _get_function(self, function_name: str) -> Callable:\n    \"\"\"Retrieves the function from the toolbox.\n\n    Args:\n        function_name: The name of the function to retrieve.\n\n    Returns:\n        Callable: The function to be executed.\n    \"\"\"\n    if self._toolbox:\n        return getattr(self._toolbox, function_name, None)\n    try:\n        toolbox = load_module_from_path(\n            str(Path(self.libpath) / f\"{function_name}.py\")\n        )\n        return getattr(toolbox, function_name, None)\n    except FileNotFoundError:\n        return None\n    except Exception as e:\n        self._logger.warning(f\"Error loading function '{function_name}': {e}\")\n        return None\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker._is_dangerous","title":"_is_dangerous(function)","text":"

Checks whether a function is dangerous so that it can be removed. Contains a list of heuristics to avoid executing possibly dangerous functions.

Source code in src/distilabel/steps/tasks/apigen/execution_checker.py
def _is_dangerous(self, function: Callable) -> bool:\n    \"\"\"Checks if a function is dangerous to remove it.\n    Contains a list of heuristics to avoid executing possibly dangerous functions.\n    \"\"\"\n    source_code = inspect.getsource(function)\n    # We don't want to execute functions that use subprocess\n    if (\n        (\"subprocess.\" in source_code)\n        or (\"os.system(\" in source_code)\n        or (\"input(\" in source_code)\n        # Avoiding threading\n        or (\"threading.Thread(\" in source_code)\n        or (\"exec(\" in source_code)\n        # Avoiding argparse (not sure why)\n        or (\"argparse.ArgumentParser(\" in source_code)\n        # Avoiding logging changing the levels to not mess with the logs\n        or (\".setLevel(\" in source_code)\n        # Don't run a test battery\n        or (\"unittest.main(\" in source_code)\n        # Avoid exiting the program\n        or (\"sys.exit(\" in source_code)\n        or (\"exit(\" in source_code)\n        or (\"raise SystemExit(\" in source_code)\n        or (\"multiprocessing.Pool(\" in source_code)\n    ):\n        return True\n    return False\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenExecutionChecker.process","title":"process(inputs)","text":"

Checks the answer to see if it can be executed. Captures the possible errors and returns them.

If a single example is provided, it is copied to avoid raising an error.

Parameters:

Name Type Description Default inputs StepInput

A list of dictionaries with the input data.

required

Yields:

Type Description StepOutput

A list of dictionaries with the output data.

Source code in src/distilabel/steps/tasks/apigen/execution_checker.py
@override\ndef process(self, inputs: StepInput) -> \"StepOutput\":\n    \"\"\"Checks the answer to see if it can be executed.\n    Captures the possible errors and returns them.\n\n    If a single example is provided, it is copied to avoid raising an error.\n\n    Args:\n        inputs: A list of dictionaries with the input data.\n\n    Yields:\n        A list of dictionaries with the output data.\n    \"\"\"\n    for input in inputs:\n        output = []\n        if input[\"answers\"]:\n            answers = json.loads(input[\"answers\"])\n        else:\n            input.update(\n                **{\n                    \"keep_row_after_execution_check\": False,\n                    \"execution_result\": [\"No answers were provided.\"],\n                }\n            )\n            continue\n        for answer in answers:\n            if answer is None:\n                output.append(\n                    {\n                        \"keep\": False,\n                        \"execution_result\": \"Nothing was generated for this answer.\",\n                    }\n                )\n                continue\n\n            function_name = answer.get(\"name\", None)\n            arguments = answer.get(\"arguments\", None)\n\n            self._logger.debug(\n                f\"Executing function '{function_name}' with arguments: {arguments}\"\n            )\n            function = self._get_function(function_name)\n\n            if self.check_is_dangerous:\n                if function and self._is_dangerous(function):\n                    function = None\n\n            if function is None:\n                output.append(\n                    {\n                        \"keep\": False,\n                        \"execution_result\": f\"Function '{function_name}' not found.\",\n                    }\n                )\n            else:\n                execution = execute_from_response(function, arguments)\n                output.append(\n                    {\n         
               \"keep\": execution[\"keep\"],\n                        \"execution_result\": execution[\"execution_result\"],\n                    }\n                )\n        # We only consider a good response if all the answers were executed successfully,\n        # but keep the reasons for further review if needed.\n        input.update(\n            **{\n                \"keep_row_after_execution_check\": all(\n                    o[\"keep\"] is True for o in output\n                ),\n                \"execution_result\": [o[\"execution_result\"] for o in output],\n            }\n        )\n\n    yield inputs\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator","title":"APIGenGenerator","text":"

Bases: Task

Generate queries and answers for the given functions in JSON format.

The `APIGenGenerator` is inspired by the APIGen pipeline, which was designed to generate\nverifiable and diverse function-calling datasets. The task generates a set of diverse queries\nand corresponding answers for the given functions in JSON format.\n\nAttributes:\n    system_prompt: The system prompt to guide the user in the generation of queries and answers.\n    use_tools: Whether to use the tools available in the prompt to generate the queries and answers.\n        In case the tools are given in the input, they will be added to the prompt.\n    number: The number of queries to generate. It can be a list, where each number will be\n        chosen randomly, or a dictionary with the number of queries and the probability of each.\n        I.e: `number=1`, `number=[1, 2, 3]`, `number={1: 0.5, 2: 0.3, 3: 0.2}` are all valid inputs.\n        It corresponds to the number of parallel queries to generate.\n    use_default_structured_output: Whether to use the default structured output or not.\n\nInput columns:\n    - examples (`str`): Examples used as few shots to guide the model.\n    - func_name (`str`): Name for the function to generate.\n    - func_desc (`str`): Description of what the function should do.\n    - tools (`str`): JSON formatted string containing the tool representation of the function.\n\nOutput columns:\n    - query (`str`): The list of queries.\n    - answers (`str`): JSON formatted string with the list of answers, containing the info as\n        a dictionary to be passed to the functions.\n\nCategories:\n    - text-generation\n\nReferences:\n    - [APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets](https://arxiv.org/abs/2406.18518)\n    - [Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)\n\nExamples:\n    Generate without structured output (original implementation):\n\n    ```python\n    from distilabel.steps.tasks import APIGenGenerator\n    from 
distilabel.models import InferenceEndpointsLLM\n\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        generation_kwargs={\n            \"temperature\": 0.7,\n            \"max_new_tokens\": 1024,\n        },\n    )\n    apigen = APIGenGenerator(\n        use_default_structured_output=False,\n        llm=llm\n    )\n    apigen.load()\n\n    res = next(\n        apigen.process(\n            [\n                {\n                    \"examples\": 'QUERY:\n

What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]', \"func_name\": \"getrandommovie\", \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\" } ] ) ) res # [{'examples': 'QUERY: What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]', # 'number': 1, # 'func_name': 'getrandommovie', # 'func_desc': 'Returns a list of random movies from a database by calling an external API.', # 'queries': ['I want to watch a movie tonight, can you recommend a random one from your database?', # 'Give me 5 random movie suggestions from your database to plan my weekend.'], # 'answers': [[{'name': 'getrandommovie', 'arguments': {}}], # [{'name': 'getrandommovie', 'arguments': {}}, # {'name': 'getrandommovie', 'arguments': {}}, # {'name': 'getrandommovie', 'arguments': {}}, # {'name': 'getrandommovie', 'arguments': {}}, # {'name': 'getrandommovie', 'arguments': {}}]], # 'raw_input_api_gen_generator_0': [{'role': 'system', # 'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.

Construct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.

Ensure the query: - Is clear and concise - Demonstrates typical use cases - Includes all necessary parameters in a meaningful way. For numerical parameters, it could be either numbers or words - Across a variety level of difficulties, ranging from beginner and advanced use cases - The corresponding result's parameter types and ranges match with the function's descriptions

Ensure the answer: - Is a list of function calls in JSON format - The length of the answer list should be equal to the number of requests in the query - Can solve all the requests in the query effectively\"}, # {'role': 'user', # 'content': 'Here are examples of queries and the corresponding answers for similar functions: QUERY: What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]

Note that the query could be interpreted as a combination of several independent requests. Based on these examples, generate 2 diverse query and answer pairs for the function getrandommovie The detailed function description is the following: Returns a list of random movies from a database by calling an external API.

The output MUST strictly adhere to the following JSON format, and NO other text MUST be included:

[\n   {\n       \"query\": \"The generated query.\",\n       \"answers\": [\n           {\n               \"name\": \"api_name\",\n               \"arguments\": {\n                   \"arg_name\": \"value\"\n                   ... (more arguments as required)\n               }\n           },\n           ... (more API calls as required)\n       ]\n   }\n]\n

Now please generate 2 diverse query and answer pairs following the above format.'}]}, # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}] ```

    Generate with structured output:\n\n    ```python\n    from distilabel.steps.tasks import APIGenGenerator\n    from distilabel.models import InferenceEndpointsLLM\n\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        generation_kwargs={\n            \"temperature\": 0.7,\n            \"max_new_tokens\": 1024,\n        },\n    )\n    apigen = APIGenGenerator(\n        use_default_structured_output=True,\n        llm=llm\n    )\n    apigen.load()\n\n    res_struct = next(\n        apigen.process(\n            [\n                {\n                    \"examples\": 'QUERY:\n

What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]', \"func_name\": \"getrandommovie\", \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\" } ] ) ) res_struct # [{'examples': 'QUERY: What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]', # 'number': 1, # 'func_name': 'getrandommovie', # 'func_desc': 'Returns a list of random movies from a database by calling an external API.', # 'queries': [\"I'm bored and want to watch a movie. Can you suggest some movies?\", # \"My family and I are planning a movie night. We can't decide on what to watch. Can you suggest some random movie titles?\"], # 'answers': [[{'arguments': {}, 'name': 'getrandommovie'}], # [{'arguments': {}, 'name': 'getrandommovie'}]], # 'raw_input_api_gen_generator_0': [{'role': 'system', # 'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.

Construct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.

Ensure the query: - Is clear and concise - Demonstrates typical use cases - Includes all necessary parameters in a meaningful way. For numerical parameters, it could be either numbers or words - Across a variety level of difficulties, ranging from beginner and advanced use cases - The corresponding result's parameter types and ranges match with the function's descriptions

Ensure the answer: - Is a list of function calls in JSON format - The length of the answer list should be equal to the number of requests in the query - Can solve all the requests in the query effectively\"}, # {'role': 'user', # 'content': 'Here are examples of queries and the corresponding answers for similar functions: QUERY: What is the binary sum of 10010 and 11101? ANSWER: [{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]

Note that the query could be interpreted as a combination of several independent requests. Based on these examples, generate 2 diverse query and answer pairs for the function getrandommovie The detailed function description is the following: Returns a list of random movies from a database by calling an external API.

Now please generate 2 diverse query and answer pairs following the above format.'}]}, # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}] ```

Source code in src/distilabel/steps/tasks/apigen/generator.py
class APIGenGenerator(Task):\n    \"\"\"Generate queries and answers for the given functions in JSON format.\n\n    The `APIGenGenerator` is inspired by the APIGen pipeline, which was designed to generate\n    verifiable and diverse function-calling datasets. The task generates a set of diverse queries\n    and corresponding answers for the given functions in JSON format.\n\n    Attributes:\n        system_prompt: The system prompt to guide the user in the generation of queries and answers.\n        use_tools: Whether to use the tools available in the prompt to generate the queries and answers.\n            In case the tools are given in the input, they will be added to the prompt.\n        number: The number of queries to generate. It can be a list, where each number will be\n            chosen randomly, or a dictionary with the number of queries and the probability of each.\n            I.e: `number=1`, `number=[1, 2, 3]`, `number={1: 0.5, 2: 0.3, 3: 0.2}` are all valid inputs.\n            It corresponds to the number of parallel queries to generate.\n        use_default_structured_output: Whether to use the default structured output or not.\n\n    Input columns:\n        - examples (`str`): Examples used as few shots to guide the model.\n        - func_name (`str`): Name for the function to generate.\n        - func_desc (`str`): Description of what the function should do.\n        - tools (`str`): JSON formatted string containing the tool representation of the function.\n\n    Output columns:\n        - query (`str`): The list of queries.\n        - answers (`str`): JSON formatted string with the list of answers, containing the info as\n            a dictionary to be passed to the functions.\n\n    Categories:\n        - text-generation\n\n    References:\n        - [APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets](https://arxiv.org/abs/2406.18518)\n        - 
[Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)\n\n    Examples:\n        Generate without structured output (original implementation):\n\n        ```python\n        from distilabel.steps.tasks import ApiGenGenerator\n        from distilabel.models import InferenceEndpointsLLM\n\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            generation_kwargs={\n                \"temperature\": 0.7,\n                \"max_new_tokens\": 1024,\n            },\n        )\n        apigen = ApiGenGenerator(\n            use_default_structured_output=False,\n            llm=llm\n        )\n        apigen.load()\n\n        res = next(\n            apigen.process(\n                [\n                    {\n                        \"examples\": 'QUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n                        \"func_name\": \"getrandommovie\",\n                        \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\"\n                    }\n                ]\n            )\n        )\n        res\n        # [{'examples': 'QUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n        # 'number': 1,\n        # 'func_name': 'getrandommovie',\n        # 'func_desc': 'Returns a list of random movies from a database by calling an external API.',\n        # 'queries': ['I want to watch a movie tonight, can you recommend a random one from your database?',\n        # 'Give me 5 random movie suggestions from your database to plan my weekend.'],\n        # 'answers': [[{'name': 'getrandommovie', 'arguments': {}}],\n        # [{'name': 'getrandommovie', 'arguments': {}},\n        #     {'name': 'getrandommovie', 
'arguments': {}},\n        #     {'name': 'getrandommovie', 'arguments': {}},\n        #     {'name': 'getrandommovie', 'arguments': {}},\n        #     {'name': 'getrandommovie', 'arguments': {}}]],\n        # 'raw_input_api_gen_generator_0': [{'role': 'system',\n        #     'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\\n\\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.\\n\\nEnsure the query:\\n- Is clear and concise\\n- Demonstrates typical use cases\\n- Includes all necessary parameters in a meaningful way. For numerical parameters, it could be either numbers or words\\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\\n- The corresponding result's parameter types and ranges match with the function's descriptions\\n\\nEnsure the answer:\\n- Is a list of function calls in JSON format\\n- The length of the answer list should be equal to the number of requests in the query\\n- Can solve all the requests in the query effectively\"},\n        #     {'role': 'user',\n        #     'content': 'Here are examples of queries and the corresponding answers for similar functions:\\nQUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]\\n\\nNote that the query could be interpreted as a combination of several independent requests.\\nBased on these examples, generate 2 diverse query and answer pairs for the function `getrandommovie`\\nThe detailed function description is the following:\\nReturns a list of random movies from a database by calling an external API.\\n\\nThe output MUST strictly adhere to the following JSON format, 
and NO other text MUST be included:\\n```json\\n[\\n   {\\n       \"query\": \"The generated query.\",\\n       \"answers\": [\\n           {\\n               \"name\": \"api_name\",\\n               \"arguments\": {\\n                   \"arg_name\": \"value\"\\n                   ... (more arguments as required)\\n               }\\n           },\\n           ... (more API calls as required)\\n       ]\\n   }\\n]\\n```\\n\\nNow please generate 2 diverse query and answer pairs following the above format.'}]},\n        # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n\n        Generate with structured output:\n\n        ```python\n        from distilabel.steps.tasks import ApiGenGenerator\n        from distilabel.models import InferenceEndpointsLLM\n\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            generation_kwargs={\n                \"temperature\": 0.7,\n                \"max_new_tokens\": 1024,\n            },\n        )\n        apigen = ApiGenGenerator(\n            use_default_structured_output=True,\n            llm=llm\n        )\n        apigen.load()\n\n        res_struct = next(\n            apigen.process(\n                [\n                    {\n                        \"examples\": 'QUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n                        \"func_name\": \"getrandommovie\",\n                        \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\"\n                    }\n                ]\n            )\n        )\n        res_struct\n        # [{'examples': 'QUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n        # 'number': 
1,\n        # 'func_name': 'getrandommovie',\n        # 'func_desc': 'Returns a list of random movies from a database by calling an external API.',\n        # 'queries': [\"I'm bored and want to watch a movie. Can you suggest some movies?\",\n        # \"My family and I are planning a movie night. We can't decide on what to watch. Can you suggest some random movie titles?\"],\n        # 'answers': [[{'arguments': {}, 'name': 'getrandommovie'}],\n        # [{'arguments': {}, 'name': 'getrandommovie'}]],\n        # 'raw_input_api_gen_generator_0': [{'role': 'system',\n        #     'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\\n\\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.\\n\\nEnsure the query:\\n- Is clear and concise\\n- Demonstrates typical use cases\\n- Includes all necessary parameters in a meaningful way. 
For numerical parameters, it could be either numbers or words\\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\\n- The corresponding result's parameter types and ranges match with the function's descriptions\\n\\nEnsure the answer:\\n- Is a list of function calls in JSON format\\n- The length of the answer list should be equal to the number of requests in the query\\n- Can solve all the requests in the query effectively\"},\n        #     {'role': 'user',\n        #     'content': 'Here are examples of queries and the corresponding answers for similar functions:\\nQUERY:\\nWhat is the binary sum of 10010 and 11101?\\nANSWER:\\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]\\n\\nNote that the query could be interpreted as a combination of several independent requests.\\nBased on these examples, generate 2 diverse query and answer pairs for the function `getrandommovie`\\nThe detailed function description is the following:\\nReturns a list of random movies from a database by calling an external API.\\n\\nNow please generate 2 diverse query and answer pairs following the above format.'}]},\n        # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n    \"\"\"\n\n    system_prompt: str = SYSTEM_PROMPT_API_GEN\n    use_default_structured_output: bool = False\n    number: Union[int, List[int], Dict[int, float]] = 1\n    use_tools: bool = True\n\n    _number: Union[int, None] = PrivateAttr(None)\n    _fn_parallel_queries: Union[Callable[[], str], None] = PrivateAttr(None)\n    _format_inst: Union[str, None] = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Loads the template for the generator prompt.\"\"\"\n        super().load()\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"apigen\"\n            / \"generator.jinja2\"\n        )\n 
       self._template = Template(open(_path).read())\n        self._format_inst = self._set_format_inst()\n\n    def _parallel_queries(self, number: int) -> Callable[[int], str]:\n        \"\"\"Prepares the function to update the parallel queries guide in the prompt.\n\n        Raises:\n            ValueError: if `is_parallel` is not a boolean or a list of floats.\n\n        Returns:\n            The function to generate the parallel queries guide.\n        \"\"\"\n        if number > 1:\n            return (\n                \"It can contain multiple parallel queries in natural language for the given functions. \"\n                \"They could use either the same function with different arguments or different functions.\\n\"\n            )\n        return \"\"\n\n    def _get_number(self) -> int:\n        \"\"\"Generates the number of queries to generate in a single call.\n        The number must be set to `_number` to avoid changing the original value\n        when calling `_default_error`.\n        \"\"\"\n        if isinstance(self.number, list):\n            self._number = random.choice(self.number)\n        elif isinstance(self.number, dict):\n            self._number = random.choices(\n                list(self.number.keys()), list(self.number.values())\n            )[0]\n        else:\n            self._number = self.number\n        return self._number\n\n    def _set_format_inst(self) -> str:\n        \"\"\"Prepares the function to generate the formatted instructions for the prompt.\n\n        If the default structured output is used, returns an empty string because nothing\n        else is needed, otherwise, returns the original addition to the prompt to guide the model\n        to generate a formatted JSON.\n        \"\"\"\n        return (\n            \"\\nThe output MUST strictly adhere to the following JSON format, and NO other text MUST be included:\\n\"\n            \"```\\n\"\n            \"[\\n\"\n            \"   {\\n\"\n            '       
\"query\": \"The generated query.\",\\n'\n            '       \"answers\": [\\n'\n            \"           {\\n\"\n            '               \"name\": \"api_name\",\\n'\n            '               \"arguments\": {\\n'\n            '                   \"arg_name\": \"value\"\\n'\n            \"                   ... (more arguments as required)\\n\"\n            \"               }\\n\"\n            \"           },\\n\"\n            \"           ... (more API calls as required)\\n\"\n            \"       ]\\n\"\n            \"   }\\n\"\n            \"]\\n\"\n            \"```\\n\"\n        )\n\n    def _get_func_desc(self, input: Dict[str, Any]) -> str:\n        \"\"\"If available and required, will use the info from the tools in the\n        prompt for extra information. Otherwise will use jut the function description.\n        \"\"\"\n        if not self.use_tools:\n            return input[\"func_desc\"]\n        extra = \"\"  # Extra information from the tools (if available will be added)\n        if \"tools\" in input:\n            extra = f\"\\n\\nThis is the available tool to guide you (respect the order of the parameters):\\n{input['tools']}\"\n        return input[\"func_desc\"] + extra\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The inputs for the task.\"\"\"\n        return {\n            \"examples\": True,\n            \"func_name\": True,\n            \"func_desc\": True,\n            \"tools\": False,\n        }\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType`.\"\"\"\n        number = self._get_number()\n        parallel_queries = self._parallel_queries(number)\n        return [\n            {\"role\": \"system\", \"content\": self.system_prompt},\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(\n                    examples=input[\"examples\"],\n                    
parallel_queries=parallel_queries,\n                    number=number,\n                    func_name=input[\"func_name\"],\n                    func_desc=self._get_func_desc(input),\n                    format_inst=self._format_inst,\n                ),\n            },\n        ]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The output for the task are the queries and corresponding answers.\"\"\"\n        return [\"query\", \"answers\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a list with the score of each instruction.\n\n        Args:\n            output: the raw output of the LLM.\n            input: the input to the task. Used for obtaining the number of responses.\n\n        Returns:\n            A dict with the queries and answers pairs.\n            The answers are an array of answers corresponding to the query.\n            Each answer is represented as an object with the following properties:\n                - name (string): The name of the tool used to generate the answer.\n                - arguments (object): An object representing the arguments passed to the tool to generate the answer.\n            Each argument is represented as a key-value pair, where the key is the parameter name and the\n            value is the corresponding value.\n        \"\"\"\n        if output is None:\n            return self._default_error(input)\n\n        if not self.use_default_structured_output:\n            output = remove_fences(output)\n\n        try:\n            pairs = orjson.loads(output)\n        except orjson.JSONDecodeError:\n            return self._default_error(input)\n\n        pairs = pairs[\"pairs\"] if self.use_default_structured_output else pairs\n\n        return self._format_output(pairs, input)\n\n    def _format_output(\n        self, pairs: Dict[str, Any], input: Dict[str, Any]\n    ) 
-> Dict[str, Any]:\n        \"\"\"Parses the response, returning a dictionary with queries and answers.\n\n        Args:\n            pairs: The parsed dictionary from the LLM's output.\n            input: The input from the `LLM`.\n\n        Returns:\n            Formatted output, where the `queries` are a list of strings, and the `answers`\n            are a list of objects.\n        \"\"\"\n        try:\n            input.update(\n                **{\n                    \"query\": pairs[0][\"query\"],\n                    \"answers\": json.dumps(pairs[0][\"answers\"]),\n                }\n            )\n            return input\n        except Exception as e:\n            self._logger.error(f\"Error formatting output: {e}, pairs: '{pairs}'\")\n            return self._default_error(input)\n\n    def _default_error(self, input: Dict[str, Any]) -> Dict[str, Any]:\n        \"\"\"Returns a default error output, to fill the responses in case of failure.\"\"\"\n        input.update(\n            **{\n                \"query\": None,\n                \"answers\": json.dumps([None] * self._number),\n            }\n        )\n        return input\n\n    @override\n    def get_structured_output(self) -> Dict[str, Any]:\n        \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n        a dictionary with the output which can be directly parsed as a python dictionary.\n\n        The schema corresponds to the following:\n\n        ```python\n        from typing import Dict, List\n        from pydantic import BaseModel\n\n\n        class Answer(BaseModel):\n            name: str\n            arguments: Dict[str, str]\n\n        class QueryAnswer(BaseModel):\n            query: str\n            answers: List[Answer]\n\n        class QueryAnswerPairs(BaseModel):\n            pairs: List[QueryAnswer]\n\n        json.dumps(QueryAnswerPairs.model_json_schema(), indent=4)\n        ```\n\n        Returns:\n            JSON Schema of the response to 
enforce.\n        \"\"\"\n        return {\n            \"$defs\": {\n                \"Answer\": {\n                    \"properties\": {\n                        \"name\": {\"title\": \"Name\", \"type\": \"string\"},\n                        \"arguments\": {\n                            \"additionalProperties\": {\"type\": \"string\"},\n                            \"title\": \"Arguments\",\n                            \"type\": \"object\",\n                        },\n                    },\n                    \"required\": [\"name\", \"arguments\"],\n                    \"title\": \"Answer\",\n                    \"type\": \"object\",\n                },\n                \"QueryAnswer\": {\n                    \"properties\": {\n                        \"query\": {\"title\": \"Query\", \"type\": \"string\"},\n                        \"answers\": {\n                            \"items\": {\"$ref\": \"#/$defs/Answer\"},\n                            \"title\": \"Answers\",\n                            \"type\": \"array\",\n                        },\n                    },\n                    \"required\": [\"query\", \"answers\"],\n                    \"title\": \"QueryAnswer\",\n                    \"type\": \"object\",\n                },\n            },\n            \"properties\": {\n                \"pairs\": {\n                    \"items\": {\"$ref\": \"#/$defs/QueryAnswer\"},\n                    \"title\": \"Pairs\",\n                    \"type\": \"array\",\n                }\n            },\n            \"required\": [\"pairs\"],\n            \"title\": \"QueryAnswerPairs\",\n            \"type\": \"object\",\n        }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.inputs","title":"inputs: StepColumns property","text":"

The inputs for the task.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.outputs","title":"outputs: StepColumns property","text":"

The outputs of the task are the queries and their corresponding answers.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.load","title":"load()","text":"

Loads the template for the generator prompt.

Source code in src/distilabel/steps/tasks/apigen/generator.py
def load(self) -> None:\n    \"\"\"Loads the template for the generator prompt.\"\"\"\n    super().load()\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"apigen\"\n        / \"generator.jinja2\"\n    )\n    self._template = Template(open(_path).read())\n    self._format_inst = self._set_format_inst()\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._parallel_queries","title":"_parallel_queries(number)","text":"

Prepares the function to update the parallel queries guide in the prompt.

Raises:

Type Description ValueError

if is_parallel is not a boolean or a list of floats.

Returns:

Type Description Callable[[int], str]

The parallel queries guide to insert in the prompt, or an empty string when number is not greater than 1.

Source code in src/distilabel/steps/tasks/apigen/generator.py
def _parallel_queries(self, number: int) -> Callable[[int], str]:\n    \"\"\"Prepares the function to update the parallel queries guide in the prompt.\n\n    Raises:\n        ValueError: if `is_parallel` is not a boolean or a list of floats.\n\n    Returns:\n        The function to generate the parallel queries guide.\n    \"\"\"\n    if number > 1:\n        return (\n            \"It can contain multiple parallel queries in natural language for the given functions. \"\n            \"They could use either the same function with different arguments or different functions.\\n\"\n        )\n    return \"\"\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._get_number","title":"_get_number()","text":"

Selects the number of queries to generate in a single call. The selected value is stored in _number, leaving the original number attribute unchanged, so that _default_error can rely on it.

Source code in src/distilabel/steps/tasks/apigen/generator.py
def _get_number(self) -> int:\n    \"\"\"Generates the number of queries to generate in a single call.\n    The number must be set to `_number` to avoid changing the original value\n    when calling `_default_error`.\n    \"\"\"\n    if isinstance(self.number, list):\n        self._number = random.choice(self.number)\n    elif isinstance(self.number, dict):\n        self._number = random.choices(\n            list(self.number.keys()), list(self.number.values())\n        )[0]\n    else:\n        self._number = self.number\n    return self._number\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._set_format_inst","title":"_set_format_inst()","text":"

Prepares the function to generate the formatted instructions for the prompt.

If the default structured output is used, returns an empty string because nothing else is needed, otherwise, returns the original addition to the prompt to guide the model to generate a formatted JSON.

Source code in src/distilabel/steps/tasks/apigen/generator.py
def _set_format_inst(self) -> str:\n    \"\"\"Prepares the function to generate the formatted instructions for the prompt.\n\n    If the default structured output is used, returns an empty string because nothing\n    else is needed, otherwise, returns the original addition to the prompt to guide the model\n    to generate a formatted JSON.\n    \"\"\"\n    return (\n        \"\\nThe output MUST strictly adhere to the following JSON format, and NO other text MUST be included:\\n\"\n        \"```\\n\"\n        \"[\\n\"\n        \"   {\\n\"\n        '       \"query\": \"The generated query.\",\\n'\n        '       \"answers\": [\\n'\n        \"           {\\n\"\n        '               \"name\": \"api_name\",\\n'\n        '               \"arguments\": {\\n'\n        '                   \"arg_name\": \"value\"\\n'\n        \"                   ... (more arguments as required)\\n\"\n        \"               }\\n\"\n        \"           },\\n\"\n        \"           ... (more API calls as required)\\n\"\n        \"       ]\\n\"\n        \"   }\\n\"\n        \"]\\n\"\n        \"```\\n\"\n    )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._get_func_desc","title":"_get_func_desc(input)","text":"

If available and required, will use the info from the tools in the prompt for extra information. Otherwise will use just the function description.

Source code in src/distilabel/steps/tasks/apigen/generator.py
def _get_func_desc(self, input: Dict[str, Any]) -> str:\n    \"\"\"If available and required, will use the info from the tools in the\n    prompt for extra information. Otherwise will use jut the function description.\n    \"\"\"\n    if not self.use_tools:\n        return input[\"func_desc\"]\n    extra = \"\"  # Extra information from the tools (if available will be added)\n    if \"tools\" in input:\n        extra = f\"\\n\\nThis is the available tool to guide you (respect the order of the parameters):\\n{input['tools']}\"\n    return input[\"func_desc\"] + extra\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType.

Source code in src/distilabel/steps/tasks/apigen/generator.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType`.\"\"\"\n    number = self._get_number()\n    parallel_queries = self._parallel_queries(number)\n    return [\n        {\"role\": \"system\", \"content\": self.system_prompt},\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(\n                examples=input[\"examples\"],\n                parallel_queries=parallel_queries,\n                number=number,\n                func_name=input[\"func_name\"],\n                func_desc=self._get_func_desc(input),\n                format_inst=self._format_inst,\n            ),\n        },\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.format_output","title":"format_output(output, input)","text":"

The output is formatted as a dictionary with the generated query and its corresponding answers.

Parameters:

Name Type Description Default output Union[str, None]

the raw output of the LLM.

required input Dict[str, Any]

the input to the task. Used for obtaining the number of responses.

required

Returns:

Type Description Dict[str, Any]

A dict with the queries and answers pairs.

Dict[str, Any]

The answers are an array of answers corresponding to the query.

Dict[str, Any]

Each answer is represented as an object with the following properties: - name (string): The name of the tool used to generate the answer. - arguments (object): An object representing the arguments passed to the tool to generate the answer.

Dict[str, Any]

Each argument is represented as a key-value pair, where the key is the parameter name and the

Dict[str, Any]

value is the corresponding value.

Source code in src/distilabel/steps/tasks/apigen/generator.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a list with the score of each instruction.\n\n    Args:\n        output: the raw output of the LLM.\n        input: the input to the task. Used for obtaining the number of responses.\n\n    Returns:\n        A dict with the queries and answers pairs.\n        The answers are an array of answers corresponding to the query.\n        Each answer is represented as an object with the following properties:\n            - name (string): The name of the tool used to generate the answer.\n            - arguments (object): An object representing the arguments passed to the tool to generate the answer.\n        Each argument is represented as a key-value pair, where the key is the parameter name and the\n        value is the corresponding value.\n    \"\"\"\n    if output is None:\n        return self._default_error(input)\n\n    if not self.use_default_structured_output:\n        output = remove_fences(output)\n\n    try:\n        pairs = orjson.loads(output)\n    except orjson.JSONDecodeError:\n        return self._default_error(input)\n\n    pairs = pairs[\"pairs\"] if self.use_default_structured_output else pairs\n\n    return self._format_output(pairs, input)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._format_output","title":"_format_output(pairs, input)","text":"

Parses the response, returning a dictionary with queries and answers.

Parameters:

Name Type Description Default pairs Dict[str, Any]

The parsed dictionary from the LLM's output.

required input Dict[str, Any]

The input from the LLM.

required

Returns:

Type Description Dict[str, Any]

Formatted output, where the queries are a list of strings, and the answers

Dict[str, Any]

are a list of objects.

Source code in src/distilabel/steps/tasks/apigen/generator.py
def _format_output(\n    self, pairs: Dict[str, Any], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"Parses the response, returning a dictionary with queries and answers.\n\n    Args:\n        pairs: The parsed dictionary from the LLM's output.\n        input: The input from the `LLM`.\n\n    Returns:\n        Formatted output, where the `queries` are a list of strings, and the `answers`\n        are a list of objects.\n    \"\"\"\n    try:\n        input.update(\n            **{\n                \"query\": pairs[0][\"query\"],\n                \"answers\": json.dumps(pairs[0][\"answers\"]),\n            }\n        )\n        return input\n    except Exception as e:\n        self._logger.error(f\"Error formatting output: {e}, pairs: '{pairs}'\")\n        return self._default_error(input)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator._default_error","title":"_default_error(input)","text":"

Returns a default error output, to fill the responses in case of failure.

Source code in src/distilabel/steps/tasks/apigen/generator.py
def _default_error(self, input: Dict[str, Any]) -> Dict[str, Any]:\n    \"\"\"Returns a default error output, to fill the responses in case of failure.\"\"\"\n    input.update(\n        **{\n            \"query\": None,\n            \"answers\": json.dumps([None] * self._number),\n        }\n    )\n    return input\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenGenerator.get_structured_output","title":"get_structured_output()","text":"

Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary.

The schema corresponds to the following:

from typing import Dict, List\nfrom pydantic import BaseModel\n\n\nclass Answer(BaseModel):\n    name: str\n    arguments: Dict[str, str]\n\nclass QueryAnswer(BaseModel):\n    query: str\n    answers: List[Answer]\n\nclass QueryAnswerPairs(BaseModel):\n    pairs: List[QueryAnswer]\n\njson.dumps(QueryAnswerPairs.model_json_schema(), indent=4)\n

Returns:

Type Description Dict[str, Any]

JSON Schema of the response to enforce.

Source code in src/distilabel/steps/tasks/apigen/generator.py
@override\ndef get_structured_output(self) -> Dict[str, Any]:\n    \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n    a dictionary with the output which can be directly parsed as a python dictionary.\n\n    The schema corresponds to the following:\n\n    ```python\n    from typing import Dict, List\n    from pydantic import BaseModel\n\n\n    class Answer(BaseModel):\n        name: str\n        arguments: Dict[str, str]\n\n    class QueryAnswer(BaseModel):\n        query: str\n        answers: List[Answer]\n\n    class QueryAnswerPairs(BaseModel):\n        pairs: List[QueryAnswer]\n\n    json.dumps(QueryAnswerPairs.model_json_schema(), indent=4)\n    ```\n\n    Returns:\n        JSON Schema of the response to enforce.\n    \"\"\"\n    return {\n        \"$defs\": {\n            \"Answer\": {\n                \"properties\": {\n                    \"name\": {\"title\": \"Name\", \"type\": \"string\"},\n                    \"arguments\": {\n                        \"additionalProperties\": {\"type\": \"string\"},\n                        \"title\": \"Arguments\",\n                        \"type\": \"object\",\n                    },\n                },\n                \"required\": [\"name\", \"arguments\"],\n                \"title\": \"Answer\",\n                \"type\": \"object\",\n            },\n            \"QueryAnswer\": {\n                \"properties\": {\n                    \"query\": {\"title\": \"Query\", \"type\": \"string\"},\n                    \"answers\": {\n                        \"items\": {\"$ref\": \"#/$defs/Answer\"},\n                        \"title\": \"Answers\",\n                        \"type\": \"array\",\n                    },\n                },\n                \"required\": [\"query\", \"answers\"],\n                \"title\": \"QueryAnswer\",\n                \"type\": \"object\",\n            },\n        },\n        \"properties\": {\n            \"pairs\": {\n                \"items\": 
{\"$ref\": \"#/$defs/QueryAnswer\"},\n                \"title\": \"Pairs\",\n                \"type\": \"array\",\n            }\n        },\n        \"required\": [\"pairs\"],\n        \"title\": \"QueryAnswerPairs\",\n        \"type\": \"object\",\n    }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker","title":"APIGenSemanticChecker","text":"

Bases: Task

Check whether the generated function calls and their execution results semantically match the user query.

The APIGenSemanticChecker is inspired by the APIGen pipeline, which was designed to generate verifiable and diverse function-calling datasets. The task asks an LLM to assess whether the function calls and their execution results align with the intentions of the user query, and marks whether each row should be kept.

Attributes:

Name Type Description system_prompt str

System prompt for the task. Has a default one.

exclude_failed_execution str

Whether to exclude failed executions (won't run on those rows that have a False in keep_row_after_execution_check column, which comes from running APIGenExecutionChecker). Defaults to True.

Input columns
  • func_desc (str): Description of what the function should do.
  • query (str): Instruction from the user.
  • answers (str): JSON encoded list with arguments to be passed to the function/API. Should be loaded using json.loads.
  • execution_result (str): Result of the function/API executed.
Output columns
  • thought (str): Reasoning for the output on whether to keep this output or not.
  • keep_row_after_semantic_check (bool): True or False, can be used to filter afterwards.
Categories
  • filtering
  • text-generation
References
  • APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets
  • Salesforce/xlam-function-calling-60k

Examples:

Semantic checker for generated function calls (original implementation):\n\n```python\nfrom distilabel.steps.tasks import APIGenSemanticChecker\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    generation_kwargs={\n        \"temperature\": 0.7,\n        \"max_new_tokens\": 1024,\n    },\n)\nsemantic_checker = APIGenSemanticChecker(\n    use_default_structured_output=False,\n    llm=llm\n)\nsemantic_checker.load()\n\nres = next(\n    semantic_checker.process(\n        [\n            {\n                \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n                \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n                \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n                \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n            }\n        ]\n    )\n)\nres\n# [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n# 'query': 'What information can be obtained about the Maine Coon cat breed?',\n# 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n# 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n# 'thought': '',\n# 'keep_row_after_semantic_check': True,\n# 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n#     'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. 
The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n#     {'role': 'user',\n#     'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n```\\n{\\n   \"thought\": \"Concisely describe your reasoning here\",\\n   \"pass\": \"yes\" or \"no\"\\n}\\n```\\n'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n```\n\nSemantic checker for generated function calls (structured output):\n\n```python\nfrom distilabel.steps.tasks import APIGenSemanticChecker\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    generation_kwargs={\n        \"temperature\": 0.7,\n        \"max_new_tokens\": 
1024,\n    },\n)\nsemantic_checker = APIGenSemanticChecker(\n    use_default_structured_output=True,\n    llm=llm\n)\nsemantic_checker.load()\n\nres = next(\n    semantic_checker.process(\n        [\n            {\n                \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n                \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n                \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n                \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n            }\n        ]\n    )\n)\nres\n# [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n# 'query': 'What information can be obtained about the Maine Coon cat breed?',\n# 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n# 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n# 'keep_row_after_semantic_check': True,\n# 'thought': '',\n# 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n#     'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. 
The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n#     {'role': 'user',\n#     'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n```\n
Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py
class APIGenSemanticChecker(Task):\n    r\"\"\"Generate queries and answers for the given functions in JSON format.\n\n    The `APIGenGenerator` is inspired by the APIGen pipeline, which was designed to generate\n    verifiable and diverse function-calling datasets. The task generates a set of diverse queries\n    and corresponding answers for the given functions in JSON format.\n\n    Attributes:\n        system_prompt: System prompt for the task. Has a default one.\n        exclude_failed_execution: Whether to exclude failed executions (won't run on those\n            rows that have a False in `keep_row_after_execution_check` column, which\n            comes from running `APIGenExecutionChecker`). Defaults to True.\n\n    Input columns:\n        - func_desc (`str`): Description of what the function should do.\n        - query (`str`): Instruction from the user.\n        - answers (`str`): JSON encoded list with arguments to be passed to the function/API.\n            Should be loaded using `json.loads`.\n        - execution_result (`str`): Result of the function/API executed.\n\n    Output columns:\n        - thought (`str`): Reasoning for the output on whether to keep this output or not.\n        - keep_row_after_semantic_check (`bool`): True or False, can be used to filter\n            afterwards.\n\n    Categories:\n        - filtering\n        - text-generation\n\n    References:\n        - [APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets](https://arxiv.org/abs/2406.18518)\n        - [Salesforce/xlam-function-calling-60k](https://huggingface.co/datasets/Salesforce/xlam-function-calling-60k)\n\n    Examples:\n\n        Semantic checker for generated function calls (original implementation):\n\n        ```python\n        from distilabel.steps.tasks import APIGenSemanticChecker\n        from distilabel.models import InferenceEndpointsLLM\n\n        llm=InferenceEndpointsLLM(\n            
model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            generation_kwargs={\n                \"temperature\": 0.7,\n                \"max_new_tokens\": 1024,\n            },\n        )\n        semantic_checker = APIGenSemanticChecker(\n            use_default_structured_output=False,\n            llm=llm\n        )\n        semantic_checker.load()\n\n        res = next(\n            semantic_checker.process(\n                [\n                    {\n                        \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n                        \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n                        \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n                        \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n                    }\n                ]\n            )\n        )\n        res\n        # [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n        # 'query': 'What information can be obtained about the Maine Coon cat breed?',\n        # 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n        # 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n        # 'thought': '',\n        # 'keep_row_after_semantic_check': True,\n        # 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n        #     'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. 
The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n        #     {'role': 'user',\n        #     'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n```\\n{\\n   \"thought\": \"Concisely describe your reasoning here\",\\n   \"pass\": \"yes\" or \"no\"\\n}\\n```\\n'}]},\n        # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n\n        Semantic checker for generated function calls (structured output):\n\n        ```python\n        from distilabel.steps.tasks import APIGenSemanticChecker\n        from distilabel.models import InferenceEndpointsLLM\n\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n         
   generation_kwargs={\n                \"temperature\": 0.7,\n                \"max_new_tokens\": 1024,\n            },\n        )\n        semantic_checker = APIGenSemanticChecker(\n            use_default_structured_output=True,\n            llm=llm\n        )\n        semantic_checker.load()\n\n        res = next(\n            semantic_checker.process(\n                [\n                    {\n                        \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n                        \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n                        \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n                        \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n                    }\n                ]\n            )\n        )\n        res\n        # [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n        # 'query': 'What information can be obtained about the Maine Coon cat breed?',\n        # 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n        # 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n        # 'keep_row_after_semantic_check': True,\n        # 'thought': '',\n        # 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n        #     'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. 
The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n        #     {'role': 'user',\n        #     'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n'}]},\n        # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n    \"\"\"\n\n    system_prompt: str = SYSTEM_PROMPT_SEMANTIC_CHECKER\n    use_default_structured_output: bool = False\n\n    _format_inst: Union[str, None] = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Loads the template for the generator prompt.\"\"\"\n        super().load()\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"apigen\"\n            / \"semantic_checker.jinja2\"\n        )\n\n        self._template 
= Template(open(_path).read())\n        self._format_inst = self._set_format_inst()\n\n    def _set_format_inst(self) -> str:\n        \"\"\"Prepares the function to generate the formatted instructions for the prompt.\n\n        If the default structured output is used, returns an empty string because nothing\n        else is needed, otherwise, returns the original addition to the prompt to guide the model\n        to generate a formatted JSON.\n        \"\"\"\n        return (\n            \"\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n\"\n            \"```\\n\"\n            \"{\\n\"\n            '   \"thought\": \"Concisely describe your reasoning here\",\\n'\n            '   \"passes\": \"yes\" or \"no\"\\n'\n            \"}\\n\"\n            \"```\\n\"\n        )\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The inputs for the task.\"\"\"\n        return {\n            \"func_desc\": True,\n            \"query\": True,\n            \"answers\": True,\n            \"execution_result\": True,\n            \"keep_row_after_execution_check\": True,\n        }\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType`.\"\"\"\n        return [\n            {\"role\": \"system\", \"content\": self.system_prompt},\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(\n                    func_desc=input[\"func_desc\"],\n                    query=input[\"query\"] or \"\",\n                    func_call=input[\"answers\"] or \"\",\n                    execution_result=input[\"execution_result\"],\n                    format_inst=self._format_inst,\n                ),\n            },\n        ]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The output for the task are the queries and corresponding answers.\"\"\"\n        return 
[\"keep_row_after_semantic_check\", \"thought\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a list with the score of each instruction.\n\n        Args:\n            output: the raw output of the LLM.\n            input: the input to the task. Used for obtaining the number of responses.\n\n        Returns:\n            A dict with the queries and answers pairs.\n            The answers are an array of answers corresponding to the query.\n            Each answer is represented as an object with the following properties:\n                - name (string): The name of the tool used to generate the answer.\n                - arguments (object): An object representing the arguments passed to the tool to generate the answer.\n            Each argument is represented as a key-value pair, where the key is the parameter name and the\n            value is the corresponding value.\n        \"\"\"\n        if output is None:\n            return self._default_error(input)\n\n        output = remove_fences(output)\n\n        try:\n            result = orjson.loads(output)\n            # Update the column name and change to bool\n            result[\"keep_row_after_semantic_check\"] = (\n                result.pop(\"passes\").lower() == \"yes\"\n            )\n            input.update(**result)\n            return input\n        except orjson.JSONDecodeError:\n            return self._default_error(input)\n\n    def _default_error(self, input: Dict[str, Any]) -> Dict[str, Any]:\n        \"\"\"Default error message for the task.\"\"\"\n        input.update({\"thought\": None, \"keep_row_after_semantic_check\": None})\n        return input\n\n    @override\n    def get_structured_output(self) -> Dict[str, Any]:\n        \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n        a dictionary with the output which can be directly parsed as a 
python dictionary.\n\n        The schema corresponds to the following:\n\n        ```python\n        from typing import Literal\n        from pydantic import BaseModel\n        import json\n\n        class Checker(BaseModel):\n            thought: str\n            passes: Literal[\"yes\", \"no\"]\n\n        json.dumps(Checker.model_json_schema(), indent=4)\n        ```\n\n        Returns:\n            JSON Schema of the response to enforce.\n        \"\"\"\n        return {\n            \"properties\": {\n                \"thought\": {\"title\": \"Thought\", \"type\": \"string\"},\n                \"passes\": {\"enum\": [\"yes\", \"no\"], \"title\": \"Passes\", \"type\": \"string\"},\n            },\n            \"required\": [\"thought\", \"passes\"],\n            \"title\": \"Checker\",\n            \"type\": \"object\",\n        }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.inputs","title":"inputs: StepColumns property","text":"

The inputs for the task.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.outputs","title":"outputs: StepColumns property","text":"

The outputs of the task are the keep_row_after_semantic_check flag and the thought with the reasoning behind it.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.load","title":"load()","text":"

Loads the template for the semantic checker prompt.

Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py
def load(self) -> None:\n    \"\"\"Loads the template for the generator prompt.\"\"\"\n    super().load()\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"apigen\"\n        / \"semantic_checker.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n    self._format_inst = self._set_format_inst()\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker._set_format_inst","title":"_set_format_inst()","text":"

Prepares the function to generate the formatted instructions for the prompt.

If the default structured output is used, returns an empty string because nothing else is needed, otherwise, returns the original addition to the prompt to guide the model to generate a formatted JSON.

Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py
def _set_format_inst(self) -> str:\n    \"\"\"Prepares the function to generate the formatted instructions for the prompt.\n\n    If the default structured output is used, returns an empty string because nothing\n    else is needed, otherwise, returns the original addition to the prompt to guide the model\n    to generate a formatted JSON.\n    \"\"\"\n    return (\n        \"\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n\"\n        \"```\\n\"\n        \"{\\n\"\n        '   \"thought\": \"Concisely describe your reasoning here\",\\n'\n        '   \"passes\": \"yes\" or \"no\"\\n'\n        \"}\\n\"\n        \"```\\n\"\n    )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType.

Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType`.\"\"\"\n    return [\n        {\"role\": \"system\", \"content\": self.system_prompt},\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(\n                func_desc=input[\"func_desc\"],\n                query=input[\"query\"] or \"\",\n                func_call=input[\"answers\"] or \"\",\n                execution_result=input[\"execution_result\"],\n                format_inst=self._format_inst,\n            ),\n        },\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.format_output","title":"format_output(output, input)","text":"

The raw output of the LLM is parsed as JSON and added to the input as the thought and keep_row_after_semantic_check columns.

Parameters:

Name Type Description Default output Union[str, None]

the raw output of the LLM.

required input Dict[str, Any]

the input to the task. It is updated with the parsed result and returned.

required

Returns:

Type Description Dict[str, Any]

The input dict updated with the thought and keep_row_after_semantic_check columns. keep_row_after_semantic_check is True when the model answered "yes" and False otherwise. If the output is None or cannot be decoded as JSON, both columns are set to None.

Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a list with the score of each instruction.\n\n    Args:\n        output: the raw output of the LLM.\n        input: the input to the task. Used for obtaining the number of responses.\n\n    Returns:\n        A dict with the queries and answers pairs.\n        The answers are an array of answers corresponding to the query.\n        Each answer is represented as an object with the following properties:\n            - name (string): The name of the tool used to generate the answer.\n            - arguments (object): An object representing the arguments passed to the tool to generate the answer.\n        Each argument is represented as a key-value pair, where the key is the parameter name and the\n        value is the corresponding value.\n    \"\"\"\n    if output is None:\n        return self._default_error(input)\n\n    output = remove_fences(output)\n\n    try:\n        result = orjson.loads(output)\n        # Update the column name and change to bool\n        result[\"keep_row_after_semantic_check\"] = (\n            result.pop(\"passes\").lower() == \"yes\"\n        )\n        input.update(**result)\n        return input\n    except orjson.JSONDecodeError:\n        return self._default_error(input)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker._default_error","title":"_default_error(input)","text":"

Returns the default error output for the task: the input with thought and keep_row_after_semantic_check set to None.

Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py
def _default_error(self, input: Dict[str, Any]) -> Dict[str, Any]:\n    \"\"\"Default error message for the task.\"\"\"\n    input.update({\"thought\": None, \"keep_row_after_semantic_check\": None})\n    return input\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.APIGenSemanticChecker.get_structured_output","title":"get_structured_output()","text":"

Creates the JSON schema that is passed to the LLM to enforce generating output that can be parsed directly into a Python dictionary.

The schema corresponds to the following:

from typing import Literal\nfrom pydantic import BaseModel\nimport json\n\nclass Checker(BaseModel):\n    thought: str\n    passes: Literal[\"yes\", \"no\"]\n\njson.dumps(Checker.model_json_schema(), indent=4)\n

Returns:

Type Description Dict[str, Any]

JSON Schema of the response to enforce.

Source code in src/distilabel/steps/tasks/apigen/semantic_checker.py
@override\ndef get_structured_output(self) -> Dict[str, Any]:\n    \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n    a dictionary with the output which can be directly parsed as a python dictionary.\n\n    The schema corresponds to the following:\n\n    ```python\n    from typing import Literal\n    from pydantic import BaseModel\n    import json\n\n    class Checker(BaseModel):\n        thought: str\n        passes: Literal[\"yes\", \"no\"]\n\n    json.dumps(Checker.model_json_schema(), indent=4)\n    ```\n\n    Returns:\n        JSON Schema of the response to enforce.\n    \"\"\"\n    return {\n        \"properties\": {\n            \"thought\": {\"title\": \"Thought\", \"type\": \"string\"},\n            \"passes\": {\"enum\": [\"yes\", \"no\"], \"title\": \"Passes\", \"type\": \"string\"},\n        },\n        \"required\": [\"thought\", \"passes\"],\n        \"title\": \"Checker\",\n        \"type\": \"object\",\n    }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller","title":"ArgillaLabeller","text":"

Bases: Task

Annotate Argilla records based on input fields, example records and question settings.

This task is designed to facilitate the annotation of Argilla records by leveraging a pre-trained LLM. It uses a system prompt that guides the LLM to understand the input fields, the question type, and the question settings. The task then formats the input data and generates a response based on the question. The response is validated against the question's value model, and the final suggestion is prepared for annotation.

Attributes:

Name Type Description _template Union[Template, None]

a Jinja2 template used to format the input for the LLM.

Input columns
  • record (argilla.Record): The record to be annotated.
  • fields (Optional[List[Dict[str, Any]]]): The list of field settings for the input fields.
  • question (Optional[Dict[str, Any]]): The question settings for the question to be answered.
  • example_records (Optional[List[Dict[str, Any]]]): The few shot example records with responses to be used to answer the question.
  • guidelines (Optional[str]): The guidelines for the annotation task.
Output columns
  • suggestion (Dict[str, Any]): The final suggestion for annotation.
Categories
  • text-classification
  • scorer
  • text-generation
References
  • Argilla: Argilla is a collaboration tool for AI engineers and domain experts to build high-quality datasets

Examples:

Annotate a record with the same dataset and question:

import argilla as rg\nfrom argilla import Suggestion\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Get information from Argilla dataset definition\ndataset = rg.Dataset(\"my_dataset\")\npending_records_filter = rg.Filter((\"status\", \"==\", \"pending\"))\ncompleted_records_filter = rg.Filter((\"status\", \"==\", \"completed\"))\npending_records = list(\n    dataset.records(\n        query=rg.Query(filter=pending_records_filter),\n        limit=5,\n    )\n)\nexample_records = list(\n    dataset.records(\n        query=rg.Query(filter=completed_records_filter),\n        limit=5,\n    )\n)\nfield = dataset.settings.fields[\"text\"]\nquestion = dataset.settings.questions[\"label\"]\n\n# Initialize the labeller with the model and fields\nlabeller = ArgillaLabeller(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    fields=[field],\n    question=question,\n    example_records=example_records,\n    guidelines=dataset.guidelines\n)\nlabeller.load()\n\n# Process the pending records\nresult = next(\n    labeller.process(\n        [\n            {\n                \"record\": record\n            } for record in pending_records\n        ]\n    )\n)\n\n# Add the suggestions to the records\nfor record, suggestion in zip(pending_records, result):\n    record.suggestions.add(Suggestion(**suggestion[\"suggestion\"]))\n\n# Log the updated records\ndataset.records.log(pending_records)\n

Annotate a record with alternating datasets and questions:

import argilla as rg\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Get information from Argilla dataset definition\ndataset = rg.Dataset(\"my_dataset\")\nfield = dataset.settings.fields[\"text\"]\nquestion = dataset.settings.questions[\"label\"]\nquestion2 = dataset.settings.questions[\"label2\"]\n\n# Initialize the labeller with the model and fields\nlabeller = ArgillaLabeller(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    )\n)\nlabeller.load()\n\n# Process the record\nrecord = next(dataset.records())\nresult = next(\n    labeller.process(\n        [\n            {\n                \"record\": record,\n                \"fields\": [field],\n                \"question\": question,\n            },\n            {\n                \"record\": record,\n                \"fields\": [field],\n                \"question\": question2,\n            }\n        ]\n    )\n)\n\n# Add the suggestions to the record\nfor suggestion in result:\n    record.suggestions.add(rg.Suggestion(**suggestion[\"suggestion\"]))\n\n# Log the updated record\ndataset.records.log([record])\n

Overwrite default prompts and instructions:

import argilla as rg\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Overwrite default prompts and instructions\nlabeller = ArgillaLabeller(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    system_prompt=\"You are an expert annotator and labelling assistant that understands complex domains and natural language processing.\",\n    question_to_label_instruction={\n        \"label_selection\": \"Select the appropriate label from the list of provided labels.\",\n        \"multi_label_selection\": \"Select none, one or multiple labels from the list of provided labels.\",\n        \"text\": \"Provide a text response to the question.\",\n        \"rating\": \"Provide a rating for the question.\",\n    },\n)\nlabeller.load()\n
Source code in src/distilabel/steps/tasks/argilla_labeller.py
class ArgillaLabeller(Task):\n    \"\"\"\n    Annotate Argilla records based on input fields, example records and question settings.\n\n    This task is designed to facilitate the annotation of Argilla records by leveraging a pre-trained LLM.\n    It uses a system prompt that guides the LLM to understand the input fields, the question type,\n    and the question settings. The task then formats the input data and generates a response based on the question.\n    The response is validated against the question's value model, and the final suggestion is prepared for annotation.\n\n    Attributes:\n        _template: a Jinja2 template used to format the input for the LLM.\n\n    Input columns:\n        - record (`argilla.Record`): The record to be annotated.\n        - fields (`Optional[List[Dict[str, Any]]]`): The list of field settings for the input fields.\n        - question (`Optional[Dict[str, Any]]`): The question settings for the question to be answered.\n        - example_records (`Optional[List[Dict[str, Any]]]`): The few shot example records with responses to be used to answer the question.\n        - guidelines (`Optional[str]`): The guidelines for the annotation task.\n\n    Output columns:\n        - suggestion (`Dict[str, Any]`): The final suggestion for annotation.\n\n    Categories:\n        - text-classification\n        - scorer\n        - text-generation\n\n    References:\n        - [`Argilla: Argilla is a collaboration tool for AI engineers and domain experts to build high-quality datasets`](https://github.com/argilla-io/argilla/)\n\n    Examples:\n        Annotate a record with the same dataset and question:\n\n        ```python\n        import argilla as rg\n        from argilla import Suggestion\n        from distilabel.steps.tasks import ArgillaLabeller\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Get information from Argilla dataset definition\n        dataset = rg.Dataset(\"my_dataset\")\n        
pending_records_filter = rg.Filter((\"status\", \"==\", \"pending\"))\n        completed_records_filter = rg.Filter((\"status\", \"==\", \"completed\"))\n        pending_records = list(\n            dataset.records(\n                query=rg.Query(filter=pending_records_filter),\n                limit=5,\n            )\n        )\n        example_records = list(\n            dataset.records(\n                query=rg.Query(filter=completed_records_filter),\n                limit=5,\n            )\n        )\n        field = dataset.settings.fields[\"text\"]\n        question = dataset.settings.questions[\"label\"]\n\n        # Initialize the labeller with the model and fields\n        labeller = ArgillaLabeller(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            fields=[field],\n            question=question,\n            example_records=example_records,\n            guidelines=dataset.guidelines\n        )\n        labeller.load()\n\n        # Process the pending records\n        result = next(\n            labeller.process(\n                [\n                    {\n                        \"record\": record\n                    } for record in pending_records\n                ]\n            )\n        )\n\n        # Add the suggestions to the records\n        for record, suggestion in zip(pending_records, result):\n            record.suggestions.add(Suggestion(**suggestion[\"suggestion\"]))\n\n        # Log the updated records\n        dataset.records.log(pending_records)\n        ```\n\n        Annotate a record with alternating datasets and questions:\n\n        ```python\n        import argilla as rg\n        from distilabel.steps.tasks import ArgillaLabeller\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Get information from Argilla dataset definition\n        dataset = rg.Dataset(\"my_dataset\")\n        field = dataset.settings.fields[\"text\"]\n  
      question = dataset.settings.questions[\"label\"]\n        question2 = dataset.settings.questions[\"label2\"]\n\n        # Initialize the labeller with the model and fields\n        labeller = ArgillaLabeller(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            )\n        )\n        labeller.load()\n\n        # Process the record\n        record = next(dataset.records())\n        result = next(\n            labeller.process(\n                [\n                    {\n                        \"record\": record,\n                        \"fields\": [field],\n                        \"question\": question,\n                    },\n                    {\n                        \"record\": record,\n                        \"fields\": [field],\n                        \"question\": question2,\n                    }\n                ]\n            )\n        )\n\n        # Add the suggestions to the record\n        for suggestion in result:\n            record.suggestions.add(rg.Suggestion(**suggestion[\"suggestion\"]))\n\n        # Log the updated record\n        dataset.records.log([record])\n        ```\n\n        Overwrite default prompts and instructions:\n\n        ```python\n        import argilla as rg\n        from distilabel.steps.tasks import ArgillaLabeller\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Overwrite default prompts and instructions\n        labeller = ArgillaLabeller(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            system_prompt=\"You are an expert annotator and labelling assistant that understands complex domains and natural language processing.\",\n            question_to_label_instruction={\n                \"label_selection\": \"Select the appropriate label from the list of provided labels.\",\n                \"multi_label_selection\": \"Select none, one or 
multiple labels from the list of provided labels.\",\n                \"text\": \"Provide a text response to the question.\",\n                \"rating\": \"Provide a rating for the question.\",\n            },\n        )\n        labeller.load()\n        ```\n    \"\"\"\n\n    system_prompt: str = (\n        \"You are an expert annotator and labelling assistant that understands complex domains and natural language processing. \"\n        \"You are given input fields and a question. \"\n        \"You should create a valid JSON object as an response to the question based on the input fields. \"\n    )\n    question_to_label_instruction: Dict[str, str] = {\n        \"label_selection\": \"Select the appropriate label for the fields from the list of optional labels.\",\n        \"multi_label_selection\": \"Select none, one or multiple labels for the fields from the list of optional labels.\",\n        \"text\": \"Provide a response to the question based on the fields.\",\n        \"rating\": \"Provide a rating for the question based on the fields.\",\n    }\n    example_records: Optional[\n        RuntimeParameter[Union[List[Union[Dict[str, Any], BaseModel]], None]]\n    ] = Field(\n        default=None,\n        description=\"The few shot serialized example records or `BaseModel`s with responses to be used to answer the question.\",\n    )\n    fields: Optional[\n        RuntimeParameter[Union[List[Union[BaseModel, Dict[str, Any]]], None]]\n    ] = Field(\n        default=None,\n        description=\"The field serialized field settings or `BaseModel` for the fields to be used to answer the question.\",\n    )\n    question: Optional[\n        RuntimeParameter[\n            Union[\n                Dict[str, Any],\n                BaseModel,\n                None,\n            ]\n        ]\n    ] = Field(\n        default=None,\n        description=\"The question serialized question settings or `BaseModel` for the question to be answered.\",\n    )\n    guidelines: 
Optional[RuntimeParameter[str]] = Field(\n        default=None,\n        description=\"The guidelines for the annotation task.\",\n    )\n\n    _template: Union[Template, None] = PrivateAttr(...)\n    _client: Optional[Any] = PrivateAttr(None)\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"argillalabeller.jinja2\"\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> Dict[str, bool]:\n        return {\n            \"record\": True,\n            \"fields\": False,\n            \"question\": False,\n            \"example_records\": False,\n            \"guidelines\": False,\n        }\n\n    def _format_record(\n        self, record: Dict[str, Any], fields: List[Dict[str, Any]]\n    ) -> str:\n        \"\"\"Format the record fields into a string.\n\n        Args:\n            record (Dict[str, Any]): The record to format.\n            fields (List[Dict[str, Any]]): The fields to format.\n\n        Returns:\n            str: The formatted record fields.\n        \"\"\"\n        output = []\n        for field in fields:\n            output.append(record.get(\"fields\", {}).get(field.get(\"name\", \"\")))\n        return \"fields: \" + \"\\n\".join(output)\n\n    def _get_label_instruction(self, question: Dict[str, Any]) -> str:\n        \"\"\"Get the label instruction for the question.\n\n        Args:\n            question (Dict[str, Any]): The question to get the label instruction for.\n\n        Returns:\n            str: The label instruction for the question.\n        \"\"\"\n        question_type = question[\"settings\"][\"type\"]\n        return self.question_to_label_instruction[question_type]\n\n    def _format_question(self, question: Dict[str, Any]) -> str:\n        
\"\"\"Format the question settings into a string.\n\n        Args:\n            question (Dict[str, Any]): The question to format.\n\n        Returns:\n            str: The formatted question.\n        \"\"\"\n        output = []\n        output.append(f\"question: {self._get_label_instruction(question)}\")\n        if \"options\" in question.get(\"settings\", {}):\n            output.append(\n                f\"optional labels: {[option['value'] for option in question.get('settings', {}).get('options', [])]}\"\n            )\n        return \"\\n\".join(output)\n\n    def _format_example_records(\n        self,\n        records: List[Dict[str, Any]],\n        fields: List[Dict[str, Any]],\n        question: Dict[str, Any],\n    ) -> str:\n        \"\"\"Format the example records into a string.\n\n        Args:\n            records (List[Dict[str, Any]]): The records to format.\n            fields (List[Dict[str, Any]]): The fields to format.\n            question (Dict[str, Any]): The question to format.\n\n        Returns:\n            str: The formatted example records.\n        \"\"\"\n        base = []\n        for record in records:\n            responses = record.get(\"responses\", {})\n            if responses.get(question[\"name\"]):\n                base.append(self._format_record(record, fields))\n                value = responses[question[\"name\"]][0][\"value\"]\n                formatted_value = self._assign_value_to_question_value_model(\n                    value, question\n                )\n                base.append(f\"response: {formatted_value}\")\n                base.append(\"\")\n            else:\n                warnings.warn(\n                    f\"Record {record} has no response for question {question['name']}. 
Skipping example record.\",\n                    stacklevel=2,\n                )\n        return \"\\n\".join(base)\n\n    def format_input(\n        self,\n        input: Dict[\n            str,\n            Union[\n                Dict[str, Any],\n                \"Record\",\n                \"TextField\",\n                \"MultiLabelQuestion\",\n                \"LabelQuestion\",\n                \"RatingQuestion\",\n                \"TextQuestion\",\n            ],\n        ],\n    ) -> \"ChatType\":\n        \"\"\"Format the input into a chat message.\n\n        Args:\n            input: The input to format.\n\n        Returns:\n            The formatted chat message.\n\n        Raises:\n            ValueError: If question or fields are not provided.\n        \"\"\"\n        input_keys = list(self.inputs.keys())\n        record = input[input_keys[0]]\n        fields = input.get(input_keys[1], self.fields)\n        question = input.get(input_keys[2], self.question)\n        examples = input.get(input_keys[3], self.example_records)\n        guidelines = input.get(input_keys[4], self.guidelines)\n\n        if question is None:\n            raise ValueError(\"Question must be provided.\")\n        if fields is None or any(field is None for field in fields):\n            raise ValueError(\"Fields must be provided.\")\n\n        record = record.to_dict() if not isinstance(record, dict) else record\n        question = question.serialize() if not isinstance(question, dict) else question\n        fields = [\n            field.serialize() if not isinstance(field, dict) else field\n            for field in fields\n        ]\n        examples = (\n            [\n                example.to_dict() if not isinstance(example, dict) else example\n                for example in examples\n            ]\n            if examples\n            else None\n        )\n\n        formatted_fields = self._format_record(record, fields)\n        formatted_question = 
self._format_question(question)\n        formatted_examples = (\n            self._format_example_records(examples, fields, question)\n            if examples\n            else False\n        )\n\n        prompt = self._template.render(\n            fields=formatted_fields,\n            question=formatted_question,\n            examples=formatted_examples,\n            guidelines=guidelines,\n        )\n\n        messages = []\n        if self.system_prompt:\n            messages.append({\"role\": \"system\", \"content\": self.system_prompt})\n        messages.append({\"role\": \"user\", \"content\": prompt})\n        return messages\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"suggestion\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"Format the output into a dictionary.\n\n        Args:\n            output (Union[str, None]): The output to format.\n            input (Dict[str, Any]): The input to format.\n\n        Returns:\n            Dict[str, Any]: The formatted output.\n        \"\"\"\n        from argilla import Suggestion\n\n        question: Union[\n            Any,\n            Dict[str, Any],\n            LabelQuestion,\n            MultiLabelQuestion,\n            RatingQuestion,\n            TextQuestion,\n            None,\n        ] = input.get(list(self.inputs.keys())[2], self.question) or self.question\n        question = question.serialize() if not isinstance(question, dict) else question\n        model = self._get_pydantic_model_of_structured_output(question)\n        validated_output = model(**json.loads(output))\n        value = self._get_value_from_question_value_model(validated_output)\n        suggestion = Suggestion(\n            value=value,\n            question_name=question[\"name\"],\n            type=\"model\",\n            agent=self.llm.model_name,\n        ).serialize()\n        return {\n            self.outputs[0]: 
{\n                k: v\n                for k, v in suggestion.items()\n                if k in [\"value\", \"question_name\", \"type\", \"agent\"]\n            }\n        }\n\n    def _set_llm_structured_output_for_question(self, question: Dict[str, Any]) -> None:\n        runtime_parameters = self.llm._runtime_parameters\n        runtime_parameters.update(\n            {\n                \"structured_output\": {\n                    \"format\": \"json\",\n                    \"schema\": self._get_pydantic_model_of_structured_output(question),\n                },\n            }\n        )\n        self.llm.set_runtime_parameters(runtime_parameters)\n\n    @override\n    def process(self, inputs: StepInput) -> \"StepOutput\":\n        \"\"\"Process the input through the task.\n\n        Args:\n            inputs (StepInput): The input to process.\n\n        Returns:\n            StepOutput: The output of the task.\n        \"\"\"\n\n        question_list = [input.get(\"question\", self.question) for input in inputs]\n        fields_list = [input.get(\"fields\", self.fields) for input in inputs]\n        # check if any field for the field in fields is None\n        for fields in fields_list:\n            if any(field is None for field in fields):\n                raise ValueError(\n                    \"Fields must be provided during init or through `process` method.\"\n                )\n        # check if any question is None\n        if any(question is None for question in question_list):\n            raise ValueError(\n                \"Question must be provided during init or through `process` method.\"\n            )\n        question_list = [\n            question.serialize() if not isinstance(question, dict) else question\n            for question in question_list\n        ]\n        if not all(question == question_list[0] for question in question_list):\n            warnings.warn(\n                \"Not all questions are the same. 
Processing each question separately by setting the structured output for each question. This may impact performance.\",\n                stacklevel=2,\n            )\n            for input, question in zip(inputs, question_list):\n                self._set_llm_structured_output_for_question(question)\n                yield from super().process([input])\n        else:\n            question = question_list[0]\n            self._set_llm_structured_output_for_question(question)\n            yield from super().process(inputs)\n\n    def _get_value_from_question_value_model(\n        self, question_value_model: BaseModel\n    ) -> Any:\n        \"\"\"Get the value from the question value model.\n\n        Args:\n            question_value_model (BaseModel): The question value model to get the value from.\n\n        Returns:\n            Any: The value from the question value model.\n        \"\"\"\n        for attr in [\"label\", \"labels\", \"rating\", \"text\"]:\n            if hasattr(question_value_model, attr):\n                return getattr(question_value_model, attr)\n        raise ValueError(f\"Unsupported question type: {question_value_model}\")\n\n    def _assign_value_to_question_value_model(\n        self, value: Any, question: Dict[str, Any]\n    ) -> BaseModel:\n        \"\"\"Assign the value to the question value model.\n\n        Args:\n            value (Any): The value to assign.\n            question (Dict[str, Any]): The question to assign the value to.\n\n        Returns:\n            BaseModel: The question value model with the assigned value.\n        \"\"\"\n        question_value_model = self._get_pydantic_model_of_structured_output(question)\n        for attr in [\"label\", \"labels\", \"rating\", \"text\"]:\n            try:\n                model_dict = {attr: value}\n                question_value_model = question_value_model(**model_dict)\n                return question_value_model.model_dump_json()\n            except AttributeError:\n    
            pass\n        return value\n\n    def _get_pydantic_model_of_structured_output(\n        self,\n        question: Dict[str, Any],\n    ) -> BaseModel:\n        \"\"\"Get the Pydantic model of the structured output.\n\n        Args:\n            question (Dict[str, Any]): The question to get the Pydantic model of the structured output for.\n\n        Returns:\n            BaseModel: The Pydantic model of the structured output.\n        \"\"\"\n\n        question_type = question[\"settings\"][\"type\"]\n\n        if question_type == \"multi_label_selection\":\n\n            class QuestionValueModel(BaseModel):\n                labels: Optional[List[str]] = Field(default_factory=list)\n\n        elif question_type == \"label_selection\":\n\n            class QuestionValueModel(BaseModel):\n                label: str\n\n        elif question_type == \"text\":\n\n            class QuestionValueModel(BaseModel):\n                text: str\n\n        elif question_type == \"rating\":\n\n            class QuestionValueModel(BaseModel):\n                rating: int\n        else:\n            raise ValueError(f\"Unsupported question type: {question}\")\n\n        return QuestionValueModel\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller.load","title":"load()","text":"

Loads the Jinja2 template.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"argillalabeller.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._format_record","title":"_format_record(record, fields)","text":"

Format the record fields into a string.

Parameters:

Name Type Description Default record Dict[str, Any]

The record to format.

required fields List[Dict[str, Any]]

The fields to format.

required

Returns:

Name Type Description str str

The formatted record fields.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def _format_record(\n    self, record: Dict[str, Any], fields: List[Dict[str, Any]]\n) -> str:\n    \"\"\"Format the record fields into a string.\n\n    Args:\n        record (Dict[str, Any]): The record to format.\n        fields (List[Dict[str, Any]]): The fields to format.\n\n    Returns:\n        str: The formatted record fields.\n    \"\"\"\n    output = []\n    for field in fields:\n        output.append(record.get(\"fields\", {}).get(field.get(\"name\", \"\")))\n    return \"fields: \" + \"\\n\".join(output)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._get_label_instruction","title":"_get_label_instruction(question)","text":"

Get the label instruction for the question.

Parameters:

Name Type Description Default question Dict[str, Any]

The question to get the label instruction for.

required

Returns:

Name Type Description str str

The label instruction for the question.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def _get_label_instruction(self, question: Dict[str, Any]) -> str:\n    \"\"\"Get the label instruction for the question.\n\n    Args:\n        question (Dict[str, Any]): The question to get the label instruction for.\n\n    Returns:\n        str: The label instruction for the question.\n    \"\"\"\n    question_type = question[\"settings\"][\"type\"]\n    return self.question_to_label_instruction[question_type]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._format_question","title":"_format_question(question)","text":"

Format the question settings into a string.

Parameters:

Name Type Description Default question Dict[str, Any]

The question to format.

required

Returns:

Name Type Description str str

The formatted question.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def _format_question(self, question: Dict[str, Any]) -> str:\n    \"\"\"Format the question settings into a string.\n\n    Args:\n        question (Dict[str, Any]): The question to format.\n\n    Returns:\n        str: The formatted question.\n    \"\"\"\n    output = []\n    output.append(f\"question: {self._get_label_instruction(question)}\")\n    if \"options\" in question.get(\"settings\", {}):\n        output.append(\n            f\"optional labels: {[option['value'] for option in question.get('settings', {}).get('options', [])]}\"\n        )\n    return \"\\n\".join(output)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._format_example_records","title":"_format_example_records(records, fields, question)","text":"

Format the example records into a string.

Parameters:

Name Type Description Default records List[Dict[str, Any]]

The records to format.

required fields List[Dict[str, Any]]

The fields to format.

required question Dict[str, Any]

The question to format.

required

Returns:

Name Type Description str str

The formatted example records.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def _format_example_records(\n    self,\n    records: List[Dict[str, Any]],\n    fields: List[Dict[str, Any]],\n    question: Dict[str, Any],\n) -> str:\n    \"\"\"Format the example records into a string.\n\n    Args:\n        records (List[Dict[str, Any]]): The records to format.\n        fields (List[Dict[str, Any]]): The fields to format.\n        question (Dict[str, Any]): The question to format.\n\n    Returns:\n        str: The formatted example records.\n    \"\"\"\n    base = []\n    for record in records:\n        responses = record.get(\"responses\", {})\n        if responses.get(question[\"name\"]):\n            base.append(self._format_record(record, fields))\n            value = responses[question[\"name\"]][0][\"value\"]\n            formatted_value = self._assign_value_to_question_value_model(\n                value, question\n            )\n            base.append(f\"response: {formatted_value}\")\n            base.append(\"\")\n        else:\n            warnings.warn(\n                f\"Record {record} has no response for question {question['name']}. Skipping example record.\",\n                stacklevel=2,\n            )\n    return \"\\n\".join(base)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller.format_input","title":"format_input(input)","text":"

Format the input into a chat message.

Parameters:

Name Type Description Default input Dict[str, Union[Dict[str, Any], Record, TextField, MultiLabelQuestion, LabelQuestion, RatingQuestion, TextQuestion]]

The input to format.

required

Returns:

Type Description ChatType

The formatted chat message.

Raises:

Type Description ValueError

If question or fields are not provided.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def format_input(\n    self,\n    input: Dict[\n        str,\n        Union[\n            Dict[str, Any],\n            \"Record\",\n            \"TextField\",\n            \"MultiLabelQuestion\",\n            \"LabelQuestion\",\n            \"RatingQuestion\",\n            \"TextQuestion\",\n        ],\n    ],\n) -> \"ChatType\":\n    \"\"\"Format the input into a chat message.\n\n    Args:\n        input: The input to format.\n\n    Returns:\n        The formatted chat message.\n\n    Raises:\n        ValueError: If question or fields are not provided.\n    \"\"\"\n    input_keys = list(self.inputs.keys())\n    record = input[input_keys[0]]\n    fields = input.get(input_keys[1], self.fields)\n    question = input.get(input_keys[2], self.question)\n    examples = input.get(input_keys[3], self.example_records)\n    guidelines = input.get(input_keys[4], self.guidelines)\n\n    if question is None:\n        raise ValueError(\"Question must be provided.\")\n    if fields is None or any(field is None for field in fields):\n        raise ValueError(\"Fields must be provided.\")\n\n    record = record.to_dict() if not isinstance(record, dict) else record\n    question = question.serialize() if not isinstance(question, dict) else question\n    fields = [\n        field.serialize() if not isinstance(field, dict) else field\n        for field in fields\n    ]\n    examples = (\n        [\n            example.to_dict() if not isinstance(example, dict) else example\n            for example in examples\n        ]\n        if examples\n        else None\n    )\n\n    formatted_fields = self._format_record(record, fields)\n    formatted_question = self._format_question(question)\n    formatted_examples = (\n        self._format_example_records(examples, fields, question)\n        if examples\n        else False\n    )\n\n    prompt = self._template.render(\n        fields=formatted_fields,\n        question=formatted_question,\n        examples=formatted_examples,\n        
guidelines=guidelines,\n    )\n\n    messages = []\n    if self.system_prompt:\n        messages.append({\"role\": \"system\", \"content\": self.system_prompt})\n    messages.append({\"role\": \"user\", \"content\": prompt})\n    return messages\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller.format_output","title":"format_output(output, input)","text":"

Format the output into a dictionary.

Parameters:

Name Type Description Default output Union[str, None]

The output to format.

required input Dict[str, Any]

The input to format.

required

Returns:

Type Description Dict[str, Any]

Dict[str, Any]: The formatted output.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"Format the output into a dictionary.\n\n    Args:\n        output (Union[str, None]): The output to format.\n        input (Dict[str, Any]): The input to format.\n\n    Returns:\n        Dict[str, Any]: The formatted output.\n    \"\"\"\n    from argilla import Suggestion\n\n    question: Union[\n        Any,\n        Dict[str, Any],\n        LabelQuestion,\n        MultiLabelQuestion,\n        RatingQuestion,\n        TextQuestion,\n        None,\n    ] = input.get(list(self.inputs.keys())[2], self.question) or self.question\n    question = question.serialize() if not isinstance(question, dict) else question\n    model = self._get_pydantic_model_of_structured_output(question)\n    validated_output = model(**json.loads(output))\n    value = self._get_value_from_question_value_model(validated_output)\n    suggestion = Suggestion(\n        value=value,\n        question_name=question[\"name\"],\n        type=\"model\",\n        agent=self.llm.model_name,\n    ).serialize()\n    return {\n        self.outputs[0]: {\n            k: v\n            for k, v in suggestion.items()\n            if k in [\"value\", \"question_name\", \"type\", \"agent\"]\n        }\n    }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller.process","title":"process(inputs)","text":"

Process the input through the task.

Parameters:

Name Type Description Default inputs StepInput

The input to process.

required

Returns:

Name Type Description StepOutput StepOutput

The output of the task.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
@override\ndef process(self, inputs: StepInput) -> \"StepOutput\":\n    \"\"\"Process the input through the task.\n\n    Args:\n        inputs (StepInput): The input to process.\n\n    Returns:\n        StepOutput: The output of the task.\n    \"\"\"\n\n    question_list = [input.get(\"question\", self.question) for input in inputs]\n    fields_list = [input.get(\"fields\", self.fields) for input in inputs]\n    # check if any field for the field in fields is None\n    for fields in fields_list:\n        if any(field is None for field in fields):\n            raise ValueError(\n                \"Fields must be provided during init or through `process` method.\"\n            )\n    # check if any question is None\n    if any(question is None for question in question_list):\n        raise ValueError(\n            \"Question must be provided during init or through `process` method.\"\n        )\n    question_list = [\n        question.serialize() if not isinstance(question, dict) else question\n        for question in question_list\n    ]\n    if not all(question == question_list[0] for question in question_list):\n        warnings.warn(\n            \"Not all questions are the same. Processing each question separately by setting the structured output for each question. This may impact performance.\",\n            stacklevel=2,\n        )\n        for input, question in zip(inputs, question_list):\n            self._set_llm_structured_output_for_question(question)\n            yield from super().process([input])\n    else:\n        question = question_list[0]\n        self._set_llm_structured_output_for_question(question)\n        yield from super().process(inputs)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._get_value_from_question_value_model","title":"_get_value_from_question_value_model(question_value_model)","text":"

Get the value from the question value model.

Parameters:

Name Type Description Default question_value_model BaseModel

The question value model to get the value from.

required

Returns:

Name Type Description Any Any

The value from the question value model.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def _get_value_from_question_value_model(\n    self, question_value_model: BaseModel\n) -> Any:\n    \"\"\"Get the value from the question value model.\n\n    Args:\n        question_value_model (BaseModel): The question value model to get the value from.\n\n    Returns:\n        Any: The value from the question value model.\n    \"\"\"\n    for attr in [\"label\", \"labels\", \"rating\", \"text\"]:\n        if hasattr(question_value_model, attr):\n            return getattr(question_value_model, attr)\n    raise ValueError(f\"Unsupported question type: {question_value_model}\")\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._assign_value_to_question_value_model","title":"_assign_value_to_question_value_model(value, question)","text":"

Assign the value to the question value model.

Parameters:

Name Type Description Default value Any

The value to assign.

required question Dict[str, Any]

The question to assign the value to.

required

Returns:

Name Type Description BaseModel BaseModel

The question value model with the assigned value.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def _assign_value_to_question_value_model(\n    self, value: Any, question: Dict[str, Any]\n) -> BaseModel:\n    \"\"\"Assign the value to the question value model.\n\n    Args:\n        value (Any): The value to assign.\n        question (Dict[str, Any]): The question to assign the value to.\n\n    Returns:\n        BaseModel: The question value model with the assigned value.\n    \"\"\"\n    question_value_model = self._get_pydantic_model_of_structured_output(question)\n    for attr in [\"label\", \"labels\", \"rating\", \"text\"]:\n        try:\n            model_dict = {attr: value}\n            question_value_model = question_value_model(**model_dict)\n            return question_value_model.model_dump_json()\n        except AttributeError:\n            pass\n    return value\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ArgillaLabeller._get_pydantic_model_of_structured_output","title":"_get_pydantic_model_of_structured_output(question)","text":"

Get the Pydantic model of the structured output.

Parameters:

Name Type Description Default question Dict[str, Any]

The question to get the Pydantic model of the structured output for.

required

Returns:

Name Type Description BaseModel BaseModel

The Pydantic model of the structured output.

Source code in src/distilabel/steps/tasks/argilla_labeller.py
def _get_pydantic_model_of_structured_output(\n    self,\n    question: Dict[str, Any],\n) -> BaseModel:\n    \"\"\"Get the Pydantic model of the structured output.\n\n    Args:\n        question (Dict[str, Any]): The question to get the Pydantic model of the structured output for.\n\n    Returns:\n        BaseModel: The Pydantic model of the structured output.\n    \"\"\"\n\n    question_type = question[\"settings\"][\"type\"]\n\n    if question_type == \"multi_label_selection\":\n\n        class QuestionValueModel(BaseModel):\n            labels: Optional[List[str]] = Field(default_factory=list)\n\n    elif question_type == \"label_selection\":\n\n        class QuestionValueModel(BaseModel):\n            label: str\n\n    elif question_type == \"text\":\n\n        class QuestionValueModel(BaseModel):\n            text: str\n\n    elif question_type == \"rating\":\n\n        class QuestionValueModel(BaseModel):\n            rating: int\n    else:\n        raise ValueError(f\"Unsupported question type: {question}\")\n\n    return QuestionValueModel\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.CLAIR","title":"CLAIR","text":"

Bases: Task

Contrastive Learning from AI Revisions (CLAIR).

CLAIR uses an AI system to minimally revise a solution A\u2192A\u00b4 such that the resulting preference pair (A\u00b4 preferred over A) is much more contrastive and precise.

Input columns
  • task (str): The task or instruction.
  • student_solution (str): An answer to the task that is to be revised.
Output columns
  • revision (str): The revised text.
  • rational (str): The rationale for the provided revision.
  • model_name (str): The name of the model used to generate the revision and rationale.
Categories
  • preference
  • text-generation
References
  • Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment
  • APO and CLAIR - GitHub Repository

Examples:

Create contrastive preference pairs:

from distilabel.steps.tasks import CLAIR\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    generation_kwargs={\n        \"temperature\": 0.7,\n        \"max_new_tokens\": 4096,\n    },\n)\nclair_task = CLAIR(llm=llm)\n\nclair_task.load()\n\nresult = next(\n    clair_task.process(\n        [\n            {\n                \"task\": \"How many gaps are there between the earth and the moon?\",\n                \"student_solution\": 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon's orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.'\n            }\n        ]\n    )\n)\n# result\n# [{'task': 'How many gaps are there between the earth and the moon?',\n# 'student_solution': 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. 
The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.',\n# 'revision': 'There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n# 'rational': 'The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. 
The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.',\n# 'distilabel_metadata': {'raw_output_c_l_a_i_r_0': '{teacher_reasoning}: The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.\\n\\n{corrected_student_solution}: There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n# 'raw_input_c_l_a_i_r_0': [{'role': 'system',\n#     'content': \"You are a teacher and your task is to minimally improve a student's answer. I will give you a {task} and a {student_solution}. 
Your job is to revise the {student_solution} such that it is clearer, more correct, and more engaging. Copy all non-corrected parts of the student's answer. Do not allude to the {corrected_student_solution} being a revision or a correction in your final solution.\"},\n#     {'role': 'user',\n#     'content': '{task}: How many gaps are there between the earth and the moon?\\n\\n{student_solution}: There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.\\n\\n-----------------\\n\\nLet\\'s first think step by step with a {teacher_reasoning} to decide how to improve the {student_solution}, then give the {corrected_student_solution}. Mention the {teacher_reasoning} and {corrected_student_solution} identifiers to structure your answer.'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n

Citations:

```\n@misc{doosterlinck2024anchoredpreferenceoptimizationcontrastive,\n    title={Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment},\n    author={Karel D'Oosterlinck and Winnie Xu and Chris Develder and Thomas Demeester and Amanpreet Singh and Christopher Potts and Douwe Kiela and Shikib Mehri},\n    year={2024},\n    eprint={2408.06266},\n    archivePrefix={arXiv},\n    primaryClass={cs.LG},\n    url={https://arxiv.org/abs/2408.06266},\n}\n```\n
Source code in src/distilabel/steps/tasks/clair.py
class CLAIR(Task):\n    r\"\"\"Contrastive Learning from AI Revisions (CLAIR).\n\n    CLAIR uses an AI system to minimally revise a solution A\u2192A\u00b4 such that the resulting\n    preference A `preferred` A\u2019 is much more contrastive and precise.\n\n    Input columns:\n        - task (`str`): The task or instruction.\n        - student_solution (`str`): An answer to the task that is to be revised.\n\n    Output columns:\n        - revision (`str`): The revised text.\n        - rational (`str`): The rational for the provided revision.\n        - model_name (`str`): The name of the model used to generate the revision and rational.\n\n    Categories:\n        - preference\n        - text-generation\n\n    References:\n        - [`Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment`](https://arxiv.org/abs/2408.06266v1)\n        - [`APO and CLAIR - GitHub Repository`](https://github.com/ContextualAI/CLAIR_and_APO)\n\n    Examples:\n        Create contrastive preference pairs:\n\n        ```python\n        from distilabel.steps.tasks import CLAIR\n        from distilabel.models import InferenceEndpointsLLM\n\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            generation_kwargs={\n                \"temperature\": 0.7,\n                \"max_new_tokens\": 4096,\n            },\n        )\n        clair_task = CLAIR(llm=llm)\n\n        clair_task.load()\n\n        result = next(\n            clair_task.process(\n                [\n                    {\n                        \"task\": \"How many gaps are there between the earth and the moon?\",\n                        \"student_solution\": 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. 
The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon's orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.'\n                    }\n                ]\n            )\n        )\n        # result\n        # [{'task': 'How many gaps are there between the earth and the moon?',\n        # 'student_solution': 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.',\n        # 'revision': 'There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. 
The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n        # 'rational': 'The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.',\n        # 'distilabel_metadata': {'raw_output_c_l_a_i_r_0': '{teacher_reasoning}: The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.\\n\\n{corrected_student_solution}: There are no physical gaps or empty spaces between the Earth and the Moon. 
The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n        # 'raw_input_c_l_a_i_r_0': [{'role': 'system',\n        #     'content': \"You are a teacher and your task is to minimally improve a student's answer. I will give you a {task} and a {student_solution}. Your job is to revise the {student_solution} such that it is clearer, more correct, and more engaging. Copy all non-corrected parts of the student's answer. Do not allude to the {corrected_student_solution} being a revision or a correction in your final solution.\"},\n        #     {'role': 'user',\n        #     'content': '{task}: How many gaps are there between the earth and the moon?\\n\\n{student_solution}: There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. 
The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.\\n\\n-----------------\\n\\nLet\\'s first think step by step with a {teacher_reasoning} to decide how to improve the {student_solution}, then give the {corrected_student_solution}. Mention the {teacher_reasoning} and {corrected_student_solution} identifiers to structure your answer.'}]},\n        # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n\n    Citations:\n\n        ```\n        @misc{doosterlinck2024anchoredpreferenceoptimizationcontrastive,\n            title={Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment},\n            author={Karel D'Oosterlinck and Winnie Xu and Chris Develder and Thomas Demeester and Amanpreet Singh and Christopher Potts and Douwe Kiela and Shikib Mehri},\n            year={2024},\n            eprint={2408.06266},\n            archivePrefix={arXiv},\n            primaryClass={cs.LG},\n            url={https://arxiv.org/abs/2408.06266},\n        }\n        ```\n    \"\"\"\n\n    system_prompt: str = SYSTEM_PROMPT\n    _template: Union[Template, None] = PrivateAttr(...)\n\n    def load(self) -> None:\n        super().load()\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"clair.jinja2\"\n        )\n        with open(_path, \"r\") as f:\n            self._template = Template(f.read())\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        return [\"task\", \"student_solution\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        
return [\"revision\", \"rational\", \"model_name\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        return [\n            {\"role\": \"system\", \"content\": self.system_prompt},\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(\n                    task=input[\"task\"], student_solution=input[\"student_solution\"]\n                ),\n            },\n        ]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a list with the score of each instruction-response pair.\n\n        Args:\n            output: the raw output of the LLM.\n            input: the input to the task. Used for obtaining the number of responses.\n\n        Returns:\n            A dict with the key `scores` containing the scores for each instruction-response pair.\n        \"\"\"\n        if output is None:\n            return self._default_error()\n\n        return self._format_output(output)\n\n    def _format_output(self, output: Union[str, None]) -> Dict[str, Any]:\n        if \"**Corrected Student Solution:**\" in output:\n            splits = output.split(\"**Corrected Student Solution:**\")\n        elif \"{corrected_student_solution}:\" in output:\n            splits = output.split(\"{corrected_student_solution}:\")\n        elif \"{corrected_student_solution}\" in output:\n            splits = output.split(\"{corrected_student_solution}\")\n        elif \"**Worsened Student Solution:**\" in output:\n            splits = output.split(\"**Worsened Student Solution:**\")\n        elif \"{worsened_student_solution}:\" in output:\n            splits = output.split(\"{worsened_student_solution}:\")\n        elif \"{worsened_student_solution}\" in 
output:\n            splits = output.split(\"{worsened_student_solution}\")\n        else:\n            splits = None\n\n        # Safety check when the output doesn't follow the expected format\n        if not splits:\n            return self._default_error()\n\n        if len(splits) >= 2:\n            revision = splits[1]\n            revision = revision.strip(\"\\n\\n\").strip()  # noqa: B005\n\n            rational = splits[0]\n            if \"{teacher_reasoning}\" in rational:\n                rational = rational.split(\"{teacher_reasoning}\")[1].strip(\":\").strip()\n            rational = rational.strip(\"\\n\\n\").strip()  # noqa: B005\n        else:\n            return self._default_error()\n        return {\"revision\": revision, \"rational\": rational}\n\n    def _default_error(self) -> Dict[str, None]:\n        return {\"revision\": None, \"rational\": None}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.CLAIR.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/clair.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    return [\n        {\"role\": \"system\", \"content\": self.system_prompt},\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(\n                task=input[\"task\"], student_solution=input[\"student_solution\"]\n            ),\n        },\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.CLAIR.format_output","title":"format_output(output, input)","text":"

The output is formatted as a dict containing the revision and the rational extracted from the raw output of the LLM.

Parameters:

Name Type Description Default output Union[str, None]

the raw output of the LLM.

required input Dict[str, Any]

the input to the task. Not used when formatting the output.

required

Returns:

Type Description Dict[str, Any]

A dict with the keys revision and rational. Both values are None if the output is None or doesn't follow the expected format.

Source code in src/distilabel/steps/tasks/clair.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a list with the score of each instruction-response pair.\n\n    Args:\n        output: the raw output of the LLM.\n        input: the input to the task. Used for obtaining the number of responses.\n\n    Returns:\n        A dict with the key `scores` containing the scores for each instruction-response pair.\n    \"\"\"\n    if output is None:\n        return self._default_error()\n\n    return self._format_output(output)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer","title":"ComplexityScorer","text":"

Bases: Task

Score instructions based on their complexity using an LLM.

ComplexityScorer is a pre-defined task used to rank a list of instructions based on their complexity. It's an implementation of the complexity score task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.

Attributes:

Name Type Description _template Union[Template, None]

a Jinja2 template used to format the input for the LLM.

Input columns
  • instructions (List[str]): The list of instructions to be scored.
Output columns
  • scores (List[float]): The score for each instruction.
  • model_name (str): The model name used to generate the scores.
Categories
  • scorer
  • complexity
  • instruction
References
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning

Examples:

Evaluate the complexity of your instructions:

from distilabel.steps.tasks import ComplexityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = ComplexityScorer(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    )\n)\n\nscorer.load()\n\nresult = next(\n    scorer.process(\n        [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n    )\n)\n# result\n# [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 5], 'distilabel_metadata': {'raw_output_complexity_scorer_0': 'output'}}]\n

Generate structured output with default schema:

from distilabel.steps.tasks import ComplexityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = ComplexityScorer(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    use_default_structured_output=use_default_structured_output\n)\n\nscorer.load()\n\nresult = next(\n    scorer.process(\n        [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n    )\n)\n# result\n# [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 2], 'distilabel_metadata': {'raw_output_complexity_scorer_0': '{ \\n  \"scores\": [\\n    1, \\n    2\\n  ]\\n}'}}]\n
Citations
@misc{liu2024makesgooddataalignment,\n    title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n    author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n    year={2024},\n    eprint={2312.15685},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2312.15685},\n}\n
Source code in src/distilabel/steps/tasks/complexity_scorer.py
class ComplexityScorer(Task):\n    \"\"\"Score instructions based on their complexity using an `LLM`.\n\n    `ComplexityScorer` is a pre-defined task used to rank a list of instructions based in\n    their complexity. It's an implementation of the complexity score task from the paper\n    'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection\n    in Instruction Tuning'.\n\n    Attributes:\n        _template: a Jinja2 template used to format the input for the LLM.\n\n    Input columns:\n        - instructions (`List[str]`): The list of instructions to be scored.\n\n    Output columns:\n        - scores (`List[float]`): The score for each instruction.\n        - model_name (`str`): The model name used to generate the scores.\n\n    Categories:\n        - scorer\n        - complexity\n        - instruction\n\n    References:\n        - [`What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685)\n\n    Examples:\n        Evaluate the complexity of your instructions:\n\n        ```python\n        from distilabel.steps.tasks import ComplexityScorer\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        scorer = ComplexityScorer(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            )\n        )\n\n        scorer.load()\n\n        result = next(\n            scorer.process(\n                [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n            )\n        )\n        # result\n        # [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 5], 'distilabel_metadata': {'raw_output_complexity_scorer_0': 'output'}}]\n        ```\n\n        Generate structured output with default schema:\n\n        ```python\n        from 
distilabel.steps.tasks import ComplexityScorer\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        scorer = ComplexityScorer(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            use_default_structured_output=use_default_structured_output\n        )\n\n        scorer.load()\n\n        result = next(\n            scorer.process(\n                [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n            )\n        )\n        # result\n        # [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 2], 'distilabel_metadata': {'raw_output_complexity_scorer_0': '{ \\\\n  \"scores\": [\\\\n    1, \\\\n    2\\\\n  ]\\\\n}'}}]\n        ```\n\n    Citations:\n        ```\n        @misc{liu2024makesgooddataalignment,\n            title={What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n            author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n            year={2024},\n            eprint={2312.15685},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2312.15685},\n        }\n        ```\n    \"\"\"\n\n    _template: Union[Template, None] = PrivateAttr(...)\n    _can_be_used_with_offline_batch_generation = True\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"complexity-scorer.jinja2\"\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The inputs for the task are the `instructions`.\"\"\"\n        return [\"instructions\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(instructions=input[\"instructions\"]),  # type: ignore\n            }\n        ]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task are: a list of `scores` containing the complexity score for each\n        instruction in `instructions`, and the `model_name`.\"\"\"\n        return [\"scores\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a list with the score of each instruction.\n\n        Args:\n            output: the raw output of the 
LLM.\n            input: the input to the task. Used for obtaining the number of responses.\n\n        Returns:\n            A dict with the key `scores` containing the scores for each instruction.\n        \"\"\"\n        if output is None:\n            return {\"scores\": [None] * len(input[\"instructions\"])}\n\n        if self.use_default_structured_output:\n            return self._format_structured_output(output, input)\n\n        scores = []\n        score_lines = output.split(\"\\n\")\n        for i, line in enumerate(score_lines):\n            match = _PARSE_SCORE_LINE_REGEX.match(line)\n            score = float(match.group(1)) if match else None\n            scores.append(score)\n            if i == len(input[\"instructions\"]) - 1:\n                break\n        return {\"scores\": scores}\n\n    @override\n    def get_structured_output(self) -> Dict[str, Any]:\n        \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n        a dictionary with the output which can be directly parsed as a python dictionary.\n\n        The schema corresponds to the following:\n\n        ```python\n        from pydantic import BaseModel\n        from typing import List\n\n        class SchemaComplexityScorer(BaseModel):\n            scores: List[int]\n        ```\n\n        Returns:\n            JSON Schema of the response to enforce.\n        \"\"\"\n        return {\n            \"properties\": {\n                \"scores\": {\n                    \"items\": {\"type\": \"integer\"},\n                    \"title\": \"Scores\",\n                    \"type\": \"array\",\n                }\n            },\n            \"required\": [\"scores\"],\n            \"title\": \"SchemaComplexityScorer\",\n            \"type\": \"object\",\n        }\n\n    def _format_structured_output(\n        self, output: str, input: Dict[str, Any]\n    ) -> Dict[str, str]:\n        \"\"\"Parses the structured response, which should correspond to a dictionary\n      
  with either `positive`, or `positive` and `negative` keys.\n\n        Args:\n            output: The output from the `LLM`.\n\n        Returns:\n            Formatted output.\n        \"\"\"\n        try:\n            return orjson.loads(output)\n        except orjson.JSONDecodeError:\n            return {\"scores\": [None] * len(input[\"instructions\"])}\n\n    @override\n    def _sample_input(self) -> \"ChatType\":\n        \"\"\"Returns a sample input to be used in the `print` method.\n        Tasks that don't adhere to a format input that returns a map of the type\n        str -> str should override this method to return a sample input.\n        \"\"\"\n        return self.format_input(\n            {\n                \"instructions\": [\n                    f\"<PLACEHOLDER_{f'GENERATION_{i}'.upper()}>\" for i in range(2)\n                ],\n            }\n        )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.inputs","title":"inputs: List[str] property","text":"

The inputs for the task are the instructions.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.outputs","title":"outputs: List[str] property","text":"

The outputs for the task are: a list of scores containing the complexity score for each instruction in instructions, and the model_name.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.load","title":"load()","text":"

Loads the Jinja2 template.

Source code in src/distilabel/steps/tasks/complexity_scorer.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"complexity-scorer.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/complexity_scorer.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    return [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(instructions=input[\"instructions\"]),  # type: ignore\n        }\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.format_output","title":"format_output(output, input)","text":"

The output is formatted as a list with the score of each instruction.

Parameters:

Name Type Description Default output Union[str, None]

the raw output of the LLM.

required input Dict[str, Any]

the input to the task. Used for obtaining the number of instructions to be scored.

required

Returns:

Type Description Dict[str, Any]

A dict with the key scores containing the scores for each instruction.

Source code in src/distilabel/steps/tasks/complexity_scorer.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a list with the score of each instruction.\n\n    Args:\n        output: the raw output of the LLM.\n        input: the input to the task. Used for obtaining the number of responses.\n\n    Returns:\n        A dict with the key `scores` containing the scores for each instruction.\n    \"\"\"\n    if output is None:\n        return {\"scores\": [None] * len(input[\"instructions\"])}\n\n    if self.use_default_structured_output:\n        return self._format_structured_output(output, input)\n\n    scores = []\n    score_lines = output.split(\"\\n\")\n    for i, line in enumerate(score_lines):\n        match = _PARSE_SCORE_LINE_REGEX.match(line)\n        score = float(match.group(1)) if match else None\n        scores.append(score)\n        if i == len(input[\"instructions\"]) - 1:\n            break\n    return {\"scores\": scores}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer.get_structured_output","title":"get_structured_output()","text":"

Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary.

The schema corresponds to the following:

from pydantic import BaseModel\nfrom typing import List\n\nclass SchemaComplexityScorer(BaseModel):\n    scores: List[int]\n

Returns:

Type Description Dict[str, Any]

JSON Schema of the response to enforce.

Source code in src/distilabel/steps/tasks/complexity_scorer.py
@override\ndef get_structured_output(self) -> Dict[str, Any]:\n    \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n    a dictionary with the output which can be directly parsed as a python dictionary.\n\n    The schema corresponds to the following:\n\n    ```python\n    from pydantic import BaseModel\n    from typing import List\n\n    class SchemaComplexityScorer(BaseModel):\n        scores: List[int]\n    ```\n\n    Returns:\n        JSON Schema of the response to enforce.\n    \"\"\"\n    return {\n        \"properties\": {\n            \"scores\": {\n                \"items\": {\"type\": \"integer\"},\n                \"title\": \"Scores\",\n                \"type\": \"array\",\n            }\n        },\n        \"required\": [\"scores\"],\n        \"title\": \"SchemaComplexityScorer\",\n        \"type\": \"object\",\n    }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer._format_structured_output","title":"_format_structured_output(output, input)","text":"

Parses the structured response, which should correspond to a dictionary with the scores key. If the response cannot be parsed as JSON, a dictionary with one None score per instruction is returned.

Parameters:

Name Type Description Default output str

The output from the LLM.

required

Returns:

Type Description Dict[str, str]

Formatted output.

Source code in src/distilabel/steps/tasks/complexity_scorer.py
def _format_structured_output(\n    self, output: str, input: Dict[str, Any]\n) -> Dict[str, str]:\n    \"\"\"Parses the structured response, which should correspond to a dictionary\n    with either `positive`, or `positive` and `negative` keys.\n\n    Args:\n        output: The output from the `LLM`.\n\n    Returns:\n        Formatted output.\n    \"\"\"\n    try:\n        return orjson.loads(output)\n    except orjson.JSONDecodeError:\n        return {\"scores\": [None] * len(input[\"instructions\"])}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ComplexityScorer._sample_input","title":"_sample_input()","text":"

Returns a sample input to be used in the print method. Tasks that don't adhere to a format input that returns a map of the type str -> str should override this method to return a sample input.

Source code in src/distilabel/steps/tasks/complexity_scorer.py
@override\ndef _sample_input(self) -> \"ChatType\":\n    \"\"\"Returns a sample input to be used in the `print` method.\n    Tasks that don't adhere to a format input that returns a map of the type\n    str -> str should override this method to return a sample input.\n    \"\"\"\n    return self.format_input(\n        {\n            \"instructions\": [\n                f\"<PLACEHOLDER_{f'GENERATION_{i}'.upper()}>\" for i in range(2)\n            ],\n        }\n    )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct","title":"EvolInstruct","text":"

Bases: Task

Evolve instructions using an LLM.

WizardLM: Empowering Large Language Models to Follow Complex Instructions

Attributes:

Name Type Description num_evolutions int

The number of evolutions to be performed.

store_evolutions bool

Whether to store all the evolutions or just the last one. Defaults to False.

generate_answers bool

Whether to generate answers for the evolved instructions. Defaults to False.

include_original_instruction bool

Whether to include the original instruction in the evolved_instructions output column. Defaults to False.

mutation_templates Dict[str, str]

The mutation templates to be used for evolving the instructions. Defaults to the ones provided in the utils.py file.

seed RuntimeParameter[int]

The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

Runtime parameters
  • seed: The seed to be set for numpy in order to randomly pick a mutation method.
Input columns
  • instruction (str): The instruction to evolve.
Output columns
  • evolved_instruction (str): The evolved instruction if store_evolutions=False.
  • evolved_instructions (List[str]): The evolved instructions if store_evolutions=True.
  • model_name (str): The name of the LLM used to evolve the instructions.
  • answer (str): The answer to the evolved instruction if generate_answers=True and store_evolutions=False.
  • answers (List[str]): The answers to the evolved instructions if generate_answers=True and store_evolutions=True.
Categories
  • evol
  • instruction
References
  • WizardLM: Empowering Large Language Models to Follow Complex Instructions
  • GitHub: h2oai/h2o-wizardlm

Examples:

Evolve an instruction using an LLM:

from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n

Keep the iterations of the evolutions:

from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n    store_evolutions=True,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [\n#     {\n#         'instruction': 'common instruction',\n#         'evolved_instructions': ['initial evolution', 'final evolution'],\n#         'model_name': 'model_name'\n#     }\n# ]\n

Generate answers for the instructions in a single step:

from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n    generate_answers=True,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [\n#     {\n#         'instruction': 'common instruction',\n#         'evolved_instruction': 'evolved instruction',\n#         'answer': 'answer to the instruction',\n#         'model_name': 'model_name'\n#     }\n# ]\n
Citations
@misc{xu2023wizardlmempoweringlargelanguage,\n    title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n    author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n    year={2023},\n    eprint={2304.12244},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2304.12244},\n}\n
Source code in src/distilabel/steps/tasks/evol_instruct/base.py
class EvolInstruct(Task):\n    \"\"\"Evolve instructions using an `LLM`.\n\n    WizardLM: Empowering Large Language Models to Follow Complex Instructions\n\n    Attributes:\n        num_evolutions: The number of evolutions to be performed.\n        store_evolutions: Whether to store all the evolutions or just the last one. Defaults\n            to `False`.\n        generate_answers: Whether to generate answers for the evolved instructions. Defaults\n            to `False`.\n        include_original_instruction: Whether to include the original instruction in the\n            `evolved_instructions` output column. Defaults to `False`.\n        mutation_templates: The mutation templates to be used for evolving the instructions.\n            Defaults to the ones provided in the `utils.py` file.\n        seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n            Defaults to `42`.\n\n    Runtime parameters:\n        - `seed`: The seed to be set for `numpy` in order to randomly pick a mutation method.\n\n    Input columns:\n        - instruction (`str`): The instruction to evolve.\n\n    Output columns:\n        - evolved_instruction (`str`): The evolved instruction if `store_evolutions=False`.\n        - evolved_instructions (`List[str]`): The evolved instructions if `store_evolutions=True`.\n        - model_name (`str`): The name of the LLM used to evolve the instructions.\n        - answer (`str`): The answer to the evolved instruction if `generate_answers=True`\n            and `store_evolutions=False`.\n        - answers (`List[str]`): The answers to the evolved instructions if `generate_answers=True`\n            and `store_evolutions=True`.\n\n    Categories:\n        - evol\n        - instruction\n\n    References:\n        - [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244)\n        - [GitHub: h2oai/h2o-wizardlm](https://github.com/h2oai/h2o-wizardlm)\n\n    Examples:\n 
       Evolve an instruction using an LLM:\n\n        ```python\n        from distilabel.steps.tasks import EvolInstruct\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        evol_instruct = EvolInstruct(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            num_evolutions=2,\n        )\n\n        evol_instruct.load()\n\n        result = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n        # result\n        # [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n        ```\n\n        Keep the iterations of the evolutions:\n\n        ```python\n        from distilabel.steps.tasks import EvolInstruct\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        evol_instruct = EvolInstruct(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            num_evolutions=2,\n            store_evolutions=True,\n        )\n\n        evol_instruct.load()\n\n        result = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n        # result\n        # [\n        #     {\n        #         'instruction': 'common instruction',\n        #         'evolved_instructions': ['initial evolution', 'final evolution'],\n        #         'model_name': 'model_name'\n        #     }\n        # ]\n        ```\n\n        Generate answers for the instructions in a single step:\n\n        ```python\n        from distilabel.steps.tasks import EvolInstruct\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        evol_instruct = EvolInstruct(\n            llm=InferenceEndpointsLLM(\n                
model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            num_evolutions=2,\n            generate_answers=True,\n        )\n\n        evol_instruct.load()\n\n        result = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n        # result\n        # [\n        #     {\n        #         'instruction': 'common instruction',\n        #         'evolved_instruction': 'evolved instruction',\n        #         'answer': 'answer to the instruction',\n        #         'model_name': 'model_name'\n        #     }\n        # ]\n        ```\n\n    Citations:\n        ```\n        @misc{xu2023wizardlmempoweringlargelanguage,\n            title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n            author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n            year={2023},\n            eprint={2304.12244},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2304.12244},\n        }\n        ```\n    \"\"\"\n\n    num_evolutions: int\n    store_evolutions: bool = False\n    generate_answers: bool = False\n    include_original_instruction: bool = False\n    mutation_templates: Dict[str, str] = MUTATION_TEMPLATES\n\n    seed: RuntimeParameter[int] = Field(\n        default=42,\n        description=\"As `numpy` is being used in order to randomly pick a mutation method, then is nice to seed a random seed.\",\n    )\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task is the `instruction`.\"\"\"\n        return [\"instruction\"]\n\n    def format_input(self, input: str) -> ChatType:  # type: ignore\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation. 
And the\n        `system_prompt` is added as the first message if it exists.\"\"\"\n        return [{\"role\": \"user\", \"content\": input}]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task are the `evolved_instruction/s`, the `answer` if `generate_answers=True`\n        and the `model_name`.\"\"\"\n        # TODO: having to define a `model_name` column every time as the `Task.outputs` is not ideal,\n        # this could be handled always and the value could be included within the DAG validation when\n        # a `Task` is used, since all the `Task` subclasses will have an `llm` with a `model_name` attr.\n        _outputs = [\n            (\n                \"evolved_instruction\"\n                if not self.store_evolutions\n                else \"evolved_instructions\"\n            ),\n            \"model_name\",\n        ]\n        if self.generate_answers:\n            _outputs.append(\"answer\" if not self.store_evolutions else \"answers\")\n        return _outputs\n\n    @override\n    def format_output(  # type: ignore\n        self, instructions: Union[str, List[str]], answers: Optional[List[str]] = None\n    ) -> Dict[str, Any]:  # type: ignore\n        \"\"\"The output for the task is a dict with: `evolved_instruction` or `evolved_instructions`,\n        depending whether the value is either `False` or `True` for `store_evolutions`, respectively;\n        `answer` if `generate_answers=True`; and, finally, the `model_name`.\n\n        Args:\n            instructions: The instructions to be included within the output.\n            answers: The answers to be included within the output if `generate_answers=True`.\n\n        Returns:\n            If `store_evolutions=False` and `generate_answers=True` return {\"evolved_instruction\": ..., \"model_name\": ..., \"answer\": ...};\n            if `store_evolutions=True` and `generate_answers=True` return {\"evolved_instructions\": ..., \"model_name\": ..., \"answer\": 
...};\n            if `store_evolutions=False` and `generate_answers=False` return {\"evolved_instruction\": ..., \"model_name\": ...};\n            if `store_evolutions=True` and `generate_answers=False` return {\"evolved_instructions\": ..., \"model_name\": ...}.\n        \"\"\"\n        _output = {}\n        if not self.store_evolutions:\n            _output[\"evolved_instruction\"] = instructions[-1]\n        else:\n            _output[\"evolved_instructions\"] = instructions\n\n        if self.generate_answers and answers:\n            if not self.store_evolutions:\n                _output[\"answer\"] = answers[-1]\n            else:\n                _output[\"answers\"] = answers\n\n        _output[\"model_name\"] = self.llm.model_name\n        return _output\n\n    @property\n    def mutation_templates_names(self) -> List[str]:\n        \"\"\"Returns the names i.e. keys of the provided `mutation_templates`.\"\"\"\n        return list(self.mutation_templates.keys())\n\n    def _apply_random_mutation(self, instruction: str) -> str:\n        \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n        enum, and returns the provided instruction within the mutation prompt.\n\n        Args:\n            instruction: The instruction to be included within the mutation prompt.\n\n        Returns:\n            A random mutation prompt with the provided instruction.\n        \"\"\"\n        mutation = np.random.choice(self.mutation_templates_names)\n        return self.mutation_templates[mutation].replace(\"<PROMPT>\", instruction)  # type: ignore\n\n    def _evolve_instructions(self, inputs: \"StepInput\") -> List[List[str]]:\n        \"\"\"Evolves the instructions provided as part of the inputs of the task.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Returns:\n            A list where each item is a list with either the last evolved instruction if\n            
`store_evolutions=False` or all the evolved instructions if `store_evolutions=True`.\n        \"\"\"\n\n        instructions: List[List[str]] = [[input[\"instruction\"]] for input in inputs]\n\n        for iter_no in range(self.num_evolutions):\n            formatted_prompts = []\n            for instruction in instructions:\n                formatted_prompts.append(self._apply_random_mutation(instruction[-1]))\n\n            formatted_prompts = [\n                self.format_input(prompt) for prompt in formatted_prompts\n            ]\n            generated_prompts = flatten_responses(\n                self.llm.generate(\n                    formatted_prompts,\n                    **self.llm.generation_kwargs,  # type: ignore\n                )\n            )\n\n            evolved_instructions = []\n            for generated_prompt in generated_prompts:\n                generated_prompt = generated_prompt.split(\"Prompt#:\")[-1].strip()\n                evolved_instructions.append(generated_prompt)\n\n            if self.store_evolutions:\n                instructions = [\n                    instruction + [evolved_instruction]\n                    for instruction, evolved_instruction in zip(\n                        instructions, evolved_instructions\n                    )\n                ]\n            else:\n                instructions = [\n                    [evolved_instruction]\n                    for evolved_instruction in evolved_instructions\n                ]\n\n            self._logger.info(\n                f\"\ud83d\udd04 Ran iteration {iter_no} evolving {len(instructions)} instructions!\"\n            )\n\n        return instructions\n\n    def _generate_answers(\n        self, evolved_instructions: List[List[str]]\n    ) -> List[List[str]]:\n        \"\"\"Generates the answer for the instructions in `instructions`.\n\n        Args:\n            evolved_instructions: A list of lists where each item is a list with either the last\n                
evolved instruction if `store_evolutions=False` or all the evolved instructions\n                if `store_evolutions=True`.\n\n        Returns:\n            A list of answers for each instruction.\n        \"\"\"\n        formatted_instructions = [\n            self.format_input(instruction)\n            for instructions in evolved_instructions\n            for instruction in instructions\n        ]\n\n        responses = self.llm.generate(\n            formatted_instructions,\n            num_generations=1,\n            **self.llm.generation_kwargs,  # type: ignore\n        )\n\n        step = (\n            self.num_evolutions\n            if not self.include_original_instruction\n            else self.num_evolutions + 1\n        )\n        return [\n            flatten_responses(responses[i : i + step])\n            for i in range(0, len(responses), step)\n        ]\n\n    @override\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Yields:\n            A list of Python dictionaries with the outputs of the task.\n        \"\"\"\n\n        evolved_instructions = self._evolve_instructions(inputs)\n\n        if self.store_evolutions:\n            # Remove the input instruction from the `evolved_instructions` list\n            from_ = 1 if not self.include_original_instruction else 0\n            evolved_instructions = [\n                instruction[from_:] for instruction in evolved_instructions\n            ]\n\n        if not self.generate_answers:\n            for input, instruction in zip(inputs, evolved_instructions):\n                input.update(self.format_output(instruction))\n            yield inputs\n\n        self._logger.info(\n            f\"\ud83c\udf89 Finished evolving {len(evolved_instructions)} instructions!\"\n        )\n\n   
     if self.generate_answers:\n            self._logger.info(\n                f\"\ud83e\udde0 Generating answers for the {len(evolved_instructions)} evolved instructions!\"\n            )\n\n            answers = self._generate_answers(evolved_instructions)\n\n            self._logger.info(\n                f\"\ud83c\udf89 Finished generating answers for the {len(evolved_instructions)} evolved\"\n                \" instructions!\"\n            )\n\n            for idx, (input, instruction) in enumerate(\n                zip(inputs, evolved_instructions)\n            ):\n                input.update(self.format_output(instruction, answers[idx]))\n            yield inputs\n\n    @override\n    def _sample_input(self) -> ChatType:\n        return self.format_input(\n            self._apply_random_mutation(\"<PLACEHOLDER_INSTRUCTION>\")\n        )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.inputs","title":"inputs: List[str] property","text":"

The input for the task is the instruction.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.outputs","title":"outputs: List[str] property","text":"

The outputs for the task are the evolved_instruction/s, the answer/s if generate_answers=True, and the model_name.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.mutation_templates_names","title":"mutation_templates_names: List[str] property","text":"

Returns the names i.e. keys of the provided mutation_templates.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation. Only that single user message is returned; no system_prompt message is added by this method.

Source code in src/distilabel/steps/tasks/evol_instruct/base.py
def format_input(self, input: str) -> ChatType:  # type: ignore\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation. And the\n    `system_prompt` is added as the first message if it exists.\"\"\"\n    return [{\"role\": \"user\", \"content\": input}]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.format_output","title":"format_output(instructions, answers=None)","text":"

The output for the task is a dict with: evolved_instruction or evolved_instructions, depending on whether store_evolutions is False or True, respectively; answer or answers (following the same rule) if generate_answers=True; and, finally, the model_name.

Parameters:

Name Type Description Default instructions Union[str, List[str]]

The instructions to be included within the output.

required answers Optional[List[str]]

The answers to be included within the output if generate_answers=True.

None

Returns:

Type Description Dict[str, Any]

If store_evolutions=False and generate_answers=True return {\"evolved_instruction\": ..., \"model_name\": ..., \"answer\": ...};

Dict[str, Any]

if store_evolutions=True and generate_answers=True return {\"evolved_instructions\": ..., \"model_name\": ..., \"answers\": ...};

Dict[str, Any]

if store_evolutions=False and generate_answers=False return {\"evolved_instruction\": ..., \"model_name\": ...};

Dict[str, Any]

if store_evolutions=True and generate_answers=False return {\"evolved_instructions\": ..., \"model_name\": ...}.

Source code in src/distilabel/steps/tasks/evol_instruct/base.py
@override\ndef format_output(  # type: ignore\n    self, instructions: Union[str, List[str]], answers: Optional[List[str]] = None\n) -> Dict[str, Any]:  # type: ignore\n    \"\"\"The output for the task is a dict with: `evolved_instruction` or `evolved_instructions`,\n    depending whether the value is either `False` or `True` for `store_evolutions`, respectively;\n    `answer` if `generate_answers=True`; and, finally, the `model_name`.\n\n    Args:\n        instructions: The instructions to be included within the output.\n        answers: The answers to be included within the output if `generate_answers=True`.\n\n    Returns:\n        If `store_evolutions=False` and `generate_answers=True` return {\"evolved_instruction\": ..., \"model_name\": ..., \"answer\": ...};\n        if `store_evolutions=True` and `generate_answers=True` return {\"evolved_instructions\": ..., \"model_name\": ..., \"answer\": ...};\n        if `store_evolutions=False` and `generate_answers=False` return {\"evolved_instruction\": ..., \"model_name\": ...};\n        if `store_evolutions=True` and `generate_answers=False` return {\"evolved_instructions\": ..., \"model_name\": ...}.\n    \"\"\"\n    _output = {}\n    if not self.store_evolutions:\n        _output[\"evolved_instruction\"] = instructions[-1]\n    else:\n        _output[\"evolved_instructions\"] = instructions\n\n    if self.generate_answers and answers:\n        if not self.store_evolutions:\n            _output[\"answer\"] = answers[-1]\n        else:\n            _output[\"answers\"] = answers\n\n    _output[\"model_name\"] = self.llm.model_name\n    return _output\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct._apply_random_mutation","title":"_apply_random_mutation(instruction)","text":"

Applies a random mutation from the ones provided in the mutation_templates dictionary, and returns the mutation prompt with the provided instruction inserted into it.

Parameters:

Name Type Description Default instruction str

The instruction to be included within the mutation prompt.

required

Returns:

Type Description str

A random mutation prompt with the provided instruction.

Source code in src/distilabel/steps/tasks/evol_instruct/base.py
def _apply_random_mutation(self, instruction: str) -> str:\n    \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n    enum, and returns the provided instruction within the mutation prompt.\n\n    Args:\n        instruction: The instruction to be included within the mutation prompt.\n\n    Returns:\n        A random mutation prompt with the provided instruction.\n    \"\"\"\n    mutation = np.random.choice(self.mutation_templates_names)\n    return self.mutation_templates[mutation].replace(\"<PROMPT>\", instruction)  # type: ignore\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct._evolve_instructions","title":"_evolve_instructions(inputs)","text":"

Evolves the instructions provided as part of the inputs of the task.

Parameters:

Name Type Description Default inputs StepInput

A list of Python dictionaries with the inputs of the task.

required

Returns:

Type Description List[List[str]]

A list where each item is a list with either the last evolved instruction if

List[List[str]]

store_evolutions=False or all the evolved instructions if store_evolutions=True.

Source code in src/distilabel/steps/tasks/evol_instruct/base.py
def _evolve_instructions(self, inputs: \"StepInput\") -> List[List[str]]:\n    \"\"\"Evolves the instructions provided as part of the inputs of the task.\n\n    Args:\n        inputs: A list of Python dictionaries with the inputs of the task.\n\n    Returns:\n        A list where each item is a list with either the last evolved instruction if\n        `store_evolutions=False` or all the evolved instructions if `store_evolutions=True`.\n    \"\"\"\n\n    instructions: List[List[str]] = [[input[\"instruction\"]] for input in inputs]\n\n    for iter_no in range(self.num_evolutions):\n        formatted_prompts = []\n        for instruction in instructions:\n            formatted_prompts.append(self._apply_random_mutation(instruction[-1]))\n\n        formatted_prompts = [\n            self.format_input(prompt) for prompt in formatted_prompts\n        ]\n        generated_prompts = flatten_responses(\n            self.llm.generate(\n                formatted_prompts,\n                **self.llm.generation_kwargs,  # type: ignore\n            )\n        )\n\n        evolved_instructions = []\n        for generated_prompt in generated_prompts:\n            generated_prompt = generated_prompt.split(\"Prompt#:\")[-1].strip()\n            evolved_instructions.append(generated_prompt)\n\n        if self.store_evolutions:\n            instructions = [\n                instruction + [evolved_instruction]\n                for instruction, evolved_instruction in zip(\n                    instructions, evolved_instructions\n                )\n            ]\n        else:\n            instructions = [\n                [evolved_instruction]\n                for evolved_instruction in evolved_instructions\n            ]\n\n        self._logger.info(\n            f\"\ud83d\udd04 Ran iteration {iter_no} evolving {len(instructions)} instructions!\"\n        )\n\n    return instructions\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct._generate_answers","title":"_generate_answers(evolved_instructions)","text":"

Generates the answers for the instructions in evolved_instructions.

Parameters:

Name Type Description Default evolved_instructions List[List[str]]

A list of lists where each item is a list with either the last evolved instruction if store_evolutions=False or all the evolved instructions if store_evolutions=True.

required

Returns:

Type Description List[List[str]]

A list of answers for each instruction.

Source code in src/distilabel/steps/tasks/evol_instruct/base.py
def _generate_answers(\n    self, evolved_instructions: List[List[str]]\n) -> List[List[str]]:\n    \"\"\"Generates the answer for the instructions in `instructions`.\n\n    Args:\n        evolved_instructions: A list of lists where each item is a list with either the last\n            evolved instruction if `store_evolutions=False` or all the evolved instructions\n            if `store_evolutions=True`.\n\n    Returns:\n        A list of answers for each instruction.\n    \"\"\"\n    formatted_instructions = [\n        self.format_input(instruction)\n        for instructions in evolved_instructions\n        for instruction in instructions\n    ]\n\n    responses = self.llm.generate(\n        formatted_instructions,\n        num_generations=1,\n        **self.llm.generation_kwargs,  # type: ignore\n    )\n\n    step = (\n        self.num_evolutions\n        if not self.include_original_instruction\n        else self.num_evolutions + 1\n    )\n    return [\n        flatten_responses(responses[i : i + step])\n        for i in range(0, len(responses), step)\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstruct.process","title":"process(inputs)","text":"

Processes the inputs of the task and generates the outputs using the LLM.

Parameters:

Name Type Description Default inputs StepInput

A list of Python dictionaries with the inputs of the task.

required

Yields:

Type Description StepOutput

A list of Python dictionaries with the outputs of the task.

Source code in src/distilabel/steps/tasks/evol_instruct/base.py
@override\ndef process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n    Args:\n        inputs: A list of Python dictionaries with the inputs of the task.\n\n    Yields:\n        A list of Python dictionaries with the outputs of the task.\n    \"\"\"\n\n    evolved_instructions = self._evolve_instructions(inputs)\n\n    if self.store_evolutions:\n        # Remove the input instruction from the `evolved_instructions` list\n        from_ = 1 if not self.include_original_instruction else 0\n        evolved_instructions = [\n            instruction[from_:] for instruction in evolved_instructions\n        ]\n\n    if not self.generate_answers:\n        for input, instruction in zip(inputs, evolved_instructions):\n            input.update(self.format_output(instruction))\n        yield inputs\n\n    self._logger.info(\n        f\"\ud83c\udf89 Finished evolving {len(evolved_instructions)} instructions!\"\n    )\n\n    if self.generate_answers:\n        self._logger.info(\n            f\"\ud83e\udde0 Generating answers for the {len(evolved_instructions)} evolved instructions!\"\n        )\n\n        answers = self._generate_answers(evolved_instructions)\n\n        self._logger.info(\n            f\"\ud83c\udf89 Finished generating answers for the {len(evolved_instructions)} evolved\"\n            \" instructions!\"\n        )\n\n        for idx, (input, instruction) in enumerate(\n            zip(inputs, evolved_instructions)\n        ):\n            input.update(self.format_output(instruction, answers[idx]))\n        yield inputs\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolComplexity","title":"EvolComplexity","text":"

Bases: EvolInstruct

Evolve instructions to make them more complex using an LLM.

EvolComplexity is a task that evolves instructions to make them more complex, and it is based on the EvolInstruct task, using slightly different prompts but the exact same evolutionary approach.

Attributes:

Name Type Description num_instructions

The number of instructions to be generated.

generate_answers

Whether to generate answers for the instructions or not. Defaults to False.

mutation_templates Dict[str, str]

The mutation templates to be used for the generation of the instructions.

min_length RuntimeParameter[int]

Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512.

max_length RuntimeParameter[int]

Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024.

seed RuntimeParameter[int]

The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

Runtime parameters
  • min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.
  • max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.
  • seed: The seed to be set for numpy in order to randomly pick a mutation method.
Input columns
  • instruction (str): The instruction to evolve.
Output columns
  • evolved_instruction (str): The evolved instruction.
  • answer (str, optional): The answer to the instruction if generate_answers=True.
  • model_name (str): The name of the LLM used to evolve the instructions.
Categories
  • evol
  • instruction
  • deita
References
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
  • WizardLM: Empowering Large Language Models to Follow Complex Instructions

Examples:

Evolve an instruction using an LLM:

from distilabel.steps.tasks import EvolComplexity\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_complexity = EvolComplexity(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n)\n\nevol_complexity.load()\n\nresult = next(evol_complexity.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n
Citations
@misc{liu2024makesgooddataalignment,\n    title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n    author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n    year={2024},\n    eprint={2312.15685},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2312.15685},\n}\n
@misc{xu2023wizardlmempoweringlargelanguage,\n    title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n    author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n    year={2023},\n    eprint={2304.12244},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2304.12244},\n}\n
Source code in src/distilabel/steps/tasks/evol_instruct/evol_complexity/base.py
class EvolComplexity(EvolInstruct):\n    \"\"\"Evolve instructions to make them more complex using an `LLM`.\n\n    `EvolComplexity` is a task that evolves instructions to make them more complex,\n    and it is based in the EvolInstruct task, using slight different prompts, but the\n    exact same evolutionary approach.\n\n    Attributes:\n        num_instructions: The number of instructions to be generated.\n        generate_answers: Whether to generate answers for the instructions or not. Defaults\n            to `False`.\n        mutation_templates: The mutation templates to be used for the generation of the\n            instructions.\n        min_length: Defines the length (in bytes) that the generated instruction needs to\n            be higher than, to be considered valid. Defaults to `512`.\n        max_length: Defines the length (in bytes) that the generated instruction needs to\n            be lower than, to be considered valid. Defaults to `1024`.\n        seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n            Defaults to `42`.\n\n    Runtime parameters:\n        - `min_length`: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.\n        - `max_length`: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.\n        - `seed`: The number of evolutions to be run.\n\n    Input columns:\n        - instruction (`str`): The instruction to evolve.\n\n    Output columns:\n        - evolved_instruction (`str`): The evolved instruction.\n        - answer (`str`, optional): The answer to the instruction if `generate_answers=True`.\n        - model_name (`str`): The name of the LLM used to evolve the instructions.\n\n    Categories:\n        - evol\n        - instruction\n        - deita\n\n    References:\n        - [What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning](https://arxiv.org/abs/2312.15685)\n        - [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244)\n\n    Examples:\n        Evolve an instruction using an LLM:\n\n        ```python\n        from distilabel.steps.tasks import EvolComplexity\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        evol_complexity = EvolComplexity(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            num_evolutions=2,\n        )\n\n        evol_complexity.load()\n\n        result = next(evol_complexity.process([{\"instruction\": \"common instruction\"}]))\n        # result\n        # [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n        ```\n\n    Citations:\n        ```\n        @misc{liu2024makesgooddataalignment,\n            title={What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n            author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n            year={2024},\n            eprint={2312.15685},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2312.15685},\n        }\n        ```\n\n        ```\n        @misc{xu2023wizardlmempoweringlargelanguage,\n            title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n            author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n            year={2023},\n            eprint={2304.12244},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2304.12244},\n        }\n        ```\n    \"\"\"\n\n    mutation_templates: Dict[str, str] = MUTATION_TEMPLATES\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolComplexityGenerator","title":"EvolComplexityGenerator","text":"

Bases: EvolInstructGenerator

Generate evolved instructions with increased complexity using an LLM.

EvolComplexityGenerator is a generation task that evolves instructions to make them more complex, and it is based on the EvolInstruct task, using slightly different prompts but the exact same evolutionary approach.

Attributes:

Name Type Description num_instructions

The number of instructions to be generated.

generate_answers

Whether to generate answers for the instructions or not. Defaults to False.

mutation_templates Dict[str, str]

The mutation templates to be used for the generation of the instructions.

min_length RuntimeParameter[int]

Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512.

max_length RuntimeParameter[int]

Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024.

seed RuntimeParameter[int]

The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

Runtime parameters
  • min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.
  • max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.
  • seed: The seed to be set for numpy in order to randomly pick a mutation method.
Output columns
  • instruction (str): The evolved instruction.
  • answer (str, optional): The answer to the instruction if generate_answers=True.
  • model_name (str): The name of the LLM used to evolve the instructions.
Categories
  • evol
  • instruction
  • generation
  • deita
References
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
  • WizardLM: Empowering Large Language Models to Follow Complex Instructions

Examples:

Generate evolved instructions without initial instructions:

from distilabel.steps.tasks import EvolComplexityGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_complexity_generator = EvolComplexityGenerator(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_instructions=2,\n)\n\nevol_complexity_generator.load()\n\nresult = next(evol_complexity_generator.process())\n# result\n# [{'instruction': 'generated instruction', 'model_name': 'test'}]\n
Citations
@misc{liu2024makesgooddataalignment,\n    title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n    author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n    year={2024},\n    eprint={2312.15685},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2312.15685},\n}\n
@misc{xu2023wizardlmempoweringlargelanguage,\n    title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n    author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n    year={2023},\n    eprint={2304.12244},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2304.12244},\n}\n
Source code in src/distilabel/steps/tasks/evol_instruct/evol_complexity/generator.py
class EvolComplexityGenerator(EvolInstructGenerator):\n    \"\"\"Generate evolved instructions with increased complexity using an `LLM`.\n\n    `EvolComplexityGenerator` is a generation task that evolves instructions to make\n    them more complex, and it is based in the EvolInstruct task, but using slight different\n    prompts, but the exact same evolutionary approach.\n\n    Attributes:\n        num_instructions: The number of instructions to be generated.\n        generate_answers: Whether to generate answers for the instructions or not. Defaults\n            to `False`.\n        mutation_templates: The mutation templates to be used for the generation of the\n            instructions.\n        min_length: Defines the length (in bytes) that the generated instruction needs to\n            be higher than, to be considered valid. Defaults to `512`.\n        max_length: Defines the length (in bytes) that the generated instruction needs to\n            be lower than, to be considered valid. Defaults to `1024`.\n        seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n            Defaults to `42`.\n\n    Runtime parameters:\n        - `min_length`: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.\n        - `max_length`: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.\n        - `seed`: The number of evolutions to be run.\n\n    Output columns:\n        - instruction (`str`): The evolved instruction.\n        - answer (`str`, optional): The answer to the instruction if `generate_answers=True`.\n        - model_name (`str`): The name of the LLM used to evolve the instructions.\n\n    Categories:\n        - evol\n        - instruction\n        - generation\n        - deita\n\n    References:\n        - [What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning](https://arxiv.org/abs/2312.15685)\n        - [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244)\n\n    Examples:\n        Generate evolved instructions without initial instructions:\n\n        ```python\n        from distilabel.steps.tasks import EvolComplexityGenerator\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        evol_complexity_generator = EvolComplexityGenerator(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            num_instructions=2,\n        )\n\n        evol_complexity_generator.load()\n\n        result = next(scorer.process())\n        # result\n        # [{'instruction': 'generated instruction', 'model_name': 'test'}]\n        ```\n\n    Citations:\n        ```\n        @misc{liu2024makesgooddataalignment,\n            title={What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n            author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n            year={2024},\n            eprint={2312.15685},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2312.15685},\n        }\n        ```\n\n        ```\n        @misc{xu2023wizardlmempoweringlargelanguage,\n            title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n            author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n            year={2023},\n            eprint={2304.12244},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2304.12244},\n        }\n        ```\n    \"\"\"\n\n    mutation_templates: Dict[str, str] = GENERATION_MUTATION_TEMPLATES\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator","title":"EvolInstructGenerator","text":"

Bases: GeneratorTask

Generate evolved instructions using an LLM.

WizardLM: Empowering Large Language Models to Follow Complex Instructions

Attributes:

Name Type Description num_instructions int

The number of instructions to be generated.

generate_answers bool

Whether to generate answers for the instructions or not. Defaults to False.

mutation_templates Dict[str, str]

The mutation templates to be used for the generation of the instructions.

min_length RuntimeParameter[int]

Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512.

max_length RuntimeParameter[int]

Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024.

seed RuntimeParameter[int]

The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

Runtime parameters
  • min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.
  • max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.
  • seed: The seed to be set for numpy in order to randomly pick a mutation method.
Output columns
  • instruction (str): The generated instruction if generate_answers=False.
  • answer (str): The generated answer if generate_answers=True.
  • instructions (List[str]): The generated instructions if generate_answers=True.
  • model_name (str): The name of the LLM used to generate and evolve the instructions.
Categories
  • evol
  • instruction
  • generation
References
  • WizardLM: Empowering Large Language Models to Follow Complex Instructions
  • GitHub: h2oai/h2o-wizardlm

Examples:

Generate evolved instructions without initial instructions:

from distilabel.steps.tasks import EvolInstructGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct_generator = EvolInstructGenerator(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_instructions=2,\n)\n\nevol_instruct_generator.load()\n\nresult = next(evol_instruct_generator.process())\n# result\n# [{'instruction': 'generated instruction', 'model_name': 'test'}]\n
Citations
@misc{xu2023wizardlmempoweringlargelanguage,\n    title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n    author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n    year={2023},\n    eprint={2304.12244},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2304.12244},\n}\n
Source code in src/distilabel/steps/tasks/evol_instruct/generator.py
class EvolInstructGenerator(GeneratorTask):\n    \"\"\"Generate evolved instructions using an `LLM`.\n\n    WizardLM: Empowering Large Language Models to Follow Complex Instructions\n\n    Attributes:\n        num_instructions: The number of instructions to be generated.\n        generate_answers: Whether to generate answers for the instructions or not. Defaults\n            to `False`.\n        mutation_templates: The mutation templates to be used for the generation of the\n            instructions.\n        min_length: Defines the length (in bytes) that the generated instruction needs to\n            be higher than, to be considered valid. Defaults to `512`.\n        max_length: Defines the length (in bytes) that the generated instruction needs to\n            be lower than, to be considered valid. Defaults to `1024`.\n        seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n            Defaults to `42`.\n\n    Runtime parameters:\n        - `min_length`: Defines the length (in bytes) that the generated instruction needs\n            to be higher than, to be considered valid.\n        - `max_length`: Defines the length (in bytes) that the generated instruction needs\n            to be lower than, to be considered valid.\n        - `seed`: The seed to be set for `numpy` in order to randomly pick a mutation method.\n\n    Output columns:\n        - instruction (`str`): The generated instruction if `generate_answers=False`.\n        - answer (`str`): The generated answer if `generate_answers=True`.\n        - instructions (`List[str]`): The generated instructions if `generate_answers=True`.\n        - model_name (`str`): The name of the LLM used to generate and evolve the instructions.\n\n    Categories:\n        - evol\n        - instruction\n        - generation\n\n    References:\n        - [WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244)\n        - [GitHub: 
h2oai/h2o-wizardlm](https://github.com/h2oai/h2o-wizardlm)\n\n    Examples:\n        Generate evolved instructions without initial instructions:\n\n        ```python\n        from distilabel.steps.tasks import EvolInstructGenerator\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        evol_instruct_generator = EvolInstructGenerator(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            num_instructions=2,\n        )\n\n        evol_instruct_generator.load()\n\n        result = next(scorer.process())\n        # result\n        # [{'instruction': 'generated instruction', 'model_name': 'test'}]\n        ```\n\n    Citations:\n        ```\n        @misc{xu2023wizardlmempoweringlargelanguage,\n            title={WizardLM: Empowering Large Language Models to Follow Complex Instructions},\n            author={Can Xu and Qingfeng Sun and Kai Zheng and Xiubo Geng and Pu Zhao and Jiazhan Feng and Chongyang Tao and Daxin Jiang},\n            year={2023},\n            eprint={2304.12244},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2304.12244},\n        }\n        ```\n    \"\"\"\n\n    num_instructions: int\n    generate_answers: bool = False\n    mutation_templates: Dict[str, str] = GENERATION_MUTATION_TEMPLATES\n\n    min_length: RuntimeParameter[int] = Field(\n        default=512,\n        description=\"Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.\",\n    )\n    max_length: RuntimeParameter[int] = Field(\n        default=1024,\n        description=\"Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.\",\n    )\n\n    seed: RuntimeParameter[int] = Field(\n        default=42,\n        description=\"As `numpy` is being used in 
order to randomly pick a mutation method, then is nice to seed a random seed.\",\n    )\n    _seed_texts: Optional[List[str]] = PrivateAttr(default_factory=list)\n    _prompts: Optional[List[str]] = PrivateAttr(default_factory=list)\n\n    def _generate_seed_texts(self) -> List[str]:\n        \"\"\"Generates a list of seed texts to be used as part of the starting prompts for the task.\n\n        It will use the `FRESH_START` mutation template, as it needs to generate text from scratch; and\n        a list of English words will be used to generate the seed texts that will be provided to the\n        mutation method and included within the prompt.\n\n        Returns:\n            A list of seed texts to be used as part of the starting prompts for the task.\n        \"\"\"\n        seed_texts = []\n        for _ in range(self.num_instructions * 10):\n            num_words = np.random.choice([1, 2, 3, 4])\n            seed_texts.append(\n                self.mutation_templates[\"FRESH_START\"].replace(  # type: ignore\n                    \"<PROMPT>\",\n                    \", \".join(\n                        [\n                            np.random.choice(self._english_nouns).strip()\n                            for _ in range(num_words)\n                        ]\n                    ),\n                )\n            )\n        return seed_texts\n\n    @override\n    def model_post_init(self, __context: Any) -> None:\n        \"\"\"Override this method to perform additional initialization after `__init__` and `model_construct`.\n        This is useful if you want to do some validation that requires the entire model to be initialized.\n        \"\"\"\n        super().model_post_init(__context)\n\n        np.random.seed(self.seed)\n\n        self._seed_texts = self._generate_seed_texts()\n        self._prompts = [\n            np.random.choice(self._seed_texts) for _ in range(self.num_instructions)\n        ]\n\n    @cached_property\n    def _english_nouns(self) -> 
List[str]:\n        \"\"\"A list of English nouns to be used as part of the starting prompts for the task.\n\n        References:\n            - https://github.com/h2oai/h2o-wizardlm\n        \"\"\"\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps/tasks/evol_instruct/english_nouns.txt\"\n        )\n        with open(_path, mode=\"r\") as f:\n            return [line.strip() for line in f.readlines()]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task are the `instruction`, the `answer` if `generate_answers=True`\n        and the `model_name`.\"\"\"\n        _outputs = [\"instruction\", \"model_name\"]\n        if self.generate_answers:\n            _outputs.append(\"answer\")\n        return _outputs\n\n    def format_output(  # type: ignore\n        self, instruction: str, answer: Optional[str] = None\n    ) -> Dict[str, Any]:\n        \"\"\"The output for the task is a dict with: `instruction`; `answer` if `generate_answers=True`;\n        and, finally, the `model_name`.\n\n        Args:\n            instruction: The instruction to be included within the output.\n            answer: The answer to be included within the output if `generate_answers=True`.\n\n        Returns:\n            If `generate_answers=True` return {\"instruction\": ..., \"answer\": ..., \"model_name\": ...};\n            if `generate_answers=False` return {\"instruction\": ..., \"model_name\": ...};\n        \"\"\"\n        _output = {\n            \"instruction\": instruction,\n            \"model_name\": self.llm.model_name,\n        }\n        if self.generate_answers and answer is not None:\n            _output[\"answer\"] = answer\n        return _output\n\n    @property\n    def mutation_templates_names(self) -> List[str]:\n        \"\"\"Returns the names i.e. 
keys of the provided `mutation_templates`.\"\"\"\n        return list(self.mutation_templates.keys())\n\n    def _apply_random_mutation(self, iter_no: int) -> List[\"ChatType\"]:\n        \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n        enum, and returns the provided instruction within the mutation prompt.\n\n        Args:\n            iter_no: The iteration number to be used to check whether the iteration is the\n                first one i.e. FRESH_START, or not.\n\n        Returns:\n            A random mutation prompt with the provided instruction formatted as an OpenAI conversation.\n        \"\"\"\n        prompts = []\n        for idx in range(self.num_instructions):\n            if (\n                iter_no == 0\n                or \"Write one question or request containing\" in self._prompts[idx]  # type: ignore\n            ):\n                mutation = \"FRESH_START\"\n            else:\n                mutation = np.random.choice(self.mutation_templates_names)\n                if mutation == \"FRESH_START\":\n                    self._prompts[idx] = np.random.choice(self._seed_texts)  # type: ignore\n\n            prompt_with_template = (\n                self.mutation_templates[mutation].replace(  # type: ignore\n                    \"<PROMPT>\",\n                    self._prompts[idx],  # type: ignore\n                )  # type: ignore\n                if iter_no != 0\n                else self._prompts[idx]  # type: ignore\n            )\n            prompts.append([{\"role\": \"user\", \"content\": prompt_with_template}])\n        return prompts\n\n    def _generate_answers(self, instructions: List[List[str]]) -> List[str]:\n        \"\"\"Generates the answer for the last instruction in `instructions`.\n\n        Args:\n            instructions: A list of lists where each item is a list with either the last\n                evolved instruction if `store_evolutions=False` or all the evolved 
instructions\n                if `store_evolutions=True`.\n\n        Returns:\n            A list of answers for the last instruction in `instructions`.\n        \"\"\"\n        # TODO: update to generate answers for all the instructions\n        _formatted_instructions = [\n            [{\"role\": \"user\", \"content\": instruction[-1]}]\n            for instruction in instructions\n        ]\n        responses = self.llm.generate(\n            _formatted_instructions,\n            **self.llm.generation_kwargs,  # type: ignore\n        )\n        return flatten_responses(responses)\n\n    @override\n    def process(self, offset: int = 0) -> \"GeneratorStepOutput\":  # type: ignore\n        \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n        Args:\n            offset: The offset to start the generation from. Defaults to 0.\n\n        Yields:\n            A list of Python dictionaries with the outputs of the task, and a boolean\n            flag indicating whether the task has finished or not i.e. 
is the last batch.\n        \"\"\"\n        instructions = []\n        mutation_no = 0\n\n        # TODO: update to take into account `offset`\n        iter_no = 0\n        while len(instructions) < self.num_instructions:\n            prompts = self._apply_random_mutation(iter_no=iter_no)\n\n            generated_prompts = flatten_responses(\n                self.llm.generate(prompts, **self.llm.generation_kwargs)  # type: ignore\n            )\n            for idx, generated_prompt in enumerate(generated_prompts):\n                generated_prompt = generated_prompt.split(\"Prompt#:\")[-1].strip()\n                if self.max_length >= len(generated_prompt) >= self.min_length:  # type: ignore\n                    instructions.append(generated_prompt)\n                    self._prompts[idx] = np.random.choice(self._seed_texts)  # type: ignore\n                else:\n                    self._prompts[idx] = generated_prompt  # type: ignore\n\n            self._logger.info(\n                f\"\ud83d\udd04 Ran iteration {iter_no} with {len(instructions)} instructions already evolved!\"\n            )\n            iter_no += 1\n\n            if len(instructions) > self.num_instructions:\n                instructions = instructions[: self.num_instructions]\n            if len(instructions) > mutation_no:\n                mutation_no = len(instructions) - mutation_no\n\n            if not self.generate_answers and len(instructions[-mutation_no:]) > 0:\n                yield (\n                    [\n                        self.format_output(mutated_instruction)\n                        for mutated_instruction in instructions[-mutation_no:]\n                    ],\n                    len(instructions) >= self.num_instructions,\n                )\n\n        self._logger.info(f\"\ud83c\udf89 Finished evolving {len(instructions)} instructions!\")\n\n        if self.generate_answers:\n            self._logger.info(\n                f\"\ud83e\udde0 Generating answers for 
the {len(instructions)} evolved instructions!\"\n            )\n\n            answers = self._generate_answers(instructions)\n\n            self._logger.info(\n                f\"\ud83c\udf89 Finished generating answers for the {len(instructions)} evolved instructions!\"\n            )\n\n            yield (\n                [\n                    self.format_output(instruction, answer)\n                    for instruction, answer in zip(instructions, answers)\n                ],\n                True,\n            )\n\n    @override\n    def _sample_input(self) -> \"ChatType\":\n        return self._apply_random_mutation(iter_no=0)[0]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator._english_nouns","title":"_english_nouns: List[str] cached property","text":"

A list of English nouns to be used as part of the starting prompts for the task.

References
  • https://github.com/h2oai/h2o-wizardlm
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.outputs","title":"outputs: List[str] property","text":"

The outputs for the task are the instruction, the answer (if generate_answers=True), and the model_name.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.mutation_templates_names","title":"mutation_templates_names: List[str] property","text":"

Returns the names, i.e. the keys, of the provided mutation_templates.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator._generate_seed_texts","title":"_generate_seed_texts()","text":"

Generates a list of seed texts to be used as part of the starting prompts for the task.

It will use the FRESH_START mutation template, as it needs to generate text from scratch; and a list of English words will be used to generate the seed texts that will be provided to the mutation method and included within the prompt.

Returns:

Type Description List[str]

A list of seed texts to be used as part of the starting prompts for the task.

Source code in src/distilabel/steps/tasks/evol_instruct/generator.py
def _generate_seed_texts(self) -> List[str]:\n    \"\"\"Generates a list of seed texts to be used as part of the starting prompts for the task.\n\n    It will use the `FRESH_START` mutation template, as it needs to generate text from scratch; and\n    a list of English words will be used to generate the seed texts that will be provided to the\n    mutation method and included within the prompt.\n\n    Returns:\n        A list of seed texts to be used as part of the starting prompts for the task.\n    \"\"\"\n    seed_texts = []\n    for _ in range(self.num_instructions * 10):\n        num_words = np.random.choice([1, 2, 3, 4])\n        seed_texts.append(\n            self.mutation_templates[\"FRESH_START\"].replace(  # type: ignore\n                \"<PROMPT>\",\n                \", \".join(\n                    [\n                        np.random.choice(self._english_nouns).strip()\n                        for _ in range(num_words)\n                    ]\n                ),\n            )\n        )\n    return seed_texts\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.model_post_init","title":"model_post_init(__context)","text":"

Override this method to perform additional initialization after __init__ and model_construct. This is useful if you want to do some validation that requires the entire model to be initialized.

Source code in src/distilabel/steps/tasks/evol_instruct/generator.py
@override\ndef model_post_init(self, __context: Any) -> None:\n    \"\"\"Override this method to perform additional initialization after `__init__` and `model_construct`.\n    This is useful if you want to do some validation that requires the entire model to be initialized.\n    \"\"\"\n    super().model_post_init(__context)\n\n    np.random.seed(self.seed)\n\n    self._seed_texts = self._generate_seed_texts()\n    self._prompts = [\n        np.random.choice(self._seed_texts) for _ in range(self.num_instructions)\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.format_output","title":"format_output(instruction, answer=None)","text":"

The output for the task is a dict with: instruction; answer if generate_answers=True; and, finally, the model_name.

Parameters:

Name Type Description Default instruction str

The instruction to be included within the output.

required answer Optional[str]

The answer to be included within the output if generate_answers=True.

None

Returns:

Type Description Dict[str, Any]

If generate_answers=True return {\"instruction\": ..., \"answer\": ..., \"model_name\": ...};

Dict[str, Any]

if generate_answers=False return {\"instruction\": ..., \"model_name\": ...};

Source code in src/distilabel/steps/tasks/evol_instruct/generator.py
def format_output(  # type: ignore\n    self, instruction: str, answer: Optional[str] = None\n) -> Dict[str, Any]:\n    \"\"\"The output for the task is a dict with: `instruction`; `answer` if `generate_answers=True`;\n    and, finally, the `model_name`.\n\n    Args:\n        instruction: The instruction to be included within the output.\n        answer: The answer to be included within the output if `generate_answers=True`.\n\n    Returns:\n        If `generate_answers=True` return {\"instruction\": ..., \"answer\": ..., \"model_name\": ...};\n        if `generate_answers=False` return {\"instruction\": ..., \"model_name\": ...};\n    \"\"\"\n    _output = {\n        \"instruction\": instruction,\n        \"model_name\": self.llm.model_name,\n    }\n    if self.generate_answers and answer is not None:\n        _output[\"answer\"] = answer\n    return _output\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator._apply_random_mutation","title":"_apply_random_mutation(iter_no)","text":"

Applies a random mutation from the ones provided as part of the mutation_templates dictionary, and returns the provided instruction within the mutation prompt.

Parameters:

Name Type Description Default iter_no int

The iteration number, used to check whether this is the first iteration (i.e. FRESH_START) or not.

required

Returns:

Type Description List[ChatType]

A random mutation prompt with the provided instruction formatted as an OpenAI conversation.

Source code in src/distilabel/steps/tasks/evol_instruct/generator.py
def _apply_random_mutation(self, iter_no: int) -> List[\"ChatType\"]:\n    \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n    enum, and returns the provided instruction within the mutation prompt.\n\n    Args:\n        iter_no: The iteration number to be used to check whether the iteration is the\n            first one i.e. FRESH_START, or not.\n\n    Returns:\n        A random mutation prompt with the provided instruction formatted as an OpenAI conversation.\n    \"\"\"\n    prompts = []\n    for idx in range(self.num_instructions):\n        if (\n            iter_no == 0\n            or \"Write one question or request containing\" in self._prompts[idx]  # type: ignore\n        ):\n            mutation = \"FRESH_START\"\n        else:\n            mutation = np.random.choice(self.mutation_templates_names)\n            if mutation == \"FRESH_START\":\n                self._prompts[idx] = np.random.choice(self._seed_texts)  # type: ignore\n\n        prompt_with_template = (\n            self.mutation_templates[mutation].replace(  # type: ignore\n                \"<PROMPT>\",\n                self._prompts[idx],  # type: ignore\n            )  # type: ignore\n            if iter_no != 0\n            else self._prompts[idx]  # type: ignore\n        )\n        prompts.append([{\"role\": \"user\", \"content\": prompt_with_template}])\n    return prompts\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator._generate_answers","title":"_generate_answers(instructions)","text":"

Generates the answer for the last instruction in instructions.

Parameters:

Name Type Description Default instructions List[List[str]]

A list of lists where each item is a list with either the last evolved instruction if store_evolutions=False or all the evolved instructions if store_evolutions=True.

required

Returns:

Type Description List[str]

A list of answers for the last instruction in instructions.

Source code in src/distilabel/steps/tasks/evol_instruct/generator.py
def _generate_answers(self, instructions: List[List[str]]) -> List[str]:\n    \"\"\"Generates the answer for the last instruction in `instructions`.\n\n    Args:\n        instructions: A list of lists where each item is a list with either the last\n            evolved instruction if `store_evolutions=False` or all the evolved instructions\n            if `store_evolutions=True`.\n\n    Returns:\n        A list of answers for the last instruction in `instructions`.\n    \"\"\"\n    # TODO: update to generate answers for all the instructions\n    _formatted_instructions = [\n        [{\"role\": \"user\", \"content\": instruction[-1]}]\n        for instruction in instructions\n    ]\n    responses = self.llm.generate(\n        _formatted_instructions,\n        **self.llm.generation_kwargs,  # type: ignore\n    )\n    return flatten_responses(responses)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolInstructGenerator.process","title":"process(offset=0)","text":"

Processes the inputs of the task and generates the outputs using the LLM.

Parameters:

Name Type Description Default offset int

The offset to start the generation from. Defaults to 0.

0

Yields:

Type Description GeneratorStepOutput

A list of Python dictionaries with the outputs of the task, and a boolean

GeneratorStepOutput

flag indicating whether the task has finished or not i.e. is the last batch.

Source code in src/distilabel/steps/tasks/evol_instruct/generator.py
@override\ndef process(self, offset: int = 0) -> \"GeneratorStepOutput\":  # type: ignore\n    \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n    Args:\n        offset: The offset to start the generation from. Defaults to 0.\n\n    Yields:\n        A list of Python dictionaries with the outputs of the task, and a boolean\n        flag indicating whether the task has finished or not i.e. is the last batch.\n    \"\"\"\n    instructions = []\n    mutation_no = 0\n\n    # TODO: update to take into account `offset`\n    iter_no = 0\n    while len(instructions) < self.num_instructions:\n        prompts = self._apply_random_mutation(iter_no=iter_no)\n\n        generated_prompts = flatten_responses(\n            self.llm.generate(prompts, **self.llm.generation_kwargs)  # type: ignore\n        )\n        for idx, generated_prompt in enumerate(generated_prompts):\n            generated_prompt = generated_prompt.split(\"Prompt#:\")[-1].strip()\n            if self.max_length >= len(generated_prompt) >= self.min_length:  # type: ignore\n                instructions.append(generated_prompt)\n                self._prompts[idx] = np.random.choice(self._seed_texts)  # type: ignore\n            else:\n                self._prompts[idx] = generated_prompt  # type: ignore\n\n        self._logger.info(\n            f\"\ud83d\udd04 Ran iteration {iter_no} with {len(instructions)} instructions already evolved!\"\n        )\n        iter_no += 1\n\n        if len(instructions) > self.num_instructions:\n            instructions = instructions[: self.num_instructions]\n        if len(instructions) > mutation_no:\n            mutation_no = len(instructions) - mutation_no\n\n        if not self.generate_answers and len(instructions[-mutation_no:]) > 0:\n            yield (\n                [\n                    self.format_output(mutated_instruction)\n                    for mutated_instruction in instructions[-mutation_no:]\n                ],\n         
       len(instructions) >= self.num_instructions,\n            )\n\n    self._logger.info(f\"\ud83c\udf89 Finished evolving {len(instructions)} instructions!\")\n\n    if self.generate_answers:\n        self._logger.info(\n            f\"\ud83e\udde0 Generating answers for the {len(instructions)} evolved instructions!\"\n        )\n\n        answers = self._generate_answers(instructions)\n\n        self._logger.info(\n            f\"\ud83c\udf89 Finished generating answers for the {len(instructions)} evolved instructions!\"\n        )\n\n        yield (\n            [\n                self.format_output(instruction, answer)\n                for instruction, answer in zip(instructions, answers)\n            ],\n            True,\n        )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality","title":"EvolQuality","text":"

Bases: Task

Evolve the quality of the responses using an LLM.

EvolQuality task is used to evolve the quality of the responses given a prompt, by generating a new response with a language model. This step implements the evolution quality task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.

Attributes:

Name Type Description num_evolutions int

The number of evolutions to be performed on the responses.

store_evolutions bool

Whether to store all the evolved responses or just the last one. Defaults to False.

include_original_response bool

Whether to include the original response within the evolved responses. Defaults to False.

mutation_templates Dict[str, str]

The mutation templates to be used to evolve the responses.

seed RuntimeParameter[int]

The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

Runtime parameters
  • seed: The seed to be set for numpy in order to randomly pick a mutation method.
Input columns
  • instruction (str): The instruction that was used to generate the responses.
  • response (str): The response to be rewritten.
Output columns
  • evolved_response (str): The evolved response if store_evolutions=False.
  • evolved_responses (List[str]): The evolved responses if store_evolutions=True.
  • model_name (str): The name of the LLM used to evolve the responses.
Categories
  • evol
  • response
  • deita
References
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning

Examples:

Evolve the quality of the responses given a prompt:

from distilabel.steps.tasks import EvolQuality\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_quality = EvolQuality(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n)\n\nevol_quality.load()\n\nresult = next(\n    evol_quality.process(\n        [\n            {\"instruction\": \"common instruction\", \"response\": \"a response\"},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'common instruction',\n#         'response': 'a response',\n#         'evolved_response': 'evolved response',\n#         'model_name': '\"mistralai/Mistral-7B-Instruct-v0.2\"'\n#     }\n# ]\n
Citations
@misc{liu2024makesgooddataalignment,\n    title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n    author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n    year={2024},\n    eprint={2312.15685},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2312.15685},\n}\n
Source code in src/distilabel/steps/tasks/evol_quality/base.py
class EvolQuality(Task):\n    \"\"\"Evolve the quality of the responses using an `LLM`.\n\n    `EvolQuality` task is used to evolve the quality of the responses given a prompt,\n    by generating a new response with a language model. This step implements the evolution\n    quality task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of\n    Automatic Data Selection in Instruction Tuning'.\n\n    Attributes:\n        num_evolutions: The number of evolutions to be performed on the responses.\n        store_evolutions: Whether to store all the evolved responses or just the last one.\n            Defaults to `False`.\n        include_original_response: Whether to include the original response within the evolved\n            responses. Defaults to `False`.\n        mutation_templates: The mutation templates to be used to evolve the responses.\n        seed: The seed to be set for `numpy` in order to randomly pick a mutation method.\n            Defaults to `42`.\n\n    Runtime parameters:\n        - `seed`: The seed to be set for `numpy` in order to randomly pick a mutation method.\n\n    Input columns:\n        - instruction (`str`): The instruction that was used to generate the `responses`.\n        - response (`str`): The responses to be rewritten.\n\n    Output columns:\n        - evolved_response (`str`): The evolved response if `store_evolutions=False`.\n        - evolved_responses (`List[str]`): The evolved responses if `store_evolutions=True`.\n        - model_name (`str`): The name of the LLM used to evolve the responses.\n\n    Categories:\n        - evol\n        - response\n        - deita\n\n    References:\n        - [`What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685)\n\n    Examples:\n        Evolve the quality of the responses given a prompt:\n\n        ```python\n        from distilabel.steps.tasks import EvolQuality\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        evol_quality = EvolQuality(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            num_evolutions=2,\n        )\n\n        evol_quality.load()\n\n        result = next(\n            evol_quality.process(\n                [\n                    {\"instruction\": \"common instruction\", \"response\": \"a response\"},\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'instruction': 'common instruction',\n        #         'response': 'a response',\n        #         'evolved_response': 'evolved response',\n        #         'model_name': '\"mistralai/Mistral-7B-Instruct-v0.2\"'\n        #     }\n        # ]\n        ```\n\n    Citations:\n        ```\n        @misc{liu2024makesgooddataalignment,\n            title={What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n            author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n            year={2024},\n            eprint={2312.15685},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2312.15685},\n        }\n        ```\n    \"\"\"\n\n    num_evolutions: int\n    store_evolutions: bool = False\n    include_original_response: bool = False\n    mutation_templates: Dict[str, str] = MUTATION_TEMPLATES\n\n    seed: RuntimeParameter[int] = Field(\n        default=42,\n        description=\"As `numpy` is being used in order to randomly pick a mutation method, then is nice to set a random seed.\",\n    )\n\n    @override\n    def model_post_init(self, __context: Any) -> None:\n        \"\"\"Override this method to perform additional initialization after `__init__` and `model_construct`.\n        This is useful if you want to do some validation that requires the entire model to be initialized.\n        \"\"\"\n        super().model_post_init(__context)\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task are the `instruction` and `response`.\"\"\"\n        return [\"instruction\", \"response\"]\n\n    def format_input(self, input: str) -> ChatType:  # type: ignore\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation. 
And the\n        `system_prompt` is added as the first message if it exists.\"\"\"\n        return [{\"role\": \"user\", \"content\": input}]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task are the `evolved_response/s` and the `model_name`.\"\"\"\n        # TODO: having to define a `model_name` column every time as the `Task.outputs` is not ideal,\n        # this could be handled always and the value could be included within the DAG validation when\n        # a `Task` is used, since all the `Task` subclasses will have an `llm` with a `model_name` attr.\n        _outputs = [\n            (\"evolved_response\" if not self.store_evolutions else \"evolved_responses\"),\n            \"model_name\",\n        ]\n\n        return _outputs\n\n    def format_output(self, responses: Union[str, List[str]]) -> Dict[str, Any]:  # type: ignore\n        \"\"\"The output for the task is a dict with: `evolved_response` or `evolved_responses`,\n        depending whether the value is either `False` or `True` for `store_evolutions`, respectively;\n        and, finally, the `model_name`.\n\n        Args:\n            responses: The responses to be included within the output.\n\n        Returns:\n            if `store_evolutions=False` return {\"evolved_response\": ..., \"model_name\": ...};\n            if `store_evolutions=True` return {\"evolved_responses\": ..., \"model_name\": ...}.\n        \"\"\"\n        _output = {}\n\n        if not self.store_evolutions:\n            _output[\"evolved_response\"] = responses[-1]\n        else:\n            _output[\"evolved_responses\"] = responses\n\n        _output[\"model_name\"] = self.llm.model_name\n        return _output\n\n    @property\n    def mutation_templates_names(self) -> List[str]:\n        \"\"\"Returns the names i.e. 
keys of the provided `mutation_templates` enum.\"\"\"\n        return list(self.mutation_templates.keys())\n\n    def _apply_random_mutation(self, instruction: str, response: str) -> str:\n        \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n        enum, and returns the provided instruction within the mutation prompt.\n\n        Args:\n            instruction: The instruction to be included within the mutation prompt.\n\n        Returns:\n            A random mutation prompt with the provided instruction.\n        \"\"\"\n        mutation = np.random.choice(self.mutation_templates_names)\n        return (\n            self.mutation_templates[mutation]\n            .replace(\"<PROMPT>\", instruction)\n            .replace(\"<RESPONSE>\", response)\n        )\n\n    def _evolve_reponses(self, inputs: \"StepInput\") -> List[List[str]]:\n        \"\"\"Evolves the instructions provided as part of the inputs of the task.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Returns:\n            A list where each item is a list with either the last evolved instruction if\n            `store_evolutions=False` or all the evolved instructions if `store_evolutions=True`.\n        \"\"\"\n        np.random.seed(self.seed)\n        instructions: List[List[str]] = [[input[\"instruction\"]] for input in inputs]\n        responses: List[List[str]] = [[input[\"response\"]] for input in inputs]\n\n        for iter_no in range(self.num_evolutions):\n            formatted_prompts = []\n            for instruction, response in zip(instructions, responses):\n                formatted_prompts.append(\n                    self._apply_random_mutation(instruction[-1], response[-1])\n                )\n\n            formatted_prompts = [\n                self.format_input(prompt) for prompt in formatted_prompts\n            ]\n\n            generated_responses = self.llm.generate(\n         
       formatted_prompts,\n                **self.llm.generation_kwargs,  # type: ignore\n            )\n\n            if self.store_evolutions:\n                responses = [\n                    response + [evolved_response[0]]\n                    for response, evolved_response in zip(\n                        responses, generated_responses\n                    )\n                ]\n            else:\n                responses = [\n                    [evolved_response[0]] for evolved_response in generated_responses\n                ]\n\n            self._logger.info(\n                f\"\ud83d\udd04 Ran iteration {iter_no} evolving {len(responses)} responses!\"\n            )\n\n        return responses\n\n    @override\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Returns:\n            A list of Python dictionaries with the outputs of the task.\n        \"\"\"\n\n        responses = self._evolve_reponses(inputs)\n\n        if self.store_evolutions:\n            # Remove the input instruction from the `evolved_responses` list\n            from_ = 1 if not self.include_original_response else 0\n            responses = [response[from_:] for response in responses]\n\n        for input, response in zip(inputs, responses):\n            input.update(self.format_output(response))\n        yield inputs\n\n        self._logger.info(f\"\ud83c\udf89 Finished evolving {len(responses)} instructions!\")\n\n    @override\n    def _sample_input(self) -> ChatType:\n        return self.format_input(\"<PLACEHOLDER_INSTRUCTION>\")\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.inputs","title":"inputs: List[str] property","text":"

The inputs for the task are the instruction and response.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.outputs","title":"outputs: List[str] property","text":"

The outputs for the task are the evolved_response/s and the model_name.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.mutation_templates_names","title":"mutation_templates_names: List[str] property","text":"

Returns the names, i.e. the keys, of the provided mutation_templates dictionary.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.model_post_init","title":"model_post_init(__context)","text":"

Override this method to perform additional initialization after __init__ and model_construct. This is useful if you want to do some validation that requires the entire model to be initialized.

Source code in src/distilabel/steps/tasks/evol_quality/base.py
@override\ndef model_post_init(self, __context: Any) -> None:\n    \"\"\"Override this method to perform additional initialization after `__init__` and `model_construct`.\n    This is useful if you want to do some validation that requires the entire model to be initialized.\n    \"\"\"\n    super().model_post_init(__context)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType containing a single user message, assuming that the given input is the first interaction from the user within a conversation. This method itself does not add a system_prompt message.

Source code in src/distilabel/steps/tasks/evol_quality/base.py
def format_input(self, input: str) -> ChatType:  # type: ignore\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation. And the\n    `system_prompt` is added as the first message if it exists.\"\"\"\n    return [{\"role\": \"user\", \"content\": input}]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.format_output","title":"format_output(responses)","text":"

The output for the task is a dict with: evolved_response or evolved_responses, depending whether the value is either False or True for store_evolutions, respectively; and, finally, the model_name.

Parameters:

Name Type Description Default responses Union[str, List[str]]

The responses to be included within the output.

required

Returns:

Type Description Dict[str, Any]

if store_evolutions=False return {\"evolved_response\": ..., \"model_name\": ...};

Dict[str, Any]

if store_evolutions=True return {\"evolved_responses\": ..., \"model_name\": ...}.

Source code in src/distilabel/steps/tasks/evol_quality/base.py
def format_output(self, responses: Union[str, List[str]]) -> Dict[str, Any]:  # type: ignore\n    \"\"\"The output for the task is a dict with: `evolved_response` or `evolved_responses`,\n    depending whether the value is either `False` or `True` for `store_evolutions`, respectively;\n    and, finally, the `model_name`.\n\n    Args:\n        responses: The responses to be included within the output.\n\n    Returns:\n        if `store_evolutions=False` return {\"evolved_response\": ..., \"model_name\": ...};\n        if `store_evolutions=True` return {\"evolved_responses\": ..., \"model_name\": ...}.\n    \"\"\"\n    _output = {}\n\n    if not self.store_evolutions:\n        _output[\"evolved_response\"] = responses[-1]\n    else:\n        _output[\"evolved_responses\"] = responses\n\n    _output[\"model_name\"] = self.llm.model_name\n    return _output\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality._apply_random_mutation","title":"_apply_random_mutation(instruction, response)","text":"

Applies a random mutation chosen from the ones provided as part of the mutation_templates dictionary, and returns the mutation prompt with the provided instruction and response filled in.

Parameters:

Name Type Description Default instruction str

The instruction to be included within the mutation prompt.

required

Returns:

Type Description str

A random mutation prompt with the provided instruction.

Source code in src/distilabel/steps/tasks/evol_quality/base.py
def _apply_random_mutation(self, instruction: str, response: str) -> str:\n    \"\"\"Applies a random mutation from the ones provided as part of the `mutation_templates`\n    enum, and returns the provided instruction within the mutation prompt.\n\n    Args:\n        instruction: The instruction to be included within the mutation prompt.\n\n    Returns:\n        A random mutation prompt with the provided instruction.\n    \"\"\"\n    mutation = np.random.choice(self.mutation_templates_names)\n    return (\n        self.mutation_templates[mutation]\n        .replace(\"<PROMPT>\", instruction)\n        .replace(\"<RESPONSE>\", response)\n    )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality._evolve_reponses","title":"_evolve_reponses(inputs)","text":"

Evolves the responses provided as part of the inputs of the task.

Parameters:

Name Type Description Default inputs StepInput

A list of Python dictionaries with the inputs of the task.

required

Returns:

Type Description List[List[str]]

A list where each item is a list with either the last evolved response if

List[List[str]]

store_evolutions=False or all the evolved responses if store_evolutions=True.

Source code in src/distilabel/steps/tasks/evol_quality/base.py
def _evolve_reponses(self, inputs: \"StepInput\") -> List[List[str]]:\n    \"\"\"Evolves the instructions provided as part of the inputs of the task.\n\n    Args:\n        inputs: A list of Python dictionaries with the inputs of the task.\n\n    Returns:\n        A list where each item is a list with either the last evolved instruction if\n        `store_evolutions=False` or all the evolved instructions if `store_evolutions=True`.\n    \"\"\"\n    np.random.seed(self.seed)\n    instructions: List[List[str]] = [[input[\"instruction\"]] for input in inputs]\n    responses: List[List[str]] = [[input[\"response\"]] for input in inputs]\n\n    for iter_no in range(self.num_evolutions):\n        formatted_prompts = []\n        for instruction, response in zip(instructions, responses):\n            formatted_prompts.append(\n                self._apply_random_mutation(instruction[-1], response[-1])\n            )\n\n        formatted_prompts = [\n            self.format_input(prompt) for prompt in formatted_prompts\n        ]\n\n        generated_responses = self.llm.generate(\n            formatted_prompts,\n            **self.llm.generation_kwargs,  # type: ignore\n        )\n\n        if self.store_evolutions:\n            responses = [\n                response + [evolved_response[0]]\n                for response, evolved_response in zip(\n                    responses, generated_responses\n                )\n            ]\n        else:\n            responses = [\n                [evolved_response[0]] for evolved_response in generated_responses\n            ]\n\n        self._logger.info(\n            f\"\ud83d\udd04 Ran iteration {iter_no} evolving {len(responses)} responses!\"\n        )\n\n    return responses\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.EvolQuality.process","title":"process(inputs)","text":"

Processes the inputs of the task and generates the outputs using the LLM.

Parameters:

Name Type Description Default inputs StepInput

A list of Python dictionaries with the inputs of the task.

required

Returns:

Type Description StepOutput

A list of Python dictionaries with the outputs of the task.

Source code in src/distilabel/steps/tasks/evol_quality/base.py
@override\ndef process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Processes the inputs of the task and generates the outputs using the LLM.\n\n    Args:\n        inputs: A list of Python dictionaries with the inputs of the task.\n\n    Returns:\n        A list of Python dictionaries with the outputs of the task.\n    \"\"\"\n\n    responses = self._evolve_reponses(inputs)\n\n    if self.store_evolutions:\n        # Remove the input instruction from the `evolved_responses` list\n        from_ = 1 if not self.include_original_response else 0\n        responses = [response[from_:] for response in responses]\n\n    for input, response in zip(inputs, responses):\n        input.update(self.format_output(response))\n    yield inputs\n\n    self._logger.info(f\"\ud83c\udf89 Finished evolving {len(responses)} instructions!\")\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings","title":"GenerateEmbeddings","text":"

Bases: Step

Generate embeddings using the last hidden state of an LLM.

Generate embeddings for a text input using the last hidden state of an LLM, as described in the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.

Attributes:

Name Type Description llm LLM

The LLM to use to generate the embeddings.

Input columns
  • text (str, List[Dict[str, str]]): The input text or conversation to generate embeddings for.
Output columns
  • embedding (List[float]): The embedding of the input text or conversation.
  • model_name (str): The model name used to generate the embeddings.
Categories
  • embedding
  • llm
References
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning

Examples:

Generate embeddings for a text input:

from distilabel.steps.tasks import GenerateEmbeddings\nfrom distilabel.models.llms.huggingface import TransformersLLM\n\n# Consider this as a placeholder for your actual LLM.\nembedder = GenerateEmbeddings(\n    llm=TransformersLLM(\n        model=\"TaylorAI/bge-micro-v2\",\n        model_kwargs={\"is_decoder\": True},\n        cuda_devices=[],\n    )\n)\nembedder.load()\n\nresult = next(\n    embedder.process(\n        [\n            {\"text\": \"Hello, how are you?\"},\n        ]\n    )\n)\n
Citations
@misc{liu2024makesgooddataalignment,\n    title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n    author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n    year={2024},\n    eprint={2312.15685},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2312.15685},\n}\n
Source code in src/distilabel/steps/tasks/generate_embeddings.py
class GenerateEmbeddings(Step):\n    \"\"\"Generate embeddings using the last hidden state of an `LLM`.\n\n    Generate embeddings for a text input using the last hidden state of an `LLM`, as\n    described in the paper 'What Makes Good Data for Alignment? A Comprehensive Study of\n    Automatic Data Selection in Instruction Tuning'.\n\n    Attributes:\n        llm: The `LLM` to use to generate the embeddings.\n\n    Input columns:\n        - text (`str`, `List[Dict[str, str]]`): The input text or conversation to generate\n            embeddings for.\n\n    Output columns:\n        - embedding (`List[float]`): The embedding of the input text or conversation.\n        - model_name (`str`): The model name used to generate the embeddings.\n\n    Categories:\n        - embedding\n        - llm\n\n    References:\n        - [What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning](https://arxiv.org/abs/2312.15685)\n\n    Examples:\n        Rank LLM candidates:\n\n        ```python\n        from distilabel.steps.tasks import GenerateEmbeddings\n        from distilabel.models.llms.huggingface import TransformersLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        embedder = GenerateEmbeddings(\n            llm=TransformersLLM(\n                model=\"TaylorAI/bge-micro-v2\",\n                model_kwargs={\"is_decoder\": True},\n                cuda_devices=[],\n            )\n        )\n        embedder.load()\n\n        result = next(\n            embedder.process(\n                [\n                    {\"text\": \"Hello, how are you?\"},\n                ]\n            )\n        )\n        ```\n\n    Citations:\n        ```\n        @misc{liu2024makesgooddataalignment,\n            title={What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n            author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n            year={2024},\n            eprint={2312.15685},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2312.15685},\n        }\n        ```\n    \"\"\"\n\n    llm: LLM\n\n    def load(self) -> None:\n        \"\"\"Loads the `LLM` used to generate the embeddings.\"\"\"\n        super().load()\n\n        self.llm.load()\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The inputs for the task is a `text` column containing either a string or a\n        list of dictionaries in OpenAI chat-like format.\"\"\"\n        return [\"text\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The outputs for the task is an `embedding` column containing the embedding of\n        the `text` input.\"\"\"\n        return [\"embedding\", \"model_name\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"Formats the input to be used by the LLM to generate the embeddings. The input\n        can be in `ChatType` format or a string. If a string, it will be converted to a\n        list of dictionaries in OpenAI chat-like format.\n\n        Args:\n            input: The input to format.\n\n        Returns:\n            The OpenAI chat-like format of the input.\n        \"\"\"\n        text = input[\"text\"] = input[\"text\"]\n\n        # input is in `ChatType` format\n        if isinstance(text, str):\n            return [{\"role\": \"user\", \"content\": text}]\n\n        if is_openai_format(text):\n            return text\n\n        raise DistilabelUserError(\n            f\"Couldn't format input for step {self.name}. 
The `text` input column has to\"\n            \" be a string or a list of dictionaries in OpenAI chat-like format.\",\n            page=\"components-gallery/tasks/generateembeddings/\",\n        )\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Generates an embedding for each input using the last hidden state of the `LLM`.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Yields:\n            A list of Python dictionaries with the outputs of the task.\n        \"\"\"\n        formatted_inputs = [self.format_input(input) for input in inputs]\n        last_hidden_states = self.llm.get_last_hidden_states(formatted_inputs)\n        for input, hidden_state in zip(inputs, last_hidden_states):\n            input[\"embedding\"] = hidden_state[-1].tolist()\n            input[\"model_name\"] = self.llm.model_name\n        yield inputs\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.inputs","title":"inputs: StepColumns property","text":"

The input for the task is a text column containing either a string or a list of dictionaries in OpenAI chat-like format.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.outputs","title":"outputs: StepColumns property","text":"

The outputs for the task are an embedding column containing the embedding of the text input, and a model_name column.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.load","title":"load()","text":"

Loads the LLM used to generate the embeddings.

Source code in src/distilabel/steps/tasks/generate_embeddings.py
def load(self) -> None:\n    \"\"\"Loads the `LLM` used to generate the embeddings.\"\"\"\n    super().load()\n\n    self.llm.load()\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.format_input","title":"format_input(input)","text":"

Formats the input to be used by the LLM to generate the embeddings. The input can be in ChatType format or a string. If a string, it will be converted to a list of dictionaries in OpenAI chat-like format.

Parameters:

Name Type Description Default input Dict[str, Any]

The input to format.

required

Returns:

Type Description ChatType

The OpenAI chat-like format of the input.

Source code in src/distilabel/steps/tasks/generate_embeddings.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"Formats the input to be used by the LLM to generate the embeddings. The input\n    can be in `ChatType` format or a string. If a string, it will be converted to a\n    list of dictionaries in OpenAI chat-like format.\n\n    Args:\n        input: The input to format.\n\n    Returns:\n        The OpenAI chat-like format of the input.\n    \"\"\"\n    text = input[\"text\"] = input[\"text\"]\n\n    # input is in `ChatType` format\n    if isinstance(text, str):\n        return [{\"role\": \"user\", \"content\": text}]\n\n    if is_openai_format(text):\n        return text\n\n    raise DistilabelUserError(\n        f\"Couldn't format input for step {self.name}. The `text` input column has to\"\n        \" be a string or a list of dictionaries in OpenAI chat-like format.\",\n        page=\"components-gallery/tasks/generateembeddings/\",\n    )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateEmbeddings.process","title":"process(inputs)","text":"

Generates an embedding for each input using the last hidden state of the LLM.

Parameters:

Name Type Description Default inputs StepInput

A list of Python dictionaries with the inputs of the task.

required

Yields:

Type Description StepOutput

A list of Python dictionaries with the outputs of the task.

Source code in src/distilabel/steps/tasks/generate_embeddings.py
def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Generates an embedding for each input using the last hidden state of the `LLM`.\n\n    Args:\n        inputs: A list of Python dictionaries with the inputs of the task.\n\n    Yields:\n        A list of Python dictionaries with the outputs of the task.\n    \"\"\"\n    formatted_inputs = [self.format_input(input) for input in inputs]\n    last_hidden_states = self.llm.get_last_hidden_states(formatted_inputs)\n    for input, hidden_state in zip(inputs, last_hidden_states):\n        input[\"embedding\"] = hidden_state[-1].tolist()\n        input[\"model_name\"] = self.llm.model_name\n    yield inputs\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct","title":"Genstruct","text":"

Bases: Task

Generate a pair of instruction-response from a document using an LLM.

Genstruct is a pre-defined task designed to generate valid instructions from a given raw document, with the title and the content, enabling the creation of new, partially synthetic instruction finetuning datasets from any raw-text corpus. The task is based on the Genstruct 7B model by Nous Research, which is inspired by the Ada-Instruct paper.

Note

The Genstruct prompt, i.e. the task, can be used with any model, but the safest / recommended option is to use NousResearch/Genstruct-7B as the LLM provided to the task, since it was trained for this specific task.

Attributes:

Name Type Description _template Union[Template, None]

a Jinja2 template used to format the input for the LLM.

Input columns
  • title (str): The title of the document.
  • content (str): The content of the document.
Output columns
  • user (str): The user's instruction based on the document.
  • assistant (str): The assistant's response based on the user's instruction.
  • model_name (str): The model name used to generate the user and assistant outputs.
Categories
  • text-generation
  • instruction
  • response
References
  • Genstruct 7B by Nous Research
  • Ada-Instruct: Adapting Instruction Generators for Complex Reasoning

Examples:

Generate instructions from raw documents using the title and content:

from distilabel.steps.tasks import Genstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\ngenstruct = Genstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"NousResearch/Genstruct-7B\",\n    ),\n)\n\ngenstruct.load()\n\nresult = next(\n    genstruct.process(\n        [\n            {\"title\": \"common instruction\", \"content\": \"content of the document\"},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'title': 'An instruction',\n#         'content': 'content of the document',\n#         'model_name': 'test',\n#         'user': 'An instruction',\n#         'assistant': 'content of the document',\n#     }\n# ]\n
Citations
@misc{cui2023adainstructadaptinginstructiongenerators,\n    title={Ada-Instruct: Adapting Instruction Generators for Complex Reasoning},\n    author={Wanyun Cui and Qianle Wang},\n    year={2023},\n    eprint={2310.04484},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2310.04484},\n}\n
Source code in src/distilabel/steps/tasks/genstruct.py
class Genstruct(Task):\n    \"\"\"Generate a pair of instruction-response from a document using an `LLM`.\n\n    `Genstruct` is a pre-defined task designed to generate valid instructions from a given raw document,\n    with the title and the content, enabling the creation of new, partially synthetic instruction finetuning\n    datasets from any raw-text corpus. The task is based on the Genstruct 7B model by Nous Research, which is\n    inspired in the Ada-Instruct paper.\n\n    Note:\n        The Genstruct prompt i.e. the task, can be used with any model really, but the safest / recommended\n        option is to use `NousResearch/Genstruct-7B` as the LLM provided to the task, since it was trained\n        for this specific task.\n\n    Attributes:\n        _template: a Jinja2 template used to format the input for the LLM.\n\n    Input columns:\n        - title (`str`): The title of the document.\n        - content (`str`): The content of the document.\n\n    Output columns:\n        - user (`str`): The user's instruction based on the document.\n        - assistant (`str`): The assistant's response based on the user's instruction.\n        - model_name (`str`): The model name used to generate the `feedback` and `result`.\n\n    Categories:\n        - text-generation\n        - instruction\n        - response\n\n    References:\n        - [Genstruct 7B by Nous Research](https://huggingface.co/NousResearch/Genstruct-7B)\n        - [Ada-Instruct: Adapting Instruction Generators for Complex Reasoning](https://arxiv.org/abs/2310.04484)\n\n    Examples:\n        Generate instructions from raw documents using the title and content:\n\n        ```python\n        from distilabel.steps.tasks import Genstruct\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        genstruct = Genstruct(\n            llm=InferenceEndpointsLLM(\n                model_id=\"NousResearch/Genstruct-7B\",\n            ),\n 
       )\n\n        genstruct.load()\n\n        result = next(\n            genstruct.process(\n                [\n                    {\"title\": \"common instruction\", \"content\": \"content of the document\"},\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'title': 'An instruction',\n        #         'content': 'content of the document',\n        #         'model_name': 'test',\n        #         'user': 'An instruction',\n        #         'assistant': 'content of the document',\n        #     }\n        # ]\n        ```\n\n    Citations:\n        ```\n        @misc{cui2023adainstructadaptinginstructiongenerators,\n            title={Ada-Instruct: Adapting Instruction Generators for Complex Reasoning},\n            author={Wanyun Cui and Qianle Wang},\n            year={2023},\n            eprint={2310.04484},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2310.04484},\n        }\n        ```\n    \"\"\"\n\n    _template: Union[Template, None] = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"genstruct.jinja2\"\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The inputs for the task are the `title` and the `content`.\"\"\"\n        return [\"title\", \"content\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": 
self._template.render(  # type: ignore\n                    title=input[\"title\"], content=input[\"content\"]\n                ),\n            }\n        ]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task are the `user` instruction based on the provided document\n        and the `assistant` response based on the user's instruction.\"\"\"\n        return [\"user\", \"assistant\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted so that both the user and the assistant messages are\n        captured.\n\n        Args:\n            output: the raw output of the LLM.\n            input: the input to the task. Used for obtaining the number of responses.\n\n        Returns:\n            A dict with the keys `user` and `assistant` containing the content for each role.\n        \"\"\"\n        if output is None:\n            return {\"user\": None, \"assistant\": None}\n\n        matches = re.search(_PARSE_GENSTRUCT_OUTPUT_REGEX, output, re.DOTALL)\n        if not matches:\n            return {\"user\": None, \"assistant\": None}\n\n        return {\n            \"user\": matches.group(1).strip(),\n            \"assistant\": matches.group(2).strip(),\n        }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.inputs","title":"inputs: List[str] property","text":"

The inputs for the task are the title and the content.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.outputs","title":"outputs: List[str] property","text":"

The outputs of the task are the user instruction based on the provided document and the assistant response based on the user's instruction.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.load","title":"load()","text":"

Loads the Jinja2 template.

Source code in src/distilabel/steps/tasks/genstruct.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"genstruct.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/genstruct.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    return [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(  # type: ignore\n                title=input[\"title\"], content=input[\"content\"]\n            ),\n        }\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Genstruct.format_output","title":"format_output(output, input)","text":"

The output is formatted so that both the user and the assistant messages are captured.

Parameters:

Name Type Description Default output Union[str, None]

the raw output of the LLM.

required input Dict[str, Any]

the input to the task. Used for obtaining the number of responses.

required

Returns:

Type Description Dict[str, Any]

A dict with the keys user and assistant containing the content for each role.

Source code in src/distilabel/steps/tasks/genstruct.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted so that both the user and the assistant messages are\n    captured.\n\n    Args:\n        output: the raw output of the LLM.\n        input: the input to the task. Used for obtaining the number of responses.\n\n    Returns:\n        A dict with the keys `user` and `assistant` containing the content for each role.\n    \"\"\"\n    if output is None:\n        return {\"user\": None, \"assistant\": None}\n\n    matches = re.search(_PARSE_GENSTRUCT_OUTPUT_REGEX, output, re.DOTALL)\n    if not matches:\n        return {\"user\": None, \"assistant\": None}\n\n    return {\n        \"user\": matches.group(1).strip(),\n        \"assistant\": matches.group(2).strip(),\n    }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.BitextRetrievalGenerator","title":"BitextRetrievalGenerator","text":"

Bases: _EmbeddingDataGenerator

Generate bitext retrieval data with an LLM to later on train an embedding model.

BitextRetrievalGenerator is a GeneratorTask that generates bitext retrieval data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

Attributes:

Name Type Description source_language str

The source language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

target_language str

The target language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

unit Optional[Literal['sentence', 'phrase', 'passage']]

The unit of the data to be generated, which can be sentence, phrase, or passage. Defaults to None, meaning that it will be randomly sampled.

difficulty Optional[Literal['elementary school', 'high school', 'college']]

The difficulty of the query to be generated, which can be elementary school, high school, or college. Defaults to None, meaning that it will be randomly sampled.

high_score Optional[Literal['4', '4.5', '5']]

The high score of the query to be generated, which can be 4, 4.5, or 5. Defaults to None, meaning that it will be randomly sampled.

low_score Optional[Literal['2.5', '3', '3.5']]

The low score of the query to be generated, which can be 2.5, 3, or 3.5. Defaults to None, meaning that it will be randomly sampled.

seed int

The random seed to be set in case there's any sampling within the format_input method.

Output columns
  • S1 (str): the first sentence generated by the LLM.
  • S2 (str): the second sentence generated by the LLM.
  • S3 (str): the third sentence generated by the LLM.
  • model_name (str): the name of the model used to generate the bitext retrieval data.

Examples:

Generate bitext retrieval data for training embedding models:

from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import BitextRetrievalGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = BitextRetrievalGenerator(\n        source_language=\"English\",\n        target_language=\"Spanish\",\n        unit=\"sentence\",\n        difficulty=\"elementary school\",\n        high_score=\"4\",\n        low_score=\"2.5\",\n        llm=...,\n    )\n\n    ...\n\n    task >> ...\n
Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
class BitextRetrievalGenerator(_EmbeddingDataGenerator):\n    \"\"\"Generate bitext retrieval data with an `LLM` to later on train an embedding model.\n\n    `BitextRetrievalGenerator` is a `GeneratorTask` that generates bitext retrieval data with an\n    `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n    Text Embeddings with Large Language Models\" and the data is generated based on the\n    provided attributes, or randomly sampled if not provided.\n\n    Attributes:\n        source_language: The source language of the data to be generated, which can be any of the languages\n            retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n        target_language: The target language of the data to be generated, which can be any of the languages\n            retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n        unit: The unit of the data to be generated, which can be `sentence`, `phrase`, or `passage`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        difficulty: The difficulty of the query to be generated, which can be `elementary school`, `high school`, or `college`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        high_score: The high score of the query to be generated, which can be `4`, `4.5`, or `5`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        low_score: The low score of the query to be generated, which can be `2.5`, `3`, or `3.5`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        seed: The random seed to be set in case there's any sampling within the `format_input` method.\n\n    Output columns:\n        - S1 (`str`): the first sentence generated by the `LLM`.\n        - S2 (`str`): the second sentence generated by the `LLM`.\n        - S3 (`str`): the third sentence 
generated by the `LLM`.\n        - model_name (`str`): the name of the model used to generate the bitext retrieval\n            data.\n\n    Examples:\n        Generate bitext retrieval data for training embedding models:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps.tasks import BitextRetrievalGenerator\n\n        with Pipeline(\"my-pipeline\") as pipeline:\n            task = BitextRetrievalGenerator(\n                source_language=\"English\",\n                target_language=\"Spanish\",\n                unit=\"sentence\",\n                difficulty=\"elementary school\",\n                high_score=\"4\",\n                low_score=\"2.5\",\n                llm=...,\n            )\n\n            ...\n\n            task >> ...\n        ```\n    \"\"\"\n\n    source_language: str = Field(\n        default=\"English\",\n        description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n    )\n    target_language: str = Field(\n        default=...,\n        description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n    )\n\n    unit: Optional[Literal[\"sentence\", \"phrase\", \"passage\"]] = None\n    difficulty: Optional[Literal[\"elementary school\", \"high school\", \"college\"]] = None\n    high_score: Optional[Literal[\"4\", \"4.5\", \"5\"]] = None\n    low_score: Optional[Literal[\"2.5\", \"3\", \"3.5\"]] = None\n\n    _template_name: str = PrivateAttr(default=\"bitext-retrieval\")\n    _can_be_used_with_offline_batch_generation = True\n\n    @property\n    def prompt(self) -> ChatType:\n        \"\"\"Contains the `prompt` to be used in the `process` method, rendering the `_template`; and\n        formatted as an OpenAI formatted chat i.e. 
a `ChatType`, assuming that there's only one turn,\n        being from the user with the content being the rendered `_template`.\n        \"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    source_language=self.source_language,\n                    target_language=self.target_language,\n                    unit=self.unit or random.choice([\"sentence\", \"phrase\", \"passage\"]),\n                    difficulty=self.difficulty\n                    or random.choice([\"elementary school\", \"high school\", \"college\"]),\n                    high_score=self.high_score or random.choice([\"4\", \"4.5\", \"5\"]),\n                    low_score=self.low_score or random.choice([\"2.5\", \"3\", \"3.5\"]),\n                ).strip(),\n            }\n        ]  # type: ignore\n\n    @property\n    def keys(self) -> List[str]:\n        \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n        return [\"S1\", \"S2\", \"S3\"]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.BitextRetrievalGenerator.prompt","title":"prompt: ChatType property","text":"

Contains the prompt to be used in the process method, rendering the _template; and formatted as an OpenAI formatted chat i.e. a ChatType, assuming that there's only one turn, being from the user with the content being the rendered _template.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.BitextRetrievalGenerator.keys","title":"keys: List[str] property","text":"

Contains the keys that will be parsed from the LLM output into a Python dict.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateLongTextMatchingData","title":"GenerateLongTextMatchingData","text":"

Bases: _EmbeddingDataGeneration

Generate long text matching data with an LLM to later on train an embedding model.

GenerateLongTextMatchingData is a Task that generates long text matching data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

Note

Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-matching-long\"; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-matching-long category.

Attributes:

Name Type Description language str

The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

seed int

The random seed to be set in case there's any sampling within the format_input method. Note that in this task the seed has no effect since there are no sampling params.

Input columns
  • task (str): The task description to be used in the generation.
Output columns
  • input (str): the input generated by the LLM.
  • positive_document (str): the positive document generated by the LLM.
  • model_name (str): the name of the model used to generate the long text matching data.
References
  • Improving Text Embeddings with Large Language Models

Examples:

Generate synthetic long text matching data for training embedding models:

from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateLongTextMatchingData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = EmbeddingTaskGenerator(\n        category=\"text-matching-long\",\n        flatten_tasks=True,\n        llm=...,  # LLM instance\n    )\n\n    generate = GenerateLongTextMatchingData(\n        language=\"English\",\n        llm=...,  # LLM instance\n    )\n\n    task >> generate\n
Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
class GenerateLongTextMatchingData(_EmbeddingDataGeneration):\n    \"\"\"Generate long text matching data with an `LLM` to later on train an embedding model.\n\n    `GenerateLongTextMatchingData` is a `Task` that generates long text matching data with an\n    `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n    Text Embeddings with Large Language Models\" and the data is generated based on the\n    provided attributes, or randomly sampled if not provided.\n\n    Note:\n        Ideally this task should be used with `EmbeddingTaskGenerator` with `flatten_tasks=True`\n        with the `category=\"text-matching-long\"`; so that the `LLM` generates a list of tasks that\n        are flattened so that each row contains a single task for the text-matching-long category.\n\n    Attributes:\n        language: The language of the data to be generated, which can be any of the languages\n            retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n        seed: The random seed to be set in case there's any sampling within the `format_input` method.\n            Note that in this task the `seed` has no effect since there are no sampling params.\n\n    Input columns:\n        - task (`str`): The task description to be used in the generation.\n\n    Output columns:\n        - input (`str`): the input generated by the `LLM`.\n        - positive_document (`str`): the positive document generated by the `LLM`.\n        - model_name (`str`): the name of the model used to generate the long text matching\n            data.\n\n    References:\n        - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368)\n\n    Examples:\n        Generate synthetic long text matching data for training embedding models:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps.tasks import EmbeddingTaskGenerator, 
GenerateLongTextMatchingData\n\n        with Pipeline(\"my-pipeline\") as pipeline:\n            task = EmbeddingTaskGenerator(\n                category=\"text-matching-long\",\n                flatten_tasks=True,\n                llm=...,  # LLM instance\n            )\n\n            generate = GenerateLongTextMatchingData(\n                language=\"English\",\n                llm=...,  # LLM instance\n            )\n\n            task >> generate\n        ```\n    \"\"\"\n\n    language: str = Field(\n        default=\"English\",\n        description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n    )\n\n    _template_name: str = PrivateAttr(default=\"long-text-matching\")\n    _can_be_used_with_offline_batch_generation = True\n\n    def format_input(self, input: Dict[str, Any]) -> ChatType:\n        \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n        randomly sampling those if not provided. This method will render the `_template` with\n        the provided arguments and return an OpenAI formatted chat i.e. 
a `ChatType`, assuming that\n        there's only one turn, being from the user with the content being the rendered `_template`.\n\n        Args:\n            input: The input dictionary containing the `task` to be used in the `_template`.\n\n        Returns:\n            A list with a single chat containing the user's message with the rendered `_template`.\n        \"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    task=input[\"task\"],\n                    language=self.language,\n                ).strip(),\n            }\n        ]\n\n    @property\n    def keys(self) -> List[str]:\n        \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n        return [\"input\", \"positive_document\"]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateLongTextMatchingData.keys","title":"keys: List[str] property","text":"

Contains the keys that will be parsed from the LLM output into a Python dict.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateLongTextMatchingData.format_input","title":"format_input(input)","text":"

Method to format the input based on the task and the provided attributes, or just randomly sampling those if not provided. This method will render the _template with the provided arguments and return an OpenAI formatted chat i.e. a ChatType, assuming that there's only one turn, being from the user with the content being the rendered _template.

Parameters:

Name Type Description Default input Dict[str, Any]

The input dictionary containing the task to be used in the _template.

required

Returns:

Type Description ChatType

A list with a single chat containing the user's message with the rendered _template.

Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
def format_input(self, input: Dict[str, Any]) -> ChatType:\n    \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n    randomly sampling those if not provided. This method will render the `_template` with\n    the provided arguments and return an OpenAI formatted chat i.e. a `ChatType`, assuming that\n    there's only one turn, being from the user with the content being the rendered `_template`.\n\n    Args:\n        input: The input dictionary containing the `task` to be used in the `_template`.\n\n    Returns:\n        A list with a single chat containing the user's message with the rendered `_template`.\n    \"\"\"\n    return [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(  # type: ignore\n                task=input[\"task\"],\n                language=self.language,\n            ).strip(),\n        }\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateShortTextMatchingData","title":"GenerateShortTextMatchingData","text":"

Bases: _EmbeddingDataGeneration

Generate short text matching data with an LLM to later on train an embedding model.

GenerateShortTextMatchingData is a Task that generates short text matching data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

Note

Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-matching-short\"; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-matching-short category.

Attributes:

Name Type Description language str

The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

seed int

The random seed to be set in case there's any sampling within the format_input method. Note that in this task the seed has no effect since there are no sampling params.

Input columns
  • task (str): The task description to be used in the generation.
Output columns
  • input (str): the input generated by the LLM.
  • positive_document (str): the positive document generated by the LLM.
  • model_name (str): the name of the model used to generate the short text matching data.
References
  • Improving Text Embeddings with Large Language Models

Examples:

Generate synthetic short text matching data for training embedding models:

from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateShortTextMatchingData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = EmbeddingTaskGenerator(\n        category=\"text-matching-short\",\n        flatten_tasks=True,\n        llm=...,  # LLM instance\n    )\n\n    generate = GenerateShortTextMatchingData(\n        language=\"English\",\n        llm=...,  # LLM instance\n    )\n\n    task >> generate\n
Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
class GenerateShortTextMatchingData(_EmbeddingDataGeneration):\n    \"\"\"Generate short text matching data with an `LLM` to later on train an embedding model.\n\n    `GenerateShortTextMatchingData` is a `Task` that generates short text matching data with an\n    `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n    Text Embeddings with Large Language Models\" and the data is generated based on the\n    provided attributes, or randomly sampled if not provided.\n\n    Note:\n        Ideally this task should be used with `EmbeddingTaskGenerator` with `flatten_tasks=True`\n        with the `category=\"text-matching-short\"`; so that the `LLM` generates a list of tasks that\n        are flattened so that each row contains a single task for the text-matching-short category.\n\n    Attributes:\n        language: The language of the data to be generated, which can be any of the languages\n            retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n        seed: The random seed to be set in case there's any sampling within the `format_input` method.\n            Note that in this task the `seed` has no effect since there are no sampling params.\n\n    Input columns:\n        - task (`str`): The task description to be used in the generation.\n\n    Output columns:\n        - input (`str`): the input generated by the `LLM`.\n        - positive_document (`str`): the positive document generated by the `LLM`.\n        - model_name (`str`): the name of the model used to generate the short text matching\n            data.\n\n    References:\n        - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368)\n\n    Examples:\n        Generate synthetic short text matching data for training embedding models:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps.tasks import EmbeddingTaskGenerator, 
GenerateShortTextMatchingData\n\n        with Pipeline(\"my-pipeline\") as pipeline:\n            task = EmbeddingTaskGenerator(\n                category=\"text-matching-short\",\n                flatten_tasks=True,\n                llm=...,  # LLM instance\n            )\n\n            generate = GenerateShortTextMatchingData(\n                language=\"English\",\n                llm=...,  # LLM instance\n            )\n\n            task >> generate\n        ```\n    \"\"\"\n\n    language: str = Field(\n        default=\"English\",\n        description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n    )\n\n    _template_name: str = PrivateAttr(default=\"short-text-matching\")\n    _can_be_used_with_offline_batch_generation = True\n\n    def format_input(self, input: Dict[str, Any]) -> ChatType:\n        \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n        randomly sampling those if not provided. This method will render the `_template` with\n                the provided arguments and return an OpenAI formatted chat i.e. 
a `ChatType`, assuming that\n                there's only one turn, being from the user with the content being the rendered `_template`.\n\n                Args:\n                    input: The input dictionary containing the `task` to be used in the `_template`.\n\n                Returns:\n                    A list with a single chat containing the user's message with the rendered `_template`.\n        \"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    task=input[\"task\"],\n                    language=self.language,\n                ).strip(),\n            }\n        ]\n\n    @property\n    def keys(self) -> List[str]:\n        \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n        return [\"input\", \"positive_document\"]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateShortTextMatchingData.keys","title":"keys: List[str] property","text":"

Contains the keys that will be parsed from the LLM output into a Python dict.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateShortTextMatchingData.format_input","title":"format_input(input)","text":"

Method to format the input based on the task and the provided attributes, or just randomly sampling those if not provided. This method will render the _template with the provided arguments and return an OpenAI formatted chat i.e. a ChatType, assuming that there's only one turn, being from the user with the content being the rendered _template.

Parameters:

Name Type Description Default input Dict[str, Any]

The input dictionary containing the task to be used in the _template.

required

Returns:

Type Description ChatType

A list with a single chat containing the user's message with the rendered _template.

Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
def format_input(self, input: Dict[str, Any]) -> ChatType:\n    \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n    randomly sampling those if not provided. This method will render the `_template` with\n            the provided arguments and return an OpenAI formatted chat i.e. a `ChatType`, assuming that\n            there's only one turn, being from the user with the content being the rendered `_template`.\n\n            Args:\n                input: The input dictionary containing the `task` to be used in the `_template`.\n\n            Returns:\n                A list with a single chat containing the user's message with the rendered `_template`.\n    \"\"\"\n    return [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(  # type: ignore\n                task=input[\"task\"],\n                language=self.language,\n            ).strip(),\n        }\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextClassificationData","title":"GenerateTextClassificationData","text":"

Bases: _EmbeddingDataGeneration

Generate text classification data with an LLM to later on train an embedding model.

GenerateTextClassificationData is a Task that generates text classification data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

Note

Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-classification\"; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-classification category.

Attributes:

Name Type Description language str

The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

difficulty Optional[Literal['high school', 'college', 'PhD']]

The difficulty of the query to be generated, which can be high school, college, or PhD. Defaults to None, meaning that it will be randomly sampled.

clarity Optional[Literal['clear', 'understandable with some effort', 'ambiguous']]

The clarity of the query to be generated, which can be clear, understandable with some effort, or ambiguous. Defaults to None, meaning that it will be randomly sampled.

seed int

The random seed to be set in case there's any sampling within the format_input method.

Input columns
  • task (str): The task description to be used in the generation.
Output columns
  • input_text (str): the input text generated by the LLM.
  • label (str): the label generated by the LLM.
  • misleading_label (str): the misleading label generated by the LLM.
  • model_name (str): the name of the model used to generate the text classification data.
References
  • Improving Text Embeddings with Large Language Models

Examples:

Generate synthetic text classification data for training embedding models:

from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextClassificationData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = EmbeddingTaskGenerator(\n        category=\"text-classification\",\n        flatten_tasks=True,\n        llm=...,  # LLM instance\n    )\n\n    generate = GenerateTextClassificationData(\n        language=\"English\",\n        difficulty=\"high school\",\n        clarity=\"clear\",\n        llm=...,  # LLM instance\n    )\n\n    task >> generate\n
Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
class GenerateTextClassificationData(_EmbeddingDataGeneration):\n    \"\"\"Generate text classification data with an `LLM` to later on train an embedding model.\n\n    `GenerateTextClassificationData` is a `Task` that generates text classification data with an\n    `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n    Text Embeddings with Large Language Models\" and the data is generated based on the\n    provided attributes, or randomly sampled if not provided.\n\n    Note:\n        Ideally this task should be used with `EmbeddingTaskGenerator` with `flatten_tasks=True`\n        with the `category=\"text-classification\"`; so that the `LLM` generates a list of tasks that\n        are flattened so that each row contains a single task for the text-classification category.\n\n    Attributes:\n        language: The language of the data to be generated, which can be any of the languages\n            retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n        difficulty: The difficulty of the query to be generated, which can be `high school`, `college`, or `PhD`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        clarity: The clarity of the query to be generated, which can be `clear`, `understandable with some effort`,\n            or `ambiguous`. 
Defaults to `None`, meaning that it will be randomly sampled.\n        seed: The random seed to be set in case there's any sampling within the `format_input` method.\n\n    Input columns:\n        - task (`str`): The task description to be used in the generation.\n\n    Output columns:\n        - input_text (`str`): the input text generated by the `LLM`.\n        - label (`str`): the label generated by the `LLM`.\n        - misleading_label (`str`): the misleading label generated by the `LLM`.\n        - model_name (`str`): the name of the model used to generate the text classification\n            data.\n\n    References:\n        - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368)\n\n    Examples:\n        Generate synthetic text classification data for training embedding models:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextClassificationData\n\n        with Pipeline(\"my-pipeline\") as pipeline:\n            task = EmbeddingTaskGenerator(\n                category=\"text-classification\",\n                flatten_tasks=True,\n                llm=...,  # LLM instance\n            )\n\n            generate = GenerateTextClassificationData(\n                language=\"English\",\n                difficulty=\"high school\",\n                clarity=\"clear\",\n                llm=...,  # LLM instance\n            )\n\n            task >> generate\n        ```\n    \"\"\"\n\n    language: str = Field(\n        default=\"English\",\n        description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n    )\n\n    difficulty: Optional[Literal[\"high school\", \"college\", \"PhD\"]] = None\n    clarity: Optional[\n        Literal[\"clear\", \"understandable with some effort\", \"ambiguous\"]\n    ] = None\n\n    _template_name: str = 
PrivateAttr(default=\"text-classification\")\n    _can_be_used_with_offline_batch_generation = True\n\n    def format_input(self, input: Dict[str, Any]) -> ChatType:\n        \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n        randomly sampling those if not provided. This method will render the `_template` with\n        the provided arguments and return an OpenAI formatted chat i.e. a `ChatType`, assuming that\n        there's only one turn, being from the user with the content being the rendered `_template`.\n\n        Args:\n            input: The input dictionary containing the `task` to be used in the `_template`.\n\n        Returns:\n            A list with a single chat containing the user's message with the rendered `_template`.\n        \"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    task=input[\"task\"],\n                    language=self.language,\n                    difficulty=self.difficulty\n                    or random.choice([\"high school\", \"college\", \"PhD\"]),\n                    clarity=self.clarity\n                    or random.choice(\n                        [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n                    ),\n                ).strip(),\n            }\n        ]\n\n    @property\n    def keys(self) -> List[str]:\n        \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n        return [\"input_text\", \"label\", \"misleading_label\"]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextClassificationData.keys","title":"keys: List[str] property","text":"

Contains the keys that will be parsed from the LLM output into a Python dict.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextClassificationData.format_input","title":"format_input(input)","text":"

Method to format the input based on the task and the provided attributes, or just randomly sampling those if not provided. This method will render the _template with the provided arguments and return an OpenAI formatted chat i.e. a ChatType, assuming that there's only one turn, being from the user with the content being the rendered _template.

Parameters:

Name Type Description Default input Dict[str, Any]

The input dictionary containing the task to be used in the _template.

required

Returns:

Type Description ChatType

A list with a single chat containing the user's message with the rendered _template.

Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
def format_input(self, input: Dict[str, Any]) -> ChatType:\n    \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n    randomly sampling those if not provided. This method will render the `_template` with\n    the provided arguments and return an OpenAI formatted chat i.e. a `ChatType`, assuming that\n    there's only one turn, being from the user with the content being the rendered `_template`.\n\n    Args:\n        input: The input dictionary containing the `task` to be used in the `_template`.\n\n    Returns:\n        A list with a single chat containing the user's message with the rendered `_template`.\n    \"\"\"\n    return [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(  # type: ignore\n                task=input[\"task\"],\n                language=self.language,\n                difficulty=self.difficulty\n                or random.choice([\"high school\", \"college\", \"PhD\"]),\n                clarity=self.clarity\n                or random.choice(\n                    [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n                ),\n            ).strip(),\n        }\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextRetrievalData","title":"GenerateTextRetrievalData","text":"

Bases: _EmbeddingDataGeneration

Generate text retrieval data with an LLM to later on train an embedding model.

GenerateTextRetrievalData is a Task that generates text retrieval data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

Note

Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-retrieval\"; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-retrieval category.

Attributes:

Name Type Description language str

The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

query_type Optional[Literal['extremely long-tail', 'long-tail', 'common']]

The type of query to be generated, which can be extremely long-tail, long-tail, or common. Defaults to None, meaning that it will be randomly sampled.

query_length Optional[Literal['less than 5 words', '5 to 15 words', 'at least 10 words']]

The length of the query to be generated, which can be less than 5 words, 5 to 15 words, or at least 10 words. Defaults to None, meaning that it will be randomly sampled.

difficulty Optional[Literal['high school', 'college', 'PhD']]

The difficulty of the query to be generated, which can be high school, college, or PhD. Defaults to None, meaning that it will be randomly sampled.

clarity Optional[Literal['clear', 'understandable with some effort', 'ambiguous']]

The clarity of the query to be generated, which can be clear, understandable with some effort, or ambiguous. Defaults to None, meaning that it will be randomly sampled.

num_words Optional[Literal[50, 100, 200, 300, 400, 500]]

The number of words in the query to be generated, which can be 50, 100, 200, 300, 400, or 500. Defaults to None, meaning that it will be randomly sampled.

seed int

The random seed to be set in case there's any sampling within the format_input method.

Input columns
  • task (str): The task description to be used in the generation.
Output columns
  • user_query (str): the user query generated by the LLM.
  • positive_document (str): the positive document generated by the LLM.
  • hard_negative_document (str): the hard negative document generated by the LLM.
  • model_name (str): the name of the model used to generate the text retrieval data.
References
  • Improving Text Embeddings with Large Language Models

Examples:

Generate synthetic text retrieval data for training embedding models:

from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextRetrievalData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = EmbeddingTaskGenerator(\n        category=\"text-retrieval\",\n        flatten_tasks=True,\n        llm=...,  # LLM instance\n    )\n\n    generate = GenerateTextRetrievalData(\n        language=\"English\",\n        query_type=\"common\",\n        query_length=\"5 to 15 words\",\n        difficulty=\"high school\",\n        clarity=\"clear\",\n        num_words=100,\n        llm=...,  # LLM instance\n    )\n\n    task >> generate\n
Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
class GenerateTextRetrievalData(_EmbeddingDataGeneration):\n    \"\"\"Generate text retrieval data with an `LLM` to later on train an embedding model.\n\n    `GenerateTextRetrievalData` is a `Task` that generates text retrieval data with an\n    `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n    Text Embeddings with Large Language Models\" and the data is generated based on the\n    provided attributes, or randomly sampled if not provided.\n\n    Note:\n        Ideally this task should be used with `EmbeddingTaskGenerator` with `flatten_tasks=True`\n        with the `category=\"text-retrieval\"`; so that the `LLM` generates a list of tasks that\n        are flattened so that each row contains a single task for the text-retrieval category.\n\n    Attributes:\n        language: The language of the data to be generated, which can be any of the languages\n            retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n        query_type: The type of query to be generated, which can be `extremely long-tail`, `long-tail`,\n            or `common`. Defaults to `None`, meaning that it will be randomly sampled.\n        query_length: The length of the query to be generated, which can be `less than 5 words`, `5 to 15 words`,\n            or `at least 10 words`. Defaults to `None`, meaning that it will be randomly sampled.\n        difficulty: The difficulty of the query to be generated, which can be `high school`, `college`, or `PhD`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        clarity: The clarity of the query to be generated, which can be `clear`, `understandable with some effort`,\n            or `ambiguous`. 
Defaults to `None`, meaning that it will be randomly sampled.\n        num_words: The number of words in the query to be generated, which can be `50`, `100`, `200`, `300`, `400`, or `500`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        seed: The random seed to be set in case there's any sampling within the `format_input` method.\n\n    Input columns:\n        - task (`str`): The task description to be used in the generation.\n\n    Output columns:\n        - user_query (`str`): the user query generated by the `LLM`.\n        - positive_document (`str`): the positive document generated by the `LLM`.\n        - hard_negative_document (`str`): the hard negative document generated by the `LLM`.\n        - model_name (`str`): the name of the model used to generate the text retrieval data.\n\n    References:\n        - [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368)\n\n    Examples:\n        Generate synthetic text retrieval data for training embedding models:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextRetrievalData\n\n        with Pipeline(\"my-pipeline\") as pipeline:\n            task = EmbeddingTaskGenerator(\n                category=\"text-retrieval\",\n                flatten_tasks=True,\n                llm=...,  # LLM instance\n            )\n\n            generate = GenerateTextRetrievalData(\n                language=\"English\",\n                query_type=\"common\",\n                query_length=\"5 to 15 words\",\n                difficulty=\"high school\",\n                clarity=\"clear\",\n                num_words=100,\n                llm=...,  # LLM instance\n            )\n\n            task >> generate\n        ```\n    \"\"\"\n\n    language: str = Field(\n        default=\"English\",\n        description=\"The languages are retrieved from the list of XLM-R in the 
Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n    )\n\n    query_type: Optional[Literal[\"extremely long-tail\", \"long-tail\", \"common\"]] = None\n    query_length: Optional[\n        Literal[\"less than 5 words\", \"5 to 15 words\", \"at least 10 words\"]\n    ] = None\n    difficulty: Optional[Literal[\"high school\", \"college\", \"PhD\"]] = None\n    clarity: Optional[\n        Literal[\"clear\", \"understandable with some effort\", \"ambiguous\"]\n    ] = None\n    num_words: Optional[Literal[50, 100, 200, 300, 400, 500]] = None\n\n    _template_name: str = PrivateAttr(default=\"text-retrieval\")\n    _can_be_used_with_offline_batch_generation = True\n\n    def format_input(self, input: Dict[str, Any]) -> ChatType:\n        \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n        randomly sampling those if not provided. This method will render the `_template` with\n        the provided arguments and return an OpenAI formatted chat i.e. 
a `ChatType`, assuming that\n        there's only one turn, being from the user with the content being the rendered `_template`.\n\n        Args:\n            input: The input dictionary containing the `task` to be used in the `_template`.\n\n        Returns:\n            A list with a single chat containing the user's message with the rendered `_template`.\n        \"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    task=input[\"task\"],\n                    language=self.language,\n                    query_type=self.query_type\n                    or random.choice([\"extremely long-tail\", \"long-tail\", \"common\"]),\n                    query_length=self.query_length\n                    or random.choice(\n                        [\"less than 5 words\", \"5 to 15 words\", \"at least 10 words\"]\n                    ),\n                    difficulty=self.difficulty\n                    or random.choice([\"high school\", \"college\", \"PhD\"]),\n                    clarity=self.clarity\n                    or random.choice(\n                        [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n                    ),\n                    num_words=self.num_words\n                    or random.choice([50, 100, 200, 300, 400, 500]),\n                ).strip(),\n            }\n        ]\n\n    @property\n    def keys(self) -> List[str]:\n        \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n        return [\n            \"user_query\",\n            \"positive_document\",\n            \"hard_negative_document\",\n        ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextRetrievalData.keys","title":"keys: List[str] property","text":"

Contains the keys that will be parsed from the LLM output into a Python dict.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateTextRetrievalData.format_input","title":"format_input(input)","text":"

Method to format the input based on the task and the provided attributes, or just randomly sampling those if not provided. This method will render the _template with the provided arguments and return an OpenAI formatted chat i.e. a ChatType, assuming that there's only one turn, being from the user with the content being the rendered _template.

Parameters:

Name Type Description Default input Dict[str, Any]

The input dictionary containing the task to be used in the _template.

required

Returns:

Type Description ChatType

A list with a single chat containing the user's message with the rendered _template.

Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
def format_input(self, input: Dict[str, Any]) -> ChatType:\n    \"\"\"Method to format the input based on the `task` and the provided attributes, or just\n    randomly sampling those if not provided. This method will render the `_template` with\n    the provided arguments and return an OpenAI formatted chat i.e. a `ChatType`, assuming that\n    there's only one turn, being from the user with the content being the rendered `_template`.\n\n    Args:\n        input: The input dictionary containing the `task` to be used in the `_template`.\n\n    Returns:\n        A list with a single chat containing the user's message with the rendered `_template`.\n    \"\"\"\n    return [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(  # type: ignore\n                task=input[\"task\"],\n                language=self.language,\n                query_type=self.query_type\n                or random.choice([\"extremely long-tail\", \"long-tail\", \"common\"]),\n                query_length=self.query_length\n                or random.choice(\n                    [\"less than 5 words\", \"5 to 15 words\", \"at least 10 words\"]\n                ),\n                difficulty=self.difficulty\n                or random.choice([\"high school\", \"college\", \"PhD\"]),\n                clarity=self.clarity\n                or random.choice(\n                    [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n                ),\n                num_words=self.num_words\n                or random.choice([50, 100, 200, 300, 400, 500]),\n            ).strip(),\n        }\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MonolingualTripletGenerator","title":"MonolingualTripletGenerator","text":"

Bases: _EmbeddingDataGenerator

Generate monolingual triplets with an LLM to later on train an embedding model.

MonolingualTripletGenerator is a GeneratorTask that generates monolingual triplets with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

Attributes:

Name Type Description language str

The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

unit Optional[Literal['sentence', 'phrase', 'passage']]

The unit of the data to be generated, which can be sentence, phrase, or passage. Defaults to None, meaning that it will be randomly sampled.

difficulty Optional[Literal['elementary school', 'high school', 'college']]

The difficulty of the query to be generated, which can be elementary school, high school, or college. Defaults to None, meaning that it will be randomly sampled.

high_score Optional[Literal['4', '4.5', '5']]

The high score of the query to be generated, which can be 4, 4.5, or 5. Defaults to None, meaning that it will be randomly sampled.

low_score Optional[Literal['2.5', '3', '3.5']]

The low score of the query to be generated, which can be 2.5, 3, or 3.5. Defaults to None, meaning that it will be randomly sampled.

seed int

The random seed to be set in case there's any sampling within the prompt property.

Output columns
  • S1 (str): the first sentence generated by the LLM.
  • S2 (str): the second sentence generated by the LLM.
  • S3 (str): the third sentence generated by the LLM.
  • model_name (str): the name of the model used to generate the monolingual triplets.

Examples:

Generate monolingual triplets for training embedding models:

from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import MonolingualTripletGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = MonolingualTripletGenerator(\n        language=\"English\",\n        unit=\"sentence\",\n        difficulty=\"elementary school\",\n        high_score=\"4\",\n        low_score=\"2.5\",\n        llm=...,\n    )\n\n    ...\n\n    task >> ...\n
Source code in src/distilabel/steps/tasks/improving_text_embeddings.py
class MonolingualTripletGenerator(_EmbeddingDataGenerator):\n    \"\"\"Generate monolingual triplets with an `LLM` to later on train an embedding model.\n\n    `MonolingualTripletGenerator` is a `GeneratorTask` that generates monolingual triplets with an\n    `LLM` to later on train an embedding model. The task is based on the paper \"Improving\n    Text Embeddings with Large Language Models\" and the data is generated based on the\n    provided attributes, or randomly sampled if not provided.\n\n    Attributes:\n        language: The language of the data to be generated, which can be any of the languages\n            retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.\n        unit: The unit of the data to be generated, which can be `sentence`, `phrase`, or `passage`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        difficulty: The difficulty of the query to be generated, which can be `elementary school`, `high school`, or `college`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        high_score: The high score of the query to be generated, which can be `4`, `4.5`, or `5`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        low_score: The low score of the query to be generated, which can be `2.5`, `3`, or `3.5`.\n            Defaults to `None`, meaning that it will be randomly sampled.\n        seed: The random seed to be set in case there's any sampling within the `format_input` method.\n\n    Output columns:\n        - S1 (`str`): the first sentence generated by the `LLM`.\n        - S2 (`str`): the second sentence generated by the `LLM`.\n        - S3 (`str`): the third sentence generated by the `LLM`.\n        - model_name (`str`): the name of the model used to generate the monolingual triplets.\n\n    Examples:\n        Generate monolingual triplets for training embedding models:\n\n        ```python\n        
from distilabel.pipeline import Pipeline\n        from distilabel.steps.tasks import MonolingualTripletGenerator\n\n        with Pipeline(\"my-pipeline\") as pipeline:\n            task = MonolingualTripletGenerator(\n                language=\"English\",\n                unit=\"sentence\",\n                difficulty=\"elementary school\",\n                high_score=\"4\",\n                low_score=\"2.5\",\n                llm=...,\n            )\n\n            ...\n\n            task >> ...\n        ```\n    \"\"\"\n\n    language: str = Field(\n        default=\"English\",\n        description=\"The languages are retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf\",\n    )\n\n    unit: Optional[Literal[\"sentence\", \"phrase\", \"passage\"]] = None\n    difficulty: Optional[Literal[\"elementary school\", \"high school\", \"college\"]] = None\n    high_score: Optional[Literal[\"4\", \"4.5\", \"5\"]] = None\n    low_score: Optional[Literal[\"2.5\", \"3\", \"3.5\"]] = None\n\n    _template_name: str = PrivateAttr(default=\"monolingual-triplet\")\n    _can_be_used_with_offline_batch_generation = True\n\n    @property\n    def prompt(self) -> ChatType:\n        \"\"\"Contains the `prompt` to be used in the `process` method, rendering the `_template`; and\n        formatted as an OpenAI formatted chat i.e. 
a `ChatType`, assuming that there's only one turn,\n        being from the user with the content being the rendered `_template`.\n        \"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    language=self.language,\n                    unit=self.unit or random.choice([\"sentence\", \"phrase\", \"passage\"]),\n                    difficulty=self.difficulty\n                    or random.choice([\"elementary school\", \"high school\", \"college\"]),\n                    high_score=self.high_score or random.choice([\"4\", \"4.5\", \"5\"]),\n                    low_score=self.low_score or random.choice([\"2.5\", \"3\", \"3.5\"]),\n                ).strip(),\n            }\n        ]  # type: ignore\n\n    @property\n    def keys(self) -> List[str]:\n        \"\"\"Contains the `keys` that will be parsed from the `LLM` output into a Python dict.\"\"\"\n        return [\"S1\", \"S2\", \"S3\"]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MonolingualTripletGenerator.prompt","title":"prompt: ChatType property","text":"

Contains the prompt to be used in the process method, rendering the _template; and formatted as an OpenAI formatted chat i.e. a ChatType, assuming that there's only one turn, being from the user with the content being the rendered _template.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MonolingualTripletGenerator.keys","title":"keys: List[str] property","text":"

Contains the keys that will be parsed from the LLM output into a Python dict.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation","title":"InstructionBacktranslation","text":"

Bases: Task

Self-Alignment with Instruction Backtranslation.

Attributes:

Name Type Description _template Optional[Template]

the Jinja2 template to use for the Instruction Backtranslation task.

Input columns
  • instruction (str): The reference instruction to evaluate the text output.
  • generation (str): The text output to evaluate for the given instruction.
Output columns
  • score (str): The score for the generation based on the given instruction.
  • reason (str): The reason for the provided score.
  • model_name (str): The model name used to score the generation.
Categories
  • critique
References
  • Self-Alignment with Instruction Backtranslation

Examples:

Generate a score and reason for a given instruction and generation:

from distilabel.steps.tasks import InstructionBacktranslation\n\ninstruction_backtranslation = InstructionBacktranslation(\n        name=\"instruction_backtranslation\",\n        llm=llm,\n        input_batch_size=10,\n        output_mappings={\"model_name\": \"scoring_model\"},\n    )\ninstruction_backtranslation.load()\n\nresult = next(\n    instruction_backtranslation.process(\n        [\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"generation\": \"4\",\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         \"instruction\": \"How much is 2+2?\",\n#         \"generation\": \"4\",\n#         \"score\": 3,\n#         \"reason\": \"Reason for the generation.\",\n#         \"model_name\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n#     }\n# ]\n
Citations
@misc{li2024selfalignmentinstructionbacktranslation,\n    title={Self-Alignment with Instruction Backtranslation},\n    author={Xian Li and Ping Yu and Chunting Zhou and Timo Schick and Omer Levy and Luke Zettlemoyer and Jason Weston and Mike Lewis},\n    year={2024},\n    eprint={2308.06259},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2308.06259},\n}\n
Source code in src/distilabel/steps/tasks/instruction_backtranslation.py
class InstructionBacktranslation(Task):\n    \"\"\"Self-Alignment with Instruction Backtranslation.\n\n    Attributes:\n        _template: the Jinja2 template to use for the Instruction Backtranslation task.\n\n    Input columns:\n        - instruction (`str`): The reference instruction to evaluate the text output.\n        - generation (`str`): The text output to evaluate for the given instruction.\n\n    Output columns:\n        - score (`str`): The score for the generation based on the given instruction.\n        - reason (`str`): The reason for the provided score.\n        - model_name (`str`): The model name used to score the generation.\n\n    Categories:\n        - critique\n\n    References:\n        - [`Self-Alignment with Instruction Backtranslation`](https://arxiv.org/abs/2308.06259)\n\n    Examples:\n        Generate a score and reason for a given instruction and generation:\n\n        ```python\n        from distilabel.steps.tasks import InstructionBacktranslation\n\n        instruction_backtranslation = InstructionBacktranslation(\n                name=\"instruction_backtranslation\",\n                llm=llm,\n                input_batch_size=10,\n                output_mappings={\"model_name\": \"scoring_model\"},\n            )\n        instruction_backtranslation.load()\n\n        result = next(\n            instruction_backtranslation.process(\n                [\n                    {\n                        \"instruction\": \"How much is 2+2?\",\n                        \"generation\": \"4\",\n                    }\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         \"instruction\": \"How much is 2+2?\",\n        #         \"generation\": \"4\",\n        #         \"score\": 3,\n        #         \"reason\": \"Reason for the generation.\",\n        #         \"model_name\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n        #     }\n        # ]\n        ```\n\n    Citations:\n        
```\n        @misc{li2024selfalignmentinstructionbacktranslation,\n            title={Self-Alignment with Instruction Backtranslation},\n            author={Xian Li and Ping Yu and Chunting Zhou and Timo Schick and Omer Levy and Luke Zettlemoyer and Jason Weston and Mike Lewis},\n            year={2024},\n            eprint={2308.06259},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2308.06259},\n        }\n        ```\n    \"\"\"\n\n    _template: Optional[\"Template\"] = PrivateAttr(default=...)\n    _can_be_used_with_offline_batch_generation = True\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"instruction-backtranslation.jinja2\"\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task is the `instruction`, and the `generation` for it.\"\"\"\n        return [\"instruction\", \"generation\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    instruction=input[\"instruction\"], generation=input[\"generation\"]\n                ),\n            },\n        ]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task is the `score`, `reason` and the `model_name`.\"\"\"\n        return [\"score\", \"reason\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, 
Any]\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a dictionary with the `score` and `reason`. The\n        `model_name` will be automatically included within the `process` method of `Task`.\n\n        Args:\n            output: a string representing the output of the LLM via the `process` method.\n            input: the input to the task, as required by some tasks to format the output.\n\n        Returns:\n            A dictionary containing the `score` and the `reason` for the provided `score`.\n        \"\"\"\n        pattern = r\"(.+?)Score: (\\d)\"\n\n        matches = None\n        if output is not None:\n            matches = re.findall(pattern, output, re.DOTALL)\n        if matches is None:\n            return {\"score\": None, \"reason\": None}\n\n        return {\n            \"score\": int(matches[0][1]),\n            \"reason\": matches[0][0].strip(),\n        }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.inputs","title":"inputs: List[str] property","text":"

The input for the task is the instruction, and the generation for it.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.outputs","title":"outputs: List[str] property","text":"

The output for the task is the score, reason and the model_name.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.load","title":"load()","text":"

Loads the Jinja2 template.

Source code in src/distilabel/steps/tasks/instruction_backtranslation.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"instruction-backtranslation.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/instruction_backtranslation.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    return [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(  # type: ignore\n                instruction=input[\"instruction\"], generation=input[\"generation\"]\n            ),\n        },\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.InstructionBacktranslation.format_output","title":"format_output(output, input)","text":"

The output is formatted as a dictionary with the score and reason. The model_name will be automatically included within the process method of Task.

Parameters:

Name Type Description Default output Union[str, None]

a string representing the output of the LLM via the process method.

required input Dict[str, Any]

the input to the task, as required by some tasks to format the output.

required

Returns:

Type Description Dict[str, Any]

A dictionary containing the score and the reason for the provided score.

Source code in src/distilabel/steps/tasks/instruction_backtranslation.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a dictionary with the `score` and `reason`. The\n    `model_name` will be automatically included within the `process` method of `Task`.\n\n    Args:\n        output: a string representing the output of the LLM via the `process` method.\n        input: the input to the task, as required by some tasks to format the output.\n\n    Returns:\n        A dictionary containing the `score` and the `reason` for the provided `score`.\n    \"\"\"\n    pattern = r\"(.+?)Score: (\\d)\"\n\n    matches = None\n    if output is not None:\n        matches = re.findall(pattern, output, re.DOTALL)\n    if matches is None:\n        return {\"score\": None, \"reason\": None}\n\n    return {\n        \"score\": int(matches[0][1]),\n        \"reason\": matches[0][0].strip(),\n    }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie","title":"Magpie","text":"

Bases: Task, MagpieBase

Generates conversations using an instruct fine-tuned LLM.

Magpie is a neat method that allows generating user instructions with no seed data or specific system prompt thanks to the autoregressive capabilities of the instruct fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the LLM without any user message, then the LLM will continue generating tokens as if it was the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM. After this instruct is generated, it can be sent again to the LLM to generate this time an assistant response. This process can be repeated N times allowing to build a multi-turn conversation. This method was described in the paper 'Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing'.

Attributes:

Name Type Description n_turns

the number of turns that the generated conversation will have. Defaults to 1.

end_with_user

whether the conversation should end with a user message. Defaults to False.

include_system_prompt

whether to include the system prompt used in the generated conversation. Defaults to False.

only_instruction

whether to generate only the instruction. If this argument is True, then n_turns will be ignored. Defaults to False.

system_prompt

an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Defaults to None.

Runtime parameters
  • n_turns: the number of turns that the generated conversation will have. Defaults to 1.
  • end_with_user: whether the conversation should end with a user message. Defaults to False.
  • include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False.
  • only_instruction: whether to generate only the instruction. If this argument is True, then n_turns will be ignored. Defaults to False.
  • system_prompt: an optional system prompt or list of system prompts that can be used to steer the LLM to generate content of a certain topic, guide the style, etc. If it's a list of system prompts, then a random system prompt will be chosen per input/output batch. If the provided inputs contain a system_prompt column, then this runtime parameter will be ignored and the one from the column will be used. Defaults to None.
  • system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic.
Input columns
  • system_prompt (str, optional): an optional system prompt that can be provided to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic.
Output columns
  • conversation (ChatType): the generated conversation which is a list of chat items with a role and a message. Only if only_instruction=False.
  • instruction (str): the generated instructions if only_instruction=True or n_turns==1.
  • response (str): the generated response if n_turns==1.
  • system_prompt_key (str, optional): the key of the system prompt used to generate the conversation or instruction. Only if system_prompt is a dictionary.
  • model_name (str): The model name used to generate the conversation or instruction.
Categories
  • text-generation
  • instruction
References
  • Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing

Examples:

Generating instructions with Llama 3 8B Instruct and TransformersLLM:

from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import Magpie\n\nmagpie = Magpie(\n    llm=TransformersLLM(\n        model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 1.0,\n            \"max_new_tokens\": 64,\n        },\n        device=\"mps\",\n    ),\n    only_instruction=True,\n)\n\nmagpie.load()\n\nresult = next(\n    magpie.process(\n        inputs=[\n            {\n                \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n            },\n            {\n                \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n            },\n        ]\n    )\n)\n# [\n#     {'instruction': \"That's me! I'd love some help with solving calculus problems! What kind of calculation are you most effective at? Linear Algebra, derivatives, integrals, optimization?\"},\n#     {'instruction': 'I was wondering if there are certain flowers and plants that can be used for pest control?'}\n# ]\n

Generating conversations with Llama 3 8B Instruct and TransformersLLM:

from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import Magpie\n\nmagpie = Magpie(\n    llm=TransformersLLM(\n        model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 1.0,\n            \"max_new_tokens\": 256,\n        },\n        device=\"mps\",\n    ),\n    n_turns=2,\n)\n\nmagpie.load()\n\nresult = next(\n    magpie.process(\n        inputs=[\n            {\n                \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n            },\n            {\n                \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n            },\n        ]\n    )\n)\n# [\n#     {\n#         'conversation': [\n#             {'role': 'system', 'content': \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"},\n#             {\n#                 'role': 'user',\n#                 'content': 'I'm having trouble solving the limits of functions in calculus. Could you explain how to work with them? Limits of functions are denoted by lim x\u2192a f(x) or lim x\u2192a [f(x)]. It is read as \"the limit as x approaches a of f\n# of x\".'\n#             },\n#             {\n#                 'role': 'assistant',\n#                 'content': 'Limits are indeed a fundamental concept in calculus, and understanding them can be a bit tricky at first, but don't worry, I'm here to help! The notation lim x\u2192a f(x) indeed means \"the limit as x approaches a of f of\n# x\". 
What it's asking us to do is find the'\n#             }\n#         ]\n#     },\n#     {\n#         'conversation': [\n#             {'role': 'system', 'content': \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"},\n#             {\n#                 'role': 'user',\n#                 'content': \"As a flower shop owner, I'm noticing some unusual worm-like creatures causing damage to my roses and other flowers. Can you help me identify what the problem is? Based on your expertise as a florist AI assistant, I think it\n# might be pests or diseases, but I'm not sure which.\"\n#             },\n#             {\n#                 'role': 'assistant',\n#                 'content': \"I'd be delighted to help you investigate the issue! Since you've noticed worm-like creatures damaging your roses and other flowers, I'll take a closer look at the possibilities. Here are a few potential culprits: 1.\n# **Aphids**: These small, soft-bodied insects can secrete a sticky substance called\"\n#             }\n#         ]\n#     }\n# ]\n
Source code in src/distilabel/steps/tasks/magpie/base.py
class Magpie(Task, MagpieBase):\n    \"\"\"Generates conversations using an instruct fine-tuned LLM.\n\n    Magpie is a neat method that allows generating user instructions with no seed data\n    or specific system prompt thanks to the autoregressive capabilities of the instruct\n    fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message\n    and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query\n    or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the\n    LLM without any user message, then the LLM will continue generating tokens as if it was\n    the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM.\n    After this instruct is generated, it can be sent again to the LLM to generate this time\n    an assistant response. This process can be repeated N times allowing to build a multi-turn\n    conversation. This method was described in the paper 'Magpie: Alignment Data Synthesis from\n    Scratch by Prompting Aligned LLMs with Nothing'.\n\n    Attributes:\n        n_turns: the number of turns that the generated conversation will have.\n            Defaults to `1`.\n        end_with_user: whether the conversation should end with a user message.\n            Defaults to `False`.\n        include_system_prompt: whether to include the system prompt used in the generated\n            conversation. Defaults to `False`.\n        only_instruction: whether to generate only the instruction. If this argument is\n            `True`, then `n_turns` will be ignored. Defaults to `False`.\n        system_prompt: an optional system prompt, or a list of system prompts from which\n            a random one will be chosen, or a dictionary of system prompts from which a\n            random one will be choosen, or a dictionary of system prompts with their probability\n            of being chosen. 
The random system prompt will be chosen per input/output batch.\n            This system prompt can be used to guide the generation of the instruct LLM and\n            steer it to generate instructions of a certain topic. Defaults to `None`.\n\n    Runtime parameters:\n        - `n_turns`: the number of turns that the generated conversation will have. Defaults\n            to `1`.\n        - `end_with_user`: whether the conversation should end with a user message.\n            Defaults to `False`.\n        - `include_system_prompt`: whether to include the system prompt used in the generated\n            conversation. Defaults to `False`.\n        - `only_instruction`: whether to generate only the instruction. If this argument is\n            `True`, then `n_turns` will be ignored. Defaults to `False`.\n        - `system_prompt`: an optional system prompt or list of system prompts that can\n            be used to steer the LLM to generate content of certain topic, guide the style,\n            etc. If it's a list of system prompts, then a random system prompt will be chosen\n            per input/output batch. If the provided inputs contains a `system_prompt` column,\n            then this runtime parameter will be ignored and the one from the column will\n            be used. Defaults to `None`.\n        - `system_prompt`: an optional system prompt, or a list of system prompts from which\n            a random one will be chosen, or a dictionary of system prompts from which a\n            random one will be choosen, or a dictionary of system prompts with their probability\n            of being chosen. 
The random system prompt will be chosen per input/output batch.\n            This system prompt can be used to guide the generation of the instruct LLM and\n            steer it to generate instructions of a certain topic.\n\n    Input columns:\n        - system_prompt (`str`, optional): an optional system prompt that can be provided\n            to guide the generation of the instruct LLM and steer it to generate instructions\n            of certain topic.\n\n    Output columns:\n        - conversation (`ChatType`): the generated conversation which is a list of chat\n            items with a role and a message. Only if `only_instruction=False`.\n        - instruction (`str`): the generated instructions if `only_instruction=True` or `n_turns==1`.\n        - response (`str`): the generated response if `n_turns==1`.\n        - system_prompt_key (`str`, optional): the key of the system prompt used to generate\n            the conversation or instruction. Only if `system_prompt` is a dictionary.\n        - model_name (`str`): The model name used to generate the `conversation` or `instruction`.\n\n    Categories:\n        - text-generation\n        - instruction\n\n    References:\n        - [Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing](https://arxiv.org/abs/2406.08464)\n\n    Examples:\n        Generating instructions with Llama 3 8B Instruct and TransformersLLM:\n\n        ```python\n        from distilabel.models import TransformersLLM\n        from distilabel.steps.tasks import Magpie\n\n        magpie = Magpie(\n            llm=TransformersLLM(\n                model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n                magpie_pre_query_template=\"llama3\",\n                generation_kwargs={\n                    \"temperature\": 1.0,\n                    \"max_new_tokens\": 64,\n                },\n                device=\"mps\",\n            ),\n            only_instruction=True,\n        )\n\n        
magpie.load()\n\n        result = next(\n            magpie.process(\n                inputs=[\n                    {\n                        \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n                    },\n                    {\n                        \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n                    },\n                ]\n            )\n        )\n        # [\n        #     {'instruction': \"That's me! I'd love some help with solving calculus problems! What kind of calculation are you most effective at? Linear Algebra, derivatives, integrals, optimization?\"},\n        #     {'instruction': 'I was wondering if there are certain flowers and plants that can be used for pest control?'}\n        # ]\n        ```\n\n        Generating conversations with Llama 3 8B Instruct and TransformersLLM:\n\n        ```python\n        from distilabel.models import TransformersLLM\n        from distilabel.steps.tasks import Magpie\n\n        magpie = Magpie(\n            llm=TransformersLLM(\n                model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n                magpie_pre_query_template=\"llama3\",\n                generation_kwargs={\n                    \"temperature\": 1.0,\n                    \"max_new_tokens\": 256,\n                },\n                device=\"mps\",\n            ),\n            n_turns=2,\n        )\n\n        magpie.load()\n\n        result = next(\n            magpie.process(\n                inputs=[\n                    {\n                        \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n                    },\n                    {\n                        \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n                    
},\n                ]\n            )\n        )\n        # [\n        #     {\n        #         'conversation': [\n        #             {'role': 'system', 'content': \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"},\n        #             {\n        #                 'role': 'user',\n        #                 'content': 'I\\'m having trouble solving the limits of functions in calculus. Could you explain how to work with them? Limits of functions are denoted by lim x\u2192a f(x) or lim x\u2192a [f(x)]. It is read as \"the limit as x approaches a of f\n        # of x\".'\n        #             },\n        #             {\n        #                 'role': 'assistant',\n        #                 'content': 'Limits are indeed a fundamental concept in calculus, and understanding them can be a bit tricky at first, but don\\'t worry, I\\'m here to help! The notation lim x\u2192a f(x) indeed means \"the limit as x approaches a of f of\n        # x\". What it\\'s asking us to do is find the'\n        #             }\n        #         ]\n        #     },\n        #     {\n        #         'conversation': [\n        #             {'role': 'system', 'content': \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"},\n        #             {\n        #                 'role': 'user',\n        #                 'content': \"As a flower shop owner, I'm noticing some unusual worm-like creatures causing damage to my roses and other flowers. Can you help me identify what the problem is? Based on your expertise as a florist AI assistant, I think it\n        # might be pests or diseases, but I'm not sure which.\"\n        #             },\n        #             {\n        #                 'role': 'assistant',\n        #                 'content': \"I'd be delighted to help you investigate the issue! 
Since you've noticed worm-like creatures damaging your roses and other flowers, I'll take a closer look at the possibilities. Here are a few potential culprits: 1.\n        # **Aphids**: These small, soft-bodied insects can secrete a sticky substance called\"\n        #             }\n        #         ]\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    def model_post_init(self, __context: Any) -> None:\n        \"\"\"Checks that the provided `LLM` uses the `MagpieChatTemplateMixin`.\"\"\"\n        super().model_post_init(__context)\n\n        if not isinstance(self.llm, MagpieChatTemplateMixin):\n            raise DistilabelUserError(\n                f\"`Magpie` task can only be used with an `LLM` that uses the `MagpieChatTemplateMixin`.\"\n                f\"`{self.llm.__class__.__name__}` doesn't use the aforementioned mixin.\",\n                page=\"components-gallery/tasks/magpie/\",\n            )\n\n        self.llm.use_magpie_template = True\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        return {\"system_prompt\": False}\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"Does nothing.\"\"\"\n        return []\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"Either a multi-turn conversation or the instruction generated.\"\"\"\n        outputs = []\n\n        if self.only_instruction:\n            outputs.append(\"instruction\")\n        elif self.n_turns == 1:\n            outputs.extend([\"instruction\", \"response\"])\n        else:\n            outputs.append(\"conversation\")\n\n        if isinstance(self.system_prompt, dict):\n            outputs.append(\"system_prompt_key\")\n\n        outputs.append(\"model_name\")\n\n        return outputs\n\n    def format_output(\n        self,\n        output: Union[str, None],\n        input: Union[Dict[str, Any], None] = None,\n    ) -> Dict[str, Any]:\n        \"\"\"Does nothing.\"\"\"\n        return {}\n\n    def 
process(self, inputs: StepInput) -> \"StepOutput\":\n        \"\"\"Generate a list of instructions or conversations of the specified number of turns.\n\n        Args:\n            inputs: a list of dictionaries that can contain a `system_prompt` key.\n\n        Yields:\n            The list of generated conversations.\n        \"\"\"\n        yield self._generate_with_pre_query_template(inputs)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.outputs","title":"outputs: StepColumns property","text":"

Either a multi-turn conversation or the instruction generated.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.model_post_init","title":"model_post_init(__context)","text":"

Checks that the provided LLM uses the MagpieChatTemplateMixin.

Source code in src/distilabel/steps/tasks/magpie/base.py
def model_post_init(self, __context: Any) -> None:\n    \"\"\"Checks that the provided `LLM` uses the `MagpieChatTemplateMixin`.\"\"\"\n    super().model_post_init(__context)\n\n    if not isinstance(self.llm, MagpieChatTemplateMixin):\n        raise DistilabelUserError(\n            f\"`Magpie` task can only be used with an `LLM` that uses the `MagpieChatTemplateMixin`.\"\n            f\"`{self.llm.__class__.__name__}` doesn't use the aforementioned mixin.\",\n            page=\"components-gallery/tasks/magpie/\",\n        )\n\n    self.llm.use_magpie_template = True\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.format_input","title":"format_input(input)","text":"

Does nothing.

Source code in src/distilabel/steps/tasks/magpie/base.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"Does nothing.\"\"\"\n    return []\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.format_output","title":"format_output(output, input=None)","text":"

Does nothing.

Source code in src/distilabel/steps/tasks/magpie/base.py
def format_output(\n    self,\n    output: Union[str, None],\n    input: Union[Dict[str, Any], None] = None,\n) -> Dict[str, Any]:\n    \"\"\"Does nothing.\"\"\"\n    return {}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.Magpie.process","title":"process(inputs)","text":"

Generate a list of instructions or conversations of the specified number of turns.

Parameters:

Name Type Description Default inputs StepInput

a list of dictionaries that can contain a system_prompt key.

required

Yields:

Type Description StepOutput

The list of generated conversations.

Source code in src/distilabel/steps/tasks/magpie/base.py
def process(self, inputs: StepInput) -> \"StepOutput\":\n    \"\"\"Generate a list of instructions or conversations of the specified number of turns.\n\n    Args:\n        inputs: a list of dictionaries that can contain a `system_prompt` key.\n\n    Yields:\n        The list of generated conversations.\n    \"\"\"\n    yield self._generate_with_pre_query_template(inputs)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator","title":"MagpieGenerator","text":"

Bases: GeneratorTask, MagpieBase

Generator task that generates instructions or conversations using Magpie.

Magpie is a neat method that allows generating user instructions with no seed data or specific system prompt thanks to the autoregressive capabilities of the instruct fine-tuned LLMs. As they were fine-tuned using a chat template composed of a user message and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the LLM without any user message, then the LLM will continue generating tokens as if it were the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM. After this instruction is generated, it can be sent back to the LLM, this time to generate an assistant response. This process can be repeated N times, allowing a multi-turn conversation to be built. This method was described in the paper 'Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing'.

Attributes:

Name Type Description n_turns

the number of turns that the generated conversation will have. Defaults to 1.

end_with_user

whether the conversation should end with a user message. Defaults to False.

include_system_prompt

whether to include the system prompt used in the generated conversation. Defaults to False.

only_instruction

whether to generate only the instruction. If this argument is True, then n_turns will be ignored. Defaults to False.

system_prompt

an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Defaults to None.

num_rows RuntimeParameter[int]

the number of rows to be generated.

Runtime parameters
  • n_turns: the number of turns that the generated conversation will have. Defaults to 1.
  • end_with_user: whether the conversation should end with a user message. Defaults to False.
  • include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False.
  • only_instruction: whether to generate only the instruction. If this argument is True, then n_turns will be ignored. Defaults to False.
  • system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic.
  • num_rows: the number of rows to be generated.
Output columns
  • conversation (ChatType): the generated conversation which is a list of chat items with a role and a message.
  • instruction (str): the generated instruction if only_instruction=True or n_turns==1.
  • response (str): the generated response if n_turns==1.
  • system_prompt_key (str, optional): the key of the system prompt used to generate the conversation or instruction. Only if system_prompt is a dictionary.
  • model_name (str): The model name used to generate the conversation or instruction.
Categories
  • text-generation
  • instruction
  • generator
References
  • Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing

Examples:

Generating instructions with Llama 3 8B Instruct and TransformersLLM:

from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\ngenerator = MagpieGenerator(\n    llm=TransformersLLM(\n        model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 1.0,\n            \"max_new_tokens\": 256,\n        },\n        device=\"mps\",\n    ),\n    only_instruction=True,\n    num_rows=5,\n)\n\ngenerator.load()\n\nresult = next(generator.process())\n# (\n#       [\n#           {\"instruction\": \"I've just bought a new phone and I're excited to start using it.\"},\n#           {\"instruction\": \"What are the most common types of companies that use digital signage?\"}\n#       ],\n#       True\n# )\n

Generating a conversation with Llama 3 8B Instruct and TransformersLLM:

from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\ngenerator = MagpieGenerator(\n    llm=TransformersLLM(\n        model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 1.0,\n            \"max_new_tokens\": 64,\n        },\n        device=\"mps\",\n    ),\n    n_turns=3,\n    num_rows=5,\n)\n\ngenerator.load()\n\nresult = next(generator.process())\n# (\n#     [\n#         {\n#             'conversation': [\n#                 {\n#                     'role': 'system',\n#                     'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n# insightful responses to help the user with their queries.'\n#                 },\n#                 {'role': 'user', 'content': \"I'm considering starting a social media campaign for my small business and I're not sure where to start. Can you help?\"},\n#                 {\n#                     'role': 'assistant',\n#                     'content': \"Exciting endeavor! Creating a social media campaign can be a great way to increase brand awareness, drive website traffic, and ultimately boost sales. I'd be happy to guide you through the process. To get started,\n# let's break down the basics. First, we need to identify your goals and target audience. What do\"\n#                 },\n#                 {\n#                     'role': 'user',\n#                     'content': \"Before I start a social media campaign, what kind of costs ammol should I expect to pay? There are several factors that contribute to the total cost of running a social media campaign. Let me outline some of the main\n# expenses you might encounter: 1. 
Time: As the business owner, you'll likely spend time creating\"\n#                 },\n#                 {\n#                     'role': 'assistant',\n#                     'content': 'Time is indeed one of the biggest investments when it comes to running a social media campaign! Besides time, you may also incur costs associated with: 2. Content creation: You might need to hire freelancers or\n# agencies to create high-quality content (images, videos, captions) for your social media platforms. 3. Advertising'\n#                 }\n#             ]\n#         },\n#         {\n#             'conversation': [\n#                 {\n#                     'role': 'system',\n#                     'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n# insightful responses to help the user with their queries.'\n#                 },\n#                 {'role': 'user', 'content': \"I am thinking of buying a new laptop or computer. What are some important factors I should consider when making your decision? I'll make sure to let you know if any other favorites or needs come up!\"},\n#                 {\n#                     'role': 'assistant',\n#                     'content': 'Exciting times ahead! When considering a new laptop or computer, there are several key factors to think about to ensure you find the right one for your needs. Here are some crucial ones to get you started: 1.\n# **Purpose**: How will you use your laptop or computer? For work, gaming, video editing,'\n#                 },\n#                 {\n#                     'role': 'user',\n#                     'content': 'Let me stop you there. Let's explore this \"purpose\" factor that you mentioned earlier. Can you elaborate more on what type of devices would be suitable for different purposes? 
For example, if I're primarily using my\n# laptop for general usage like browsing, email, and word processing, would a budget-friendly laptop be sufficient'\n#                 },\n#                 {\n#                     'role': 'assistant',\n#                     'content': \"Understanding your purpose can greatly impact the type of device you'll need. **General Usage (Browsing, Email, Word Processing)**: For casual users who mainly use their laptop for daily tasks, a budget-friendly\n# option can be sufficient. Look for laptops with: * Intel Core i3 or i5 processor* \"\n#                 }\n#             ]\n#         }\n#     ],\n#     True\n# )\n

Generating with system prompts with probabilities:

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\nmagpie = MagpieGenerator(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 0.8,\n            \"max_new_tokens\": 256,\n        },\n    ),\n    n_turns=2,\n    system_prompt={\n        \"math\": (\"You're an expert AI assistant.\", 0.8),\n        \"writing\": (\"You're an expert writing assistant.\", 0.2),\n    },\n)\n\nmagpie.load()\n\nresult = next(magpie.process())\n
Citations
@misc{xu2024magpiealignmentdatasynthesis,\n    title={Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing},\n    author={Zhangchen Xu and Fengqing Jiang and Luyao Niu and Yuntian Deng and Radha Poovendran and Yejin Choi and Bill Yuchen Lin},\n    year={2024},\n    eprint={2406.08464},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2406.08464},\n}\n
Source code in src/distilabel/steps/tasks/magpie/generator.py
class MagpieGenerator(GeneratorTask, MagpieBase):\n    \"\"\"Generator task the generates instructions or conversations using Magpie.\n\n    Magpie is a neat method that allows generating user instructions with no seed data\n    or specific system prompt thanks to the autoregressive capabilities of the instruct\n    fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message\n    and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query\n    or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the\n    LLM without any user message, then the LLM will continue generating tokens as it was\n    the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM.\n    After this instruct is generated, it can be sent again to the LLM to generate this time\n    an assistant response. This process can be repeated N times allowing to build a multi-turn\n    conversation. This method was described in the paper 'Magpie: Alignment Data Synthesis from\n    Scratch by Prompting Aligned LLMs with Nothing'.\n\n    Attributes:\n        n_turns: the number of turns that the generated conversation will have.\n            Defaults to `1`.\n        end_with_user: whether the conversation should end with a user message.\n            Defaults to `False`.\n        include_system_prompt: whether to include the system prompt used in the generated\n            conversation. Defaults to `False`.\n        only_instruction: whether to generate only the instruction. If this argument is\n            `True`, then `n_turns` will be ignored. Defaults to `False`.\n        system_prompt: an optional system prompt, or a list of system prompts from which\n            a random one will be chosen, or a dictionary of system prompts from which a\n            random one will be choosen, or a dictionary of system prompts with their probability\n            of being chosen. 
The random system prompt will be chosen per input/output batch.\n            This system prompt can be used to guide the generation of the instruct LLM and\n            steer it to generate instructions of a certain topic. Defaults to `None`.\n        num_rows: the number of rows to be generated.\n\n    Runtime parameters:\n        - `n_turns`: the number of turns that the generated conversation will have. Defaults\n            to `1`.\n        - `end_with_user`: whether the conversation should end with a user message.\n            Defaults to `False`.\n        - `include_system_prompt`: whether to include the system prompt used in the generated\n            conversation. Defaults to `False`.\n        - `only_instruction`: whether to generate only the instruction. If this argument is\n            `True`, then `n_turns` will be ignored. Defaults to `False`.\n        - `system_prompt`: an optional system prompt, or a list of system prompts from which\n            a random one will be chosen, or a dictionary of system prompts from which a\n            random one will be choosen, or a dictionary of system prompts with their probability\n            of being chosen. The random system prompt will be chosen per input/output batch.\n            This system prompt can be used to guide the generation of the instruct LLM and\n            steer it to generate instructions of a certain topic.\n        - `num_rows`: the number of rows to be generated.\n\n    Output columns:\n        - conversation (`ChatType`): the generated conversation which is a list of chat\n            items with a role and a message.\n        - instruction (`str`): the generated instructions if `only_instruction=True`.\n        - response (`str`): the generated response if `n_turns==1`.\n        - system_prompt_key (`str`, optional): the key of the system prompt used to generate\n            the conversation or instruction. 
Only if `system_prompt` is a dictionary.\n        - model_name (`str`): The model name used to generate the `conversation` or `instruction`.\n\n    Categories:\n        - text-generation\n        - instruction\n        - generator\n\n    References:\n        - [Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing](https://arxiv.org/abs/2406.08464)\n\n    Examples:\n        Generating instructions with Llama 3 8B Instruct and TransformersLLM:\n\n        ```python\n        from distilabel.models import TransformersLLM\n        from distilabel.steps.tasks import MagpieGenerator\n\n        generator = MagpieGenerator(\n            llm=TransformersLLM(\n                model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n                magpie_pre_query_template=\"llama3\",\n                generation_kwargs={\n                    \"temperature\": 1.0,\n                    \"max_new_tokens\": 256,\n                },\n                device=\"mps\",\n            ),\n            only_instruction=True,\n            num_rows=5,\n        )\n\n        generator.load()\n\n        result = next(generator.process())\n        # (\n        #       [\n        #           {\"instruction\": \"I've just bought a new phone and I're excited to start using it.\"},\n        #           {\"instruction\": \"What are the most common types of companies that use digital signage?\"}\n        #       ],\n        #       True\n        # )\n        ```\n\n        Generating a conversation with Llama 3 8B Instruct and TransformersLLM:\n\n        ```python\n        from distilabel.models import TransformersLLM\n        from distilabel.steps.tasks import MagpieGenerator\n\n        generator = MagpieGenerator(\n            llm=TransformersLLM(\n                model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n                magpie_pre_query_template=\"llama3\",\n                generation_kwargs={\n                    \"temperature\": 1.0,\n                    
\"max_new_tokens\": 64,\n                },\n                device=\"mps\",\n            ),\n            n_turns=3,\n            num_rows=5,\n        )\n\n        generator.load()\n\n        result = next(generator.process())\n        # (\n        #     [\n        #         {\n        #             'conversation': [\n        #                 {\n        #                     'role': 'system',\n        #                     'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n        # insightful responses to help the user with their queries.'\n        #                 },\n        #                 {'role': 'user', 'content': \"I'm considering starting a social media campaign for my small business and I're not sure where to start. Can you help?\"},\n        #                 {\n        #                     'role': 'assistant',\n        #                     'content': \"Exciting endeavor! Creating a social media campaign can be a great way to increase brand awareness, drive website traffic, and ultimately boost sales. I'd be happy to guide you through the process. To get started,\n        # let's break down the basics. First, we need to identify your goals and target audience. What do\"\n        #                 },\n        #                 {\n        #                     'role': 'user',\n        #                     'content': \"Before I start a social media campaign, what kind of costs ammol should I expect to pay? There are several factors that contribute to the total cost of running a social media campaign. Let me outline some of the main\n        # expenses you might encounter: 1. 
Time: As the business owner, you'll likely spend time creating\"\n        #                 },\n        #                 {\n        #                     'role': 'assistant',\n        #                     'content': 'Time is indeed one of the biggest investments when it comes to running a social media campaign! Besides time, you may also incur costs associated with: 2. Content creation: You might need to hire freelancers or\n        # agencies to create high-quality content (images, videos, captions) for your social media platforms. 3. Advertising'\n        #                 }\n        #             ]\n        #         },\n        #         {\n        #             'conversation': [\n        #                 {\n        #                     'role': 'system',\n        #                     'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n        # insightful responses to help the user with their queries.'\n        #                 },\n        #                 {'role': 'user', 'content': \"I am thinking of buying a new laptop or computer. What are some important factors I should consider when making your decision? I'll make sure to let you know if any other favorites or needs come up!\"},\n        #                 {\n        #                     'role': 'assistant',\n        #                     'content': 'Exciting times ahead! When considering a new laptop or computer, there are several key factors to think about to ensure you find the right one for your needs. Here are some crucial ones to get you started: 1.\n        # **Purpose**: How will you use your laptop or computer? 
For work, gaming, video editing,'\n        #                 },\n        #                 {\n        #                     'role': 'user',\n        #                     'content': 'Let me stop you there. Let\\'s explore this \"purpose\" factor that you mentioned earlier. Can you elaborate more on what type of devices would be suitable for different purposes? For example, if I\\'re primarily using my\n        # laptop for general usage like browsing, email, and word processing, would a budget-friendly laptop be sufficient'\n        #                 },\n        #                 {\n        #                     'role': 'assistant',\n        #                     'content': \"Understanding your purpose can greatly impact the type of device you'll need. **General Usage (Browsing, Email, Word Processing)**: For casual users who mainly use their laptop for daily tasks, a budget-friendly\n        # option can be sufficient. Look for laptops with: * Intel Core i3 or i5 processor* \"\n        #                 }\n        #             ]\n        #         }\n        #     ],\n        #     True\n        # )\n        ```\n\n        Generating with system prompts with probabilities:\n\n        ```python\n        from distilabel.models import InferenceEndpointsLLM\n        from distilabel.steps.tasks import MagpieGenerator\n\n        magpie = MagpieGenerator(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n                magpie_pre_query_template=\"llama3\",\n                generation_kwargs={\n                    \"temperature\": 0.8,\n                    \"max_new_tokens\": 256,\n                },\n            ),\n            n_turns=2,\n            system_prompt={\n                \"math\": (\"You're an expert AI assistant.\", 0.8),\n                \"writing\": (\"You're an expert writing assistant.\", 0.2),\n            },\n        )\n\n 
       magpie.load()\n\n        result = next(magpie.process())\n        ```\n\n    Citations:\n        ```\n        @misc{xu2024magpiealignmentdatasynthesis,\n            title={Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing},\n            author={Zhangchen Xu and Fengqing Jiang and Luyao Niu and Yuntian Deng and Radha Poovendran and Yejin Choi and Bill Yuchen Lin},\n            year={2024},\n            eprint={2406.08464},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2406.08464},\n        }\n        ```\n    \"\"\"\n\n    # TODO: move this to `GeneratorTask`\n    num_rows: RuntimeParameter[int] = Field(\n        default=None, description=\"The number of rows to generate.\"\n    )\n\n    def model_post_init(self, __context: Any) -> None:\n        \"\"\"Checks that the provided `LLM` uses the `MagpieChatTemplateMixin`.\"\"\"\n        super().model_post_init(__context)\n\n        if not isinstance(self.llm, MagpieChatTemplateMixin):\n            raise DistilabelUserError(\n                f\"`Magpie` task can only be used with an `LLM` that uses the `MagpieChatTemplateMixin`.\"\n                f\"`{self.llm.__class__.__name__}` doesn't use the aforementioned mixin.\",\n                page=\"components-gallery/tasks/magpiegenerator/\",\n            )\n\n        self.llm.use_magpie_template = True\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"Either a multi-turn conversation or the instruction generated.\"\"\"\n        outputs = []\n\n        if self.only_instruction:\n            outputs.append(\"instruction\")\n        elif self.n_turns == 1:\n            outputs.extend([\"instruction\", \"response\"])\n        else:\n            outputs.append(\"conversation\")\n\n        if isinstance(self.system_prompt, dict):\n            outputs.append(\"system_prompt_key\")\n\n        outputs.append(\"model_name\")\n\n        return 
outputs\n\n    def format_output(\n        self,\n        output: Union[str, None],\n        input: Union[Dict[str, Any], None] = None,\n    ) -> Dict[str, Any]:\n        \"\"\"Does nothing.\"\"\"\n        return {}\n\n    def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n        \"\"\"Generates the desired number of instructions or conversations using Magpie.\n\n        Args:\n            offset: The offset to start the generation from. Defaults to `0`.\n\n        Yields:\n            The generated instructions or conversations.\n        \"\"\"\n        generated = offset\n\n        while generated <= self.num_rows:  # type: ignore\n            rows_to_generate = (\n                self.num_rows if self.num_rows < self.batch_size else self.batch_size  # type: ignore\n            )\n            conversations = self._generate_with_pre_query_template(\n                inputs=[{} for _ in range(rows_to_generate)]  # type: ignore\n            )\n            generated += rows_to_generate  # type: ignore\n            yield (conversations, generated == self.num_rows)\n\n    @override\n    def _sample_input(self) -> \"ChatType\":\n        return self._generate_with_pre_query_template(inputs=[{}])\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator.outputs","title":"outputs: StepColumns property","text":"

Either a multi-turn conversation or the instruction generated.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator.model_post_init","title":"model_post_init(__context)","text":"

Checks that the provided LLM uses the MagpieChatTemplateMixin.

Source code in src/distilabel/steps/tasks/magpie/generator.py
def model_post_init(self, __context: Any) -> None:\n    \"\"\"Checks that the provided `LLM` uses the `MagpieChatTemplateMixin`.\"\"\"\n    super().model_post_init(__context)\n\n    if not isinstance(self.llm, MagpieChatTemplateMixin):\n        raise DistilabelUserError(\n            f\"`Magpie` task can only be used with an `LLM` that uses the `MagpieChatTemplateMixin`.\"\n            f\"`{self.llm.__class__.__name__}` doesn't use the aforementioned mixin.\",\n            page=\"components-gallery/tasks/magpiegenerator/\",\n        )\n\n    self.llm.use_magpie_template = True\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator.format_output","title":"format_output(output, input=None)","text":"

Does nothing.

Source code in src/distilabel/steps/tasks/magpie/generator.py
def format_output(\n    self,\n    output: Union[str, None],\n    input: Union[Dict[str, Any], None] = None,\n) -> Dict[str, Any]:\n    \"\"\"Does nothing.\"\"\"\n    return {}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.MagpieGenerator.process","title":"process(offset=0)","text":"

Generates the desired number of instructions or conversations using Magpie.

Parameters:

Name Type Description Default offset int

The offset to start the generation from. Defaults to 0.

0

Yields:

Type Description GeneratorStepOutput

The generated instructions or conversations.

Source code in src/distilabel/steps/tasks/magpie/generator.py
def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n    \"\"\"Generates the desired number of instructions or conversations using Magpie.\n\n    Args:\n        offset: The offset to start the generation from. Defaults to `0`.\n\n    Yields:\n        The generated instructions or conversations.\n    \"\"\"\n    generated = offset\n\n    while generated <= self.num_rows:  # type: ignore\n        rows_to_generate = (\n            self.num_rows if self.num_rows < self.batch_size else self.batch_size  # type: ignore\n        )\n        conversations = self._generate_with_pre_query_template(\n            inputs=[{} for _ in range(rows_to_generate)]  # type: ignore\n        )\n        generated += rows_to_generate  # type: ignore\n        yield (conversations, generated == self.num_rows)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM","title":"PairRM","text":"

Bases: Step

Rank the candidates based on the input using the LLM model.

Attributes:

Name Type Description model str

The model to use for the ranking. Defaults to \"llm-blender/PairRM\".

instructions Optional[str]

The instructions to use for the model. Defaults to None.

Input columns
  • input (List[Dict[str, Any]]): The input text or conversation to rank the candidates for.
  • candidates (List[Dict[str, Any]]): The candidates to rank.
Output columns
  • ranks (List[int]): The ranks of the candidates based on the input.
  • ranked_candidates (List[Dict[str, Any]]): The candidates ranked based on the input.
  • model_name (str): The model name used to rank the candidate responses. Defaults to \"llm-blender/PairRM\".
References
  • LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion.
  • Pair Ranking Model.
Categories
  • preference
Note

This step differs from other tasks, as there is currently a single implementation of this model and a specific LLM is used.

Examples:

Rank LLM candidates:

from distilabel.steps.tasks import PairRM\n\n# Consider this as a placeholder for your actual LLM.\npair_rm = PairRM()\n\npair_rm.load()\n\nresult = next(\n    pair_rm.process(\n        [\n            {\"input\": \"Hello, how are you?\", \"candidates\": [\"fine\", \"good\", \"bad\"]},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'input': 'Hello, how are you?',\n#         'candidates': ['fine', 'good', 'bad'],\n#         'ranks': [2, 1, 3],\n#         'ranked_candidates': ['good', 'fine', 'bad'],\n#         'model_name': 'llm-blender/PairRM',\n#     }\n# ]\n
Citations
@misc{jiang2023llmblenderensemblinglargelanguage,\n    title={LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion},\n    author={Dongfu Jiang and Xiang Ren and Bill Yuchen Lin},\n    year={2023},\n    eprint={2306.02561},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2306.02561},\n}\n
Source code in src/distilabel/steps/tasks/pair_rm.py
class PairRM(Step):\n    \"\"\"Rank the candidates based on the input using the `LLM` model.\n\n    Attributes:\n        model: The model to use for the ranking. Defaults to `\"llm-blender/PairRM\"`.\n        instructions: The instructions to use for the model. Defaults to `None`.\n\n    Input columns:\n        - inputs (`List[Dict[str, Any]]`): The input text or conversation to rank the candidates for.\n        - candidates (`List[Dict[str, Any]]`): The candidates to rank.\n\n    Output columns:\n        - ranks (`List[int]`): The ranks of the candidates based on the input.\n        - ranked_candidates (`List[Dict[str, Any]]`): The candidates ranked based on the input.\n        - model_name (`str`): The model name used to rank the candidate responses. Defaults to `\"llm-blender/PairRM\"`.\n\n    References:\n        - [LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion](https://arxiv.org/abs/2306.02561).\n        - [Pair Ranking Model](https://huggingface.co/llm-blender/PairRM).\n\n    Categories:\n        - preference\n\n    Note:\n        This step differs to other tasks as there is a single implementation of this model\n        currently, and we will use a specific `LLM`.\n\n    Examples:\n        Rank LLM candidates:\n\n        ```python\n        from distilabel.steps.tasks import PairRM\n\n        # Consider this as a placeholder for your actual LLM.\n        pair_rm = PairRM()\n\n        pair_rm.load()\n\n        result = next(\n            scorer.process(\n                [\n                    {\"input\": \"Hello, how are you?\", \"candidates\": [\"fine\", \"good\", \"bad\"]},\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'input': 'Hello, how are you?',\n        #         'candidates': ['fine', 'good', 'bad'],\n        #         'ranks': [2, 1, 3],\n        #         'ranked_candidates': ['good', 'fine', 'bad'],\n        #         'model_name': 
'llm-blender/PairRM',\n        #     }\n        # ]\n        ```\n\n    Citations:\n        ```\n        @misc{jiang2023llmblenderensemblinglargelanguage,\n            title={LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion},\n            author={Dongfu Jiang and Xiang Ren and Bill Yuchen Lin},\n            year={2023},\n            eprint={2306.02561},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2306.02561},\n        }\n        ```\n    \"\"\"\n\n    model: str = \"llm-blender/PairRM\"\n    instructions: Optional[str] = None\n\n    def load(self) -> None:\n        \"\"\"Loads the PairRM model provided via `model` with `llm_blender.Blender`, which is the\n        custom library for running the inference for the PairRM models.\"\"\"\n        try:\n            import llm_blender\n        except ImportError as e:\n            raise ImportError(\n                \"The `llm_blender` package is required to use the `PairRM` class.\"\n                \"Please install it with `pip install git+https://github.com/yuchenlin/LLM-Blender.git`.\"\n            ) from e\n\n        self._blender = llm_blender.Blender()\n        self._blender.loadranker(self.model)\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The input columns correspond to the two required arguments from `Blender.rank`:\n        `inputs` and `candidates`.\"\"\"\n        return [\"input\", \"candidates\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        \"\"\"The outputs will include the `ranks` and the `ranked_candidates`.\"\"\"\n        return [\"ranks\", \"ranked_candidates\", \"model_name\"]\n\n    def format_input(self, input: Dict[str, Any]) -> Dict[str, Any]:\n        \"\"\"The input is expected to be a dictionary with the keys `input` and `candidates`,\n        where the `input` corresponds to the instruction of a model and `candidates` are a\n        
list of responses to be ranked.\n        \"\"\"\n        return {\"input\": input[\"input\"], \"candidates\": input[\"candidates\"]}\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        \"\"\"Generates the ranks for the candidates based on the input.\n\n        The ranks are the positions of the candidates, where lower is better,\n        and the ranked candidates correspond to the candidates sorted according to the\n        ranks obtained.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Yields:\n            An iterator with the inputs containing the `ranks`, `ranked_candidates`, and `model_name`.\n        \"\"\"\n        input_texts = []\n        candidates = []\n        for input in inputs:\n            formatted_input = self.format_input(input)\n            input_texts.append(formatted_input[\"input\"])\n            candidates.append(formatted_input[\"candidates\"])\n\n        instructions = (\n            [self.instructions] * len(input_texts) if self.instructions else None\n        )\n\n        ranks = self._blender.rank(\n            input_texts,\n            candidates,\n            instructions=instructions,\n            return_scores=False,\n            batch_size=self.input_batch_size,\n        )\n        # Sort the candidates based on the ranks\n        ranked_candidates = np.take_along_axis(\n            np.array(candidates), ranks - 1, axis=1\n        ).tolist()\n        ranks = ranks.tolist()\n        for input, rank, ranked_candidate in zip(inputs, ranks, ranked_candidates):\n            input[\"ranks\"] = rank\n            input[\"ranked_candidates\"] = ranked_candidate\n            input[\"model_name\"] = self.model\n\n        yield inputs\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.inputs","title":"inputs: StepColumns property","text":"

The input columns correspond to the two required arguments from Blender.rank: inputs and candidates.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.outputs","title":"outputs: StepColumns property","text":"

The outputs will include the ranks and the ranked_candidates.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.load","title":"load()","text":"

Loads the PairRM model provided via model with llm_blender.Blender, which is the custom library for running the inference for the PairRM models.

Source code in src/distilabel/steps/tasks/pair_rm.py
def load(self) -> None:\n    \"\"\"Loads the PairRM model provided via `model` with `llm_blender.Blender`, which is the\n    custom library for running the inference for the PairRM models.\"\"\"\n    try:\n        import llm_blender\n    except ImportError as e:\n        raise ImportError(\n            \"The `llm_blender` package is required to use the `PairRM` class.\"\n            \"Please install it with `pip install git+https://github.com/yuchenlin/LLM-Blender.git`.\"\n        ) from e\n\n    self._blender = llm_blender.Blender()\n    self._blender.loadranker(self.model)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.format_input","title":"format_input(input)","text":"

The input is expected to be a dictionary with the keys input and candidates, where the input corresponds to the instruction of a model and candidates are a list of responses to be ranked.

Source code in src/distilabel/steps/tasks/pair_rm.py
def format_input(self, input: Dict[str, Any]) -> Dict[str, Any]:\n    \"\"\"The input is expected to be a dictionary with the keys `input` and `candidates`,\n    where the `input` corresponds to the instruction of a model and `candidates` are a\n    list of responses to be ranked.\n    \"\"\"\n    return {\"input\": input[\"input\"], \"candidates\": input[\"candidates\"]}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PairRM.process","title":"process(inputs)","text":"

Generates the ranks for the candidates based on the input.

The ranks are the positions of the candidates, where lower is better, and the ranked candidates correspond to the candidates sorted according to the ranks obtained.

Parameters:

Name Type Description Default inputs StepInput

A list of Python dictionaries with the inputs of the task.

required

Yields:

Type Description StepOutput

An iterator with the inputs containing the ranks, ranked_candidates, and model_name.

Source code in src/distilabel/steps/tasks/pair_rm.py
def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n    \"\"\"Generates the ranks for the candidates based on the input.\n\n    The ranks are the positions of the candidates, where lower is better,\n    and the ranked candidates correspond to the candidates sorted according to the\n    ranks obtained.\n\n    Args:\n        inputs: A list of Python dictionaries with the inputs of the task.\n\n    Yields:\n        An iterator with the inputs containing the `ranks`, `ranked_candidates`, and `model_name`.\n    \"\"\"\n    input_texts = []\n    candidates = []\n    for input in inputs:\n        formatted_input = self.format_input(input)\n        input_texts.append(formatted_input[\"input\"])\n        candidates.append(formatted_input[\"candidates\"])\n\n    instructions = (\n        [self.instructions] * len(input_texts) if self.instructions else None\n    )\n\n    ranks = self._blender.rank(\n        input_texts,\n        candidates,\n        instructions=instructions,\n        return_scores=False,\n        batch_size=self.input_batch_size,\n    )\n    # Sort the candidates based on the ranks\n    ranked_candidates = np.take_along_axis(\n        np.array(candidates), ranks - 1, axis=1\n    ).tolist()\n    ranks = ranks.tolist()\n    for input, rank, ranked_candidate in zip(inputs, ranks, ranked_candidates):\n        input[\"ranks\"] = rank\n        input[\"ranked_candidates\"] = ranked_candidate\n        input[\"model_name\"] = self.model\n\n    yield inputs\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval","title":"PrometheusEval","text":"

Bases: Task

Critique and rank the quality of generations from an LLM using Prometheus 2.0.

PrometheusEval is a task created for Prometheus 2.0, covering both the absolute and relative evaluations. The absolute evaluation, i.e. mode=\"absolute\", is used to evaluate a single generation from an LLM for a given instruction. The relative evaluation, i.e. mode=\"relative\", is used to evaluate two generations from an LLM for a given instruction. Both evaluations can be run with or without a reference answer to compare against, as controlled by the reference attribute, and both are based on a score rubric that critiques the generation/s based on the following default aspects: helpfulness, harmlessness, honesty, factual-validity, and reasoning. These can be overridden via rubrics, and the selected rubric is set via the attribute rubric.

Note

The PrometheusEval task is better suited and intended to be used with any of the Prometheus 2.0 models released by Kaist AI, being: https://huggingface.co/prometheus-eval/prometheus-7b-v2.0, and https://huggingface.co/prometheus-eval/prometheus-8x7b-v2.0. The critique assessment formatting and quality is not guaranteed if using another model, even though some other models may be able to correctly follow the formatting and generate insightful critiques too.

Attributes:

Name Type Description mode Literal['absolute', 'relative']

the evaluation mode to use, either absolute or relative. It defines whether the task will evaluate one or two generations.

rubric str

the score rubric to use within the prompt to run the critique based on different aspects. Can be any existing key in the rubrics attribute, which by default means that it can be: helpfulness, harmlessness, honesty, factual-validity, or reasoning. Those will only work if using the default rubrics, otherwise, the provided rubrics should be used.

rubrics Optional[Dict[str, str]]

a dictionary containing the different rubrics to use for the critique, where the keys are the rubric names and the values are the rubric descriptions. The default rubrics are the following: helpfulness, harmlessness, honesty, factual-validity, and reasoning.

reference bool

a boolean flag to indicate whether a reference answer / completion will be provided, so that the model critique is based on the comparison with it. It implies that the column reference needs to be provided within the input data in addition to the rest of the inputs.

_template Union[Template, None]

a Jinja2 template used to format the input for the LLM.

Input columns
  • instruction (str): The instruction to use as reference.
  • generation (str, optional): The generated text from the given instruction. This column is required if mode=absolute.
  • generations (List[str], optional): The generated texts from the given instruction. It should contain 2 generations only. This column is required if mode=relative.
  • reference (str, optional): The reference / golden answer for the instruction, to be used by the LLM for comparison against.
Output columns
  • feedback (str): The feedback explaining the result below, as critiqued by the LLM using the pre-defined score rubric, compared against reference if provided.
  • result (Union[int, Literal[\"A\", \"B\"]]): If mode=absolute, then the result contains the score for the generation in a likert-scale from 1-5, otherwise, if mode=relative, then the result contains either \"A\" or \"B\", the \"winning\" one being the generation in the index 0 of generations if result='A' or the index 1 if result='B'.
  • model_name (str): The model name used to generate the feedback and result.
Categories
  • critique
  • preference
References
  • Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models
  • prometheus-eval: Evaluate your LLM's response with Prometheus \ud83d\udcaf

Examples:

Critique and evaluate LLM generation quality using Prometheus 2_0:

from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n    llm=vLLM(\n        model=\"prometheus-eval/prometheus-7b-v2.0\",\n        chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n    ),\n    mode=\"absolute\",\n    rubric=\"factual-validity\"\n)\n\nprometheus.load()\n\nresult = next(\n    prometheus.process(\n        [\n            {\"instruction\": \"make something\", \"generation\": \"something done\"},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'make something',\n#         'generation': 'something done',\n#         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n#         'feedback': 'the feedback',\n#         'result': 5,\n#     }\n# ]\n

Critique for relative evaluation:

from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n    llm=vLLM(\n        model=\"prometheus-eval/prometheus-7b-v2.0\",\n        chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n    ),\n    mode=\"relative\",\n    rubric=\"honesty\"\n)\n\nprometheus.load()\n\nresult = next(\n    prometheus.process(\n        [\n            {\"instruction\": \"make something\", \"generations\": [\"something done\", \"other thing\"]},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'make something',\n#         'generations': ['something done', 'other thing'],\n#         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n#         'feedback': 'the feedback',\n#         'result': 'A',\n#     }\n# ]\n

Critique with a custom rubric:

from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n    llm=vLLM(\n        model=\"prometheus-eval/prometheus-7b-v2.0\",\n        chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n    ),\n    mode=\"absolute\",\n    rubric=\"custom\",\n    rubrics={\n        \"custom\": \"[A]\\nScore 1: A\\nScore 2: B\\nScore 3: C\\nScore 4: D\\nScore 5: E\"\n    }\n)\n\nprometheus.load()\n\nresult = next(\n    prometheus.process(\n        [\n            {\"instruction\": \"make something\", \"generation\": \"something done\"},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'make something',\n#         'generation': 'something done',\n#         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n#         'feedback': 'the feedback',\n#         'result': 5,\n#     }\n# ]\n

Critique using a reference answer:

from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n    llm=vLLM(\n        model=\"prometheus-eval/prometheus-7b-v2.0\",\n        chat_template=\"[INST] {{ messages[0]\"content\" }}\\n{{ messages[1]\"content\" }}[/INST]\",\n    ),\n    mode=\"absolute\",\n    rubric=\"helpfulness\",\n    reference=True,\n)\n\nprometheus.load()\n\nresult = next(\n    prometheus.process(\n        [\n            {\n                \"instruction\": \"make something\",\n                \"generation\": \"something done\",\n                \"reference\": \"this is a reference answer\",\n            },\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'make something',\n#         'generation': 'something done',\n#         'reference': 'this is a reference answer',\n#         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n#         'feedback': 'the feedback',\n#         'result': 5,\n#     }\n# ]\n
Citations
@misc{kim2024prometheus2opensource,\n    title={Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models},\n    author={Seungone Kim and Juyoung Suk and Shayne Longpre and Bill Yuchen Lin and Jamin Shin and Sean Welleck and Graham Neubig and Moontae Lee and Kyungjae Lee and Minjoon Seo},\n    year={2024},\n    eprint={2405.01535},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2405.01535},\n}\n
Source code in src/distilabel/steps/tasks/prometheus_eval.py
class PrometheusEval(Task):\n    \"\"\"Critique and rank the quality of generations from an `LLM` using Prometheus 2.0.\n\n    `PrometheusEval` is a task created for Prometheus 2.0, covering both the absolute and relative\n    evaluations. The absolute evaluation i.e. `mode=\"absolute\"` is used to evaluate a single generation from\n    an LLM for a given instruction. The relative evaluation i.e. `mode=\"relative\"` is used to evaluate two generations from an LLM\n    for a given instruction.\n    Both evaluations provide the possibility of using a reference answer to compare with or without\n    the `reference` attribute, and both are based on a score rubric that critiques the generation/s\n    based on the following default aspects: `helpfulness`, `harmlessness`, `honesty`, `factual-validity`,\n    and `reasoning`, that can be overridden via `rubrics`, and the selected rubric is set via the attribute\n    `rubric`.\n\n    Note:\n        The `PrometheusEval` task is better suited and intended to be used with any of the Prometheus 2.0\n        models released by Kaist AI, being: https://huggingface.co/prometheus-eval/prometheus-7b-v2.0,\n        and https://huggingface.co/prometheus-eval/prometheus-8x7b-v2.0. The critique assessment formatting\n        and quality is not guaranteed if using another model, even though some other models may be able to\n        correctly follow the formatting and generate insightful critiques too.\n\n    Attributes:\n        mode: the evaluation mode to use, either `absolute` or `relative`. It defines whether the task\n            will evaluate one or two generations.\n        rubric: the score rubric to use within the prompt to run the critique based on different aspects.\n            Can be any existing key in the `rubrics` attribute, which by default means that it can be:\n            `helpfulness`, `harmlessness`, `honesty`, `factual-validity`, or `reasoning`. 
Those will only\n            work if using the default `rubrics`, otherwise, the provided `rubrics` should be used.\n        rubrics: a dictionary containing the different rubrics to use for the critique, where the keys are\n            the rubric names and the values are the rubric descriptions. The default rubrics are the following:\n            `helpfulness`, `harmlessness`, `honesty`, `factual-validity`, and `reasoning`.\n        reference: a boolean flag to indicate whether a reference answer / completion will be provided, so\n            that the model critique is based on the comparison with it. It implies that the column `reference`\n            needs to be provided within the input data in addition to the rest of the inputs.\n        _template: a Jinja2 template used to format the input for the LLM.\n\n    Input columns:\n        - instruction (`str`): The instruction to use as reference.\n        - generation (`str`, optional): The generated text from the given `instruction`. This column is required\n            if `mode=absolute`.\n        - generations (`List[str]`, optional): The generated texts from the given `instruction`. It should\n            contain 2 generations only. 
This column is required if `mode=relative`.\n        - reference (`str`, optional): The reference / golden answer for the `instruction`, to be used by the LLM\n            for comparison against.\n\n    Output columns:\n        - feedback (`str`): The feedback explaining the result below, as critiqued by the LLM using the\n            pre-defined score rubric, compared against `reference` if provided.\n        - result (`Union[int, Literal[\"A\", \"B\"]]`): If `mode=absolute`, then the result contains the score for the\n            `generation` in a likert-scale from 1-5, otherwise, if `mode=relative`, then the result contains either\n            \"A\" or \"B\", the \"winning\" one being the generation in the index 0 of `generations` if `result='A'` or the\n            index 1 if `result='B'`.\n        - model_name (`str`): The model name used to generate the `feedback` and `result`.\n\n    Categories:\n        - critique\n        - preference\n\n    References:\n        - [Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models](https://arxiv.org/abs/2405.01535)\n        - [prometheus-eval: Evaluate your LLM's response with Prometheus \ud83d\udcaf](https://github.com/prometheus-eval/prometheus-eval)\n\n    Examples:\n        Critique and evaluate LLM generation quality using Prometheus 2_0:\n\n        ```python\n        from distilabel.steps.tasks import PrometheusEval\n        from distilabel.models import vLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        prometheus = PrometheusEval(\n            llm=vLLM(\n                model=\"prometheus-eval/prometheus-7b-v2.0\",\n                chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n            ),\n            mode=\"absolute\",\n            rubric=\"factual-validity\"\n        )\n\n        prometheus.load()\n\n        result = next(\n            prometheus.process(\n                [\n          
          {\"instruction\": \"make something\", \"generation\": \"something done\"},\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'instruction': 'make something',\n        #         'generation': 'something done',\n        #         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n        #         'feedback': 'the feedback',\n        #         'result': 6,\n        #     }\n        # ]\n        ```\n\n        Critique for relative evaluation:\n\n        ```python\n        from distilabel.steps.tasks import PrometheusEval\n        from distilabel.models import vLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        prometheus = PrometheusEval(\n            llm=vLLM(\n                model=\"prometheus-eval/prometheus-7b-v2.0\",\n                chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n            ),\n            mode=\"relative\",\n            rubric=\"honesty\"\n        )\n\n        prometheus.load()\n\n        result = next(\n            prometheus.process(\n                [\n                    {\"instruction\": \"make something\", \"generations\": [\"something done\", \"other thing\"]},\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'instruction': 'make something',\n        #         'generations': ['something done', 'other thing'],\n        #         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n        #         'feedback': 'the feedback',\n        #         'result': 'something done',\n        #     }\n        # ]\n        ```\n\n        Critique with a custom rubric:\n\n        ```python\n        from distilabel.steps.tasks import PrometheusEval\n        from distilabel.models import vLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        prometheus = PrometheusEval(\n            llm=vLLM(\n                
model=\"prometheus-eval/prometheus-7b-v2.0\",\n                chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n            ),\n            mode=\"absolute\",\n            rubric=\"custom\",\n            rubrics={\n                \"custom\": \"[A]\\\\nScore 1: A\\\\nScore 2: B\\\\nScore 3: C\\\\nScore 4: D\\\\nScore 5: E\"\n            }\n        )\n\n        prometheus.load()\n\n        result = next(\n            prometheus.process(\n                [\n                    {\"instruction\": \"make something\", \"generation\": \"something done\"},\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'instruction': 'make something',\n        #         'generation': 'something done',\n        #         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n        #         'feedback': 'the feedback',\n        #         'result': 6,\n        #     }\n        # ]\n        ```\n\n        Critique using a reference answer:\n\n        ```python\n        from distilabel.steps.tasks import PrometheusEval\n        from distilabel.models import vLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        prometheus = PrometheusEval(\n            llm=vLLM(\n                model=\"prometheus-eval/prometheus-7b-v2.0\",\n                chat_template=\"[INST] {{ messages[0]\\\"content\\\" }}\\\\n{{ messages[1]\\\"content\\\" }}[/INST]\",\n            ),\n            mode=\"absolute\",\n            rubric=\"helpfulness\",\n            reference=True,\n        )\n\n        prometheus.load()\n\n        result = next(\n            prometheus.process(\n                [\n                    {\n                        \"instruction\": \"make something\",\n                        \"generation\": \"something done\",\n                        \"reference\": \"this is a reference answer\",\n                    },\n                ]\n            )\n        )\n  
      # result\n        # [\n        #     {\n        #         'instruction': 'make something',\n        #         'generation': 'something done',\n        #         'reference': 'this is a reference answer',\n        #         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n        #         'feedback': 'the feedback',\n        #         'result': 6,\n        #     }\n        # ]\n        ```\n\n    Citations:\n        ```\n        @misc{kim2024prometheus2opensource,\n            title={Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models},\n            author={Seungone Kim and Juyoung Suk and Shayne Longpre and Bill Yuchen Lin and Jamin Shin and Sean Welleck and Graham Neubig and Moontae Lee and Kyungjae Lee and Minjoon Seo},\n            year={2024},\n            eprint={2405.01535},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2405.01535},\n        }\n        ```\n    \"\"\"\n\n    mode: Literal[\"absolute\", \"relative\"]\n    rubric: str\n    rubrics: Optional[Dict[str, str]] = Field(default=_DEFAULT_RUBRICS)\n    reference: bool = False\n\n    _template: Union[Template, None] = PrivateAttr(...)\n\n    @model_validator(mode=\"after\")\n    def validate_rubric_and_rubrics(self) -> Self:\n        if not isinstance(self.rubrics, dict) or len(self.rubrics) < 1:\n            raise DistilabelUserError(\n                \"Provided `rubrics` must be a Python dictionary with string keys and string values.\",\n                page=\"components-gallery/tasks/prometheuseval/\",\n            )\n\n        def rubric_matches_pattern(rubric: str) -> bool:\n            \"\"\"Checks if the provided rubric matches the pattern of the default rubrics.\"\"\"\n            pattern = r\"^\\[.*?\\]\\n(?:Score [1-4]: .*?\\n){4}(?:Score 5: .*?)\"\n            return bool(re.match(pattern, rubric, re.MULTILINE))\n\n        if not all(rubric_matches_pattern(value) for value in 
self.rubrics.values()):\n            raise DistilabelUserError(\n                \"Provided rubrics should match the format of the default rubrics, which\"\n                \" is as follows: `[<scoring criteria>]\\nScore 1: <description>\\nScore 2: <description>\\n\"\n                \"Score 3: <description>\\nScore 4: <description>\\nScore 5: <description>`; replacing\"\n                \" `<scoring criteria>` and `<description>` with the actual criteria and description\"\n                \" for each or the scores, respectively.\",\n                page=\"components-gallery/tasks/prometheuseval/\",\n            )\n\n        if self.rubric not in self.rubrics:\n            raise DistilabelUserError(\n                f\"Provided rubric '{self.rubric}' is not among the available rubrics: {', '.join(self.rubrics.keys())}.\",\n                page=\"components-gallery/tasks/prometheuseval/\",\n            )\n\n        return self\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template for Prometheus 2.0 either absolute or relative evaluation\n        depending on the `mode` value, and either with or without reference, depending on the\n        value of `reference`.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"prometheus\"\n            / (\n                f\"{self.mode}_without_reference.jinja2\"\n                if self.reference is False\n                else f\"{self.mode}_with_reference.jinja2\"\n            )\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The default inputs for the task are the `instruction` and the `generation`\n        if `reference=False`, otherwise, the inputs are `instruction`, `generation`, and\n        `reference`.\"\"\"\n        if self.mode == \"absolute\":\n            if 
self.reference:\n                return [\"instruction\", \"generation\", \"reference\"]\n            return [\"instruction\", \"generation\"]\n        else:\n            if self.reference:\n                return [\"instruction\", \"generations\", \"reference\"]\n            return [\"instruction\", \"generations\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType` where the prompt is formatted according\n        to the selected Jinja2 template for Prometheus 2.0, assuming that's the first interaction\n        from the user, including a pre-defined system prompt.\"\"\"\n        template_kwargs = {\n            \"instruction\": input[\"instruction\"],\n            \"rubric\": self.rubrics[self.rubric],\n        }\n        if self.reference:\n            template_kwargs[\"reference\"] = input[\"reference\"]\n\n        if self.mode == \"absolute\":\n            if not isinstance(input[\"generation\"], str):\n                raise DistilabelUserError(\n                    f\"Provided `generation` is of type {type(input['generation'])} but a string\"\n                    \" should be provided instead.\",\n                    page=\"components-gallery/tasks/prometheuseval/\",\n                )\n\n            template_kwargs[\"generation\"] = input[\"generation\"]\n            system_message = (\n                \"You are a fair judge assistant tasked with providing clear, objective feedback based\"\n                \" on specific criteria, ensuring each assessment reflects the absolute standards set\"\n                \" for performance.\"\n            )\n        else:  # self.mode == \"relative\"\n            if (\n                not isinstance(input[\"generations\"], list)\n                or not all(\n                    isinstance(generation, str) for generation in input[\"generations\"]\n                )\n                or len(input[\"generations\"]) != 2\n            ):\n                
raise DistilabelUserError(\n                    f\"Provided `generations` is of type {type(input['generations'])} but a list of strings with length 2 should be provided instead.\",\n                    page=\"components-gallery/tasks/prometheuseval/\",\n                )\n\n            template_kwargs[\"generations\"] = input[\"generations\"]\n            system_message = (\n                \"You are a fair judge assistant assigned to deliver insightful feedback that compares\"\n                \" individual performances, highlighting how each stands relative to others within the\"\n                \" same cohort.\"\n            )\n\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": system_message,\n            },\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(**template_kwargs),  # type: ignore\n            },\n        ]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task are the `feedback` and the `result` generated by Prometheus,\n        as well as the `model_name` which is automatically included based on the `LLM` used.\n        \"\"\"\n        return [\"feedback\", \"result\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a dict with the keys `feedback` and `result` captured\n        using a regex from the Prometheus output.\n\n        Args:\n            output: the raw output of the LLM.\n            input: the input to the task. 
Optionally provided in case it's useful to build the output.\n\n        Returns:\n            A dict with the keys `feedback` and `result` generated by the LLM.\n        \"\"\"\n        if output is None:\n            return {\"feedback\": None, \"result\": None}\n\n        parts = output.split(\"[RESULT]\")\n        if len(parts) != 2:\n            return {\"feedback\": None, \"result\": None}\n\n        feedback, result = parts[0].strip(), parts[1].strip()\n        if feedback.startswith(\"Feedback:\"):\n            feedback = feedback[len(\"Feedback:\") :].strip()\n        if self.mode == \"absolute\":\n            if not result.isdigit() or result not in [\"1\", \"2\", \"3\", \"4\", \"5\"]:\n                return {\"feedback\": None, \"result\": None}\n            return {\"feedback\": feedback, \"result\": int(result)}\n        else:  # self.mode == \"relative\"\n            if result not in [\"A\", \"B\"]:\n                return {\"feedback\": None, \"result\": None}\n            return {\"feedback\": feedback, \"result\": result}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.inputs","title":"inputs: List[str] property","text":"

The default inputs for the task are instruction and generation (or generations instead of generation when mode is relative); if reference=True, the reference column is also required.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.outputs","title":"outputs: List[str] property","text":"

The outputs for the task are the feedback and the result generated by Prometheus, as well as the model_name which is automatically included based on the LLM used.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.load","title":"load()","text":"

Loads the Jinja2 template for Prometheus 2.0 either absolute or relative evaluation depending on the mode value, and either with or without reference, depending on the value of reference.

Source code in src/distilabel/steps/tasks/prometheus_eval.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template for Prometheus 2.0 either absolute or relative evaluation\n    depending on the `mode` value, and either with or without reference, depending on the\n    value of `reference`.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"prometheus\"\n        / (\n            f\"{self.mode}_without_reference.jinja2\"\n            if self.reference is False\n            else f\"{self.mode}_with_reference.jinja2\"\n        )\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType where the prompt is formatted according to the selected Jinja2 template for Prometheus 2.0, assuming that's the first interaction from the user, including a pre-defined system prompt.

Source code in src/distilabel/steps/tasks/prometheus_eval.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType` where the prompt is formatted according\n    to the selected Jinja2 template for Prometheus 2.0, assuming that's the first interaction\n    from the user, including a pre-defined system prompt.\"\"\"\n    template_kwargs = {\n        \"instruction\": input[\"instruction\"],\n        \"rubric\": self.rubrics[self.rubric],\n    }\n    if self.reference:\n        template_kwargs[\"reference\"] = input[\"reference\"]\n\n    if self.mode == \"absolute\":\n        if not isinstance(input[\"generation\"], str):\n            raise DistilabelUserError(\n                f\"Provided `generation` is of type {type(input['generation'])} but a string\"\n                \" should be provided instead.\",\n                page=\"components-gallery/tasks/prometheuseval/\",\n            )\n\n        template_kwargs[\"generation\"] = input[\"generation\"]\n        system_message = (\n            \"You are a fair judge assistant tasked with providing clear, objective feedback based\"\n            \" on specific criteria, ensuring each assessment reflects the absolute standards set\"\n            \" for performance.\"\n        )\n    else:  # self.mode == \"relative\"\n        if (\n            not isinstance(input[\"generations\"], list)\n            or not all(\n                isinstance(generation, str) for generation in input[\"generations\"]\n            )\n            or len(input[\"generations\"]) != 2\n        ):\n            raise DistilabelUserError(\n                f\"Provided `generations` is of type {type(input['generations'])} but a list of strings with length 2 should be provided instead.\",\n                page=\"components-gallery/tasks/prometheuseval/\",\n            )\n\n        template_kwargs[\"generations\"] = input[\"generations\"]\n        system_message = (\n            \"You are a fair judge assistant assigned to deliver insightful feedback that 
compares\"\n            \" individual performances, highlighting how each stands relative to others within the\"\n            \" same cohort.\"\n        )\n\n    return [\n        {\n            \"role\": \"system\",\n            \"content\": system_message,\n        },\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(**template_kwargs),  # type: ignore\n        },\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.PrometheusEval.format_output","title":"format_output(output, input)","text":"

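The output is formatted as a dict with the keys feedback and result, obtained by splitting the Prometheus output on the [RESULT] marker. Both values are None when the output is missing, does not contain exactly one [RESULT] marker, or the result is not a valid score (1-5 in absolute mode, A or B in relative mode).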

Parameters:

Name Type Description Default output Union[str, None]

the raw output of the LLM.

required input Dict[str, Any]

the input to the task. Optionally provided in case it's useful to build the output.

required

Returns:

Type Description Dict[str, Any]

A dict with the keys feedback and result generated by the LLM.

Source code in src/distilabel/steps/tasks/prometheus_eval.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a dict with the keys `feedback` and `result` captured\n    using a regex from the Prometheus output.\n\n    Args:\n        output: the raw output of the LLM.\n        input: the input to the task. Optionally provided in case it's useful to build the output.\n\n    Returns:\n        A dict with the keys `feedback` and `result` generated by the LLM.\n    \"\"\"\n    if output is None:\n        return {\"feedback\": None, \"result\": None}\n\n    parts = output.split(\"[RESULT]\")\n    if len(parts) != 2:\n        return {\"feedback\": None, \"result\": None}\n\n    feedback, result = parts[0].strip(), parts[1].strip()\n    if feedback.startswith(\"Feedback:\"):\n        feedback = feedback[len(\"Feedback:\") :].strip()\n    if self.mode == \"absolute\":\n        if not result.isdigit() or result not in [\"1\", \"2\", \"3\", \"4\", \"5\"]:\n            return {\"feedback\": None, \"result\": None}\n        return {\"feedback\": feedback, \"result\": int(result)}\n    else:  # self.mode == \"relative\"\n        if result not in [\"A\", \"B\"]:\n            return {\"feedback\": None, \"result\": None}\n        return {\"feedback\": feedback, \"result\": result}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer","title":"QualityScorer","text":"

Bases: Task

Score responses based on their quality using an LLM.

QualityScorer is a pre-defined task that defines the instruction and responses as the inputs and scores as the output. This task is used to rate the quality of instructions and responses. It's an implementation of the quality score task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. The task follows the same scheme as the Complexity Scorer, but the instruction-response pairs are scored in terms of quality, obtaining a quality score for each pair.

Attributes:

Name Type Description _template Union[Template, None]

a Jinja2 template used to format the input for the LLM.

Input columns
  • instruction (str): The instruction that was used to generate the responses.
  • responses (List[str]): The responses to be scored. Each response forms a pair with the instruction.
Output columns
  • scores (List[float]): The score for each instruction-response pair.
  • model_name (str): The model name used to generate the scores.
Categories
  • scorer
  • quality
  • response
References
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning

Examples:

Evaluate the quality of your instructions:

from distilabel.steps.tasks import QualityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = QualityScorer(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    )\n)\n\nscorer.load()\n\nresult = next(\n    scorer.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n            }\n        ]\n    )\n)\n# result\n[\n    {\n        'instructions': 'instruction',\n        'model_name': 'test',\n        'scores': [5, 3, 1],\n    }\n]\n

Generate structured output with default schema:

from distilabel.steps.tasks import QualityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\nscorer = QualityScorer(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    use_default_structured_output=True\n)\n\nscorer.load()\n\nresult = next(\n    scorer.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n            }\n        ]\n    )\n)\n\n# result\n[{'instruction': 'instruction',\n'responses': ['good response', 'weird response', 'bad response'],\n'scores': [1, 2, 3],\n'distilabel_metadata': {'raw_output_quality_scorer_0': '{  \"scores\": [1, 2, 3] }'},\n'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
Citations
@misc{liu2024makesgooddataalignment,\n    title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n    author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n    year={2024},\n    eprint={2312.15685},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2312.15685},\n}\n
Source code in src/distilabel/steps/tasks/quality_scorer.py
class QualityScorer(Task):\n    \"\"\"Score responses based on their quality using an `LLM`.\n\n    `QualityScorer` is a pre-defined task that defines the `instruction` as the input\n    and `score` as the output. This task is used to rate the quality of instructions and responses.\n    It's an implementation of the quality score task from the paper 'What Makes Good Data\n    for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.\n    The task follows the same scheme as the Complexity Scorer, but the instruction-response pairs\n    are scored in terms of quality, obtaining a quality score for each instruction.\n\n    Attributes:\n        _template: a Jinja2 template used to format the input for the LLM.\n\n    Input columns:\n        - instruction (`str`): The instruction that was used to generate the `responses`.\n        - responses (`List[str]`): The responses to be scored. Each response forms a pair with the instruction.\n\n    Output columns:\n        - scores (`List[float]`): The score for each instruction.\n        - model_name (`str`): The model name used to generate the scores.\n\n    Categories:\n        - scorer\n        - quality\n        - response\n\n    References:\n        - [`What Makes Good Data for Alignment? 
A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685)\n\n    Examples:\n        Evaluate the quality of your instructions:\n\n        ```python\n        from distilabel.steps.tasks import QualityScorer\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        scorer = QualityScorer(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            )\n        )\n\n        scorer.load()\n\n        result = next(\n            scorer.process(\n                [\n                    {\n                        \"instruction\": \"instruction\",\n                        \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n                    }\n                ]\n            )\n        )\n        # result\n        [\n            {\n                'instructions': 'instruction',\n                'model_name': 'test',\n                'scores': [5, 3, 1],\n            }\n        ]\n        ```\n\n        Generate structured output with default schema:\n\n        ```python\n        from distilabel.steps.tasks import QualityScorer\n        from distilabel.models import InferenceEndpointsLLM\n\n        scorer = QualityScorer(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            use_default_structured_output=True\n        )\n\n        scorer.load()\n\n        result = next(\n            scorer.process(\n                [\n                    {\n                        \"instruction\": \"instruction\",\n                        \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n                    }\n                ]\n            )\n        )\n\n        # result\n        [{'instruction': 'instruction',\n        'responses': ['good response', 'weird response', 'bad 
response'],\n        'scores': [1, 2, 3],\n        'distilabel_metadata': {'raw_output_quality_scorer_0': '{  \"scores\": [1, 2, 3] }'},\n        'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n\n    Citations:\n        ```\n        @misc{liu2024makesgooddataalignment,\n            title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},\n            author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},\n            year={2024},\n            eprint={2312.15685},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2312.15685},\n        }\n        ```\n    \"\"\"\n\n    _template: Union[Template, None] = PrivateAttr(...)\n    _can_be_used_with_offline_batch_generation = True\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"quality-scorer.jinja2\"\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The inputs for the task are `instruction` and `responses`.\"\"\"\n        return [\"instruction\", \"responses\"]\n\n    def format_input(self, input: Dict[str, Any]) -> ChatType:  # type: ignore\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    instruction=input[\"instruction\"], responses=input[\"responses\"]\n                ),\n            }\n        ]\n\n    @property\n    def outputs(self):\n        \"\"\"The output for the task is 
a list of `scores` containing the quality score for each\n        response in `responses`.\"\"\"\n        return [\"scores\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a list with the score of each instruction-response pair.\n\n        Args:\n            output: the raw output of the LLM.\n            input: the input to the task. Used for obtaining the number of responses.\n\n        Returns:\n            A dict with the key `scores` containing the scores for each instruction-response pair.\n        \"\"\"\n        if output is None:\n            return {\"scores\": [None] * len(input[\"responses\"])}\n\n        if self.use_default_structured_output:\n            return self._format_structured_output(output, input)\n\n        scores = []\n        score_lines = output.split(\"\\n\")\n\n        for i, line in enumerate(score_lines):\n            match = _PARSE_SCORE_LINE_REGEX.match(line)\n            score = float(match.group(1)) if match else None\n            scores.append(score)\n            if i == len(input[\"responses\"]) - 1:\n                break\n        return {\"scores\": scores}\n\n    @override\n    def get_structured_output(self) -> Dict[str, Any]:\n        \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n        a dictionary with the output which can be directly parsed as a python dictionary.\n\n        The schema corresponds to the following:\n\n        ```python\n        from pydantic import BaseModel\n        from typing import List\n\n        class SchemaQualityScorer(BaseModel):\n            scores: List[int]\n        ```\n\n        Returns:\n            JSON Schema of the response to enforce.\n        \"\"\"\n        return {\n            \"properties\": {\n                \"scores\": {\n                    \"items\": {\"type\": \"integer\"},\n                    \"title\": 
\"Scores\",\n                    \"type\": \"array\",\n                }\n            },\n            \"required\": [\"scores\"],\n            \"title\": \"SchemaQualityScorer\",\n            \"type\": \"object\",\n        }\n\n    def _format_structured_output(\n        self, output: str, input: Dict[str, Any]\n    ) -> Dict[str, str]:\n        \"\"\"Parses the structured response, which should correspond to a dictionary\n        with the scores, and a list with them.\n\n        Args:\n            output: The output from the `LLM`.\n\n        Returns:\n            Formatted output.\n        \"\"\"\n        try:\n            return orjson.loads(output)\n        except orjson.JSONDecodeError:\n            return {\"scores\": [None] * len(input[\"responses\"])}\n\n    @override\n    def _sample_input(self) -> ChatType:\n        return self.format_input(\n            {\n                \"instruction\": f\"<PLACEHOLDER_{'instruction'.upper()}>\",\n                \"responses\": [\n                    f\"<PLACEHOLDER_{f'RESPONSE_{i}'.upper()}>\" for i in range(2)\n                ],\n            }\n        )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.inputs","title":"inputs: List[str] property","text":"

The inputs for the task are instruction and responses.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.outputs","title":"outputs property","text":"

The output for the task is a list of scores containing the quality score for each response in responses.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.load","title":"load()","text":"

Loads the Jinja2 template.

Source code in src/distilabel/steps/tasks/quality_scorer.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"quality-scorer.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/quality_scorer.py
def format_input(self, input: Dict[str, Any]) -> ChatType:  # type: ignore\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    return [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(  # type: ignore\n                instruction=input[\"instruction\"], responses=input[\"responses\"]\n            ),\n        }\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.format_output","title":"format_output(output, input)","text":"

The output is formatted as a list with the score of each instruction-response pair.

Parameters:

Name Type Description Default output Union[str, None]

the raw output of the LLM.

required input Dict[str, Any]

the input to the task. Used for obtaining the number of responses.

required

Returns:

Type Description Dict[str, Any]

A dict with the key scores containing the scores for each instruction-response pair.

Source code in src/distilabel/steps/tasks/quality_scorer.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a list with the score of each instruction-response pair.\n\n    Args:\n        output: the raw output of the LLM.\n        input: the input to the task. Used for obtaining the number of responses.\n\n    Returns:\n        A dict with the key `scores` containing the scores for each instruction-response pair.\n    \"\"\"\n    if output is None:\n        return {\"scores\": [None] * len(input[\"responses\"])}\n\n    if self.use_default_structured_output:\n        return self._format_structured_output(output, input)\n\n    scores = []\n    score_lines = output.split(\"\\n\")\n\n    for i, line in enumerate(score_lines):\n        match = _PARSE_SCORE_LINE_REGEX.match(line)\n        score = float(match.group(1)) if match else None\n        scores.append(score)\n        if i == len(input[\"responses\"]) - 1:\n            break\n    return {\"scores\": scores}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer.get_structured_output","title":"get_structured_output()","text":"

Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary.

The schema corresponds to the following:

from pydantic import BaseModel\nfrom typing import List\n\nclass SchemaQualityScorer(BaseModel):\n    scores: List[int]\n

Returns:

Type Description Dict[str, Any]

JSON Schema of the response to enforce.

Source code in src/distilabel/steps/tasks/quality_scorer.py
@override\ndef get_structured_output(self) -> Dict[str, Any]:\n    \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n    a dictionary with the output which can be directly parsed as a python dictionary.\n\n    The schema corresponds to the following:\n\n    ```python\n    from pydantic import BaseModel\n    from typing import List\n\n    class SchemaQualityScorer(BaseModel):\n        scores: List[int]\n    ```\n\n    Returns:\n        JSON Schema of the response to enforce.\n    \"\"\"\n    return {\n        \"properties\": {\n            \"scores\": {\n                \"items\": {\"type\": \"integer\"},\n                \"title\": \"Scores\",\n                \"type\": \"array\",\n            }\n        },\n        \"required\": [\"scores\"],\n        \"title\": \"SchemaQualityScorer\",\n        \"type\": \"object\",\n    }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.QualityScorer._format_structured_output","title":"_format_structured_output(output, input)","text":"

Parses the structured response, which should correspond to a dictionary with a scores key containing the list of scores.

Parameters:

Name Type Description Default output str

The output from the LLM.

required

Returns:

Type Description Dict[str, str]

Formatted output.

Source code in src/distilabel/steps/tasks/quality_scorer.py
def _format_structured_output(\n    self, output: str, input: Dict[str, Any]\n) -> Dict[str, str]:\n    \"\"\"Parses the structured response, which should correspond to a dictionary\n    with the scores, and a list with them.\n\n    Args:\n        output: The output from the `LLM`.\n\n    Returns:\n        Formatted output.\n    \"\"\"\n    try:\n        return orjson.loads(output)\n    except orjson.JSONDecodeError:\n        return {\"scores\": [None] * len(input[\"responses\"])}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct","title":"SelfInstruct","text":"

Bases: Task

Generate instructions based on a given input using an LLM.

SelfInstruct is a pre-defined task that, given a number of instructions, certain criteria for query generation, an application description, and an input, generates a number of instructions related to the given input, following what is stated in the criteria for query generation and the application description. It is based on the SelfInstruct framework from the paper \"Self-Instruct: Aligning Language Models with Self-Generated Instructions\".

Attributes:

Name Type Description num_instructions int

The number of instructions to be generated. Defaults to 5.

criteria_for_query_generation str

The criteria for the query generation. Defaults to the criteria defined within the paper.

application_description str

The description of the AI application that one wants to build with these instructions. Defaults to AI assistant.

Input columns
  • input (str): The input to generate the instructions. It's also called seed in the paper.
Output columns
  • instructions (List[str]): The generated instructions.
  • model_name (str): The model name used to generate the instructions.
Categories
  • text-generation
Reference
  • Self-Instruct: Aligning Language Models with Self-Generated Instructions

Examples:

Generate instructions based on a given input:

from distilabel.steps.tasks import SelfInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\nself_instruct = SelfInstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_instructions=5,  # This is the default value\n)\n\nself_instruct.load()\n\nresult = next(self_instruct.process([{\"input\": \"instruction\"}]))\n# result\n# [\n#     {\n#         'input': 'instruction',\n#         'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n#         'instructions': [\"instruction 1\", \"instruction 2\", \"instruction 3\", \"instruction 4\", \"instruction 5\"],\n#     }\n# ]\n
Citations
@misc{wang2023selfinstructaligninglanguagemodels,\n    title={Self-Instruct: Aligning Language Models with Self-Generated Instructions},\n    author={Yizhong Wang and Yeganeh Kordi and Swaroop Mishra and Alisa Liu and Noah A. Smith and Daniel Khashabi and Hannaneh Hajishirzi},\n    year={2023},\n    eprint={2212.10560},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2212.10560},\n}\n
Source code in src/distilabel/steps/tasks/self_instruct.py
class SelfInstruct(Task):\n    \"\"\"Generate instructions based on a given input using an `LLM`.\n\n    `SelfInstruct` is a pre-defined task that, given a number of instructions, a\n    certain criteria for query generations, an application description, and an input,\n    generates a number of instruction related to the given input and following what\n    is stated in the criteria for query generation and the application description.\n    It is based in the SelfInstruct framework from the paper \"Self-Instruct: Aligning\n    Language Models with Self-Generated Instructions\".\n\n    Attributes:\n        num_instructions: The number of instructions to be generated. Defaults to 5.\n        criteria_for_query_generation: The criteria for the query generation. Defaults\n            to the criteria defined within the paper.\n        application_description: The description of the AI application that one want\n            to build with these instructions. Defaults to `AI assistant`.\n\n    Input columns:\n        - input (`str`): The input to generate the instructions. 
It's also called seed in\n            the paper.\n\n    Output columns:\n        - instructions (`List[str]`): The generated instructions.\n        - model_name (`str`): The model name used to generate the instructions.\n\n    Categories:\n        - text-generation\n\n    Reference:\n        - [`Self-Instruct: Aligning Language Models with Self-Generated Instructions`](https://arxiv.org/abs/2212.10560)\n\n    Examples:\n        Generate instructions based on a given input:\n\n        ```python\n        from distilabel.steps.tasks import SelfInstruct\n        from distilabel.models import InferenceEndpointsLLM\n\n        self_instruct = SelfInstruct(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            num_instructions=5,  # This is the default value\n        )\n\n        self_instruct.load()\n\n        result = next(self_instruct.process([{\"input\": \"instruction\"}]))\n        # result\n        # [\n        #     {\n        #         'input': 'instruction',\n        #         'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n        #         'instructions': [\"instruction 1\", \"instruction 2\", \"instruction 3\", \"instruction 4\", \"instruction 5\"],\n        #     }\n        # ]\n        ```\n\n    Citations:\n        ```\n        @misc{wang2023selfinstructaligninglanguagemodels,\n            title={Self-Instruct: Aligning Language Models with Self-Generated Instructions},\n            author={Yizhong Wang and Yeganeh Kordi and Swaroop Mishra and Alisa Liu and Noah A. 
Smith and Daniel Khashabi and Hannaneh Hajishirzi},\n            year={2023},\n            eprint={2212.10560},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2212.10560},\n        }\n        ```\n    \"\"\"\n\n    num_instructions: int = 5\n    criteria_for_query_generation: str = (\n        \"Incorporate a diverse range of verbs, avoiding repetition.\\n\"\n        \"Ensure queries are compatible with AI model's text generation functions and are limited to 1-2 sentences.\\n\"\n        \"Design queries to be self-contained and standalone.\\n\"\n        'Blend interrogative (e.g., \"What is the significance of x?\") and imperative (e.g., \"Detail the process of x.\") styles.'\n    )\n    application_description: str = \"AI assistant\"\n\n    _template: Union[Template, None] = PrivateAttr(...)\n    _can_be_used_with_offline_batch_generation = True\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"self-instruct.jinja2\"\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task is the `input` i.e. 
seed text.\"\"\"\n        return [\"input\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(\n                    input=input[\"input\"],\n                    application_description=self.application_description,\n                    criteria_for_query_generation=self.criteria_for_query_generation,\n                    num_instructions=self.num_instructions,\n                ),\n            }\n        ]\n\n    @property\n    def outputs(self):\n        \"\"\"The output for the task is a list of `instructions` containing the generated instructions.\"\"\"\n        return [\"instructions\", \"model_name\"]\n\n    def format_output(\n        self,\n        output: Union[str, None],\n        input: Optional[Dict[str, Any]] = None,\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a list with the generated instructions.\n\n        Args:\n            output: the raw output of the LLM.\n            input: the input to the task. Used for obtaining the number of responses.\n\n        Returns:\n            A dict with containing the generated instructions.\n        \"\"\"\n        if output is None:\n            return {\"instructions\": []}\n        return {\"instructions\": [line for line in output.split(\"\\n\") if line != \"\"]}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.inputs","title":"inputs: List[str] property","text":"

The input for the task is the input i.e. seed text.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.outputs","title":"outputs property","text":"

The output for the task is a list of instructions containing the generated instructions.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.load","title":"load()","text":"

Loads the Jinja2 template.

Source code in src/distilabel/steps/tasks/self_instruct.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"self-instruct.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/self_instruct.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    return [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(\n                input=input[\"input\"],\n                application_description=self.application_description,\n                criteria_for_query_generation=self.criteria_for_query_generation,\n                num_instructions=self.num_instructions,\n            ),\n        }\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.SelfInstruct.format_output","title":"format_output(output, input=None)","text":"

The output is formatted as a list with the generated instructions.

Parameters:

Name Type Description Default output Union[str, None]

the raw output of the LLM.

required input Optional[Dict[str, Any]]

the input to the task. Not used by this task when formatting the output.

None

Returns:

Type Description Dict[str, Any]

A dict containing the generated instructions.

Source code in src/distilabel/steps/tasks/self_instruct.py
def format_output(\n    self,\n    output: Union[str, None],\n    input: Optional[Dict[str, Any]] = None,\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a list with the generated instructions.\n\n    Args:\n        output: the raw output of the LLM.\n        input: the input to the task. Used for obtaining the number of responses.\n\n    Returns:\n        A dict with containing the generated instructions.\n    \"\"\"\n    if output is None:\n        return {\"instructions\": []}\n    return {\"instructions\": [line for line in output.split(\"\\n\") if line != \"\"]}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair","title":"GenerateSentencePair","text":"

Bases: Task

Generate a positive and (optionally) a negative sentence given an anchor sentence.

GenerateSentencePair is a pre-defined task that, given an anchor sentence, generates a positive sentence related to the anchor and optionally a negative sentence unrelated to the anchor or similar to it. Optionally, you can give a context to guide the LLM towards more specific behavior. This task is useful to generate training datasets for training embedding models.

Attributes:

Name Type Description triplet bool

a flag to indicate if the task should generate a triplet of sentences (anchor, positive, negative). Defaults to False.

action GenerationAction

the action to perform to generate the positive sentence.

context str

the context to use for the generation. Can be helpful to guide the LLM towards more specific context. Not used by default.

hard_negative bool

A flag to indicate if the negative should be a hard-negative or not. Hard negatives make it hard for the model to distinguish against the positive, with a higher degree of semantic similarity.

Input columns
  • anchor (str): The anchor sentence to generate the positive and negative sentences.
Output columns
  • positive (str): The positive sentence related to the anchor.
  • negative (str): The negative sentence unrelated to the anchor if triplet=True, or more similar to the positive to make it more challenging for a model to distinguish in case hard_negative=True.
  • model_name (str): The name of the model that was used to generate the sentences.
Categories
  • embedding

Examples:

Paraphrasing:

from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"paraphrase\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n

Generating semantically similar sentences:

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import GenerateSentencePair\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"semantically-similar\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"How does 3D printing work?\"}])\n

Generating queries:

from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"query\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"Argilla is an open-source data curation platform for LLMs. Using Argilla, ...\"}])\n

Generating answers:

from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"answer\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n

Generating queries with context (applies to every action):

from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"query\",\n    context=\"Argilla is an open-source data curation platform for LLMs.\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n

Generating Hard-negatives (applies to every action):

from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"query\",\n    context=\"Argilla is an open-source data curation platform for LLMs.\",\n    hard_negative=True,\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n

Generating structured data with default schema (applies to every action):

from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"query\",\n    context=\"Argilla is an open-source data curation platform for LLMs.\",\n    hard_negative=True,\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n    use_default_structured_output=True\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n
Source code in src/distilabel/steps/tasks/sentence_transformers.py
class GenerateSentencePair(Task):\n    \"\"\"Generate a positive and negative (optionally) sentences given an anchor sentence.\n\n    `GenerateSentencePair` is a pre-defined task that given an anchor sentence generates\n    a positive sentence related to the anchor and optionally a negative sentence unrelated\n    to the anchor or similar to it. Optionally, you can give a context to guide the LLM\n    towards more specific behavior. This task is useful to generate training datasets for\n    training embeddings models.\n\n    Attributes:\n        triplet: a flag to indicate if the task should generate a triplet of sentences\n            (anchor, positive, negative). Defaults to `False`.\n        action: the action to perform to generate the positive sentence.\n        context: the context to use for the generation. Can be helpful to guide the LLM\n            towards more specific context. Not used by default.\n        hard_negative: A flag to indicate if the negative should be a hard-negative or not.\n            Hard negatives make it hard for the model to distinguish against the positive,\n            with a higher degree of semantic similarity.\n\n    Input columns:\n        - anchor (`str`): The anchor sentence to generate the positive and negative sentences.\n\n    Output columns:\n        - positive (`str`): The positive sentence related to the `anchor`.\n        - negative (`str`): The negative sentence unrelated to the `anchor` if `triplet=True`,\n            or more similar to the positive to make it more challenging for a model to distinguish\n            in case `hard_negative=True`.\n        - model_name (`str`): The name of the model that was used to generate the sentences.\n\n    Categories:\n        - embedding\n\n    Examples:\n        Paraphrasing:\n\n        ```python\n        from distilabel.steps.tasks import GenerateSentencePair\n        from distilabel.models import InferenceEndpointsLLM\n\n        generate_sentence_pair = 
GenerateSentencePair(\n            triplet=True, # `False` to generate only positive\n            action=\"paraphrase\",\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            input_batch_size=10,\n        )\n\n        generate_sentence_pair.load()\n\n        result = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n        ```\n\n        Generating semantically similar sentences:\n\n        ```python\n        from distilabel.models import InferenceEndpointsLLM\n        from distilabel.steps.tasks import GenerateSentencePair\n\n        generate_sentence_pair = GenerateSentencePair(\n            triplet=True, # `False` to generate only positive\n            action=\"semantically-similar\",\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            input_batch_size=10,\n        )\n\n        generate_sentence_pair.load()\n\n        result = generate_sentence_pair.process([{\"anchor\": \"How does 3D printing work?\"}])\n        ```\n\n        Generating queries:\n\n        ```python\n        from distilabel.steps.tasks import GenerateSentencePair\n        from distilabel.models import InferenceEndpointsLLM\n\n        generate_sentence_pair = GenerateSentencePair(\n            triplet=True, # `False` to generate only positive\n            action=\"query\",\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            input_batch_size=10,\n        )\n\n        generate_sentence_pair.load()\n\n        result = 
generate_sentence_pair.process([{\"anchor\": \"Argilla is an open-source data curation platform for LLMs. Using Argilla, ...\"}])\n        ```\n\n        Generating answers:\n\n        ```python\n        from distilabel.steps.tasks import GenerateSentencePair\n        from distilabel.models import InferenceEndpointsLLM\n\n        generate_sentence_pair = GenerateSentencePair(\n            triplet=True, # `False` to generate only positive\n            action=\"answer\",\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            input_batch_size=10,\n        )\n\n        generate_sentence_pair.load()\n\n        result = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n        ```\n\n        Generating queries with context (**applies to every action**):\n\n        ```python\n        from distilabel.steps.tasks import GenerateSentencePair\n        from distilabel.models import InferenceEndpointsLLM\n\n        generate_sentence_pair = GenerateSentencePair(\n            triplet=True, # `False` to generate only positive\n            action=\"query\",\n            context=\"Argilla is an open-source data curation platform for LLMs.\",\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            input_batch_size=10,\n        )\n\n        generate_sentence_pair.load()\n\n        result = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n        ```\n\n        Generating Hard-negatives (**applies to every action**):\n\n        ```python\n        from distilabel.steps.tasks import GenerateSentencePair\n        from distilabel.models import 
InferenceEndpointsLLM\n\n        generate_sentence_pair = GenerateSentencePair(\n            triplet=True, # `False` to generate only positive\n            action=\"query\",\n            context=\"Argilla is an open-source data curation platform for LLMs.\",\n            hard_negative=True,\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            input_batch_size=10,\n        )\n\n        generate_sentence_pair.load()\n\n        result = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n        ```\n\n        Generating structured data with default schema (**applies to every action**):\n\n        ```python\n        from distilabel.steps.tasks import GenerateSentencePair\n        from distilabel.models import InferenceEndpointsLLM\n\n        generate_sentence_pair = GenerateSentencePair(\n            triplet=True, # `False` to generate only positive\n            action=\"query\",\n            context=\"Argilla is an open-source data curation platform for LLMs.\",\n            hard_negative=True,\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            input_batch_size=10,\n            use_default_structured_output=True\n        )\n\n        generate_sentence_pair.load()\n\n        result = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n        ```\n    \"\"\"\n\n    triplet: bool = False\n    action: GenerationAction\n    hard_negative: bool = False\n    context: str = \"\"\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / 
\"generate-sentence-pair.jinja2\"\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The inputs for the task is the `anchor` sentence.\"\"\"\n        return [\"anchor\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The inputs are formatted as a `ChatType`, with a system prompt describing the\n        task of generating a positive and negative sentences for the anchor sentence. The\n        anchor is provided as the first user interaction in the conversation.\n\n        Args:\n            input: The input containing the `anchor` sentence.\n\n        Returns:\n            A list of dictionaries containing the system and user interactions.\n        \"\"\"\n        action_sentence = GENERATION_ACTION_SENTENCES[self.action]\n\n        format_system_prompt = {\n            \"action_sentence\": action_sentence,\n            \"context\": CONTEXT_INTRO if self.context else \"\",\n        }\n        if self.triplet:\n            format_system_prompt[\"negative_style\"] = NEGATIVE_STYLE[\n                \"hard-negative\" if self.hard_negative else \"negative\"\n            ]\n\n        system_prompt = (\n            POSITIVE_NEGATIVE_SYSTEM_PROMPT if self.triplet else POSITIVE_SYSTEM_PROMPT\n        ).format(**format_system_prompt)\n\n        return [\n            {\"role\": \"system\", \"content\": system_prompt},\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(\n                    anchor=input[\"anchor\"],\n                    context=self.context if self.context else None,\n                ),\n            },\n        ]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The outputs for the task are the `positive` and `negative` sentences, as well\n        as the `model_name` used to generate the sentences.\"\"\"\n        columns = [\"positive\", \"negative\"] if self.triplet 
else [\"positive\"]\n        columns += [\"model_name\"]\n        return columns\n\n    def format_output(\n        self, output: Union[str, None], input: Optional[Dict[str, Any]] = None\n    ) -> Dict[str, Any]:\n        \"\"\"Formats the output of the LLM, to extract the `positive` and `negative` sentences\n        generated. If the output is `None` or the regex doesn't match, then the outputs\n        will be set to `None` as well.\n\n        Args:\n            output: The output of the LLM.\n            input: The input used to generate the output.\n\n        Returns:\n            The formatted output containing the `positive` and `negative` sentences.\n        \"\"\"\n        if output is None:\n            return {\"positive\": None, \"negative\": None}\n\n        if self.use_default_structured_output:\n            return self._format_structured_output(output)\n\n        match = POSITIVE_NEGATIVE_PAIR_REGEX.match(output)\n        if match is None:\n            formatted_output = {\"positive\": None}\n            if self.triplet:\n                formatted_output[\"negative\"] = None\n            return formatted_output\n\n        groups = match.groups()\n        if self.triplet:\n            return {\n                \"positive\": groups[0].strip(),\n                \"negative\": (\n                    groups[1].strip()\n                    if len(groups) > 1 and groups[1] is not None\n                    else None\n                ),\n            }\n\n        return {\"positive\": groups[0].strip()}\n\n    @override\n    def get_structured_output(self) -> Dict[str, Any]:\n        \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n        a dictionary with the output which can be directly parsed as a python dictionary.\n\n        Returns:\n            JSON Schema of the response to enforce.\n        \"\"\"\n        if self.triplet:\n            return {\n                \"properties\": {\n                    \"positive\": 
{\"title\": \"Positive\", \"type\": \"string\"},\n                    \"negative\": {\"title\": \"Negative\", \"type\": \"string\"},\n                },\n                \"required\": [\"positive\", \"negative\"],\n                \"title\": \"Schema\",\n                \"type\": \"object\",\n            }\n        return {\n            \"properties\": {\"positive\": {\"title\": \"Positive\", \"type\": \"string\"}},\n            \"required\": [\"positive\"],\n            \"title\": \"Schema\",\n            \"type\": \"object\",\n        }\n\n    def _format_structured_output(self, output: str) -> Dict[str, str]:\n        \"\"\"Parses the structured response, which should correspond to a dictionary\n        with either `positive`, or `positive` and `negative` keys.\n\n        Args:\n            output: The output from the `LLM`.\n\n        Returns:\n            Formatted output.\n        \"\"\"\n        try:\n            return orjson.loads(output)\n        except orjson.JSONDecodeError:\n            if self.triplet:\n                return {\"positive\": None, \"negative\": None}\n            return {\"positive\": None}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.inputs","title":"inputs: List[str] property","text":"

The input for the task is the anchor sentence.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.outputs","title":"outputs: List[str] property","text":"

The outputs for the task are the positive and negative sentences, as well as the model_name used to generate the sentences.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.load","title":"load()","text":"

Loads the Jinja2 template.

Source code in src/distilabel/steps/tasks/sentence_transformers.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"generate-sentence-pair.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.format_input","title":"format_input(input)","text":"

The inputs are formatted as a ChatType, with a system prompt describing the task of generating positive and negative sentences for the anchor sentence. The anchor is provided as the first user interaction in the conversation.

Parameters:

Name Type Description Default input Dict[str, Any]

The input containing the anchor sentence.

required

Returns:

Type Description ChatType

A list of dictionaries containing the system and user interactions.

Source code in src/distilabel/steps/tasks/sentence_transformers.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The inputs are formatted as a `ChatType`, with a system prompt describing the\n    task of generating a positive and negative sentences for the anchor sentence. The\n    anchor is provided as the first user interaction in the conversation.\n\n    Args:\n        input: The input containing the `anchor` sentence.\n\n    Returns:\n        A list of dictionaries containing the system and user interactions.\n    \"\"\"\n    action_sentence = GENERATION_ACTION_SENTENCES[self.action]\n\n    format_system_prompt = {\n        \"action_sentence\": action_sentence,\n        \"context\": CONTEXT_INTRO if self.context else \"\",\n    }\n    if self.triplet:\n        format_system_prompt[\"negative_style\"] = NEGATIVE_STYLE[\n            \"hard-negative\" if self.hard_negative else \"negative\"\n        ]\n\n    system_prompt = (\n        POSITIVE_NEGATIVE_SYSTEM_PROMPT if self.triplet else POSITIVE_SYSTEM_PROMPT\n    ).format(**format_system_prompt)\n\n    return [\n        {\"role\": \"system\", \"content\": system_prompt},\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(\n                anchor=input[\"anchor\"],\n                context=self.context if self.context else None,\n            ),\n        },\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.format_output","title":"format_output(output, input=None)","text":"

Formats the output of the LLM, to extract the positive and negative sentences generated. If the output is None or the regex doesn't match, then the outputs will be set to None as well.

Parameters:

Name Type Description Default output Union[str, None]

The output of the LLM.

required input Optional[Dict[str, Any]]

The input used to generate the output.

None

Returns:

Type Description Dict[str, Any]

The formatted output containing the positive and negative sentences.

Source code in src/distilabel/steps/tasks/sentence_transformers.py
def format_output(\n    self, output: Union[str, None], input: Optional[Dict[str, Any]] = None\n) -> Dict[str, Any]:\n    \"\"\"Formats the output of the LLM, to extract the `positive` and `negative` sentences\n    generated. If the output is `None` or the regex doesn't match, then the outputs\n    will be set to `None` as well.\n\n    Args:\n        output: The output of the LLM.\n        input: The input used to generate the output.\n\n    Returns:\n        The formatted output containing the `positive` and `negative` sentences.\n    \"\"\"\n    if output is None:\n        return {\"positive\": None, \"negative\": None}\n\n    if self.use_default_structured_output:\n        return self._format_structured_output(output)\n\n    match = POSITIVE_NEGATIVE_PAIR_REGEX.match(output)\n    if match is None:\n        formatted_output = {\"positive\": None}\n        if self.triplet:\n            formatted_output[\"negative\"] = None\n        return formatted_output\n\n    groups = match.groups()\n    if self.triplet:\n        return {\n            \"positive\": groups[0].strip(),\n            \"negative\": (\n                groups[1].strip()\n                if len(groups) > 1 and groups[1] is not None\n                else None\n            ),\n        }\n\n    return {\"positive\": groups[0].strip()}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair.get_structured_output","title":"get_structured_output()","text":"

Creates the JSON schema to be passed to the LLM, to enforce generating a dictionary with the output, which can be directly parsed as a Python dictionary.

Returns:

Type Description Dict[str, Any]

JSON Schema of the response to enforce.

Source code in src/distilabel/steps/tasks/sentence_transformers.py
@override\ndef get_structured_output(self) -> Dict[str, Any]:\n    \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n    a dictionary with the output which can be directly parsed as a python dictionary.\n\n    Returns:\n        JSON Schema of the response to enforce.\n    \"\"\"\n    if self.triplet:\n        return {\n            \"properties\": {\n                \"positive\": {\"title\": \"Positive\", \"type\": \"string\"},\n                \"negative\": {\"title\": \"Negative\", \"type\": \"string\"},\n            },\n            \"required\": [\"positive\", \"negative\"],\n            \"title\": \"Schema\",\n            \"type\": \"object\",\n        }\n    return {\n        \"properties\": {\"positive\": {\"title\": \"Positive\", \"type\": \"string\"}},\n        \"required\": [\"positive\"],\n        \"title\": \"Schema\",\n        \"type\": \"object\",\n    }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.GenerateSentencePair._format_structured_output","title":"_format_structured_output(output)","text":"

Parses the structured response, which should correspond to a dictionary with either positive, or positive and negative keys.

Parameters:

Name Type Description Default output str

The output from the LLM.

required

Returns:

Type Description Dict[str, str]

Formatted output.

Source code in src/distilabel/steps/tasks/sentence_transformers.py
def _format_structured_output(self, output: str) -> Dict[str, str]:\n    \"\"\"Parses the structured response, which should correspond to a dictionary\n    with either `positive`, or `positive` and `negative` keys.\n\n    Args:\n        output: The output from the `LLM`.\n\n    Returns:\n        Formatted output.\n    \"\"\"\n    try:\n        return orjson.loads(output)\n    except orjson.JSONDecodeError:\n        if self.triplet:\n            return {\"positive\": None, \"negative\": None}\n        return {\"positive\": None}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration","title":"StructuredGeneration","text":"

Bases: Task

Generate structured content for a given instruction using an LLM.

StructuredGeneration is a pre-defined task that defines the instruction and the structured_output as the inputs, and generation as the output. This task is used to generate structured content based on the input instruction and following the schema provided within the structured_output column for each instruction. The model_name is also returned as part of the output in order to enhance it.

Attributes:

Name Type Description use_system_prompt bool

Whether to use the system prompt in the generation. Defaults to True, which means that if the column system_prompt is defined within the input batch, then the system_prompt will be used, otherwise, it will be ignored.

Input columns
  • instruction (str): The instruction to generate structured content from.
  • structured_output (Dict[str, Any]): The structured_output to generate structured content from. It should be a Python dictionary with the keys format and schema, where format should be one of json or regex, and the schema should be either the JSON schema or the regex pattern, respectively.
Output columns
  • generation (str): The generated text matching the provided schema, if possible.
  • model_name (str): The name of the model used to generate the text.
Categories
  • outlines
  • structured-generation

Examples:

Generate structured output from a JSON schema:

from distilabel.steps.tasks import StructuredGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nstructured_gen = StructuredGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    ),\n)\n\nstructured_gen.load()\n\nresult = next(\n    structured_gen.process(\n        [\n            {\n                \"instruction\": \"Create an RPG character\",\n                \"structured_output\": {\n                    \"format\": \"json\",\n                    \"schema\": {\n                        \"properties\": {\n                            \"name\": {\n                                \"title\": \"Name\",\n                                \"type\": \"string\"\n                            },\n                            \"description\": {\n                                \"title\": \"Description\",\n                                \"type\": \"string\"\n                            },\n                            \"role\": {\n                                \"title\": \"Role\",\n                                \"type\": \"string\"\n                            },\n                            \"weapon\": {\n                                \"title\": \"Weapon\",\n                                \"type\": \"string\"\n                            }\n                        },\n                        \"required\": [\n                            \"name\",\n                            \"description\",\n                            \"role\",\n                            \"weapon\"\n                        ],\n                        \"title\": \"Character\",\n                        \"type\": \"object\"\n                    }\n                },\n            }\n        ]\n    )\n)\n

Generate structured output from a regex pattern (only works with LLMs that support regex, i.e. the providers using outlines):

from distilabel.steps.tasks import StructuredGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nstructured_gen = StructuredGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    ),\n)\n\nstructured_gen.load()\n\nresult = next(\n    structured_gen.process(\n        [\n            {\n                \"instruction\": \"What's the weather like today in Seattle in Celsius degrees?\",\n                \"structured_output\": {\n                    \"format\": \"regex\",\n                    \"schema\": r\"(\\d{1,2})\u00b0C\"\n                },\n\n            }\n        ]\n    )\n)\n
Source code in src/distilabel/steps/tasks/structured_generation.py
class StructuredGeneration(Task):\n    \"\"\"Generate structured content for a given `instruction` using an `LLM`.\n\n    `StructuredGeneration` is a pre-defined task that defines the `instruction` and the `structured_output`\n    as the inputs, and `generation` as the output. This task is used to generate structured content based on\n    the input instruction and following the schema provided within the `structured_output` column per each\n    `instruction`. The `model_name` also returned as part of the output in order to enhance it.\n\n    Attributes:\n        use_system_prompt: Whether to use the system prompt in the generation. Defaults to `True`,\n            which means that if the column `system_prompt` is  defined within the input batch, then\n            the `system_prompt` will be used, otherwise, it will be ignored.\n\n    Input columns:\n        - instruction (`str`): The instruction to generate structured content from.\n        - structured_output (`Dict[str, Any]`): The structured_output to generate structured content from. 
It should be a\n            Python dictionary with the keys `format` and `schema`, where `format` should be one of `json` or\n            `regex`, and the `schema` should be either the JSON schema or the regex pattern, respectively.\n\n    Output columns:\n        - generation (`str`): The generated text matching the provided schema, if possible.\n        - model_name (`str`): The name of the model used to generate the text.\n\n    Categories:\n        - outlines\n        - structured-generation\n\n    Examples:\n        Generate structured output from a JSON schema:\n\n        ```python\n        from distilabel.steps.tasks import StructuredGeneration\n        from distilabel.models import InferenceEndpointsLLM\n\n        structured_gen = StructuredGeneration(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n            ),\n        )\n\n        structured_gen.load()\n\n        result = next(\n            structured_gen.process(\n                [\n                    {\n                        \"instruction\": \"Create an RPG character\",\n                        \"structured_output\": {\n                            \"format\": \"json\",\n                            \"schema\": {\n                                \"properties\": {\n                                    \"name\": {\n                                        \"title\": \"Name\",\n                                        \"type\": \"string\"\n                                    },\n                                    \"description\": {\n                                        \"title\": \"Description\",\n                                        \"type\": \"string\"\n                                    },\n                                    \"role\": {\n                                        \"title\": \"Role\",\n                                        \"type\": \"string\"\n    
                                },\n                                    \"weapon\": {\n                                        \"title\": \"Weapon\",\n                                        \"type\": \"string\"\n                                    }\n                                },\n                                \"required\": [\n                                    \"name\",\n                                    \"description\",\n                                    \"role\",\n                                    \"weapon\"\n                                ],\n                                \"title\": \"Character\",\n                                \"type\": \"object\"\n                            }\n                        },\n                    }\n                ]\n            )\n        )\n        ```\n\n        Generate structured output from a regex pattern (only works with LLMs that support regex, the providers using outlines):\n\n        ```python\n        from distilabel.steps.tasks import StructuredGeneration\n        from distilabel.models import InferenceEndpointsLLM\n\n        structured_gen = StructuredGeneration(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n            ),\n        )\n\n        structured_gen.load()\n\n        result = next(\n            structured_gen.process(\n                [\n                    {\n                        \"instruction\": \"What's the weather like today in Seattle in Celsius degrees?\",\n                        \"structured_output\": {\n                            \"format\": \"regex\",\n                            \"schema\": r\"(\\\\d{1,2})\u00b0C\"\n                        },\n\n                    }\n                ]\n            )\n        )\n        ```\n    \"\"\"\n\n    use_system_prompt: bool = False\n\n    @property\n    def inputs(self) -> List[str]:\n        
\"\"\"The input for the task are the `instruction` and the `structured_output`.\n        Optionally, if the `use_system_prompt` flag is set to True, then the\n        `system_prompt` will be used too.\"\"\"\n        columns = [\"instruction\", \"structured_output\"]\n        if self.use_system_prompt:\n            columns = [\"system_prompt\"] + columns\n        return columns\n\n    def format_input(self, input: Dict[str, Any]) -> StructuredInput:\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        if not isinstance(input[\"instruction\"], str):\n            raise DistilabelUserError(\n                f\"Input `instruction` must be a string. Got: {input['instruction']}.\",\n                page=\"components-gallery/tasks/structuredgeneration/\",\n            )\n\n        messages = [{\"role\": \"user\", \"content\": input[\"instruction\"]}]\n        if self.use_system_prompt:\n            if \"system_prompt\" in input:\n                messages.insert(\n                    0, {\"role\": \"system\", \"content\": input[\"system_prompt\"]}\n                )\n            else:\n                warnings.warn(\n                    \"`use_system_prompt` is set to `True`, but no `system_prompt` in input batch, so it will be ignored.\",\n                    UserWarning,\n                    stacklevel=2,\n                )\n\n        return (messages, input.get(\"structured_output\", None))  # type: ignore\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n        return [\"generation\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a dictionary with the `generation`. 
The `model_name`\n        will be automatically included within the `process` method of `Task`. Note that even\n        if the `structured_output` is defined to produce a JSON schema, this method will return the raw\n        output i.e. a string without any parsing.\"\"\"\n        return {\"generation\": output}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration.inputs","title":"inputs: List[str] property","text":"

The inputs for the task are the instruction and the structured_output. Optionally, if the use_system_prompt flag is set to True, then the system_prompt will be used too.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration.outputs","title":"outputs: List[str] property","text":"

The output for the task is the generation and the model_name.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/structured_generation.py
def format_input(self, input: Dict[str, Any]) -> StructuredInput:\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    if not isinstance(input[\"instruction\"], str):\n        raise DistilabelUserError(\n            f\"Input `instruction` must be a string. Got: {input['instruction']}.\",\n            page=\"components-gallery/tasks/structuredgeneration/\",\n        )\n\n    messages = [{\"role\": \"user\", \"content\": input[\"instruction\"]}]\n    if self.use_system_prompt:\n        if \"system_prompt\" in input:\n            messages.insert(\n                0, {\"role\": \"system\", \"content\": input[\"system_prompt\"]}\n            )\n        else:\n            warnings.warn(\n                \"`use_system_prompt` is set to `True`, but no `system_prompt` in input batch, so it will be ignored.\",\n                UserWarning,\n                stacklevel=2,\n            )\n\n    return (messages, input.get(\"structured_output\", None))  # type: ignore\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.StructuredGeneration.format_output","title":"format_output(output, input)","text":"

The output is formatted as a dictionary with the generation. The model_name will be automatically included within the process method of Task. Note that even if the structured_output is defined to produce a JSON schema, this method will return the raw output i.e. a string without any parsing.

Source code in src/distilabel/steps/tasks/structured_generation.py
def format_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n    will be automatically included within the `process` method of `Task`. Note that even\n    if the `structured_output` is defined to produce a JSON schema, this method will return the raw\n    output i.e. a string without any parsing.\"\"\"\n    return {\"generation\": output}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification","title":"TextClassification","text":"

Bases: Task

Classifies text into one or more categories or labels.

This task can be used for text classification problems, where the goal is to assign one or multiple labels to a given text. By default, it uses structured generation as per the reference paper, which can help to generate more concise labels. See section 4.1 in the reference.

Input columns
  • text (str): The reference text we want to obtain labels for.
Output columns
  • labels (Union[str, List[str]]): The label or list of labels for the text.
  • model_name (str): The name of the model used to generate the label/s.
Categories
  • text-classification
References
  • Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models

Attributes:

Name Type Description system_prompt Optional[str]

The system prompt passed to the model before the task starts. Contains a default message to make the model behave like a classifier specialist.

n PositiveInt

Number of labels to generate. If only 1 is required, it corresponds to a single-label classification problem; if >1, it will attempt to return the \"n\" labels most representative of the text. Defaults to 1.

context Optional[str]

Context to use when generating the labels. By default contains a generic message, but can be used to customize the context for the task.

examples Optional[List[str]]

List of few-shot examples to help the model understand the task.

available_labels Optional[Union[List[str], Dict[str, str]]]

List of available labels to choose from when classifying the text, or a dictionary with the labels and their descriptions.

default_label Optional[Union[str, List[str]]]

Default label to use when the text is ambiguous or lacks sufficient information for classification. Can be a list in case of multiple labels (n>1).

Examples:

Assigning a sentiment to a text:

from distilabel.steps.tasks import TextClassification\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n)\n\ntext_classification = TextClassification(\n    llm=llm,\n    context=\"You are an AI system specialized in assigning sentiment to movies.\",\n    available_labels=[\"positive\", \"negative\"],\n)\n\ntext_classification.load()\n\nresult = next(\n    text_classification.process(\n        [{\"text\": \"This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\"}]\n    )\n)\n# result\n# [{'text': 'This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.',\n# 'labels': 'positive',\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n    \"labels\": \"positive\"\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n#     'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n#     {'role': 'user',\n#     'content': '# Instruction\\nPlease classify the user query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nYou are an AI system specialized in assigning sentiment to movie the user queries.\\n## Labeling the user input\\nUse the available labels to classify the user query. 
Analyze the context of each label specifically:\\navailable_labels = [\\n    \"positive\",  # The text shows positive sentiment\\n    \"negative\",  # The text shows negative sentiment\\n]\\n\\n\\n## User Query\\n```\\nThis was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n    \"labels\": \"label\"\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n

Assigning predefined labels with specified descriptions:

from distilabel.steps.tasks import TextClassification\n\ntext_classification = TextClassification(\n    llm=llm,\n    n=1,\n    context=\"Determine the intent of the text.\",\n    available_labels={\n        \"complaint\": \"A statement expressing dissatisfaction or annoyance about a product, service, or experience. It's a negative expression of discontent, often with the intention of seeking a resolution or compensation.\",\n        \"inquiry\": \"A question or request for information about a product, service, or situation. It's a neutral or curious expression seeking clarification or details.\",\n        \"feedback\": \"A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\",\n        \"praise\": \"A statement expressing admiration, approval, or appreciation for a product, service, or experience. It's a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\"\n    },\n    query_title=\"Customer Query\",\n)\n\ntext_classification.load()\n\nresult = next(\n    text_classification.process(\n        [{\"text\": \"Can you tell me more about your return policy?\"}]\n    )\n)\n# result\n# [{'text': 'Can you tell me more about your return policy?',\n# 'labels': 'inquiry',\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n    \"labels\": \"inquiry\"\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n#     'content': 'You are an AI system specialized in generating labels to classify pieces of text. 
Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n#     {'role': 'user',\n#     'content': '# Instruction\\nPlease classify the customer query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nDetermine the intent of the text.\\n## Labeling the user input\\nUse the available labels to classify the user query. Analyze the context of each label specifically:\\navailable_labels = [\\n    \"complaint\",  # A statement expressing dissatisfaction or annoyance about a product, service, or experience. It\\'s a negative expression of discontent, often with the intention of seeking a resolution or compensation.\\n    \"inquiry\",  # A question or request for information about a product, service, or situation. It\\'s a neutral or curious expression seeking clarification or details.\\n    \"feedback\",  # A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\\n    \"praise\",  # A statement expressing admiration, approval, or appreciation for a product, service, or experience. It\\'s a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\\n]\\n\\n\\n## Customer Query\\n```\\nCan you tell me more about your return policy?\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n    \"labels\": \"label\"\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n

Free multi label classification without predefined labels:

from distilabel.steps.tasks import TextClassification\n\ntext_classification = TextClassification(\n    llm=llm,\n    n=3,\n    context=(\n        \"Describe the main themes, topics, or categories that could describe the \"\n        \"following type of persona.\"\n    ),\n    query_title=\"Example of Persona\",\n)\n\ntext_classification.load()\n\nresult = next(\n    text_classification.process(\n        [{\"text\": \"A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\"}]\n    )\n)\n# result\n# [{'text': 'A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.',\n# 'labels': ['Historical Researcher',\n# 'Cultural Specialist',\n# 'Ethnic Studies Expert'],\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n    \"labels\": [\"Historical Researcher\", \"Cultural Specialist\", \"Ethnic Studies Expert\"]\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n#     'content': 'You are an AI system specialized in generating labels to classify pieces of text. 
Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n#     {'role': 'user',\n#     'content': '# Instruction\\nPlease classify the example of persona by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide a list of 3 labels that best describe the text.\\nDescribe the main themes, topics, or categories that could describe the following type of persona.\\nUse clear, widely understood terms for labels.Avoid overly specific or obscure labels unless the text demands it.\\n\\n\\n## Example of Persona\\n```\\nA historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n    \"labels\": [\"label_0\", \"label_1\", \"label_2\"]\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
Source code in src/distilabel/steps/tasks/text_classification.py
class TextClassification(Task):\n    r\"\"\"Classifies text into one or more categories or labels.\n\n    This task can be used for text classification problems, where the goal is to assign\n    one or multiple labels to a given text.\n    It uses structured generation as per the reference paper by default,\n    it can help to generate more concise labels. See section 4.1 in the reference.\n\n    Input columns:\n        - text (`str`): The reference text we want to obtain labels for.\n\n    Output columns:\n        - labels (`Union[str, List[str]]`): The label or list of labels for the text.\n        - model_name (`str`): The name of the model used to generate the label/s.\n\n    Categories:\n        - text-classification\n\n    References:\n        - [`Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models`](https://arxiv.org/abs/2408.02442)\n\n    Attributes:\n        system_prompt: A prompt to display to the user before the task starts. Contains a default\n            message to make the model behave like a classifier specialist.\n        n: Number of labels to generate If only 1 is required, corresponds to a label\n            classification problem, if >1 it will intend return the \"n\" labels most representative\n            for the text. Defaults to 1.\n        context: Context to use when generating the labels. By default contains a generic message,\n            but can be used to customize the context for the task.\n        examples: List of examples to help the model understand the task, few shots.\n        available_labels: List of available labels to choose from when classifying the text, or\n            a dictionary with the labels and their descriptions.\n        default_label: Default label to use when the text is ambiguous or lacks sufficient information for\n            classification. 
Can be a list in case of multiple labels (n>1).\n\n    Examples:\n        Assigning a sentiment to a text:\n\n        ```python\n        from distilabel.steps.tasks import TextClassification\n        from distilabel.models import InferenceEndpointsLLM\n\n        llm = InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        )\n\n        text_classification = TextClassification(\n            llm=llm,\n            context=\"You are an AI system specialized in assigning sentiment to movies.\",\n            available_labels=[\"positive\", \"negative\"],\n        )\n\n        text_classification.load()\n\n        result = next(\n            text_classification.process(\n                [{\"text\": \"This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\"}]\n            )\n        )\n        # result\n        # [{'text': 'This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.',\n        # 'labels': 'positive',\n        # 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n    \"labels\": \"positive\"\\n}',\n        # 'raw_input_text_classification_0': [{'role': 'system',\n        #     'content': 'You are an AI system specialized in generating labels to classify pieces of text. 
Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n        #     {'role': 'user',\n        #     'content': '# Instruction\\nPlease classify the user query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nYou are an AI system specialized in assigning sentiment to movie the user queries.\\n## Labeling the user input\\nUse the available labels to classify the user query. Analyze the context of each label specifically:\\navailable_labels = [\\n    \"positive\",  # The text shows positive sentiment\\n    \"negative\",  # The text shows negative sentiment\\n]\\n\\n\\n## User Query\\n```\\nThis was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n    \"labels\": \"label\"\\n}\\n```'}]},\n        # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n\n        Assigning predefined labels with specified descriptions:\n\n        ```python\n        from distilabel.steps.tasks import TextClassification\n\n        text_classification = TextClassification(\n            llm=llm,\n            n=1,\n            context=\"Determine the intent of the text.\",\n            available_labels={\n                \"complaint\": \"A statement expressing dissatisfaction or annoyance about a product, service, or experience. It's a negative expression of discontent, often with the intention of seeking a resolution or compensation.\",\n                \"inquiry\": \"A question or request for information about a product, service, or situation. 
It's a neutral or curious expression seeking clarification or details.\",\n                \"feedback\": \"A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\",\n                \"praise\": \"A statement expressing admiration, approval, or appreciation for a product, service, or experience. It's a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\"\n            },\n            query_title=\"Customer Query\",\n        )\n\n        text_classification.load()\n\n        result = next(\n            text_classification.process(\n                [{\"text\": \"Can you tell me more about your return policy?\"}]\n            )\n        )\n        # result\n        # [{'text': 'Can you tell me more about your return policy?',\n        # 'labels': 'inquiry',\n        # 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n    \"labels\": \"inquiry\"\\n}',\n        # 'raw_input_text_classification_0': [{'role': 'system',\n        #     'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n        #     {'role': 'user',\n        #     'content': '# Instruction\\nPlease classify the customer query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nDetermine the intent of the text.\\n## Labeling the user input\\nUse the available labels to classify the user query. 
Analyze the context of each label specifically:\\navailable_labels = [\\n    \"complaint\",  # A statement expressing dissatisfaction or annoyance about a product, service, or experience. It\\'s a negative expression of discontent, often with the intention of seeking a resolution or compensation.\\n    \"inquiry\",  # A question or request for information about a product, service, or situation. It\\'s a neutral or curious expression seeking clarification or details.\\n    \"feedback\",  # A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\\n    \"praise\",  # A statement expressing admiration, approval, or appreciation for a product, service, or experience. It\\'s a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\\n]\\n\\n\\n## Customer Query\\n```\\nCan you tell me more about your return policy?\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n    \"labels\": \"label\"\\n}\\n```'}]},\n        # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n\n        Free multi label classification without predefined labels:\n\n        ```python\n        from distilabel.steps.tasks import TextClassification\n\n        text_classification = TextClassification(\n            llm=llm,\n            n=3,\n            context=(\n                \"Describe the main themes, topics, or categories that could describe the \"\n                \"following type of persona.\"\n            ),\n            query_title=\"Example of Persona\",\n        )\n\n        text_classification.load()\n\n        result = next(\n            text_classification.process(\n                [{\"text\": \"A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical 
impact of the Mexican presence in the United States.\"}]\n            )\n        )\n        # result\n        # [{'text': 'A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.',\n        # 'labels': ['Historical Researcher',\n        # 'Cultural Specialist',\n        # 'Ethnic Studies Expert'],\n        # 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n    \"labels\": [\"Historical Researcher\", \"Cultural Specialist\", \"Ethnic Studies Expert\"]\\n}',\n        # 'raw_input_text_classification_0': [{'role': 'system',\n        #     'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n        #     {'role': 'user',\n        #     'content': '# Instruction\\nPlease classify the example of persona by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide a list of 3 labels that best describe the text.\\nDescribe the main themes, topics, or categories that could describe the following type of persona.\\nUse clear, widely understood terms for labels.Avoid overly specific or obscure labels unless the text demands it.\\n\\n\\n## Example of Persona\\n```\\nA historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n    \"labels\": [\"label_0\", \"label_1\", \"label_2\"]\\n}\\n```'}]},\n        # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n    \"\"\"\n\n    system_prompt: 
Optional[str] = (\n        \"You are an AI system specialized in generating labels to classify pieces of text. \"\n        \"Your sole purpose is to analyze the given text and provide appropriate classification labels.\"\n    )\n    n: PositiveInt = Field(\n        default=1,\n        description=\"Number of labels to generate. Defaults to 1.\",\n    )\n    context: Optional[str] = Field(\n        default=\"Generate concise, relevant labels that accurately represent the text's main themes, topics, or categories.\",\n        description=\"Context to use when generating the labels.\",\n    )\n    examples: Optional[List[str]] = Field(\n        default=None,\n        description=\"List of examples to help the model understand the task, few shots.\",\n    )\n    available_labels: Optional[Union[List[str], Dict[str, str]]] = Field(\n        default=None,\n        description=(\n            \"List of available labels to choose from when classifying the text, or \"\n            \"a dictionary with the labels and their descriptions.\"\n        ),\n    )\n    default_label: Optional[Union[str, List[str]]] = Field(\n        default=\"Unclassified\",\n        description=(\n            \"Default label to use when the text is ambiguous or lacks sufficient information for \"\n            \"classification. 
Can be a list in case of multiple labels (n>1).\"\n        ),\n    )\n    query_title: str = Field(\n        default=\"User Query\",\n        description=\"Title of the query used to show the example/s to classify.\",\n    )\n    use_default_structured_output: bool = True\n\n    _template: Optional[Template] = PrivateAttr(default=None)\n\n    def load(self) -> None:\n        super().load()\n        self._template = Template(TEXT_CLASSIFICATION_TEMPLATE)\n        self._labels_format: str = (\n            '\"label\"'\n            if self.n == 1\n            else \"[\" + \", \".join([f'\"label_{i}\"' for i in range(self.n)]) + \"]\"\n        )\n        self._labels_message: str = (\n            \"Provide the label that best describes the text.\"\n            if self.n == 1\n            else f\"Provide a list of {self.n} labels that best describe the text.\"\n        )\n        self._available_labels_message: str = self._get_available_labels_message()\n        self._examples: str = self._get_examples_message()\n\n    def _get_available_labels_message(self) -> str:\n        \"\"\"Prepares the message to display depending on the available labels (if any),\n        and whether the labels have a specific context.\n        \"\"\"\n        if self.available_labels is None:\n            return (\n                \"Use clear, widely understood terms for labels.\"\n                \"Avoid overly specific or obscure labels unless the text demands it.\"\n            )\n\n        msg = (\n            \"## Labeling the user input\\n\"\n            \"Use the available labels to classify the user query{label_context}:\\n\"\n            \"available_labels = {available_labels}\"\n        )\n        if isinstance(self.available_labels, list):\n            specific_msg = (\n                \"[\\n\"\n                + indent(\n                    \"\".join([f'\"{label}\",\\n' for label in self.available_labels]),\n                    prefix=\" \" * 4,\n                )\n                + 
\"]\"\n            )\n            return msg.format(label_context=\"\", available_labels=specific_msg)\n\n        elif isinstance(self.available_labels, dict):\n            specific_msg = \"\"\n            for label, description in self.available_labels.items():\n                specific_msg += indent(\n                    f'\"{label}\",  # {description}' + \"\\n\", prefix=\" \" * 4\n                )\n\n            specific_msg = \"[\\n\" + specific_msg + \"]\"\n            return msg.format(\n                label_context=\". Analyze the context of each label specifically\",\n                available_labels=specific_msg,\n            )\n\n    def _get_examples_message(self) -> str:\n        \"\"\"Prepares the message to display depending on the examples provided.\"\"\"\n        if self.examples is None:\n            return \"\"\n\n        examples_msg = \"\\n\".join([f\"- {ex}\" for ex in self.examples])\n\n        return (\n            \"\\n## Examples\\n\"\n            \"Here are some examples to help you understand the task:\\n\"\n            f\"{examples_msg}\"\n        )\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task is the `instruction`.\"\"\"\n        return [\"text\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n        return [\"labels\", \"model_name\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        messages = [\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    context=f\"\\n{self.context}\",\n                    labels_message=self._labels_message,\n                    available_labels=self._available_labels_message,\n                
    examples=self._examples,\n                    default_label=self.default_label,\n                    labels_format=self._labels_format,\n                    query_title=self.query_title,\n                    text=input[\"text\"],\n                ),\n            },\n        ]\n        if self.system_prompt:\n            messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n        return messages\n\n    def format_output(\n        self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n        will be automatically included within the `process` method of `Task`.\"\"\"\n        return self._format_structured_output(output)\n\n    @override\n    def get_structured_output(self) -> Dict[str, Any]:\n        \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n        a dictionary with the output which can be directly parsed as a python dictionary.\n\n        Returns:\n            JSON Schema of the response to enforce.\n        \"\"\"\n        if self.n > 1:\n\n            class MultiLabelSchema(BaseModel):\n                labels: List[str]\n\n            return MultiLabelSchema.model_json_schema()\n\n        class SingleLabelSchema(BaseModel):\n            labels: str\n\n        return SingleLabelSchema.model_json_schema()\n\n    def _format_structured_output(\n        self, output: str\n    ) -> Dict[str, Union[str, List[str]]]:\n        \"\"\"Parses the structured response, which should correspond to a dictionary\n        with the `labels`, and either a string or a list of strings with the labels.\n\n        Args:\n            output: The output from the `LLM`.\n\n        Returns:\n            Formatted output.\n        \"\"\"\n        try:\n            return orjson.loads(output)\n        except orjson.JSONDecodeError:\n            if self.n > 1:\n                return 
{\"labels\": [None for _ in range(self.n)]}\n            return {\"labels\": None}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.inputs","title":"inputs: List[str] property","text":"

The input for the task is the text column.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.outputs","title":"outputs: List[str] property","text":"

The output for the task is the labels and the model_name.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification._get_available_labels_message","title":"_get_available_labels_message()","text":"

Prepares the message to display depending on the available labels (if any), and whether the labels have a specific context.

Source code in src/distilabel/steps/tasks/text_classification.py
def _get_available_labels_message(self) -> str:\n    \"\"\"Prepares the message to display depending on the available labels (if any),\n    and whether the labels have a specific context.\n    \"\"\"\n    if self.available_labels is None:\n        return (\n            \"Use clear, widely understood terms for labels.\"\n            \"Avoid overly specific or obscure labels unless the text demands it.\"\n        )\n\n    msg = (\n        \"## Labeling the user input\\n\"\n        \"Use the available labels to classify the user query{label_context}:\\n\"\n        \"available_labels = {available_labels}\"\n    )\n    if isinstance(self.available_labels, list):\n        specific_msg = (\n            \"[\\n\"\n            + indent(\n                \"\".join([f'\"{label}\",\\n' for label in self.available_labels]),\n                prefix=\" \" * 4,\n            )\n            + \"]\"\n        )\n        return msg.format(label_context=\"\", available_labels=specific_msg)\n\n    elif isinstance(self.available_labels, dict):\n        specific_msg = \"\"\n        for label, description in self.available_labels.items():\n            specific_msg += indent(\n                f'\"{label}\",  # {description}' + \"\\n\", prefix=\" \" * 4\n            )\n\n        specific_msg = \"[\\n\" + specific_msg + \"]\"\n        return msg.format(\n            label_context=\". Analyze the context of each label specifically\",\n            available_labels=specific_msg,\n        )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification._get_examples_message","title":"_get_examples_message()","text":"

Prepares the message to display depending on the examples provided.

Source code in src/distilabel/steps/tasks/text_classification.py
def _get_examples_message(self) -> str:\n    \"\"\"Prepares the message to display depending on the examples provided.\"\"\"\n    if self.examples is None:\n        return \"\"\n\n    examples_msg = \"\\n\".join([f\"- {ex}\" for ex in self.examples])\n\n    return (\n        \"\\n## Examples\\n\"\n        \"Here are some examples to help you understand the task:\\n\"\n        f\"{examples_msg}\"\n    )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType: the text is rendered into the classification prompt template and sent as a single user message, preceded by the system prompt when one is set.

Source code in src/distilabel/steps/tasks/text_classification.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    messages = [\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(  # type: ignore\n                context=f\"\\n{self.context}\",\n                labels_message=self._labels_message,\n                available_labels=self._available_labels_message,\n                examples=self._examples,\n                default_label=self.default_label,\n                labels_format=self._labels_format,\n                query_title=self.query_title,\n                text=input[\"text\"],\n            ),\n        },\n    ]\n    if self.system_prompt:\n        messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n    return messages\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.format_output","title":"format_output(output, input=None)","text":"

The output is formatted as a dictionary with the labels parsed from the structured response. The model_name will be automatically included within the process method of Task.

Source code in src/distilabel/steps/tasks/text_classification.py
def format_output(\n    self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n    will be automatically included within the `process` method of `Task`.\"\"\"\n    return self._format_structured_output(output)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification.get_structured_output","title":"get_structured_output()","text":"

Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary.

Returns:

Type Description Dict[str, Any]

JSON Schema of the response to enforce.

Source code in src/distilabel/steps/tasks/text_classification.py
@override\ndef get_structured_output(self) -> Dict[str, Any]:\n    \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n    a dictionary with the output which can be directly parsed as a python dictionary.\n\n    Returns:\n        JSON Schema of the response to enforce.\n    \"\"\"\n    if self.n > 1:\n\n        class MultiLabelSchema(BaseModel):\n            labels: List[str]\n\n        return MultiLabelSchema.model_json_schema()\n\n    class SingleLabelSchema(BaseModel):\n        labels: str\n\n    return SingleLabelSchema.model_json_schema()\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextClassification._format_structured_output","title":"_format_structured_output(output)","text":"

Parses the structured response, which should be a dictionary whose labels entry holds either a single string or a list of strings.

Parameters:

Name Type Description Default output str

The output from the LLM.

required

Returns:

Type Description Dict[str, Union[str, List[str]]]

Formatted output.

Source code in src/distilabel/steps/tasks/text_classification.py
def _format_structured_output(\n    self, output: str\n) -> Dict[str, Union[str, List[str]]]:\n    \"\"\"Parses the structured response, which should correspond to a dictionary\n    with the `labels`, and either a string or a list of strings with the labels.\n\n    Args:\n        output: The output from the `LLM`.\n\n    Returns:\n        Formatted output.\n    \"\"\"\n    try:\n        return orjson.loads(output)\n    except orjson.JSONDecodeError:\n        if self.n > 1:\n            return {\"labels\": [None for _ in range(self.n)]}\n        return {\"labels\": None}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration","title":"ChatGeneration","text":"

Bases: Task

Generates text based on a conversation.

ChatGeneration is a pre-defined task that defines the messages as the input and generation as the output. This task is used to generate text based on a conversation. The model_name is also returned as part of the output in order to enhance it.

Input columns
  • messages (List[Dict[Literal[\"role\", \"content\"], str]]): The messages to generate the follow up completion from.
Output columns
  • generation (str): The generated text from the assistant.
  • model_name (str): The model name used to generate the text.
Categories
  • chat-generation
Icon

:material-chat:

Examples:

Generate text from a conversation in OpenAI chat format:

from distilabel.steps.tasks import ChatGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nchat = ChatGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    )\n)\n\nchat.load()\n\nresult = next(\n    chat.process(\n        [\n            {\n                \"messages\": [\n                    {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n                ]\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'messages': [{'role': 'user', 'content': 'How much is 2+2?'}],\n#         'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n#         'generation': '4',\n#     }\n# ]\n
Source code in src/distilabel/steps/tasks/text_generation.py
class ChatGeneration(Task):\n    \"\"\"Generates text based on a conversation.\n\n    `ChatGeneration` is a pre-defined task that defines the `messages` as the input\n    and `generation` as the output. This task is used to generate text based on a conversation.\n    The `model_name` is also returned as part of the output in order to enhance it.\n\n    Input columns:\n        - messages (`List[Dict[Literal[\"role\", \"content\"], str]]`): The messages to generate the\n            follow up completion from.\n\n    Output columns:\n        - generation (`str`): The generated text from the assistant.\n        - model_name (`str`): The model name used to generate the text.\n\n    Categories:\n        - chat-generation\n\n    Icon:\n        `:material-chat:`\n\n    Examples:\n        Generate text from a conversation in OpenAI chat format:\n\n        ```python\n        from distilabel.steps.tasks import ChatGeneration\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        chat = ChatGeneration(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            )\n        )\n\n        chat.load()\n\n        result = next(\n            chat.process(\n                [\n                    {\n                        \"messages\": [\n                            {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n                        ]\n                    }\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'messages': [{'role': 'user', 'content': 'How much is 2+2?'}],\n        #         'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n        #         'generation': '4',\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task are the `messages`.\"\"\"\n        return 
[\"messages\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType` assuming that the messages provided\n        are already formatted that way i.e. following the OpenAI chat format.\"\"\"\n\n        if not is_openai_format(input[\"messages\"]):\n            raise DistilabelUserError(\n                \"Input `messages` must be an OpenAI chat-like format conversation. \"\n                f\"Got: {input['messages']}. Please check: 'https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models'.\",\n                page=\"components-gallery/tasks/chatgeneration/\",\n            )\n\n        if input[\"messages\"][-1][\"role\"] != \"user\":\n            raise DistilabelUserError(\n                \"The last message must be from the user. Please check: \"\n                \"'https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models'.\",\n                page=\"components-gallery/tasks/chatgeneration/\",\n            )\n\n        return input[\"messages\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n        return [\"generation\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n        will be automatically included within the `process` method of `Task`.\"\"\"\n        return {\"generation\": output}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration.inputs","title":"inputs: List[str] property","text":"

The inputs for the task are the messages.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration.outputs","title":"outputs: List[str] property","text":"

The output for the task is the generation and the model_name.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the messages provided are already formatted that way i.e. following the OpenAI chat format.

Source code in src/distilabel/steps/tasks/text_generation.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType` assuming that the messages provided\n    are already formatted that way i.e. following the OpenAI chat format.\"\"\"\n\n    if not is_openai_format(input[\"messages\"]):\n        raise DistilabelUserError(\n            \"Input `messages` must be an OpenAI chat-like format conversation. \"\n            f\"Got: {input['messages']}. Please check: 'https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models'.\",\n            page=\"components-gallery/tasks/chatgeneration/\",\n        )\n\n    if input[\"messages\"][-1][\"role\"] != \"user\":\n        raise DistilabelUserError(\n            \"The last message must be from the user. Please check: \"\n            \"'https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models'.\",\n            page=\"components-gallery/tasks/chatgeneration/\",\n        )\n\n    return input[\"messages\"]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.ChatGeneration.format_output","title":"format_output(output, input=None)","text":"

The output is formatted as a dictionary with the generation. The model_name will be automatically included within the process method of Task.

Source code in src/distilabel/steps/tasks/text_generation.py
def format_output(\n    self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n    will be automatically included within the `process` method of `Task`.\"\"\"\n    return {\"generation\": output}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration","title":"TextGeneration","text":"

Bases: Task

Text generation with an LLM given a prompt.

TextGeneration is a pre-defined task that allows passing a custom prompt using the Jinja2 syntax. By default, an instruction is expected in the inputs, but by using the template and columns attributes one can define a custom prompt and the columns expected from the text. This task should be good enough for tasks that don't need post-processing of the responses generated by the LLM.

Attributes:

Name Type Description system_prompt Union[str, None]

The system prompt to use in the generation. If not provided, then it will check if the input row has a column named system_prompt and use it. If not, then no system prompt will be used. Defaults to None.

template str

The template to use for the generation. It must follow the Jinja2 template syntax. If not provided, it will assume the text passed is an instruction and construct the appropriate template.

columns Union[str, List[str]]

A string with the column, or a list with columns expected in the template. Take a look at the examples for more information. Defaults to instruction.

use_system_prompt bool

DEPRECATED. To be removed in 1.5.0. Whether to use the system prompt in the generation. Defaults to True, which means that if the column system_prompt is defined within the input batch, then the system_prompt will be used, otherwise, it will be ignored.

Input columns
  • dynamic (determined by columns attribute): By default will be set to instruction. The columns can point to either a str or a List[str] to be used in the template.
Output columns
  • generation (str): The generated text.
  • model_name (str): The name of the model used to generate the text.
Categories
  • text-generation
References
  • Jinja2 Template Designer Documentation

Examples:

Generate text from an instruction:

from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\ntext_gen = TextGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    )\n)\n\ntext_gen.load()\n\nresult = next(\n    text_gen.process(\n        [{\"instruction\": \"your instruction\"}]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'your instruction',\n#         'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n#         'generation': 'generation',\n#     }\n# ]\n

Use a custom template to generate text:

from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nCUSTOM_TEMPLATE = '''Document:\n{{ document }}\n\nQuestion: {{ question }}\n\nPlease provide a clear and concise answer to the question based on the information in the document and your general knowledge:\n'''.rstrip()\n\ntext_gen = TextGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    system_prompt=\"You are a helpful AI assistant. Your task is to answer the following question based on the provided document. If the answer is not explicitly stated in the document, use your knowledge to provide the most relevant and accurate answer possible. If you cannot answer the question based on the given information, state that clearly.\",\n    template=CUSTOM_TEMPLATE,\n    columns=[\"document\", \"question\"],\n)\n\ntext_gen.load()\n\nresult = next(\n    text_gen.process(\n        [\n            {\n                \"document\": \"The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.\",\n                \"question\": \"What is the main threat to the Great Barrier Reef mentioned in the document?\"\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'document': 'The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. 
However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.',\n#         'question': 'What is the main threat to the Great Barrier Reef mentioned in the document?',\n#         'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n#         'generation': 'According to the document, the main threat to the Great Barrier Reef is climate change, specifically rising sea temperatures causing coral bleaching events.',\n#     }\n# ]\n

Few shot learning with different system prompts:

from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nCUSTOM_TEMPLATE = '''Generate a clear, single-sentence instruction based on the following examples:\n\n{% for example in examples %}\nExample {{ loop.index }}:\nInstruction: {{ example }}\n\n{% endfor %}\nNow, generate a new instruction in a similar style:\n'''.rstrip()\n\ntext_gen = TextGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    template=CUSTOM_TEMPLATE,\n    columns=\"examples\",\n)\n\ntext_gen.load()\n\nresult = next(\n    text_gen.process(\n        [\n            {\n                \"examples\": [\"This is an example\", \"Another relevant example\"],\n                \"system_prompt\": \"You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.\"\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'examples': ['This is an example', 'Another relevant example'],\n#         'system_prompt': 'You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.',\n#         'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n#         'generation': 'Disable the firewall on the router',\n#     }\n# ]\n
Source code in src/distilabel/steps/tasks/text_generation.py
class TextGeneration(Task):\n    \"\"\"Text generation with an `LLM` given a prompt.\n\n    `TextGeneration` is a pre-defined task that allows passing a custom prompt using the\n    Jinja2 syntax. By default, a `instruction` is expected in the inputs, but the using\n    `template` and `columns` attributes one can define a custom prompt and columns expected\n    from the text. This task should be good enough for tasks that don't need post-processing\n    of the responses generated by the LLM.\n\n    Attributes:\n        system_prompt: The system prompt to use in the generation. If not provided, then\n            it will check if the input row has a column named `system_prompt` and use it.\n            If not, then no system prompt will be used. Defaults to `None`.\n        template: The template to use for the generation. It must follow the Jinja2 template\n            syntax. If not provided, it will assume the text passed is an instruction and\n            construct the appropriate template.\n        columns: A string with the column, or a list with columns expected in the template.\n            Take a look at the examples for more information. Defaults to `instruction`.\n        use_system_prompt: DEPRECATED. To be removed in 1.5.0. Whether to use the system\n            prompt in the generation. 
Defaults to `True`, which means that if the column\n            `system_prompt` is defined within the input batch, then the `system_prompt`\n            will be used, otherwise, it will be ignored.\n\n    Input columns:\n        - dynamic (determined by `columns` attribute): By default will be set to `instruction`.\n            The columns can point both to a `str` or a `List[str]` to be used in the template.\n\n    Output columns:\n        - generation (`str`): The generated text.\n        - model_name (`str`): The name of the model used to generate the text.\n\n    Categories:\n        - text-generation\n\n    References:\n        - [Jinja2 Template Designer Documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/)\n\n    Examples:\n        Generate text from an instruction:\n\n        ```python\n        from distilabel.steps.tasks import TextGeneration\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        text_gen = TextGeneration(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            )\n        )\n\n        text_gen.load()\n\n        result = next(\n            text_gen.process(\n                [{\"instruction\": \"your instruction\"}]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'instruction': 'your instruction',\n        #         'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n        #         'generation': 'generation',\n        #     }\n        # ]\n        ```\n\n        Use a custom template to generate text:\n\n        ```python\n        from distilabel.steps.tasks import TextGeneration\n        from distilabel.models import InferenceEndpointsLLM\n\n        CUSTOM_TEMPLATE = '''Document:\n        {{ document }}\n\n        Question: {{ question }}\n\n        Please provide a clear and concise answer to the question based on the 
information in the document and your general knowledge:\n        '''.rstrip()\n\n        text_gen = TextGeneration(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            system_prompt=\"You are a helpful AI assistant. Your task is to answer the following question based on the provided document. If the answer is not explicitly stated in the document, use your knowledge to provide the most relevant and accurate answer possible. If you cannot answer the question based on the given information, state that clearly.\",\n            template=CUSTOM_TEMPLATE,\n            columns=[\"document\", \"question\"],\n        )\n\n        text_gen.load()\n\n        result = next(\n            text_gen.process(\n                [\n                    {\n                        \"document\": \"The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.\",\n                        \"question\": \"What is the main threat to the Great Barrier Reef mentioned in the document?\"\n                    }\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'document': 'The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. 
However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.',\n        #         'question': 'What is the main threat to the Great Barrier Reef mentioned in the document?',\n        #         'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n        #         'generation': 'According to the document, the main threat to the Great Barrier Reef is climate change, specifically rising sea temperatures causing coral bleaching events.',\n        #     }\n        # ]\n        ```\n\n        Few shot learning with different system prompts:\n\n        ```python\n        from distilabel.steps.tasks import TextGeneration\n        from distilabel.models import InferenceEndpointsLLM\n\n        CUSTOM_TEMPLATE = '''Generate a clear, single-sentence instruction based on the following examples:\n\n        {% for example in examples %}\n        Example {{ loop.index }}:\n        Instruction: {{ example }}\n\n        {% endfor %}\n        Now, generate a new instruction in a similar style:\n        '''.rstrip()\n\n        text_gen = TextGeneration(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            template=CUSTOM_TEMPLATE,\n            columns=\"examples\",\n        )\n\n        text_gen.load()\n\n        result = next(\n            text_gen.process(\n                [\n                    {\n                        \"examples\": [\"This is an example\", \"Another relevant example\"],\n                        \"system_prompt\": \"You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.\"\n                    }\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'examples': ['This is an example', 'Another relevant example'],\n        #         'system_prompt': 'You are 
an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.',\n        #         'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n        #         'generation': 'Disable the firewall on the router',\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    system_prompt: Union[str, None] = None\n    use_system_prompt: bool = Field(default=True, deprecated=True)\n    template: str = Field(\n        default=\"{{ instruction }}\",\n        description=(\n            \"This is a template or prompt to use for the generation. \"\n            \"If not provided, it is assumed a `instruction` is placed in the inputs, \"\n            \"to be used as is.\"\n        ),\n    )\n    columns: Union[str, List[str]] = Field(\n        default=\"instruction\",\n        description=(\n            \"Custom column or list of columns to include in the input. \"\n            \"If a `template` is provided which needs custom column names, \"\n            \"then they should be provided here. 
By default it will use `instruction`.\"\n        ),\n    )\n\n    _can_be_used_with_offline_batch_generation = True\n    _template: Optional[\"Template\"] = PrivateAttr(default=...)\n\n    def model_post_init(self, __context: Any) -> None:\n        self.columns = [self.columns] if isinstance(self.columns, str) else self.columns\n        super().model_post_init(__context)\n\n    def load(self) -> None:\n        super().load()\n\n        def check_column_in_template(column, template):\n            pattern = (\n                r\"(?:{%.*?\\b\"\n                + re.escape(column)\n                + r\"\\b.*?%}|{{\\s*\"\n                + re.escape(column)\n                + r\"\\s*}})\"\n            )\n            if not re.search(pattern, template):\n                raise DistilabelUserError(\n                    (\n                        f\"You required column name '{column}', but is not present in the template, \"\n                        \"ensure the 'columns' match with the 'template' to avoid errors.\"\n                    ),\n                    page=\"components-gallery/tasks/textgeneration/\",\n                )\n\n        for column in self.columns:\n            check_column_in_template(column, self.template)\n\n        self._template = Template(self.template)\n\n    def unload(self) -> None:\n        super().unload()\n        self._template = None\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        \"\"\"The input for the task is the `instruction` by default, or the `columns` given as input.\"\"\"\n        columns = {column: True for column in self.columns}\n        columns[\"system_prompt\"] = False\n        return columns\n\n    def _prepare_message_content(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"Prepares the content for the template and returns the formatted messages.\"\"\"\n        fields = {column: input[column] for column in self.columns}\n        return [{\"role\": \"user\", \"content\": 
self._template.render(**fields)}]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        # Handle the previous expected errors, in case of custom columns there's more freedom\n        # and we cannot check it so easily.\n        if self.columns == [\"instruction\"]:\n            if is_openai_format(input[\"instruction\"]):\n                raise DistilabelUserError(\n                    \"Providing `instruction` formatted as an OpenAI chat / conversation is\"\n                    \" deprecated, you should use `ChatGeneration` with `messages` as input instead.\",\n                    page=\"components-gallery/tasks/textgeneration/\",\n                )\n\n            if not isinstance(input[\"instruction\"], str):\n                raise DistilabelUserError(\n                    f\"Input `instruction` must be a string. Got: {input['instruction']}.\",\n                    page=\"components-gallery/tasks/textgeneration/\",\n                )\n\n        messages = self._prepare_message_content(input)\n\n        row_system_prompt = input.get(\"system_prompt\")\n        if row_system_prompt:\n            messages.insert(0, {\"role\": \"system\", \"content\": row_system_prompt})\n\n        if self.system_prompt and not row_system_prompt:\n            messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n\n        return messages  # type: ignore\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n        return [\"generation\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a dictionary with the `generation`. 
The `model_name`\n        will be automatically included within the `process` method of `Task`.\"\"\"\n        return {\"generation\": output}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration.inputs","title":"inputs: StepColumns property","text":"

The input for the task is the instruction by default, or the columns given as input.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration.outputs","title":"outputs: List[str] property","text":"

The output for the task is the generation and the model_name.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration._prepare_message_content","title":"_prepare_message_content(input)","text":"

Prepares the content for the template and returns the formatted messages.

Source code in src/distilabel/steps/tasks/text_generation.py
def _prepare_message_content(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"Prepares the content for the template and returns the formatted messages.\"\"\"\n    fields = {column: input[column] for column in self.columns}\n    return [{\"role\": \"user\", \"content\": self._template.render(**fields)}]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/text_generation.py
def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    # Handle the previous expected errors, in case of custom columns there's more freedom\n    # and we cannot check it so easily.\n    if self.columns == [\"instruction\"]:\n        if is_openai_format(input[\"instruction\"]):\n            raise DistilabelUserError(\n                \"Providing `instruction` formatted as an OpenAI chat / conversation is\"\n                \" deprecated, you should use `ChatGeneration` with `messages` as input instead.\",\n                page=\"components-gallery/tasks/textgeneration/\",\n            )\n\n        if not isinstance(input[\"instruction\"], str):\n            raise DistilabelUserError(\n                f\"Input `instruction` must be a string. Got: {input['instruction']}.\",\n                page=\"components-gallery/tasks/textgeneration/\",\n            )\n\n    messages = self._prepare_message_content(input)\n\n    row_system_prompt = input.get(\"system_prompt\")\n    if row_system_prompt:\n        messages.insert(0, {\"role\": \"system\", \"content\": row_system_prompt})\n\n    if self.system_prompt and not row_system_prompt:\n        messages.insert(0, {\"role\": \"system\", \"content\": self.system_prompt})\n\n    return messages  # type: ignore\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.TextGeneration.format_output","title":"format_output(output, input=None)","text":"

The output is formatted as a dictionary with the generation. The model_name will be automatically included within the process method of Task.

Source code in src/distilabel/steps/tasks/text_generation.py
def format_output(\n    self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a dictionary with the `generation`. The `model_name`\n    will be automatically included within the `process` method of `Task`.\"\"\"\n    return {\"generation\": output}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback","title":"UltraFeedback","text":"

Bases: Task

Rank generations focusing on different aspects using an LLM.

UltraFeedback: Boosting Language Models with High-quality Feedback.

Attributes:

Name Type Description aspect Literal['helpfulness', 'honesty', 'instruction-following', 'truthfulness', 'overall-rating']

The aspect to perform with the UltraFeedback model. The available aspects are: - helpfulness: Evaluate text outputs based on helpfulness. - honesty: Evaluate text outputs based on honesty. - instruction-following: Evaluate text outputs based on given instructions. - truthfulness: Evaluate text outputs based on truthfulness. Additionally, a custom aspect has been defined by Argilla, so as to evaluate the overall assessment of the text outputs within a single prompt. The custom aspect is: - overall-rating: Evaluate text outputs based on an overall assessment. Defaults to \"overall-rating\".

Input columns
  • instruction (str): The reference instruction to evaluate the text outputs.
  • generations (List[str]): The text outputs to evaluate for the given instruction.
Output columns
  • ratings (List[float]): The ratings for each of the provided text outputs.
  • rationales (List[str]): The rationales for each of the provided text outputs.
  • model_name (str): The name of the model used to generate the ratings and rationales.
Categories
  • preference
References
  • UltraFeedback: Boosting Language Models with High-quality Feedback
  • UltraFeedback - GitHub Repository

Examples:

Rate generations from different LLMs based on the selected aspect:

from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    use_default_structured_output=False\n)\n\nultrafeedback.load()\n\nresult = next(\n    ultrafeedback.process(\n        [\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"generations\": [\"4\", \"and a car\"],\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'How much is 2+2?',\n#         'generations': ['4', 'and a car'],\n#         'ratings': [1, 2],\n#         'rationales': ['explanation for 4', 'explanation for and a car'],\n#         'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n#     }\n# ]\n

Rate generations from different LLMs based on the honesty, using the default structured output:

from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    aspect=\"honesty\"\n)\n\nultrafeedback.load()\n\nresult = next(\n    ultrafeedback.process(\n        [\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"generations\": [\"4\", \"and a car\"],\n            }\n        ]\n    )\n)\n# result\n# [{'instruction': 'How much is 2+2?',\n# 'generations': ['4', 'and a car'],\n# 'ratings': [5, 1],\n# 'rationales': ['The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.',\n# \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"],\n# 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{\"ratings\": [\\n    5,\\n    1\\n] \\n\\n,\"rationales\": [\\n    \"The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.\",\\n    \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"\\n] }'},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n

Rate generations from different LLMs based on the helpfulness, using the default structured output:

from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        generation_kwargs={\"max_new_tokens\": 512},\n    ),\n    aspect=\"helpfulness\"\n)\n\nultrafeedback.load()\n\nresult = next(\n    ultrafeedback.process(\n        [\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"generations\": [\"4\", \"and a car\"],\n            }\n        ]\n    )\n)\n# result\n# [{'instruction': 'How much is 2+2?',\n#   'generations': ['4', 'and a car'],\n#   'ratings': [1, 5],\n#   'rationales': ['Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.',\n#    'Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question.'],\n#   'rationales_for_rating': ['Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.',\n#    'Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.'],\n#   'types': [1, 3, 1],\n#   'distilabel_metadata': {'raw_output_ultra_feedback_0': '{ \\n  \"ratings\": [\\n    1,\\n    5\\n  ]\\n ,\\n  \"rationales\": [\\n    \"Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.\",\\n    \"Text 2 is neither clear nor relevant to the task. 
It does not provide any useful information and seems unrelated to the question.\"\\n  ]\\n ,\\n  \"rationales_for_rating\": [\\n    \"Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.\",\\n    \"Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.\"\\n  ]\\n ,\\n  \"types\": [\\n    1, 3,\\n    1\\n  ]\\n  }'},\n#   'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
Citations
@misc{cui2024ultrafeedbackboostinglanguagemodels,\n    title={UltraFeedback: Boosting Language Models with Scaled AI Feedback},\n    author={Ganqu Cui and Lifan Yuan and Ning Ding and Guanming Yao and Bingxiang He and Wei Zhu and Yuan Ni and Guotong Xie and Ruobing Xie and Yankai Lin and Zhiyuan Liu and Maosong Sun},\n    year={2024},\n    eprint={2310.01377},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL},\n    url={https://arxiv.org/abs/2310.01377},\n}\n
Source code in src/distilabel/steps/tasks/ultrafeedback.py
class UltraFeedback(Task):\n    \"\"\"Rank generations focusing on different aspects using an `LLM`.\n\n    UltraFeedback: Boosting Language Models with High-quality Feedback.\n\n    Attributes:\n        aspect: The aspect to perform with the `UltraFeedback` model. The available aspects are:\n            - `helpfulness`: Evaluate text outputs based on helpfulness.\n            - `honesty`: Evaluate text outputs based on honesty.\n            - `instruction-following`: Evaluate text outputs based on given instructions.\n            - `truthfulness`: Evaluate text outputs based on truthfulness.\n            Additionally, a custom aspect has been defined by Argilla, so as to evaluate the overall\n            assessment of the text outputs within a single prompt. The custom aspect is:\n            - `overall-rating`: Evaluate text outputs based on an overall assessment.\n            Defaults to `\"overall-rating\"`.\n\n    Input columns:\n        - instruction (`str`): The reference instruction to evaluate the text outputs.\n        - generations (`List[str]`): The text outputs to evaluate for the given instruction.\n\n    Output columns:\n        - ratings (`List[float]`): The ratings for each of the provided text outputs.\n        - rationales (`List[str]`): The rationales for each of the provided text outputs.\n        - model_name (`str`): The name of the model used to generate the ratings and rationales.\n\n    Categories:\n        - preference\n\n    References:\n        - [`UltraFeedback: Boosting Language Models with High-quality Feedback`](https://arxiv.org/abs/2310.01377)\n        - [`UltraFeedback - GitHub Repository`](https://github.com/OpenBMB/UltraFeedback)\n\n    Examples:\n        Rate generations from different LLMs based on the selected aspect:\n\n        ```python\n        from distilabel.steps.tasks import UltraFeedback\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n     
   ultrafeedback = UltraFeedback(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n            ),\n            use_default_structured_output=False\n        )\n\n        ultrafeedback.load()\n\n        result = next(\n            ultrafeedback.process(\n                [\n                    {\n                        \"instruction\": \"How much is 2+2?\",\n                        \"generations\": [\"4\", \"and a car\"],\n                    }\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'instruction': 'How much is 2+2?',\n        #         'generations': ['4', 'and a car'],\n        #         'ratings': [1, 2],\n        #         'rationales': ['explanation for 4', 'explanation for and a car'],\n        #         'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n        #     }\n        # ]\n        ```\n\n        Rate generations from different LLMs based on the honesty, using the default structured output:\n\n        ```python\n        from distilabel.steps.tasks import UltraFeedback\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        ultrafeedback = UltraFeedback(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            ),\n            aspect=\"honesty\"\n        )\n\n        ultrafeedback.load()\n\n        result = next(\n            ultrafeedback.process(\n                [\n                    {\n                        \"instruction\": \"How much is 2+2?\",\n                        \"generations\": [\"4\", \"and a car\"],\n                    }\n                ]\n            )\n        )\n        # result\n        # [{'instruction': 'How much is 2+2?',\n        # 'generations': ['4', 'and a car'],\n        # 'ratings': [5, 1],\n        # 'rationales': ['The response is correct and 
confident, as it directly answers the question without expressing any uncertainty or doubt.',\n        # \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"],\n        # 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{\"ratings\": [\\\\n    5,\\\\n    1\\\\n] \\\\n\\\\n,\"rationales\": [\\\\n    \"The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.\",\\\\n    \"The response is confidently incorrect, as it provides unrelated information (\\'a car\\') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"\\\\n] }'},\n        # 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n\n        Rate generations from different LLMs based on the helpfulness, using the default structured output:\n\n        ```python\n        from distilabel.steps.tasks import UltraFeedback\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        ultrafeedback = UltraFeedback(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n                generation_kwargs={\"max_new_tokens\": 512},\n            ),\n            aspect=\"helpfulness\"\n        )\n\n        ultrafeedback.load()\n\n        result = next(\n            ultrafeedback.process(\n                [\n                    {\n                        \"instruction\": \"How much is 2+2?\",\n                        \"generations\": [\"4\", \"and a car\"],\n                    }\n                ]\n            )\n        )\n        # result\n        # [{'instruction': 'How much is 2+2?',\n        #   'generations': ['4', 'and a car'],\n        #   'ratings': [1, 5],\n        #   'rationales': ['Text 1 
is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.',\n        #    'Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question.'],\n        #   'rationales_for_rating': ['Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.',\n        #    'Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.'],\n        #   'types': [1, 3, 1],\n        #   'distilabel_metadata': {'raw_output_ultra_feedback_0': '{ \\\\n  \"ratings\": [\\\\n    1,\\\\n    5\\\\n  ]\\\\n ,\\\\n  \"rationales\": [\\\\n    \"Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.\",\\\\n    \"Text 2 is neither clear nor relevant to the task. 
It does not provide any useful information and seems unrelated to the question.\"\\\\n  ]\\\\n ,\\\\n  \"rationales_for_rating\": [\\\\n    \"Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.\",\\\\n    \"Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.\"\\\\n  ]\\\\n ,\\\\n  \"types\": [\\\\n    1, 3,\\\\n    1\\\\n  ]\\\\n  }'},\n        #   'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n        ```\n\n    Citations:\n        ```\n        @misc{cui2024ultrafeedbackboostinglanguagemodels,\n            title={UltraFeedback: Boosting Language Models with Scaled AI Feedback},\n            author={Ganqu Cui and Lifan Yuan and Ning Ding and Guanming Yao and Bingxiang He and Wei Zhu and Yuan Ni and Guotong Xie and Ruobing Xie and Yankai Lin and Zhiyuan Liu and Maosong Sun},\n            year={2024},\n            eprint={2310.01377},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2310.01377},\n        }\n        ```\n    \"\"\"\n\n    aspect: Literal[\n        \"helpfulness\",\n        \"honesty\",\n        \"instruction-following\",\n        \"truthfulness\",\n        # Custom aspects\n        \"overall-rating\",\n    ] = \"overall-rating\"\n\n    _system_prompt: str = PrivateAttr(\n        default=(\n            \"Your role is to evaluate text quality based on given criteria.\\n\"\n            'You\\'ll receive an instructional description (\"Instruction\") and {no_texts} text outputs (\"Text\").\\n'\n            \"Understand and interpret instructions to evaluate effectively.\\n\"\n            \"Provide annotations for each text with a rating and rationale.\\n\"\n            \"The {no_texts} texts given are independent, and should be evaluated separately.\\n\"\n        )\n    )\n    _template: Optional[\"Template\"] 
= PrivateAttr(default=...)\n    _can_be_used_with_offline_batch_generation = True\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template for the given `aspect`.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"ultrafeedback\"\n            / f\"{self.aspect}.jinja2\"\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task is the `instruction`, and the `generations` for it.\"\"\"\n        return [\"instruction\", \"generations\"]\n\n    def format_input(self, input: Dict[str, Any]) -> ChatType:\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation.\"\"\"\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": self._system_prompt.format(\n                    no_texts=len(input[\"generations\"])\n                ),\n            },\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(  # type: ignore\n                    instruction=input[\"instruction\"], generations=input[\"generations\"]\n                ),\n            },\n        ]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The output for the task is the `generation` and the `model_name`.\"\"\"\n        columns = []\n        if self.aspect in [\"honesty\", \"instruction-following\", \"overall-rating\"]:\n            columns = [\"ratings\", \"rationales\"]\n        elif self.aspect in [\"helpfulness\", \"truthfulness\"]:\n            columns = [\"types\", \"rationales\", \"ratings\", \"rationales-for-ratings\"]\n        return columns + [\"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: 
Union[Dict[str, Any], None] = None\n    ) -> Dict[str, Any]:\n        \"\"\"The output is formatted as a dictionary with the `ratings` and `rationales` for\n        each of the provided `generations` for the given `instruction`. The `model_name`\n        will be automatically included within the `process` method of `Task`.\n\n        Args:\n            output: a string representing the output of the LLM via the `process` method.\n            input: the input to the task, as required by some tasks to format the output.\n\n        Returns:\n            A dictionary containing either the `ratings` and `rationales` for each of the provided\n            `generations` for the given `instruction` if the provided aspect is either `honesty`,\n            `instruction-following`, or `overall-rating`; or the `types`, `rationales`,\n            `ratings`, and `rationales-for-ratings` for each of the provided `generations` for the\n            given `instruction` if the provided aspect is either `helpfulness` or `truthfulness`.\n        \"\"\"\n        assert input is not None, \"Input is required to format the output.\"\n\n        if self.aspect in [\n            \"honesty\",\n            \"instruction-following\",\n            \"overall-rating\",\n        ]:\n            return self._format_ratings_rationales_output(output, input)\n\n        return self._format_types_ratings_rationales_output(output, input)\n\n    def _format_ratings_rationales_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, List[Any]]:\n        \"\"\"Formats the output when the aspect is either `honesty`, `instruction-following`, or `overall-rating`.\"\"\"\n        if output is None:\n            return {\n                \"ratings\": [None] * len(input[\"generations\"]),\n                \"rationales\": [None] * len(input[\"generations\"]),\n            }\n\n        if self.use_default_structured_output:\n            return self._format_structured_output(output, 
input)\n\n        pattern = r\"Rating: (.+?)\\nRationale: (.+)\"\n        sections = output.split(\"\\n\\n\")\n\n        formatted_outputs = []\n        for section in sections:\n            matches = None\n            if section is not None and section != \"\":\n                matches = re.search(pattern, section, re.DOTALL)\n            if not matches:\n                formatted_outputs.append({\"ratings\": None, \"rationales\": None})\n                continue\n\n            formatted_outputs.append(\n                {\n                    \"ratings\": (\n                        int(re.findall(r\"\\b\\d+\\b\", matches.group(1))[0])\n                        if matches.group(1) not in [\"None\", \"N/A\"]\n                        else None\n                    ),\n                    \"rationales\": matches.group(2),\n                }\n            )\n        return group_dicts(*formatted_outputs)\n\n    def _format_types_ratings_rationales_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, List[Any]]:\n        \"\"\"Formats the output when the aspect is either `helpfulness` or `truthfulness`.\"\"\"\n        if output is None:\n            return {\n                \"types\": [None] * len(input[\"generations\"]),\n                \"rationales\": [None] * len(input[\"generations\"]),\n                \"ratings\": [None] * len(input[\"generations\"]),\n                \"rationales-for-ratings\": [None] * len(input[\"generations\"]),\n            }\n\n        if self.use_default_structured_output:\n            return self._format_structured_output(output, input)\n\n        pattern = r\"Type: (.+?)\\nRationale: (.+?)\\nRating: (.+?)\\nRationale: (.+)\"\n\n        sections = output.split(\"\\n\\n\")\n\n        formatted_outputs = []\n        for section in sections:\n            matches = None\n            if section is not None and section != \"\":\n                matches = re.search(pattern, section, re.DOTALL)\n            if 
not matches:\n                formatted_outputs.append(\n                    {\n                        \"types\": None,\n                        \"rationales\": None,\n                        \"ratings\": None,\n                        \"rationales-for-ratings\": None,\n                    }\n                )\n                continue\n\n            formatted_outputs.append(\n                {\n                    \"types\": (\n                        int(re.findall(r\"\\b\\d+\\b\", matches.group(1))[0])\n                        if matches.group(1) not in [\"None\", \"N/A\"]\n                        else None\n                    ),\n                    \"rationales\": matches.group(2),\n                    \"ratings\": (\n                        int(re.findall(r\"\\b\\d+\\b\", matches.group(3))[0])\n                        if matches.group(3) not in [\"None\", \"N/A\"]\n                        else None\n                    ),\n                    \"rationales-for-ratings\": matches.group(4),\n                }\n            )\n        return group_dicts(*formatted_outputs)\n\n    @override\n    def get_structured_output(self) -> Dict[str, Any]:\n        \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n        a dictionary with the output which can be directly parsed as a python dictionary.\n\n        The schema corresponds to the following:\n\n        ```python\n        from pydantic import BaseModel\n        from typing import List\n\n        class SchemaUltraFeedback(BaseModel):\n            ratings: List[int]\n            rationales: List[str]\n\n        class SchemaUltraFeedbackWithType(BaseModel):\n            types: List[Optional[int]]\n            ratings: List[int]\n            rationales: List[str]\n            rationales_for_rating: List[str]\n        ```\n\n        Returns:\n            JSON Schema of the response to enforce.\n        \"\"\"\n        if self.aspect in [\n            \"honesty\",\n            
\"instruction-following\",\n            \"overall-rating\",\n        ]:\n            return {\n                \"properties\": {\n                    \"ratings\": {\n                        \"items\": {\"type\": \"integer\"},\n                        \"title\": \"Ratings\",\n                        \"type\": \"array\",\n                    },\n                    \"rationales\": {\n                        \"items\": {\"type\": \"string\"},\n                        \"title\": \"Rationales\",\n                        \"type\": \"array\",\n                    },\n                },\n                \"required\": [\"ratings\", \"rationales\"],\n                \"title\": \"SchemaUltraFeedback\",\n                \"type\": \"object\",\n            }\n        return {\n            \"properties\": {\n                \"types\": {\n                    \"items\": {\"anyOf\": [{\"type\": \"integer\"}, {\"type\": \"null\"}]},\n                    \"title\": \"Types\",\n                    \"type\": \"array\",\n                },\n                \"ratings\": {\n                    \"items\": {\"type\": \"integer\"},\n                    \"title\": \"Ratings\",\n                    \"type\": \"array\",\n                },\n                \"rationales\": {\n                    \"items\": {\"type\": \"string\"},\n                    \"title\": \"Rationales\",\n                    \"type\": \"array\",\n                },\n                \"rationales_for_rating\": {\n                    \"items\": {\"type\": \"string\"},\n                    \"title\": \"Rationales For Rating\",\n                    \"type\": \"array\",\n                },\n            },\n            \"required\": [\"types\", \"ratings\", \"rationales\", \"rationales_for_rating\"],\n            \"title\": \"SchemaUltraFeedbackWithType\",\n            \"type\": \"object\",\n        }\n\n    def _format_structured_output(\n        self, output: str, input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        
\"\"\"Parses the structured response, which should correspond to a dictionary\n        with either `positive`, or `positive` and `negative` keys.\n\n        Args:\n            output: The output from the `LLM`.\n\n        Returns:\n            Formatted output.\n        \"\"\"\n        try:\n            return orjson.loads(output)\n        except orjson.JSONDecodeError:\n            if self.aspect in [\n                \"honesty\",\n                \"instruction-following\",\n                \"overall-rating\",\n            ]:\n                return {\n                    \"ratings\": [None] * len(input[\"generations\"]),\n                    \"rationales\": [None] * len(input[\"generations\"]),\n                }\n            return {\n                \"ratings\": [None] * len(input[\"generations\"]),\n                \"rationales\": [None] * len(input[\"generations\"]),\n                \"types\": [None] * len(input[\"generations\"]),\n                \"rationales-for-ratings\": [None] * len(input[\"generations\"]),\n            }\n\n    @override\n    def _sample_input(self) -> ChatType:\n        return self.format_input(\n            {\n                \"instruction\": f\"<PLACEHOLDER_{'instruction'.upper()}>\",\n                \"generations\": [\n                    f\"<PLACEHOLDER_{f'GENERATION_{i}'.upper()}>\" for i in range(2)\n                ],\n            }\n        )\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.inputs","title":"inputs: List[str] property","text":"

The input for the task is the instruction, and the generations for it.

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.outputs","title":"outputs: List[str] property","text":"

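The output for the task is the ratings and rationales (plus the types and rationales-for-ratings when the aspect is helpfulness or truthfulness), and the model_name.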

"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.load","title":"load()","text":"

Loads the Jinja2 template for the given aspect.

Source code in src/distilabel/steps/tasks/ultrafeedback.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template for the given `aspect`.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"ultrafeedback\"\n        / f\"{self.aspect}.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.format_input","title":"format_input(input)","text":"

The input is formatted as a ChatType assuming that the instruction is the first interaction from the user within a conversation.

Source code in src/distilabel/steps/tasks/ultrafeedback.py
def format_input(self, input: Dict[str, Any]) -> ChatType:\n    \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n    is the first interaction from the user within a conversation.\"\"\"\n    return [\n        {\n            \"role\": \"system\",\n            \"content\": self._system_prompt.format(\n                no_texts=len(input[\"generations\"])\n            ),\n        },\n        {\n            \"role\": \"user\",\n            \"content\": self._template.render(  # type: ignore\n                instruction=input[\"instruction\"], generations=input[\"generations\"]\n            ),\n        },\n    ]\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.format_output","title":"format_output(output, input=None)","text":"

The output is formatted as a dictionary with the ratings and rationales for each of the provided generations for the given instruction. The model_name will be automatically included within the process method of Task.

Parameters:

Name Type Description Default output Union[str, None]

a string representing the output of the LLM via the process method.

required input Union[Dict[str, Any], None]

the input to the task, as required by some tasks to format the output.

None

Returns:

Type Description Dict[str, Any]

A dictionary containing either the ratings and rationales for each of the provided generations for the given instruction if the provided aspect is either honesty, instruction-following, or overall-rating; or the types, rationales, ratings, and rationales-for-ratings for each of the provided generations for the given instruction if the provided aspect is either helpfulness or truthfulness.

Source code in src/distilabel/steps/tasks/ultrafeedback.py
def format_output(\n    self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n) -> Dict[str, Any]:\n    \"\"\"The output is formatted as a dictionary with the `ratings` and `rationales` for\n    each of the provided `generations` for the given `instruction`. The `model_name`\n    will be automatically included within the `process` method of `Task`.\n\n    Args:\n        output: a string representing the output of the LLM via the `process` method.\n        input: the input to the task, as required by some tasks to format the output.\n\n    Returns:\n        A dictionary containing either the `ratings` and `rationales` for each of the provided\n        `generations` for the given `instruction` if the provided aspect is either `honesty`,\n        `instruction-following`, or `overall-rating`; or the `types`, `rationales`,\n        `ratings`, and `rationales-for-ratings` for each of the provided `generations` for the\n        given `instruction` if the provided aspect is either `helpfulness` or `truthfulness`.\n    \"\"\"\n    assert input is not None, \"Input is required to format the output.\"\n\n    if self.aspect in [\n        \"honesty\",\n        \"instruction-following\",\n        \"overall-rating\",\n    ]:\n        return self._format_ratings_rationales_output(output, input)\n\n    return self._format_types_ratings_rationales_output(output, input)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback._format_ratings_rationales_output","title":"_format_ratings_rationales_output(output, input)","text":"

Formats the output when the aspect is either honesty, instruction-following, or overall-rating.

Source code in src/distilabel/steps/tasks/ultrafeedback.py
def _format_ratings_rationales_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, List[Any]]:\n    \"\"\"Formats the output when the aspect is either `honesty`, `instruction-following`, or `overall-rating`.\"\"\"\n    if output is None:\n        return {\n            \"ratings\": [None] * len(input[\"generations\"]),\n            \"rationales\": [None] * len(input[\"generations\"]),\n        }\n\n    if self.use_default_structured_output:\n        return self._format_structured_output(output, input)\n\n    pattern = r\"Rating: (.+?)\\nRationale: (.+)\"\n    sections = output.split(\"\\n\\n\")\n\n    formatted_outputs = []\n    for section in sections:\n        matches = None\n        if section is not None and section != \"\":\n            matches = re.search(pattern, section, re.DOTALL)\n        if not matches:\n            formatted_outputs.append({\"ratings\": None, \"rationales\": None})\n            continue\n\n        formatted_outputs.append(\n            {\n                \"ratings\": (\n                    int(re.findall(r\"\\b\\d+\\b\", matches.group(1))[0])\n                    if matches.group(1) not in [\"None\", \"N/A\"]\n                    else None\n                ),\n                \"rationales\": matches.group(2),\n            }\n        )\n    return group_dicts(*formatted_outputs)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback._format_types_ratings_rationales_output","title":"_format_types_ratings_rationales_output(output, input)","text":"

Formats the output when the aspect is either helpfulness or truthfulness.

Source code in src/distilabel/steps/tasks/ultrafeedback.py
def _format_types_ratings_rationales_output(\n    self, output: Union[str, None], input: Dict[str, Any]\n) -> Dict[str, List[Any]]:\n    \"\"\"Formats the output when the aspect is either `helpfulness` or `truthfulness`.\"\"\"\n    if output is None:\n        return {\n            \"types\": [None] * len(input[\"generations\"]),\n            \"rationales\": [None] * len(input[\"generations\"]),\n            \"ratings\": [None] * len(input[\"generations\"]),\n            \"rationales-for-ratings\": [None] * len(input[\"generations\"]),\n        }\n\n    if self.use_default_structured_output:\n        return self._format_structured_output(output, input)\n\n    pattern = r\"Type: (.+?)\\nRationale: (.+?)\\nRating: (.+?)\\nRationale: (.+)\"\n\n    sections = output.split(\"\\n\\n\")\n\n    formatted_outputs = []\n    for section in sections:\n        matches = None\n        if section is not None and section != \"\":\n            matches = re.search(pattern, section, re.DOTALL)\n        if not matches:\n            formatted_outputs.append(\n                {\n                    \"types\": None,\n                    \"rationales\": None,\n                    \"ratings\": None,\n                    \"rationales-for-ratings\": None,\n                }\n            )\n            continue\n\n        formatted_outputs.append(\n            {\n                \"types\": (\n                    int(re.findall(r\"\\b\\d+\\b\", matches.group(1))[0])\n                    if matches.group(1) not in [\"None\", \"N/A\"]\n                    else None\n                ),\n                \"rationales\": matches.group(2),\n                \"ratings\": (\n                    int(re.findall(r\"\\b\\d+\\b\", matches.group(3))[0])\n                    if matches.group(3) not in [\"None\", \"N/A\"]\n                    else None\n                ),\n                \"rationales-for-ratings\": matches.group(4),\n            }\n        )\n    return group_dicts(*formatted_outputs)\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback.get_structured_output","title":"get_structured_output()","text":"

Creates the json schema to be passed to the LLM, to enforce generating a dictionary with the output which can be directly parsed as a python dictionary.

The schema corresponds to the following:

from pydantic import BaseModel\nfrom typing import List\n\nclass SchemaUltraFeedback(BaseModel):\n    ratings: List[int]\n    rationales: List[str]\n\nclass SchemaUltraFeedbackWithType(BaseModel):\n    types: List[Optional[int]]\n    ratings: List[int]\n    rationales: List[str]\n    rationales_for_rating: List[str]\n

Returns:

Type Description Dict[str, Any]

JSON Schema of the response to enforce.

Source code in src/distilabel/steps/tasks/ultrafeedback.py
@override\ndef get_structured_output(self) -> Dict[str, Any]:\n    \"\"\"Creates the json schema to be passed to the LLM, to enforce generating\n    a dictionary with the output which can be directly parsed as a python dictionary.\n\n    The schema corresponds to the following:\n\n    ```python\n    from pydantic import BaseModel\n    from typing import List\n\n    class SchemaUltraFeedback(BaseModel):\n        ratings: List[int]\n        rationales: List[str]\n\n    class SchemaUltraFeedbackWithType(BaseModel):\n        types: List[Optional[int]]\n        ratings: List[int]\n        rationales: List[str]\n        rationales_for_rating: List[str]\n    ```\n\n    Returns:\n        JSON Schema of the response to enforce.\n    \"\"\"\n    if self.aspect in [\n        \"honesty\",\n        \"instruction-following\",\n        \"overall-rating\",\n    ]:\n        return {\n            \"properties\": {\n                \"ratings\": {\n                    \"items\": {\"type\": \"integer\"},\n                    \"title\": \"Ratings\",\n                    \"type\": \"array\",\n                },\n                \"rationales\": {\n                    \"items\": {\"type\": \"string\"},\n                    \"title\": \"Rationales\",\n                    \"type\": \"array\",\n                },\n            },\n            \"required\": [\"ratings\", \"rationales\"],\n            \"title\": \"SchemaUltraFeedback\",\n            \"type\": \"object\",\n        }\n    return {\n        \"properties\": {\n            \"types\": {\n                \"items\": {\"anyOf\": [{\"type\": \"integer\"}, {\"type\": \"null\"}]},\n                \"title\": \"Types\",\n                \"type\": \"array\",\n            },\n            \"ratings\": {\n                \"items\": {\"type\": \"integer\"},\n                \"title\": \"Ratings\",\n                \"type\": \"array\",\n            },\n            \"rationales\": {\n                \"items\": {\"type\": \"string\"},\n              
  \"title\": \"Rationales\",\n                \"type\": \"array\",\n            },\n            \"rationales_for_rating\": {\n                \"items\": {\"type\": \"string\"},\n                \"title\": \"Rationales For Rating\",\n                \"type\": \"array\",\n            },\n        },\n        \"required\": [\"types\", \"ratings\", \"rationales\", \"rationales_for_rating\"],\n        \"title\": \"SchemaUltraFeedbackWithType\",\n        \"type\": \"object\",\n    }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.UltraFeedback._format_structured_output","title":"_format_structured_output(output, input)","text":"

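Parses the structured response, which should correspond to a dictionary following the schema returned by get_structured_output: ratings and rationales keys, plus types and per-rating rationales keys for the aspects that include types.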

Parameters:

Name Type Description Default output str

The output from the LLM.

required

Returns:

Type Description Dict[str, Any]

Formatted output.

Source code in src/distilabel/steps/tasks/ultrafeedback.py
def _format_structured_output(\n    self, output: str, input: Dict[str, Any]\n) -> Dict[str, Any]:\n    \"\"\"Parses the structured response, which should correspond to a dictionary\n    with either `positive`, or `positive` and `negative` keys.\n\n    Args:\n        output: The output from the `LLM`.\n\n    Returns:\n        Formatted output.\n    \"\"\"\n    try:\n        return orjson.loads(output)\n    except orjson.JSONDecodeError:\n        if self.aspect in [\n            \"honesty\",\n            \"instruction-following\",\n            \"overall-rating\",\n        ]:\n            return {\n                \"ratings\": [None] * len(input[\"generations\"]),\n                \"rationales\": [None] * len(input[\"generations\"]),\n            }\n        return {\n            \"ratings\": [None] * len(input[\"generations\"]),\n            \"rationales\": [None] * len(input[\"generations\"]),\n            \"types\": [None] * len(input[\"generations\"]),\n            \"rationales-for-ratings\": [None] * len(input[\"generations\"]),\n        }\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.URIAL","title":"URIAL","text":"

Bases: Task

Generates a response using a non-instruct fine-tuned model.

URIAL is a pre-defined task that generates a response using a non-instruct fine-tuned model. This task is used to generate a response based on the conversation provided as input.

Input columns
  • instruction (str, optional): The instruction to generate a response from.
  • conversation (List[Dict[str, str]], optional): The conversation to generate a response from (the last message must be from the user).
Output columns
  • generation (str): The generated response.
  • model_name (str): The name of the model used to generate the response.
Categories
  • text-generation
References
  • The Unlocking Spell on Base LLMs: Rethinking Alignment via In-Context Learning

Examples:

Generate text from an instruction:

from distilabel.models import vLLM\nfrom distilabel.steps.tasks import URIAL\n\nstep = URIAL(\n    llm=vLLM(\n        model=\"meta-llama/Meta-Llama-3.1-8B\",\n        generation_kwargs={\"temperature\": 0.7},\n    ),\n)\n\nstep.load()\n\nresults = next(\n    step.process(inputs=[{\"instruction\": \"What's the most most common type of cloud?\"}])\n)\n# [\n#     {\n#         'instruction': \"What's the most most common type of cloud?\",\n#         'generation': 'Clouds are classified into three main types, high, middle, and low. The most common type of cloud is the middle cloud.',\n#         'distilabel_metadata': {...},\n#         'model_name': 'meta-llama/Meta-Llama-3.1-8B'\n#     }\n# ]\n
Source code in src/distilabel/steps/tasks/urial.py
class URIAL(Task):\n    \"\"\"Generates a response using a non-instruct fine-tuned model.\n\n    `URIAL` is a pre-defined task that generates a response using a non-instruct fine-tuned\n    model. This task is used to generate a response based on the conversation provided as\n    input.\n\n    Input columns:\n        - instruction (`str`, optional): The instruction to generate a response from.\n        - conversation (`List[Dict[str, str]]`, optional): The conversation to generate\n            a response from (the last message must be from the user).\n\n    Output columns:\n        - generation (`str`): The generated response.\n        - model_name (`str`): The name of the model used to generate the response.\n\n    Categories:\n        - text-generation\n\n    References:\n        - [The Unlocking Spell on Base LLMs: Rethinking Alignment via In-Context Learning](https://arxiv.org/abs/2312.01552)\n\n    Examples:\n        Generate text from an instruction:\n\n        ```python\n        from distilabel.models import vLLM\n        from distilabel.steps.tasks import URIAL\n\n        step = URIAL(\n            llm=vLLM(\n                model=\"meta-llama/Meta-Llama-3.1-8B\",\n                generation_kwargs={\"temperature\": 0.7},\n            ),\n        )\n\n        step.load()\n\n        results = next(\n            step.process(inputs=[{\"instruction\": \"What's the most most common type of cloud?\"}])\n        )\n        # [\n        #     {\n        #         'instruction': \"What's the most most common type of cloud?\",\n        #         'generation': 'Clouds are classified into three main types, high, middle, and low. 
The most common type of cloud is the middle cloud.',\n        #         'distilabel_metadata': {...},\n        #         'model_name': 'meta-llama/Meta-Llama-3.1-8B'\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template for the given `aspect`.\"\"\"\n        super().load()\n\n        _path = str(\n            importlib_resources.files(\"distilabel\")\n            / \"steps\"\n            / \"tasks\"\n            / \"templates\"\n            / \"urial.jinja2\"\n        )\n\n        self._template = Template(open(_path).read())\n\n    @property\n    def inputs(self) -> \"StepColumns\":\n        return {\"instruction\": False, \"conversation\": False}\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        messages = (\n            [{\"role\": \"user\", \"content\": input[\"instruction\"]}]\n            if \"instruction\" in input\n            else input[\"conversation\"]\n        )\n\n        if messages[-1][\"role\"] != \"user\":\n            raise ValueError(\"The last message must be from the user.\")\n\n        return [{\"role\": \"user\", \"content\": self._template.render(messages=messages)}]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        return [\"generation\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n    ) -> Dict[str, Any]:\n        if output is None:\n            return {\"generation\": None}\n\n        response = output.split(\"\\n\\n# User\")[0]\n        if response.startswith(\"\\n\\n\"):\n            response = response[2:]\n        response = response.strip()\n\n        return {\"generation\": response}\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.URIAL.load","title":"load()","text":"

Loads the Jinja2 template used by the URIAL task (urial.jinja2).

Source code in src/distilabel/steps/tasks/urial.py
def load(self) -> None:\n    \"\"\"Loads the Jinja2 template for the given `aspect`.\"\"\"\n    super().load()\n\n    _path = str(\n        importlib_resources.files(\"distilabel\")\n        / \"steps\"\n        / \"tasks\"\n        / \"templates\"\n        / \"urial.jinja2\"\n    )\n\n    self._template = Template(open(_path).read())\n
"},{"location":"api/task/task_gallery/#distilabel.steps.tasks.task","title":"task(inputs=None, outputs=None)","text":"

Creates a Task from a formatting output function.

Parameters:

Name Type Description Default inputs Union[StepColumns, None]

a list containing the name of the inputs columns/keys or a dictionary where the keys are the columns and the values are booleans indicating whether the column is required or not, that are required by the step. If not provided the default will be an empty list [] and it will be assumed that the step doesn't need any specific columns. Defaults to None.

None outputs Union[StepColumns, None]

a list containing the name of the outputs columns/keys or a dictionary where the keys are the columns and the values are booleans indicating whether the column will be generated or not. If not provided the default will be an empty list [] and it will be assumed that the step doesn't need any specific columns. Defaults to None.

None Source code in src/distilabel/steps/tasks/decorator.py
def task(\n    inputs: Union[\"StepColumns\", None] = None,\n    outputs: Union[\"StepColumns\", None] = None,\n) -> Callable[..., Type[\"Task\"]]:\n    \"\"\"Creates a `Task` from a formatting output function.\n\n    Args:\n        inputs: a list containing the name of the inputs columns/keys or a dictionary\n            where the keys are the columns and the values are booleans indicating whether\n            the column is required or not, that are required by the step. If not provided\n            the default will be an empty list `[]` and it will be assumed that the step\n            doesn't need any specific columns. Defaults to `None`.\n        outputs: a list containing the name of the outputs columns/keys or a dictionary\n            where the keys are the columns and the values are booleans indicating whether\n            the column will be generated or not. If not provided the default will be an\n            empty list `[]` and it will be assumed that the step doesn't need any specific\n            columns. Defaults to `None`.\n    \"\"\"\n\n    inputs = inputs or []\n    outputs = outputs or []\n\n    def decorator(func: TaskFormattingOutputFunc) -> Type[\"Task\"]:\n        doc = inspect.getdoc(func)\n        if doc is None:\n            raise DistilabelUserError(\n                \"When using the `task` decorator, including a docstring in the formatting\"\n                \" function is mandatory. 
The docstring must follow the format described\"\n                \" in the documentation.\",\n                page=\"\",\n            )\n\n        system_prompt, user_message_template = _parse_docstring(doc)\n        _validate_templates(inputs, system_prompt, user_message_template)\n\n        def inputs_property(self) -> \"StepColumns\":\n            return inputs\n\n        def outputs_property(self) -> \"StepColumns\":\n            return outputs\n\n        def format_input(self, input: Dict[str, Any]) -> \"FormattedInput\":\n            return [\n                {\"role\": \"system\", \"content\": system_prompt.format(**input)},\n                {\"role\": \"user\", \"content\": user_message_template.format(**input)},\n            ]\n\n        def format_output(\n            self, output: Union[str, None], input: Union[Dict[str, Any], None] = None\n        ) -> Dict[str, Any]:\n            return func(output, input)\n\n        return type(\n            func.__name__,\n            (Task,),\n            {\n                \"inputs\": property(inputs_property),\n                \"outputs\": property(outputs_property),\n                \"__module__\": func.__module__,\n                \"format_input\": format_input,\n                \"format_output\": format_output,\n            },\n        )\n\n    return decorator\n
"},{"location":"api/task/typing/","title":"Task Typing","text":""},{"location":"api/task/typing/#distilabel.steps.tasks.typing","title":"typing","text":""},{"location":"api/task/typing/#distilabel.steps.tasks.typing.ChatType","title":"ChatType = List[ChatItem] module-attribute","text":"

ChatType is a type alias for a list of dicts following the OpenAI conversational format.

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.StructuredOutputType","title":"StructuredOutputType = Union[OutlinesStructuredOutputType, InstructorStructuredOutputType] module-attribute","text":"

StructuredOutputType is an alias for the union of OutlinesStructuredOutputType and InstructorStructuredOutputType.

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.StandardInput","title":"StandardInput = ChatType module-attribute","text":"

StandardInput is an alias for ChatType that defines the default / standard input produced by format_input.

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.StructuredInput","title":"StructuredInput = Tuple[StandardInput, Union[StructuredOutputType, None]] module-attribute","text":"

StructuredInput defines a type produced by format_input when using either StructuredGeneration or a subclass of it.

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.FormattedInput","title":"FormattedInput = Union[StandardInput, StructuredInput] module-attribute","text":"

FormattedInput is an alias for the union of StandardInput and StructuredInput as generated by format_input and expected by the LLMs.

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.OutlinesStructuredOutputType","title":"OutlinesStructuredOutputType","text":"

Bases: TypedDict

TypedDict to represent the structured output configuration from outlines.

Source code in src/distilabel/steps/tasks/typing.py
class OutlinesStructuredOutputType(TypedDict, total=False):\n    \"\"\"TypedDict to represent the structured output configuration from `outlines`.\"\"\"\n\n    format: Literal[\"json\", \"regex\"]\n    \"\"\"One of \"json\" or \"regex\".\"\"\"\n    schema: Union[str, Type[BaseModel], Dict[str, Any]]\n    \"\"\"The schema to use for the structured output. If \"json\", it\n    can be a pydantic.BaseModel class, or the schema as a string,\n    as obtained from `model_to_schema(BaseModel)`, if \"regex\", it\n    should be a regex pattern as a string.\n    \"\"\"\n    whitespace_pattern: Optional[Union[str, List[str]]]\n    \"\"\"If \"json\" corresponds to a string or a list of\n    strings with a pattern (doesn't impact string literals).\n    For example, to allow only a single space or newline with\n    `whitespace_pattern=r\"[\\n ]?\"`\n    \"\"\"\n
"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.OutlinesStructuredOutputType.format","title":"format: Literal['json', 'regex'] instance-attribute","text":"

One of \"json\" or \"regex\".

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.OutlinesStructuredOutputType.schema","title":"schema: Union[str, Type[BaseModel], Dict[str, Any]] instance-attribute","text":"

The schema to use for the structured output. If \"json\", it can be a pydantic.BaseModel class, or the schema as a string, as obtained from model_to_schema(BaseModel), if \"regex\", it should be a regex pattern as a string.

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.OutlinesStructuredOutputType.whitespace_pattern","title":"whitespace_pattern: Optional[Union[str, List[str]]] instance-attribute","text":"

If \"json\", corresponds to a string or a list of strings with a whitespace pattern (doesn't impact string literals). For example, to allow only a single space or newline, use whitespace_pattern=r\"[\\n ]?\"

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType","title":"InstructorStructuredOutputType","text":"

Bases: TypedDict

TypedDict to represent the structured output configuration from instructor.

Source code in src/distilabel/steps/tasks/typing.py
class InstructorStructuredOutputType(TypedDict, total=False):\n    \"\"\"TypedDict to represent the structured output configuration from `instructor`.\"\"\"\n\n    format: Optional[Literal[\"json\"]]\n    \"\"\"One of \"json\".\"\"\"\n    schema: Union[Type[BaseModel], Dict[str, Any]]\n    \"\"\"The schema to use for the structured output, a `pydantic.BaseModel` class. \"\"\"\n    mode: Optional[str]\n    \"\"\"Generation mode. Take a look at `instructor.Mode` for more information, if not informed it will\n    be determined automatically. \"\"\"\n    max_retries: int\n    \"\"\"Number of times to reask the model in case of error, if not set will default to the model's default. \"\"\"\n
"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType.format","title":"format: Optional[Literal['json']] instance-attribute","text":"

One of \"json\".

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType.schema","title":"schema: Union[Type[BaseModel], Dict[str, Any]] instance-attribute","text":"

The schema to use for the structured output, a pydantic.BaseModel class.

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType.mode","title":"mode: Optional[str] instance-attribute","text":"

Generation mode. Take a look at instructor.Mode for more information, if not informed it will be determined automatically.

"},{"location":"api/task/typing/#distilabel.steps.tasks.typing.InstructorStructuredOutputType.max_retries","title":"max_retries: int instance-attribute","text":"

Number of times to reask the model in case of error, if not set will default to the model's default.

"},{"location":"sections/community/","title":"Community","text":"

We are an open-source community-driven project not only focused on building a great product but also on building a great community, where you can get support, share your experiences, and contribute to the project! We would love to hear from you and help you get started with distilabel.

  • Discord

    In our Discord channels (#argilla-general and #argilla-help), you can get direct support from the community.

    Discord \u2197

  • Community Meetup

    We host bi-weekly community meetups where you can listen in or present your work.

    Community Meetup \u2197

  • Changelog

    The changelog is where you can find the latest updates and changes to the distilabel project.

    Changelog \u2197

  • Roadmap

    We love to discuss our plans with the community. Feel encouraged to participate in our roadmap discussions.

    Roadmap \u2197

"},{"location":"sections/community/#badges","title":"Badges","text":"

If you build something cool with distilabel consider adding one of these badges to your dataset or model card.

[<img src=\"https://raw.githubusercontent.com/argilla-io/distilabel/main/docs/assets/distilabel-badge-light.png\" alt=\"Built with Distilabel\" width=\"200\" height=\"32\"/>](https://github.com/argilla-io/distilabel)\n

[<img src=\"https://raw.githubusercontent.com/argilla-io/distilabel/main/docs/assets/distilabel-badge-dark.png\" alt=\"Built with Distilabel\" width=\"200\" height=\"32\"/>](https://github.com/argilla-io/distilabel)\n

"},{"location":"sections/community/#contribute","title":"Contribute","text":"

To contribute directly to distilabel, check our good first issues or open a new one.

"},{"location":"sections/community/contributor/","title":"How to contribute?","text":"

Thank you for investing your time in contributing to the project! Any contribution you make will be reflected in the most recent version of distilabel \ud83e\udd29.

New to contributing in general?

If you're a new contributor, read the README to get an overview of the project. In addition, here are some resources to help you get started with open-source contributions:

  • Discord: You are welcome to join the distilabel Discord community, where you can keep in touch with other users, contributors and the distilabel team. In the following section, you can find more information on how to get started in Discord.
  • Git: This is a very useful tool to keep track of the changes in your files. Using the command-line interface (CLI), you can make your contributions easily. For that, you need to have it installed and updated on your computer.
  • GitHub: It is a platform and cloud-based service that uses git and allows developers to collaborate on projects. To contribute to distilabel, you'll need to create an account. Check the Contributor Workflow with Git and Github for more info.
  • Developer Documentation: To collaborate, you'll need to set up an efficient environment. Check the Installation guide to know how to do it.
"},{"location":"sections/community/contributor/#first-contact-in-discord","title":"First Contact in Discord","text":"

Discord is a handy tool for more casual conversations and to answer day-to-day questions. As part of Hugging Face, we have set up some distilabel channels on the server. Click here to join the Hugging Face Discord community effortlessly.

When part of the Hugging Face Discord, you can select \"Channels & roles\" and select \"Argilla\" along with any of the other groups that are interesting to you. \"Argilla\" will cover anything about argilla and distilabel. You can join the following channels:

  • #argilla-distilabel-announcements: \ud83d\udce3 Stay up-to-date.
  • #argilla-distilabel-general: \ud83d\udcac For general discussions.
  • #argilla-distilabel-help: \ud83d\ude4b\u200d\u2640\ufe0f Need assistance? We're always here to help. Select the appropriate label (argilla or distilabel) for your issue and post it.

So now there is only one thing left to do: introduce yourself and talk to the community. You'll always be welcome! \ud83e\udd17\ud83d\udc4b

"},{"location":"sections/community/contributor/#contributor-workflow-with-git-and-github","title":"Contributor Workflow with Git and GitHub","text":"

If you're working with distilabel and suddenly a new idea comes to your mind or you find an issue that can be improved, it's time to actively participate and contribute to the project!

"},{"location":"sections/community/contributor/#report-an-issue","title":"Report an issue","text":"

If you spot a problem, first search whether an issue already exists; you can use the Label filter to narrow the results. If that is the case, participate in the conversation. If it does not exist, create an issue by clicking on New Issue. This will show various templates; choose the one that best suits your issue. Once you choose one, you will need to fill it in following the guidelines. Try to be as clear as possible. In addition, you can assign yourself to the issue and add or choose the right labels. Finally, click on Submit new issue.

"},{"location":"sections/community/contributor/#work-with-a-fork","title":"Work with a fork","text":""},{"location":"sections/community/contributor/#fork-the-distilabel-repository","title":"Fork the distilabel repository","text":"

After having reported the issue, you can start working on it. For that, you will need to create a fork of the project. To do that, click on the Fork button. Now, fill in the information. Remember to uncheck the Copy develop branch only if you are going to work in or from another branch (for instance, to fix documentation, the main branch is used). Then, click on Create fork.

You will be redirected to your fork. You can see that you are in your fork because the name of the repository will be your username/distilabel, and it will indicate forked from argilla-io/distilabel.

"},{"location":"sections/community/contributor/#clone-your-forked-repository","title":"Clone your forked repository","text":"

In order to make the required adjustments, clone the forked repository to your local machine. Choose the destination folder and run the following command:

git clone https://github.com/[your-github-username]/distilabel.git\ncd distilabel\n

To keep your fork\u2019s main/develop branch up to date with our repo, add it as an upstream remote branch.

git remote add upstream https://github.com/argilla-io/distilabel.git\n
"},{"location":"sections/community/contributor/#create-a-new-branch","title":"Create a new branch","text":"

For each issue you're addressing, it's advisable to create a new branch. GitHub offers a straightforward method to streamline this process.

\u26a0\ufe0f Never work directly on the main or develop branch. Always create a new branch for your changes.

Navigate to your issue, and on the right column, select Create a branch.

After the new window pops up, the branch will be named after the issue and include a prefix such as feature/, bug/, or docs/ to facilitate quick recognition of the issue type. In the Repository destination, pick your fork ( [your-github-username]/distilabel), and then select Change branch source to specify the source branch for creating the new one. Complete the process by clicking Create branch.

\ud83e\udd14 Remember that the main branch is only used to work with the documentation. For any other changes, use the develop branch.

Now, locally, change to the new branch you just created.

git fetch origin\ngit checkout [branch-name]\n
"},{"location":"sections/community/contributor/#make-changes-and-push-them","title":"Make changes and push them","text":"

Make the changes you want in your local repository, and test that everything works and you are following the guidelines.

Once you have finished, you can check the status of your repository and synchronize with the upstream repo with the following commands:

# Check the status of your repository\ngit status\n\n# Synchronize with the upstreaming repo\ngit checkout [branch-name]\ngit rebase [default-branch]\n

If everything is right, we need to commit and push the changes to your fork. For that, run the following commands:

# Add the changes to the staging area\ngit add filename\n\n# Commit the changes by writing a proper message\ngit commit -m \"commit-message\"\n\n# Push the changes to your fork\ngit push origin [branch-name]\n

When pushing, you will be asked to enter your GitHub login credentials. Once the push is complete, all local commits will be on your GitHub repository.

"},{"location":"sections/community/contributor/#create-a-pull-request","title":"Create a pull request","text":"

Come back to GitHub, navigate to the original repository where you created your fork, and click on Compare & pull request.

First, click on compare across forks and select the right repositories and branches.

In the base repository, keep in mind that you should select either main or develop based on the modifications made. In the head repository, indicate your forked repository and the branch corresponding to the issue.

Then, fill in the pull request template. You should add a prefix to the PR name, as we did with the branch above. If you are working on a new feature, you can name your PR as feat: TITLE. If your PR consists of a solution for a bug, you can name your PR as bug: TITLE. And, if your work is for improving the documentation, you can name your PR as docs: TITLE.

In addition, on the right side, you can select a reviewer (for instance, if you discussed the issue with a member of the team) and assign the pull request to yourself. It is highly advisable to add labels to the PR as well. You can do this in the labels section, also on the right side of the screen. For instance, if you are addressing a bug, add the bug label, or if the PR is related to the documentation, add the documentation label. This way, PRs can be easily filtered.

Finally, fill in the template carefully and follow the guidelines. Remember to link the original issue and enable the checkbox to allow maintainer edits so the branch can be updated for a merge. Then, click on Create pull request.

For the PR body, ensure you give a description of what the PR contains, and add examples if possible (and if they apply to the contribution) to help with the review process. You can take a look at #PR 974 or #PR 983 for examples of typical PRs.

"},{"location":"sections/community/contributor/#review-your-pull-request","title":"Review your pull request","text":"

Once you submit your PR, a team member will review your proposal. We may ask questions, request additional information, or ask for changes to be made before a PR can be merged, either using suggested changes or pull request comments.

You can apply the changes directly through the UI (check the files changed and click on the right-corner three dots; see image below) or from your fork, and then commit them to your branch. The PR will be updated automatically, and the suggestions will appear as outdated.

If you run into any merge issues, check out this git tutorial to help you resolve merge conflicts and other issues.

"},{"location":"sections/community/contributor/#your-pr-is-merged","title":"Your PR is merged!","text":"

Congratulations \ud83c\udf89\ud83c\udf8a We thank you \ud83e\udd29

Once your PR is merged, your contributions will be publicly visible on the distilabel GitHub.

Additionally, we will include your changes in the next release based on our development branch.

"},{"location":"sections/community/contributor/#additional-resources","title":"Additional resources","text":"

Here are some helpful resources for your reference.

  • Configuring Discord, a guide to learning how to get started with Discord.
  • Pro Git, a book to learn Git.
  • Git in VSCode, a guide to learning how to easily use Git in VSCode.
  • GitHub Skills, an interactive course for learning GitHub.
"},{"location":"sections/community/developer_documentation/","title":"Developer Documentation","text":"

Thank you for investing your time in contributing to the project!

If you don't have the repository locally, and need any help, go to the contributor guide and read the contributor workflow with Git and GitHub first.

"},{"location":"sections/community/developer_documentation/#set-up-the-python-environment","title":"Set up the Python environment","text":"

To work on distilabel, you must install the package on your system.

Tip

This guide uses uv, but pip and venv can be used as well; the steps are very similar with either option.

From the root of the cloned Distilabel repository, you should move to the distilabel folder in your terminal.

cd distilabel\n
"},{"location":"sections/community/developer_documentation/#create-a-virtual-environment","title":"Create a virtual environment","text":"

The first step will be creating a virtual environment to keep our dependencies isolated. Here we are choosing python 3.11 (uv venv documentation), and then activate it:

uv venv .venv --python 3.11\nsource .venv/bin/activate\n
"},{"location":"sections/community/developer_documentation/#install-the-project","title":"Install the project","text":"

Installing from local (we are using uv pip):

uv pip install -e .\n

We have extra dependencies grouped by name; depending on the part you are working on, you may want to install some of them (take a look at pyproject.toml in the repo to see all the extra dependencies):

uv pip install -e \".[vllm,outlines]\"\n
"},{"location":"sections/community/developer_documentation/#linting-and-formatting","title":"Linting and formatting","text":"

To maintain a consistent code format, install the pre-commit hooks to run before each commit automatically (we rely heavily on ruff):

uv pip install -e \".[dev]\"\npre-commit install\n
"},{"location":"sections/community/developer_documentation/#running-tests","title":"Running tests","text":"

All the changes you add to the codebase should come with tests, either unit or integration tests, depending on the type of change, which are placed under tests/unit and tests/integration respectively.

Start by installing the tests dependencies:

uv pip install \".[tests]\"\n

Running the whole test suite may take some time, and you will need all the dependencies installed, so just run your own tests; the whole test suite will be run for you in the CI:

# Run specific tests\npytest tests/unit/steps/generators/test_data.py\n
"},{"location":"sections/community/developer_documentation/#set-up-the-documentation","title":"Set up the documentation","text":"

To contribute to the documentation and generate it locally, ensure you have installed the development dependencies:

uv pip install -e \".[docs]\"\n

And run the following command to create the development server with mkdocs:

mkdocs serve\n
"},{"location":"sections/community/developer_documentation/#documentation-guidelines","title":"Documentation guidelines","text":"

As mentioned, we use mkdocs to build the documentation. You can write the documentation in markdown format, and it will automatically be converted to HTML. In addition, you can include elements such as tables, tabs, images, and others, as shown in this guide. We recommend following these guidelines:

  • Use clear and concise language: Ensure the documentation is easy to understand for all users by using straightforward language and including meaningful examples. Images are not easy to maintain, so use them only when necessary and place them in the appropriate folder within the docs/assets/images directory.

  • Verify code snippets: Double-check that all code snippets are correct and runnable.

  • Review spelling and grammar: Check the spelling and grammar of the documentation.

  • Update the table of contents: If you add a new page, include it in the relevant index.md or the mkdocs.yml file.

"},{"location":"sections/community/developer_documentation/#components-gallery","title":"Components gallery","text":"

The components gallery section of the documentation is automatically generated by a custom plugin, which runs when mkdocs serve is called. This guide to the steps helps us visualize each step, as well as examples of use.

Note

Changes made to the docstrings of Steps/Tasks/LLMs won't appear in the components gallery automatically; you will have to stop the mkdocs server and run it again to see the changes. Everything else is reloaded automatically.

"},{"location":"sections/community/popular_issues/","title":"Issue dashboard","text":"Most engaging open issuesLatest issues open by the communityPlanned issues for upcoming releases Rank Issue Reactions Comments 1 368 - [FEATURE] create a pipeline playground UI \ud83d\udc4d 3 \ud83d\udcac 1 2 1041 - [FEATURE] Add Offline batch generation for open models with EXXA API \ud83d\udc4d 2 \ud83d\udcac 1 3 995 - [FEATURE] mlx-lm integration \ud83d\udc4d 2 \ud83d\udcac 1 4 737 - [FEATURE] Allow FormatTextGenerationSFT to include tools/function calls in the formatted messages. \ud83d\udc4d 2 \ud83d\udcac 0 5 829 - [FEATURE] Add Callable and GlobalCallable that takes custom callable as argument \ud83d\udc4d 1 \ud83d\udcac 3 6 797 - [FEATURE] synthetic data generation for predictive NLP tasks \ud83d\udc4d 1 \ud83d\udcac 1 7 914 - [FEATURE] Use Step.resources to set tensor_parallel_size and pipeline_parallel_size in vLLM \ud83d\udc4d 1 \ud83d\udcac 0 8 839 - [REFACTOR] unify singular/plural semantic naming of columns \ud83d\udc4d 1 \ud83d\udcac 0 9 788 - [DOCS] add embedded Datasetviewer to places where data is loaded from the hub \ud83d\udc4d 1 \ud83d\udcac 0 10 588 - [FEATURE] Single request caching \ud83d\udc4d 1 \ud83d\udcac 0 Rank Issue Author 1 \ud83d\udfe2 1049 - [BUG] vLLM Task not utilizing multiple GPUs in parallel when replicas > 1 by adamlin120 2 \ud83d\udfe2 1048 - [BUG] OepnAI JSON format by tinyrolls 3 \ud83d\udfe2 1047 - Failed to load all the steps. Could not run pipeline. 
by yuqie 4 \ud83d\udfe2 1046 - [FEATURE] Compute the input/output tokens of a dataset by plaguss 5 \ud83d\udfe3 1044 - Receiving error: The number of required GPUs exceeds the total number of available GPUs in the placement group by saurabhbbjain 6 \ud83d\udfe3 1042 - CUDA_VISIBLE_DEVICES does not work with distilabel code by yuqie 7 \ud83d\udfe2 1041 - [FEATURE] Add Offline batch generation for open models with EXXA API by etiennebalit 8 \ud83d\udfe2 1030 - [FEATURE] Trim inputs by arthrod 9 \ud83d\udfe2 1025 - [FEATURE] Update outlines integration to the new version by plaguss 10 \ud83d\udfe3 1020 - [BUG] Error when wrapping the step by sdiazlor Rank Issue Milestone 1 \ud83d\udfe2 880 - [FEATURE] Add exclude_from_signature attribute 1.4.0 2 \ud83d\udfe2 771 - [FEATURE] Allow passing path to YAML file containing pipeline runtime parameters in distilabel run 1.4.0 3 \ud83d\udfe2 773 - [DOCS] Include section/guide describing pipeline patterns 1.4.0 4 \ud83d\udfe2 802 - [FEATURE] Add defaults to Steps and Tasks so they can be more easily connected 1.4.0 5 \ud83d\udfe2 662 - [FEATURE] Allow passing self to steps created with step decorator 1.4.0 6 \ud83d\udfe2 889 - [FEATURE] Replace extra_sampling_params for normal arguments in vLLM 1.4.0 7 \ud83d\udfe2 942 - [BUG] make_generator_step can fail when setting the _dataset_info internally 1.4.0 8 \ud83d\udfe2 579 - [FEATURE] Sequential execution for local pipeline 1.4.0 9 \ud83d\udfe2 944 - [FEATURE] Improve the Argilla steps 1.5.0 10 \ud83d\udfe2 738 - [FEATURE] Update LLM.generate interface to allow returning arbitrary/extra stuff related to the generation 1.5.0

Last update: 2024-11-07

"},{"location":"sections/getting_started/faq/","title":"Frequent Asked Questions (FAQ)","text":"How can I rename the columns in a batch?

Every Step has both input_mappings and output_mappings attributes that can be used to rename the columns in each batch.

But input_mappings will only map, meaning that if you have a batch with the column A and you want to rename it to B, you should use input_mappings={\"A\": \"B\"}, but that will only be applied to that specific Step meaning that the next step in the pipeline will still have the column A instead of B.

While output_mappings will indeed apply the rename, meaning that if the Step produces the column A and you want to rename to B, you should use output_mappings={\"A\": \"B\"}, and that will be applied to the next Step in the pipeline.

Will the API Keys be exposed when sharing the pipeline?

No, those will be masked out using pydantic.SecretStr, meaning that those won't be exposed when sharing the pipeline.

This also means that if you want to re-run your own pipeline and the API keys have not been provided via environment variable but either via an attribute or runtime parameter, you will need to provide them again.

Does it work for Windows?

Yes, but you may need to set the multiprocessing context in advance to ensure that the spawn method is used since the default method fork is not available on Windows.

import multiprocessing as mp\n\nmp.set_start_method(\"spawn\")\n
Will the custom Steps / Tasks / LLMs be serialized too?

No, at the moment, only the references to the classes within the distilabel library will be serialized, meaning that if you define a custom class used within the pipeline, the serialization won't break, but the deserialization will fail since the class won't be available unless used from the same file.

What happens if Pipeline.run fails? Do I lose all the data?

No. We use a cache mechanism to store all the intermediate results on disk so that, if a Step fails, the pipeline can be re-run from that point without losing the data, as long as nothing is changed in the Pipeline.

All the data will be stored in .cache/distilabel, but the only data that will persist at the end of the Pipeline.run execution is the one from the leaf step/s, so bear that in mind.

For more information on the caching mechanism in distilabel, you can check the Learn - Advanced - Caching section.

Also, note that when running a Step or a Task standalone, the cache mechanism won't be used, so if you want to use that, you should use the Pipeline context manager.

How can I use the same LLM across several tasks without having to load it several times?

You can serve the LLM using a solution like TGI or vLLM, and then connect to it using an AsyncLLM client like InferenceEndpointsLLM or OpenAILLM. Please refer to Serving LLMs guide for more information.

Can distilabel be used with OpenAI Batch API?

Yes, distilabel is integrated with OpenAI Batch API via OpenAILLM. Check LLMs - Offline Batch Generation for a small example on how to use it and Advanced - Offline Batch Generation for a more detailed guide.

Prevent overloads on Free Serverless Endpoints

When running a task using the InferenceEndpointsLLM with Free Serverless Endpoints, you may face errors such as Model is overloaded if you leave the batch size at the default (set at 50). To fix the issue, lower the value or, even better, set input_batch_size=1 in your task. It may take longer to finish, but please remember this is a free service.

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps import TextGeneration\n\nTextGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=1\n)\n
"},{"location":"sections/getting_started/installation/","title":"Installation","text":"

You will need Python 3.9 or higher, up to Python 3.12; note that support for Python 3.12 is still a work in progress.

To install the latest release of the package from PyPI you can use the following command:

pip install distilabel --upgrade\n

Alternatively, you may also want to install it from source i.e. the latest unreleased version, you can use the following command:

pip install \"distilabel @ git+https://github.com/argilla-io/distilabel.git@develop\" --upgrade\n

Note

We are installing from develop since that's the branch we use to collect all the features, bug fixes, and improvements that will be part of the next release. If you want to install from a specific branch, you can replace develop with the branch name.

"},{"location":"sections/getting_started/installation/#extras","title":"Extras","text":"

Additionally, as part of distilabel some extra dependencies are available, mainly to add support for some of the LLM integrations we support. Here's a list of the available extras:

"},{"location":"sections/getting_started/installation/#llms","title":"LLMs","text":"
  • anthropic: for using models available in Anthropic API via the AnthropicLLM integration.

  • argilla: for exporting the generated datasets to Argilla.

  • cohere: for using models available in Cohere via the CohereLLM integration.

  • groq: for using models available in Groq using groq Python client via the GroqLLM integration.

  • hf-inference-endpoints: for using the Hugging Face Inference Endpoints via the InferenceEndpointsLLM integration.

  • hf-transformers: for using models available in transformers package via the TransformersLLM integration.

  • litellm: for using LiteLLM to call any LLM using OpenAI format via the LiteLLM integration.

  • llama-cpp: for using llama-cpp-python Python bindings for llama.cpp via the LlamaCppLLM integration.

  • mistralai: for using models available in Mistral AI API via the MistralLLM integration.

  • ollama: for using Ollama and their available models via OllamaLLM integration.

  • openai: for using OpenAI API models via the OpenAILLM integration, or the rest of the integrations based on OpenAI and relying on its client as AnyscaleLLM, AzureOpenAILLM, and TogetherLLM.

  • vertexai: for using Google Vertex AI proprietary models via the VertexAILLM integration.

  • vllm: for using vllm serving engine via the vLLM integration.

  • sentence-transformers: for generating sentence embeddings using sentence-transformers.

"},{"location":"sections/getting_started/installation/#data-processing","title":"Data processing","text":"
  • ray: for scaling and distributing a pipeline with Ray.

  • faiss-cpu and faiss-gpu: for generating sentence embeddings using faiss.

  • minhash: for using minhash for duplicate detection with datasketch and nltk.

  • text-clustering: for using text clustering with UMAP and Scikit-learn.

"},{"location":"sections/getting_started/installation/#structured-generation","title":"Structured generation","text":"
  • outlines: for using structured generation of LLMs with outlines.

  • instructor: for using structured generation of LLMs with Instructor.

"},{"location":"sections/getting_started/installation/#recommendations-notes","title":"Recommendations / Notes","text":"

The mistralai dependency requires Python 3.9 or higher, so if you're willing to use the distilabel.models.llms.MistralLLM implementation, you will need to have Python 3.9 or higher.

In some cases like transformers and vllm, the installation of flash-attn is recommended if you are using a GPU accelerator since it will speed up the inference process, but the installation needs to be done separately, as it's not included in the distilabel dependencies.

pip install flash-attn --no-build-isolation\n

Also, if you want to use the llama-cpp-python integration for running local LLMs, note that the installation process may get a bit trickier depending on which OS you are using, so we recommend reading through the Installation section in their docs.

"},{"location":"sections/getting_started/quickstart/","title":"Quickstart","text":""},{"location":"sections/getting_started/quickstart/#quickstart","title":"Quickstart","text":"

Distilabel provides all the tools you need to build scalable and reliable pipelines for synthetic data generation and AI-feedback. Pipelines are used to generate data, evaluate models, manipulate data, or perform any other general task. They are made up of different components: Steps, Tasks and LLMs, which are chained together in a directed acyclic graph (DAG).

  • Steps: These are the building blocks of your pipeline. Normal steps are used for basic executions like loading data, applying some transformations, or any other general task.
  • Tasks: These are steps that rely on LLMs and prompts to perform generative tasks. For example, they can be used to generate data, evaluate models or manipulate data.
  • LLMs: These are the models that will perform the task. They can be local or remote models, and open-source or commercial models.

Pipelines are designed to be scalable and reliable. They can be executed in a distributed manner, and they can be cached and recovered. This is useful when dealing with large datasets or when you want to ensure that your pipeline is reproducible.

Besides that, pipelines are designed to be modular and flexible. You can easily add new steps, tasks, or LLMs to your pipeline, and you can also easily modify or remove them. An example architecture of a pipeline to generate a dataset of preferences is the following:

"},{"location":"sections/getting_started/quickstart/#installation","title":"Installation","text":"

To install the latest release with hf-inference-endpoints extra of the package from PyPI you can use the following command:

pip install distilabel[hf-inference-endpoints] --upgrade\n
"},{"location":"sections/getting_started/quickstart/#define-a-pipeline","title":"Define a pipeline","text":"

In this guide we will walk you through the process of creating a simple pipeline that uses the InferenceEndpointsLLM class to generate text. The Pipeline will load a dataset that contains a column named prompt from the Hugging Face Hub via the step LoadDataFromHub and then use the InferenceEndpointsLLM class to generate text based on the dataset using the TextGeneration task.

You can check the available models in the Hugging Face Model Hub and filter by Inference status.

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(  # (1)\n    name=\"simple-text-generation-pipeline\",\n    description=\"A simple text generation pipeline\",\n) as pipeline:  # (2)\n    load_dataset = LoadDataFromHub(  # (3)\n        output_mappings={\"prompt\": \"instruction\"},\n    )\n\n    text_generation = TextGeneration(  # (4)\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n        ),  # (5)\n        system_prompt=\"You are a creative AI Assistant writer.\",\n        template=\"Follow the following instruction: {{ instruction }}\"  # (6)\n    )\n\n    load_dataset >> text_generation  # (7)\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(  # (8)\n        parameters={\n            load_dataset.name: {\n                \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n                \"split\": \"test\",\n            },\n            text_generation.name: {\n                \"llm\": {\n                    \"generation_kwargs\": {\n                        \"temperature\": 0.7,\n                        \"max_new_tokens\": 512,\n                    }\n                }\n            },\n        },\n    )\n    distiset.push_to_hub(repo_id=\"distilabel-example\")  # (9)\n
  1. We define a Pipeline with the name simple-text-generation-pipeline and a description A simple text generation pipeline. Note that the name is mandatory and will be used to calculate the cache signature path, so changing the name will change the cache path and will be identified as a different pipeline.

  2. We are using the Pipeline context manager, meaning that every Step subclass that is defined within the context manager will be added to the pipeline automatically.

  3. We define a LoadDataFromHub step named load_dataset that will load a dataset from the Hugging Face Hub, as provided via runtime parameters in the pipeline.run method below, but it can also be defined within the class instance via the arg repo_id=.... This step will produce output batches with the rows from the dataset, and the column prompt will be mapped to the instruction field.

  4. We define a TextGeneration task named text_generation that will generate text based on the instruction field from the dataset. This task will use the InferenceEndpointsLLM class with the model Meta-Llama-3.1-8B-Instruct.

  5. We define the InferenceEndpointsLLM class with the model Meta-Llama-3.1-8B-Instruct that will be used by the TextGeneration task. In this case, since the InferenceEndpointsLLM is used, we assume that the HF_TOKEN environment variable is set.

  6. Both system_prompt and template are optional fields. The template must be provided as a string following the Jinja2 template format, and the fields that appear there (\"instruction\" in this case, which corresponds to the default) must be provided in the columns attribute. The component gallery for TextGeneration has examples to get you started.

  7. We connect the load_dataset step to the text_generation task using the rshift operator, meaning that the output from the load_dataset step will be used as input for the text_generation task.

  8. We run the pipeline with the parameters for the load_dataset and text_generation steps. The load_dataset step will use the repository distilabel-internal-testing/instruction-dataset-mini and the test split, and the text_generation task will use the generation_kwargs with the temperature set to 0.7 and the max_new_tokens set to 512.

  9. Optionally, we can push the generated Distiset to the Hugging Face Hub repository distilabel-example. This will allow you to share the generated dataset with others and use it in other pipelines.

"},{"location":"sections/how_to_guides/","title":"How-to guides","text":"

Welcome to the how-to guides section! Here you will find a collection of guides that will help you get started with Distilabel. We have divided the guides into two categories: basic and advanced. The basic guides will help you get started with the core concepts of Distilabel, while the advanced guides will help you explore more advanced features.

"},{"location":"sections/how_to_guides/#basic","title":"Basic","text":"
  • Define Steps for your Pipeline

    Steps are the building blocks of your pipeline. They can be used to generate data, evaluate models, manipulate data, or any other general task.

    Define Steps

  • Define Tasks that rely on LLMs

    Tasks are a specific type of step that rely on Language Models (LLMs) to generate data.

    Define Tasks

  • Define LLMs as local or remote models

    LLMs are the core of your tasks. They are used to integrate with local models or remote APIs.

    Define LLMs

  • Execute Steps and Tasks in a Pipeline

    Pipeline is where you put all your steps and tasks together to create a workflow.

    Execute Pipeline

"},{"location":"sections/how_to_guides/#advanced","title":"Advanced","text":"
  • Using the Distiset dataset object

    Distiset is a dataset object based on the datasets library that can be used to store and manipulate data.

    Distiset

  • Export data to Argilla

    Argilla is a platform that can be used to store, search, and apply feedback to datasets. Argilla

  • Using a file system to pass data of batches between steps

    File system can be used to pass data between steps in a pipeline.

    File System

  • Using CLI to explore and re-run existing Pipelines

    CLI can be used to explore and re-run existing pipelines through the command line.

    CLI

  • Cache and recover pipeline executions

    Caching can be used to recover pipeline executions to avoid losing data and precious LLM calls.

    Caching

  • Structured data generation

    Structured data generation can be used to generate data with a specific structure like JSON, function calls, etc.

    Structured Generation

  • Serving an LLM for sharing it between several tasks

    Serve an LLM via TGI or vLLM to make requests and connect using a client like InferenceEndpointsLLM or OpenAILLM to avoid wasting resources.

    Sharing an LLM across tasks

  • Impose requirements to your pipelines and steps

    Add requirements to steps in a pipeline to ensure they are installed and avoid errors.

    Pipeline requirements

"},{"location":"sections/how_to_guides/advanced/argilla/","title":"Export data to Argilla","text":"

Being able to export the generated synthetic datasets to Argilla is a core feature within distilabel. We believe in the potential of synthetic data, but without removing the impact a human annotator or group of annotators can bring. As such, the Argilla integration makes it straightforward to push a dataset to Argilla while the Pipeline is running, so that you can follow the generation process in Argilla's UI and annotate the records on the fly. One can include a Step within the Pipeline to easily export the datasets to Argilla with a pre-defined configuration, suiting the annotation purposes.

Before using any of the steps about to be described below, you should first have an Argilla instance up and running, so that you can successfully upload the data to Argilla. In order to deploy Argilla, the easiest and most straightforward way is to deploy it via the Argilla Template in Hugging Face Spaces as simply as following the steps there, or just via the following button:

"},{"location":"sections/how_to_guides/advanced/argilla/#text-generation","title":"Text Generation","text":"

For text generation scenarios, i.e. when the Pipeline contains a single TextGeneration step, we have designed the task TextGenerationToArgilla, which will seamlessly push the generated data to Argilla, and allow the annotator to review the records.

The dataset will be pushed with the following configuration:

  • Fields: instruction and generation, both being fields of type argilla.TextField, plus the automatically generated id for the given instruction to be able to search for other records with the same instruction in the dataset. The field instruction must always be a string, while the field generation can either be a single string or a list of strings (useful when there are multiple parent nodes of type TextGeneration); even though each record will always contain at most one instruction-generation pair.

  • Questions: quality will be the only question for the annotators to answer, i.e., to annotate, and it will be an argilla.LabelQuestion referring to the quality of the provided generation for the given instruction. It can be annotated as either \ud83d\udc4e (bad) or \ud83d\udc4d (good).

Note

The TextGenerationToArgilla step will only work as is if the Pipeline contains one or multiple TextGeneration steps, or if the columns instruction and generation are available within the batch data. Otherwise, the variable input_mappings will need to be set so that either both or one of instruction and generation are mapped to one of the existing columns in the batch data.

from distilabel.models import OpenAILLM\nfrom distilabel.steps import LoadDataFromDicts, TextGenerationToArgilla\nfrom distilabel.steps.tasks import TextGeneration\n\n\nwith Pipeline(name=\"my-pipeline\") as pipeline:\n    load_dataset = LoadDataFromDicts(\n        name=\"load_dataset\",\n        data=[\n            {\n                \"instruction\": \"Write a short story about a dragon that saves a princess from a tower.\",\n            },\n        ],\n    )\n\n    text_generation = TextGeneration(\n        name=\"text_generation\",\n        llm=OpenAILLM(model=\"gpt-4\"),\n    )\n\n    to_argilla = TextGenerationToArgilla(\n        dataset_name=\"my-dataset\",\n        dataset_workspace=\"admin\",\n        api_url=\"<ARGILLA_API_URL>\",\n        api_key=\"<ARGILLA_API_KEY>\",\n    )\n\n    load_dataset >> text_generation >> to_argilla\n\npipeline.run()\n

"},{"location":"sections/how_to_guides/advanced/argilla/#preference","title":"Preference","text":"

For preference scenarios, i.e. when the Pipeline contains multiple TextGeneration steps, we have designed the task PreferenceToArgilla, which will seamlessly push the generated data to Argilla, and allow the annotator to review the records.

The dataset will be pushed with the following configuration:

  • Fields: instruction and generations, both being fields of type argilla.TextField, plus the automatically generated id for the given instruction to be able to search for other records with the same instruction in the dataset. The field instruction must always be a string, while the field generations must be a list of strings, containing the generated texts for the given instruction so that at least there are two generations to compare. Other than that, the number of generation fields within each record in Argilla will be defined by the value of the variable num_generations to be provided in the PreferenceToArgilla step.

  • Questions: rating and rationale will be the pairs of questions to be defined per each generation i.e. per each value within the range from 0 to num_generations, and those will be of types argilla.RatingQuestion and argilla.TextQuestion, respectively. Note that only the first pair of questions will be mandatory, since only one generation is ensured to be within the batch data. Additionally, note that the provided ratings will range from 1 to 5, and that Argilla only supports values above 0.

Note

The PreferenceToArgilla step will only work if the Pipeline contains multiple TextGeneration steps, or if the columns instruction and generations are available within the batch data. Otherwise, the variable input_mappings will need to be set so that either both or one of instruction and generations are mapped to one of the existing columns in the batch data.

Note

Additionally, if the Pipeline contains an UltraFeedback step, the ratings and rationales will also be available and be automatically injected as suggestions to the existing dataset.

from distilabel.models import OpenAILLM\nfrom distilabel.steps import LoadDataFromDicts, PreferenceToArgilla\nfrom distilabel.steps.tasks import TextGeneration\n\n\nwith Pipeline(name=\"my-pipeline\") as pipeline:\n    load_dataset = LoadDataFromDicts(\n        name=\"load_dataset\",\n        data=[\n            {\n                \"instruction\": \"Write a short story about a dragon that saves a princess from a tower.\",\n            },\n        ],\n    )\n\n    text_generation = TextGeneration(\n        name=\"text_generation\",\n        llm=OpenAILLM(model=\"gpt-4\"),\n        num_generations=4,\n        group_generations=True,\n    )\n\n    to_argilla = PreferenceToArgilla(\n        dataset_name=\"my-dataset\",\n        dataset_workspace=\"admin\",\n        api_url=\"<ARGILLA_API_URL>\",\n        api_key=\"<ARGILLA_API_KEY>\",\n        num_generations=4,\n    )\n\n    load_dataset >> text_generation >> to_argilla\n\nif __name__ == \"__main__\":\n    pipeline.run()\n

"},{"location":"sections/how_to_guides/advanced/assigning_resources_to_step/","title":"Assigning resources to a Step","text":"

When dealing with complex pipelines that get executed in a distributed environment with abundant resources (CPUs and GPUs), sometimes it's necessary to allocate these resources judiciously among the Steps. This is why distilabel allows you to specify the number of replicas, cpus and gpus for each Step. Let's see that with an example:

from distilabel.pipeline import Pipeline\nfrom distilabel.models import vLLM\nfrom distilabel.steps import StepResources\nfrom distilabel.steps.tasks import PrometheusEval\n\n\nwith Pipeline(name=\"resources\") as pipeline:\n    ...\n\n    prometheus = PrometheusEval(\n        llm=vLLM(\n            model=\"prometheus-eval/prometheus-7b-v2.0\",\n            chat_template=\"[INST] {{ messages[0]['content'] }}\\\\n{{ messages[1]['content'] }}[/INST]\",\n        ),\n        resources=StepResources(replicas=2, cpus=1, gpus=1),\n        mode=\"absolute\",\n        rubric=\"factual-validity\",\n        reference=False,\n        num_generations=1,\n        group_generations=False,\n    )\n

In the example above, we're creating a PrometheusEval task (remember that Tasks are Steps) that will use vLLM to serve prometheus-eval/prometheus-7b-v2.0 model. This task is resource intensive as it requires an LLM, which in turn requires a GPU to run fast. With that in mind, we have specified the resources required for the task using the StepResources class, and we have defined that we need 1 GPU and 1 CPU per replica of the task. In addition, we have defined that we need 2 replicas i.e. we will run two instances of the task so the computation for the whole dataset runs faster. In addition, StepResources uses the RuntimeParametersMixin, so we can also specify the resources for each step when running the pipeline:

...\n\nif __name__ == \"__main__\":\n    pipeline.run(\n        parameters={\n            prometheus.name: {\"resources\": {\"replicas\": 2, \"cpus\": 1, \"gpus\": 1}}\n        }\n    )\n

And that's it! When running the pipeline, distilabel will create the tasks in nodes that have available the specified resources.

"},{"location":"sections/how_to_guides/advanced/caching/","title":"Pipeline cache","text":"

distilabel will automatically save all the intermediate outputs generated by each Step of a Pipeline, so these outputs can be reused to recover the state of a pipeline execution that was stopped before finishing or to not have to re-execute steps from a pipeline after adding a new downstream step.

"},{"location":"sections/how_to_guides/advanced/caching/#how-to-enabledisable-the-cache","title":"How to enable/disable the cache","text":"

The use of the cache can be toggled using the use_cache parameter of the Pipeline.run method. If True, then distilabel will reuse the outputs of previous executions for the new execution. If False, then distilabel will re-execute all the steps of the pipeline to generate new outputs for all the steps.

with Pipeline(name=\"my-pipeline\") as pipeline:\n    ...\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(use_cache=False)  # (1)\n
  1. Pipeline cache is disabled

In addition, the cache can be enabled/disabled at Step level using its use_cache attribute. If True, then the outputs of the step will be reused in the new pipeline execution. If False, then the step will be re-executed to generate new outputs. If the cache of one step is disabled and the outputs have to be regenerated, then the outputs of the steps that depend on this step will also be regenerated.

with Pipeline(name=\"writting-assistant\") as pipeline:\n    load_data = LoadDataFromDicts(\n        data=[\n            {\n                \"instruction\": \"How much is 2+2?\"\n            }\n        ]\n    )\n\n    generation = TextGeneration(\n        llm=InferenceEndpointsLLM(\n            model_id=\"Qwen/Qwen2.5-72B-Instruct\",\n            generation_kwargs={\n                \"temperature\": 0.8,\n                \"max_new_tokens\": 512,\n            },\n        ),\n        use_cache=False  # (1)\n    )\n\n    load_data >> generation\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run()\n
  1. Step cache is disabled and every time the pipeline is executed, this step will be re-executed
"},{"location":"sections/how_to_guides/advanced/caching/#how-a-cache-hit-is-triggered","title":"How a cache hit is triggered","text":"

distilabel groups information and data generated by a Pipeline using the name of the pipeline, so the first factor that triggers a cache hit is the name of the pipeline. The second factor, is the Pipeline.signature property. This property returns a hash that is generated using the names of the steps used in the pipeline and their connections. The third factor, is the Pipeline.aggregated_steps_signature property which is used to determine if the new pipeline execution is exactly the same as one of the previous i.e. the pipeline contains exactly the same steps, with exactly the same connections and the steps are using exactly the same parameters. If these three factors are met, then the cache hit is triggered and the pipeline won't get re-executed and instead the function create_distiset will be used to create the resulting Distiset using the outputs of the previous execution, as it can be seen in the following image:

If the new pipeline execution has a different Pipeline.aggregated_steps_signature i.e. at least one step has changed its parameters, distilabel will reuse the outputs of the steps that have not changed and re-execute the steps that have changed, as it can be seen in the following image:

The same pipeline from above gets executed a third time, but this time the last step text_generation_1 changed, so it needs to be re-executed. The other steps, as they have not been changed, don't need to be re-executed and their outputs are reused.

"},{"location":"sections/how_to_guides/advanced/distiset/","title":"Using the Distiset dataset object","text":"

A Pipeline in distilabel returns a special type of Hugging Face datasets.DatasetDict which is called Distiset.

The Distiset is a dictionary-like object that contains the different configurations generated by the Pipeline, where each configuration corresponds to each leaf step in the DAG built by the Pipeline. Each configuration corresponds to a different subset of the dataset. This is a concept taken from \ud83e\udd17 datasets that lets you upload different configurations of the same dataset within the same repository and can contain different columns i.e. different configurations, which can be seamlessly pushed to the Hugging Face Hub.

Below you can find an example of how to create a Distiset object that resembles a datasets.DatasetDict:

from datasets import Dataset\nfrom distilabel.distiset import Distiset\n\ndistiset = Distiset(\n    {\n        \"leaf_step_1\": Dataset.from_dict({\"instruction\": [1, 2, 3]}),\n        \"leaf_step_2\": Dataset.from_dict(\n            {\"instruction\": [1, 2, 3, 4], \"generation\": [5, 6, 7, 8]}\n        ),\n    }\n)\n

Note

If there's only one leaf node, i.e., only one step at the end of the Pipeline, then the configuration name won't be the name of the last step, but it will be set to \"default\" instead, as that's more aligned with standard datasets within the Hugging Face Hub.

"},{"location":"sections/how_to_guides/advanced/distiset/#distiset-methods","title":"Distiset methods","text":"

We can interact with the different pieces generated by the Pipeline and treat them as different configurations. The Distiset contains just two methods:

"},{"location":"sections/how_to_guides/advanced/distiset/#traintest-split","title":"Train/Test split","text":"

Create a train/test split partition of the dataset for the different configurations or subsets.

>>> distiset.train_test_split(train_size=0.9)\nDistiset({\n    leaf_step_1: DatasetDict({\n        train: Dataset({\n            features: ['instruction'],\n            num_rows: 2\n        })\n        test: Dataset({\n            features: ['instruction'],\n            num_rows: 1\n        })\n    })\n    leaf_step_2: DatasetDict({\n        train: Dataset({\n            features: ['instruction', 'generation'],\n            num_rows: 3\n        })\n        test: Dataset({\n            features: ['instruction', 'generation'],\n            num_rows: 1\n        })\n    })\n})\n
"},{"location":"sections/how_to_guides/advanced/distiset/#push-to-hugging-face-hub","title":"Push to Hugging Face Hub","text":"

Push the Distiset to a Hugging Face repository, where each one of the subsets will correspond to a different configuration:

distiset.push_to_hub(\n    \"my-org/my-dataset\",\n    commit_message=\"Initial commit\",\n    private=False,\n    token=os.getenv(\"HF_TOKEN\"),\n    generate_card=True,\n    include_script=False\n)\n

New since version 1.3.0

Since version 1.3.0 you can automatically push the script that created your pipeline to the same repository. For example, assuming you have a file like the following:

sample_pipe.py
with Pipeline() as pipe:\n    ...\ndistiset = pipe.run()\ndistiset.push_to_hub(\n    \"my-org/my-dataset\",\n    include_script=True\n)\n

After running the command, you could visit the repository and the file sample_pipe.py will be stored to simplify sharing your pipeline with the community.

"},{"location":"sections/how_to_guides/advanced/distiset/#custom-docstrings","title":"Custom Docstrings","text":"

distilabel contains a custom plugin to automatically generate a gallery for the different components. The information is extracted by parsing the Step's docstrings. You can take a look at the docstrings in the source code of the UltraFeedback, and take a look at the corresponding entry in the components gallery to see an example of how the docstrings are rendered.

If you create your own components and want the Citations automatically rendered in the README card (in case you are sharing your final distiset in the Hugging Face Hub), you may want to add the citation section. This is an example for the MagpieGenerator Task:

class MagpieGenerator(GeneratorTask, MagpieBase):\n    r\"\"\"Generator task the generates instructions or conversations using Magpie.\n    ...\n\n    Citations:\n\n        ```\n        @misc{xu2024magpiealignmentdatasynthesis,\n            title={Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing},\n            author={Zhangchen Xu and Fengqing Jiang and Luyao Niu and Yuntian Deng and Radha Poovendran and Yejin Choi and Bill Yuchen Lin},\n            year={2024},\n            eprint={2406.08464},\n            archivePrefix={arXiv},\n            primaryClass={cs.CL},\n            url={https://arxiv.org/abs/2406.08464},\n        }\n        ```\n    \"\"\"\n

The Citations section can include any number of bibtex references. To define them, you can add as many elements as needed just like in the example: each citation will be a block of the form: ```@misc{...}```. This information will be automatically used in the README of your Distiset if you decide to call distiset.push_to_hub. Alternatively, if the Citations section is not found, but the References section contains any URLs pointing to https://arxiv.org/, we will try to obtain the Bibtex equivalent automatically. This way, Hugging Face can automatically track the paper for you and it's easier to find other datasets citing the same paper, or to directly visit the paper page.

"},{"location":"sections/how_to_guides/advanced/distiset/#save-and-load-from-disk","title":"Save and load from disk","text":"

Take into account that these methods work as datasets.load_from_disk and datasets.Dataset.save_to_disk so the arguments are directly passed to those methods. This means you can also make use of storage_options argument to save your Distiset in your cloud provider, including the distilabel artifacts (pipeline.yaml, pipeline.log and the README.md with the dataset card). You can read more in datasets documentation here.

Save to diskLoad from disk (local)Load from disk (cloud)

Save the Distiset to disk, and optionally (will be done by default) saves the dataset card, the pipeline config file and logs:

distiset.save_to_disk(\n    \"my-dataset\",\n    save_card=True,\n    save_pipeline_config=True,\n    save_pipeline_log=True\n)\n

Load a Distiset that was saved using Distiset.save_to_disk just the same way:

distiset = Distiset.load_from_disk(\"my-dataset\")\n

Load a Distiset from a remote location, like S3, GCS. You can pass the storage_options argument to authenticate with the cloud provider:

distiset = Distiset.load_from_disk(\n    \"s3://path/to/my_dataset\",  # gcs:// or any filesystem tolerated by fsspec\n    storage_options={\n        \"key\": os.environ[\"S3_ACCESS_KEY\"],\n        \"secret\": os.environ[\"S3_SECRET_KEY\"],\n        ...\n    }\n)\n

Take a look at the remaining arguments at Distiset.save_to_disk and Distiset.load_from_disk.

"},{"location":"sections/how_to_guides/advanced/distiset/#dataset-card","title":"Dataset card","text":"

Having this special type of dataset comes with an added advantage when calling Distiset.push_to_hub, which is the automatically generated dataset card in the Hugging Face Hub. Note that it is enabled by default, but can be disabled by setting generate_card=False:

distiset.push_to_hub(\"my-org/my-dataset\", generate_card=True)\n

We will have an automatic dataset card (an example can be seen here) with some handy information like reproducing the Pipeline with the CLI, or examples of the records from the different subsets.

"},{"location":"sections/how_to_guides/advanced/distiset/#create_distiset-helper","title":"create_distiset helper","text":"

Lastly, in the caching section we presented the create_distiset function. You can take a look at that section to see how to create a Distiset from the cache folder, using the helper function to automatically include all the relevant data.

"},{"location":"sections/how_to_guides/advanced/fs_to_pass_data/","title":"Using a file system to pass data of batches between steps","text":"

In some situations, it can happen that the batches contain so much data that it is faster to write it to disk and read it back in the next step, instead of passing it using the queue. To solve this issue, distilabel uses fsspec to allow providing a file system configuration and whether this file system should be used to pass data between steps in the run method of the distilabel pipelines:

Warning

In order to use a specific file system/cloud storage, you will need to install the specific package providing the fsspec implementation for that file system. For instance, to use Google Cloud Storage you will need to install gcsfs:

pip install gcsfs\n

Check the available implementations: fsspec - Other known implementations

from distilabel.pipeline import Pipeline\n\nwith Pipeline(name=\"my-pipeline\") as pipeline:\n  ...\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        ..., \n        storage_parameters={\"path\": \"gcs://my-bucket\"},\n        use_fs_to_pass_data=True\n    )\n

The code above sets up a file system (in this case Google Cloud Storage) and sets the flag use_fs_to_pass_data to specify that the data of the batches should be passed to the steps using the file system. The storage_parameters argument is optional, and in the case it's not provided but use_fs_to_pass_data==True, distilabel will use the local file system.

Note

As GlobalSteps receive all the data from the previous steps in one single batch accumulating all the data, it's very likely that the data of the batch will be too big to be passed using the queue. In this case and even if use_fs_to_pass_data==False, distilabel will use the file system to pass the data to the GlobalStep.

"},{"location":"sections/how_to_guides/advanced/offline_batch_generation/","title":"Offline Batch Generation","text":"

The offline batch generation is a feature that some LLMs implemented in distilabel offer, allowing users to send the inputs to an LLM-as-a-service platform and wait for the outputs in an asynchronous manner. LLM-as-a-service platforms offer this feature as it allows them to gather many inputs and create batches as big as the hardware allows, maximizing the hardware utilization and reducing the cost of the service. In exchange, the user has to wait a certain time for the outputs to be ready but the cost per token is usually much lower.

distilabel pipelines are able to handle LLMs that offer this feature in the following way:

  • The first time the pipeline gets executed, the LLM will send the inputs to the platform. The platform will return jobs ids that can be used later to check the status of the jobs and retrieve the results. The LLM will save these jobs ids in its jobs_ids attribute and raise a special exception DistilabelOfflineBatchGenerationNotFinishedException that will be handled by the Pipeline. The jobs ids will be saved in the pipeline cache, so they can be used in subsequent calls.
  • The second time and subsequent calls will recover the pipeline execution and the LLM won't send the inputs again to the platform. This time as it has the jobs_ids it will check if the jobs have finished, and if they have then it will retrieve the results and return the outputs. If they haven't finished, then it will raise DistilabelOfflineBatchGenerationNotFinishedException again.
  • In addition, LLMs with offline batch generation can be specified to do polling until the jobs have finished, blocking the pipeline until they are done. If for some reason the polling needs to be stopped, one can press Ctrl+C or Cmd+C depending on your OS (or send a SIGINT to the main process) which will stop the polling and raise DistilabelOfflineBatchGenerationNotFinishedException that will be handled by the pipeline as described above.

Warning

In order to recover the pipeline execution and retrieve the results, the pipeline cache must be enabled. If the pipeline cache is disabled, then it will send the inputs again and create different jobs, incurring extra costs.

"},{"location":"sections/how_to_guides/advanced/offline_batch_generation/#example-pipeline-using-openaillm-with-offline-batch-generation","title":"Example pipeline using OpenAILLM with offline batch generation","text":"
from distilabel.models import OpenAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline() as pipeline:\n    load_data = LoadDataFromHub(output_mappings={\"prompt\": \"instruction\"})\n\n    text_generation = TextGeneration(\n        llm=OpenAILLM(\n            model=\"gpt-3.5-turbo\",\n            use_offline_batch_generation=True,  # (1)\n        )\n    )\n\n    load_data >> text_generation\n\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        parameters={\n            load_data.name: {\n                \"repo_id\": \"distilabel-internal-testing/instruction-dataset\",\n                \"split\": \"test\",\n                \"batch_size\": 500,\n            },\n        }\n    )\n
  1. Indicate that the OpenAILLM should use offline batch generation.
"},{"location":"sections/how_to_guides/advanced/pipeline_requirements/","title":"Add requirements to run a Pipeline","text":"

When sharing a Pipeline that contains custom Steps or Tasks, you may want to add the specific requirements that are needed to run them. distilabel will take this list of requirements and warn the user if any are missing.

Let's see how we can add additional requirements with an example. The first thing we're going to do is to add requirements for our CustomStep. To do so we use the requirements decorator to specify that the step has nltk as a dependency (we can also use version specifiers, e.g. nltk>=3.8). In addition, we're going to specify at Pipeline level that we need distilabel>=1.3.0 to run it.

from typing import List\n\nfrom distilabel.steps import Step\nfrom distilabel.steps.base import StepInput\nfrom distilabel.steps.typing import StepOutput\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.utils.requirements import requirements\nfrom distilabel.pipeline import Pipeline\n\n\n@requirements([\"nltk\"])\nclass CustomStep(Step):\n    @property\n    def inputs(self) -> List[str]:\n        return [\"instruction\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"response\"]\n\n    def process(self, inputs: StepInput) -> StepOutput:  # type: ignore\n        for input in inputs:\n            input[\"response\"] = nltk.word_tokenize(input)\n        yield inputs\n\n\nwith Pipeline(\n    name=\"pipeline-with-requirements\", requirements=[\"distilabel>=1.3.0\"]\n) as pipeline:\n    loader = LoadDataFromDicts(data=[{\"instruction\": \"sample sentence\"}])\n    step1 = CustomStep()\n    loader >> step1\n\nif __name__ == \"__main__\":\n    pipeline.run()\n

Once we call pipeline.run(), if any of the requirements informed at the Step or Pipeline level is missing, a ValueError will be raised telling us that we should install the list of dependencies:

>>> pipeline.run()\n[06/27/24 11:07:33] ERROR    ['distilabel.pipeline'] Please install the following requirements to run the pipeline:                                                                                                                                     base.py:350\n                             distilabel>=1.3.0\n...\nValueError: Please install the following requirements to run the pipeline:\ndistilabel>=1.3.0\n
"},{"location":"sections/how_to_guides/advanced/saving_step_generated_artifacts/","title":"Saving step generated artifacts","text":"

Some Steps might need to produce an auxiliary artifact that is not a result of the computation, but is needed for the computation. For example, the FaissNearestNeighbour needs to create a Faiss index to compute the output of the step which are the top k nearest neighbours for each input. Generating the Faiss index takes time and it could potentially be reused outside of the distilabel pipeline, so it would be a shame not to save it.

For this reason, Steps have a method called save_artifact that allows saving artifacts that will be included along the outputs of the pipeline in the generated Distiset. The generated artifacts will be uploaded and saved when using Distiset.push_to_hub or Distiset.save_to_disk respectively. Let's see how to use it with a simple example.

from typing import List, TYPE_CHECKING\nfrom distilabel.steps import GlobalStep, StepInput, StepOutput\nimport matplotlib.pyplot as plt\n\nif TYPE_CHECKING:\n    from distilabel.steps import StepOutput\n\n\nclass CountTextCharacters(GlobalStep):\n    @property\n    def inputs(self) -> List[str]:\n        return [\"text\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"text_character_count\"]\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        character_counts = []\n\n        for input in inputs:\n            text_character_count = len(input[\"text\"])\n            input[\"text_character_count\"] = text_character_count\n            character_counts.append(text_character_count)\n\n        # Generate plot with the distribution of text character counts\n        plt.figure(figsize=(10, 6))\n        plt.hist(character_counts, bins=30, edgecolor=\"black\")\n        plt.title(\"Distribution of Text Character Counts\")\n        plt.xlabel(\"Character Count\")\n        plt.ylabel(\"Frequency\")\n\n        # Save the plot as an artifact of the step\n        self.save_artifact(\n            name=\"text_character_count_distribution\",\n            write_function=lambda path: plt.savefig(path / \"figure.png\"),\n            metadata={\"type\": \"image\", \"library\": \"matplotlib\"},\n        )\n\n        plt.close()\n\n        yield inputs\n

As it can be seen in the example above, we have created a simple step that counts the number of characters in each input text and generates a histogram with the distribution of the character counts. We save the histogram as an artifact of the step using the save_artifact method. The method takes three arguments:

  • name: The name we want to give to the artifact.
  • write_function: A function that writes the artifact to the desired path. The function will receive a path argument which is a pathlib.Path object pointing to the directory where the artifact should be saved.
  • metadata: A dictionary with metadata about the artifact. This metadata will be saved along with the artifact.

Let's execute the step with a simple pipeline and push the resulting Distiset to the Hugging Face Hub:

Example full code
from typing import TYPE_CHECKING, List\n\nimport matplotlib.pyplot as plt\nfrom datasets import load_dataset\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GlobalStep, StepInput, StepOutput\n\nif TYPE_CHECKING:\n    from distilabel.steps import StepOutput\n\n\nclass CountTextCharacters(GlobalStep):\n    @property\n    def inputs(self) -> List[str]:\n        return [\"text\"]\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"text_character_count\"]\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":  # type: ignore\n        character_counts = []\n\n        for input in inputs:\n            text_character_count = len(input[\"text\"])\n            input[\"text_character_count\"] = text_character_count\n            character_counts.append(text_character_count)\n\n        # Generate plot with the distribution of text character counts\n        plt.figure(figsize=(10, 6))\n        plt.hist(character_counts, bins=30, edgecolor=\"black\")\n        plt.title(\"Distribution of Text Character Counts\")\n        plt.xlabel(\"Character Count\")\n        plt.ylabel(\"Frequency\")\n\n        # Save the plot as an artifact of the step\n        self.save_artifact(\n            name=\"text_character_count_distribution\",\n            write_function=lambda path: plt.savefig(path / \"figure.png\"),\n            metadata={\"type\": \"image\", \"library\": \"matplotlib\"},\n        )\n\n        plt.close()\n\n        yield inputs\n\n\nwith Pipeline() as pipeline:\n    count_text_characters = CountTextCharacters()\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        dataset=load_dataset(\n            \"HuggingFaceH4/instruction-dataset\", split=\"test\"\n        ).rename_column(\"prompt\", \"text\"),\n    )\n\n    distiset.push_to_hub(\"distilabel-internal-testing/distilabel-artifacts-example\")\n

The generated distilabel-internal-testing/distilabel-artifacts-example dataset repository has a section in its card describing the artifacts generated by the pipeline and the generated plot can be seen here.

"},{"location":"sections/how_to_guides/advanced/scaling_with_ray/","title":"Scaling and distributing a pipeline with Ray","text":"

Although the local Pipeline based on multiprocessing + serving LLMs with an external service is enough for executing most of the pipelines used to create SFT and preference datasets, there are scenarios where you might need to scale your pipeline across multiple machines. In such cases, distilabel leverages Ray to distribute the workload efficiently. This allows you to generate larger datasets, reduce execution time, and maximize resource utilization across a cluster of machines, without needing to change a single line of code.

"},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#relation-between-distilabel-steps-and-ray-actors","title":"Relation between distilabel steps and Ray Actors","text":"

A distilabel pipeline consists of several Steps. A Step is a class that defines a basic life-cycle:

  1. It will load or create the resources (LLMs, clients, etc) required to run its logic.
  2. It will run a loop waiting for incoming batches received using a queue. Once it receives one batch, it will process it and put the processed batch into an output queue.
  3. When it finishes a batch that is the final one or receives a special signal, the loop will finish and the unload logic will be executed.

So a Step needs to maintain a minimum state and the best way to do that with Ray is using actors.

graph TD\n    A[Step] -->|has| B[Multiple Replicas]\n    B -->|wrapped in| C[Ray Actor]\n    C -->|maintains| D[Step Replica State]\n    C -->|executes| E[Step Lifecycle]\n    E -->|1. Load/Create Resources| F[LLMs, Clients, etc.]\n    E -->|2. Process batches from| G[Input Queue]\n    E -->|3. Processed batches are put in| H[Output Queue]\n    E -->|4. Unload| I[Cleanup]\n
"},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#executing-a-pipeline-with-ray","title":"Executing a pipeline with Ray","text":"

The recommended way to execute a distilabel pipeline using Ray is using the Ray Jobs API.

Before jumping into the explanation, let's first install the prerequisites:

pip install distilabel[ray]\n

Tip

It's recommended to create a virtual environment.

For the purpose of explaining how to execute a pipeline with Ray, we'll use the following pipeline throughout the examples:

from distilabel.models import vLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(name=\"text-generation-ray-pipeline\") as pipeline:\n    load_data_from_hub = LoadDataFromHub(output_mappings={\"prompt\": \"instruction\"})\n\n    text_generation = TextGeneration(\n        llm=vLLM(\n            model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n            tokenizer=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        )\n    )\n\n    load_data_from_hub >> text_generation\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        parameters={\n            load_data_from_hub.name: {\n                \"repo_id\": \"HuggingFaceH4/instruction-dataset\",\n                \"split\": \"test\",\n            },\n            text_generation.name: {\n                \"llm\": {\n                    \"generation_kwargs\": {\n                        \"temperature\": 0.7,\n                        \"max_new_tokens\": 4096,\n                    }\n                },\n                \"resources\": {\"replicas\": 2, \"gpus\": 1}, # (1)\n            },\n        }\n    )\n\n    distiset.push_to_hub(\n        \"<YOUR_HF_USERNAME_OR_ORGANIZATION>/text-generation-distilabel-ray\" # (2)\n    )\n
  1. We're setting resources for the text_generation step and defining that we want two replicas and one GPU per replica. distilabel will create two replicas of the step i.e. two actors in the Ray cluster, and each actor will request to be allocated in a node of the cluster that has at least one GPU. You can read more about how Ray manages the resources here.
  2. You should modify this and add your user or organization on the Hugging Face Hub.

It's a basic pipeline with just two steps: one to load a dataset from the Hub with an instruction column and one to generate a response for that instruction using Llama 3 8B Instruct with vLLM. Simple but enough to demonstrate how to distribute and scale the workload using a Ray cluster!

"},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#using-ray-jobs-api","title":"Using Ray Jobs API","text":"

If you don't know the Ray Jobs API then it's recommended to read Ray Jobs Overview. Quick summary: Ray Jobs is the recommended way to execute a job in a Ray cluster as it will handle packaging, deploying and managing the Ray application.

To execute the pipeline above, we first need to create a directory (kind of a package) with the pipeline script (or scripts) that we will submit to the Ray cluster:

mkdir ray-pipeline\n

The content of the directory ray-pipeline should be:

ray-pipeline/\n\u251c\u2500\u2500 pipeline.py\n\u2514\u2500\u2500 runtime_env.yaml\n

The first file contains the code of the pipeline, while the second one (runtime_env.yaml) is a specific Ray file containing the environment dependencies required to run the job:

pip:\n  - distilabel[ray,vllm] >= 1.3.0\nenv_vars:\n  HF_TOKEN: <YOUR_HF_TOKEN>\n

With this file we're basically informing the Ray cluster that it will have to install distilabel with the vllm and ray extra dependencies to be able to run the job. In addition, we're defining the HF_TOKEN environment variable that will be used (by the push_to_hub method) to upload the resulting dataset to the Hugging Face Hub.

After that, we can proceed to execute the ray command that will submit the job to the Ray cluster:

ray job submit \\\n    --address http://localhost:8265 \\\n    --working-dir ray-pipeline \\\n    --runtime-env ray-pipeline/runtime_env.yaml -- python pipeline.py\n

What this will do is basically upload the --working-dir to the Ray cluster, install the dependencies and then execute the python pipeline.py command from the head node.

"},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#file-system-requirements","title":"File system requirements","text":"

As described in Using a file system to pass data to steps, distilabel relies on the file system to pass the data to the GlobalSteps, so if the pipeline to be executed in the Ray cluster has any GlobalStep or you want to set use_fs_to_pass_data=True in the run method, then you will need to set up a file system to which all the nodes of the Ray cluster have access:

if __name__ == \"__main__\":\n    distiset = pipeline.run(\n        parameters={...},\n        storage_parameters={\"path\": \"file:///mnt/data\"}, # (1)\n        use_fs_to_pass_data=True,\n    )\n
  1. All the nodes of the Ray cluster should have access to /mnt/data.
"},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#executing-a-raypipeline-in-a-cluster-with-slurm","title":"Executing a RayPipeline in a cluster with Slurm","text":"

If you have access to an HPC, then you're probably also a user of Slurm, a workload manager typically used on HPCs. We can create a Slurm job that takes some nodes and deploys a Ray cluster to run a distributed distilabel pipeline:

#!/bin/bash\n#SBATCH --job-name=distilabel-ray-text-generation\n#SBATCH --partition=your-partition\n#SBATCH --qos=normal\n#SBATCH --nodes=2 # (1)\n#SBATCH --exclusive\n#SBATCH --ntasks-per-node=1 # (2)\n#SBATCH --gpus-per-node=1 # (3)\n#SBATCH --time=0:30:00\n\nset -ex\n\necho \"SLURM_JOB_ID: $SLURM_JOB_ID\"\necho \"SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST\"\n\n# Activate virtual environment\nsource /path/to/virtualenv/.venv/bin/activate\n\n# Getting the node names\nnodes=$(scontrol show hostnames \"$SLURM_JOB_NODELIST\")\nnodes_array=($nodes)\n\n# Get the IP address of the head node\nhead_node=${nodes_array[0]}\nhead_node_ip=$(srun --nodes=1 --ntasks=1 -w \"$head_node\" hostname --ip-address)\n\n# Start Ray head node\nport=6379\nip_head=$head_node_ip:$port\nexport ip_head\necho \"IP Head: $ip_head\"\n\n# Generate a unique Ray tmp dir for the head node (just in case the default one is not writable)\nhead_tmp_dir=\"/tmp/ray_tmp_${SLURM_JOB_ID}_head\"\n\necho \"Starting HEAD at $head_node\"\nOUTLINES_CACHE_DIR=\"/tmp/.outlines\" srun --nodes=1 --ntasks=1 -w \"$head_node\" \\ # (4)\n    ray start --head --node-ip-address=\"$head_node_ip\" --port=$port \\\n    --dashboard-host=0.0.0.0 \\\n    --dashboard-port=8265 \\\n    --temp-dir=\"$head_tmp_dir\" \\\n    --block &\n\n# Give some time to head node to start...\necho \"Waiting a bit before starting worker nodes...\"\nsleep 10\n\n# Start Ray worker nodes\nworker_num=$((SLURM_JOB_NUM_NODES - 1))\n\n# Start from 1 (0 is head node)\nfor ((i = 1; i <= worker_num; i++)); do\n    node_i=${nodes_array[$i]}\n    worker_tmp_dir=\"/tmp/ray_tmp_${SLURM_JOB_ID}_worker_$i\"\n    echo \"Starting WORKER $i at $node_i\"\n    OUTLINES_CACHE_DIR=\"/tmp/.outlines\" srun --nodes=1 --ntasks=1 -w \"$node_i\" \\\n        ray start --address \"$ip_head\" \\\n        --temp-dir=\"$worker_tmp_dir\" \\\n        --block &\n    sleep 5\ndone\n\n# Give some time to the Ray cluster to gather info\necho \"Waiting a bit before submitting the 
job...\"\nsleep 60\n\n# Finally submit the job to the cluster\nray job submit --address http://localhost:8265 --working-dir ray-pipeline -- python -u pipeline.py\n
  1. In this case, we just want two nodes: one to run the Ray head node and one to run a worker.
  2. We just want to run a task per node i.e. the Ray command that starts the head/worker node.
  3. We have selected 1 GPU per node, but we could have selected more depending on the pipeline.
  4. We need to set the environment variable OUTLINES_CACHE_DIR to /tmp/.outlines to avoid issues with the nodes trying to read/write the same outlines cache files, which is not possible.
"},{"location":"sections/how_to_guides/advanced/scaling_with_ray/#vllm-and-tensor_parallel_size","title":"vLLM and tensor_parallel_size","text":"

In order to use vLLM multi-GPU and multi-node capabilities with ray, we need to make a few changes in the example pipeline from above. The first change needed is to specify a value for tensor_parallel_size aka \"In how many GPUs do I want you to load the model\", and the second one is to define ray as the distributed_executor_backend, as the default one in vLLM is to use multiprocessing:

with Pipeline(name=\"text-generation-ray-pipeline\") as pipeline:\n    load_data_from_hub = LoadDataFromHub(output_mappings={\"prompt\": \"instruction\"})\n\n    text_generation = TextGeneration(\n        llm=vLLM(\n            model=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            extra_kwargs={\n                \"tensor_parallel_size\": 8,\n                \"distributed_executor_backend\": \"ray\",\n            }\n        )\n    )\n\n    load_data_from_hub >> text_generation\n

More information about distributed inference with vLLM can be found here: vLLM - Distributed Serving

"},{"location":"sections/how_to_guides/advanced/serving_an_llm_for_reuse/","title":"Serving an LLM for sharing it between several Tasks","text":"

It's very common to want to use the same LLM for several Tasks in a pipeline. To avoid loading the LLM as many times as the number of Tasks and avoid wasting resources, it's recommended to serve the model using solutions like text-generation-inference or vLLM, and then use an AsyncLLM compatible client like InferenceEndpointsLLM or OpenAILLM to communicate with the server respectively.

"},{"location":"sections/how_to_guides/advanced/serving_an_llm_for_reuse/#serving-llms-using-text-generation-inference","title":"Serving LLMs using text-generation-inference","text":"
model=meta-llama/Meta-Llama-3-8B-Instruct\nvolume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run\n\ndocker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \\\n    -e HUGGING_FACE_HUB_TOKEN=<secret> \\\n    ghcr.io/huggingface/text-generation-inference:2.0.4 \\\n    --model-id $model\n

Note

The bash command above has been copy-pasted from the official docs text-generation-inference. Please refer to the official docs for more information.

And then we can use InferenceEndpointsLLM with base_url=http://localhost:8080 (pointing to our TGI local deployment):

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration, UltraFeedback\n\nwith Pipeline(name=\"serving-llm\") as pipeline:\n    load_data = LoadDataFromDicts(\n        data=[{\"instruction\": \"Write a poem about the sun and moon.\"}]\n    )\n\n    # `base_url` points to the address of the `TGI` serving the LLM\n    llm = InferenceEndpointsLLM(base_url=\"http://192.168.1.138:8080\")\n\n    text_generation = TextGeneration(\n        llm=llm,\n        num_generations=3,\n        group_generations=True,\n        output_mappings={\"generation\": \"generations\"},\n    )\n\n    ultrafeedback = UltraFeedback(aspect=\"overall-rating\", llm=llm)\n\n    load_data >> text_generation >> ultrafeedback\n
"},{"location":"sections/how_to_guides/advanced/serving_an_llm_for_reuse/#serving-llms-using-vllm","title":"Serving LLMs using vLLM","text":"
docker run --gpus all \\\n    -v ~/.cache/huggingface:/root/.cache/huggingface \\\n    --env \"HUGGING_FACE_HUB_TOKEN=<secret>\" \\\n    -p 8000:8000 \\\n    --ipc=host \\\n    vllm/vllm-openai:latest \\\n    --model meta-llama/Meta-Llama-3-8B-Instruct\n

Note

The bash command above has been copy-pasted from the official docs vLLM. Please refer to the official docs for more information.

And then we can use OpenAILLM with base_url=http://localhost:8000 (pointing to our vLLM local deployment):

from distilabel.models import OpenAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration, UltraFeedback\n\nwith Pipeline(name=\"serving-llm\") as pipeline:\n    load_data = LoadDataFromDicts(\n        data=[{\"instruction\": \"Write a poem about the sun and moon.\"}]\n    )\n\n    # `base_url` points to the address of the `vLLM` serving the LLM\n    llm = OpenAILLM(base_url=\"http://192.168.1.138:8000\", model=\"\")\n\n    text_generation = TextGeneration(\n        llm=llm,\n        num_generations=3,\n        group_generations=True,\n        output_mappings={\"generation\": \"generations\"},\n    )\n\n    ultrafeedback = UltraFeedback(aspect=\"overall-rating\", llm=llm)\n\n    load_data >> text_generation >> ultrafeedback\n
"},{"location":"sections/how_to_guides/advanced/structured_generation/","title":"Structured data generation","text":"

Distilabel has integrations with relevant libraries to generate structured text i.e. to guide the LLM towards the generation of structured outputs following a JSON schema, a regex, etc.

"},{"location":"sections/how_to_guides/advanced/structured_generation/#outlines","title":"Outlines","text":"

Distilabel integrates outlines within some LLM subclasses. At the moment, the following LLMs integrated with outlines are supported in distilabel: TransformersLLM, vLLM or LlamaCppLLM, so that anyone can generate structured outputs in the form of JSON or a parseable regex.

The LLM has an argument named structured_output1 that determines how we can generate structured outputs with it, let's see an example using LlamaCppLLM.

Note

For outlines integration to work you may need to install the corresponding dependencies:

pip install distilabel[outlines]\n
"},{"location":"sections/how_to_guides/advanced/structured_generation/#json","title":"JSON","text":"

We will start with a JSON example, where we initially define a pydantic.BaseModel schema to guide the generation of the structured output.

Note

Take a look at StructuredOutputType to see the expected format of the structured_output dict variable.

from pydantic import BaseModel\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n

And then we provide that schema to the structured_output argument of the LLM.

from distilabel.models import LlamaCppLLM\n\nllm = LlamaCppLLM(\n    model_path=\"./openhermes-2.5-mistral-7b.Q4_K_M.gguf\",  # (1)\n    n_gpu_layers=-1,\n    n_ctx=1024,\n    structured_output={\"format\": \"json\", \"schema\": User},\n)\nllm.load()\n
  1. We have previously downloaded a GGUF model i.e. llama.cpp compatible, from the Hugging Face Hub using curl2, but any model can be used as replacement, as long as the model_path argument is updated.

And we are ready to pass our instruction as usual:

import json\n\nresult = llm.generate(\n    [[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]],\n    max_new_tokens=50\n)\n\ndata = json.loads(result[0][0])\ndata\n# {'name': 'Kathy', 'last_name': 'Smith', 'id': 4539210}\nUser(**data)\n# User(name='Kathy', last_name='Smith', id=4539210)\n

We get back a Python dictionary (formatted as a string) that we can parse using json.loads, or validate it directly using the User, which is a pydantic.BaseModel instance.

"},{"location":"sections/how_to_guides/advanced/structured_generation/#regex","title":"Regex","text":"

The following example shows text generation whose output adheres to a regular expression:

pattern = r\"<name>(.*?)</name>.*?<grade>(.*?)</grade>\"  #\u00a0the same pattern for re.compile\n\nllm=LlamaCppLLM(\n    model_path=model_path,\n    n_gpu_layers=-1,\n    n_ctx=1024,\n    structured_output={\"format\": \"regex\", \"schema\": pattern},\n)\nllm.load()\n\nresult = llm.generate(\n    [\n        [\n            {\"role\": \"system\", \"content\": \"You are Simpsons' fans who loves assigning grades from A to E, where A is the best and E is the worst.\"},\n            {\"role\": \"user\", \"content\": \"What's up with Homer Simpson?\"}\n        ]\n    ],\n    max_new_tokens=200\n)\n

We can check the output by parsing the content using the same pattern we required from the LLM.

import re\nmatch = re.search(pattern, result[0][0])\n\nif match:\n    name = match.group(1)\n    grade = match.group(2)\n    print(f\"Name: {name}, Grade: {grade}\")\n# Name: Homer Simpson, Grade: C+\n

These were some simple examples, but one can see the options this opens.

Tip

A full pipeline example can be seen in the following script: examples/structured_generation_with_outlines.py

"},{"location":"sections/how_to_guides/advanced/structured_generation/#instructor","title":"Instructor","text":"

For other LLM providers behind APIs, there's no direct way of accessing the internal logit processor like outlines does, but thanks to instructor we can generate structured output from LLM providers based on pydantic.BaseModel objects. We have integrated instructor to deal with the AsyncLLM.

Note

For instructor integration to work you may need to install the corresponding dependencies:

pip install distilabel[instructor]\n

Note

Take a look at InstructorStructuredOutputType to see the expected format of the structured_output dict variable.

The following is the same example shown in the outlines JSON section, repeated here for comparison purposes.

from pydantic import BaseModel\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n

And then we provide that schema to the structured_output argument of the LLM:

Note

In this example we are using Meta Llama 3.1 8B Instruct, keep in mind not all the models support structured outputs.

from distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n    tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n    structured_output={\"schema\": User}\n)\nllm.load()\n

And we are ready to pass our instructions as usual:

import json\n\nresult = llm.generate(\n    [[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]],\n    max_new_tokens=256\n)\n\ndata = json.loads(result[0][0])\ndata\n# {'name': 'John', 'last_name': 'Doe', 'id': 12345}\nUser(**data)\n# User(name='John', last_name='Doe', id=12345)\n

We get back a Python dictionary (formatted as a string) that we can parse using json.loads, or validate it directly using the User, which is a pydantic.BaseModel instance.

Tip

A full pipeline example can be seen in the following script: examples/structured_generation_with_instructor.py

"},{"location":"sections/how_to_guides/advanced/structured_generation/#openai-json","title":"OpenAI JSON","text":"

OpenAI offers a JSON Mode to deal with structured output via their API, let's see how to make use of it. The JSON mode instructs the model to always return a JSON object following the instruction required.

Warning

Bear in mind, for this to work, you must instruct the model in some way to generate JSON, either in the system message or in the instruction, as can be seen in the API reference.

Contrary to what we have via outlines, JSON mode will not guarantee the output matches any specific schema, only that it is valid and parses without errors. More information can be found in the OpenAI documentation.

Other than the reference to generating JSON, to ensure the model generates parseable JSON we can pass the argument response_format=\"json\"3:

from distilabel.models import OpenAILLM\nllm = OpenAILLM(model=\"gpt4-turbo\", api_key=\"api.key\")\nllm.generate(..., response_format=\"json\")\n
  1. You can check the variable type by importing it from:

    from distilabel.steps.tasks.structured_outputs.outlines import StructuredOutputType\n
    \u21a9

  2. Download the model with curl:

    curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n
    \u21a9

  3. Keep in mind that to interact with this response_format argument in a pipeline, you will have to pass it via the generation_kwargs:

    # Assuming a pipeline is already defined, and we have a task using OpenAILLM called `task_with_openai`:\npipeline.run(\n    parameters={\n        \"task_with_openai\": {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"response_format\": \"json\"\n                }\n            }\n        }\n    }\n)\n
    \u21a9

"},{"location":"sections/how_to_guides/advanced/cli/","title":"Command Line Interface (CLI)","text":"

Distilabel offers a CLI to explore and re-run existing Pipeline dumps, meaning that an existing dump can be explored to see the steps, how those are connected, the runtime parameters used, and also re-run it with the same or different runtime parameters, respectively.

"},{"location":"sections/how_to_guides/advanced/cli/#available-commands","title":"Available commands","text":"

The only available command as of the current version of distilabel is distilabel pipeline.

$ distilabel pipeline --help\n\n Usage: distilabel pipeline [OPTIONS] COMMAND [ARGS]...\n\n Commands to run and inspect Distilabel pipelines.\n\n\u256d\u2500 Options \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 --help          Show this message and exit.                                             \u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n\u256d\u2500 Commands \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 info      Get information about a Distilabel pipeline.                                  \u2502\n\u2502 run       Run a Distilabel pipeline.                                                    
\u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n

So, distilabel pipeline has two subcommands: info and run, as described below. Note that for testing purposes we will be using the following dataset.

"},{"location":"sections/how_to_guides/advanced/cli/#distilabel-pipeline-info","title":"distilabel pipeline info","text":"
$ distilabel pipeline info --help\n\n Usage: distilabel pipeline info [OPTIONS]\n\n Get information about a Distilabel pipeline.\n\n\u256d\u2500 Options \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 *  --config        TEXT  Path or URL to the Distilabel pipeline configuration file. \u2502\n\u2502                          [default: None]                                            \u2502\n\u2502                          [required]                                                 \u2502\n\u2502    --help                Show this message and exit.                                \u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n

As we can see from the help message, we need to pass either a Path or a URL. This second option comes in handy for datasets stored in Hugging Face Hub, for example:

distilabel pipeline info --config \"https://huggingface.co/datasets/distilabel-internal-testing/instruction-dataset-mini-with-generations/raw/main/pipeline.yaml\"\n

If we take a look:

The pipeline information includes the steps used in the Pipeline along with the Runtime Parameter that was used, as well as a description of each of them, and also the connections between these steps. These can be helpful to explore the Pipeline locally.

"},{"location":"sections/how_to_guides/advanced/cli/#distilabel-pipeline-run","title":"distilabel pipeline run","text":"

We can also run a Pipeline from the CLI just pointing to the same pipeline.yaml file or a URL pointing to it and calling distilabel pipeline run. Alternatively, a URL pointing to a Python script containing a distilabel pipeline can be used:

$ distilabel pipeline run --help\n\n Usage: distilabel pipeline run [OPTIONS]\n\n Run a Distilabel pipeline.\n\n\u256d\u2500 Options \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256e\n\u2502 --param                                          PARSE_RUNTIME_PARAM  [default: (dynamic)]                                         \u2502\n\u2502 --config                                         TEXT                 Path or URL to the Distilabel pipeline configuration file.   \u2502\n\u2502                                                                       [default: None]                                              \u2502\n\u2502 --script                                         TEXT                 URL pointing to a python script containing a distilabel      \u2502\n\u2502                                                                       pipeline.                                                    \u2502\n\u2502                                                                       [default: None]                                              \u2502\n\u2502 --pipeline-variable-name                         TEXT                 Name of the pipeline in a script. I.e. 
the 'pipeline'        \u2502\n\u2502                                                                       variable in `with Pipeline(...) as pipeline:...`.            \u2502\n\u2502                                                                       [default: pipeline]                                          \u2502\n\u2502 --ignore-cache              --no-ignore-cache                         Whether to ignore the cache and re-run the pipeline from     \u2502\n\u2502                                                                       scratch.                                                     \u2502\n\u2502                                                                       [default: no-ignore-cache]                                   \u2502\n\u2502 --repo-id                                        TEXT                 The Hugging Face Hub repository ID to push the resulting     \u2502\n\u2502                                                                       dataset to.                                                  \u2502\n\u2502                                                                       [default: None]                                              \u2502\n\u2502 --commit-message                                 TEXT                 The commit message to use when pushing the dataset.          \u2502\n\u2502                                                                       [default: None]                                              \u2502\n\u2502 --private                   --no-private                              Whether to make the resulting dataset private on the Hub.    
\u2502\n\u2502                                                                       [default: no-private]                                        \u2502\n\u2502 --token                                          TEXT                 The Hugging Face Hub API token to use when pushing the       \u2502\n\u2502                                                                       dataset.                                                     \u2502\n\u2502                                                                       [default: None]                                              \u2502\n\u2502 --help                                                                Show this message and exit.                                  \u2502\n\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256f\n

Using --config option, we must pass a path with a pipeline.yaml file. To specify the runtime parameters of the steps we will need to use the --param option and the value of the parameter in the following format:

distilabel pipeline run --config \"https://huggingface.co/datasets/distilabel-internal-testing/instruction-dataset-mini-with-generations/raw/main/pipeline.yaml\" \\\n    --param load_dataset.repo_id=distilabel-internal-testing/instruction-dataset-mini \\\n    --param load_dataset.split=test \\\n    --param generate_with_gpt35.llm.generation_kwargs.max_new_tokens=512 \\\n    --param generate_with_gpt35.llm.generation_kwargs.temperature=0.7 \\\n    --param to_argilla.dataset_name=text_generation_with_gpt35 \\\n    --param to_argilla.dataset_workspace=admin\n

Or using --script we can pass directly a remote python script (keep in mind --config and --script are exclusive):

distilabel pipeline run --script \"https://huggingface.co/datasets/distilabel-internal-testing/pipe_nothing_test/raw/main/pipe_nothing.py\"\n

You can also pass runtime parameters to the python script as we saw with --config option.

Again, this helps with the reproducibility of the results, and simplifies sharing not only the final dataset but also the process to generate it.

"},{"location":"sections/how_to_guides/basic/llm/","title":"Executing Tasks with LLMs","text":""},{"location":"sections/how_to_guides/basic/llm/#working-with-llms","title":"Working with LLMs","text":"

LLM subclasses are designed to be used within a Task, but they can also be used standalone.

from distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(model=\"meta-llama/Meta-Llama-3.1-70B-Instruct\")\nllm.load()\n\nllm.generate_outputs(\n    inputs=[\n        [{\"role\": \"user\", \"content\": \"What's the capital of Spain?\"}],\n    ],\n)\n# \"The capital of Spain is Madrid.\"\n

Note

Always call the LLM.load or Task.load method when using LLMs standalone or as part of a Task. If using a Pipeline, this is done automatically in Pipeline.run().

"},{"location":"sections/how_to_guides/basic/llm/#offline-batch-generation","title":"Offline Batch Generation","text":"

By default, all LLMs will generate text in a synchronous manner i.e. send inputs using generate_outputs method that will get blocked until outputs are generated. There are some LLMs (such as OpenAILLM) that implement what we denote as offline batch generation, which allows sending the inputs to the LLM-as-a-service, which will generate the outputs asynchronously and give us a job id that we can use later to check the status and retrieve the generated outputs when they are ready. LLM-as-a-service platforms offer this feature as a way to save costs in exchange for waiting for the outputs to be generated.

To use this feature in distilabel the only thing we need to do is to set the use_offline_batch_generation attribute to True when creating the LLM instance:

from distilabel.models import OpenAILLM\n\nllm = OpenAILLM(\n    model=\"gpt-4o\",\n    use_offline_batch_generation=True,\n)\n\nllm.load()\n\nllm.jobs_ids  # (1)\n# None\n\nllm.generate_outputs(  # (2)\n    inputs=[\n        [{\"role\": \"user\", \"content\": \"What's the capital of Spain?\"}],\n    ],\n)\n# DistilabelOfflineBatchGenerationNotFinishedException: Batch generation with jobs_ids=('batch_OGB4VjKpu2ay9nz3iiFJxt5H',) is not finished\n\nllm.jobs_ids  # (3)\n# ('batch_OGB4VjKpu2ay9nz3iiFJxt5H',)\n\n\nllm.generate_outputs(  # (4)\n    inputs=[\n        [{\"role\": \"user\", \"content\": \"What's the capital of Spain?\"}],\n    ],\n)\n# \"The capital of Spain is Madrid.\"\n
  1. At first the jobs_ids attribute is None.
  2. The first call to generate_outputs will send the inputs to the LLM-as-a-service and raise a DistilabelOfflineBatchGenerationNotFinishedException since the outputs are not ready yet.
  3. After the first call to generate_outputs the jobs_ids attribute will contain the job ids created for generating the outputs.
  4. The second call or subsequent calls to generate_outputs will return the outputs if they are ready or raise a DistilabelOfflineBatchGenerationNotFinishedException if they are not ready yet.

The offline_batch_generation_block_until_done attribute can be used to block the generate_outputs method until the outputs are ready, polling the platform every specified number of seconds.

from distilabel.models import OpenAILLM\n\nllm = OpenAILLM(\n    model=\"gpt-4o\",\n    use_offline_batch_generation=True,\n    offline_batch_generation_block_until_done=5,  # poll for results every 5 seconds\n)\nllm.load()\n\nllm.generate_outputs(\n    inputs=[\n        [{\"role\": \"user\", \"content\": \"What's the capital of Spain?\"}],\n    ],\n)\n# \"The capital of Spain is Madrid.\"\n
"},{"location":"sections/how_to_guides/basic/llm/#within-a-task","title":"Within a Task","text":"

Pass the LLM as an argument to the Task, and the task will handle the rest.

from distilabel.models import OpenAILLM\nfrom distilabel.steps.tasks import TextGeneration\n\nllm = OpenAILLM(model=\"gpt-4\")\ntask = TextGeneration(name=\"text_generation\", llm=llm)\n\ntask.load()\n\nnext(task.process(inputs=[{\"instruction\": \"What's the capital of Spain?\"}]))\n# [{'instruction': \"What's the capital of Spain?\", \"generation\": \"The capital of Spain is Madrid.\"}]\n
"},{"location":"sections/how_to_guides/basic/llm/#runtime-parameters","title":"Runtime Parameters","text":"

LLMs can have runtime parameters, such as generation_kwargs, provided via the Pipeline.run() method using the parameters argument.

Note

Runtime parameters can differ between LLM subclasses, caused by the different functionalities offered by the LLM providers.

from distilabel.pipeline import Pipeline\nfrom distilabel.models import OpenAILLM\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(name=\"text-generation-pipeline\") as pipeline:\n    load_dataset = LoadDataFromDicts(\n        name=\"load_dataset\",\n        data=[{\"instruction\": \"Write a short story about a dragon that saves a princess from a tower.\"}],\n    )\n\n    text_generation = TextGeneration(\n        name=\"text_generation\",\n        llm=OpenAILLM(model=\"gpt-4\"),\n    )\n\n    load_dataset >> text_generation\n\nif __name__ == \"__main__\":\n    pipeline.run(\n        parameters={\n            text_generation.name: {\"llm\": {\"generation_kwargs\": {\"temperature\": 0.3}}},\n        },\n    )\n
"},{"location":"sections/how_to_guides/basic/llm/#creating-custom-llms","title":"Creating custom LLMs","text":"

To create custom LLMs, subclass either LLM for synchronous or AsyncLLM for asynchronous LLMs. Implement the following methods:

  • model_name: A property containing the model's name.

  • generate: A method that takes a list of prompts and returns generated texts.

  • agenerate: A method that takes a single prompt and returns generated texts. This method is used within the generate method of the AsyncLLM class.

  • (optional) get_last_hidden_state: is a method that will take a list of prompts and return a list of hidden states. This method is optional and will be used by some tasks such as the GenerateEmbeddings task.

Custom LLMCustom AsyncLLM
from typing import Any, List\n\nfrom pydantic import validate_call\n\nfrom distilabel.models import LLM\nfrom distilabel.typing import GenerateOutput, HiddenState\nfrom distilabel.typing import ChatType\n\nclass CustomLLM(LLM):\n    @property\n    def model_name(self) -> str:\n        return \"my-model\"\n\n    @validate_call\n    def generate(self, inputs: List[ChatType], num_generations: int = 1, **kwargs: Any) -> List[GenerateOutput]:\n        for _ in range(num_generations):\n            ...\n\n    def get_last_hidden_state(self, inputs: List[ChatType]) -> List[HiddenState]:\n        ...\n
from typing import Any, List\n\nfrom pydantic import validate_call\n\nfrom distilabel.models import AsyncLLM\nfrom distilabel.typing import GenerateOutput, HiddenState\nfrom distilabel.typing import ChatType\n\nclass CustomAsyncLLM(AsyncLLM):\n    @property\n    def model_name(self) -> str:\n        return \"my-model\"\n\n    @validate_call\n    async def agenerate(self, input: ChatType, num_generations: int = 1, **kwargs: Any) -> GenerateOutput:\n        for _ in range(num_generations):\n            ...\n\n    def get_last_hidden_state(self, inputs: List[ChatType]) -> List[HiddenState]:\n        ...\n

The keyword arguments of generate and agenerate (except input and num_generations) are considered RuntimeParameters, so a value can be passed to them via the parameters argument of the Pipeline.run method.

Note

To have the arguments of the generate and agenerate coerced to the expected types, the validate_call decorator is used, which will automatically coerce the arguments to the expected types, and raise an error if the types are not correct. This is especially useful when providing a value for an argument of generate or agenerate from the CLI, since the CLI will always provide the arguments as strings.

"},{"location":"sections/how_to_guides/basic/llm/#available-llms","title":"Available LLMs","text":"

Our LLM gallery shows a list of the available LLMs that can be used within the distilabel library.

"},{"location":"sections/how_to_guides/basic/pipeline/","title":"Execute Steps and Tasks in a Pipeline","text":""},{"location":"sections/how_to_guides/basic/pipeline/#how-to-create-a-pipeline","title":"How to create a pipeline","text":"

A Pipeline organises the Steps and Tasks in a sequence, where the output of one step is the input of the next one. A Pipeline should be created by making use of the context manager along with passing a name, and optionally a description.

from distilabel.pipeline import Pipeline\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    ...\n
"},{"location":"sections/how_to_guides/basic/pipeline/#connecting-steps-with-the-stepconnect-method","title":"Connecting steps with the Step.connect method","text":"

Now, we can define the steps of our Pipeline.

Note

Steps without predecessors (i.e. root steps), need to be GeneratorSteps such as LoadDataFromDicts or LoadDataFromHub. After this, other steps can be defined.

from distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    load_dataset = LoadDataFromHub(name=\"load_dataset\")\n    ...\n

Easily load your datasets

If you are already used to working with Hugging Face's Dataset via load_dataset or pd.DataFrame, you can create the GeneratorStep directly from the dataset (or dataframe) with the help of make_generator_step:

From a list of dictsFrom datasets.DatasetFrom pd.DataFrame
from distilabel.pipeline import Pipeline\nfrom distilabel.steps import make_generator_step\n\ndataset = [{\"instruction\": \"Tell me a joke.\"}]\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    loader = make_generator_step(dataset, output_mappings={\"prompt\": \"instruction\"})\n    ...\n
from datasets import load_dataset\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import make_generator_step\n\ndataset = load_dataset(\n    \"DIBT/10k_prompts_ranked\",\n    split=\"train\"\n).filter(\n    lambda r: r[\"avg_rating\"]>=4 and r[\"num_responses\"]>=2\n).select(range(500))\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    loader = make_generator_step(dataset, output_mappings={\"prompt\": \"instruction\"})\n    ...\n
import pandas as pd\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import make_generator_step\n\ndataset = pd.read_csv(\"path/to/dataset.csv\")\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    loader = make_generator_step(dataset, output_mappings={\"prompt\": \"instruction\"})\n    ...\n

Next, we will use prompt column from the dataset obtained through LoadDataFromHub and use several LLMs to execute a TextGeneration task. We will also use the Task.connect() method to connect the steps, so the output of one step is the input of the next one.

Note

The order of the execution of the steps will be determined by the connections of the steps. In this case, the TextGeneration tasks will be executed after the LoadDataFromHub step.

from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    load_dataset = LoadDataFromHub(name=\"load_dataset\")\n\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.5-pro\"),\n    ):\n        task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n        task.connect(load_dataset)\n\n    ...\n

For each row of the dataset, the TextGeneration task will generate a text based on the instruction column and the LLM model, and store the result (a single string) in a new column called generation. Because we need to have the responses in the same column, we will add GroupColumns to combine them all in the same column as a list of strings.

Note

In this case, the GroupColumns tasks will be executed after all TextGeneration steps.

from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    load_dataset = LoadDataFromHub(name=\"load_dataset\")\n\n    combine_generations = GroupColumns(\n        name=\"combine_generations\",\n        columns=[\"generation\", \"model_name\"],\n        output_columns=[\"generations\", \"model_names\"],\n    )\n\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.5-pro\"),\n    ):\n        task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n        load_dataset.connect(task)\n        task.connect(combine_generations)\n
"},{"location":"sections/how_to_guides/basic/pipeline/#connecting-steps-with-the-operator","title":"Connecting steps with the >> operator","text":"

Besides the Step.connect method: step1.connect(step2), there's an alternative way by making use of the >> operator. We can connect steps in a more readable way, and it's also possible to connect multiple steps at once.

Step per stepMultiple steps at once

Each call to step1.connect(step2) has been exchanged by step1 >> step2 within the loop.

from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    load_dataset = LoadDataFromHub(name=\"load_dataset\")\n\n    combine_generations = GroupColumns(\n        name=\"combine_generations\",\n        columns=[\"generation\", \"model_name\"],\n        output_columns=[\"generations\", \"model_names\"],\n    )\n\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.5-pro\"),\n    ):\n        task = TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n        load_dataset >> task >> combine_generations\n

Each task is first appended to a list, and then all the calls to connections are done in a single call.

from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    load_dataset = LoadDataFromHub(name=\"load_dataset\")\n\n    combine_generations = GroupColumns(\n        name=\"combine_generations\",\n        columns=[\"generation\", \"model_name\"],\n        output_columns=[\"generations\", \"model_names\"],\n    )\n\n    tasks = []\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.5-pro\"),\n    ):\n        tasks.append(\n            TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n        )\n\n    load_dataset >> tasks >> combine_generations\n
"},{"location":"sections/how_to_guides/basic/pipeline/#routing-batches-to-specific-downstream-steps","title":"Routing batches to specific downstream steps","text":"

In some pipelines, you may want to send batches from a single upstream step to specific downstream steps based on certain conditions. To achieve this, you can use a routing_batch_function. This function takes a list of downstream steps and returns a list of step names to which each batch should be routed.

Let's update the example above to route the batches loaded by the LoadDataFromHub step to just 2 of the TextGeneration tasks. First, we will create our custom routing_batch_function, and then we will update the pipeline to use it:

import random\nfrom distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline, routing_batch_function\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\n@routing_batch_function\ndef sample_two_steps(steps: list[str]) -> list[str]:\n    return random.sample(steps, 2)\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    load_dataset = LoadDataFromHub(\n        name=\"load_dataset\",\n        output_mappings={\"prompt\": \"instruction\"},\n    )\n\n    tasks = []\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.0-pro\"),\n    ):\n        tasks.append(\n            TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n        )\n\n    combine_generations = GroupColumns(\n        name=\"combine_generations\",\n        columns=[\"generation\", \"model_name\"],\n        output_columns=[\"generations\", \"model_names\"],\n    )\n\n    load_dataset >> sample_two_steps >> tasks >> combine_generations\n

The routing_batch_function that we just built is a common one, so distilabel comes with a builtin function that can be used to achieve the same behavior:

from distilabel.pipeline import sample_n_steps\n\nsample_two_steps = sample_n_steps(2)\n
"},{"location":"sections/how_to_guides/basic/pipeline/#running-the-pipeline","title":"Running the pipeline","text":""},{"location":"sections/how_to_guides/basic/pipeline/#pipelinedry_run","title":"Pipeline.dry_run","text":"

Before running the Pipeline we can check if the pipeline is valid using the Pipeline.dry_run() method. It takes the same parameters as the run method which we will discuss in the following section, plus the batch_size we want the dry run to use (by default set to 1).

with Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    ...\n\nif __name__ == \"__main__\":\n    distiset = pipeline.dry_run(parameters=..., batch_size=1)\n
"},{"location":"sections/how_to_guides/basic/pipeline/#pipelinerun","title":"Pipeline.run","text":"

After testing, we can now execute the full Pipeline using the Pipeline.run() method.

with Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    ...\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        parameters={\n            \"load_dataset\": {\n                \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n                \"split\": \"test\",\n            },\n            \"text_generation_with_gpt-4-0125-preview\": {\n                \"llm\": {\n                    \"generation_kwargs\": {\n                        \"temperature\": 0.7,\n                        \"max_new_tokens\": 512,\n                    }\n                }\n            },\n            \"text_generation_with_mistral-large-2402\": {\n                \"llm\": {\n                    \"generation_kwargs\": {\n                        \"temperature\": 0.7,\n                        \"max_new_tokens\": 512,\n                    }\n                }\n            },\n            \"text_generation_with_gemini-1.0-pro\": {\n                \"llm\": {\n                    \"generation_kwargs\": {\n                        \"temperature\": 0.7,\n                        \"max_new_tokens\": 512,\n                    }\n                }\n            },\n        },\n    )\n

But if we run the pipeline above, we will see that the run method will fail:

ValueError: Step 'text_generation_with_gpt-4-0125-preview' requires inputs ['instruction'], but only the inputs=['prompt', 'completion', 'meta'] are available, which means that the inputs=['instruction'] are missing or not available\nwhen the step gets to be executed in the pipeline. Please make sure previous steps to 'text_generation_with_gpt-4-0125-preview' are generating the required inputs.\n

This is because, before actually running the pipeline, we must ensure each step has the necessary input columns to be executed. In this case, the TextGeneration task requires the instruction column, but the LoadDataFromHub step generates the prompt column. To solve this, we can use the output_mappings or input_mappings arguments of individual Steps, to map columns from one step to another.

with Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    load_dataset = LoadDataFromHub(\n        name=\"load_dataset\",\n        output_mappings={\"prompt\": \"instruction\"}\n    )\n\n    ...\n

If we execute the pipeline again, it will run successfully and we will have a Distiset with the outputs of all the leaf steps of the pipeline which we can push to the Hugging Face Hub.

if __name__ == \"__main__\":\n    distiset = pipeline.run(...)\n    distiset.push_to_hub(\"distilabel-internal-testing/instruction-dataset-mini-with-generations\")\n
"},{"location":"sections/how_to_guides/basic/pipeline/#pipelinerun-with-a-dataset","title":"Pipeline.run with a dataset","text":"

Note that in most cases if you don't need the extra flexibility the GeneratorSteps bring you, you can create a dataset as you would normally do and pass it to the Pipeline.run method directly. Look at the highlighted lines to see the updated lines:

import random\nfrom distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline, routing_batch_function\nfrom distilabel.steps import GroupColumns\nfrom distilabel.steps.tasks import TextGeneration\n\n@routing_batch_function\ndef sample_two_steps(steps: list[str]) -> list[str]:\n    return random.sample(steps, 2)\n\ndataset = load_dataset(\n    \"distilabel-internal-testing/instruction-dataset-mini\",\n    split=\"test\"\n)\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    tasks = []\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.0-pro\"),\n    ):\n        tasks.append(\n            TextGeneration(name=f\"text_generation_with_{llm.model_name}\", llm=llm)\n        )\n\n    combine_generations = GroupColumns(\n        name=\"combine_generations\",\n        columns=[\"generation\", \"model_name\"],\n        output_columns=[\"generations\", \"model_names\"],\n    )\n\n    sample_two_steps >> tasks >> combine_generations\n\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        dataset=dataset,\n        parameters=...\n    )\n
"},{"location":"sections/how_to_guides/basic/pipeline/#stopping-the-pipeline","title":"Stopping the pipeline","text":"

In case you want to stop the pipeline while it's running, you can press Ctrl+C or Cmd+C depending on your OS (or send a SIGINT to the main process), and the outputs will be stored in the cache. Pressing an additional time will force the pipeline to stop its execution, but this can lead to losing the generated outputs for certain batches.

"},{"location":"sections/how_to_guides/basic/pipeline/#cache","title":"Cache","text":"

If for some reason, the pipeline execution stops (for example by pressing Ctrl+C), the state of the pipeline and the outputs will be stored in the cache, so we can resume the pipeline execution from the point where it was stopped.

If we want to force the pipeline to run again without using the cache, then we can set the use_cache argument of the Pipeline.run() method to False:

if __name__ == \"__main__\":\n    distiset = pipeline.run(parameters={...}, use_cache=False)\n

Note

For more information on caching, we refer the reader to the caching section.

"},{"location":"sections/how_to_guides/basic/pipeline/#adjusting-the-batch-size-for-each-step","title":"Adjusting the batch size for each step","text":"

Memory issues can arise when processing large datasets or when using large models. To avoid this, we can use the input_batch_size argument of individual tasks. TextGeneration task will receive 5 dictionaries, while the LoadDataFromHub step will send 10 dictionaries per batch:

from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    load_dataset = LoadDataFromHub(\n        name=\"load_dataset\",\n        output_mappings={\"prompt\": \"instruction\"},\n        batch_size=10\n    )\n\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.5-pro\"),\n    ):\n        task = TextGeneration(\n            name=f\"text_generation_with_{llm.model_name.replace('.', '-')}\",\n            llm=llm,\n            input_batch_size=5,\n        )\n\n    ...\n
"},{"location":"sections/how_to_guides/basic/pipeline/#serializing-the-pipeline","title":"Serializing the pipeline","text":"

Sharing a pipeline with others is very easy, as we can serialize the pipeline object using the save method. We can save the pipeline in different formats, such as yaml or json:

yamljson
if __name__ == \"__main__\":\n    pipeline.save(\"pipeline.yaml\", format=\"yaml\")\n
if __name__ == \"__main__\":\n    pipeline.save(\"pipeline.json\", format=\"json\")\n

To load the pipeline, we can use the from_yaml or from_json methods:

yamljson
pipeline = Pipeline.from_yaml(\"pipeline.yaml\")\n
pipeline = Pipeline.from_json(\"pipeline.json\")\n

Serializing the pipeline is very useful when we want to share the pipeline with others, or when we want to store the pipeline for future use. It can even be hosted online, so the pipeline can be executed directly using the CLI.

"},{"location":"sections/how_to_guides/basic/pipeline/#visualizing-the-pipeline","title":"Visualizing the pipeline","text":"

We can visualize the pipeline using the Pipeline.draw() method. This will create a mermaid graph, and return the path to the image.

path_to_image = pipeline.draw(\n    top_to_bottom=True,\n    show_edge_labels=True,\n)\n

Within notebooks, we can simply call pipeline and the graph will be displayed. Alternatively, we can use the Pipeline.draw() method to have more control over the graph visualization and use IPython to display it.

from IPython.display import Image, display\n\ndisplay(Image(path_to_image))\n

Let's now see what the pipeline of the fully working example looks like.

"},{"location":"sections/how_to_guides/basic/pipeline/#fully-working-example","title":"Fully working example","text":"

To sum up, here is the full code of the pipeline we have created in this section. Note that you will need to change the name of the Hugging Face repository where the resulting dataset will be pushed, set the OPENAI_API_KEY environment variable, set MISTRAL_API_KEY and have gcloud installed and configured:

Code
from distilabel.models import MistralLLM, OpenAILLM, VertexAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import GroupColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import TextGeneration\n\nwith Pipeline(\"pipe-name\", description=\"My first pipe\") as pipeline:\n    load_dataset = LoadDataFromHub(\n        name=\"load_dataset\",\n        output_mappings={\"prompt\": \"instruction\"},\n    )\n\n    combine_generations = GroupColumns(\n        name=\"combine_generations\",\n        columns=[\"generation\", \"model_name\"],\n        output_columns=[\"generations\", \"model_names\"],\n    )\n\n    for llm in (\n        OpenAILLM(model=\"gpt-4-0125-preview\"),\n        MistralLLM(model=\"mistral-large-2402\"),\n        VertexAILLM(model=\"gemini-1.0-pro\"),\n    ):\n        task = TextGeneration(\n            name=f\"text_generation_with_{llm.model_name.replace('.', '-')}\", llm=llm\n        )\n        load_dataset.connect(task)\n        task.connect(combine_generations)\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        parameters={\n            \"load_dataset\": {\n                \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n                \"split\": \"test\",\n            },\n            \"text_generation_with_gpt-4-0125-preview\": {\n                \"llm\": {\n                    \"generation_kwargs\": {\n                        \"temperature\": 0.7,\n                        \"max_new_tokens\": 512,\n                    }\n                }\n            },\n            \"text_generation_with_mistral-large-2402\": {\n                \"llm\": {\n                    \"generation_kwargs\": {\n                        \"temperature\": 0.7,\n                        \"max_new_tokens\": 512,\n                    }\n                }\n            },\n            \"text_generation_with_gemini-1.0-pro\": {\n                \"llm\": {\n                    \"generation_kwargs\": {\n                   
     \"temperature\": 0.7,\n                        \"max_new_tokens\": 512,\n                    }\n                }\n            },\n        },\n    )\n    distiset.push_to_hub(\n        \"distilabel-internal-testing/instruction-dataset-mini-with-generations\"\n    )\n
"},{"location":"sections/how_to_guides/basic/step/","title":"Steps for processing data","text":""},{"location":"sections/how_to_guides/basic/step/#working-with-steps","title":"Working with Steps","text":"

The Step is intended to be used within the scope of a Pipeline, which will orchestrate the different steps defined but can also be used standalone.

Assuming that we have a Step already defined as follows:

from typing import TYPE_CHECKING\nfrom distilabel.steps import Step, StepInput\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepColumns, StepOutput\n\nclass MyStep(Step):\n    @property\n    def inputs(self) -> \"StepColumns\":\n        return [\"input_field\"]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        return [\"output_field\"]\n\n    def process(self, inputs: StepInput) -> \"StepOutput\":\n        for input in inputs:\n            input[\"output_field\"] = input[\"input_field\"]\n        yield inputs\n

Then we can use it as follows:

step = MyStep(name=\"my-step\")\nstep.load()\n\nnext(step.process([{\"input_field\": \"value\"}]))\n# [{'input_field': 'value', 'output_field': 'value'}]\n

Note

The Step.load() always needs to be executed when being used as a standalone. Within a pipeline, this will be done automatically during pipeline execution.

"},{"location":"sections/how_to_guides/basic/step/#arguments","title":"Arguments","text":"
  • input_mappings, is a dictionary that maps keys from the input dictionaries to the keys expected by the step. For example, if input_mappings={\"instruction\": \"prompt\"}, means that the input key prompt will be used as the key instruction for current step.

  • output_mappings, is a dictionary that can be used to map the outputs of the step to other names. For example, if output_mappings={\"conversation\": \"prompt\"}, means that output key conversation will be renamed to prompt for the next step.

  • input_batch_size (by default set to 50), is independent for every step and will determine how many input dictionaries it will process at once.

"},{"location":"sections/how_to_guides/basic/step/#runtime-parameters","title":"Runtime parameters","text":"

Steps can also have RuntimeParameter, which are parameters that can only be used after the pipeline initialisation when calling the Pipeline.run.

from distilabel.mixins.runtime_parameters import RuntimeParameter\n\nclass Step(...):\n    input_batch_size: RuntimeParameter[PositiveInt] = Field(\n        default=DEFAULT_INPUT_BATCH_SIZE,\n        description=\"The number of rows that will contain the batches processed by the\"\n        \" step.\",\n    )\n
"},{"location":"sections/how_to_guides/basic/step/#types-of-steps","title":"Types of Steps","text":"

There are three special types of Step in distilabel:

  • GeneratorStep: is a step that only generates data, and it doesn't need any input data from previous steps and normally is the first node in a Pipeline. More information: Components -> Step - GeneratorStep.

  • GlobalStep: is a step with the standard interface i.e. receives inputs and generates outputs, but it processes all the data at once, and often is the final step in the Pipeline. A GlobalStep requires the previous steps to finish before being able to start. More information: Components - Step - GlobalStep.

  • Task, is essentially the same as a default Step, but it relies on an LLM as an attribute, and the process method will be in charge of calling that LLM. More information: Components - Task.

"},{"location":"sections/how_to_guides/basic/step/#defining-custom-steps","title":"Defining custom Steps","text":"

We can define a custom step by creating a new subclass of the Step and defining the following:

  • inputs: is a property that returns a list of strings with the names of the required input fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not.

  • outputs: is a property that returns a list of strings with the names of the output fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not.

  • process: is a method that receives the input data and returns the output data, and it should be a generator, meaning that it should yield the output data.

Note

The default signature for the process method is process(self, *inputs: StepInput) -> StepOutput. The argument inputs should be respected, no more arguments can be provided, and the type-hints and return type-hints should be respected too because it should be able to receive any number of inputs by default i.e. more than one Step at a time could be connected to the current one.

Warning

For the custom Step subclasses to work properly with distilabel and with the validation and serialization performed by default over each Step in the Pipeline, the type-hint for both StepInput and StepOutput should be used and not surrounded with double-quotes or imported under typing.TYPE_CHECKING, otherwise, the validation and/or serialization will fail.

Inherit from StepUsing the @step decorator

We can inherit from the Step class and define the inputs, outputs, and process methods as follows:

from typing import TYPE_CHECKING\nfrom distilabel.steps import Step, StepInput\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepColumns, StepOutput\n\nclass CustomStep(Step):\n    @property\n    def inputs(self) -> \"StepColumns\":\n        ...\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        ...\n\n    def process(self, *inputs: StepInput) -> \"StepOutput\":\n        for upstream_step_inputs in inputs:\n            ...\n            yield item\n\n    # When overridden (ideally under the `typing_extensions.override` decorator)\n    # @typing_extensions.override\n    # def process(self, inputs: StepInput) -> StepOutput:\n    #     for input in inputs:\n    #         ...\n    #     yield inputs\n

The @step decorator will take care of the boilerplate code, and will allow you to define the inputs, outputs, and process methods in a more straightforward way. One downside is that it won't let you access the self attributes, if any, nor set them, so if you need to access or set any attribute, you should go with the first approach of defining the custom Step subclass.

from typing import TYPE_CHECKING\nfrom distilabel.steps import StepInput, step\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepOutput\n\n@step(inputs=[...], outputs=[...])\ndef CustomStep(inputs: StepInput) -> \"StepOutput\":\n    for input in inputs:\n        ...\n    yield inputs\n\nstep = CustomStep(name=\"my-step\")\n
"},{"location":"sections/how_to_guides/basic/step/generator_step/","title":"GeneratorStep","text":"

The GeneratorStep is a subclass of Step that is intended to be used as the first step within a Pipeline, because it doesn't require input and generates data that can be used by other steps. Alternatively, it can also be used as a standalone.

from typing import List, TYPE_CHECKING\nfrom typing_extensions import override\n\nfrom distilabel.steps import GeneratorStep\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepColumns, GeneratorStepOutput\n\nclass MyGeneratorStep(GeneratorStep):\n    instructions: List[str]\n\n    @override\n    def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n        if offset:\n            self.instructions = self.instructions[offset:]\n\n        while self.instructions:\n            batch = [\n                {\n                    \"instruction\": instruction\n                } for instruction in self.instructions[: self.batch_size]\n            ]\n            self.instructions = self.instructions[self.batch_size :]\n            yield (\n                batch,\n                True if len(self.instructions) == 0 else False,\n            )\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        return [\"instruction\"]\n

Then we can use it as follows:

step = MyGeneratorStep(\n    name=\"my-generator-step\",\n    instructions=[\"Tell me a joke.\", \"Tell me a story.\"],\n    batch_size=1,\n)\nstep.load()\n\nnext(step.process(offset=0))\n# ([{'instruction': 'Tell me a joke.'}], False)\nnext(step.process(offset=1))\n# ([{'instruction': 'Tell me a story.'}], True)\n

Note

The Step.load() always needs to be executed when being used as a standalone. Within a pipeline, this will be done automatically during pipeline execution.

"},{"location":"sections/how_to_guides/basic/step/generator_step/#defining-custom-generatorsteps","title":"Defining custom GeneratorSteps","text":"

We can define a custom generator step by creating a new subclass of the GeneratorStep and defining the following:

  • outputs: is a property that returns a list of strings with the names of the output fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not.

  • process: is a method that yields output data and a boolean flag indicating whether that's the last batch to be generated.

Note

The default signature for the process method is process(self, offset: int = 0) -> GeneratorStepOutput. The argument offset should be respected, no more arguments can be provided, and the type-hints and return type-hints should be respected too because it should be able to receive any number of inputs by default i.e. more than one Step at a time could be connected to the current one.

Warning

For the custom Step subclasses to work properly with distilabel and with the validation and serialization performed by default over each Step in the Pipeline, the type-hint for both StepInput and StepOutput should be used and not surrounded with double-quotes or imported under typing.TYPE_CHECKING, otherwise, the validation and/or serialization will fail.

Inherit from GeneratorStepUsing the @step decorator

We can inherit from the GeneratorStep class and define the outputs, and process methods as follows:

from typing import List, TYPE_CHECKING\nfrom typing_extensions import override\n\nfrom distilabel.steps import GeneratorStep\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepColumns, GeneratorStepOutput\n\nclass MyGeneratorStep(GeneratorStep):\n    instructions: List[str]\n\n    @override\n    def process(self, offset: int = 0) -> \"GeneratorStepOutput\":\n        ...\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        ...\n

The @step decorator will take care of the boilerplate code, and will allow you to define the outputs, and process methods in a more straightforward way. One downside is that it won't let you access the self attributes, if any, nor set them, so if you need to access or set any attribute, you should go with the first approach of defining the custom GeneratorStep subclass.

from typing import TYPE_CHECKING\nfrom distilabel.steps import step\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import GeneratorStepOutput\n\n@step(outputs=[...], step_type=\"generator\")\ndef CustomGeneratorStep(offset: int = 0) -> \"GeneratorStepOutput\":\n    yield (\n        ...,\n        True if offset == 10 else False,\n    )\n\nstep = CustomGeneratorStep(name=\"my-step\")\n
"},{"location":"sections/how_to_guides/basic/step/global_step/","title":"GlobalStep","text":"

The GlobalStep is a subclass of Step that is used to define a step that requires the previous steps to be completed to run, since it will wait until all the input batches are received before running. This step is useful when you need to run a step that requires all the input data to be processed before running. Alternatively, it can also be used as a standalone.

"},{"location":"sections/how_to_guides/basic/step/global_step/#defining-custom-globalsteps","title":"Defining custom GlobalSteps","text":"

We can define a custom step by creating a new subclass of the GlobalStep and defining the following:

  • inputs: is a property that returns a list of strings with the names of the required input fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not.

  • outputs: is a property that returns a list of strings with the names of the output fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not.

  • process: is a method that receives the input data and returns the output data, and it should be a generator, meaning that it should yield the output data.

Note

The default signature for the process method is process(self, *inputs: StepInput) -> StepOutput. The argument inputs should be respected, no more arguments can be provided, and the type-hints and return type-hints should be respected too because it should be able to receive any number of inputs by default i.e. more than one Step at a time could be connected to the current one.

Warning

For the custom GlobalStep subclasses to work properly with distilabel and with the validation and serialization performed by default over each Step in the Pipeline, the type-hint for both StepInput and StepOutput should be used and not surrounded with double-quotes or imported under typing.TYPE_CHECKING, otherwise, the validation and/or serialization will fail.

Inherit from GlobalStepUsing the @step decorator

We can inherit from the GlobalStep class and define the inputs, outputs, and process methods as follows:

from typing import TYPE_CHECKING\nfrom distilabel.steps import GlobalStep, StepInput\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepColumns, StepOutput\n\nclass CustomStep(Step):\n    @property\n    def inputs(self) -> \"StepColumns\":\n        ...\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        ...\n\n    def process(self, *inputs: StepInput) -> StepOutput:\n        for upstream_step_inputs in inputs:\n            for item in input:\n                ...\n            yield item\n\n    # When overridden (ideally under the `typing_extensions.override` decorator)\n    # @typing_extensions.override\n    # def process(self, inputs: StepInput) -> StepOutput:\n    #     for input in inputs:\n    #         ...\n    #     yield inputs\n

The @step decorator will take care of the boilerplate code, and will allow you to define the inputs, outputs, and process methods in a more straightforward way. One downside is that it won't let you access the self attributes, if any, nor set them, so if you need to access or set any attribute, you should go with the first approach of defining the custom GlobalStep subclass.

from typing import TYPE_CHECKING\nfrom distilabel.steps import StepInput, step\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepOutput\n\n@step(inputs=[...], outputs=[...], step_type=\"global\")\ndef CustomStep(inputs: StepInput) -> \"StepOutput\":\n    for input in inputs:\n        ...\n    yield inputs\n\nstep = CustomStep(name=\"my-step\")\n
"},{"location":"sections/how_to_guides/basic/task/","title":"Tasks for generating and judging with LLMs","text":""},{"location":"sections/how_to_guides/basic/task/#working-with-tasks","title":"Working with Tasks","text":"

The Task is a special kind of Step that includes the LLM as a mandatory argument. As with a Step, it is normally used within a Pipeline but can also be used standalone.

For example, the most basic task is the TextGeneration task, which generates text based on a given instruction.

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import TextGeneration\n\ntask = TextGeneration(\n    name=\"text-generation\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    ),\n)\ntask.load()\n\nnext(task.process([{\"instruction\": \"What's the capital of Spain?\"}]))\n# [\n#     {\n#         'instruction': \"What's the capital of Spain?\",\n#         'generation': 'The capital of Spain is Madrid.',\n#         'distilabel_metadata': {\n#               'raw_output_text-generation': 'The capital of Spain is Madrid.',\n#               'raw_input_text-generation': [\n#                   {'role': 'user', 'content': \"What's the capital of Spain?\"}\n#               ]\n#         },\n#         'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'\n#     }\n# ]\n

Note

The Step.load() always needs to be executed when being used as a standalone. Within a pipeline, this will be done automatically during pipeline execution.

As shown above, the TextGeneration task adds a generation based on the instruction.

Tip

Since version 1.2.0, we provide some metadata about the LLM call through distilabel_metadata. This can be disabled by setting the add_raw_output attribute to False when creating the task.

Additionally, since version 1.4.0, the formatted input can also be included, which can be helpful when testing custom templates (testing the pipeline using the dry_run method).

disable raw input and output
task = TextGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    add_raw_output=False,\n    add_raw_input=False\n)\n
"},{"location":"sections/how_to_guides/basic/task/#taskprint","title":"Task.print","text":"

Info

New since version 1.4.0: the Task.print method.

The Tasks include a handy method to show what the prompt formatted for an LLM would look like, let's see an example with UltraFeedback, but it applies to any other Task.

from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\nuf = UltraFeedback(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n)\nuf.load()\nuf.print()\n

The result will be a rendered prompt, with the System prompt (if contained for the task) and the User prompt, rendered with rich (it will show exactly the same in a jupyter notebook).

In case you want to test with a custom input, you can pass an example to the task's format_input method (or generate it on your own depending on the task), and pass it to the print method so that it shows your example:

uf.print(\n    uf.format_input({\"instruction\": \"test\", \"generations\": [\"1\", \"2\"]})\n)\n
Using a DummyLLM to avoid loading one

In case you don't want to load an LLM to render the template, you can create a dummy one like the ones we could use for testing.

from distilabel.models import LLM\nfrom distilabel.models.mixins import MagpieChatTemplateMixin\n\nclass DummyLLM(AsyncLLM, MagpieChatTemplateMixin):\n    structured_output: Any = None\n    magpie_pre_query_template: str = \"llama3\"\n\n    def load(self) -> None:\n        pass\n\n    @property\n    def model_name(self) -> str:\n        return \"test\"\n\n    def generate(\n        self, input: \"FormattedInput\", num_generations: int = 1\n    ) -> \"GenerateOutput\":\n        return [\"output\" for _ in range(num_generations)]\n

You can use this LLM just as any of the other ones to load your task and call print:

uf = UltraFeedback(llm=DummyLLM())\nuf.load()\nuf.print()\n

Note

When creating a custom task, the print method will be available by default, but it is limited to the most common scenarios for the inputs. If you test your new task and find it's not working as expected (for example, if your task contains one input consisting of a list of texts instead of a single one), you should override the _sample_input method. You can inspect the UltraFeedback source code for this.

"},{"location":"sections/how_to_guides/basic/task/#specifying-the-number-of-generations-and-grouping-generations","title":"Specifying the number of generations and grouping generations","text":"

All the Tasks have a num_generations attribute that allows defining the number of generations that we want to have per input. We can update the example above to generate 3 completions per input:

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import TextGeneration\n\ntask = TextGeneration(\n    name=\"text-generation\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    ),\n    num_generations=3,\n)\ntask.load()\n\nnext(task.process([{\"instruction\": \"What's the capital of Spain?\"}]))\n# [\n#     {\n#         'instruction': \"What's the capital of Spain?\",\n#         'generation': 'The capital of Spain is Madrid.',\n#         'distilabel_metadata': {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n#         'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'\n#     },\n#     {\n#         'instruction': \"What's the capital of Spain?\",\n#         'generation': 'The capital of Spain is Madrid.',\n#         'distilabel_metadata': {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n#         'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'\n#     },\n#     {\n#         'instruction': \"What's the capital of Spain?\",\n#         'generation': 'The capital of Spain is Madrid.',\n#         'distilabel_metadata': {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n#         'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'\n#     }\n# ]\n

In addition, we might want to group the generations in a single output row as maybe one downstream step expects a single row with multiple generations. We can achieve this by setting the group_generations attribute to True:

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import TextGeneration\n\ntask = TextGeneration(\n    name=\"text-generation\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    ),\n    num_generations=3,\n    group_generations=True\n)\ntask.load()\n\nnext(task.process([{\"instruction\": \"What's the capital of Spain?\"}]))\n# [\n#     {\n#         'instruction': \"What's the capital of Spain?\",\n#         'generation': ['The capital of Spain is Madrid.', 'The capital of Spain is Madrid.', 'The capital of Spain is Madrid.'],\n#         'distilabel_metadata': [\n#             {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n#             {'raw_output_text-generation': 'The capital of Spain is Madrid.'},\n#             {'raw_output_text-generation': 'The capital of Spain is Madrid.'}\n#         ],\n#         'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'\n#     }\n# ]\n
"},{"location":"sections/how_to_guides/basic/task/#defining-custom-tasks","title":"Defining custom Tasks","text":"

We can define a custom task by creating a new subclass of the Task and defining the following:

  • inputs: is a property that returns a list of strings with the names of the required input fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not.

  • format_input: is a method that receives a dictionary with the input data and returns a ChatType following the chat-completion OpenAI message formatting.

  • outputs: is a property that returns a list of strings with the names of the output fields or a dictionary in which the keys are the names of the columns and the values are boolean indicating whether the column is required or not. This property should always include model_name as one of the outputs since that's automatically injected from the LLM.

  • format_output: is a method that receives the output from the LLM and optionally also the input data (which may be useful to build the output in some scenarios), and returns a dictionary with the output data formatted as needed i.e. with the values for the columns in outputs. Note that there's no need to include the model_name in the output.

Inherit from TaskUsing the @task decorator

When using the Task class inheritance method for creating a custom task, we can also optionally override the Task.process method to define a more complex processing logic involving an LLM, as the default one just calls the LLM.generate method once previously formatting the input and subsequently formatting the output. For example, EvolInstruct task overrides this method to call the LLM.generate multiple times (one for each evolution).

from typing import Any, Dict, List, Union, TYPE_CHECKING\n\nfrom distilabel.steps.tasks import Task\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepColumns\n    from distilabel.steps.tasks.typing import ChatType\n\n\nclass MyCustomTask(Task):\n    @property\n    def inputs(self) -> \"StepColumns\":\n        return [\"input_field\"]\n\n    def format_input(self, input: Dict[str, Any]) -> \"ChatType\":\n        return [\n            {\n                \"role\": \"user\",\n                \"content\": input[\"input_field\"],\n            },\n        ]\n\n    @property\n    def outputs(self) -> \"StepColumns\":\n        return [\"output_field\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        return {\"output_field\": output}\n

If your task just needs a system prompt, a user message template and a way to format the output given by the LLM, then you can use the @task decorator to avoid writing too much boilerplate code.

from typing import Any, Dict, Union\nfrom distilabel.steps.tasks import task\n\n\n@task(inputs=[\"input_field\"], outputs=[\"output_field\"])\ndef MyCustomTask(output: Union[str, None], input: Union[Dict[str, Any], None] = None) -> Dict[str, Any]:\n    \"\"\"\n    ---\n    system_prompt: |\n        My custom system prompt\n\n    user_message_template: |\n        My custom user message template: {input_field}\n    ---\n    \"\"\"\n    # Format the `LLM` output here\n    return {\"output_field\": output}\n
"},{"location":"sections/how_to_guides/basic/task/generator_task/","title":"GeneratorTask that produces output","text":""},{"location":"sections/how_to_guides/basic/task/generator_task/#working-with-generatortasks","title":"Working with GeneratorTasks","text":"

The GeneratorTask is a custom implementation of a Task based on the GeneratorStep. As with a Task, it is normally used within a Pipeline but can also be used standalone.

Warning

This task is still experimental and may be subject to changes in the future.

from typing import Any, Dict, List, Union\nfrom typing_extensions import override\n\nfrom distilabel.steps.tasks.base import GeneratorTask\nfrom distilabel.steps.tasks.typing import ChatType\nfrom distilabel.steps.typing import GeneratorOutput\n\n\nclass MyCustomTask(GeneratorTask):\n    instruction: str\n\n    @override\n    def process(self, offset: int = 0) -> GeneratorOutput:\n        output = self.llm.generate(\n            inputs=[\n                [\n                    {\"role\": \"user\", \"content\": self.instruction},\n                ],\n            ],\n        )\n        output = {\"model_name\": self.llm.model_name}\n        output.update(\n            self.format_output(output=output, input=None)\n        )\n        yield output\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"output_field\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        return {\"output_field\": output}\n

We can then use it as follows:

task = MyCustomTask(\n    name=\"custom-generation\",\n    instruction=\"Tell me a joke.\",\n    llm=OpenAILLM(model=\"gpt-4\"),\n)\ntask.load()\n\nnext(task.process())\n# [{'output_field\": \"Why did the scarecrow win an award? Because he was outstanding!\", \"model_name\": \"gpt-4\"}]\n

Note

Most of the time you will need to override the default process method, as it's suited for the standard Task and not for the GeneratorTask. But within the context of the process function you can freely use the llm to generate data in any way.

Note

The Step.load() always needs to be executed when being used as a standalone. Within a pipeline, this will be done automatically during pipeline execution.

"},{"location":"sections/how_to_guides/basic/task/generator_task/#defining-custom-generatortasks","title":"Defining custom GeneratorTasks","text":"

We can define a custom generator task by creating a new subclass of the GeneratorTask and defining the following:

  • process: is a method that generates the data based on the LLM and the instruction provided within the class instance, and returns a dictionary with the output data formatted as needed i.e. with the values for the columns in outputs. Note that the inputs argument is not allowed in this function since this is a GeneratorTask. The signature only expects the offset argument, which is used to keep track of the current iteration in the generator.

  • outputs: is a property that returns a list of strings with the names of the output fields, this property should always include model_name as one of the outputs since that's automatically injected from the LLM.

  • format_output: is a method that receives the output from the LLM and optionally also the input data (which may be useful to build the output in some scenarios), and returns a dictionary with the output data formatted as needed i.e. with the values for the columns in outputs. Note that there's no need to include the model_name in the output.

from typing import Any, Dict, List, Union\n\nfrom distilabel.steps.tasks.base import GeneratorTask\nfrom distilabel.steps.tasks.typing import ChatType\n\n\nclass MyCustomTask(GeneratorTask):\n    @override\n    def process(self, offset: int = 0) -> GeneratorOutput:\n        output = self.llm.generate(\n            inputs=[\n                [{\"role\": \"user\", \"content\": \"Tell me a joke.\"}],\n            ],\n        )\n        output = {\"model_name\": self.llm.model_name}\n        output.update(\n            self.format_output(output=output, input=None)\n        )\n        yield output\n\n    @property\n    def outputs(self) -> List[str]:\n        return [\"output_field\", \"model_name\"]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any]\n    ) -> Dict[str, Any]:\n        return {\"output_field\": output}\n
"},{"location":"sections/pipeline_samples/","title":"Tutorials","text":"
  • End-to-end tutorials provide detailed step-by-step explanations and the code used for end-to-end workflows.
  • Paper implementations provide reproductions of fundamental papers in the synthetic data domain.
  • Examples don't provide explanations but simply show code for different tasks.
"},{"location":"sections/pipeline_samples/#end-to-end-tutorials","title":"End-to-end tutorials","text":"
  • Generate a preference dataset

    Learn about synthetic data generation for ORPO and DPO.

    Tutorial

  • Clean an existing preference dataset

    Learn about how to provide AI feedback to clean an existing dataset.

    Tutorial

  • Retrieval and reranking models

    Learn about synthetic data generation for fine-tuning custom retrieval and reranking models.

    Tutorial

  • Generate text classification data

    Learn about how synthetic data generation for text classification can help address data imbalance or scarcity.

    Tutorial

"},{"location":"sections/pipeline_samples/#paper-implementations","title":"Paper Implementations","text":"
  • Deepseek Prover

    Learn about an approach to generate mathematical proofs for theorems generated from informal math problems.

    Example

  • DEITA

    Learn about prompt, response tuning for complexity and quality and LLMs as judges for automatic data selection.

    Paper

  • Instruction Backtranslation

    Learn about automatically labeling human-written text with corresponding instructions.

    Paper

  • Prometheus 2

    Learn about using open-source models as judges for direct assessment and pair-wise ranking.

    Paper

  • UltraFeedback

    Learn about a large-scale, fine-grained, diverse preference dataset, used for training powerful reward and critic models.

    Paper

  • APIGen

    Learn how to create verifiable high-quality datasets for function-calling applications.

    Paper

  • CLAIR

    Learn about Contrastive Learning from AI Revisions (CLAIR), a data-creation method which leads to more contrastive preference pairs.

    Paper

"},{"location":"sections/pipeline_samples/#examples","title":"Examples","text":"
  • Benchmarking with distilabel

    Learn about reproducing the Arena Hard benchmark with distilabel.

    Example

  • Structured generation with outlines

    Learn about generating RPG characters following a pydantic.BaseModel with outlines in distilabel.

    Example

  • Structured generation with instructor

    Learn about answering instructions with knowledge graphs defined as pydantic.BaseModel objects using instructor in distilabel.

    Example

  • Create a social network with FinePersonas

    Learn how to leverage FinePersonas to create a synthetic social network and fine-tune adapters for Multi-LoRA.

    Example

"},{"location":"sections/pipeline_samples/examples/benchmarking_with_distilabel/","title":"Benchmarking with distilabel","text":"

Benchmark LLMs with distilabel: reproducing the Arena Hard benchmark.

The script below first defines both the ArenaHard and the ArenaHardResults tasks, so as to generate responses for a given collection of prompts/questions with up to two LLMs, and then calculate the results as per the original implementation, respectively. Additionally, the second part of the example builds a Pipeline to run the generation on top of the prompts with InferenceEndpointsLLM while streaming the rest of the generations from a pre-computed set of GPT-4 generations, and then evaluate one against the other with OpenAILLM generating an alternate response, a comparison between the responses, and a result as A>>B, A>B, B>A, B>>A, or tie.

To run this example you will first need to install the Arena Hard optional dependencies, being pandas, scikit-learn, and numpy.

Run
python examples/arena_hard.py\n
arena_hard.py
# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport re\nfrom typing import Any, Dict, List, Optional, Union\n\nfrom typing_extensions import override\n\nfrom distilabel.steps import GlobalStep, StepInput\nfrom distilabel.steps.tasks.base import Task\nfrom distilabel.steps.tasks.typing import ChatType\nfrom distilabel.steps.typing import StepOutput\n\n\nclass ArenaHard(Task):\n    \"\"\"Evaluates two assistant responses using an LLM as judge.\n\n    This `Task` is based on the \"From Live Data to High-Quality Benchmarks: The\n    Arena-Hard Pipeline\" paper that presents Arena Hard, which is a benchmark for\n    instruction-tuned LLMs that contains 500 challenging user queries. 
GPT-4 is used\n    as the judge to compare the model responses against a baseline model, which defaults\n    to `gpt-4-0314`.\n\n    Note:\n        Arena-Hard-Auto has the highest correlation and separability to Chatbot Arena\n        among popular open-ended LLM benchmarks.\n\n    Input columns:\n        - instruction (`str`): The instruction to evaluate the responses.\n        - generations (`List[str]`): The responses generated by two, and only two, LLMs.\n\n    Output columns:\n        - evaluation (`str`): The evaluation of the responses generated by the LLMs.\n        - score (`str`): The score extracted from the evaluation.\n        - model_name (`str`): The model name used to generate the evaluation.\n\n    Categories:\n        - benchmark\n\n    References:\n        - [From Live Data to High-Quality Benchmarks: The Arena-Hard Pipeline](https://lmsys.org/blog/2024-04-19-arena-hard/)\n        - [`arena-hard-auto`](https://github.com/lm-sys/arena-hard-auto/tree/main)\n\n    Examples:\n\n        Evaluate two assistant responses for a given instruction using Arean Hard prompts:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps import GroupColumns, LoadDataFromDicts\n        from distilabel.steps.tasks import ArenaHard, TextGeneration\n\n        with Pipeline() as pipeline:\n            load_data = LoadDataFromDicts(\n                data=[{\"instruction\": \"What is the capital of France?\"}],\n            )\n\n            text_generation_a = TextGeneration(\n                llm=...,  # LLM instance\n                output_mappings={\"model_name\": \"generation_model\"},\n            )\n\n            text_generation_b = TextGeneration(\n                llm=...,  # LLM instance\n                output_mappings={\"model_name\": \"generation_model\"},\n            )\n\n            combine = GroupColumns(\n                columns=[\"generation\", \"generation_model\"],\n                
output_columns=[\"generations\", \"generation_models\"],\n            )\n\n            arena_hard = ArenaHard(\n                llm=...,  # LLM instance\n            )\n\n            load_data >> [text_generation_a, text_generation_b] >> combine >> arena_hard\n        ```\n    \"\"\"\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The inputs required by this task are the `instruction` and the `generations`,\n        which are the responses generated by two, and only two, LLMs.\"\"\"\n        return [\"instruction\", \"generations\"]\n\n    def format_input(self, input: Dict[str, Any]) -> ChatType:\n        \"\"\"This method formats the input data as a `ChatType` using the prompt defined\n        by the Arena Hard benchmark, which consists on a `system_prompt` plus a template\n        for the user first message that contains the `instruction` and both `generations`.\n        \"\"\"\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": \"Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\\n\\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\\n\\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\\n\\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. 
Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\\n\\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\\n\\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\\n\\n1. Assistant A is significantly better: [[A>>B]]\\n2. Assistant A is slightly better: [[A>B]]\\n3. Tie, relatively the same: [[A=B]]\\n4. Assistant B is slightly better: [[B>A]]\\n5. Assistant B is significantly better: [[B>>A]]\\n\\nExample output: \\\"My final verdict is tie: [[A=B]]\\\".\",\n            },\n            {\n                \"role\": \"user\",\n                \"content\": f\"<|User Prompt|>\\n{input['instruction']}\\n\\n<|The Start of Assistant A's Answer|>\\n{input['generations'][0]}\\n<|The End of Assistant A's Answer|>\\n\\n<|The Start of Assistant B's Answer|>\\n{input['generations'][1]}\\n<|The End of Assistant B's Answer|>\",\n            },\n        ]\n\n    @property\n    def outputs(self) -> List[str]:\n        \"\"\"The outputs generated by this task are the `evaluation`, the `score` and\n        the `model_name` (which is automatically injected within the `process` method\n        of the parent task).\"\"\"\n        return [\"evaluation\", \"score\", \"model_name\"]\n\n    def format_output(\n        self,\n        output: Union[str, None],\n        input: Union[Dict[str, Any], None] = None,\n    ) -> Dict[str, Any]:\n        \"\"\"This method formats the output generated by the LLM as a Python dictionary\n        containing the `evaluation` which is the raw output generated by the LLM (consisting\n        of the judge LLM alternate generation for the given instruction, plus an explanation\n        on the 
evaluation of the given responses; plus the `score` extracted from the output.\n\n        Args:\n            output: the raw output of the LLM.\n            input: the input to the task. Is provided in case it needs to be used to enrich\n                the output if needed.\n\n        Returns:\n            A dict with the keys `evaluation` with the raw output which contains the LLM\n            evaluation and the extracted `score` if possible.\n        \"\"\"\n        if output is None:\n            return {\"evaluation\": None, \"score\": None}\n        pattern = re.compile(r\"\\[\\[([AB<>=]+)\\]\\]\")\n        match = pattern.search(output)\n        if match is None:\n            return {\"evaluation\": output, \"score\": None}\n        return {\"evaluation\": output, \"score\": match.group(1)}\n\n\nclass ArenaHardResults(GlobalStep):\n    \"\"\"Process Arena Hard results to calculate the ELO scores.\n\n    This `Step` is based on the \"From Live Data to High-Quality Benchmarks: The\n    Arena-Hard Pipeline\" paper that presents Arena Hard, which is a benchmark for\n    instruction-tuned LLMs that contains 500 challenging user queries. 
This step is\n    a `GlobalStep` that should run right after the `ArenaHard` task to calculate the\n    ELO scores for the evaluated models.\n\n    Note:\n        Arena-Hard-Auto has the highest correlation and separability to Chatbot Arena\n        among popular open-ended LLM benchmarks.\n\n    Input columns:\n        - evaluation (`str`): The evaluation of the responses generated by the LLMs.\n        - score (`str`): The score extracted from the evaluation.\n\n    References:\n        - [From Live Data to High-Quality Benchmarks: The Arena-Hard Pipeline](https://lmsys.org/blog/2024-04-19-arena-hard/)\n        - [`arena-hard-auto`](https://github.com/lm-sys/arena-hard-auto/tree/main)\n\n    Examples:\n\n        Rate the ELO scores for two assistant responses for a given an evaluation / comparison between both using Arean Hard prompts:\n\n        ```python\n        from distilabel.pipeline import Pipeline\n        from distilabel.steps import GroupColumns, LoadDataFromDicts\n        from distilabel.steps.tasks import ArenaHard, TextGeneration\n\n        with Pipeline() as pipeline:\n            load_data = LoadDataFromDicts(\n                data=[{\"instruction\": \"What is the capital of France?\"}],\n            )\n\n            text_generation_a = TextGeneration(\n                llm=...,  # LLM instance\n                output_mappings={\"model_name\": \"generation_model\"},\n            )\n\n            text_generation_b = TextGeneration(\n                llm=...,  # LLM instance\n                output_mappings={\"model_name\": \"generation_model\"},\n            )\n\n            combine = GroupColumns(\n                columns=[\"generation\", \"generation_model\"],\n                output_columns=[\"generations\", \"generation_models\"],\n            )\n\n            arena_hard = ArenaHard(\n                llm=...,  # LLM instance\n            )\n\n            arena_hard_results = ArenaHardResults(\n                
custom_model_column=\"generation_models\",\n                custom_weights={\"A>B\": 1, \"A>>B\": 3, \"B>A\": 1, \"B>>A\": 3},\n            )\n\n            load_data >> [text_generation_a, text_generation_b] >> combine >> arena_hard >> arena_hard_results\n        ```\n\n    \"\"\"\n\n    custom_model_column: Optional[str] = None\n    custom_weights: Dict[str, int] = {\"A>B\": 1, \"A>>B\": 3, \"B>A\": 1, \"B>>A\": 3}\n\n    def load(self) -> None:\n        \"\"\"Ensures that the required dependencies are installed.\"\"\"\n        super().load()\n\n        try:\n            import numpy as np  # noqa: F401\n            import pandas as pd  # noqa: F401\n            from sklearn.linear_model import LogisticRegression  # noqa: F401\n        except ImportError as e:\n            raise ImportError(\n                \"In order to run `ArenaHardResults`, the `arena-hard` extra dependencies\"\n                \" must be installed i.e. `numpy`, `pandas`, and `scikit-learn`.\\n\"\n                \"Please install the dependencies by running `pip install distilabel[arena-hard]`.\"\n            ) from e\n\n    # TODO: the `evaluation` is not really required as an input, so it could be removed, since\n    # only `score` is used / required\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The inputs required by this step are the `evaluation` and the `score` generated\n        by the `ArenaHard` task. 
Since this step does use the identifiers `model_a` and `model_b`,\n        optionally one can set `custom_model_column` to use the model names if existing within\n        the input data, ideally this value should be `model_name` if connected from the `ArenaHard`\n        step.\"\"\"\n        columns = [\"evaluation\", \"score\"]\n        if self.custom_model_column:\n            columns.append(self.custom_model_column)\n        return columns\n\n    @override\n    def process(self, inputs: StepInput) -> StepOutput:  # type: ignore\n        \"\"\"This method processes the inputs generated by the `ArenaHard` task to calculate the\n        win rates for each of the models to evaluate. Since this step inherits from the `GlobalStep`,\n        it will wait for all the input batches to be processed, and then the output will be yielded in\n        case there's a follow up step, since this step won't modify the received inputs.\n\n        Args:\n            inputs: A list of Python dictionaries with the inputs of the task.\n\n        Yields:\n            A list of Python dictionaries with the outputs of the task.\n\n        References:\n            - https://github.com/lm-sys/arena-hard-auto/blob/main/show_result.py\n        \"\"\"\n        import numpy as np\n        import pandas as pd\n        from sklearn.linear_model import LogisticRegression\n\n        models = [\"A\", \"B\"]\n        if self.custom_model_column:\n            models = inputs[0][self.custom_model_column]\n\n        # TODO: the battles are only calculated for the first game, even though the official\n        # implementation also covers the possibility of a second game (not within the released\n        # dataset yet)\n        battles = pd.DataFrame()\n        for input in inputs:\n            output = {\n                # TODO: \"question_id\": input[\"question_id\"],\n                \"model_a\": models[0],\n                \"model_b\": models[1],\n            }\n            if input[\"score\"] in 
[\"A>B\", \"A>>B\"]:\n                output[\"winner\"] = models[0]\n                rows = [output] * self.custom_weights[input[\"score\"]]\n            elif input[\"score\"] in [\"B>A\", \"B>>A\"]:\n                output[\"winner\"] = models[1]\n                rows = [output] * self.custom_weights[input[\"score\"]]\n            elif input[\"score\"] == \"A=B\":\n                output[\"winner\"] = \"tie\"\n                rows = [output]\n            else:\n                continue\n\n            battles = pd.concat([battles, pd.DataFrame(rows)])\n\n        models = pd.concat([battles[\"model_a\"], battles[\"model_b\"]]).unique()\n        models = pd.Series(np.arange(len(models)), index=models)\n\n        battles = pd.concat([battles, battles], ignore_index=True)\n        p = len(models.index)\n        n = battles.shape[0]\n\n        X = np.zeros([n, p])\n        X[np.arange(n), models[battles[\"model_a\"]]] = +np.log(10)\n        X[np.arange(n), models[battles[\"model_b\"]]] = -np.log(10)\n\n        Y = np.zeros(n)\n        Y[battles[\"winner\"] == \"model_a\"] = 1.0\n\n        tie_idx = battles[\"winner\"] == \"tie\"\n        tie_idx[len(tie_idx) // 2 :] = False\n        Y[tie_idx] = 1.0\n\n        lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)  # type: ignore\n        lr.fit(X, Y)\n\n        # The ELO scores are calculated assuming that the reference is `gpt-4-0314`\n        # with an starting ELO of 1000, so that the evaluated models are compared with\n        # `gtp-4-0314` only if it's available within the models\n        elo_scores = 400 * lr.coef_[0] + 1000\n        # TODO: we could parametrize the reference / anchor model, but left as is to be faithful to the\n        # original implementation\n        if \"gpt-4-0314\" in models.index:\n            elo_scores += 1000 - elo_scores[models[\"gpt-4-0314\"]]\n\n        output = pd.Series(elo_scores, index=models.index).sort_values(ascending=False)\n        self._logger.info(f\"Arena 
Hard ELO: {output}\")\n\n        # Here only so that if follow up steps are connected the inputs are preserved,\n        # since this step doesn't modify nor generate new inputs\n        yield inputs\n\n\nif __name__ == \"__main__\":\n    import json\n\n    from distilabel.models import InferenceEndpointsLLM, OpenAILLM\n    from distilabel.pipeline import Pipeline\n    from distilabel.steps import (\n        GroupColumns,\n        KeepColumns,\n        LoadDataFromHub,\n        StepInput,\n        step,\n    )\n    from distilabel.steps.tasks import TextGeneration\n    from distilabel.steps.typing import StepOutput\n\n    @step(inputs=[\"turns\"], outputs=[\"system_prompt\", \"instruction\"])\n    def PrepareForTextGeneration(*inputs: StepInput) -> StepOutput:\n        for input in inputs:\n            for item in input:\n                item[\"system_prompt\"] = \"You are a helpful assistant.\"\n                item[\"instruction\"] = item[\"turns\"][0][\"content\"]\n            yield input\n\n    @step(\n        inputs=[\"question_id\"],\n        outputs=[\"generation\", \"generation_model\"],\n        step_type=\"global\",\n    )\n    def LoadReference(*inputs: StepInput) -> StepOutput:\n        # File downloaded from https://raw.githubusercontent.com/lm-sys/arena-hard-auto/e0a8ea1df42c1df76451a6cd04b14e31ff992b87/data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl\n        lines = open(\"gpt-4-0314.jsonl\", mode=\"r\").readlines()\n        for input in inputs:\n            for item in input:\n                for line in lines:\n                    data = json.loads(line)\n                    if data[\"question_id\"] == item[\"question_id\"]:\n                        item[\"generation\"] = data[\"choices\"][0][\"turns\"][0][\"content\"]\n                        item[\"generation_model\"] = data[\"model_id\"]\n                        break\n            yield input\n\n    with Pipeline(name=\"arena-hard-v0.1\") as pipeline:\n        load_dataset = 
LoadDataFromHub(\n            name=\"load_dataset\",\n            repo_id=\"alvarobartt/lmsys-arena-hard-v0.1\",\n            split=\"test\",\n            num_examples=5,\n        )\n\n        load_reference = LoadReference(name=\"load_reference\")\n\n        prepare = PrepareForTextGeneration(name=\"prepare\")\n\n        text_generation_cohere = TextGeneration(\n            name=\"text_generation_cohere\",\n            llm=InferenceEndpointsLLM(\n                model_id=\"CohereForAI/c4ai-command-r-plus\",\n                tokenizer_id=\"CohereForAI/c4ai-command-r-plus\",\n            ),\n            use_system_prompt=True,\n            input_batch_size=10,\n            output_mappings={\"model_name\": \"generation_model\"},\n        )\n\n        combine_columns = GroupColumns(\n            name=\"combine_columns\",\n            columns=[\"generation\", \"generation_model\"],\n            output_columns=[\"generations\", \"generation_models\"],\n        )\n\n        arena_hard = ArenaHard(\n            name=\"arena_hard\",\n            llm=OpenAILLM(model=\"gpt-4-1106-preview\"),\n            output_mappings={\"model_name\": \"evaluation_model\"},\n        )\n\n        keep_columns = KeepColumns(\n            name=\"keep_columns\",\n            columns=[\n                \"question_id\",\n                \"category\",\n                \"cluster\",\n                \"system_prompt\",\n                \"instruction\",\n                \"generations\",\n                \"generation_models\",\n                \"evaluation\",\n                \"score\",\n                \"evaluation_model\",\n            ],\n        )\n\n        win_rates = ArenaHardResults(\n            name=\"win_rates\", custom_model_column=\"generation_models\"\n        )\n\n        load_dataset >> load_reference  # type: ignore\n        load_dataset >> prepare >> text_generation_cohere  # type: ignore\n        (  # type: ignore\n            [load_reference, text_generation_cohere]\n            >> 
combine_columns\n            >> arena_hard\n            >> keep_columns\n            >> win_rates\n        )\n\n        distiset = pipeline.run(\n            parameters={  # type: ignore\n                text_generation_cohere.name: {\n                    \"llm\": {\n                        \"generation_kwargs\": {\n                            \"temperature\": 0.7,\n                            \"max_new_tokens\": 4096,\n                            \"stop_sequences\": [\"<EOS_TOKEN>\", \"<|END_OF_TURN_TOKEN|>\"],\n                        }\n                    }\n                },\n                arena_hard.name: {\n                    \"llm\": {\n                        \"generation_kwargs\": {\n                            \"temperature\": 0.0,\n                            \"max_new_tokens\": 4096,\n                        }\n                    }\n                },\n            },\n        )\n        if distiset is not None:\n            distiset.push_to_hub(\"arena-hard-results\")\n
"},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/","title":"Create a social network with FinePersonas","text":"

In this example, we'll explore the creation of specialized user personas for social network interactions using the FinePersonas-v0.1 dataset from Hugging Face. The final dataset will be ready to fine-tune a chat model with specific traits and characteristics.

"},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#introduction","title":"Introduction","text":"

We'll delve into the process of fine-tuning different LoRA (Low-Rank Adaptation) models to imbue these personas with specific traits and characteristics.

This approach draws inspiration from Michael Sayman's work on SocialAI (visit the profile to see some examples), to leverage FinePersonas-v0.1 for building models that can emulate bots with specific behaviour.

By fine-tuning these adapters, we can potentially create AI personas with distinct characteristics, communication styles, and areas of expertise. The result? AI interactions that feel more natural and tailored to specific contexts or user needs. For those interested in the technical aspects of this approach, we recommend the insightful blog post on Multi-LoRA serving. It provides a clear and comprehensive explanation of the technology behind this innovative method.

Let's jump to the demo.

"},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#creating-our-socialai-task","title":"Creating our SocialAI Task","text":"

Building on the new TextGeneration, creating custom tasks is easier than ever before. This powerful tool opens up a world of possibilities for creating tailored text-based content with ease and precision. We will create a SocialAI task that will be in charge of generating responses to user interactions, taking into account a given follower_type, and use the perspective from a given persona:

from distilabel.steps.tasks import TextGeneration\n\nclass SocialAI(TextGeneration):\n    follower_type: Literal[\"supporter\", \"troll\", \"alarmist\"] = \"supporter\"\n    system_prompt: str = (\n        \"You are an AI assistant expert at simulating user interactions. \"\n        \"You must answer as if you were a '{follower_type}', be concise answer with no more than 200 characters, nothing else.\"\n        \"Here are some traits to use for your personality:\\n\\n\"\n        \"{traits}\"\n    )  #\u00a0(1)\n    template: str = \"You are the folowing persona:\\n\\n{{ persona }}\\n\\nWhat would you say to the following?\\n\\n {{ post }}\"  # (2)\n    columns: str | list[str] = [\"persona\", \"post\"]  # (3)\n\n    _follower_traits: dict[str, str] = {\n        \"supporter\": (\n            \"- Encouraging and positive\\n\"\n            \"- Tends to prioritize enjoyment and relaxation\\n\"\n            \"- Focuses on the present moment and short-term pleasure\\n\"\n            \"- Often uses humor and playful language\\n\"\n            \"- Wants to help others feel good and have fun\\n\"\n        ),\n        \"troll\": (\n            \"- Provocative and confrontational\\n\"\n            \"- Enjoys stirring up controversy and conflict\\n\"\n            \"- Often uses sarcasm, irony, and mocking language\\n\"\n            \"- Tends to belittle or dismiss others' opinions and feelings\\n\"\n            \"- Seeks to get a rise out of others and create drama\\n\"\n        ),\n        \"alarmist\": (\n            \"- Anxious and warning-oriented\\n\"\n            \"- Focuses on potential risks and negative consequences\\n\"\n            \"- Often uses dramatic or sensational language\\n\"\n            \"- Tends to be serious and stern in tone\\n\"\n            \"- Seeks to alert others to potential dangers and protect them from harm (even if it's excessive or unwarranted)\\n\"\n        ),\n    }\n\n    def load(self) -> None:\n        super().load()\n        
self.system_prompt = self.system_prompt.format(\n            follower_type=self.follower_type,\n            traits=self._follower_traits[self.follower_type]\n        )  # (4)\n
  1. We have a custom system prompt that will depend on the follower_type we decide for our model.

  2. The base template or prompt will answer the post we have, from the point of view of a persona.

  3. We will need our dataset to have both persona and post columns to populate the prompt.

  4. In the load method we place the specific traits for our follower type in the system prompt.

"},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#data-preparation","title":"Data preparation","text":"

This is an example, so let's keep it short. We will use 3 posts, and 3 different types of personas. While there's potential to enhance this process (perhaps by implementing random persona selection or leveraging semantic similarity) we'll opt for a straightforward method in this demonstration.

Our goal is to create a set of nine examples, each pairing a post with a persona. To achieve this, we'll employ an LLM to respond to each post from the perspective of a specific persona, effectively simulating how different characters might engage with the content.

posts = [\n    {\n        \"post\": \"Hmm, ok now I'm torn: should I go for healthy chicken tacos or unhealthy beef tacos for late night cravings?\"\n    },\n    {\n        \"post\": \"I need to develop a training course for my company on communication skills. Need to decide how deliver it remotely.\"\n    },\n    {\n        \"post\": \"I'm always 10 minutes late to meetups but no one's complained. Could this be annoying to them?\"\n    },\n]\n\npersonas = (\n    load_dataset(\"argilla/FinePersonas-v0.1-clustering-100k\", split=\"train\")\n    .shuffle()\n    .select(range(3))\n    .select_columns(\"persona\")\n    .to_list()\n)\n\ndata = []\nfor post in posts:\n    for persona in personas:\n        data.append({\"post\": post[\"post\"], \"persona\": persona[\"persona\"]})\n

Each row will have the following format:

import json\nprint(json.dumps(data[0], indent=4))\n{\n    \"post\": \"Hmm, ok now I'm torn: should I go for healthy chicken tacos or unhealthy beef tacos for late night cravings?\",\n    \"persona\": \"A high school or college environmental science teacher or an ecology student specializing in biogeography and ecosystem dynamics.\"\n}\n

This will be our dataset, that we can ingest using the LoadDataFromDicts:

loader = LoadDataFromDicts(data=data)\n
"},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#simulating-from-different-types-of-followers","title":"Simulating from different types of followers","text":"

With our data in hand, we're ready to explore the capabilities of our SocialAI task. For this demonstration, we'll make use of meta-llama/Meta-Llama-3.1-70B-Instruct. While this model has become something of a go-to choice recently, it's worth noting that experimenting with a variety of models could yield even more interesting results:

from distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    generation_kwargs={\n        \"temperature\": 0.7,\n        \"max_new_tokens\": 256,\n    },\n)\nfollower_type = \"supporter\"\n\nfollower = SocialAI(\n    llm=llm,\n    follower_type=follower_type,\n    name=f\"{follower_type}_user\",\n)\n

This setup simplifies the process: we only need to input the follower type, and the system handles the rest. We could also update this to use a random type of follower by default, and simulate a bunch of different personalities.

"},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#building-our-pipeline","title":"Building our Pipeline","text":"

The foundation of our pipeline is now in place. At its core is a single, powerful LLM. This versatile model will be repurposed to drive three distinct SocialAI Tasks, each tailored to a specific TextGeneration task, and each one of them will be prepared for Supervised Fine Tuning using FormatTextGenerationSFT:

with Pipeline(name=\"Social AI Personas\") as pipeline:\n    loader = LoadDataFromDicts(data=data, batch_size=1)\n\n    llm = InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        generation_kwargs={\n            \"temperature\": 0.7,\n            \"max_new_tokens\": 256,\n        },\n    )\n\n    for follower_type in [\"supporter\", \"troll\", \"alarmist\"]:\n        follower = SocialAI(\n            llm=llm,\n            follower_type=follower_type,\n            name=f\"{follower_type}_user\",  # (1)\n            output_mappings={\n                \"generation\": f\"interaction_{follower_type}\"  # (2)\n            }\n        )\n        format_sft = FormatTextGenerationSFT(\n            name=f\"format_sft_{follower_type}\",\n            input_mappings={\n                \"instruction\": \"post\",\n                \"generation\": f\"interaction_{follower_type}\"  # (3)\n            },\n        )\n        loader >> follower >> format_sft  # (4)\n
  1. We update the name of the step to keep track in the pipeline.

  2. The generation column from each LLM will be mapped to avoid them being overridden, as we are reusing the same task.

  3. As we have modified the output column from SocialAI, we redirect each one of the \"follower_type\" responses.

  4. Connect the loader to each one of the follower tasks and format_sft to obtain 3 different subsets.

The outcome of this pipeline will be three specialized models, each fine-tuned to a unique follower type crafted by the SocialAI task. These models will generate SFT-formatted datasets, where each post is paired with its corresponding interaction data for a specific follower type. This setup enables seamless fine-tuning using your preferred framework, such as TRL, or any other training framework of your choice.

"},{"location":"sections/pipeline_samples/examples/fine_personas_social_network/#script-and-final-dataset","title":"Script and final dataset","text":"

All the pieces are in place for our script; the full pipeline can be seen here:

Run
python examples/finepersonas_social_ai.py\n
finepersonas_social_ai.py
# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import Literal\n\nfrom datasets import load_dataset\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import FormatTextGenerationSFT, LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\n\nclass SocialAI(TextGeneration):\n    follower_type: Literal[\"supporter\", \"troll\", \"alarmist\"] = \"supporter\"\n    system_prompt: str = (\n        \"You are an AI assistant expert at simulating user interactions. 
\"\n        \"You must answer as if you were a '{follower_type}', be concise answer with no more than 200 characters, nothing else.\"\n        \"Here are some traits to use for your personality:\\n\\n\"\n        \"{traits}\"\n    )\n    template: str = \"You are the folowing persona:\\n\\n{{ persona }}\\n\\nWhat would you say to the following?\\n\\n {{ post }}\"\n    columns: str | list[str] = [\"persona\", \"post\"]\n\n    _follower_traits: dict[str, str] = {\n        \"supporter\": (\n            \"- Encouraging and positive\\n\"\n            \"- Tends to prioritize enjoyment and relaxation\\n\"\n            \"- Focuses on the present moment and short-term pleasure\\n\"\n            \"- Often uses humor and playful language\\n\"\n            \"- Wants to help others feel good and have fun\\n\"\n        ),\n        \"troll\": (\n            \"- Provocative and confrontational\\n\"\n            \"- Enjoys stirring up controversy and conflict\\n\"\n            \"- Often uses sarcasm, irony, and mocking language\\n\"\n            \"- Tends to belittle or dismiss others' opinions and feelings\\n\"\n            \"- Seeks to get a rise out of others and create drama\\n\"\n        ),\n        \"alarmist\": (\n            \"- Anxious and warning-oriented\\n\"\n            \"- Focuses on potential risks and negative consequences\\n\"\n            \"- Often uses dramatic or sensational language\\n\"\n            \"- Tends to be serious and stern in tone\\n\"\n            \"- Seeks to alert others to potential dangers and protect them from harm (even if it's excessive or unwarranted)\\n\"\n        ),\n    }\n\n    def load(self) -> None:\n        super().load()\n        self.system_prompt = self.system_prompt.format(\n            follower_type=self.follower_type,\n            traits=self._follower_traits[self.follower_type],\n        )\n\n\nposts = [\n    {\n        \"post\": \"Hmm, ok now I'm torn: should I go for healthy chicken tacos or unhealthy beef tacos for late night 
cravings?\"\n    },\n    {\n        \"post\": \"I need to develop a training course for my company on communication skills. Need to decide how deliver it remotely.\"\n    },\n    {\n        \"post\": \"I'm always 10 minutes late to meetups but no one's complained. Could this be annoying to them?\"\n    },\n]\n\npersonas = (\n    load_dataset(\"argilla/FinePersonas-v0.1-clustering-100k\", split=\"train\")\n    .shuffle()\n    .select(range(3))\n    .select_columns(\"persona\")\n    .to_list()\n)\n\ndata = []\nfor post in posts:\n    for persona in personas:\n        data.append({\"post\": post[\"post\"], \"persona\": persona[\"persona\"]})\n\n\nwith Pipeline(name=\"Social AI Personas\") as pipeline:\n    loader = LoadDataFromDicts(data=data, batch_size=1)\n\n    llm = InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        generation_kwargs={\n            \"temperature\": 0.7,\n            \"max_new_tokens\": 256,\n        },\n    )\n\n    for follower_type in [\"supporter\", \"troll\", \"alarmist\"]:\n        follower = SocialAI(\n            llm=llm,\n            follower_type=follower_type,\n            name=f\"{follower_type}_user\",\n            output_mappings={\"generation\": f\"interaction_{follower_type}\"},\n        )\n        format_sft = FormatTextGenerationSFT(\n            name=f\"format_sft_{follower_type}\",\n            input_mappings={\n                \"instruction\": \"post\",\n                \"generation\": f\"interaction_{follower_type}\",\n            },\n        )\n        loader >> follower >> format_sft\n\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(use_cache=False)\n    distiset.push_to_hub(\"plaguss/FinePersonas-SocialAI-test\", include_script=True)\n

This is the final toy dataset we obtain: FinePersonas-SocialAI-test

Here is an example of how to load one of the subsets to fine-tune a model:

from datasets import load_dataset\n\nds = load_dataset(\"plaguss/FinePersonas-SocialAI-test\", \"format_sft_troll\")\n

And a sample of the generated field with the corresponding post and persona:

{\n    \"post\": \"Hmm, ok now I\\u0027m torn: should I go for healthy chicken tacos or unhealthy beef tacos for late night cravings?\",\n    \"persona\": \"A high school or undergraduate physics or chemistry teacher, likely with a focus on experimental instruction.\",\n    \"interaction_troll\": \"\\\"Late night cravings? More like late night brain drain. Either way, it\\u0027s just a collision of molecules in your stomach. Choose the one with more calories, at least that\\u0027s some decent kinetic energy.\\\"\",\n}\n

There's a lot of room for improvement, but quite a promising start.

"},{"location":"sections/pipeline_samples/examples/llama_cpp_with_outlines/","title":"Structured generation with outlines","text":"

Generate RPG characters following a pydantic.BaseModel with outlines in distilabel.

This script makes use of LlamaCppLLM and the structured output capabilities thanks to outlines to generate RPG characters that adhere to a JSON schema.

It makes use of a local model which can be downloaded using curl (explained in the script itself), and can be exchanged with other LLMs like vLLM.

Run
python examples/structured_generation_with_outlines.py\n
structured_generation_with_outlines.py
# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom enum import Enum\nfrom pathlib import Path\n\nfrom pydantic import BaseModel, StringConstraints, conint\nfrom typing_extensions import Annotated\n\nfrom distilabel.models import LlamaCppLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\n\nclass Weapon(str, Enum):\n    sword = \"sword\"\n    axe = \"axe\"\n    mace = \"mace\"\n    spear = \"spear\"\n    bow = \"bow\"\n    crossbow = \"crossbow\"\n\n\nclass Armor(str, Enum):\n    leather = \"leather\"\n    chainmail = \"chainmail\"\n    plate = \"plate\"\n    mithril = \"mithril\"\n\n\nclass Character(BaseModel):\n    name: Annotated[str, StringConstraints(max_length=30)]\n    age: conint(gt=1, lt=3000)\n    armor: Armor\n    weapon: Weapon\n\n\n# Download the model with\n# curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nwith Pipeline(\"RPG-characters\") as pipeline:\n    system_prompt = (\n        \"You are a leading role play gamer. 
You have seen thousands of different characters and their attributes.\"\n        \" Please return a JSON object with common attributes of an RPG character.\"\n    )\n\n    load_dataset = LoadDataFromDicts(\n        name=\"load_instructions\",\n        data=[\n            {\n                \"system_prompt\": system_prompt,\n                \"instruction\": f\"Give me a character description for a {char}\",\n            }\n            for char in [\"dwarf\", \"elf\", \"human\", \"ork\"]\n        ],\n    )\n    llm = LlamaCppLLM(\n        model_path=str(Path.home() / model_path),  # type: ignore\n        n_gpu_layers=-1,\n        n_ctx=1024,\n        structured_output={\"format\": \"json\", \"schema\": Character},\n    )\n    # Change to vLLM as such:\n    # llm = vLLM(\n    #     model=\"teknium/OpenHermes-2.5-Mistral-7B\",\n    #     extra_kwargs={\"tensor_parallel_size\": 1},\n    #     structured_output={\"format\": \"json\", \"schema\": Character},\n    # )\n\n    text_generation = TextGeneration(\n        name=\"text_generation_rpg\",\n        llm=llm,\n        input_batch_size=8,\n        output_mappings={\"model_name\": \"generation_model\"},\n    )\n    load_dataset >> text_generation\n\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        parameters={\n            text_generation.name: {\n                \"llm\": {\"generation_kwargs\": {\"max_new_tokens\": 256}}\n            }\n        },\n        use_cache=False,\n    )\n    for num, character in enumerate(distiset[\"default\"][\"train\"][\"generation\"]):\n        print(f\"Character: {num}\")\n        print(character)\n\n# Character: 0\n# {\n# \"name\": \"Gimli\",\n# \"age\": 42,\n# \"armor\": \"plate\",\n# \"weapon\": \"axe\" }\n# Character: 1\n# {\"name\":\"Gaelen\",\"age\":600,\"armor\":\"leather\",\"weapon\":\"bow\"}\n# Character: 2\n# {\"name\": \"John Smith\",\"age\": 35,\"armor\": \"leather\",\"weapon\": \"sword\"}\n# Character: 3\n# { \"name\": \"Grug\", \"age\": 35, \"armor\": 
\"leather\", \"weapon\": \"axe\"}\n
"},{"location":"sections/pipeline_samples/examples/mistralai_with_instructor/","title":"Structured generation with instructor","text":"

Answer instructions with knowledge graphs defined as pydantic.BaseModel objects using instructor in distilabel.

This script makes use of MistralLLM and the structured output capabilities thanks to instructor to generate knowledge graphs from complex topics.

This example is translated from this awesome example from instructor cookbook.

Run
python examples/structured_generation_with_instructor.py\n
structured_generation_with_instructor.py
# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom typing import List\n\nfrom pydantic import BaseModel, Field\n\nfrom distilabel.models import MistralLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import TextGeneration\n\n\nclass Node(BaseModel):\n    id: int\n    label: str\n    color: str\n\n\nclass Edge(BaseModel):\n    source: int\n    target: int\n    label: str\n    color: str = \"black\"\n\n\nclass KnowledgeGraph(BaseModel):\n    nodes: List[Node] = Field(..., default_factory=list)\n    edges: List[Edge] = Field(..., default_factory=list)\n\n\nwith Pipeline(\n    name=\"Knowledge-Graphs\",\n    description=(\n        \"Generate knowledge graphs to answer questions, this type of dataset can be used to \"\n        \"steer a model to answer questions with a knowledge graph.\"\n    ),\n) as pipeline:\n    sample_questions = [\n        \"Teach me about quantum mechanics\",\n        \"Who is who in The Simpsons family?\",\n        \"Tell me about the evolution of programming languages\",\n    ]\n\n    load_dataset = LoadDataFromDicts(\n        name=\"load_instructions\",\n        data=[\n            {\n                \"system_prompt\": \"You are a knowledge graph expert generator. 
Help me understand by describing everything as a detailed knowledge graph.\",\n                \"instruction\": f\"{question}\",\n            }\n            for question in sample_questions\n        ],\n    )\n\n    text_generation = TextGeneration(\n        name=\"knowledge_graph_generation\",\n        llm=MistralLLM(\n            model=\"open-mixtral-8x22b\", structured_output={\"schema\": KnowledgeGraph}\n        ),\n        input_batch_size=8,\n        output_mappings={\"model_name\": \"generation_model\"},\n    )\n    load_dataset >> text_generation\n\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        parameters={\n            text_generation.name: {\n                \"llm\": {\"generation_kwargs\": {\"max_new_tokens\": 2048}}\n            }\n        },\n        use_cache=False,\n    )\n\n    distiset.push_to_hub(\"distilabel-internal-testing/knowledge_graphs\")\n
Visualizing the graphs

Want to see how to visualize the graphs? You can test it using the following script. Generate some samples on your own and take a look:

Note

This example uses graphviz to render the graph, you can install with pip in the following way:

pip install graphviz\n
python examples/draw_kg.py 2  # You can pass 0,1,2 to visualize each of the samples.\n

"},{"location":"sections/pipeline_samples/papers/apigen/","title":"Create Function-Calling datasets with APIGen","text":"

This example will introduce APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets, a data generation pipeline designed to synthesize verifiable high-quality datasets for function-calling applications.

"},{"location":"sections/pipeline_samples/papers/apigen/#replication","title":"Replication","text":"

The following figure showcases the APIGen framework:

Now, let's walk through the key steps illustrated in the figure:

  • DataSampler: With the help of this step and the original Salesforce/xlam-function-calling-60k we are getting the Seed QA Data Sampler for the prompt template.

  • APIGenGenerator: This step does the job of the Query-Answer Generator, including the format checker from Stage 1: Format Checker thanks to the structured output generation.

  • APIGenExecutionChecker: This step is in charge of the Stage 2: Execution Checker.

  • APIGenSemanticChecker: Step in charge of running Stage 3: Semantic Checker, can use the same or a different LLM, we are using the same as in APIGenGenerator step.

The current implementation hasn't utilized the Diverse Prompt Library. To incorporate it, one could either adjust the prompt template within the APIGenGenerator or develop a new sampler specifically for this purpose. As for the API Sampler, while no specific data is shared here, we've created illustrative examples to demonstrate the pipeline's functionality. These examples represent a mix of data that could be used to replicate the sampler's output.

"},{"location":"sections/pipeline_samples/papers/apigen/#data-preparation","title":"Data preparation","text":"

The original paper describes the data the authors used and gives some hints, but nothing was shared. In this example, we will write a bunch of examples by hand to showcase how this pipeline can be built.

Assume we have the following function names, and corresponding descriptions of their behaviour:

data = [\n    {\n        \"func_name\": \"final_velocity\",\n        \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n    },\n    {\n        \"func_name\": \"permutation_count\",\n        \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n    },\n    {\n        \"func_name\": \"getdivision\",\n        \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n    },\n    {\n        \"func_name\": \"binary_addition\",\n        \"func_desc\": \"Adds two binary numbers and returns the result as a binary string.\",\n    },\n    {\n        \"func_name\": \"swapi_planet_resource\",\n        \"func_desc\": \"get a specific planets resource\",\n    },\n    {\n        \"func_name\": \"disney_character\",\n        \"func_desc\": \"Find a specific character using this endpoint\",\n    }\n]\n

The original paper refers to both Python functions and APIs, but we will make use of Python functions exclusively for simplicity. In order to execute and check these functions/APIs, we need access to the code, which we have moved to a Python file: lib_apigen.py. All these functions are executable, but we also need access to their tool representation. For this, we will make use of transformers' get_json_schema function1.

We have all the machinery prepared in our libpath, except for the tool definition. With the help of our helper function load_module_from_path we will load this Python module, collect all the tools, and add them to each row in our data variable.

from distilabel.steps.tasks.apigen.utils import load_module_from_path\n\nlibpath_module = load_module_from_path(libpath)\ntools = getattr(libpath_module, \"get_tools\")()  # call get_tools()\n\nfor row in data:\n    #\u00a0The tools should have a mix where both the correct and irrelevant tools are present.\n    row.update({\"tools\": [tools[row[\"func_name\"]]]})\n

Now we have all the necessary data for our prompt. Additionally, we will make use of the original dataset as few-shot examples to enhance the model:

ds_og = (\n    load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n    .shuffle(seed=42)\n    .select(range(500))\n    .to_list()\n)\n

We have just loaded a subset and transformed it to a list of dictionaries, as we will use it in the DataSampler GeneratorStep, grabbing random examples from the original dataset.

"},{"location":"sections/pipeline_samples/papers/apigen/#building-the-pipeline","title":"Building the Pipeline","text":"

Now that we've walked through each component, it's time to see how it all comes together. Here's the Pipeline code:

with Pipeline(name=\"apigen-example\") as pipeline:\n    loader_seeds = LoadDataFromDicts(data=data)  # (1)\n\n    sampler = DataSampler(  # (2)\n        data=ds_og,\n        size=2,\n        samples=len(data),\n        batch_size=8,\n    )\n\n    prep_examples = PrepareExamples()  # This step will add the 'examples' column\n\n    combine_steps = CombineOutputs()  # (3)\n\n    model_id = \"meta-llama/Meta-Llama-3.1-70B-Instruct\"\n    llm=InferenceEndpointsLLM(  # (4)\n        model_id=model_id,\n        tokenizer_id=model_id,\n        generation_kwargs={\n            \"temperature\": 0.7,\n            \"max_new_tokens\": 2048,\n        },\n    )\n    apigen = APIGenGenerator(  # (5)\n        llm=llm,\n        use_default_structured_output=True,\n    )\n\n    execution_checker = APIGenExecutionChecker(libpath=str(libpath))  # (6)\n    semantic_checker = APIGenSemanticChecker(llm=llm)  # (7)\n\n    sampler >> prep_examples\n    (\n        [loader_seeds, prep_examples] \n        >> combine_steps \n        >> apigen\n        >> execution_checker\n        >> semantic_checker\n    )\n
  1. Load the data seeds we are going to use to generate our function calling dataset.

  2. The DataSampler together with PrepareExamples will be used to help us create the few-shot examples from the original dataset to be fed in our prompt.

  3. Combine both columns to obtain a single stream of data

  4. Will reuse the same LLM for the generation and the semantic checks.

  5. Creates the query and answers that will be used together with the tools to fine-tune a new model. Will generate the structured outputs to ensure we have valid JSON formatted answers.

  6. Adds columns keep_row_after_execution_check and execution_result.

  7. Adds columns keep_row_after_semantic_check and thought.

"},{"location":"sections/pipeline_samples/papers/apigen/#script-and-final-dataset","title":"Script and final dataset","text":"

To see all the pieces in place, take a look at the full pipeline, as well as an example row that would be generated from this pipeline.

Run
python examples/pipeline_apigen.py\n
pipeline_apigen.py
# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nfrom pathlib import Path\n\nfrom datasets import load_dataset\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import CombineOutputs, DataSampler, LoadDataFromDicts\nfrom distilabel.steps.tasks import (\n    APIGenExecutionChecker,\n    APIGenGenerator,\n    APIGenSemanticChecker,\n)\nfrom distilabel.steps.tasks.apigen.utils import PrepareExamples, load_module_from_path\n\nlibpath = Path(__file__).parent / \"lib_apigen.py\"\n\ndata = [\n    {\n        \"func_name\": \"final_velocity\",\n        \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n    },\n    {\n        \"func_name\": \"permutation_count\",\n        \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n    },\n    {\n        \"func_name\": \"getdivision\",\n        \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n    },\n    {\n        \"func_name\": \"binary_addition\",\n        \"func_desc\": \"Adds two binary numbers and returns the result as a binary string.\",\n    },\n    {\n        \"func_name\": \"swapi_planet_resource\",\n        \"func_desc\": \"get a specific planets resource\",\n    },\n    {\n        \"func_name\": \"disney_character\",\n        \"func_desc\": \"Find a specific 
character using this endpoint\",\n    },\n]\n\nlibpath_module = load_module_from_path(libpath)\ntools = libpath_module.get_tools()  # call get_tools()\n\n# TODO: Add in the tools between 0 and 2 extra tools to make the task more challenging.\nfor row in data:\n    # The tools should have a mix where both the correct and irrelevant tools are present.\n    row.update({\"tools\": [tools[row[\"func_name\"]]]})\n\n\nds_og = (\n    load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n    .shuffle(seed=42)\n    .select(range(500))\n    .to_list()\n)\n\n\nwith Pipeline(name=\"APIGenPipeline\") as pipeline:\n    loader_seeds = LoadDataFromDicts(data=data)\n    sampler = DataSampler(\n        data=ds_og,\n        size=2,\n        samples=len(data),\n        batch_size=8,\n    )\n\n    prep_examples = PrepareExamples()\n\n    model_id = \"meta-llama/Meta-Llama-3.1-70B-Instruct\"\n    llm = InferenceEndpointsLLM(\n        model_id=model_id,\n        tokenizer_id=model_id,\n        generation_kwargs={\n            \"temperature\": 0.7,\n            \"max_new_tokens\": 2048,\n        },\n    )\n    apigen = APIGenGenerator(\n        llm=llm,\n        use_default_structured_output=True,\n    )\n    combine_steps = CombineOutputs()\n\n    execution_checker = APIGenExecutionChecker(libpath=str(libpath))\n    semantic_checker = APIGenSemanticChecker(llm=llm)\n\n    sampler >> prep_examples\n    (\n        [loader_seeds, prep_examples]\n        >> combine_steps\n        >> apigen\n        >> execution_checker\n        >> semantic_checker\n    )\n\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run()\n    print(distiset[\"default\"][\"train\"][0])\n

Example row:

{\n  \"func_name\": \"final_velocity\",\n  \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n  \"tools\": [\n    {\n      \"function\": {\n        \"description\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n        \"name\": \"final_velocity\",\n        \"parameters\": {\n          \"properties\": {\n            \"acceleration\": {\n              \"description\": \"The acceleration of the object.\",\n              \"type\": \"number\"\n            },\n            \"initial_velocity\": {\n              \"description\": \"The initial velocity of the object.\",\n              \"type\": \"number\"\n            },\n            \"time\": {\n              \"description\": \"The time elapsed.\",\n              \"type\": \"number\"\n            }\n          },\n          \"required\": [\n            \"initial_velocity\",\n            \"acceleration\",\n            \"time\"\n          ],\n          \"type\": \"object\"\n        }\n      },\n      \"type\": \"function\"\n    }\n  ],\n  \"examples\": \"## Query:\\nRetrieve the first 15 comments for post ID '12345' from the Tokapi mobile API.\\n## Answers:\\n[{\\\"name\\\": \\\"v1_post_post_id_comments\\\", \\\"arguments\\\": {\\\"post_id\\\": \\\"12345\\\", \\\"count\\\": 15}}]\\n\\n## Query:\\nRetrieve the detailed recipe for the cake with ID 'cake101'.\\n## Answers:\\n[{\\\"name\\\": \\\"detailed_cake_recipe_by_id\\\", \\\"arguments\\\": {\\\"is_id\\\": \\\"cake101\\\"}}]\\n\\n## Query:\\nWhat are the frequently asked questions and their answers for Coca-Cola Company? 
Also, what are the suggested tickers based on Coca-Cola Company?\\n## Answers:\\n[{\\\"name\\\": \\\"symbols_faq\\\", \\\"arguments\\\": {\\\"ticker_slug\\\": \\\"KO\\\"}}, {\\\"name\\\": \\\"symbols_suggested\\\", \\\"arguments\\\": {\\\"ticker_slug\\\": \\\"KO\\\"}}]\",\n  \"query\": \"What would be the final velocity of an object that starts at rest and accelerates at 9.8 m/s^2 for 10 seconds.\",\n  \"answers\": \"[{\\\"arguments\\\": {\\\"acceleration\\\": \\\"9.8\\\", \\\"initial_velocity\\\": \\\"0\\\", \\\"time\\\": \\\"10\\\"}, \\\"name\\\": \\\"final_velocity\\\"}]\",\n  \"distilabel_metadata\": {\n    \"raw_input_a_p_i_gen_generator_0\": [\n      {\n        \"content\": \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\\n\\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.\\n\\nEnsure the query:\\n- Is clear and concise\\n- Demonstrates typical use cases\\n- Includes all necessary parameters in a meaningful way. 
For numerical parameters, it could be either numbers or words\\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\\n- The corresponding result's parameter types and ranges match with the function's descriptions\\n\\nEnsure the answer:\\n- Is a list of function calls in JSON format\\n- The length of the answer list should be equal to the number of requests in the query\\n- Can solve all the requests in the query effectively\",\n        \"role\": \"system\"\n      },\n      {\n        \"content\": \"Here are examples of queries and the corresponding answers for similar functions:\\n## Query:\\nRetrieve the first 15 comments for post ID '12345' from the Tokapi mobile API.\\n## Answers:\\n[{\\\"name\\\": \\\"v1_post_post_id_comments\\\", \\\"arguments\\\": {\\\"post_id\\\": \\\"12345\\\", \\\"count\\\": 15}}]\\n\\n## Query:\\nRetrieve the detailed recipe for the cake with ID 'cake101'.\\n## Answers:\\n[{\\\"name\\\": \\\"detailed_cake_recipe_by_id\\\", \\\"arguments\\\": {\\\"is_id\\\": \\\"cake101\\\"}}]\\n\\n## Query:\\nWhat are the frequently asked questions and their answers for Coca-Cola Company? 
Also, what are the suggested tickers based on Coca-Cola Company?\\n## Answers:\\n[{\\\"name\\\": \\\"symbols_faq\\\", \\\"arguments\\\": {\\\"ticker_slug\\\": \\\"KO\\\"}}, {\\\"name\\\": \\\"symbols_suggested\\\", \\\"arguments\\\": {\\\"ticker_slug\\\": \\\"KO\\\"}}]\\n\\nNote that the query could be interpreted as a combination of several independent requests.\\n\\nBased on these examples, generate 1 diverse query and answer pairs for the function `final_velocity`.\\nThe detailed function description is the following:\\nCalculates the final velocity of an object given its initial velocity, acceleration, and time.\\n\\nThese are the available tools to help you:\\n[{'type': 'function', 'function': {'name': 'final_velocity', 'description': 'Calculates the final velocity of an object given its initial velocity, acceleration, and time.', 'parameters': {'type': 'object', 'properties': {'initial_velocity': {'type': 'number', 'description': 'The initial velocity of the object.'}, 'acceleration': {'type': 'number', 'description': 'The acceleration of the object.'}, 'time': {'type': 'number', 'description': 'The time elapsed.'}}, 'required': ['initial_velocity', 'acceleration', 'time']}}}]\\n\\nThe output MUST strictly adhere to the following JSON format, and NO other text MUST be included:\\n```json\\n[\\n   {\\n       \\\"query\\\": \\\"The generated query.\\\",\\n       \\\"answers\\\": [\\n           {\\n               \\\"name\\\": \\\"api_name\\\",\\n               \\\"arguments\\\": {\\n                   \\\"arg_name\\\": \\\"value\\\"\\n                   ... (more arguments as required)\\n               }\\n           },\\n           ... 
(more API calls as required)\\n       ]\\n   }\\n]\\n```\\n\\nNow please generate 1 diverse query and answer pairs following the above format.\",\n        \"role\": \"user\"\n      }\n    ],\n    \"raw_input_a_p_i_gen_semantic_checker_0\": [\n      {\n        \"content\": \"As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. The number of function calls does not correspond to the user\\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\",\n        \"role\": \"system\"\n      },\n      {\n        \"content\": \"Given Information:\\n- All Available Functions:\\nCalculates the final velocity of an object given its initial velocity, acceleration, and time.\\n- User Query: What would be the final velocity of an object that starts at rest and accelerates at 9.8 m/s^2 for 10 seconds.\\n- Generated Function Calls: [{\\\"arguments\\\": {\\\"acceleration\\\": \\\"9.8\\\", \\\"initial_velocity\\\": \\\"0\\\", \\\"time\\\": \\\"10\\\"}, \\\"name\\\": \\\"final_velocity\\\"}]\\n- Execution Results: ['9.8']\\n\\nNote: The query may have multiple intentions. 
Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query's intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n```\\n{\\n   \\\"thought\\\": \\\"Concisely describe your reasoning here\\\",\\n   \\\"passes\\\": \\\"yes\\\" or \\\"no\\\"\\n}\\n```\\n\",\n        \"role\": \"user\"\n      }\n    ],\n    \"raw_output_a_p_i_gen_generator_0\": \"{\\\"pairs\\\": [\\n   {\\n       \\\"answers\\\": [\\n           {\\n               \\\"arguments\\\": {\\n                   \\\"acceleration\\\": \\\"9.8\\\",\\n                   \\\"initial_velocity\\\": \\\"0\\\",\\n                   \\\"time\\\": \\\"10\\\"\\n               },\\n               \\\"name\\\": \\\"final_velocity\\\"\\n           }\\n       ],\\n       \\\"query\\\": \\\"What would be the final velocity of an object that starts at rest and accelerates at 9.8 m/s^2 for 10 seconds.\\\"\\n   }\\n]}\",\n    \"raw_output_a_p_i_gen_semantic_checker_0\": \"{\\n   \\\"thought\\\": \\\"\\\",\\n   \\\"passes\\\": \\\"yes\\\"\\n}\"\n  },\n  \"model_name\": \"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n  \"keep_row_after_execution_check\": true,\n  \"execution_result\": [\n    \"9.8\"\n  ],\n  \"thought\": \"\",\n  \"keep_row_after_semantic_check\": true\n}\n
  1. Read this nice blog post for more information on tools and the reasoning behind get_json_schema: Tool Use, Unified.\u00a0\u21a9

"},{"location":"sections/pipeline_samples/papers/clair/","title":"Contrastive Learning From AI Revisions (CLAIR)","text":"

\"Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment\" introduces both Contrastive Learning from AI Revisions (CLAIR), a data-creation method which leads to more contrastive preference pairs, and Anchored Preference Optimization (APO), a controllable and more stable alignment objective. While APO can be found in TRL, we have implemented a task for CLAIR in distilabel.

CLAIR is a method for creating preference pairs which minimally revises one output to express a preference, resulting in a more precise learning signal as opposed to conventional methods which use a judge to select a preferred response.

The authors of the original paper shared a collection of datasets from CLAIR and APO, where ContextualAI/ultrafeedback_clair_32k corresponds to the CLAIR implementation.

"},{"location":"sections/pipeline_samples/papers/clair/#replication","title":"Replication","text":"

Note

The section is named Replication, but in this case we are showing how to use the CLAIR task to create revisions for your generations using distilabel.

To showcase CLAIR we will be using the CLAIR task implemented in distilabel and we are reusing a small sample of the already generated dataset by ContextualAI ContextualAI/ultrafeedback_clair_32k for testing.

"},{"location":"sections/pipeline_samples/papers/clair/#installation","title":"Installation","text":"

To reproduce the code below, one will need to install distilabel as follows:

pip install \"distilabel>=1.4.0\"\n

Depending on the LLM provider you want to use, the requirements may vary; take a look at the dependencies in that case. For this example we are using the free inference endpoints from Hugging Face, but that won't apply for a bigger dataset.

"},{"location":"sections/pipeline_samples/papers/clair/#building-blocks","title":"Building blocks","text":"

In this case where we already have instructions and their generations, we will just need to load the data and the corresponding CLAIR task for the revisions:

  • CLAIR to generate the revisions.
"},{"location":"sections/pipeline_samples/papers/clair/#code","title":"Code","text":"

Let's see the full pipeline applied to ContextualAI/ultrafeedback_clair_32k in distilabel:

from typing import Any, Dict\n\nfrom datasets import load_dataset\n\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import CLAIR\nfrom distilabel.models import InferenceEndpointsLLM\n\n\ndef transform_ultrafeedback(example: Dict[str, Any]) -> Dict[str, Any]:\n    return {\n        \"task\": example[\"prompt\"],\n        \"student_solution\": example[\"rejected\"][1][\"content\"],\n    }\n\ndataset = (\n    load_dataset(\"ContextualAI/ultrafeedback_clair_32k\", split=\"train\")\n    .select(range(10))             #\u00a0We collect just 10 examples\n    .map(transform_ultrafeedback)  # Apply the transformation to get just the text\n)\n\nwith Pipeline(name=\"CLAIR UltraFeedback sample\") as pipeline:\n    clair = CLAIR(  # (1)\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            generation_kwargs={\n                \"temperature\": 0.7,\n                \"max_new_tokens\": 4096\n            }\n        )\n    )\n\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(dataset=dataset)  # (2)\n    distiset.push_to_hub(repo_id=\"username/clair-test\", include_script=True)  # (3)\n
  1. This Pipeline uses just CLAIR because we already have the generations, but one can just include a first task to create generations from instructions, and then the revisions with CLAIR.

  2. Include the dataset directly in the run method for simplicity.

  3. Push the distiset to the hub with the script for reproducibility.

An example dataset can be found at: distilabel-internal-testing/clair-test.

"},{"location":"sections/pipeline_samples/papers/deepseek_prover/","title":"DeepSeek Prover","text":"

\"DeepSeek-Prover: Advancing Theorem Proving in LLMs through Large-Scale Synthetic Data\" presents an approach to generate mathematical proofs for theorems generated from informal math problems. This approach shows promising results to advance the capabilities of models towards theorem proving using synthetic data. At the time of writing, neither the dataset nor the model trained on top of it has been released, so let's see how the approach works and reproduce the pipeline using distilabel. The following figure depicts the approach taken to generate the dataset:

The authors propose a method for generating Lean 4 proof data from informal mathematical problems. Their approach translates high-school and undergraduate-level mathematical competition problems into formal statements.

Here we show how to deal with steps 1 and 2, but the authors also check the generated proofs using the lean4 program and iterate over a series of steps: fine-tuning a model on the synthetic data (DeepSeek prover 7B), regenerating the dataset, and continuing the process until no further improvement is found.

"},{"location":"sections/pipeline_samples/papers/deepseek_prover/#replication","title":"Replication","text":"

Note

The section is named Replication, but we will show how we can use distilabel to create the different steps outlined in the DeepSeek-Prover approach. We intentionally left some steps out of the pipeline, but it can easily be extended.

We will define the components needed to generate a dataset like the one depicted in the previous figure (we won't call lean4 or do the fine-tuning, this last step can be done outside of distilabel). The different blocks will have all the docstrings as we would have in the internal steps to showcase how they are done, but they can be omitted for brevity.

"},{"location":"sections/pipeline_samples/papers/deepseek_prover/#installation","title":"Installation","text":"

To reproduce the code below, we need to install distilabel as follows:

pip install \"distilabel[hf-inference-endpoints]\"\n

We have decided to use InferenceEndpointsLLM, but any other provider with a strong model could work.

"},{"location":"sections/pipeline_samples/papers/deepseek_prover/#building-blocks","title":"Building blocks","text":"

There are three components we need to define for this pipeline, one for each of the steps described in the paper: a task to formalize the original statements, another one to assess the relevance of the theorems, and a final one to generate proofs for the theorems.

Note

We will use the same LLM for all the tasks, so we will define it once and reuse it for the different tasks:

llm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n)\n
"},{"location":"sections/pipeline_samples/papers/deepseek_prover/#deepseekproverautoformalization","title":"DeepSeekProverAutoFormalization","text":"

This Task corresponds to the first step in the figure. Given an informal statement, it will formalize it for us in Lean 4 language, meaning it will translate from an informal statement that could be gathered from the internet, to the lean4 structured language.

DeepSeekProverAutoFormalization
_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX = r\"```lean4(.*?)```\"\n\ntemplate_deepseek_prover_auto_formalization = \"\"\"\\\nMathematical Problem in Natural Language:\n{{ informal_statement }}\n{%- if few_shot %}\n\nPlease use the following examples to guide you with the answer:\n{%- for example in examples %}\n- {{ example }}\n{%- endfor %}\n{% endif -%}\"\"\"\n\n\nclass DeepSeekProverAutoFormalization(Task):\n    examples: Optional[List[str]] = None\n    system_prompt: str = \"Translate the problem to Lean 4 (only the core declaration):\\n```lean4\\nformal statement goes here\\n```\"\n    _template: Union[Template, None] = PrivateAttr(...)\n    _few_shot: bool = PrivateAttr(default=False)\n\n    def load(self) -> None:\n        super().load()\n        self._template = Template(template_deepseek_prover_auto_formalization)\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"informal_statement\"]\n\n    @property\n    def outputs(self):\n        return [\"formal_statement\", \"model_name\"]\n\n    def format_input(self, input: str) -> ChatType:  # type: ignore\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": self.system_prompt,\n            },\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(\n                    informal_statement=input[self.inputs[0]],\n                    few_shot=bool(self.examples),\n                    examples=self.examples,\n                ),\n            },\n        ]\n\n    @override\n    def format_output(  # type: ignore\n        self, output: Union[str, None], input: Dict[str, Any] = None\n    ) -> Dict[str, Any]:  # type: ignore\n        match = re.search(_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX, output, re.DOTALL)\n        if match:\n            match = match.group(1).strip()\n        return {\"formal_statement\": match}\n

According to the paper, the model yields better results if it uses examples in a few-shot setting, so this class allows passing some examples to help in generating the formalization. Let's see an example of how we can instantiate it:

from textwrap import dedent\n\nexamples = [\n    dedent(\"\"\"\n    ## Statement in natural language:\n    For real numbers k and x:\n    If x is equal to (13 - \u221a131) / 4, and\n    If the equation 2x\u00b2 - 13x + k = 0 is satisfied,\n    Then k must be equal to 19/4.\n    ## Formalized:\n    theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n        (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n    dedent(\"\"\"\n    ## Statement in natural language:\n    The greatest common divisor (GCD) of 20 factorial (20!) and 200,000 is equal to 40,000.\n    ## Formalized:\n    theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n        (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n    dedent(\"\"\"\n    ## Statement in natural language:\n    Given two integers x and y:\n    If y is positive (greater than 0),\n    And y is less than x,\n    And the equation x + y + xy = 80 is true,\n    Then x must be equal to 26.\n    ## Formalized:\n    theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n        (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n]\n\nauto_formalization = DeepSeekProverAutoFormalization(\n    name=\"auto_formalization\",\n    input_batch_size=8,\n    llm=llm,\n    examples=examples\n)\n
"},{"location":"sections/pipeline_samples/papers/deepseek_prover/#deepseekproverscorer","title":"DeepSeekProverScorer","text":"

The next Task corresponds to the second step, the model scoring and assessment. It uses an LLM as judge to evaluate the relevance of the theorem, and assigns a score so it can be filtered afterwards.

DeepSeekProverScorer
template_deepseek_prover_scorer = \"\"\"\\\nTo evaluate whether a formal Lean4 statement will be of interest to the community, consider the following criteria:\n\n1. Relevance to Current Research: Does the statement address a problem or concept that is actively being researched in mathematics or related fields? Higher relevance scores indicate greater potential interest.\n2. Complexity and Depth: Is the statement complex enough to challenge existing theories and methodologies, yet deep enough to provide significant insights or advancements? Complexity and depth showcase Lean4's capabilities and attract interest.\n3. Interdisciplinary Potential: Does the statement offer opportunities for interdisciplinary research, connecting mathematics with other fields such as computer science, physics, or biology? Interdisciplinary projects often garner wide interest.\n4. Community Needs and Gaps: Does the statement fill an identified need or gap within the Lean4 community or the broader mathematical community? Addressing these needs directly correlates with interest.\n5. Innovativeness: How innovative is the statement? Does it propose new methods, concepts, or applications? Innovation drives interest and engagement.\n\nCustomize your evaluation for each problem accordingly, assessing it as 'excellent', 'good', 'above average', 'fair' or 'poor'.\n\nYou should respond in the following format for each statement:\n\n'''\nNatural language: (Detailed explanation of the informal statement, including any relevant background information, assumptions, and definitions.)\nAnalysis: (Provide a brief justification for each score, highlighting why the statement scored as it did across the criteria.)\nAssessment: (Based on the criteria, rate the statement as 'excellent', 'good', 'above average', 'fair' or 'poor'. 
JUST the Assessment.)\n'''\"\"\"\n\nclass DeepSeekProverScorer(Task):\n    _template: Union[Template, None] = PrivateAttr(...)\n\n    def load(self) -> None:\n        super().load()\n        self._template = Template(template_deepseek_prover_scorer)\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"informal_statement\", \"formal_statement\"]\n\n    @property\n    def outputs(self):\n        return [\"natural_language\", \"analysis\", \"assessment\", \"model_name\"]\n\n    def format_input(self, input: str) -> ChatType:\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": self._template.render(),\n            },\n            {\n                \"role\": \"user\",\n                \"content\": f\"## Informal statement:\\n{input[self.inputs[0]]}\\n\\n ## Formal statement:\\n{input[self.inputs[1]]}\",\n            },\n        ]\n\n    @override\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any] = None\n    ) -> Dict[str, Any]:\n        try:\n            result = output.split(\"Natural language:\")[1].strip()\n            natural_language, analysis = result.split(\"Analysis:\")\n            analysis, assessment = analysis.split(\"Assessment:\")\n            natural_language = natural_language.strip()\n            analysis = analysis.strip()\n            assessment = assessment.strip()\n        except Exception:\n            natural_language = analysis = assessment = None\n\n        return {\n            \"natural_language\": natural_language,\n            \"analysis\": analysis,\n            \"assessment\": assessment\n        }\n
"},{"location":"sections/pipeline_samples/papers/deepseek_prover/#deepseekproversolver","title":"DeepSeekProverSolver","text":"

The last task is in charge of generating a proof for the theorems generated in the previous steps.

DeepSeekProverSolver
class DeepSeekProverSolver(Task):\n    system_prompt: str = (\n        \"You are an expert in proving mathematical theorems formalized in lean4 language. \"\n        \"Your answers consist just in the proof to the theorem given, and nothing else.\"\n    )\n\n    @property\n    def inputs(self) -> List[str]:\n        return [\"formal_statement\"]\n\n    @property\n    def outputs(self):\n        return [\"proof\"]\n\n    def format_input(self, input: str) -> ChatType:\n        prompt = dedent(\"\"\"\n            Give me a proof for the following theorem:\n            ```lean4\n            {theorem}\n            ```\"\"\"\n        )\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": self.system_prompt,\n            },\n            {\n                \"role\": \"user\",\n                \"content\": prompt.format(theorem=input[\"formal_statement\"]),\n            },\n        ]\n\n    def format_output(\n        self, output: Union[str, None], input: Dict[str, Any] = None\n    ) -> Dict[str, Any]:\n        import re\n        match = re.search(_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX, output, re.DOTALL)\n        if match:\n            match = match.group(1).strip()\n        return {\"proof\": match}\n

Additionally, the original pipeline defined in the paper includes a step to check the final proofs using the Lean 4 language, which we have omitted for simplicity. The fine-tuning can be done completely offline, returning to the pipeline after each iteration/training run.

All the docstrings have been removed from the code blocks, but can be seen in the full pipeline.

"},{"location":"sections/pipeline_samples/papers/deepseek_prover/#code","title":"Code","text":"

Let's put the building blocks together to create the final pipeline with distilabel. For this example we have generated a sample dataset plaguss/informal-mathematical-statements-tiny of informal mathematical statements starting from casey-martin/multilingual-mathematical-autoformalization, but as the paper mentions, we can create formal statements and their corresponding proofs starting from informal ones:

Click to see the full pipeline deepseek_prover.py
# Copyright 2023-present, Argilla, Inc.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport re\nfrom pathlib import Path\nfrom textwrap import dedent\nfrom typing import Any, Dict, List, Optional, Union\n\nfrom jinja2 import Template\nfrom pydantic import PrivateAttr\nfrom typing_extensions import override\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub\nfrom distilabel.steps.tasks.base import Task\nfrom distilabel.steps.tasks.typing import ChatType\n\n_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX = r\"```lean4(.*?)```\"\n\n\ntemplate_deepseek_prover_auto_formalization = \"\"\"\\\nMathematical Problem in Natural Language:\n{{ informal_statement }}\n{%- if few_shot %}\n\nPlease use the following examples to guide you with the answer:\n{%- for example in examples %}\n- {{ example }}\n{%- endfor %}\n{% endif -%}\"\"\"\n\n\nclass DeepSeekProverAutoFormalization(Task):\n    \"\"\"Task to translate a mathematical problem from natural language to Lean 4.\n\n    Note:\n        A related dataset (MMA from the paper) can be found in Hugging Face:\n        [casey-martin/multilingual-mathematical-autoformalization](https://huggingface.co/datasets/casey-martin/multilingual-mathematical-autoformalization).\n\n    Input columns:\n        - informal_statement (`str`): The statement to be formalized using Lean 4.\n\n    Output columns:\n        - formal_statement (`str`): The 
formalized statement using Lean 4, to be analysed.\n\n    Categories:\n        - generation\n\n    References:\n        - [`DeepSeek-Prover: Advancing Theorem Proving in LLMs through Large-Scale Synthetic Data`](https://arxiv.org/abs/2405.14333).\n        - [`Lean 4`](https://github.com/leanprover/lean4).\n\n    Examples:\n\n        Formalize a mathematical problem from natural language to Lean 4:\n\n        ```python\n        from distilabel.steps.tasks import DeepSeekProverAutoFormalization\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        prover_autoformal = DeepSeekProverAutoFormalization(\n            llm=InferenceEndpointsLLM(\n                model_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n                tokenizer_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n            ),\n        )\n\n        prover_autoformal.load()\n\n        result = next(\n            prover_autoformal.process(\n                [\n                    {\"informal_statement\": \"If a polynomial g is monic, then the root of g is integral over the ring R.\"},\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'informal_statement': 'If a polynomial g is monic, then the root of g is integral over the ring R.',\n        #         'formal_statement': 'theorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=',\n        #         'distilabel_metadata': {\n        #             'raw_output_deep_seek_prover_auto_formalization_0': '```lean4\\ntheorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=\\n```'\n        #         },\n        #         'model_name': 'deepseek-prover'\n        #     }\n        # ]\n        ```\n\n        Use a few-shot setting to formalize a mathematical problem from natural language to Lean 4:\n\n        ```python\n        from distilabel.steps.tasks import DeepSeekProverAutoFormalization\n        from 
distilabel.models import InferenceEndpointsLLM\n\n        # You can gain inspiration from the following examples to create your own few-shot examples:\n        # https://github.com/yangky11/miniF2F-lean4/blob/main/MiniF2F/Valid.lean\n        # Consider this as a placeholder for your actual LLM.\n        prover_autoformal = DeepSeekProverAutoFormalization(\n            llm=InferenceEndpointsLLM(\n                model_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n                tokenizer_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n            ),\n            examples=[\n                \"theorem amc12a_2019_p21 (z : \u2102) (h\u2080 : z = (1 + Complex.I) / Real.sqrt 2) :\\n\\n((\u2211 k : \u2124 in Finset.Icc 1 12, z ^ k ^ 2) * (\u2211 k : \u2124 in Finset.Icc 1 12, 1 / z ^ k ^ 2)) = 36 := by\\n\\nsorry\",\n                \"theorem amc12a_2015_p10 (x y : \u2124) (h\u2080 : 0 < y) (h\u2081 : y < x) (h\u2082 : x + y + x * y = 80) : x = 26 := by\\n\\nsorry\"\n            ]\n        )\n\n        prover_autoformal.load()\n\n        result = next(\n            prover_autoformal.process(\n                [\n                    {\"informal_statement\": \"If a polynomial g is monic, then the root of g is integral over the ring R.\"},\n                ]\n            )\n        )\n        # result\n        # [\n        #     {\n        #         'informal_statement': 'If a polynomial g is monic, then the root of g is integral over the ring R.',\n        #         'formal_statement': 'theorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=',\n        #         'distilabel_metadata': {\n        #             'raw_output_deep_seek_prover_auto_formalization_0': '```lean4\\ntheorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=\\n```'\n        #         },\n        #         'model_name': 'deepseek-prover'\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    examples: Optional[List[str]] = None\n    system_prompt: str = \"Translate the problem to 
Lean 4 (only the core declaration):\\n```lean4\\nformal statement goes here\\n```\"\n    _template: Union[Template, None] = PrivateAttr(...)\n    _few_shot: bool = PrivateAttr(default=False)\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template.\"\"\"\n        super().load()\n\n        self._template = Template(template_deepseek_prover_auto_formalization)\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task is the `instruction`.\"\"\"\n        return [\"informal_statement\"]\n\n    @property\n    def outputs(self):\n        \"\"\"The output for the task is a list of `instructions` containing the generated instructions.\"\"\"\n        return [\"formal_statement\", \"model_name\"]\n\n    def format_input(self, input: str) -> ChatType:  # type: ignore\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation. And the\n        `system_prompt` is added as the first message if it exists.\"\"\"\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": self.system_prompt,\n            },\n            {\n                \"role\": \"user\",\n                \"content\": self._template.render(\n                    informal_statement=input[self.inputs[0]],\n                    few_shot=bool(self.examples),\n                    examples=self.examples,\n                ),\n            },\n        ]\n\n    @override\n    def format_output(  # type: ignore\n        self, output: Union[str, None], input: Dict[str, Any] = None\n    ) -> Dict[str, Any]:  # type: ignore\n        \"\"\"Extracts the formal statement from the Lean 4 output.\"\"\"\n        match = re.search(_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX, output, re.DOTALL)\n        if match:\n            match = match.group(1).strip()\n        return {\"formal_statement\": match}\n\n\ntemplate_deepseek_prover_scorer = \"\"\"\\\nTo 
evaluate whether a formal Lean4 statement will be of interest to the community, consider the following criteria:\n\n1. Relevance to Current Research: Does the statement address a problem or concept that is actively being researched in mathematics or related fields? Higher relevance scores indicate greater potential interest.\n2. Complexity and Depth: Is the statement complex enough to challenge existing theories and methodologies, yet deep enough to provide significant insights or advancements? Complexity and depth showcase Lean4's capabilities and attract interest.\n3. Interdisciplinary Potential: Does the statement offer opportunities for interdisciplinary research, connecting mathematics with other fields such as computer science, physics, or biology? Interdisciplinary projects often garner wide interest.\n4. Community Needs and Gaps: Does the statement fill an identified need or gap within the Lean4 community or the broader mathematical community? Addressing these needs directly correlates with interest.\n5. Innovativeness: How innovative is the statement? Does it propose new methods, concepts, or applications? Innovation drives interest and engagement.\n\nCustomize your evaluation for each problem accordingly, assessing it as 'excellent', 'good', 'above average', 'fair' or 'poor'.\n\nYou should respond in the following format for each statement:\n\n'''\nNatural language: (Detailed explanation of the informal statement, including any relevant background information, assumptions, and definitions.)\nAnalysis: (Provide a brief justification for each score, highlighting why the statement scored as it did across the criteria.)\nAssessment: (Based on the criteria, rate the statement as 'excellent', 'good', 'above average', 'fair' or 'poor'. 
JUST the Assessment.)\n'''\"\"\"\n\n\nclass DeepSeekProverScorer(Task):\n    \"\"\"Task to evaluate the quality of a formalized mathematical problem in Lean 4,\n    inspired by the DeepSeek-Prover task for scoring.\n\n    Note:\n        A related dataset (MMA from the paper) can be found in Hugging Face:\n        [casey-martin/multilingual-mathematical-autoformalization](https://huggingface.co/datasets/casey-martin/multilingual-mathematical-autoformalization).\n\n    Input columns:\n        - informal_statement (`str`): The statement to be formalized using Lean 4.\n        - formal_statement (`str`): The formalized statement using Lean 4, to be analysed.\n\n    Output columns:\n        - natural_language (`str`): Explanation for the problem.\n        - analysis (`str`): Analysis of the different points defined in the prompt.\n        - assessment (`str`): Result of the assessment.\n\n    Categories:\n        - scorer\n        - quality\n        - response\n\n    References:\n        - [`DeepSeek-Prover: Advancing Theorem Proving in LLMs through Large-Scale Synthetic Data`](https://arxiv.org/abs/2405.14333).\n        - [`Lean 4`](https://github.com/leanprover/lean4).\n\n    Examples:\n\n        Analyse a formal statement in Lean 4:\n\n        ```python\n        from distilabel.steps.tasks import DeepSeekProverScorer\n        from distilabel.models import InferenceEndpointsLLM\n\n        # Consider this as a placeholder for your actual LLM.\n        prover_scorer = DeepSeekProverAutoFormalization(\n            llm=InferenceEndpointsLLM(\n                model_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n                tokenizer_id=\"deepseek-ai/deepseek-math-7b-instruct\",\n            ),\n        )\n\n        prover_scorer.load()\n\n        result = next(\n            prover_scorer.process(\n                [\n                    {\"formal_statement\": \"theorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=\"},\n                ]\n            )\n       
 )\n        # result\n        # [\n        #     {\n        #         'formal_statement': 'theorem isIntegral_root (hg : g.Monic) : IsIntegral R (root g):=',\n        #         'informal_statement': 'INFORMAL',\n        #         'analysis': 'ANALYSIS',\n        #         'assessment': 'ASSESSMENT',\n        #         'distilabel_metadata': {\n        #             'raw_output_deep_seek_prover_scorer_0': 'Natural language:\\nINFORMAL\\nAnalysis:\\nANALYSIS\\nAssessment:\\nASSESSMENT'\n        #         },\n        #         'model_name': 'deepseek-prover-scorer'\n        #     }\n        # ]\n        ```\n    \"\"\"\n\n    _template: Union[Template, None] = PrivateAttr(...)\n\n    def load(self) -> None:\n        \"\"\"Loads the Jinja2 template.\"\"\"\n        super().load()\n\n        self._template = Template(template_deepseek_prover_scorer)\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task is the `instruction`.\"\"\"\n        return [\"informal_statement\", \"formal_statement\"]\n\n    @property\n    def outputs(self):\n        \"\"\"The output for the task is a list of `instructions` containing the generated instructions.\"\"\"\n        return [\"natural_language\", \"analysis\", \"assessment\", \"model_name\"]\n\n    def format_input(self, input: str) -> ChatType:  # type: ignore\n        \"\"\"The input is formatted as a `ChatType` assuming that the instruction\n        is the first interaction from the user within a conversation. 
And the\n        `system_prompt` is added as the first message if it exists.\"\"\"\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": self._template.render(),\n            },\n            {\n                \"role\": \"user\",\n                \"content\": f\"## Informal statement:\\n{input[self.inputs[0]]}\\n\\n ## Formal statement:\\n{input[self.inputs[1]]}\",\n            },\n        ]\n\n    @override\n    def format_output(  # type: ignore\n        self, output: Union[str, None], input: Dict[str, Any] = None\n    ) -> Dict[str, Any]:  # type: ignore\n        \"\"\"Analyses the formal statement with Lean 4 output and generates an assessment\n        and the corresponding informal assessment.\"\"\"\n\n        try:\n            result = output.split(\"Natural language:\")[1].strip()\n            natural_language, analysis = result.split(\"Analysis:\")\n            analysis, assessment = analysis.split(\"Assessment:\")\n            natural_language = natural_language.strip()\n            analysis = analysis.strip()\n            assessment = assessment.strip()\n        except Exception:\n            natural_language = analysis = assessment = None\n\n        return {\n            \"natural_language\": natural_language,\n            \"analysis\": analysis,\n            \"assessment\": assessment,\n        }\n\n\nclass DeepSeekProverSolver(Task):\n    \"\"\"Task to generate a proof for a formal statement (theorem) in lean4.\n\n    Input columns:\n        - formal_statement (`str`): The formalized statement using Lean 4.\n\n    Output columns:\n        - proof (`str`): The proof for the formal statement theorem.\n\n    Categories:\n        - scorer\n        - quality\n        - response\n\n    References:\n        - [`DeepSeek-Prover: Advancing Theorem Proving in LLMs through Large-Scale Synthetic Data`](https://arxiv.org/abs/2405.14333).\n    \"\"\"\n\n    system_prompt: str = (\n        \"You are an expert in proving 
mathematical theorems formalized in lean4 language. \"\n        \"Your answers consist just in the proof to the theorem given, and nothing else.\"\n    )\n\n    @property\n    def inputs(self) -> List[str]:\n        \"\"\"The input for the task is the `formal_statement`.\"\"\"\n        return [\"formal_statement\"]\n\n    @property\n    def outputs(self):\n        \"\"\"The output for the task is the proof for the formal statement theorem.\"\"\"\n        return [\"proof\"]\n\n    def format_input(self, input: str) -> ChatType:  # type: ignore\n        \"\"\"The input is formatted as a `ChatType`, with a system prompt to guide our model.\"\"\"\n        prompt = dedent(\"\"\"\n            Give me a proof for the following theorem:\n            ```lean4\n            {theorem}\n            ```\"\"\")\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": self.system_prompt,\n            },\n            {\n                \"role\": \"user\",\n                \"content\": prompt.format(theorem=input[\"formal_statement\"]),\n            },\n        ]\n\n    def format_output(  # type: ignore\n        self, output: Union[str, None], input: Dict[str, Any] = None\n    ) -> Dict[str, Any]:  # type: ignore\n        import re\n\n        match = re.search(_PARSE_DEEPSEEK_PROVER_AUTOFORMAL_REGEX, output, re.DOTALL)\n        if match:\n            match = match.group(1).strip()\n        return {\"proof\": match}\n\n\nexamples = [\n    dedent(\"\"\"\n    ## Statement in natural language:\n    For real numbers k and x:\n    If x is equal to (13 - \u221a131) / 4, and\n    If the equation 2x\u00b2 - 13x + k = 0 is satisfied,\n    Then k must be equal to 19/4.\n    ## Formalized:\n    theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n        (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n    dedent(\"\"\"\n    ## Statement in natural language:\n    The greatest common divisor (GCD) of 20 
factorial (20!) and 200,000 is equal to 40,000.\n    ## Formalized:\n    theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n        (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n    dedent(\"\"\"\n    ## Statement in natural language:\n    Given two integers x and y:\n    If y is positive (greater than 0),\n    And y is less than x,\n    And the equation x + y + xy = 80 is true,\n    Then x must be equal to 26.\n    ## Formalized:\n    theorem mathd_algebra_116 (k x : \u211d) (h\u2080 : x = (13 - Real.sqrt 131) / 4)\n        (h\u2081 : 2 * x ^ 2 - 13 * x + k = 0) : k = 19 / 4 :=\"\"\"),\n]\n\n\nwith Pipeline(name=\"test_deepseek_prover\") as pipeline:\n    data_loader = LoadDataFromHub(\n        repo_id=\"plaguss/informal-mathematical-statements-tiny\",\n        split=\"val\",\n        batch_size=8,\n    )\n\n    llm = InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    )\n    auto_formalization = DeepSeekProverAutoFormalization(\n        name=\"auto_formalization\", input_batch_size=8, llm=llm, examples=examples\n    )\n    prover_scorer = DeepSeekProverScorer(\n        name=\"prover_scorer\",\n        input_batch_size=8,\n        llm=llm,\n    )\n    proof_generator = DeepSeekProverSolver(\n        name=\"proof_generator\", input_batch_size=8, llm=llm\n    )\n\n    (data_loader >> auto_formalization >> prover_scorer >> proof_generator)\n\n\nif __name__ == \"__main__\":\n    import argparse\n\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        \"-d\",\n        \"--dry-run\",\n        action=argparse.BooleanOptionalAction,\n        help=\"Do a dry run for testing purposes.\",\n    )\n    args = parser.parse_args()\n\n    pipeline_parameters = {\n        data_loader.name: {\"split\": \"val\"},\n        auto_formalization.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"temperature\": 0.6,\n                    
\"top_p\": 0.9,\n                    \"max_new_tokens\": 512,\n                }\n            }\n        },\n        prover_scorer.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"temperature\": 0.6,\n                    \"top_p\": 0.9,\n                    \"max_new_tokens\": 512,\n                }\n            }\n        },\n    }\n\n    ds_name = \"test_deepseek_prover\"\n\n    if args.dry_run:\n        distiset = pipeline.dry_run(batch_size=1, parameters=pipeline_parameters)\n        distiset.save_to_disk(Path.home() / f\"Downloads/{ds_name}\")\n\n        import pprint\n\n        pprint.pprint(distiset[\"default\"][\"train\"][0])\n\n    else:\n        distiset = pipeline.run(parameters=pipeline_parameters)\n        distiset.push_to_hub(ds_name, include_script=True)\n

The script can be run as a dry run or not, depending on the argument (the pipeline will run without dry run by default), and the resulting dataset will be pushed to the hub with the name your_username/test_deepseek_prover:

python deepseek_prover.py [-d | --dry-run | --no-dry-run]\n

Final dataset: plaguss/test_deepseek_prover.

"},{"location":"sections/pipeline_samples/papers/deita/","title":"DEITA","text":"

DEITA (Data-Efficient Instruction Tuning for Alignment) studies an automatic data selection process by first quantifying the data quality based on complexity, quality and diversity, and second, selecting the best potential combination from an open-source dataset that would fit into the budget you allocate to tune your own LLM.

In most settings, we cannot allocate unlimited resources for instruction-tuning LLMs. Therefore, the DEITA authors investigated how to select qualitative data for instruction tuning based on the principle of fewer high-quality samples. Liu et al. tackle the issue of first defining good data and second identifying it to respect an initial budget to instruct-tune your LLM.

The strategy utilizes LLMs to replace human effort in time-intensive data quality tasks on instruction-tuning datasets. DEITA introduces a way to measure data quality across three critical dimensions: complexity, quality and diversity.

You can see that we again encounter the dataset of instructions/responses, and that we are, in a sense, reproducing the second step, in which we learn how to optimize the responses to an instruction by comparing several possibilities.

"},{"location":"sections/pipeline_samples/papers/deita/#datasets-and-budget","title":"Datasets and budget","text":"

We will dive deeper into the whole process. We will investigate each stage to efficiently select the final dataset used for supervised fine-tuning with a budget constraint. We will tackle technical challenges by explaining exactly how you would assess good data as presented in the paper.

As a reminder, we're looking for a strategy to automatically select good data for the instruction-tuning step when you want to fine-tune an LLM to your own use case taking into account a resource constraint. This means that you cannot blindly train a model on any data you encounter on the internet.

The DEITA authors assume that you have access to open-source datasets that fit your use case. This may not be the case entirely. But with open-source communities tackling many use cases, with projects such as BLOOM or AYA, it's likely that your use case will be tackled at some point. Furthermore, you could generate your own instruction/response pairs with methods such as self-generated instructions using distilabel. This tutorial assumes that we have a data pool with excessive samples for the project's cost constraint. In short, we aim to achieve adequate performance from fewer samples.

The authors claim that the subsample size \"correlates proportionally with the computation consumed in instruction tuning\". Hence, to a first approximation, reducing the sample size means reducing computation consumption and thus the total development cost. Following the paper's notation, we will associate the budget m with a number of instruction/response pairs that you can set depending on your real budget.

To match the experimental set-up, dataset X_sota is a meta-dataset combining major open-source datasets available to instruct-tune LLMs. This dataset is composed of ShareGPT (58k instruction/response pairs), UltraChat (105k instruction/response pairs) and WizardLM (143k instruction/response pairs). It sums to more than 300k instruction/response pairs. We aim to reduce the final subsample to 6k instruction/response pairs.

"},{"location":"sections/pipeline_samples/papers/deita/#setup-the-notebook-and-packages","title":"Setup the notebook and packages","text":"

Let's prepare our dependencies:

pip install \"distilabel[openai,hf-transformers]>=1.0.0\"\npip install pynvml huggingface_hub argilla\n

Import distilabel:

from distilabel.models import TransformersLLM, OpenAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import ConversationTemplate, DeitaFiltering, ExpandColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import ComplexityScorer, EvolInstruct, EvolQuality, GenerateEmbeddings, QualityScorer\n

Define the distilabel Pipeline and load the dataset from the Hugging Face Hub.

pipeline = Pipeline(name=\"DEITA\")\n\nload_data = LoadDataFromHub(\n    name=\"load_data\", batch_size=100, output_mappings={\"prompt\": \"instruction\"}, pipeline=pipeline\n)\n
"},{"location":"sections/pipeline_samples/papers/deita/#evol-instruct-generate-instructions-with-an-llm","title":"EVOL-INSTRUCT: Generate Instructions with an LLM","text":"

Evol-Instruct automates the creation of complex instruction data for training large language models (LLMs) by progressively rewriting an initial set of instructions into more complex forms. This generated data is then used to fine-tune a model named WizardLM.

Evaluations show that instructions from Evol-Instruct are superior to human-created ones, and WizardLM achieves performance close to or exceeding GPT3.5-turbo in many skills. In distilabel, we initialise each step of the data generation pipeline. Later, we'll connect them together.

evol_instruction_complexity = EvolInstruct(\n    name=\"evol_instruction_complexity\",\n    llm=OpenAILLM(model=\"gpt-3.5-turbo\"),\n    num_evolutions=5,\n    store_evolutions=True,\n    generate_answers=True,\n    include_original_instruction=True,\n    pipeline=pipeline,\n)\n\nevol_instruction_complexity.load()\n\n_evolved_instructions = next(evol_instruction_complexity.process(\n    ([{\"instruction\": \"How many fish are there in a dozen fish?\"}]))\n)\n\nprint(*_evolved_instructions, sep=\"\\n\")\n

Output:

( 1, 'How many fish are there in a dozen fish?')\n( 2, 'How many rainbow trout are there in a dozen rainbow trout?')\n( 3, 'What is the average weight in pounds of a dozen rainbow trout caught in a specific river in Alaska during the month of May?')\n
"},{"location":"sections/pipeline_samples/papers/deita/#evol-complexity-evaluate-complexity-of-generated-instructions","title":"EVOL COMPLEXITY: Evaluate complexity of generated instructions","text":"

The second step is the evaluation of complexity for an instruction in a given instruction-response pair. Like EVOL-INSTRUCT, this method uses LLMs instead of humans to automatically improve instructions, specifically through their complexity. From any instruction-response pair, \\((I, R)\\), we first generate new instructions following the In-Depth Evolving Response. We generate more complex instructions through prompting, as explained by the authors, by adding some constraints or reasoning steps. Let's take an example from GPT-4-LLM which aims to generate observations by GPT-4 to instruct-tune LLMs with supervised fine-tuning. We start from the instruction \\(instruction_0\\):

instruction_0 = \"Give three tips for staying healthy.\"\n

To make it more complex, you can use, as the authors did, some prompt templates to add constraints or deepen the instruction. They provided some prompts in the paper appendix. For instance, this one was used to add constraints:

PROMPT = \"\"\"I want you act as a Prompt Rewriter.\nYour objective is to rewrite a given prompt into a more complex version to\nmake those famous AI systems (e.g., ChatGPT and GPT4) a bit harder to handle.\nBut the rewritten prompt must be reasonable and must be understood and\nresponded by humans.\nYour rewriting cannot omit the non-text parts such as the table and code in\n#Given Prompt#:. Also, please do not omit the input in #Given Prompt#.\nYou SHOULD complicate the given prompt using the following method:\nPlease add one more constraints/requirements into #Given Prompt#\nYou should try your best not to make the #Rewritten Prompt# become verbose,\n#Rewritten Prompt# can only add 10 to 20 words into #Given Prompt#.\n\u2018#Given Prompt#\u2019, \u2018#Rewritten Prompt#\u2019, \u2018given prompt\u2019 and \u2018rewritten prompt\u2019\nare not allowed to appear in #Rewritten Prompt#\n#Given Prompt#:\n<Here is instruction>\n#Rewritten Prompt#:\n\"\"\"\n

Prompting this to an LLM, you automatically get a more complex instruction, called \\(instruction_1\\), from an initial instruction \\(instruction_0\\):

instruction_1 = \"Provide three recommendations for maintaining well-being, ensuring one focuses on mental health.\"\n

With sequences of evolved instructions, we use a further LLM to automatically rank and score them. We provide the 6 instructions at the same time. By providing all instructions together, we force the scoring model to look at minor complexity differences between evolved instructions, encouraging the model to discriminate between them. Taking the example above, \\(instruction_0\\) and \\(instruction_1\\) could deserve the same score independently, but when compared together we would notice the slight difference that makes \\(instruction_1\\) more complex.

In distilabel, we implement this like so:

instruction_complexity_scorer = ComplexityScorer(\n    name=\"instruction_complexity_scorer\",\n    llm=OpenAILLM(model=\"gpt-3.5-turbo\"),\n    input_mappings={\"instructions\": \"evolved_instructions\"},\n    pipeline=pipeline,\n)\n\nexpand_evolved_instructions = ExpandColumns(\n    name=\"expand_evolved_instructions\",\n    columns=[\"evolved_instructions\", \"answers\", \"scores\"],\n    output_mappings={\n        \"evolved_instructions\": \"evolved_instruction\",\n        \"answers\": \"answer\",\n        \"scores\": \"evol_instruction_score\",\n    },\n    pipeline=pipeline,\n)\n\ninstruction_complexity_scorer.load()\n\n_evolved_instructions = next(instruction_complexity_scorer.process(([{\"evolved_instructions\": [PROMPT + instruction_1]}])))\n\nprint(\"Original Instruction:\")\nprint(instruction_1)\nprint(\"\\nEvolved Instruction:\")\nprint(_evolved_instructions[0][\"evolved_instructions\"][0].split(\"#Rewritten Prompt#:\\n\")[1])\n

Output:

Original Instruction:\nProvide three recommendations for maintaining well-being, ensuring one focuses on mental health.\n\nEvolved Instruction:\nSuggest three strategies for nurturing overall well-being, with the stipulation that at least one explicitly addresses the enhancement of mental health, incorporating evidence-based practices.\n
"},{"location":"sections/pipeline_samples/papers/deita/#evol-quality-quality-evaluation","title":"EVOL-QUALITY: Quality Evaluation","text":"

Now that we have scored the complexity of the instructions, we will focus on the quality of the responses. Similar to EVOL COMPLEXITY, the authors introduced EVOL QUALITY, a method based on LLMs, instead of humans, to automatically score the quality of the response.

From an instruction-response pair, \\((I, R)\\), the goal is to make the response evolve into a more helpful and relevant response. The key difference is that we need to also provide the first instruction to guide evolution. Let's take back our example from GPT-4-LLM.

Here we have the response \\(response_0\\) and its initial instruction \\(instruction_0\\):

instruction_0 = \"Give three tips for staying healthy.\"\nreponse_0 = \"1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases. 2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week. 3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.\"\n

Again the authors provided several prompts you could use to make your response evolve according to some guidelines. For example, this one was used to enrich the answer:

PROMPT = \"\"\"I want you to act as a Response Rewriter\nYour goal is to enhance the quality of the response given by an AI assistant\nto the #Given Prompt# through rewriting.\nBut the rewritten response must be reasonable and must be understood by humans.\nYour rewriting cannot omit the non-text parts such as the table and code in\n#Given Prompt# and #Given Response#. Also, please do not omit the input\nin #Given Prompt#.\nYou Should enhance the quality of the response using the following method:\nPlease make the Response more in-depth\nYou should try your best not to make the #Rewritten Response# become verbose,\n#Rewritten Response# can only add 10 to 20 words into #Given Response#.\n\u2018#Given Response#\u2019, \u2018#Rewritten Response#\u2019, \u2018given response\u2019 and \u2018rewritten response\u2019\nare not allowed to appear in #Rewritten Response#\n#Given Prompt#:\n<instruction_0>\n#Given Response#:\n<response_0>\n#Rewritten Response#:\n\"\"\"\n

Prompting this to an LLM, you will automatically get a more enriched response, called \\(response_1\\), from an initial response \\(response_0\\) and initial instruction \\(instruction_0\\):

evol_response_quality = EvolQuality(\n    name=\"evol_response_quality\",\n    llm=OpenAILLM(model=\"gpt-3.5-turbo\"),\n    num_evolutions=5,\n    store_evolutions=True,\n    include_original_response=True,\n    input_mappings={\n        \"instruction\": \"evolved_instruction\",\n        \"response\": \"answer\",\n    },\n    pipeline=pipeline,\n)\n\nevol_response_quality.load()\n\n_evolved_responses = next(evol_response_quality.process([{\"instruction\": PROMPT + instruction_0, \"response\": reponse_0}]))\n\nprint(\"Original Response:\")\nprint(reponse_0)\nprint(\"\\nEvolved Response:\")\nprint(*_evolved_responses[0]['evolved_responses'], sep=\"\\n\")\n

And now, as in EVOL COMPLEXITY you iterate through this path and use different prompts to make your responses more relevant, helpful or creative. In the paper, they make 4 more iterations to get 5 evolved responses \\((R0, R1, R2, R3, R4)\\) which makes 5 different responses for one initial instruction at the end of this step.

response_quality_scorer = QualityScorer(\n    name=\"response_quality_scorer\",\n    llm=OpenAILLM(model=\"gpt-3.5-turbo\"),\n    input_mappings={\n        \"instruction\": \"evolved_instruction\",\n        \"responses\": \"evolved_responses\",\n    },\n    pipeline=pipeline,\n)\n\nexpand_evolved_responses = ExpandColumns(\n    name=\"expand_evolved_responses\",\n    columns=[\"evolved_responses\", \"scores\"],\n    output_mappings={\n        \"evolved_responses\": \"evolved_response\",\n        \"scores\": \"evol_response_score\",\n    },\n    pipeline=pipeline,\n)\n\nresponse_quality_scorer.load()\n\n_scored_responses = next(response_quality_scorer.process([{\"instruction\": PROMPT + instruction_0, \"responses\": _evolved_responses[0]['evolved_responses']}]))\n\nprint(\"Original Response:\")\nprint(reponse_0)\n\nprint(\"\\nScore, Evolved Response:\")\nprint(*zip(_scored_responses[0][\"scores\"], _evolved_responses[0]['evolved_responses']), sep=\"\\n\")\n

Output:

Original Response:\n1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases. 2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week. 3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.\n\nScore, Evolved Response:\n(4.0, 'Here are three essential tips for maintaining good health: \\n1. Prioritize regular exercise \\n2. Eat a balanced diet with plenty of fruits and vegetables \\n3. Get an adequate amount of sleep each night.')\n(2.0, 'Here are three effective strategies to maintain a healthy lifestyle.')\n(5.0, 'Here are three practical tips to maintain good health: Ensure a balanced diet, engage in regular exercise, and prioritize sufficient sleep. These practices support overall well-being.')\n
"},{"location":"sections/pipeline_samples/papers/deita/#improving-data-diversity","title":"Improving Data Diversity","text":"

One main component of good data to instruct-tune LLMs is diversity. Real-world data can often contain redundancy due to repetitive and homogeneous data.

The authors of the DEITA paper tackle the challenge of ensuring data diversity in the instruction tuning LLMs to avoid the pitfalls of data redundancy that can lead to over-fitting or poor generalization. They propose an embedding-based method to filter data for diversity. This method, called Repr Filter, uses embeddings generated by the Llama 1 13B model to represent instruction-response pairs in a vector space. The diversity of a new data sample is assessed based on the cosine distance between its embedding and that of its nearest neighbor in the already selected dataset. If this distance is greater than a specified threshold, the sample is considered diverse and is added to the selection. This process prioritizes diversity by assessing each sample's contribution to the variety of the dataset until the data selection budget is met. This approach effectively maintains the diversity of the data used for instruction tuning, as demonstrated by the DEITA models outperforming or matching state-of-the-art models with significantly less training data. In this implementation of DEITA we use the hidden state of the last layer of the Llama 2 model to generate embeddings, instead of a sentence transformer model, because we found that it improved the diversity of the data selection.

generate_conversation = ConversationTemplate(\n    name=\"generate_conversation\",\n    input_mappings={\n        \"instruction\": \"evolved_instruction\",\n        \"response\": \"evolved_response\",\n    },\n    pipeline=pipeline,\n)\n\ngenerate_embeddings = GenerateEmbeddings(\n    name=\"generate_embeddings\",\n    llm=TransformersLLM(\n        model=\"TinyLlama/TinyLlama-1.1B-Chat-v1.0\",\n        device=\"cuda\",\n        torch_dtype=\"float16\",\n    ),\n    input_mappings={\"text\": \"conversation\"},\n    input_batch_size=5,\n    pipeline=pipeline,\n)\n\ndeita_filtering = DeitaFiltering(name=\"deita_filtering\", pipeline=pipeline)\n
"},{"location":"sections/pipeline_samples/papers/deita/#build-the-distilabel-pipeline","title":"Build the \u2697 distilabel Pipeline","text":"

Now we're ready to build a distilabel pipeline using the DEITA method:

load_data.connect(evol_instruction_complexity)\nevol_instruction_complexity.connect(instruction_complexity_scorer)\ninstruction_complexity_scorer.connect(expand_evolved_instructions)\nexpand_evolved_instructions.connect(evol_response_quality)\nevol_response_quality.connect(response_quality_scorer)\nresponse_quality_scorer.connect(expand_evolved_responses)\nexpand_evolved_responses.connect(generate_conversation)\ngenerate_conversation.connect(generate_embeddings)\ngenerate_embeddings.connect(deita_filtering)\n

Now we can run the pipeline. We use the step names to reference them in the pipeline configuration:

distiset = pipeline.run(\n    parameters={\n        \"load_data\": {\n            \"repo_id\": \"distilabel-internal-testing/instruction-dataset-50\",\n            \"split\": \"train\",\n        },\n        \"evol_instruction_complexity\": {\n            \"llm\": {\"generation_kwargs\": {\"max_new_tokens\": 512, \"temperature\": 0.7}}\n        },\n        \"instruction_complexity_scorer\": {\n            \"llm\": {\"generation_kwargs\": {\"temperature\": 0.0}}\n        },\n        \"evol_response_quality\": {\n            \"llm\": {\"generation_kwargs\": {\"max_new_tokens\": 512, \"temperature\": 0.7}}\n        },\n        \"response_quality_scorer\": {\"llm\": {\"generation_kwargs\": {\"temperature\": 0.0}}},\n        \"deita_filtering\": {\"data_budget\": 500, \"diversity_threshold\": 0.04},\n    },\n    use_cache=False,\n)\n

We can push the results to the Hugging Face Hub:

distiset.push_to_hub(\"distilabel-internal-testing/deita-colab\")\n
"},{"location":"sections/pipeline_samples/papers/deita/#results","title":"Results","text":"

Again, to show the relevance of the EVOL QUALITY method, the authors evaluated on MT-bench models fine-tuned with different data selections, chosen according to how the quality of a response to an instruction is defined. Each time they selected 6k data points according to the quality score:

Credit: Liu et al. (2023)

The score is much better when selecting data with the EVOL QUALITY method than when we select randomly or according to the length, making a more qualitative response if longer. Nevertheless, we see that the margin we may have seen in the complexity score is thinner. And we'll discuss the strategy in a later part. Nevertheless, this strategy looks to improve the fine-tuning compared to the baselines and now we're interested in mixing quality and complexity assessment with a diversity evaluation to find the right trade-off in our selection process.

"},{"location":"sections/pipeline_samples/papers/deita/#conclusion","title":"Conclusion","text":"

In conclusion, if you are looking for some efficient method to align an open-source LLM to your business case with a constrained budget, the solutions provided by DEITA are really worth the shot. This data-centric approach enables one to focus on the content of the dataset to have the best results instead of \"just\" scaling the instruction-tuning with more, and surely less qualitative, data. In a nutshell, the strategy developed, through automatically scoring instructions-responses, aims to substitute the human preference step proprietary models such as GPT-4 have been trained with. There are a few improvements we could think about when it comes to how to select the good data, but it opens a really great way in instruct-tuning LLM with lower computational needs making the whole process intellectually relevant and more sustainable than most of the other methods. We'd be happy to help you out with aligning an LLM with your business case drawing inspiration from such a methodology.

"},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/","title":"Instruction Backtranslation","text":"

\"Self Alignment with Instruction Backtranslation\" presents a scalable method to build a high-quality instruction-following language model by automatically labeling human-written text with corresponding instructions. Their approach, named instruction backtranslation, starts with a language model finetuned on a small amount of seed data, and a given web corpus. The seed model is used to construct training examples by generating instruction prompts for web documents (self-augmentation), and then selecting high-quality examples from among these candidates (self-curation). This data is then used to finetune a stronger model.

Their self-training approach assumes access to a base language model, a small amount of seed data, and a collection of unlabelled examples, e.g. a web corpus. The unlabelled data is a large, diverse set of human-written documents that includes writing about all manner of topics humans are interested in \u2013 but crucially is not paired with instructions.

A first key assumption is that there exists some subset of this very large human-written text that would be suitable as gold generations for some user instructions. A second key assumption is that they can predict instructions for these candidate gold answers that can be used as high-quality example pairs to train an instruction-following model.

Their overall process, called instruction backtranslation, performs two core steps:

  1. Self-augment: Generate instructions for unlabelled data, i.e. the web corpus, to produce candidate training data of (instruction, output) pairs for instruction tuning.

  2. Self-curate: Self-select high-quality demonstration examples as training data to finetune the base model to follow instructions. This approach is done iteratively where a better intermediate instruction-following model can improve on selecting data for finetuning in the next iteration.

This replication covers the self-curation step i.e. the second/latter step as mentioned above, so as to be able to use the proposed prompting approach to rate the quality of the generated text, which can either be synthetically generated or real human-written text.

"},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/#replication","title":"Replication","text":"

To replicate the paper we will be using distilabel and a smaller dataset created by the Hugging Face H4 team named HuggingFaceH4/instruction-dataset for testing purposes.

"},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/#installation","title":"Installation","text":"

To replicate Self Alignment with Instruction Backtranslation one will need to install distilabel as follows:

pip install \"distilabel[hf-inference-endpoints,openai]>=1.0.0\"\n

Since we will be using InferenceEndpointsLLM (installed via the extra hf-inference-endpoints), we will need to deploy the endpoints in advance, either locally or in the Hugging Face Hub (alternatively, the serverless endpoints can also be used, but most of the time their inference is slower, and there's a limited quota because they are free), and set both the HF_TOKEN (to use the InferenceEndpointsLLM) and the OPENAI_API_KEY (to use the OpenAILLM) environment variables.

"},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/#building-blocks","title":"Building blocks","text":"
  • LoadDataFromHub: Generator Step to load a dataset from the Hugging Face Hub.
  • TextGeneration: Task to generate responses for a given instruction using an LLM.
    • InferenceEndpointsLLM: LLM that runs a model from an Inference Endpoint in the Hugging Face Hub.
  • InstructionBacktranslation: Task that generates a score and a reason for a response for a given instruction using the Self Alignment with Instruction Backtranslation prompt.
    • OpenAILLM: LLM that loads a model from OpenAI.
"},{"location":"sections/pipeline_samples/papers/instruction_backtranslation/#code","title":"Code","text":"

As mentioned before, we will put the previously mentioned building blocks together to replicate Self Alignment with Instruction Backtranslation.

from distilabel.models import InferenceEndpointsLLM, OpenAILLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromHub, KeepColumns\nfrom distilabel.steps.tasks import InstructionBacktranslation, TextGeneration\n\n\nwith Pipeline(name=\"self-alignment-with-instruction-backtranslation\") as pipeline:\n    load_hub_dataset = LoadDataFromHub(\n        name=\"load_dataset\",\n        output_mappings={\"prompt\": \"instruction\"},\n    )\n\n    text_generation = TextGeneration(\n        name=\"text_generation\",\n        llm=InferenceEndpointsLLM(\n            base_url=\"<INFERENCE_ENDPOINT_URL>\",\n            tokenizer_id=\"argilla/notus-7b-v1\",\n            model_display_name=\"argilla/notus-7b-v1\",\n        ),\n        input_batch_size=10,\n        output_mappings={\"model_name\": \"generation_model\"},\n    )\n\n    instruction_backtranslation = InstructionBacktranslation(\n        name=\"instruction_backtranslation\",\n        llm=OpenAILLM(model=\"gpt-4\"),\n        input_batch_size=10,\n        output_mappings={\"model_name\": \"scoring_model\"},\n    )\n\n    keep_columns = KeepColumns(\n        name=\"keep_columns\",\n        columns=[\n            \"instruction\",\n            \"generation\",\n            \"generation_model\",\n            \"score\",\n            \"reason\",\n            \"scoring_model\",\n        ],\n    )\n\n    load_hub_dataset >> text_generation >> instruction_backtranslation >> keep_columns\n

Then we need to call pipeline.run with the runtime parameters so that the pipeline can be launched.

distiset = pipeline.run(\n    parameters={\n        load_hub_dataset.name: {\n            \"repo_id\": \"HuggingFaceH4/instruction-dataset\",\n            \"split\": \"test\",\n        },\n        text_generation.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 1024,\n                    \"temperature\": 0.7,\n                },\n            },\n        },\n        instruction_backtranslation.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 1024,\n                    \"temperature\": 0.7,\n                },\n            },\n        },\n    },\n)\n

Finally, we can optionally push the generated dataset, named Distiset, to the Hugging Face Hub via the push_to_hub method, so that each subset generated in the leaf steps is pushed to the Hub.

distiset.push_to_hub(\n    \"instruction-backtranslation-instruction-dataset\",\n    private=True,\n)\n
"},{"location":"sections/pipeline_samples/papers/prometheus/","title":"Prometheus 2","text":"

\"Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models\" presents Prometheus 2, a new and more powerful evaluator LLM compared to Prometheus (its predecessor) presented in \"Prometheus: Inducing Fine-grained Evaluation Capability in Language Models\"; since GPT-4, as well as other proprietary LLMs, are commonly used to assess the quality of the responses for various LLMs, but there are concerns about transparency, controllability, and affordability, that motivate the need of open-source LLMs specialized in evaluations.

Existing open evaluator LMs exhibit critical shortcomings:

  1. They issue scores that significantly diverge from those assigned by humans.
  2. They lack the flexibility to perform both direct assessment and pairwise ranking, the two most prevalent forms of assessment.

Additionally, they do not possess the ability to evaluate based on custom evaluation criteria, focusing instead on general attributes like helpfulness and harmlessness. Prometheus 2 is capable of processing both direct assessment and pair-wise ranking formats grouped with user-defined evaluation criteria.

Prometheus 2 released two variants:

  • prometheus-eval/prometheus-7b-v2.0: fine-tuned on top of mistralai/Mistral-7B-Instruct-v0.2
  • prometheus-eval/prometheus-8x7b-v2.0: fine-tuned on top of mistralai/Mixtral-8x7B-Instruct-v0.1

Both models have been fine-tuned for both direct assessment and pairwise ranking tasks i.e. assessing the quality of a single isolated response for a given instruction with or without a reference answer and assessing the quality of one response against another one for a given instruction with or without a reference answer, respectively.

On four direct assessment benchmarks and four pairwise ranking benchmarks, Prometheus 2 scores the highest correlation and agreement with humans and proprietary LM judges among all tested open evaluator LMs. Their models, code, and data are all publicly available at prometheus-eval/prometheus-eval.

"},{"location":"sections/pipeline_samples/papers/prometheus/#replication","title":"Replication","text":"

Note

The section is named Replication but in this case we're not replicating the Prometheus 2 paper per se, but rather showing how to use the PrometheusEval task implemented within distilabel to evaluate the quality of the responses from a given instruction using the Prometheus 2 model.

To showcase Prometheus 2 we will be using the PrometheusEval task implemented in distilabel and a smaller dataset created by the Hugging Face H4 team named HuggingFaceH4/instruction-dataset for testing purposes.

"},{"location":"sections/pipeline_samples/papers/prometheus/#installation","title":"Installation","text":"

To reproduce the code below, one will need to install distilabel as follows:

pip install \"distilabel[vllm]>=1.1.0\"\n

Additionally, it's recommended to install Dao-AILab/flash-attention to benefit from Flash Attention 2 speed-ups during inference via vllm.

pip install flash-attn --no-build-isolation\n

Note

The installation notes above assume that you are using a VM with one GPU accelerator with at least the required VRAM to fit prometheus-eval/prometheus-7b-v2.0 in bfloat16 (28GB); but if you have enough VRAM to fit their 8x7B model in bfloat16 (~90GB) you can use prometheus-eval/prometheus-8x7b-v2.0 instead.

"},{"location":"sections/pipeline_samples/papers/prometheus/#building-blocks","title":"Building blocks","text":"
  • LoadDataFromHub: GeneratorStep to load a dataset from the Hugging Face Hub.

  • PrometheusEval: Task that assesses the quality of a response for a given instruction using any of the Prometheus 2 models.

    • vLLM: LLM that loads a model from the Hugging Face Hub via vllm-project/vllm.

    Note

    Since the Prometheus 2 models use a slightly different chat template than mistralai/Mistral-7B-Instruct-v0.2, we need to set the chat_template parameter to [INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST] so as to properly format the input for Prometheus 2.

  • (Optional) KeepColumns: Task that keeps only the specified columns in the dataset, used to remove the undesired columns.

"},{"location":"sections/pipeline_samples/papers/prometheus/#code","title":"Code","text":"

As mentioned before, we will put the previously mentioned building blocks together to see how Prometheus 2 can be used via distilabel.

from distilabel.models import vLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import KeepColumns, LoadDataFromHub\nfrom distilabel.steps.tasks import PrometheusEval\n\nif __name__ == \"__main__\":\n    with Pipeline(name=\"prometheus\") as pipeline:\n        load_dataset = LoadDataFromHub(\n            name=\"load_dataset\",\n            repo_id=\"HuggingFaceH4/instruction-dataset\",\n            split=\"test\",\n            output_mappings={\"prompt\": \"instruction\", \"completion\": \"generation\"},\n        )\n\n        task = PrometheusEval(\n            name=\"task\",\n            llm=vLLM(\n                model=\"prometheus-eval/prometheus-7b-v2.0\",\n                chat_template=\"[INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST]\",\n            ),\n            mode=\"absolute\",\n            rubric=\"factual-validity\",\n            reference=False,\n            num_generations=1,\n            group_generations=False,\n        )\n\n        keep_columns = KeepColumns(\n            name=\"keep_columns\",\n            columns=[\"instruction\", \"generation\", \"feedback\", \"result\", \"model_name\"],\n        )\n\n        load_dataset >> task >> keep_columns\n

Then we need to call pipeline.run with the runtime parameters so that the pipeline can be launched.

distiset = pipeline.run(\n    parameters={\n        task.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 1024,\n                    \"temperature\": 0.7,\n                },\n            },\n        },\n    },\n)\n

Finally, we can optionally push the generated dataset, named Distiset, to the Hugging Face Hub via the push_to_hub method, so that each subset generated in the leaf steps is pushed to the Hub.

distiset.push_to_hub(\n    \"instruction-dataset-prometheus\",\n    private=True,\n)\n
"},{"location":"sections/pipeline_samples/papers/ultrafeedback/","title":"UltraFeedback","text":"

UltraFeedback: Boosting Language Models with High-quality Feedback is a paper published by OpenBMB which proposes UltraFeedback, a large-scale, fine-grained, diverse preference dataset, used for training powerful reward models and critic models.

UltraFeedback collects about 64k prompts from diverse resources (including UltraChat, ShareGPT, Evol-Instruct, TruthfulQA, FalseQA, and FLAN), then they use these prompts to query multiple LLMs (commercial models, Llama models ranging 7B to 70B, and non-Llama models) and generate four different responses for each prompt, resulting in a total of 256k samples i.e. the UltraFeedback will rate four responses on every OpenAI request.

To collect high-quality preference and textual feedback, they design a fine-grained annotation instruction, which contains four different aspects, namely instruction-following, truthfulness, honesty and helpfulness (even though within the paper they also mention a fifth one named verbalized calibration). Finally, GPT-4 is used to generate the ratings for the generated responses to the given prompt using the previously mentioned aspects.

"},{"location":"sections/pipeline_samples/papers/ultrafeedback/#replication","title":"Replication","text":"

To replicate the paper we will be using distilabel and a smaller dataset created by the Hugging Face H4 team named HuggingFaceH4/instruction-dataset for testing purposes.

Also for testing purposes we will just show how to evaluate the generated responses for a given prompt using a new global aspect named overall-rating defined by Argilla, that computes the average of the four aspects, so as to reduce number of requests to be sent to OpenAI, but note that all the aspects are implemented within distilabel and can be used instead for a more faithful reproduction. Besides that we will generate three responses for each instruction using three LLMs selected from a pool of six: HuggingFaceH4/zephyr-7b-beta, argilla/notus-7b-v1, google/gemma-1.1-7b-it, meta-llama/Meta-Llama-3-8B-Instruct, HuggingFaceH4/zephyr-7b-gemma-v0.1 and mlabonne/UltraMerge-7B.

"},{"location":"sections/pipeline_samples/papers/ultrafeedback/#installation","title":"Installation","text":"

To replicate UltraFeedback one will need to install distilabel as follows:

pip install \"distilabel[argilla,openai,vllm]>=1.0.0\"\n

And since we will be using vllm we will need to use a VM with at least 6 NVIDIA GPUs with at least 16GB of memory each to run the text generation, and set the OPENAI_API_KEY environment variable value.

"},{"location":"sections/pipeline_samples/papers/ultrafeedback/#building-blocks","title":"Building blocks","text":"
  • LoadDataFromHub: Generator Step to load a dataset from the Hugging Face Hub.
  • sample_n_steps: Function to create a routing_batch_function that samples n downstream steps for each batch generated by the upstream step. This is the key to replicate the LLM pooling mechanism described in the paper.
  • TextGeneration: Task to generate responses for a given instruction using an LLM.
    • vLLM: LLM that loads a model from the Hugging Face Hub using vllm.
  • GroupColumns: Task that combines multiple columns into a single one i.e. from string to list of strings. Useful when there are multiple parallel steps that are connected to the same node.
  • UltraFeedback: Task that generates ratings for the responses of a given instruction using the UltraFeedback prompt.
    • OpenAILLM: LLM that loads a model from OpenAI.
  • KeepColumns: Task to keep the desired columns while removing the not needed ones, as well as defining the order for those.
  • (optional) PreferenceToArgilla: Task to optionally push the generated dataset to Argilla to do some further analysis and human annotation.
"},{"location":"sections/pipeline_samples/papers/ultrafeedback/#code","title":"Code","text":"

As mentioned before, we will put the previously mentioned building blocks together to replicate UltraFeedback.

from distilabel.models import OpenAILLM, vLLM\nfrom distilabel.pipeline import Pipeline, sample_n_steps\nfrom distilabel.steps import (\n    GroupColumns,\n    KeepColumns,\n    LoadDataFromHub,\n    PreferenceToArgilla,\n)\nfrom distilabel.steps.tasks import TextGeneration, UltraFeedback\n\nsample_three_llms = sample_n_steps(n=3)\n\n\nwith Pipeline(name=\"ultrafeedback-pipeline\") as pipeline:\n    load_hub_dataset = LoadDataFromHub(\n        name=\"load_dataset\",\n        output_mappings={\"prompt\": \"instruction\"},\n        batch_size=2,\n    )\n\n    text_generation_with_notus = TextGeneration(\n        name=\"text_generation_with_notus\",\n        llm=vLLM(model=\"argilla/notus-7b-v1\"),\n        input_batch_size=2,\n        output_mappings={\"model_name\": \"generation_model\"},\n    )\n    text_generation_with_zephyr = TextGeneration(\n        name=\"text_generation_with_zephyr\",\n        llm=vLLM(model=\"HuggingFaceH4/zephyr-7b-gemma-v0.1\"),\n        input_batch_size=2,\n        output_mappings={\"model_name\": \"generation_model\"},\n    )\n    text_generation_with_gemma = TextGeneration(\n        name=\"text_generation_with_gemma\",\n        llm=vLLM(model=\"google/gemma-1.1-7b-it\"),\n        input_batch_size=2,\n        output_mappings={\"model_name\": \"generation_model\"},\n    )\n    text_generation_with_zephyr_gemma = TextGeneration(\n        name=\"text_generation_with_zephyr_gemma\",\n        llm=vLLM(model=\"HuggingFaceH4/zephyr-7b-gemma-v0.1\"),\n        input_batch_size=2,\n        output_mappings={\"model_name\": \"generation_model\"},\n    )\n    text_generation_with_llama = TextGeneration(\n        name=\"text_generation_with_llama\",\n        llm=vLLM(model=\"meta-llama/Meta-Llama-3-8B-Instruct\"),\n        input_batch_size=2,\n        output_mappings={\"model_name\": \"generation_model\"},\n    )\n    text_generation_with_ultramerge = TextGeneration(\n        name=\"text_generation_with_ultramerge\",\n        
llm=vLLM(model=\"mlabonne/UltraMerge-7B\"),\n        input_batch_size=2,\n        output_mappings={\"model_name\": \"generation_model\"},\n    )\n\n    combine_columns = GroupColumns(\n        name=\"combine_columns\",\n        columns=[\"generation\", \"generation_model\"],\n        output_columns=[\"generations\", \"generation_models\"],\n        input_batch_size=2\n    )\n\n    ultrafeedback = UltraFeedback(\n        name=\"ultrafeedback_openai\",\n        llm=OpenAILLM(model=\"gpt-4-turbo-2024-04-09\"),\n        aspect=\"overall-rating\",\n        output_mappings={\"model_name\": \"ultrafeedback_model\"},\n    )\n\n    keep_columns = KeepColumns(\n        name=\"keep_columns\",\n        columns=[\n            \"instruction\",\n            \"generations\",\n            \"generation_models\",\n            \"ratings\",\n            \"rationales\",\n            \"ultrafeedback_model\",\n        ],\n    )\n\n    (\n        load_hub_dataset\n        >> sample_three_llms\n        >> [\n            text_generation_with_notus,\n            text_generation_with_zephyr,\n            text_generation_with_gemma,\n            text_generation_with_llama,\n            text_generation_with_zephyr_gemma,\n            text_generation_with_ultramerge\n        ]\n        >> combine_columns\n        >> ultrafeedback\n        >> keep_columns\n    )\n\n    # Optional: Push the generated dataset to Argilla, but will need to `pip install argilla` first\n    # push_to_argilla = PreferenceToArgilla(\n    #     name=\"push_to_argilla\",\n    #     api_url=\"<ARGILLA_API_URL>\",\n    #     api_key=\"<ARGILLA_API_KEY>\",  # type: ignore\n    #     dataset_name=\"ultrafeedback\",\n    #     dataset_workspace=\"admin\",\n    #     num_generations=2,\n    # )\n    # keep_columns >> push_to_argilla\n

Note

As we're using a relatively small dataset, we're setting a low batch_size and input_batch_size so we have more batches for the routing_batch_function i.e. we will have more variety in the LLMs used to generate the responses. When using a large dataset, it's recommended to use a larger batch_size and input_batch_size to benefit from the vLLM optimizations for larger batch sizes, which makes the pipeline execution faster.

Then we need to call pipeline.run with the runtime parameters so that the pipeline can be launched.

distiset = pipeline.run(\n    parameters={\n        load_hub_dataset.name: {\n            \"repo_id\": \"HuggingFaceH4/instruction-dataset\",\n            \"split\": \"test\",\n        },\n        text_generation_with_notus.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 512,\n                    \"temperature\": 0.7,\n                }\n            },\n        },\n        text_generation_with_zephyr.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 512,\n                    \"temperature\": 0.7,\n                }\n            },\n        },\n        text_generation_with_gemma.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 512,\n                    \"temperature\": 0.7,\n                }\n            },\n        },\n        text_generation_with_llama.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 512,\n                    \"temperature\": 0.7,\n                }\n            },\n        },\n        text_generation_with_zephyr_gemma.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 512,\n                    \"temperature\": 0.7,\n                }\n            },\n        },\n        text_generation_with_ultramerge.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 512,\n                    \"temperature\": 0.7,\n                }\n            },\n        },\n        ultrafeedback.name: {\n            \"llm\": {\n                \"generation_kwargs\": {\n                    \"max_new_tokens\": 2048,\n                    \"temperature\": 0.7,\n                }\n            },\n        },\n    }\n)\n

Finally, we can optionally push the generated dataset, named Distiset, to the Hugging Face Hub via the push_to_hub method, so that each subset generated in the leaf steps is pushed to the Hub.

distiset.push_to_hub(\n    \"ultrafeedback-instruction-dataset\",\n    private=True,\n)\n
"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/","title":"Synthetic data generation for fine-tuning custom retrieval and reranking models","text":"
!pip install \"distilabel[hf-inference-endpoints]\"\n
!pip install \"sentence-transformers~=3.0\"\n

Let's make the needed imports:

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.steps import LoadDataFromHub\n\nfrom sentence_transformers import SentenceTransformer, CrossEncoder\nimport torch\n

You'll need an HF_TOKEN to use the HF Inference Endpoints. Login to use it directly within this notebook.

import os\nfrom huggingface_hub import login\n\nlogin(token=os.getenv(\"HF_TOKEN\"), add_to_git_credential=True)\n
!pip install \"distilabel[argilla, hf-inference-endpoints]\"\n

Let's make the extra needed imports:

import argilla as rg\n
context = (\n\"\"\"\nThe text is a chunk from technical Python SDK documentation of Argilla.\nArgilla is a collaboration tool for AI engineers and domain experts to build high-quality datasets.\nAlong with prose explanations, the text chunk may include code snippets and Python references.\n\"\"\"\n)\n
llm = InferenceEndpointsLLM(\n    model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    tokenizer_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n)\n\nwith Pipeline(name=\"generate\") as pipeline:\n    load_dataset = LoadDataFromHub(\n        num_examples=15,\n        output_mappings={\"chunks\": \"anchor\"},\n    )\n    generate_retrieval_pairs = GenerateSentencePair(\n        name=\"generate_retrieval_pairs\",\n        triplet=True,\n        hard_negative=True,\n        action=\"query\",\n        llm=llm,\n        input_batch_size=10,\n        context=context,\n    )\n    generate_reranking_pairs = GenerateSentencePair(\n        name=\"generate_reranking_pairs\",\n        triplet=True,\n        hard_negative=False,  # to potentially generate non-relevant pairs\n        action=\"semantically-similar\",\n        llm=llm,\n        input_batch_size=10,\n        context=context,\n    )\n\n    load_dataset.connect(generate_retrieval_pairs, generate_reranking_pairs)\n

Next, we can execute this using pipeline.run. We will provide some parameters to specific components within our pipeline.

generation_kwargs = {\n    \"llm\": {\n        \"generation_kwargs\": {\n            \"temperature\": 0.7,\n            \"max_new_tokens\": 512,\n        }\n    }\n}\n\ndistiset = pipeline.run(  \n    parameters={\n        load_dataset.name: {\n            \"repo_id\": \"plaguss/argilla_sdk_docs_raw_unstructured\",\n            \"split\": \"train\",\n        },\n        generate_retrieval_pairs.name: generation_kwargs,\n        generate_reranking_pairs.name: generation_kwargs,\n    },\n    use_cache=False,  # False for demo\n)\n

Data generation can be expensive, so it is recommended to store the data somewhere. For now, we will store it on the Hugging Face Hub, using our push_to_hub method.

distiset.push_to_hub(\"[your-owner-name]/example-retrieval-reranking-dataset\")\n

We have got 2 different leaf/end nodes, therefore we've got two distiset configurations we can access, one for the retrieval data, and one for the reranking data.

Looking at these initial examples, we can see they nicely capture the essence of the chunks column but we will need to evaluate the quality of the data a bit more before we can use it for fine-tuning.

model_id = \"Snowflake/snowflake-arctic-embed-m\"  # Hugging Face model ID\n\nmodel_retrieval = SentenceTransformer(\n    model_id, device=\"cuda\" if torch.cuda.is_available() else \"cpu\"\n)\n

Next, we will encode the generated text pairs and compute the similarities.

from sklearn.metrics.pairwise import cosine_similarity\n\ndef get_embeddings(texts):\n    vectors = model_retrieval.encode(texts)\n    return [vector.tolist() for vector in vectors]\n\n\ndef get_similarities(vector_batch_a, vector_batch_b):\n    similarities = []\n    for vector_a, vector_b in zip(vector_batch_a, vector_batch_b):\n        similarity = cosine_similarity([vector_a], [vector_b])[0][0]\n        similarities.append(similarity)\n    return similarities\n\ndef format_data_retriever(batch):# -&gt; Any:\n    batch[\"anchor-vector\"] = get_embeddings(batch[\"anchor\"])\n    batch[\"positive-vector\"] = get_embeddings(batch[\"positive\"])\n    batch[\"negative-vector\"] = get_embeddings(batch[\"negative\"])    \n    batch[\"similarity-positive-negative\"] = get_similarities(batch[\"positive-vector\"], batch[\"negative-vector\"])\n    batch[\"similarity-anchor-positive\"] = get_similarities(batch[\"anchor-vector\"], batch[\"positive-vector\"])\n    batch[\"similarity-anchor-negative\"] = get_similarities(batch[\"anchor-vector\"], batch[\"negative-vector\"])\n    return batch\n\ndataset_generate_retrieval_pairs = distiset[\"generate_retrieval_pairs\"][\"train\"].map(format_data_retriever, batched=True, batch_size=250)\n
model_id = \"sentence-transformers/all-MiniLM-L12-v2\"\n\nmodel = CrossEncoder(model_id)\n

Next, we will compute the similarity for the generated text pairs using the reranker. On top of that, we will compute an anchor-vector to allow for doing semantic search.

def format_data_retriever(batch):# -&gt; Any:\n    batch[\"anchor-vector\"] = get_embeddings(batch[\"anchor\"])\n    batch[\"similarity-positive-negative\"] = model.predict(zip(batch[\"positive-vector\"], batch[\"negative-vector\"]))\n    batch[\"similarity-anchor-positive\"] = model.predict(zip(batch[\"anchor-vector\"], batch[\"positive-vector\"]))\n    batch[\"similarity-anchor-negative\"] = model.predict(zip(batch[\"anchor-vector\"], batch[\"negative-vector\"]))\n    return batch\n\ndataset_generate_reranking_pairs = distiset[\"generate_reranking_pairs\"][\"train\"].map(format_data_retriever, batched=True, batch_size=250)\n

And voila, we have our proxies for quality evaluation which we can use to filter out the best and worst examples.

First, we need to define the setting for our Argilla dataset. We will create two different datasets, one for the retrieval data and one for the reranking data to ensure our annotators can focus on the task at hand.

import argilla as rg\nfrom argilla._exceptions import ConflictError\n\napi_key = \"ohh so secret\"\napi_url = \"https://[your-owner-name]-[your-space-name].hf.space\"\n\nclient = rg.Argilla(api_url=api_url, api_key=api_key)\n\nsettings = rg.Settings(\n    fields=[\n        rg.TextField(\"anchor\")\n    ],\n    questions=[\n        rg.TextQuestion(\"positive\"),\n        rg.TextQuestion(\"negative\"),\n        rg.LabelQuestion(\n            name=\"is_positive_relevant\",\n            title=\"Is the positive query relevant?\",\n            labels=[\"yes\", \"no\"],\n        ),\n        rg.LabelQuestion(\n            name=\"is_negative_irrelevant\",\n            title=\"Is the negative query irrelevant?\",\n            labels=[\"yes\", \"no\"],\n        )\n    ],\n    metadata=[\n        rg.TermsMetadataProperty(\"filename\"),\n        rg.FloatMetadataProperty(\"similarity-positive-negative\"),\n        rg.FloatMetadataProperty(\"similarity-anchor-positive\"),\n        rg.FloatMetadataProperty(\"similarity-anchor-negative\"),\n    ],\n    vectors=[\n        rg.VectorField(\"anchor-vector\", dimensions=model.get_sentence_embedding_dimension())\n    ]\n)\nrg_datasets = []\nfor dataset_name in [\"generate_retrieval_pairs\", \"generate_reranking_pairs\"]:\n    ds = rg.Dataset(\n        name=dataset_name,\n        settings=settings\n    )\n    try:\n        ds.create()\n    except ConflictError:\n        ds = client.datasets(dataset_name)\n    rg_datasets.append(ds)\n

Now, we've got our dataset definitions setup in Argilla, we can upload our data to Argilla.

ds_datasets = [dataset_generate_retrieval_pairs, dataset_generate_reranking_pairs]\n\nrecords = []\n\nfor rg_dataset, ds_dataset in zip(rg_datasets, ds_datasets):\n    for idx, entry in enumerate(ds_dataset):\n        records.append(\n            rg.Record(\n                id=idx,\n                fields={\"anchor\": entry[\"anchor\"]},\n                suggestions=[\n                    rg.Suggestion(\"positive\", value=entry[\"positive\"], agent=\"gpt-4o\", type=\"model\"),\n                    rg.Suggestion(\"negative\", value=entry[\"negative\"], agent=\"gpt-4o\", type=\"model\"),\n                ],\n                metadata={\n                    \"filename\": entry[\"filename\"],\n                    \"similarity-positive-negative\": entry[\"similarity-positive-negative\"],\n                    \"similarity-anchor-positive\": entry[\"similarity-anchor-positive\"],\n                    \"similarity-anchor-negative\": entry[\"similarity-anchor-negative\"]\n                },\n                vectors={\"anchor-vector\": entry[\"anchor-vector\"]}\n            )\n        )\n    rg_dataset.records.log(records)\n

Now, we can explore the UI and add a final human touch to get the most out of our dataset.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#synthetic-data-generation-for-fine-tuning-custom-retrieval-and-reranking-models","title":"Synthetic data generation for fine-tuning custom retrieval and reranking models","text":"
  • Goal: Bootstrap, optimize and maintain your embedding models and rerankers through synthetic data generation and human feedback.
  • Libraries: argilla, hf-inference-endpoints, sentence-transformers
  • Components: LoadDataFromHub, GenerateSentencePair, InferenceEndpointsLLM

Note

For a comprehensive overview on optimizing the retrieval performance in a RAG pipeline, check this guide in collaboration with ZenML, an open-source MLOps framework designed for building portable and production-ready machine learning pipelines.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#getting-started","title":"Getting started","text":""},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#install-the-dependencies","title":"Install the dependencies","text":"

To complete this tutorial, you need to install the distilabel SDK and a few third-party libraries via pip. We will be using the free but rate-limited Hugging Face serverless Inference API for this tutorial, so we need to install this as an extra distilabel dependency. You can install them by running the following command:

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#optional-deploy-argilla","title":"(optional) Deploy Argilla","text":"

You can skip this step or replace it with any other data evaluation tool, but the quality of your model will suffer from a lack of data quality, so we do recommend looking at your data. If you already deployed Argilla, you can skip this step. Otherwise, you can quickly deploy Argilla following this guide.

Along with that, you will need to install Argilla as a distilabel extra.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#the-dataset","title":"The dataset","text":"

Before starting any project, it is always important to look at your data. Our data is publicly available on the Hugging Face Hub so we can have a quick look through their dataset viewer within an embedded iFrame.

As we can see, our dataset contains a column called chunks, which was obtained from the Argilla docs. Normally, you would need to download and chunk the data but we will not cover that in this tutorial. To read a full explanation for how this dataset was generated, please refer to How we leveraged distilabel to create an Argilla 2.0 Chatbot.

Alternatively, we can load the entire dataset to disk with datasets.load_dataset.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#synthetic-data-generation","title":"Synthetic data generation","text":"

The GenerateSentencePair component from distilabel can be used to generate training datasets for embeddings models.

It is a pre-defined Task that, given an anchor sentence, generates data for a specific action. Supported actions are: \"paraphrase\", \"semantically-similar\", \"query\", \"answer\". In our case the chunks column corresponds to the anchor. This means we will use query to generate potential queries for fine-tuning a retrieval model and that we will use semantically-similar to generate texts that are similar to the initial anchor for fine-tuning a reranking model.

We will set triplet=True in order to generate both positive and negative examples, which should help the model generalize better during fine-tuning, and we will set hard_negative=True to generate more challenging examples that are closer to the anchor and discussed topics.

Lastly, we can seed the LLM with context to generate more relevant examples.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#retrieval","title":"Retrieval","text":"

For retrieval, we will thus generate queries that are similar to the chunks column. We will use the query action to generate potential queries for fine-tuning a retrieval model.

generate_sentence_pair = GenerateSentencePair(\n    triplet=True,  \n    hard_negative=True,\n    action=\"query\",\n    llm=llm,\n    input_batch_size=10,\n    context=context,\n)\n
"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#reranking","title":"Reranking","text":"

For reranking, we will generate texts that are similar to the initial anchor. We will use the semantically-similar action to generate texts that are similar to the initial anchor for fine-tuning a reranking model. In this case, we set hard_negative=False to generate more diverse and potentially wrong examples, which can be used as negative examples for similarity fine-tuning because rerankers cannot be fine-tuned using triplets.

generate_sentence_pair = GenerateSentencePair(\n    triplet=True,\n    hard_negative=False,\n    action=\"semantically-similar\",\n    llm=llm,\n    input_batch_size=10,\n    context=context,\n)\n
"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#combined-pipeline","title":"Combined pipeline","text":"

We will now use the GenerateSentencePair task to generate synthetic data for both retrieval and reranking models in a single pipeline. Note that, we map the chunks column to the anchor argument.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#data-quality-evaluation","title":"Data quality evaluation","text":"

Data is never as clean as it can be, and this also holds for synthetically generated data; therefore, it is always good to spend some time looking at your data.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#feature-engineering","title":"Feature engineering","text":"

In order to evaluate the quality of our data we will use features of the models that we intend to fine-tune as a proxy for data quality. We can then use these features to filter out the best examples.

In order to choose a good default model, we will use the Massive Text Embedding Benchmark (MTEB) Leaderboard. We want to optimize for size and speed, so we will set model size <100M and then filter for Retrieval and Reranking based on the highest average score, resulting in Snowflake/snowflake-arctic-embed-s and sentence-transformers/all-MiniLM-L12-v2 respectively.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#retrieval_1","title":"Retrieval","text":"

For retrieval, we will compute similarities for the current embeddings of anchor-positive, positive-negative and anchor-negative pairs. We assume that an overlap of these similarities will cause the model to have difficulties generalizing and therefore we can use these features to evaluate the quality of our data.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#reranking_1","title":"Reranking","text":"

For reranking, we will compute the relevance scores from an existing reranker model for anchor-positive, positive-negative and anchor-negative pairs and make a similar assumption as for the retrieval model.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#optional-argilla","title":"(Optional) Argilla","text":"

To get the most out of your data and actually look at our data, we will use Argilla. If you are not familiar with Argilla, we recommend taking a look at the Argilla quickstart docs. Alternatively, you can use your Hugging Face account to login to the Argilla demo Space.

To start exploring data, we first need to define an argilla.Dataset. We will create a basic dataset with some input TextFields for the anchor and output TextQuestions for the positive and negative pairs. Additionally, we will use the file_name as MetaDataProperty. Lastly, we will be re-using the vectors obtained from our previous step to allow for semantic search and we will add the similarity scores for some basic filtering and sorting.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#fine-tuning","title":"Fine-tuning","text":"

At last, we can fine-tune our models. We will use the sentence-transformers library to fine-tune our models.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#retrieval_2","title":"Retrieval","text":"

For retrieval, we have created a script that fine-tunes a model on our generated data, based on https://github.com/argilla-io/argilla-sdk-chatbot/blob/main/train_embedding.ipynb. You can also open it in Google Colab directly.

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#reranking_2","title":"Reranking","text":"

For reranking, sentence-transformers provides a script that shows how to fine-tune CrossEncoder models. As of now, there is some uncertainty over fine-tuning CrossEncoder models with triplets but you can still use the positive and anchor

"},{"location":"sections/pipeline_samples/tutorials/GenerateSentencePair/#conclusions","title":"Conclusions","text":"

In this tutorial, we present an end-to-end example of fine-tuning retrievers and rerankers for RAG. This serves as a good starting point for optimizing and maintaining your data and model but needs to be adapted to your specific use case.

We started with some seed data from the Argilla docs, generated synthetic data for retrieval and reranking models, evaluated the quality of the data, and showed how to fine-tune the models. We also used Argilla to get a human touch on the data.

"},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/","title":"Clean an existing preference dataset","text":"
  • Goal: Clean an existing preference dataset by providing AI feedback on the quality of the data.
  • Libraries: argilla, hf-inference-endpoints
  • Components: LoadDataFromDicts, UltraFeedback, KeepColumns, PreferenceToArgilla, InferenceEndpointsLLM, GlobalStep
!pip install \"distilabel[hf-inference-endpoints]\"\n
!pip install \"transformers~=4.0\" \"torch~=2.0\"\n

Let's make the required imports:

import random\n\nfrom datasets import load_dataset\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import (\n    KeepColumns,\n    LoadDataFromDicts,\n    PreferenceToArgilla,\n)\nfrom distilabel.steps.tasks import UltraFeedback\n

You'll need an HF_TOKEN to use the HF Inference Endpoints. Login to use it directly within this notebook.

import os\nfrom huggingface_hub import login\n\nlogin(token=os.getenv(\"HF_TOKEN\"), add_to_git_credential=True)\n
!pip install \"distilabel[argilla, hf-inference-endpoints]\"\n

In this case, we will clean a preference dataset, so we will use the Intel/orca_dpo_pairs dataset from the Hugging Face Hub.

dataset = load_dataset(\"Intel/orca_dpo_pairs\", split=\"train[:20]\")\n

Next, we will shuffle the chosen and rejected columns to avoid any bias in the dataset.

def shuffle_and_track(chosen, rejected):\n    pair = [chosen, rejected]\n    random.shuffle(pair)\n    order = [\"chosen\" if x == chosen else \"rejected\" for x in pair]\n    return {\"generations\": pair, \"order\": order}\n\ndataset = dataset.map(lambda x: shuffle_and_track(x[\"chosen\"], x[\"rejected\"]))\n
dataset = dataset.to_list()\n
As a custom step

You can also create a custom step in a separate module, import it and add it to the pipeline after loading the orca_dpo_pairs dataset using the LoadDataFromHub step.

shuffle_step.py
from typing import TYPE_CHECKING, List\nfrom distilabel.steps import GlobalStep, StepInput\n\nif TYPE_CHECKING:\n    from distilabel.steps.typing import StepOutput\n\nimport random\n\nclass ShuffleStep(GlobalStep):\n    @property\n    def inputs(self):\n        \"\"\"Returns List[str]: The inputs of the step.\"\"\"\n        return [\"instruction\", \"chosen\", \"rejected\"]\n\n    @property\n    def outputs(self):\n        \"\"\"Returns List[str]: The outputs of the step.\"\"\"\n        return [\"instruction\", \"generations\", \"order\"]\n\n    def process(self, inputs: StepInput):\n        \"\"\"Returns StepOutput: The outputs of the step.\"\"\"\n        outputs = []\n\n        for input in inputs:\n            chosen = input[\"chosen\"]\n            rejected = input[\"rejected\"]\n            pair = [chosen, rejected]\n            random.shuffle(pair)\n            order = [\"chosen\" if x == chosen else \"rejected\" for x in pair]\n\n            outputs.append({\"instruction\": input[\"instruction\"], \"generations\": pair, \"order\": order})\n\n        yield outputs\n
from shuffle_step import ShuffleStep\n

To clean an existing preference dataset, we will need to define a Pipeline with all the necessary steps. However, a similar workflow can be used to clean an SFT dataset. Below, we will go over each step in detail.

load_dataset = LoadDataFromDicts(\n    data=dataset[:1],\n    output_mappings={\"question\": \"instruction\"},\n    pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nload_dataset.load()\nnext(load_dataset.process())\n
\n([{'system': '',\n   'question': \"You will be given a definition of a task first, then some input of the task.\\nThis task is about using the specified sentence and converting the sentence to Resource Description Framework (RDF) triplets of the form (subject, predicate object). The RDF triplets generated must be such that the triplets accurately capture the structure and semantics of the input sentence. The input is a sentence and the output is a list of triplets of the form [subject, predicate, object] that capture the relationships present in the sentence. When a sentence has more than 1 RDF triplet possible, the output must contain all of them.\\n\\nAFC Ajax (amateurs)'s ground is Sportpark De Toekomst where Ajax Youth Academy also play.\\nOutput:\",\n   'chosen': '[\\n  [\"AFC Ajax (amateurs)\", \"has ground\", \"Sportpark De Toekomst\"],\\n  [\"Ajax Youth Academy\", \"plays at\", \"Sportpark De Toekomst\"]\\n]',\n   'rejected': \" Sure, I'd be happy to help! Here are the RDF triplets for the input sentence:\\n\\n[AFC Ajax (amateurs), hasGround, Sportpark De Toekomst]\\n[Ajax Youth Academy, playsAt, Sportpark De Toekomst]\\n\\nExplanation:\\n\\n* AFC Ajax (amateurs) is the subject of the first triplet, and hasGround is the predicate that describes the relationship between AFC Ajax (amateurs) and Sportpark De Toekomst.\\n* Ajax Youth Academy is the subject of the second triplet, and playsAt is the predicate that describes the relationship between Ajax Youth Academy and Sportpark De Toekomst.\\n\\nNote that there may be other possible RDF triplets that could be derived from the input sentence, but the above triplets capture the main relationships present in the sentence.\",\n   'generations': [\" Sure, I'd be happy to help! 
Here are the RDF triplets for the input sentence:\\n\\n[AFC Ajax (amateurs), hasGround, Sportpark De Toekomst]\\n[Ajax Youth Academy, playsAt, Sportpark De Toekomst]\\n\\nExplanation:\\n\\n* AFC Ajax (amateurs) is the subject of the first triplet, and hasGround is the predicate that describes the relationship between AFC Ajax (amateurs) and Sportpark De Toekomst.\\n* Ajax Youth Academy is the subject of the second triplet, and playsAt is the predicate that describes the relationship between Ajax Youth Academy and Sportpark De Toekomst.\\n\\nNote that there may be other possible RDF triplets that could be derived from the input sentence, but the above triplets capture the main relationships present in the sentence.\",\n    '[\\n  [\"AFC Ajax (amateurs)\", \"has ground\", \"Sportpark De Toekomst\"],\\n  [\"Ajax Youth Academy\", \"plays at\", \"Sportpark De Toekomst\"]\\n]'],\n   'order': ['rejected', 'chosen']}],\n True)\n
evaluate_responses = UltraFeedback(\n    aspect=\"overall-rating\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n    ),\n    pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nevaluate_responses.load()\nnext(\n    evaluate_responses.process(\n        [\n            {\n                \"instruction\": \"What's the capital of Spain?\",\n                \"generations\": [\"Madrid\", \"Barcelona\"],\n            }\n        ]\n    )\n)\n
\n[{'instruction': \"What's the capital of Spain?\",\n  'generations': ['Madrid', 'Barcelona'],\n  'ratings': [5, 1],\n  'rationales': [\"The answer is correct, directly addressing the question, and is free of hallucinations or unnecessary details. It confidently provides the accurate information, aligning perfectly with the user's intent.\",\n   \"The answer is incorrect as Barcelona is not the capital of Spain. This introduces a significant inaccuracy, failing to provide helpful information and deviating entirely from the user's intent.\"],\n  'distilabel_metadata': {'raw_output_ultra_feedback_0': \"#### Output for Text 1\\nRating: 5 (Excellent)\\nRationale: The answer is correct, directly addressing the question, and is free of hallucinations or unnecessary details. It confidently provides the accurate information, aligning perfectly with the user's intent.\\n\\n#### Output for Text 2\\nRating: 1 (Low Quality)\\nRationale: The answer is incorrect as Barcelona is not the capital of Spain. This introduces a significant inaccuracy, failing to provide helpful information and deviating entirely from the user's intent.\"},\n  'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
keep_columns = KeepColumns(\n    columns=[\n        \"instruction\",\n        \"generations\",\n        \"order\",\n        \"ratings\",\n        \"rationales\",\n        \"model_name\",\n    ],\n    pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nkeep_columns.load()\nnext(\n    keep_columns.process(\n        [\n            {\n                \"system\": \"\",\n                \"instruction\": \"What's the capital of Spain?\",\n                \"chosen\": \"Madrid\",\n                \"rejected\": \"Barcelona\",\n                \"generations\": [\"Madrid\", \"Barcelona\"],\n                \"order\": [\"chosen\", \"rejected\"],\n                \"ratings\": [5, 1],\n                \"rationales\": [\"\", \"\"],\n                \"model_name\": \"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            }\n        ]\n    )\n)\n
\n[{'instruction': \"What's the capital of Spain?\",\n  'generations': ['Madrid', 'Barcelona'],\n  'order': ['chosen', 'rejected'],\n  'ratings': [5, 1],\n  'rationales': ['', ''],\n  'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
to_argilla = PreferenceToArgilla(\n    dataset_name=\"cleaned-dataset\",\n    dataset_workspace=\"argilla\",\n    api_url=\"https://[your-owner-name]-[your-space-name].hf.space\",\n    api_key=\"[your-api-key]\",\n    num_generations=2\n)\n

Below, you can see the full pipeline definition:

with Pipeline(name=\"clean-dataset\") as pipeline:\n\n    load_dataset = LoadDataFromDicts(\n        data=dataset, output_mappings={\"question\": \"instruction\"}\n    )\n\n    evaluate_responses = UltraFeedback(\n        aspect=\"overall-rating\",\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n        ),\n    )\n\n    keep_columns = KeepColumns(\n        columns=[\n            \"instruction\",\n            \"generations\",\n            \"order\",\n            \"ratings\",\n            \"rationales\",\n            \"model_name\",\n        ]\n    )\n\n    to_argilla = PreferenceToArgilla(\n        dataset_name=\"cleaned-dataset\",\n        dataset_workspace=\"argilla\",\n        api_url=\"https://[your-owner-name]-[your-space-name].hf.space\",\n        api_key=\"[your-api-key]\",\n        num_generations=2,\n    )\n\n    load_dataset.connect(evaluate_responses)\n    evaluate_responses.connect(keep_columns)\n    keep_columns.connect(to_argilla)\n

Let's now run the pipeline and clean our preference dataset.

distiset = pipeline.run()\n

Let's check it! If you have loaded the data to Argilla, you can start annotating in the Argilla UI.

You can push the dataset to the Hub for sharing with the community and embed it to explore the data.

distiset.push_to_hub(\"[your-owner-name]/example-cleaned-preference-dataset\")\n

In this tutorial, we showcased the detailed steps to build a pipeline for cleaning a preference dataset using distilabel. However, you can customize this pipeline for your own use cases, such as cleaning an SFT dataset or adding custom steps.

We used a preference dataset as our starting point and shuffled the data to avoid any bias. Next, we evaluated the responses using a model through the serverless Hugging Face Inference API, following the UltraFeedback standards. Finally, we kept the needed columns and used Argilla for further curation.

"},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#clean-an-existing-preference-dataset","title":"Clean an existing preference dataset","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#getting-started","title":"Getting Started","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#install-the-dependencies","title":"Install the dependencies","text":"

To complete this tutorial, you need to install the distilabel SDK and a few third-party libraries via pip. We will be using the free but rate-limited Hugging Face serverless Inference API for this tutorial, so we need to install this as an extra distilabel dependency. You can install them by running the following command:

"},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#optional-deploy-argilla","title":"(optional) Deploy Argilla","text":"

You can skip this step or replace it with any other data evaluation tool, but the quality of your model will suffer from a lack of data quality, so we do recommend looking at your data. If you already deployed Argilla, you can skip this step. Otherwise, you can quickly deploy Argilla following this guide.

Along with that, you will need to install Argilla as a distilabel extra.

"},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#the-dataset","title":"The dataset","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#define-the-pipeline","title":"Define the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#load-the-dataset","title":"Load the dataset","text":"

We will use the dataset we just shuffled as source data.

  • Component: LoadDataFromDicts
  • Input columns: system, question, chosen, rejected, generations and order, the same keys as in the loaded list of dictionaries.
  • Output columns: system, instruction, chosen, rejected, generations and order. We will use output_mappings to rename the columns.
"},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#evaluate-the-responses","title":"Evaluate the responses","text":"

To evaluate the quality of the responses, we will use meta-llama/Meta-Llama-3.1-70B-Instruct, applying the UltraFeedback task that judges the responses according to different dimensions (helpfulness, honesty, instruction-following, truthfulness). For an SFT dataset, you can use PrometheusEval instead.

  • Component: UltraFeedback task with LLMs using InferenceEndpointsLLM
  • Input columns: instruction, generations
  • Output columns: ratings, rationales, distilabel_metadata, model_name

For your use case and to improve the results, you can use any other LLM of your choice.

"},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#keep-only-the-required-columns","title":"Keep only the required columns","text":"

We will get rid of the unneeded columns.

  • Component: KeepColumns
  • Input columns: system, instruction, chosen, rejected, generations, order, ratings, rationales, distilabel_metadata and model_name
  • Output columns: instruction, generations, order, ratings, rationales and model_name
"},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#optional-further-data-curation","title":"(Optional) Further data curation","text":"

You can use Argilla to further curate your data.

  • Component: PreferenceToArgilla step
  • Input columns: instruction, generations, generation_models, ratings
  • Output columns: instruction, generations, generation_models, ratings
"},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#run-the-pipeline","title":"Run the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/clean_existing_dataset/#conclusions","title":"Conclusions","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/","title":"Generate a preference dataset","text":"
  • Goal: Generate a synthetic preference dataset for DPO/ORPO.
  • Libraries: argilla, hf-inference-endpoints
  • Components: LoadDataFromHub, TextGeneration, UltraFeedback, GroupColumns, FormatTextGenerationDPO, PreferenceToArgilla, InferenceEndpointsLLM
!pip install \"distilabel[hf-inference-endpoints]\"\n
!pip install \"transformers~=4.0\" \"torch~=2.0\"\n

Let's make the required imports:

from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import (\n    LoadDataFromHub,\n    GroupColumns,\n    FormatTextGenerationDPO,\n    PreferenceToArgilla,\n)\nfrom distilabel.steps.tasks import TextGeneration, UltraFeedback\n

You'll need an HF_TOKEN to use the HF Inference Endpoints. Log in to use it directly within this notebook.

import os\nfrom huggingface_hub import login\n\nlogin(token=os.getenv(\"HF_TOKEN\"), add_to_git_credential=True)\n
!pip install \"distilabel[argilla, hf-inference-endpoints]\"\n

To generate our preference dataset, we will need to define a Pipeline with all the necessary steps. Below, we will go over each step in detail.

load_dataset = LoadDataFromHub(\n        repo_id= \"argilla/10Kprompts-mini\",\n        num_examples=1,\n        pipeline=Pipeline(name=\"showcase-pipeline\"),\n    )\nload_dataset.load()\nnext(load_dataset.process())\n
\n([{'instruction': 'How can I create an efficient and robust workflow that utilizes advanced automation techniques to extract targeted data, including customer information, from diverse PDF documents and effortlessly integrate it into a designated Google Sheet? Furthermore, I am interested in establishing a comprehensive and seamless system that promptly activates an SMS notification on my mobile device whenever a new PDF document is uploaded to the Google Sheet, ensuring real-time updates and enhanced accessibility.',\n   'topic': 'Software Development'}],\n True)\n
generate_responses = [\n    TextGeneration(\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n            generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n        ),\n        pipeline=Pipeline(name=\"showcase-pipeline\"),\n    ),\n    TextGeneration(\n        llm=InferenceEndpointsLLM(\n            model_id=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n            tokenizer_id=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n            generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n        ),\n        pipeline=Pipeline(name=\"showcase-pipeline\"),\n    ),\n]\nfor task in generate_responses:\n    task.load()\n    print(next(task.process([{\"instruction\": \"Which are the top cities in Spain?\"}])))\n
\n[{'instruction': 'Which are the top cities in Spain?', 'generation': 'Spain is a country with a rich culture, history, and architecture, and it has many great cities to visit. Here are some of the top cities in Spain:\\n\\n1. **Madrid**: The capital city of Spain, known for its vibrant nightlife, museums, and historic landmarks like the Royal Palace and Prado Museum.\\n2. **Barcelona**: The second-largest city in Spain, famous for its modernist architecture, beaches, and iconic landmarks like La Sagrada Fam\u00edlia and Park G\u00fcell, designed by Antoni Gaud\u00ed.\\n3. **Valencia**: Located on the Mediterranean coast, Valencia is known for its beautiful beaches, City of Arts and Sciences, and delicious local cuisine, such as paella.\\n4. **Seville**: The capital of Andalusia, Seville is famous for its stunning cathedral, Royal Alc\u00e1zar Palace, and lively flamenco music scene.\\n5. **M\u00e1laga**: A coastal city in southern Spain, M\u00e1laga is known for its rich history, beautiful beaches, and being the birthplace of Pablo Picasso.\\n6. **Zaragoza**: Located in the northeastern region of Aragon, Zaragoza is a city with a rich history, known for its Roman ruins, Gothic cathedral, and beautiful parks.\\n7. **Granada**: A city in the Andalusian region, Granada is famous for its stunning Alhambra palace and generalife gardens, a UNESCO World Heritage Site.\\n8. **Bilbao**: A city in the Basque Country, Bilbao is known for its modern architecture, including the Guggenheim Museum, and its rich cultural heritage.\\n9. **Alicante**: A coastal city in the Valencia region, Alicante is famous for its beautiful beaches, historic castle, and lively nightlife.\\n10. 
**San Sebasti\u00e1n**: A city in the Basque Country, San Sebasti\u00e1n is known for its stunning beaches, gastronomic scene, and cultural events like the San Sebasti\u00e1n International Film Festival.\\n\\nThese are just a few of the many great cities in Spain, each with its own unique character and attractions.', 'distilabel_metadata': {'raw_output_text_generation_0': 'Spain is a country with a rich culture, history, and architecture, and it has many great cities to visit. Here are some of the top cities in Spain:\\n\\n1. **Madrid**: The capital city of Spain, known for its vibrant nightlife, museums, and historic landmarks like the Royal Palace and Prado Museum.\\n2. **Barcelona**: The second-largest city in Spain, famous for its modernist architecture, beaches, and iconic landmarks like La Sagrada Fam\u00edlia and Park G\u00fcell, designed by Antoni Gaud\u00ed.\\n3. **Valencia**: Located on the Mediterranean coast, Valencia is known for its beautiful beaches, City of Arts and Sciences, and delicious local cuisine, such as paella.\\n4. **Seville**: The capital of Andalusia, Seville is famous for its stunning cathedral, Royal Alc\u00e1zar Palace, and lively flamenco music scene.\\n5. **M\u00e1laga**: A coastal city in southern Spain, M\u00e1laga is known for its rich history, beautiful beaches, and being the birthplace of Pablo Picasso.\\n6. **Zaragoza**: Located in the northeastern region of Aragon, Zaragoza is a city with a rich history, known for its Roman ruins, Gothic cathedral, and beautiful parks.\\n7. **Granada**: A city in the Andalusian region, Granada is famous for its stunning Alhambra palace and generalife gardens, a UNESCO World Heritage Site.\\n8. **Bilbao**: A city in the Basque Country, Bilbao is known for its modern architecture, including the Guggenheim Museum, and its rich cultural heritage.\\n9. **Alicante**: A coastal city in the Valencia region, Alicante is famous for its beautiful beaches, historic castle, and lively nightlife.\\n10. 
**San Sebasti\u00e1n**: A city in the Basque Country, San Sebasti\u00e1n is known for its stunning beaches, gastronomic scene, and cultural events like the San Sebasti\u00e1n International Film Festival.\\n\\nThese are just a few of the many great cities in Spain, each with its own unique character and attractions.'}, 'model_name': 'meta-llama/Meta-Llama-3-8B-Instruct'}]\n[{'instruction': 'Which are the top cities in Spain?', 'generation': ' Here are some of the top cities in Spain based on various factors such as tourism, culture, history, and quality of life:\\n\\n1. Madrid: The capital and largest city in Spain, Madrid is known for its vibrant nightlife, world-class museums (such as the Prado Museum and Reina Sofia Museum), stunning parks (such as the Retiro Park), and delicious food.\\n\\n2. Barcelona: Famous for its unique architecture, Barcelona is home to several UNESCO World Heritage sites designed by Antoni Gaud\u00ed, including the Sagrada Familia and Park G\u00fcell. The city also boasts beautiful beaches, a lively arts scene, and delicious Catalan cuisine.\\n\\n3. Valencia: A coastal city located in the east of Spain, Valencia is known for its City of Arts and Sciences, a modern architectural complex that includes a planetarium, opera house, and museum of interactive science. The city is also famous for its paella, a traditional Spanish dish made with rice, vegetables, and seafood.\\n\\n4. Seville: The capital of Andalusia, Seville is famous for its flamenco dancing, stunning cathedral (the largest Gothic cathedral in the world), and the Alc\u00e1zar, a beautiful palace made up of a series of rooms and courtyards.\\n\\n5. Granada: Located in the foothills of the Sierra Nevada mountains, Granada is known for its stunning Alhambra palace, a Moorish fortress that dates back to the 9th century. The city is also famous for its tapas, a traditional Spanish dish that is often served for free with drinks.\\n\\n6. 
Bilbao: A city in the Basque Country, Bilbao is famous for its modern architecture, including the Guggenheim Museum, a contemporary art museum designed by Frank Gehry. The city is also known for its pintxos, a type of Basque tapas that are served in bars and restaurants.\\n\\n7. M\u00e1laga: A coastal city in Andalusia, M\u00e1laga is known for its beautiful beaches, historic sites (including the Alcazaba and Gibralfaro castles), and the Picasso Museum, which is dedicated to the famous Spanish artist who was born in the city.\\n\\nThese are just a few of the many wonderful cities in Spain.', 'distilabel_metadata': {'raw_output_text_generation_0': ' Here are some of the top cities in Spain based on various factors such as tourism, culture, history, and quality of life:\\n\\n1. Madrid: The capital and largest city in Spain, Madrid is known for its vibrant nightlife, world-class museums (such as the Prado Museum and Reina Sofia Museum), stunning parks (such as the Retiro Park), and delicious food.\\n\\n2. Barcelona: Famous for its unique architecture, Barcelona is home to several UNESCO World Heritage sites designed by Antoni Gaud\u00ed, including the Sagrada Familia and Park G\u00fcell. The city also boasts beautiful beaches, a lively arts scene, and delicious Catalan cuisine.\\n\\n3. Valencia: A coastal city located in the east of Spain, Valencia is known for its City of Arts and Sciences, a modern architectural complex that includes a planetarium, opera house, and museum of interactive science. The city is also famous for its paella, a traditional Spanish dish made with rice, vegetables, and seafood.\\n\\n4. Seville: The capital of Andalusia, Seville is famous for its flamenco dancing, stunning cathedral (the largest Gothic cathedral in the world), and the Alc\u00e1zar, a beautiful palace made up of a series of rooms and courtyards.\\n\\n5. 
Granada: Located in the foothills of the Sierra Nevada mountains, Granada is known for its stunning Alhambra palace, a Moorish fortress that dates back to the 9th century. The city is also famous for its tapas, a traditional Spanish dish that is often served for free with drinks.\\n\\n6. Bilbao: A city in the Basque Country, Bilbao is famous for its modern architecture, including the Guggenheim Museum, a contemporary art museum designed by Frank Gehry. The city is also known for its pintxos, a type of Basque tapas that are served in bars and restaurants.\\n\\n7. M\u00e1laga: A coastal city in Andalusia, M\u00e1laga is known for its beautiful beaches, historic sites (including the Alcazaba and Gibralfaro castles), and the Picasso Museum, which is dedicated to the famous Spanish artist who was born in the city.\\n\\nThese are just a few of the many wonderful cities in Spain.'}, 'model_name': 'mistralai/Mixtral-8x7B-Instruct-v0.1'}]\n\n
group_responses = GroupColumns(\n    columns=[\"generation\", \"model_name\"],\n    output_columns=[\"generations\", \"model_names\"],\n    pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nnext(\n    group_responses.process(\n        [\n            {\n                \"generation\": \"Madrid\",\n                \"model_name\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n            },\n        ],\n        [\n            {\n                \"generation\": \"Barcelona\",\n                \"model_name\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n            }\n        ],\n    )\n)\n
\n[{'generations': ['Madrid', 'Barcelona'],\n  'model_names': ['meta-llama/Meta-Llama-3-8B-Instruct',\n   'mistralai/Mixtral-8x7B-Instruct-v0.1']}]\n
evaluate_responses = UltraFeedback(\n    aspect=\"overall-rating\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n    ),\n    pipeline=Pipeline(name=\"showcase-pipeline\"),\n)\nevaluate_responses.load()\nnext(\n    evaluate_responses.process(\n        [\n            {\n                \"instruction\": \"What's the capital of Spain?\",\n                \"generations\": [\"Madrid\", \"Barcelona\"],\n            }\n        ]\n    )\n)\n
\n[{'instruction': \"What's the capital of Spain?\",\n  'generations': ['Madrid', 'Barcelona'],\n  'ratings': [5, 1],\n  'rationales': [\"The answer is correct, directly addressing the question, and is free of hallucinations or unnecessary details. It confidently provides the accurate information, aligning perfectly with the user's intent.\",\n   \"The answer is incorrect as Barcelona is not the capital of Spain. This introduces a significant inaccuracy, failing to provide helpful information and deviating entirely from the user's intent.\"],\n  'distilabel_metadata': {'raw_output_ultra_feedback_0': \"#### Output for Text 1\\nRating: 5 (Excellent)\\nRationale: The answer is correct, directly addressing the question, and is free of hallucinations or unnecessary details. It confidently provides the accurate information, aligning perfectly with the user's intent.\\n\\n#### Output for Text 2\\nRating: 1 (Low Quality)\\nRationale: The answer is incorrect as Barcelona is not the capital of Spain. This introduces a significant inaccuracy, failing to provide helpful information and deviating entirely from the user's intent.\"},\n  'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'}]\n
format_dpo = FormatTextGenerationDPO(pipeline=Pipeline(name=\"showcase-pipeline\"))\nformat_dpo.load()\nnext(\n    format_dpo.process(\n        [\n            {\n                \"instruction\": \"What's the capital of Spain?\",\n                \"generations\": [\"Madrid\", \"Barcelona\"],\n                \"generation_models\": [\n                    \"Meta-Llama-3-8B-Instruct\",\n                    \"Mixtral-8x7B-Instruct-v0.1\",\n                ],\n                \"ratings\": [5, 1],\n            }\n        ]\n    )\n)\n
\n[{'instruction': \"What's the capital of Spain?\",\n  'generations': ['Madrid', 'Barcelona'],\n  'generation_models': ['Meta-Llama-3-8B-Instruct',\n   'Mixtral-8x7B-Instruct-v0.1'],\n  'ratings': [5, 1],\n  'prompt': \"What's the capital of Spain?\",\n  'prompt_id': '26174c953df26b3049484e4721102dca6b25d2de9e3aa22aa84f25ed1c798512',\n  'chosen': [{'role': 'user', 'content': \"What's the capital of Spain?\"},\n   {'role': 'assistant', 'content': 'Madrid'}],\n  'chosen_model': 'Meta-Llama-3-8B-Instruct',\n  'chosen_rating': 5,\n  'rejected': [{'role': 'user', 'content': \"What's the capital of Spain?\"},\n   {'role': 'assistant', 'content': 'Barcelona'}],\n  'rejected_model': 'Mixtral-8x7B-Instruct-v0.1',\n  'rejected_rating': 1}]\n
  • Or you can use Argilla to manually label the data and convert it to a preference dataset.
    • Component: PreferenceToArgilla step
    • Input columns: instruction, generations, generation_models, ratings
    • Output columns: instruction, generations, generation_models, ratings
to_argilla = PreferenceToArgilla(\n    dataset_name=\"preference-dataset\",\n    dataset_workspace=\"argilla\",\n    api_url=\"https://[your-owner-name]-[your-space-name].hf.space\",\n    api_key=\"[your-api-key]\",\n    num_generations=2\n)\n

Below, you can see the full pipeline definition:

with Pipeline(name=\"generate-dataset\") as pipeline:\n\n    load_dataset = LoadDataFromHub(repo_id=\"argilla/10Kprompts-mini\")\n\n    generate_responses = [\n        TextGeneration(\n            llm=InferenceEndpointsLLM(\n                model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n                tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n                generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n            )\n        ),\n        TextGeneration(\n            llm=InferenceEndpointsLLM(\n                model_id=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n                tokenizer_id=\"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n                generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n            )\n        ),\n    ]\n\n    group_responses = GroupColumns(\n        columns=[\"generation\", \"model_name\"],\n        output_columns=[\"generations\", \"model_names\"],\n    )\n\n    evaluate_responses = UltraFeedback(\n        aspect=\"overall-rating\",\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n            generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n        )\n    )\n\n    format_dpo = FormatTextGenerationDPO()\n\n    to_argilla = PreferenceToArgilla(\n        dataset_name=\"preference-dataset\",\n        dataset_workspace=\"argilla\",\n        api_url=\"https://[your-owner-name]-[your-space-name].hf.space\",\n        api_key=\"[your-api-key]\",\n        num_generations=2\n    )\n\n    for task in generate_responses:\n        load_dataset.connect(task)\n        task.connect(group_responses)\n    group_responses.connect(evaluate_responses)\n    evaluate_responses.connect(format_dpo, to_argilla)\n

Let's now run the pipeline and generate the preference dataset.

distiset = pipeline.run()\n

Let's check the preference dataset! If you have loaded the data to Argilla, you can start annotating in the Argilla UI.

You can push the dataset to the Hub for sharing with the community and embed it to explore the data.

distiset.push_to_hub(\"[your-owner-name]/example-preference-dataset\")\n

In this tutorial, we showcased the detailed steps to build a pipeline for generating a preference dataset using distilabel. You can customize this pipeline for your own use cases and share your datasets with the community through the Hugging Face Hub, or use them to train a model for DPO or ORPO.

We used a dataset containing prompts to generate responses using two different models through the serverless Hugging Face Inference API. Next, we evaluated the responses using a third model, following the UltraFeedback standards. Finally, we converted the data to a preference dataset and used Argilla for further curation.

"},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#generate-a-preference-dataset","title":"Generate a preference dataset","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#getting-started","title":"Getting started","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#install-the-dependencies","title":"Install the dependencies","text":"

To complete this tutorial, you need to install the distilabel SDK and a few third-party libraries via pip. We will be using the free but rate-limited Hugging Face serverless Inference API for this tutorial, so we need to install this as an extra distilabel dependency. You can install them by running the following command:

"},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#optional-deploy-argilla","title":"(optional) Deploy Argilla","text":"

You can skip this step or replace it with any other data evaluation tool, but the quality of your model will suffer from a lack of data quality, so we do recommend looking at your data. If you already deployed Argilla, you can skip this step. Otherwise, you can quickly deploy Argilla following this guide.

Along with that, you will need to install Argilla as a distilabel extra.

"},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#define-the-pipeline","title":"Define the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#load-the-dataset","title":"Load the dataset","text":"

We will use as source data the argilla/10Kprompts-mini dataset from the Hugging Face Hub.

  • Component: LoadDataFromHub
  • Input columns: instruction and topic, the same as in the loaded dataset
  • Output columns: instruction and topic
"},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#generate-responses","title":"Generate responses","text":"

We need to generate the responses for the given instructions. We will use two different models available on the Hugging Face Hub through the Serverless Inference API: meta-llama/Meta-Llama-3-8B-Instruct and mistralai/Mixtral-8x7B-Instruct-v0.1. We will also indicate the generation parameters for each model.

  • Component: TextGeneration task with LLMs using InferenceEndpointsLLM
  • Input columns: instruction
  • Output columns: generation, distilabel_metadata, model_name for each model

For your use case and to improve the results, you can use any other LLM of your choice.

"},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#group-the-responses","title":"Group the responses","text":"

The task to evaluate the responses needs as input a list of generations. However, each model response was saved in the generation column of the subsets text_generation_0 and text_generation_1. We will combine these two columns into a single column and the default subset.

  • Component: GroupColumns
  • Input columns: generation and model_name from text_generation_0 and text_generation_1
  • Output columns: generations and model_names
"},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#evaluate-the-responses","title":"Evaluate the responses","text":"

To build our preference dataset, we need to evaluate the responses generated by the models. We will use meta-llama/Meta-Llama-3-70B-Instruct for this, applying the UltraFeedback task that judges the responses according to different dimensions (helpfulness, honesty, instruction-following, truthfulness).

  • Component: UltraFeedback task with LLMs using InferenceEndpointsLLM
  • Input columns: instruction, generations
  • Output columns: ratings, rationales, distilabel_metadata, model_name

For your use case and to improve the results, you can use any other LLM of your choice.

"},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#convert-to-a-preference-dataset","title":"Convert to a preference dataset","text":"
  • You can automatically convert it to a preference dataset with the chosen and rejected columns.
    • Component: FormatTextGenerationDPO step
    • Input columns: instruction, generations, generation_models, ratings
    • Output columns: prompt, prompt_id, chosen, chosen_model, chosen_rating, rejected, rejected_model, rejected_rating
"},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#run-the-pipeline","title":"Run the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/generate_preference_dataset/#conclusions","title":"Conclusions","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/","title":"Generate synthetic text classification data","text":"
  • Goal: Generate synthetic text classification data to augment an imbalanced and limited dataset for training a topic classifier. In addition, generate new data for training a fact-based versus opinion-based classifier to add a new label.
  • Libraries: argilla, hf-inference-endpoints, SetFit
  • Components: LoadDataFromDicts, EmbeddingTaskGenerator, GenerateTextClassificationData
!pip install \"distilabel[hf-inference-endpoints]\"\n
!pip install \"transformers~=4.40\" \"torch~=2.0\" \"setfit~=1.0\"\n

Let's make the required imports:

import random\nfrom collections import Counter\n\nfrom datasets import load_dataset, Dataset\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import LoadDataFromDicts\nfrom distilabel.steps.tasks import (\n    GenerateTextClassificationData,\n)\nfrom setfit import SetFitModel, Trainer, sample_dataset\n

You'll need an HF_TOKEN to use the HF Inference Endpoints. Log in to use it directly within this notebook.

import os\nfrom huggingface_hub import login\n\nlogin(token=os.getenv(\"HF_TOKEN\"), add_to_git_credential=True)\n
!pip install \"distilabel[argilla, hf-inference-endpoints]\"\n

We will use the fancyzhx/ag_news dataset from the Hugging Face Hub as our original data source. To simulate a real-world scenario with imbalanced and limited data, we will load only 20 samples from this dataset.

hf_dataset = load_dataset(\"fancyzhx/ag_news\", split=\"train[-20:]\")\n

Now, we can retrieve the available labels in the dataset and examine the current data distribution.

labels_topic = hf_dataset.features[\"label\"].names\nid2str = {i: labels_topic[i] for i in range(len(labels_topic))}\nprint(id2str)\nprint(Counter(hf_dataset[\"label\"]))\n
\n{0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}\nCounter({0: 12, 1: 6, 2: 2})\n\n

As observed, the dataset is imbalanced, with most samples falling under the World category, while the Sci/Tech category is entirely missing. Moreover, there are insufficient samples to effectively train a topic classification model.

We will also define the labels for the new classification task.

labels_fact_opinion = [\"Fact-based\", \"Opinion-based\"]\n

To generate the data, we will use the GenerateTextClassificationData task. This task takes classification tasks as input, and we can define the language, difficulty and clarity required for the generated data.

task = GenerateTextClassificationData(\n    language=\"English\",\n    difficulty=\"college\",\n    clarity=\"clear\",\n    num_generations=1,\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n        generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.4},\n    ),\n    input_batch_size=5,\n)\ntask.load()\nresult = next(\n    task.process([{\"task\": \"Classify the news article as fact-based or opinion-based\"}])\n)\nprint(result[0][\"distilabel_metadata\"][\"raw_input_generate_text_classification_data_0\"])\n
\n[{'role': 'user', 'content': 'You have been assigned a text classification task: Classify the news article as fact-based or opinion-based\\n\\nYour mission is to write one text classification example for this task in JSON format. The JSON object must contain the following keys:\\n - \"input_text\": a string, the input text specified by the classification task.\\n - \"label\": a string, the correct label of the input text.\\n - \"misleading_label\": a string, an incorrect label that is related to the task.\\n\\nPlease adhere to the following guidelines:\\n - The \"input_text\" should be diverse in expression.\\n - The \"misleading_label\" must be a valid label for the given task, but not as appropriate as the \"label\" for the \"input_text\".\\n - The values for all fields should be in English.\\n - Avoid including the values of the \"label\" and \"misleading_label\" fields in the \"input_text\", that would make the task too easy.\\n - The \"input_text\" is clear and requires college level education to comprehend.\\n\\nYour output must always be a JSON object only, do not explain yourself or output anything else. Be creative!'}]\n\n

For our use case, we only need to generate data for two tasks: a topic classification task and a fact versus opinion classification task. Therefore, we will define the tasks accordingly. As we will be using a smaller model for generation, we will select 2 random labels for each topic classification task and change the order for the fact versus opinion classification task, ensuring more diversity in the generated data.

task_templates = [\n    \"Determine the news article as {}\",\n    \"Classify news article as {}\",\n    \"Identify the news article as {}\",\n    \"Categorize the news article as {}\",\n    \"Label the news article using {}\",\n    \"Annotate the news article based on {}\",\n    \"Determine the theme of a news article from {}\",\n    \"Recognize the topic of the news article as {}\",\n]\n\nclassification_tasks = [\n    {\"task\": action.format(\" or \".join(random.sample(labels_topic, 2)))}\n    for action in task_templates for _ in range(4)\n] + [\n    {\"task\": action.format(\" or \".join(random.sample(labels_fact_opinion, 2)))}\n    for action in task_templates\n]\n

Now, it's time to define and run the pipeline. As mentioned, we will load the written tasks and feed them into the GenerateTextClassificationData task. For our use case, we will be using Meta-Llama-3.1-8B-Instruct via the InferenceEndpointsLLM, with different degrees of difficulty and clarity.

difficulties = [\"college\", \"high school\", \"PhD\"]\nclarity = [\"clear\", \"understandable with some effort\", \"ambiguous\"]\n\nwith Pipeline(\"texcat-generation-pipeline\") as pipeline:\n\n    tasks_generator = LoadDataFromDicts(data=classification_tasks)\n\n    generate_data = []\n    for difficulty in difficulties:\n        for clarity_level in clarity:\n            task = GenerateTextClassificationData(\n                language=\"English\",\n                difficulty=difficulty,\n                clarity=clarity_level,\n                num_generations=2,\n                llm=InferenceEndpointsLLM(\n                    model_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n                    tokenizer_id=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n                    generation_kwargs={\"max_new_tokens\": 512, \"temperature\": 0.7},\n                ),\n                input_batch_size=5,\n            )\n            generate_data.append(task)\n\n    for task in generate_data:\n        tasks_generator.connect(task)\n

Let's now run the pipeline and generate the synthetic data.

distiset = pipeline.run()\n
distiset[\"generate_text_classification_data_0\"][\"train\"][0]\n
\n{'task': 'Determine the news article as Business or World',\n 'input_text': \"The recent decision by the European Central Bank to raise interest rates will likely have a significant impact on the eurozone's economic growth, with some analysts predicting a 0.5% contraction in GDP due to the increased borrowing costs. The move is seen as a measure to combat inflation, which has been rising steadily over the past year.\",\n 'label': 'Business',\n 'misleading_label': 'World',\n 'distilabel_metadata': {'raw_output_generate_text_classification_data_0': '{\\n  \"input_text\": \"The recent decision by the European Central Bank to raise interest rates will likely have a significant impact on the eurozone\\'s economic growth, with some analysts predicting a 0.5% contraction in GDP due to the increased borrowing costs. The move is seen as a measure to combat inflation, which has been rising steadily over the past year.\",\\n  \"label\": \"Business\",\\n  \"misleading_label\": \"World\"\\n}'},\n 'model_name': 'meta-llama/Meta-Llama-3.1-8B-Instruct'}\n

You can push the dataset to the Hub for sharing with the community and embed it to explore the data.

distiset.push_to_hub(\"[your-owner-name]/example-texcat-generation-dataset\")\n

By examining the distiset distribution, we can confirm that it includes at least the 8 required samples for each label to train our classification models with SetFit.

all_labels = [\n    entry[\"label\"]\n    for dataset_name in distiset\n    for entry in distiset[dataset_name][\"train\"]\n]\n\nCounter(all_labels)\n
\nCounter({'Sci/Tech': 275,\n         'Business': 130,\n         'World': 86,\n         'Fact-based': 86,\n         'Sports': 64,\n         'Opinion-based': 54,\n         None: 20,\n         'Opinion Based': 1,\n         'News/Opinion': 1,\n         'Science': 1,\n         'Environment': 1,\n         'Opinion': 1})\n

We will create two datasets with the required labels and data for our use cases.

def extract_rows(distiset, labels):\n    return [\n        {\n            \"text\": entry[\"input_text\"],\n            \"label\": entry[\"label\"],\n            \"id\": i\n        }\n        for dataset_name in distiset\n        for i, entry in enumerate(distiset[dataset_name][\"train\"])\n        if entry[\"label\"] in labels\n    ]\n\ndata_topic = extract_rows(distiset, labels_topic)\ndata_fact_opinion = extract_rows(distiset, labels_fact_opinion)\n

Get started in Argilla

If you are not familiar with Argilla, we recommend taking a look at the Argilla quickstart docs. Alternatively, you can use your Hugging Face account to login to the Argilla demo Space.

To get the most out of our data, we will use Argilla. First, we need to connect to the Argilla instance.

import argilla as rg\n\n# Replace api_url with your url if using Docker\n# Replace api_key with your API key under \"My Settings\" in the UI\n# Uncomment the last line and set your HF_TOKEN if your space is private\nclient = rg.Argilla(\n    api_url=\"https://[your-owner-name]-[your_space_name].hf.space\",\n    api_key=\"[your-api-key]\",\n    # headers={\"Authorization\": f\"Bearer {HF_TOKEN}\"}\n)\n

We will create a Dataset for each task, with an input TextField for the text classification text and a LabelQuestion to ensure the generated labels are correct.

def create_texcat_dataset(dataset_name, labels):\n    settings = rg.Settings(\n        fields=[rg.TextField(\"text\")],\n        questions=[\n            rg.LabelQuestion(\n                name=\"label\",\n                title=\"Classify the texts according to the following labels\",\n                labels=labels,\n            ),\n        ],\n    )\n    return rg.Dataset(name=dataset_name, settings=settings).create()\n\n\nrg_dataset_topic = create_texcat_dataset(\"topic-classification\", labels_topic)\nrg_dataset_fact_opinion = create_texcat_dataset(\n    \"fact-opinion-classification\", labels_fact_opinion\n)\n

Now, we can upload the generated data to Argilla and evaluate it. We will use the generated labels as suggestions.

rg_dataset_topic.records.log(data_topic)\nrg_dataset_fact_opinion.records.log(data_fact_opinion)\n

Now, we can start the annotation process. Just open the dataset in the Argilla UI and start annotating the records. If the suggestions are correct, you can just click on Submit. Otherwise, you can select the correct label.

Note

Check this how-to guide to know more about annotating in the UI.

Once you get the annotations, let's continue by retrieving the data from Argilla and formatting it as a dataset with the required data.

rg_dataset_topic = client.datasets(\"topic-classification\")\nrg_dataset_fact_opinion = client.datasets(\"fact-opinion-classification\")\n
status_filter = rg.Query(filter=rg.Filter((\"response.status\", \"==\", \"submitted\")))\n\nsubmitted_topic = rg_dataset_topic.records(status_filter).to_list(flatten=True)\nsubmitted_fact_opinion = rg_dataset_fact_opinion.records(status_filter).to_list(\n    flatten=True\n)\n
def format_submitted(submitted):\n    return [\n        {\n            \"text\": r[\"text\"],\n            \"label\": r[\"label.responses\"][0],\n            \"id\": i,\n        }\n        for i, r in enumerate(submitted)\n    ]\n\ndata_topic = format_submitted(submitted_topic)\ndata_fact_opinion = format_submitted(submitted_fact_opinion)\n

In our case, we will fine-tune using SetFit. However, you can select the one that best fits your requirements.

The next step will be to format the data to be compatible with SetFit. In the case of the topic classification, we will need to combine the synthetic data with the original data.

hf_topic = hf_dataset.to_list()\nnum = len(data_topic)\n\ndata_topic.extend(\n    [\n        {\n            \"text\": r[\"text\"],\n            \"label\": id2str[r[\"label\"]],\n            \"id\": num + i,\n        }\n        for i, r in enumerate(hf_topic)\n    ]\n)\n

If we check the data distribution now, we can see that we have enough samples for each label to train our models.

labels = [record[\"label\"] for record in data_topic]\nCounter(labels)\n
\nCounter({'Sci/Tech': 275, 'Business': 132, 'World': 98, 'Sports': 70})\n
labels = [record[\"label\"] for record in data_fact_opinion]\nCounter(labels)\n
\nCounter({'Fact-based': 86, 'Opinion-based': 54})\n

Now, let's create our training and validation datasets. The training dataset will gather 8 samples per label. In this case, the validation datasets will contain the remaining samples not included in the training datasets.

def sample_and_split(dataset, label_column, num_samples):\n    train_dataset = sample_dataset(\n        dataset, label_column=label_column, num_samples=num_samples\n    )\n    eval_dataset = dataset.filter(lambda x: x[\"id\"] not in set(train_dataset[\"id\"]))\n    return train_dataset, eval_dataset\n\n\ndataset_topic_full = Dataset.from_list(data_topic)\ndataset_fact_opinion_full = Dataset.from_list(data_fact_opinion)\n\ntrain_dataset_topic, eval_dataset_topic = sample_and_split(\n    dataset_topic_full, \"label\", 8\n)\ntrain_dataset_fact_opinion, eval_dataset_fact_opinion = sample_and_split(\n    dataset_fact_opinion_full, \"label\", 8\n)\n

Let's train our models for each task! We will use TaylorAI/bge-micro-v2, available in the Hugging Face Hub. You can check the MTEB leaderboard to select the best model for your use case.

def train_model(model_name, dataset, eval_dataset):\n    model = SetFitModel.from_pretrained(model_name)\n\n    trainer = Trainer(\n        model=model,\n        train_dataset=dataset,\n    )\n    trainer.train()\n    metrics = trainer.evaluate(eval_dataset)\n    print(metrics)\n\n    return model\n
model_topic = train_model(\n    model_name=\"TaylorAI/bge-micro-v2\",\n    dataset=train_dataset_topic,\n    eval_dataset=eval_dataset_topic,\n)\nmodel_topic.save_pretrained(\"topic_classification_model\")\nmodel_topic = SetFitModel.from_pretrained(\"topic_classification_model\")\n
\n***** Running training *****\n  Num unique pairs = 768\n  Batch size = 16\n  Num epochs = 1\n  Total optimization steps = 48\n\n
\n{'embedding_loss': 0.1873, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.02}\n\n
\n***** Running evaluation *****\n\n
\n{'train_runtime': 4.9767, 'train_samples_per_second': 154.318, 'train_steps_per_second': 9.645, 'epoch': 1.0}\n{'accuracy': 0.8333333333333334}\n\n
model_fact_opinion = train_model(\n    model_name=\"TaylorAI/bge-micro-v2\",\n    dataset=train_dataset_fact_opinion,\n    eval_dataset=eval_dataset_fact_opinion,\n)\nmodel_fact_opinion.save_pretrained(\"fact_opinion_classification_model\")\nmodel_fact_opinion = SetFitModel.from_pretrained(\"fact_opinion_classification_model\")\n
\n***** Running training *****\n  Num unique pairs = 144\n  Batch size = 16\n  Num epochs = 1\n  Total optimization steps = 9\n\n
\n{'embedding_loss': 0.2985, 'learning_rate': 2e-05, 'epoch': 0.11}\n\n
\n***** Running evaluation *****\n\n
\n{'train_runtime': 0.8327, 'train_samples_per_second': 172.931, 'train_steps_per_second': 10.808, 'epoch': 1.0}\n{'accuracy': 0.9090909090909091}\n\n

Voil\u00e0! The models are now trained and ready to be used. You can start making predictions to check the model's performance and add the new label. Optionally, you can continue using distilabel to generate additional data or Argilla to verify the quality of the predictions.

def predict(model, input, labels):\n    model.labels = labels\n    prediction = model.predict([input])\n    return prediction[0]\n
predict(\n    model_topic, \"The new iPhone is expected to be released next month.\", labels_topic\n)\n
\n'Sci/Tech'\n
predict(\n    model_fact_opinion,\n    \"The new iPhone is expected to be released next month.\",\n    labels_fact_opinion,\n)\n
\n'Opinion-based'\n

In this tutorial, we showcased the detailed steps to build a pipeline for generating text classification data using distilabel. You can customize this pipeline for your own use cases and share your datasets with the community through the Hugging Face Hub.

We defined two text classification tasks\u2014a topic classification task and a fact versus opinion classification task\u2014and generated new data using various models via the serverless Hugging Face Inference API. Then, we curated the generated data with Argilla. Finally, we trained the models with SetFit using both the original and synthetic data.

"},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#generate-synthetic-text-classification-data","title":"Generate synthetic text classification data","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#getting-started","title":"Getting started","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#install-the-dependencies","title":"Install the dependencies","text":"

To complete this tutorial, you need to install the distilabel SDK and a few third-party libraries via pip. We will be using the free but rate-limited Hugging Face serverless Inference API for this tutorial, so we need to install this as an extra distilabel dependency. You can install them by running the following command:

"},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#optional-deploy-argilla","title":"(optional) Deploy Argilla","text":"

You can skip this step or replace it with any other data evaluation tool, but the quality of your model will suffer from a lack of data quality, so we do recommend looking at your data. If you already deployed Argilla, you can skip this step. Otherwise, you can quickly deploy Argilla following this guide.

Along with that, you will need to install Argilla as a distilabel extra.

"},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#the-dataset","title":"The dataset","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#define-the-text-classification-task","title":"Define the text classification task","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#run-the-pipeline","title":"Run the pipeline","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#optional-evaluate-with-argilla","title":"(Optional) Evaluate with Argilla","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#train-your-models","title":"Train your models","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#formatting-the-data","title":"Formatting the data","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#the-actual-training","title":"The actual training","text":""},{"location":"sections/pipeline_samples/tutorials/generate_textcat_dataset/#conclusions","title":"Conclusions","text":""},{"location":"components-gallery/","title":"Components Gallery","text":"
  • Steps

    Explore all the available Steps that can be used for data manipulation.

    Steps

  • Tasks

    Explore all the available Tasks that can be used with an LLM to perform data generation, annotation, and more.

    Tasks

  • LLMs

    Explore all the available LLMs integrated with distilabel.

    LLMs

  • Embeddings

    Explore all the available Embeddings models integrated with distilabel.

    Embeddings

"},{"location":"components-gallery/steps/","title":"Steps Gallery","text":"Category Overview

The gallery page showcases the different types of components within distilabel.

Icon Category Description text-generation Text generation steps are used to generate text based on a given prompt. chat-generation Chat generation steps are used to generate text based on a conversation. text-classification Text classification steps are used to classify text into a category. text-manipulation Text manipulation steps are used to manipulate or rewrite an input text. evol Evol steps are used to rewrite input text and evolve it to a higher quality. critique Critique steps are used to provide feedback on the quality of the data with a written explanation. scorer Scorer steps are used to evaluate and score the data with a numerical value. preference Preference steps are used to collect preferences on the data with numerical values or ranks. embedding Embedding steps are used to generate embeddings for the data. clustering Clustering steps are used to group similar data points together. columns Columns steps are used to manipulate columns in the data. filtering Filtering steps are used to filter the data based on some criteria. format Format steps are used to format the data. load Load steps are used to load the data. execution Executes python functions. save Save steps are used to save the data.
  • PreferenceToArgilla

    Creates a preference dataset in Argilla.

    PreferenceToArgilla

  • TextGenerationToArgilla

    Creates a text generation dataset in Argilla.

    TextGenerationToArgilla

  • CombineColumns

    CombineColumns is deprecated and will be removed in version 1.5.0, use GroupColumns instead.

    CombineColumns

  • PushToHub

    Push data to a Hugging Face Hub dataset.

    PushToHub

  • LoadDataFromDicts

    Loads a dataset from a list of dictionaries.

    LoadDataFromDicts

  • DataSampler

    Step to sample from a dataset.

    DataSampler

  • LoadDataFromHub

    Loads a dataset from the Hugging Face Hub.

    LoadDataFromHub

  • LoadDataFromFileSystem

    Loads a dataset from a file in your filesystem.

    LoadDataFromFileSystem

  • LoadDataFromDisk

    Load a dataset that was previously saved to disk.

    LoadDataFromDisk

  • PrepareExamples

    Helper step to create examples from query and answers pairs used as Few Shots in APIGen.

    PrepareExamples

  • ConversationTemplate

    Generate a conversation template from an instruction and a response.

    ConversationTemplate

  • FormatTextGenerationDPO

    Format the output of your LLMs for Direct Preference Optimization (DPO).

    FormatTextGenerationDPO

  • FormatChatGenerationDPO

    Format the output of a combination of a ChatGeneration + a preference task for Direct Preference Optimization (DPO).

    FormatChatGenerationDPO

  • FormatTextGenerationSFT

    Format the output of a TextGeneration task for Supervised Fine-Tuning (SFT).

    FormatTextGenerationSFT

  • FormatChatGenerationSFT

    Format the output of a ChatGeneration task for Supervised Fine-Tuning (SFT).

    FormatChatGenerationSFT

  • DeitaFiltering

    Filter dataset rows using DEITA filtering strategy.

    DeitaFiltering

  • EmbeddingDedup

    Deduplicates text using embeddings.

    EmbeddingDedup

  • APIGenExecutionChecker

    Executes the generated function calls.

    APIGenExecutionChecker

  • MinHashDedup

    Deduplicates text using MinHash and MinHashLSH.

    MinHashDedup

  • CombineOutputs

    Combine the outputs of several upstream steps.

    CombineOutputs

  • ExpandColumns

    Expand columns that contain lists into multiple rows.

    ExpandColumns

  • GroupColumns

    Combines columns from a list of StepInput.

    GroupColumns

  • KeepColumns

    Keeps selected columns in the dataset.

    KeepColumns

  • MergeColumns

    Merge columns from a row.

    MergeColumns

  • DBSCAN

    DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core samples in regions of high density and expands clusters from these cores.

    DBSCAN

  • UMAP

    UMAP is a general purpose manifold learning and dimension reduction algorithm.

    UMAP

  • FaissNearestNeighbour

    Create a faiss index to get the nearest neighbours.

    FaissNearestNeighbour

  • EmbeddingGeneration

    Generate embeddings using an Embeddings model.

    EmbeddingGeneration

  • RewardModelScore

    Assign a score to a response using a Reward Model.

    RewardModelScore

  • TruncateTextColumn

    Truncate a row using a tokenizer or the number of characters.

    TruncateTextColumn

"},{"location":"components-gallery/steps/preferencetoargilla/","title":"PreferenceToArgilla","text":"

Creates a preference dataset in Argilla.

Step that creates a dataset in Argilla during the load phase, and then pushes the input batches into it as records. This dataset is a preference dataset, where there's one field for the instruction and one extra field for each generation within the same record, and then a rating question for each of the generation fields. The rating question asks the annotator to set a rating from 1 to 5 for each of the provided generations.

"},{"location":"components-gallery/steps/preferencetoargilla/#note","title":"Note","text":"

This step is meant to be used in conjunction with the UltraFeedback step, or any other step generating both ratings and responses for a given instruction and its generations. Alternatively, it can also be used with any other task or step generating only the instruction and generations, as the ratings and rationales are optional.

"},{"location":"components-gallery/steps/preferencetoargilla/#attributes","title":"Attributes","text":"
  • num_generations: The number of generations to include in the dataset.

  • dataset_name: The name of the dataset in Argilla.

  • dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to None, which means it will be created in the default workspace.

  • api_url: The URL of the Argilla API. Defaults to None, which means it will be read from the ARGILLA_API_URL environment variable.

  • api_key: The API key to authenticate with Argilla. Defaults to None, which means it will be read from the ARGILLA_API_KEY environment variable.

"},{"location":"components-gallery/steps/preferencetoargilla/#runtime-parameters","title":"Runtime Parameters","text":"
  • api_url: The base URL to use for the Argilla API requests.

  • api_key: The API key to authenticate the requests to the Argilla API.

"},{"location":"components-gallery/steps/preferencetoargilla/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[generations]\n            ICOL2[ratings]\n            ICOL3[rationales]\n        end\n    end\n\n    subgraph PreferenceToArgilla\n        StepInput[Input Columns: instruction, generations, ratings, rationales]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    ICOL3 --> StepInput\n
"},{"location":"components-gallery/steps/preferencetoargilla/#inputs","title":"Inputs","text":"
  • instruction (str): The instruction that was used to generate the completion.

  • generations (List[str]): The completions that were generated based on the input instruction.

  • ratings (List[str], optional): The ratings for the generations. If not provided, the generated ratings won't be pushed to Argilla.

  • rationales (List[str], optional): The rationales for the ratings. If not provided, the generated rationales won't be pushed to Argilla.

"},{"location":"components-gallery/steps/preferencetoargilla/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/preferencetoargilla/#push-a-preference-dataset-to-an-argilla-instance","title":"Push a preference dataset to an Argilla instance","text":"
from distilabel.steps import PreferenceToArgilla\n\nto_argilla = PreferenceToArgilla(\n    num_generations=2,\n    api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n    api_key=\"api.key\",\n    dataset_name=\"argilla_dataset\",\n    dataset_workspace=\"my_workspace\",\n)\nto_argilla.load()\n\nresult = next(\n    to_argilla.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"generations\": [\"first_generation\", \"second_generation\"],\n            }\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'instruction', 'generations': ['first_generation', 'second_generation']}]\n
"},{"location":"components-gallery/steps/preferencetoargilla/#it-can-also-include-ratings-and-rationales","title":"It can also include ratings and rationales","text":"
result = next(\n    to_argilla.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"generations\": [\"first_generation\", \"second_generation\"],\n                \"ratings\": [\"4\", \"5\"],\n                \"rationales\": [\"rationale for 4\", \"rationale for 5\"],\n            }\n        ],\n    )\n)\n# >>> result\n# [\n#     {\n#         'instruction': 'instruction',\n#         'generations': ['first_generation', 'second_generation'],\n#         'ratings': ['4', '5'],\n#         'rationales': ['rationale for 4', 'rationale for 5']\n#     }\n# ]\n
"},{"location":"components-gallery/steps/textgenerationtoargilla/","title":"TextGenerationToArgilla","text":"

Creates a text generation dataset in Argilla.

Step that creates a dataset in Argilla during the load phase, and then pushes the input batches into it as records. This dataset is a text-generation dataset, where there's one field per each input, and then a label question to rate the quality of the completion in either bad (represented with \ud83d\udc4e) or good (represented with \ud83d\udc4d).

"},{"location":"components-gallery/steps/textgenerationtoargilla/#note","title":"Note","text":"

This step is meant to be used in conjunction with a TextGeneration step and no column mapping is needed, as it will use the default values for the instruction and generation columns.

"},{"location":"components-gallery/steps/textgenerationtoargilla/#attributes","title":"Attributes","text":"
  • dataset_name: The name of the dataset in Argilla.

  • dataset_workspace: The workspace where the dataset will be created in Argilla. Defaults to None, which means it will be created in the default workspace.

  • api_url: The URL of the Argilla API. Defaults to None, which means it will be read from the ARGILLA_API_URL environment variable.

  • api_key: The API key to authenticate with Argilla. Defaults to None, which means it will be read from the ARGILLA_API_KEY environment variable.

"},{"location":"components-gallery/steps/textgenerationtoargilla/#runtime-parameters","title":"Runtime Parameters","text":"
  • api_url: The base URL to use for the Argilla API requests.

  • api_key: The API key to authenticate the requests to the Argilla API.

"},{"location":"components-gallery/steps/textgenerationtoargilla/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[generation]\n        end\n    end\n\n    subgraph TextGenerationToArgilla\n        StepInput[Input Columns: instruction, generation]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n
"},{"location":"components-gallery/steps/textgenerationtoargilla/#inputs","title":"Inputs","text":"
  • instruction (str): The instruction that was used to generate the completion.

  • generation (str or List[str]): The completions that were generated based on the input instruction.

"},{"location":"components-gallery/steps/textgenerationtoargilla/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/textgenerationtoargilla/#push-a-text-generation-dataset-to-an-argilla-instance","title":"Push a text generation dataset to an Argilla instance","text":"
from distilabel.steps import TextGenerationToArgilla\n\nto_argilla = TextGenerationToArgilla(\n    num_generations=2,\n    api_url=\"https://dibt-demo-argilla-space.hf.space/\",\n    api_key=\"api.key\",\n    dataset_name=\"argilla_dataset\",\n    dataset_workspace=\"my_workspace\",\n)\nto_argilla.load()\n\nresult = next(\n    to_argilla.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"generation\": \"generation\",\n            }\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'instruction', 'generation': 'generation'}]\n
"},{"location":"components-gallery/steps/combinecolumns/","title":"CombineColumns","text":"

CombineColumns is deprecated and will be removed in version 1.5.0, use GroupColumns instead.

"},{"location":"components-gallery/steps/combinecolumns/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n    end\n\n    subgraph CombineColumns\n    end\n\n
"},{"location":"components-gallery/steps/pushtohub/","title":"PushToHub","text":"

Push data to a Hugging Face Hub dataset.

A GlobalStep which creates a datasets.Dataset with the input data and pushes it to the Hugging Face Hub.

"},{"location":"components-gallery/steps/pushtohub/#attributes","title":"Attributes","text":"
  • repo_id: The Hugging Face Hub repository ID where the dataset will be uploaded.

  • split: The split of the dataset that will be pushed. Defaults to \"train\".

  • private: Whether the dataset to be pushed should be private or not. Defaults to False.

  • token: The token that will be used to authenticate in the Hub. If not provided, the step will try to obtain the token from the environment variable HF_TOKEN. If it is not provided by either of the previous methods, the huggingface_hub library will try to use the token from the local Hugging Face CLI configuration. Defaults to None.

"},{"location":"components-gallery/steps/pushtohub/#runtime-parameters","title":"Runtime Parameters","text":"
  • repo_id: The Hugging Face Hub repository ID where the dataset will be uploaded.

  • split: The split of the dataset that will be pushed.

  • private: Whether the dataset to be pushed should be private or not.

  • token: The token that will be used to authenticate in the Hub.

"},{"location":"components-gallery/steps/pushtohub/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[dynamic]\n        end\n    end\n\n    subgraph PushToHub\n        StepInput[Input Columns: dynamic]\n    end\n\n    ICOL0 --> StepInput\n
"},{"location":"components-gallery/steps/pushtohub/#inputs","title":"Inputs","text":"
  • dynamic (all): all columns from the input will be used to create the dataset.
"},{"location":"components-gallery/steps/pushtohub/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/pushtohub/#push-batches-of-your-dataset-to-the-hugging-face-hub-repository","title":"Push batches of your dataset to the Hugging Face Hub repository","text":"
from distilabel.steps import PushToHub\n\npush = PushToHub(repo_id=\"path_to/repo\")\npush.load()\n\nresult = next(\n    push.process(\n        [\n            {\n                \"instruction\": \"instruction \",\n                \"generation\": \"generation\"\n            }\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'instruction ', 'generation': 'generation'}]\n
"},{"location":"components-gallery/steps/loaddatafromdicts/","title":"LoadDataFromDicts","text":"

Loads a dataset from a list of dictionaries.

GeneratorStep that loads a dataset from a list of dictionaries and yields it in batches.

"},{"location":"components-gallery/steps/loaddatafromdicts/#attributes","title":"Attributes","text":"
  • data: The list of dictionaries to load the data from.
"},{"location":"components-gallery/steps/loaddatafromdicts/#runtime-parameters","title":"Runtime Parameters","text":"
  • batch_size: The batch size to use when processing the data.
"},{"location":"components-gallery/steps/loaddatafromdicts/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph LoadDataFromDicts\n        StepOutput[Output Columns: dynamic]\n    end\n\n    StepOutput --> OCOL0\n
"},{"location":"components-gallery/steps/loaddatafromdicts/#outputs","title":"Outputs","text":"
  • dynamic (based on the keys found on the first dictionary of the list): The columns of the dataset.
"},{"location":"components-gallery/steps/loaddatafromdicts/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/loaddatafromdicts/#load-data-from-a-list-of-dictionaries","title":"Load data from a list of dictionaries","text":"
from distilabel.steps import LoadDataFromDicts\n\nloader = LoadDataFromDicts(\n    data=[{\"instruction\": \"What are 2+2?\"}] * 5,\n    batch_size=2\n)\nloader.load()\n\nresult = next(loader.process())\n# >>> result\n# ([{'instruction': 'What are 2+2?'}, {'instruction': 'What are 2+2?'}], False)\n
"},{"location":"components-gallery/steps/datasampler/","title":"DataSampler","text":"

Step to sample from a dataset.

GeneratorStep that samples from a dataset and yields it in batches. This step is useful when you have a pipeline that can benefit from using examples in the prompts, for example for few-shot learning, where the examples can change on each row. For example, you can pass a list of dictionaries with N examples and generate M samples from it (assuming you have another step loading data, M should be the same as the number of rows loaded in that step). The size S argument is the number of samples generated per row, so each row would contain S samples to be used as examples.

"},{"location":"components-gallery/steps/datasampler/#attributes","title":"Attributes","text":"
  • data: The list of dictionaries to sample from.

  • size: Number of samples per example. For example in a few-shot learning scenario, the number of few-shot examples that will be generated per example. Defaults to 2.

  • samples: Number of examples that will be generated by the step in total. If used with another loader step, this should be the same as the number of samples in the loader step. Defaults to 100.

"},{"location":"components-gallery/steps/datasampler/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph DataSampler\n        StepOutput[Output Columns: dynamic]\n    end\n\n    StepOutput --> OCOL0\n
"},{"location":"components-gallery/steps/datasampler/#outputs","title":"Outputs","text":"
  • dynamic (based on the keys found on the first dictionary of the list): The columns of the dataset.
"},{"location":"components-gallery/steps/datasampler/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/datasampler/#sample-data-from-a-list-of-dictionaries","title":"Sample data from a list of dictionaries","text":"
from distilabel.steps import DataSampler\n\nsampler = DataSampler(\n    data=[{\"sample\": f\"sample {i}\"} for i in range(30)],\n    samples=10,\n    size=2,\n    batch_size=4\n)\nsampler.load()\n\nresult = next(sampler.process())\n# >>> result\n# ([{'sample': ['sample 7', 'sample 0']}, {'sample': ['sample 2', 'sample 21']}, {'sample': ['sample 17', 'sample 12']}, {'sample': ['sample 2', 'sample 14']}], False)\n
"},{"location":"components-gallery/steps/datasampler/#pipeline-with-a-loader-and-a-sampler-combined-in-a-single-stream","title":"Pipeline with a loader and a sampler combined in a single stream","text":"
from datasets import load_dataset\n\nfrom distilabel.steps import CombineOutputs, LoadDataFromDicts, DataSampler\nfrom distilabel.steps.tasks.apigen.utils import PrepareExamples\nfrom distilabel.pipeline import Pipeline\n\nds = (\n    load_dataset(\"Salesforce/xlam-function-calling-60k\", split=\"train\")\n    .shuffle(seed=42)\n    .select(range(500))\n    .to_list()\n)\ndata = [\n    {\n        \"func_name\": \"final_velocity\",\n        \"func_desc\": \"Calculates the final velocity of an object given its initial velocity, acceleration, and time.\",\n    },\n    {\n        \"func_name\": \"permutation_count\",\n        \"func_desc\": \"Calculates the number of permutations of k elements from a set of n elements.\",\n    },\n    {\n        \"func_name\": \"getdivision\",\n        \"func_desc\": \"Divides two numbers by making an API call to a division service.\",\n    },\n]\nwith Pipeline(name=\"APIGenPipeline\") as pipeline:\n    loader_seeds = LoadDataFromDicts(data=data)\n    sampler = DataSampler(\n        data=ds,\n        size=2,\n        samples=len(data),\n        batch_size=8,\n    )\n    prep_examples = PrepareExamples()\n    combine_steps = CombineOutputs()\n\n    sampler >> prep_examples\n    (\n        [loader_seeds, prep_examples]\n        >> combine_steps\n    )\n# Now we have a single stream of data with the loader and the sampler data\n
"},{"location":"components-gallery/steps/loaddatafromhub/","title":"LoadDataFromHub","text":"

Loads a dataset from the Hugging Face Hub.

GeneratorStep that loads a dataset from the Hugging Face Hub using the datasets library.

"},{"location":"components-gallery/steps/loaddatafromhub/#attributes","title":"Attributes","text":"
  • repo_id: The Hugging Face Hub repository ID of the dataset to load.

  • split: The split of the dataset to load.

  • config: The configuration of the dataset to load. This is optional and only needed if the dataset has multiple configurations.

"},{"location":"components-gallery/steps/loaddatafromhub/#runtime-parameters","title":"Runtime Parameters","text":"
  • batch_size: The batch size to use when processing the data.

  • repo_id: The Hugging Face Hub repository ID of the dataset to load.

  • split: The split of the dataset to load. Defaults to 'train'.

  • config: The configuration of the dataset to load. This is optional and only needed if the dataset has multiple configurations.

  • revision: The revision of the dataset to load. Defaults to the latest revision.

  • streaming: Whether to load the dataset in streaming mode or not. Defaults to False.

  • num_examples: The number of examples to load from the dataset. By default will load all examples.

  • storage_options: Key/value pairs to be passed on to the file-system backend, if any. Defaults to None.

"},{"location":"components-gallery/steps/loaddatafromhub/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph LoadDataFromHub\n        StepOutput[Output Columns: dynamic]\n    end\n\n    StepOutput --> OCOL0\n
"},{"location":"components-gallery/steps/loaddatafromhub/#outputs","title":"Outputs","text":"
  • dynamic (all): The columns that will be generated by this step, based on the datasets loaded from the Hugging Face Hub.
"},{"location":"components-gallery/steps/loaddatafromhub/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/loaddatafromhub/#load-data-from-a-dataset-in-hugging-face-hub","title":"Load data from a dataset in Hugging Face Hub","text":"
from distilabel.steps import LoadDataFromHub\n\nloader = LoadDataFromHub(\n    repo_id=\"distilabel-internal-testing/instruction-dataset-mini\",\n    split=\"test\",\n    batch_size=2\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'prompt': 'Arianna has 12...', False)\n
"},{"location":"components-gallery/steps/loaddatafromfilesystem/","title":"LoadDataFromFileSystem","text":"

Loads a dataset from a file in your filesystem.

GeneratorStep that creates a dataset from a file in the filesystem using the Hugging Face datasets library. Take a look at Hugging Face Datasets for more information about the supported file types.

"},{"location":"components-gallery/steps/loaddatafromfilesystem/#attributes","title":"Attributes","text":"
  • data_files: The path to the file, or directory containing the files that make up the dataset.

  • split: The split of the dataset to load (typically will be train, test or validation).

"},{"location":"components-gallery/steps/loaddatafromfilesystem/#runtime-parameters","title":"Runtime Parameters","text":"
  • batch_size: The batch size to use when processing the data.

  • data_files: The path to the file, or directory containing the files that make up the dataset.

  • split: The split of the dataset to load. Defaults to 'train'.

  • streaming: Whether to load the dataset in streaming mode or not. Defaults to False.

  • num_examples: The number of examples to load from the dataset. By default will load all examples.

  • storage_options: Key/value pairs to be passed on to the file-system backend, if any. Defaults to None.

  • filetype: The expected filetype. If not provided, it will be inferred from the file extension. For more than one file, it will be inferred from the first file.

"},{"location":"components-gallery/steps/loaddatafromfilesystem/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph LoadDataFromFileSystem\n        StepOutput[Output Columns: dynamic]\n    end\n\n    StepOutput --> OCOL0\n
"},{"location":"components-gallery/steps/loaddatafromfilesystem/#outputs","title":"Outputs","text":"
  • dynamic (all): The columns that will be generated by this step, based on the dataset loaded from the file system.
"},{"location":"components-gallery/steps/loaddatafromfilesystem/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/loaddatafromfilesystem/#load-data-from-a-hugging-face-dataset-in-your-file-system","title":"Load data from a Hugging Face dataset in your file system","text":"
from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(data_files=\"path/to/dataset.jsonl\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n
"},{"location":"components-gallery/steps/loaddatafromfilesystem/#specify-a-filetype-if-the-file-extension-is-not-expected","title":"Specify a filetype if the file extension is not expected","text":"
from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(filetype=\"csv\", data_files=\"path/to/dataset.txtr\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n
"},{"location":"components-gallery/steps/loaddatafromfilesystem/#load-data-from-a-file-in-your-cloud-provider","title":"Load data from a file in your cloud provider","text":"
from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(\n    data_files=\"gcs://path/to/dataset\",\n    storage_options={\"project\": \"experiments-0001\"}\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n
"},{"location":"components-gallery/steps/loaddatafromfilesystem/#load-data-passing-a-glob-pattern","title":"Load data passing a glob pattern","text":"
from distilabel.steps import LoadDataFromFileSystem\n\nloader = LoadDataFromFileSystem(\n    data_files=\"path/to/dataset/*.jsonl\",\n    streaming=True\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n
"},{"location":"components-gallery/steps/loaddatafromdisk/","title":"LoadDataFromDisk","text":"

Load a dataset that was previously saved to disk.

If you previously saved your dataset using the save_to_disk method, or Distiset.save_to_disk, you can load it again to build a new pipeline using this class.

"},{"location":"components-gallery/steps/loaddatafromdisk/#attributes","title":"Attributes","text":"
  • dataset_path: The path to the dataset or distiset.

  • split: The split of the dataset to load (typically will be train, test or validation).

  • config: The configuration of the dataset to load. Defaults to default. If there are multiple configurations in the dataset, this must be supplied or an error is raised.

"},{"location":"components-gallery/steps/loaddatafromdisk/#runtime-parameters","title":"Runtime Parameters","text":"
  • batch_size: The batch size to use when processing the data.

  • dataset_path: The path to the dataset or distiset.

  • is_distiset: Whether the dataset to load is a Distiset or not. Defaults to False.

  • split: The split of the dataset to load. Defaults to 'train'.

  • config: The configuration of the dataset to load. Defaults to default. If there are multiple configurations in the dataset, this must be supplied or an error is raised.

  • num_examples: The number of examples to load from the dataset. By default will load all examples.

  • storage_options: Key/value pairs to be passed on to the file-system backend, if any. Defaults to None.

"},{"location":"components-gallery/steps/loaddatafromdisk/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph LoadDataFromDisk\n        StepOutput[Output Columns: dynamic]\n    end\n\n    StepOutput --> OCOL0\n
"},{"location":"components-gallery/steps/loaddatafromdisk/#outputs","title":"Outputs","text":"
  • dynamic (all): The columns that will be generated by this step, based on the dataset loaded from disk.
"},{"location":"components-gallery/steps/loaddatafromdisk/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/loaddatafromdisk/#load-data-from-a-hugging-face-dataset","title":"Load data from a Hugging Face Dataset","text":"
from distilabel.steps import LoadDataFromDisk\n\nloader = LoadDataFromDisk(dataset_path=\"path/to/dataset\")\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n
"},{"location":"components-gallery/steps/loaddatafromdisk/#load-data-from-a-distilabel-distiset","title":"Load data from a distilabel Distiset","text":"
from distilabel.steps import LoadDataFromDisk\n\n# Specify the configuration to load.\nloader = LoadDataFromDisk(\n    dataset_path=\"path/to/dataset\",\n    is_distiset=True,\n    config=\"leaf_step_1\"\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'a': 1}, {'a': 2}, {'a': 3}], True)\n
"},{"location":"components-gallery/steps/loaddatafromdisk/#load-data-from-a-hugging-face-dataset-or-distiset-in-your-cloud-provider","title":"Load data from a Hugging Face Dataset or Distiset in your cloud provider","text":"
from distilabel.steps import LoadDataFromDisk\n\nloader = LoadDataFromDisk(\n    dataset_path=\"gcs://path/to/dataset\",\n    storage_options={\"project\": \"experiments-0001\"}\n)\nloader.load()\n\n# Just like we saw with LoadDataFromDicts, the `process` method will yield batches.\nresult = next(loader.process())\n# >>> result\n# ([{'type': 'function', 'function':...', False)\n
"},{"location":"components-gallery/steps/prepareexamples/","title":"PrepareExamples","text":"

Helper step to create examples from query and answer pairs, used as few-shot examples in APIGen.

"},{"location":"components-gallery/steps/prepareexamples/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[query]\n            ICOL1[answers]\n        end\n        subgraph New columns\n            OCOL0[examples]\n        end\n    end\n\n    subgraph PrepareExamples\n        StepInput[Input Columns: query, answers]\n        StepOutput[Output Columns: examples]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/prepareexamples/#inputs","title":"Inputs","text":"
  • query (str): The query to generate examples from.

  • answers (str): The answers to the query.

"},{"location":"components-gallery/steps/prepareexamples/#outputs","title":"Outputs","text":"
  • examples (str): The formatted examples.
"},{"location":"components-gallery/steps/prepareexamples/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/prepareexamples/#generate-examples-for-apigen","title":"Generate examples for APIGen","text":"
from distilabel.steps.tasks.apigen.utils import PrepareExamples\n\nprepare_examples = PrepareExamples()\nresult = next(prepare_examples.process(\n    [\n        {\n            \"query\": ['I need the area of circles with radius 2.5, 5, and 7.5 inches, please.', 'Can you provide the current locations of buses and trolleys on route 12?'],\n            \"answers\": ['[{\"name\": \"circle_area\", \"arguments\": {\"radius\": 2.5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 7.5}}]', '[{\"name\": \"bus_trolley_locations\", \"arguments\": {\"route\": \"12\"}}]']\n        }\n    ]\n))\n# result\n# [{'examples': '## Query:\\nI need the area of circles with radius 2.5, 5, and 7.5 inches, please.\\n## Answers:\\n[{\"name\": \"circle_area\", \"arguments\": {\"radius\": 2.5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 7.5}}]\\n\\n## Query:\\nCan you provide the current locations of buses and trolleys on route 12?\\n## Answers:\\n[{\"name\": \"bus_trolley_locations\", \"arguments\": {\"route\": \"12\"}}]'}, {'examples': '## Query:\\nI need the area of circles with radius 2.5, 5, and 7.5 inches, please.\\n## Answers:\\n[{\"name\": \"circle_area\", \"arguments\": {\"radius\": 2.5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 5}}, {\"name\": \"circle_area\", \"arguments\": {\"radius\": 7.5}}]\\n\\n## Query:\\nCan you provide the current locations of buses and trolleys on route 12?\\n## Answers:\\n[{\"name\": \"bus_trolley_locations\", \"arguments\": {\"route\": \"12\"}}]'}]\n
"},{"location":"components-gallery/steps/conversationtemplate/","title":"ConversationTemplate","text":"

Generate a conversation template from an instruction and a response.

"},{"location":"components-gallery/steps/conversationtemplate/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[response]\n        end\n        subgraph New columns\n            OCOL0[conversation]\n        end\n    end\n\n    subgraph ConversationTemplate\n        StepInput[Input Columns: instruction, response]\n        StepOutput[Output Columns: conversation]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/conversationtemplate/#inputs","title":"Inputs","text":"
  • instruction (str): The instruction to be used in the conversation.

  • response (str): The response to be used in the conversation.

"},{"location":"components-gallery/steps/conversationtemplate/#outputs","title":"Outputs","text":"
  • conversation (ChatType): The conversation template.
"},{"location":"components-gallery/steps/conversationtemplate/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/conversationtemplate/#create-a-conversation-from-an-instruction-and-a-response","title":"Create a conversation from an instruction and a response","text":"
from distilabel.steps import ConversationTemplate\n\nconv_template = ConversationTemplate()\nconv_template.load()\n\nresult = next(\n    conv_template.process(\n        [\n            {\n                \"instruction\": \"Hello\",\n                \"response\": \"Hi\",\n            }\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'Hello', 'response': 'Hi', 'conversation': [{'role': 'user', 'content': 'Hello'}, {'role': 'assistant', 'content': 'Hi'}]}]\n
"},{"location":"components-gallery/steps/formattextgenerationdpo/","title":"FormatTextGenerationDPO","text":"

Format the output of your LLMs for Direct Preference Optimization (DPO).

FormatTextGenerationDPO is a Step that formats the output of the combination of a TextGeneration task with a preference Task i.e. a task generating ratings, so that those are used to rank the existing generations and provide the chosen and rejected generations based on the ratings. Use this step to transform the output of a combination of a TextGeneration + a preference task such as UltraFeedback following the standard formatting from frameworks such as axolotl or alignment-handbook.

"},{"location":"components-gallery/steps/formattextgenerationdpo/#note","title":"Note","text":"

The generations column should contain at least two generations, the ratings column should contain the same number of ratings as generations.

"},{"location":"components-gallery/steps/formattextgenerationdpo/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[system_prompt]\n            ICOL1[instruction]\n            ICOL2[generations]\n            ICOL3[generation_models]\n            ICOL4[ratings]\n        end\n        subgraph New columns\n            OCOL0[prompt]\n            OCOL1[prompt_id]\n            OCOL2[chosen]\n            OCOL3[chosen_model]\n            OCOL4[chosen_rating]\n            OCOL5[rejected]\n            OCOL6[rejected_model]\n            OCOL7[rejected_rating]\n        end\n    end\n\n    subgraph FormatTextGenerationDPO\n        StepInput[Input Columns: system_prompt, instruction, generations, generation_models, ratings]\n        StepOutput[Output Columns: prompt, prompt_id, chosen, chosen_model, chosen_rating, rejected, rejected_model, rejected_rating]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    ICOL3 --> StepInput\n    ICOL4 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n    StepOutput --> OCOL4\n    StepOutput --> OCOL5\n    StepOutput --> OCOL6\n    StepOutput --> OCOL7\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/formattextgenerationdpo/#inputs","title":"Inputs","text":"
  • system_prompt (str, optional): The system prompt used within the LLM to generate the generations, if available.

  • instruction (str): The instruction used to generate the generations with the LLM.

  • generations (List[str]): The generations produced by the LLM.

  • generation_models (List[str], optional): The model names used to generate the generations, only available if the model_name from the TextGeneration task/s is combined into a single column named this way, otherwise, it will be ignored.

  • ratings (List[float]): The ratings for each of the generations, produced by a preference task such as UltraFeedback.

"},{"location":"components-gallery/steps/formattextgenerationdpo/#outputs","title":"Outputs","text":"
  • prompt (str): The instruction used to generate the generations with the LLM.

  • prompt_id (str): The SHA256 hash of the prompt.

  • chosen (List[Dict[str, str]]): The chosen generation based on the ratings.

  • chosen_model (str, optional): The model name used to generate the chosen generation, if the generation_models are available.

  • chosen_rating (float): The rating of the chosen generation.

  • rejected (List[Dict[str, str]]): The rejected generation based on the ratings.

  • rejected_model (str, optional): The model name used to generate the rejected generation, if the generation_models are available.

  • rejected_rating (float): The rating of the rejected generation.

"},{"location":"components-gallery/steps/formattextgenerationdpo/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/formattextgenerationdpo/#format-your-dataset-for-dpo-fine-tuning","title":"Format your dataset for DPO fine tuning","text":"
from distilabel.steps import FormatTextGenerationDPO\n\nformat_dpo = FormatTextGenerationDPO()\nformat_dpo.load()\n\n# NOTE: Both \"system_prompt\" and \"generation_models\" can be added optionally.\nresult = next(\n    format_dpo.process(\n        [\n            {\n                \"instruction\": \"What's 2+2?\",\n                \"generations\": [\"4\", \"5\", \"6\"],\n                \"ratings\": [1, 0, -1],\n            }\n        ]\n    )\n)\n# >>> result\n# [\n#    {   'instruction': \"What's 2+2?\",\n#        'generations': ['4', '5', '6'],\n#        'ratings': [1, 0, -1],\n#        'prompt': \"What's 2+2?\",\n#        'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n#        'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n#        'chosen_rating': 1,\n#        'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n#        'rejected_rating': -1\n#    }\n# ]\n
"},{"location":"components-gallery/steps/formatchatgenerationdpo/","title":"FormatChatGenerationDPO","text":"

Format the output of a combination of a ChatGeneration + a preference task for Direct Preference Optimization (DPO).

FormatChatGenerationDPO is a Step that formats the output of the combination of a ChatGeneration task with a preference Task, i.e. a task generating ratings such as UltraFeedback, so that those are used to rank the existing generations and provide the chosen and rejected generations based on the ratings, following the standard formatting from frameworks such as axolotl or alignment-handbook.

"},{"location":"components-gallery/steps/formatchatgenerationdpo/#note","title":"Note","text":"

The messages column should contain at least one message from the user, the generations column should contain at least two generations, the ratings column should contain the same number of ratings as generations.

"},{"location":"components-gallery/steps/formatchatgenerationdpo/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[messages]\n            ICOL1[generations]\n            ICOL2[generation_models]\n            ICOL3[ratings]\n        end\n        subgraph New columns\n            OCOL0[prompt]\n            OCOL1[prompt_id]\n            OCOL2[chosen]\n            OCOL3[chosen_model]\n            OCOL4[chosen_rating]\n            OCOL5[rejected]\n            OCOL6[rejected_model]\n            OCOL7[rejected_rating]\n        end\n    end\n\n    subgraph FormatChatGenerationDPO\n        StepInput[Input Columns: messages, generations, generation_models, ratings]\n        StepOutput[Output Columns: prompt, prompt_id, chosen, chosen_model, chosen_rating, rejected, rejected_model, rejected_rating]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    ICOL3 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n    StepOutput --> OCOL4\n    StepOutput --> OCOL5\n    StepOutput --> OCOL6\n    StepOutput --> OCOL7\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/formatchatgenerationdpo/#inputs","title":"Inputs","text":"
  • messages (List[Dict[str, str]]): The conversation messages.

  • generations (List[str]): The generations produced by the LLM.

  • generation_models (List[str], optional): The model names used to generate the generations, only available if the model_name from the ChatGeneration task/s is combined into a single column named this way, otherwise, it will be ignored.

  • ratings (List[float]): The ratings for each of the generations, produced by a preference task such as UltraFeedback.

"},{"location":"components-gallery/steps/formatchatgenerationdpo/#outputs","title":"Outputs","text":"
  • prompt (str): The user message used to generate the generations with the LLM.

  • prompt_id (str): The SHA256 hash of the prompt.

  • chosen (List[Dict[str, str]]): The chosen generation based on the ratings.

  • chosen_model (str, optional): The model name used to generate the chosen generation, if the generation_models are available.

  • chosen_rating (float): The rating of the chosen generation.

  • rejected (List[Dict[str, str]]): The rejected generation based on the ratings.

  • rejected_model (str, optional): The model name used to generate the rejected generation, if the generation_models are available.

  • rejected_rating (float): The rating of the rejected generation.

"},{"location":"components-gallery/steps/formatchatgenerationdpo/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/formatchatgenerationdpo/#format-your-dataset-for-dpo-fine-tuning","title":"Format your dataset for DPO fine tuning","text":"
from distilabel.steps import FormatChatGenerationDPO\n\nformat_dpo = FormatChatGenerationDPO()\nformat_dpo.load()\n\n# NOTE: \"generation_models\" can be added optionally.\nresult = next(\n    format_dpo.process(\n        [\n            {\n                \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n                \"generations\": [\"4\", \"5\", \"6\"],\n                \"ratings\": [1, 0, -1],\n            }\n        ]\n    )\n)\n# >>> result\n# [\n#     {\n#         'messages': [{'role': 'user', 'content': \"What's 2+2?\"}],\n#         'generations': ['4', '5', '6'],\n#         'ratings': [1, 0, -1],\n#         'prompt': \"What's 2+2?\",\n#         'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n#         'chosen': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n#         'chosen_rating': 1,\n#         'rejected': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '6'}],\n#         'rejected_rating': -1\n#     }\n# ]\n
"},{"location":"components-gallery/steps/formattextgenerationsft/","title":"FormatTextGenerationSFT","text":"

Format the output of a TextGeneration task for Supervised Fine-Tuning (SFT).

FormatTextGenerationSFT is a Step that formats the output of a TextGeneration task for Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as axolotl or alignment-handbook. The output of the TextGeneration task is formatted into a chat-like conversation with the instruction as the user message and the generation as the assistant message. Optionally, if the system_prompt is available, it is included as the first message in the conversation.

"},{"location":"components-gallery/steps/formattextgenerationsft/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[system_prompt]\n            ICOL1[instruction]\n            ICOL2[generation]\n        end\n        subgraph New columns\n            OCOL0[prompt]\n            OCOL1[prompt_id]\n            OCOL2[messages]\n        end\n    end\n\n    subgraph FormatTextGenerationSFT\n        StepInput[Input Columns: system_prompt, instruction, generation]\n        StepOutput[Output Columns: prompt, prompt_id, messages]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/formattextgenerationsft/#inputs","title":"Inputs","text":"
  • system_prompt (str, optional): The system prompt used within the LLM to generate the generation, if available.

  • instruction (str): The instruction used to generate the generation with the LLM.

  • generation (str): The generation produced by the LLM.

"},{"location":"components-gallery/steps/formattextgenerationsft/#outputs","title":"Outputs","text":"
  • prompt (str): The instruction used to generate the generation with the LLM.

  • prompt_id (str): The SHA256 hash of the prompt.

  • messages (List[Dict[str, str]]): The chat-like conversation with the instruction as the user message and the generation as the assistant message.

"},{"location":"components-gallery/steps/formattextgenerationsft/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/formattextgenerationsft/#format-your-dataset-for-sft-fine-tuning","title":"Format your dataset for SFT fine tuning","text":"
from distilabel.steps import FormatTextGenerationSFT\n\nformat_sft = FormatTextGenerationSFT()\nformat_sft.load()\n\n# NOTE: \"system_prompt\" can be added optionally.\nresult = next(\n    format_sft.process(\n        [\n            {\n                \"instruction\": \"What's 2+2?\",\n                \"generation\": \"4\"\n            }\n        ]\n    )\n)\n# >>> result\n# [\n#     {\n#         'instruction': \"What's 2+2?\",\n#         'generation': '4',\n#         'prompt': \"What's 2+2?\",\n#         'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n#         'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}]\n#     }\n# ]\n
"},{"location":"components-gallery/steps/formatchatgenerationsft/","title":"FormatChatGenerationSFT","text":"

Format the output of a ChatGeneration task for Supervised Fine-Tuning (SFT).

FormatChatGenerationSFT is a Step that formats the output of a ChatGeneration task for Supervised Fine-Tuning (SFT) following the standard formatting from frameworks such as axolotl or alignment-handbook. The output of the ChatGeneration task is formatted into a chat-like conversation with the instruction as the user message and the generation as the assistant message. Optionally, if the system_prompt is available, it is included as the first message in the conversation.

"},{"location":"components-gallery/steps/formatchatgenerationsft/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[system_prompt]\n            ICOL1[instruction]\n            ICOL2[generation]\n        end\n        subgraph New columns\n            OCOL0[prompt]\n            OCOL1[prompt_id]\n            OCOL2[messages]\n        end\n    end\n\n    subgraph FormatChatGenerationSFT\n        StepInput[Input Columns: system_prompt, instruction, generation]\n        StepOutput[Output Columns: prompt, prompt_id, messages]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/formatchatgenerationsft/#inputs","title":"Inputs","text":"
  • system_prompt (str, optional): The system prompt used within the LLM to generate the generation, if available.

  • instruction (str): The instruction used to generate the generation with the LLM.

  • generation (str): The generation produced by the LLM.

"},{"location":"components-gallery/steps/formatchatgenerationsft/#outputs","title":"Outputs","text":"
  • prompt (str): The instruction used to generate the generation with the LLM.

  • prompt_id (str): The SHA256 hash of the prompt.

  • messages (List[Dict[str, str]]): The chat-like conversation with the instruction as the user message and the generation as the assistant message.

"},{"location":"components-gallery/steps/formatchatgenerationsft/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/formatchatgenerationsft/#format-your-dataset-for-sft","title":"Format your dataset for SFT","text":"
from distilabel.steps import FormatChatGenerationSFT\n\nformat_sft = FormatChatGenerationSFT()\nformat_sft.load()\n\n# NOTE: \"system_prompt\" can be added optionally.\nresult = next(\n    format_sft.process(\n        [\n            {\n                \"messages\": [{\"role\": \"user\", \"content\": \"What's 2+2?\"}],\n                \"generation\": \"4\"\n            }\n        ]\n    )\n)\n# >>> result\n# [\n#     {\n#         'messages': [{'role': 'user', 'content': \"What's 2+2?\"}, {'role': 'assistant', 'content': '4'}],\n#         'generation': '4',\n#         'prompt': 'What's 2+2?',\n#         'prompt_id': '7762ecf17ad41479767061a8f4a7bfa3b63d371672af5180872f9b82b4cd4e29',\n#     }\n# ]\n
"},{"location":"components-gallery/steps/deitafiltering/","title":"DeitaFiltering","text":"

Filter dataset rows using DEITA filtering strategy.

Filter the dataset based on the DEITA score and the cosine distance between the embeddings. It's an implementation of the filtering step from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.

"},{"location":"components-gallery/steps/deitafiltering/#attributes","title":"Attributes","text":"
  • data_budget: The desired size of the dataset after filtering.

  • diversity_threshold: If a row has a cosine distance with respect to its nearest neighbor greater than this value, it will be included in the filtered dataset. Defaults to 0.9.

  • normalize_embeddings: Whether to normalize the embeddings before computing the cosine distance. Defaults to True.

"},{"location":"components-gallery/steps/deitafiltering/#runtime-parameters","title":"Runtime Parameters","text":"
  • data_budget: The desired size of the dataset after filtering.

  • diversity_threshold: If a row has a cosine distance with respect to its nearest neighbor greater than this value, it will be included in the filtered dataset.

"},{"location":"components-gallery/steps/deitafiltering/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[evol_instruction_score]\n            ICOL1[evol_response_score]\n            ICOL2[embedding]\n        end\n        subgraph New columns\n            OCOL0[deita_score]\n            OCOL1[deita_score_computed_with]\n            OCOL2[nearest_neighbor_distance]\n        end\n    end\n\n    subgraph DeitaFiltering\n        StepInput[Input Columns: evol_instruction_score, evol_response_score, embedding]\n        StepOutput[Output Columns: deita_score, deita_score_computed_with, nearest_neighbor_distance]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/deitafiltering/#inputs","title":"Inputs","text":"
  • evol_instruction_score (float): The score of the instruction generated by ComplexityScorer step.

  • evol_response_score (float): The score of the response generated by QualityScorer step.

  • embedding (List[float]): The embedding generated for the conversation of the instruction-response pair using GenerateEmbeddings step.

"},{"location":"components-gallery/steps/deitafiltering/#outputs","title":"Outputs","text":"
  • deita_score (float): The DEITA score for the instruction-response pair.

  • deita_score_computed_with (List[str]): The scores used to compute the DEITA score.

  • nearest_neighbor_distance (float): The cosine distance between the embeddings of the instruction-response pair.

"},{"location":"components-gallery/steps/deitafiltering/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/deitafiltering/#filter-the-dataset-based-on-the-deita-score-and-the-cosine-distance-between-the-embeddings","title":"Filter the dataset based on the DEITA score and the cosine distance between the embeddings","text":"
from distilabel.steps import DeitaFiltering\n\ndeita_filtering = DeitaFiltering(data_budget=1)\n\ndeita_filtering.load()\n\nresult = next(\n    deita_filtering.process(\n        [\n            {\n                \"evol_instruction_score\": 0.5,\n                \"evol_response_score\": 0.5,\n                \"embedding\": [-8.12729941, -5.24642847, -6.34003029],\n            },\n            {\n                \"evol_instruction_score\": 0.6,\n                \"evol_response_score\": 0.6,\n                \"embedding\": [2.99329242, 0.7800932, 0.7799726],\n            },\n            {\n                \"evol_instruction_score\": 0.7,\n                \"evol_response_score\": 0.7,\n                \"embedding\": [10.29041806, 14.33088073, 13.00557506],\n            },\n        ],\n    )\n)\n# >>> result\n# [{'evol_instruction_score': 0.5, 'evol_response_score': 0.5, 'embedding': [-8.12729941, -5.24642847, -6.34003029], 'deita_score': 0.25, 'deita_score_computed_with': ['evol_instruction_score', 'evol_response_score'], 'nearest_neighbor_distance': 1.9042812683723933}]\n
"},{"location":"components-gallery/steps/deitafiltering/#references","title":"References","text":"
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
"},{"location":"components-gallery/steps/embeddingdedup/","title":"EmbeddingDedup","text":"

Deduplicates text using embeddings.

EmbeddingDedup is a Step that detects near-duplicates in datasets, using embeddings to compare the similarity between the texts. The typical workflow with this step would include having a dataset with embeddings precomputed, and then (possibly using the FaissNearestNeighbour) using the nn_indices and nn_scores, determine the texts that are duplicate.

"},{"location":"components-gallery/steps/embeddingdedup/#attributes","title":"Attributes","text":"
  • threshold: the threshold to consider 2 examples as duplicates. It's dependent on the type of index that was used to generate the embeddings. For example, if the embeddings were generated using cosine similarity, a threshold of 0.9 would make all the texts with a cosine similarity above the value duplicates. Higher values detect fewer duplicates in such an index, but that should be taken into account when building it. Defaults to 0.9. Runtime Parameters: - threshold: the threshold to consider 2 examples as duplicates.
"},{"location":"components-gallery/steps/embeddingdedup/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[nn_indices]\n            ICOL1[nn_scores]\n        end\n        subgraph New columns\n            OCOL0[keep_row_after_embedding_filtering]\n        end\n    end\n\n    subgraph EmbeddingDedup\n        StepInput[Input Columns: nn_indices, nn_scores]\n        StepOutput[Output Columns: keep_row_after_embedding_filtering]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/embeddingdedup/#inputs","title":"Inputs","text":"
  • nn_indices (List[int]): a list containing the indices of the k nearest neighbours in the inputs for the row.

  • nn_scores (List[float]): a list containing the score or distance to each k nearest neighbour in the inputs.

"},{"location":"components-gallery/steps/embeddingdedup/#outputs","title":"Outputs","text":"
  • keep_row_after_embedding_filtering (bool): boolean indicating if the piece of text is not a duplicate i.e. this text should be kept.
"},{"location":"components-gallery/steps/embeddingdedup/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/embeddingdedup/#deduplicate-a-list-of-texts-using-embedding-information","title":"Deduplicate a list of texts using embedding information","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps import EmbeddingDedup\nfrom distilabel.steps import LoadDataFromDicts\n\nwith Pipeline() as pipeline:\n    data = LoadDataFromDicts(\n        data=[\n            {\n                \"persona\": \"A chemistry student or academic researcher interested in inorganic or physical chemistry, likely at an advanced undergraduate or graduate level, studying acid-base interactions and chemical bonding.\",\n                \"embedding\": [\n                    0.018477669046149742,\n                    -0.03748236608841726,\n                    0.001919870620352492,\n                    0.024918478063770535,\n                    0.02348063521315178,\n                    0.0038251285566308375,\n                    -0.01723884983037716,\n                    0.02881971942372201,\n                ],\n                \"nn_indices\": [0, 1],\n                \"nn_scores\": [\n                    0.9164746999740601,\n                    0.782106876373291,\n                ],\n            },\n            {\n                \"persona\": \"A music teacher or instructor focused on theoretical and practical piano lessons.\",\n                \"embedding\": [\n                    -0.0023464179614082125,\n                    -0.07325472251663565,\n                    -0.06058678419516501,\n                    -0.02100326928586996,\n                    -0.013462744792362657,\n                    0.027368447064244242,\n                    -0.003916070100455717,\n                    0.01243614518480423,\n                ],\n                \"nn_indices\": [0, 2],\n                \"nn_scores\": [\n                    0.7552462220191956,\n                    0.7261884808540344,\n                ],\n            },\n            {\n                \"persona\": \"A classical guitar teacher or instructor, likely with experience teaching beginners, who focuses on breaking down complex music notation into understandable steps for their 
students.\",\n                \"embedding\": [\n                    -0.01630817942328242,\n                    -0.023760151552345232,\n                    -0.014249650090627883,\n                    -0.005713686451446624,\n                    -0.016033059279131567,\n                    0.0071440908501058786,\n                    -0.05691099643425161,\n                    0.01597412704817784,\n                ],\n                \"nn_indices\": [1, 2],\n                \"nn_scores\": [\n                    0.8107735514640808,\n                    0.7172299027442932,\n                ],\n            },\n        ],\n        batch_size=batch_size,\n    )\n    # In general you should do something like this before the deduplication step, to obtain the\n    # `nn_indices` and `nn_scores`. In this case the embeddings are already normalized, so there's\n    # no need for it.\n    # nn = FaissNearestNeighbour(\n    #     k=30,\n    #     metric_type=faiss.METRIC_INNER_PRODUCT,\n    #     search_batch_size=50,\n    #     train_size=len(dataset),              # The number of embeddings to use for training\n    #     string_factory=\"IVF300_HNSW32,Flat\"   # To use an index (optional, maybe required for big datasets)\n    # )\n    # Read more about the `string_factory` here:\n    # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index\n\n    embedding_dedup = EmbeddingDedup(\n        threshold=0.8,\n        input_batch_size=batch_size,\n    )\n\n    data >> embedding_dedup\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(use_cache=False)\n    ds = distiset[\"default\"][\"train\"]\n    # Filter out the duplicates\n    ds_dedup = ds.filter(lambda x: x[\"keep_row_after_embedding_filtering\"])\n
"},{"location":"components-gallery/steps/apigenexecutionchecker/","title":"APIGenExecutionChecker","text":"

Executes the generated function calls.

This step checks if a given answer from a model as generated by APIGenGenerator can be executed against the given library (given by libpath, which is a string pointing to a python .py file with functions).

"},{"location":"components-gallery/steps/apigenexecutionchecker/#attributes","title":"Attributes","text":"
  • libpath: The path to the library where we will retrieve the functions. It can also point to a folder with the functions. In this case, the folder layout should be a folder with .py files, each containing a single function, the name of the function being the same as the filename.

  • check_is_dangerous: Bool to exclude some potentially dangerous functions, it contains some heuristics found while testing. These functions can run subprocesses, deal with the OS, or have other potentially dangerous operations. Defaults to True.

"},{"location":"components-gallery/steps/apigenexecutionchecker/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[answers]\n        end\n        subgraph New columns\n            OCOL0[keep_row_after_execution_check]\n            OCOL1[execution_result]\n        end\n    end\n\n    subgraph APIGenExecutionChecker\n        StepInput[Input Columns: answers]\n        StepOutput[Output Columns: keep_row_after_execution_check, execution_result]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/apigenexecutionchecker/#inputs","title":"Inputs","text":"
  • answers (str): List with arguments to be passed to the function, dumped as a string from a list of dictionaries. Should be loaded using json.loads.
"},{"location":"components-gallery/steps/apigenexecutionchecker/#outputs","title":"Outputs","text":"
  • keep_row_after_execution_check (bool): Whether the function should be kept or not.

  • execution_result (str): The result from executing the function.

"},{"location":"components-gallery/steps/apigenexecutionchecker/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/apigenexecutionchecker/#execute-a-function-from-a-given-library-with-the-answer-from-an-llm","title":"Execute a function from a given library with the answer from an LLM","text":"
from distilabel.steps.tasks import APIGenExecutionChecker\n\n# For the libpath you can use as an example the file at the tests folder:\n# ../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\ntask = APIGenExecutionChecker(\n    libpath=\"../distilabel/tests/unit/steps/tasks/apigen/_sample_module.py\",\n)\ntask.load()\n\nres = next(\n    task.process(\n        [\n            {\n                \"answers\": [\n                    {\n                        \"arguments\": {\n                            \"initial_velocity\": 0.2,\n                            \"acceleration\": 0.1,\n                            \"time\": 0.5,\n                        },\n                        \"name\": \"final_velocity\",\n                    }\n                ],\n            }\n        ]\n    )\n)\nres\n#[{'answers': [{'arguments': {'initial_velocity': 0.2, 'acceleration': 0.1, 'time': 0.5}, 'name': 'final_velocity'}], 'keep_row_after_execution_check': True, 'execution_result': ['0.25']}]\n
"},{"location":"components-gallery/steps/apigenexecutionchecker/#references","title":"References","text":"
  • APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets

  • Salesforce/xlam-function-calling-60k

"},{"location":"components-gallery/steps/minhashdedup/","title":"MinHashDedup","text":"

Deduplicates text using MinHash and MinHashLSH.

MinHashDedup is a Step that detects near-duplicates in datasets. The idea roughly translates to the following steps: 1. Tokenize the text into words or ngrams. 2. Create a MinHash for each text. 3. Store the MinHashes in a MinHashLSH. 4. Check if the MinHash is already in the LSH, if so, it is a duplicate.

"},{"location":"components-gallery/steps/minhashdedup/#attributes","title":"Attributes","text":"
  • num_perm: the number of permutations to use. Defaults to 128.

  • seed: the seed to use for the MinHash. This seed must be the same used for MinHash, keep in mind when both steps are created. Defaults to 1.

  • tokenizer: the tokenizer to use. Available ones are words or ngrams. If words is selected, it tokenizes the text into words using nltk's word tokenizer. If ngrams is selected, it computes the ngrams of the text (using the size n). Defaults to words.

  • n: the size of the ngrams to use. Only relevant if tokenizer=\"ngrams\". Defaults to 5.

  • threshold: the threshold to consider two MinHashes as duplicates. Values closer to 0 detect more duplicates. Defaults to 0.9.

  • storage: the storage to use for the LSH. Can be dict to store the index in memory, or disk. Keep in mind, disk is an experimental feature not defined in datasketch, that is based on DiskCache's Index class. It should work as a dict, but backed by disk, but depending on the system it can be slower. Defaults to dict. which uses a custom shelve backend. Note that disk is an experimental feature that may cause issues. Defaults to dict.

"},{"location":"components-gallery/steps/minhashdedup/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[text]\n        end\n        subgraph New columns\n            OCOL0[keep_row_after_minhash_filtering]\n        end\n    end\n\n    subgraph MinHashDedup\n        StepInput[Input Columns: text]\n        StepOutput[Output Columns: keep_row_after_minhash_filtering]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/minhashdedup/#inputs","title":"Inputs","text":"
  • text (str): the texts to be filtered.
"},{"location":"components-gallery/steps/minhashdedup/#outputs","title":"Outputs","text":"
  • keep_row_after_minhash_filtering (bool): boolean indicating if the piece of text is not a duplicate i.e. this text should be kept.
"},{"location":"components-gallery/steps/minhashdedup/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/minhashdedup/#deduplicate-a-list-of-texts-using-minhash-and-minhashlsh","title":"Deduplicate a list of texts using MinHash and MinHashLSH","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps import MinHashDedup\nfrom distilabel.steps import LoadDataFromDicts\n\nwith Pipeline() as pipeline:\n    ds_size = 1000\n    batch_size = 500  # Bigger batch sizes work better for this step\n    data = LoadDataFromDicts(\n        data=[\n            {\"text\": \"This is a test document.\"},\n            {\"text\": \"This document is a test.\"},\n            {\"text\": \"Test document for duplication.\"},\n            {\"text\": \"Document for duplication test.\"},\n            {\"text\": \"This is another unique document.\"},\n        ]\n        * (ds_size // 5),\n        batch_size=batch_size,\n    )\n    minhash_dedup = MinHashDedup(\n        tokenizer=\"words\",\n        threshold=0.9,      # lower values will increase the number of duplicates\n        storage=\"dict\",     # or \"disk\" for bigger datasets\n    )\n\n    data >> minhash_dedup\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(use_cache=False)\n    ds = distiset[\"default\"][\"train\"]\n    # Filter out the duplicates\n    ds_dedup = ds.filter(lambda x: x[\"keep_row_after_minhash_filtering\"])\n
"},{"location":"components-gallery/steps/minhashdedup/#references","title":"References","text":"
  • datasketch documentation

  • Identifying and Filtering Near-Duplicate Documents

  • Diskcache's Index

"},{"location":"components-gallery/steps/combineoutputs/","title":"CombineOutputs","text":"

Combine the outputs of several upstream steps.

CombineOutputs is a Step that takes the outputs of several upstream steps and combines them to generate a new dictionary with all keys/columns of the upstream steps outputs.

"},{"location":"components-gallery/steps/combineoutputs/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[dynamic]\n        end\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph CombineOutputs\n        StepInput[Input Columns: dynamic]\n        StepOutput[Output Columns: dynamic]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/combineoutputs/#inputs","title":"Inputs","text":"
  • dynamic (based on the upstream Steps): All the columns of the upstream steps outputs.
"},{"location":"components-gallery/steps/combineoutputs/#outputs","title":"Outputs","text":"
  • dynamic (based on the upstream Steps): All the columns of the upstream steps outputs.
"},{"location":"components-gallery/steps/combineoutputs/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/combineoutputs/#combine-dictionaries-of-a-dataset","title":"Combine dictionaries of a dataset","text":"
from distilabel.steps import CombineOutputs\n\ncombine_outputs = CombineOutputs()\ncombine_outputs.load()\n\nresult = next(\n    combine_outputs.process(\n        [{\"a\": 1, \"b\": 2}, {\"a\": 3, \"b\": 4}],\n        [{\"c\": 5, \"d\": 6}, {\"c\": 7, \"d\": 8}],\n    )\n)\n# [\n#   {\"a\": 1, \"b\": 2, \"c\": 5, \"d\": 6},\n#   {\"a\": 3, \"b\": 4, \"c\": 7, \"d\": 8},\n# ]\n
"},{"location":"components-gallery/steps/combineoutputs/#combine-upstream-steps-outputs-in-a-pipeline","title":"Combine upstream steps outputs in a pipeline","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps import CombineOutputs\n\nwith Pipeline() as pipeline:\n    step_1 = ...\n    step_2 = ...\n    step_3 = ...\n    combine = CombineOutputs()\n\n    [step_1, step_2, step_3] >> combine\n
"},{"location":"components-gallery/steps/expandcolumns/","title":"ExpandColumns","text":"

Expand columns that contain lists into multiple rows.

ExpandColumns is a Step that takes a list of columns and expands them into multiple rows. The new rows will have the same data as the original row, except for the expanded column, which will contain a single item from the original list.

"},{"location":"components-gallery/steps/expandcolumns/#attributes","title":"Attributes","text":"
  • columns: A dictionary that maps the column to be expanded to the new column name or a list of columns to be expanded. If a list is provided, the new column name will be the same as the column name.
"},{"location":"components-gallery/steps/expandcolumns/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[dynamic]\n        end\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph ExpandColumns\n        StepInput[Input Columns: dynamic]\n        StepOutput[Output Columns: dynamic]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/expandcolumns/#inputs","title":"Inputs","text":"
  • dynamic (determined by columns attribute): The columns to be expanded into multiple rows.
"},{"location":"components-gallery/steps/expandcolumns/#outputs","title":"Outputs","text":"
  • dynamic (determined by columns attribute): The expanded columns.
"},{"location":"components-gallery/steps/expandcolumns/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/expandcolumns/#expand-the-selected-columns-into-multiple-rows","title":"Expand the selected columns into multiple rows","text":"
from distilabel.steps import ExpandColumns\n\nexpand_columns = ExpandColumns(\n    columns=[\"generation\"],\n)\nexpand_columns.load()\n\nresult = next(\n    expand_columns.process(\n        [\n            {\n                \"instruction\": \"instruction 1\",\n                \"generation\": [\"generation 1\", \"generation 2\"]}\n        ],\n    )\n)\n# >>> result\n# [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]\n
"},{"location":"components-gallery/steps/groupcolumns/","title":"GroupColumns","text":"

Combines columns from a list of StepInput.

GroupColumns is a Step that implements the process method that calls the group_dicts function to handle and combine a list of StepInput. Also GroupColumns provides two attributes columns and output_columns to specify the columns to group and the output columns which will override the default value for the properties inputs and outputs, respectively.

"},{"location":"components-gallery/steps/groupcolumns/#attributes","title":"Attributes","text":"
  • columns: List of strings with the names of the columns to group.

  • output_columns: Optional list of strings with the names of the output columns.

"},{"location":"components-gallery/steps/groupcolumns/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[dynamic]\n        end\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph GroupColumns\n        StepInput[Input Columns: dynamic]\n        StepOutput[Output Columns: dynamic]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/groupcolumns/#inputs","title":"Inputs","text":"
  • dynamic (determined by columns attribute): The columns to group.
"},{"location":"components-gallery/steps/groupcolumns/#outputs","title":"Outputs","text":"
  • dynamic (determined by columns and output_columns attributes): The columns that were grouped.
"},{"location":"components-gallery/steps/groupcolumns/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/groupcolumns/#group-columns-of-a-dataset","title":"Group columns of a dataset","text":"
from distilabel.steps import GroupColumns\n\ngroup_columns = GroupColumns(\n    name=\"group_columns\",\n    columns=[\"generation\", \"model_name\"],\n)\ngroup_columns.load()\n\nresult = next(\n    group_columns.process(\n        [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n        [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n    )\n)\n# >>> result\n# [{'merged_generation': ['AI generated text', 'Other generated text'], 'merged_model_name': ['my_model']}]\n
"},{"location":"components-gallery/steps/groupcolumns/#specify-the-name-of-the-output-columns","title":"Specify the name of the output columns","text":"
from distilabel.steps import GroupColumns\n\ngroup_columns = GroupColumns(\n    name=\"group_columns\",\n    columns=[\"generation\", \"model_name\"],\n    output_columns=[\"generations\", \"generation_models\"]\n)\ngroup_columns.load()\n\nresult = next(\n    group_columns.process(\n        [{\"generation\": \"AI generated text\"}, {\"model_name\": \"my_model\"}],\n        [{\"generation\": \"Other generated text\", \"model_name\": \"my_model\"}]\n    )\n)\n# >>> result\n#[{'generations': ['AI generated text', 'Other generated text'], 'generation_models': ['my_model']}]\n
"},{"location":"components-gallery/steps/keepcolumns/","title":"KeepColumns","text":"

Keeps selected columns in the dataset.

KeepColumns is a Step that implements the process method that keeps only the columns specified in the columns attribute. Also KeepColumns provides an attribute columns to specify the columns to keep which will override the default value for the properties inputs and outputs.

"},{"location":"components-gallery/steps/keepcolumns/#note","title":"Note","text":"

The order in which the columns are provided is important, as the output will be sorted using the provided order, which is useful before pushing either a dataset.Dataset via the PushToHub step or a distilabel.Distiset via the Pipeline.run output variable.

"},{"location":"components-gallery/steps/keepcolumns/#attributes","title":"Attributes","text":"
  • columns: List of strings with the names of the columns to keep.
"},{"location":"components-gallery/steps/keepcolumns/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[dynamic]\n        end\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph KeepColumns\n        StepInput[Input Columns: dynamic]\n        StepOutput[Output Columns: dynamic]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/keepcolumns/#inputs","title":"Inputs","text":"
  • dynamic (determined by columns attribute): The columns to keep.
"},{"location":"components-gallery/steps/keepcolumns/#outputs","title":"Outputs","text":"
  • dynamic (determined by columns attribute): The columns that were kept.
"},{"location":"components-gallery/steps/keepcolumns/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/keepcolumns/#select-the-columns-to-keep","title":"Select the columns to keep","text":"
from distilabel.steps import KeepColumns\n\nkeep_columns = KeepColumns(\n    columns=[\"instruction\", \"generation\"],\n)\nkeep_columns.load()\n\nresult = next(\n    keep_columns.process(\n        [{\"instruction\": \"What's the brightest color?\", \"generation\": \"white\", \"model_name\": \"my_model\"}],\n    )\n)\n# >>> result\n# [{'instruction': \"What's the brightest color?\", 'generation': 'white'}]\n
"},{"location":"components-gallery/steps/mergecolumns/","title":"MergeColumns","text":"

Merge columns from a row.

MergeColumns is a Step that implements the process method that calls the merge_columns function to handle and combine columns in a StepInput. MergeColumns provides two attributes columns and output_column to specify the columns to merge and the resulting output column.

This step can be useful if you have a `Task` that generates instructions for example, and you\nwant to have more examples of those. In such a case, you could for example use another `Task`\nto multiply your instructions synthetically, which would yield two separate columns.\nUsing `MergeColumns` you can merge them and use them as a single column in your dataset for\nfurther processing.\n
"},{"location":"components-gallery/steps/mergecolumns/#attributes","title":"Attributes","text":"
  • columns: List of strings with the names of the columns to merge.

  • output_column: str name of the output column

"},{"location":"components-gallery/steps/mergecolumns/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[dynamic]\n        end\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph MergeColumns\n        StepInput[Input Columns: dynamic]\n        StepOutput[Output Columns: dynamic]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/mergecolumns/#inputs","title":"Inputs","text":"
  • dynamic (determined by columns attribute): The columns to merge.
"},{"location":"components-gallery/steps/mergecolumns/#outputs","title":"Outputs","text":"
  • dynamic (determined by columns and output_column attributes): The columns that were merged.
"},{"location":"components-gallery/steps/mergecolumns/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/mergecolumns/#combine-columns-in-rows-of-a-dataset","title":"Combine columns in rows of a dataset","text":"
from distilabel.steps import MergeColumns\n\ncombiner = MergeColumns(\n    columns=[\"queries\", \"multiple_queries\"],\n    output_column=\"queries\",\n)\ncombiner.load()\n\nresult = next(\n    combiner.process(\n        [\n            {\n                \"queries\": \"How are you?\",\n                \"multiple_queries\": [\"What's up?\", \"Everything ok?\"]\n            }\n        ],\n    )\n)\n# >>> result\n# [{'queries': ['How are you?', \"What's up?\", 'Everything ok?']}]\n
"},{"location":"components-gallery/steps/dbscan/","title":"DBSCAN","text":"

DBSCAN (Density-Based Spatial Clustering of Applications with Noise) finds core samples in regions of high density and expands clusters from them.

This algorithm is good for data which contains clusters of similar density.

This is a `GlobalStep` that clusters the embeddings using the DBSCAN algorithm\nfrom `sklearn`. Visit `TextClustering` step for an example of use.\nThe trained model is saved as an artifact when creating a distiset\nand pushing it to the Hugging Face Hub.\n
"},{"location":"components-gallery/steps/dbscan/#attributes","title":"Attributes","text":"
  • eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.

  • min_samples: The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. If min_samples is set to a higher value, DBSCAN will find denser clusters, whereas if it is set to a lower value, the found clusters will be more sparse.

  • metric: The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by sklearn.metrics.pairwise_distances for its metric parameter.

  • n_jobs: The number of parallel jobs to run.
"},{"location":"components-gallery/steps/dbscan/#runtime-parameters","title":"Runtime Parameters","text":"
  • eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function.

  • min_samples: The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. If min_samples is set to a higher value, DBSCAN will find denser clusters, whereas if it is set to a lower value, the found clusters will be more sparse.

  • metric: The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by sklearn.metrics.pairwise_distances for its metric parameter.

  • n_jobs: The number of parallel jobs to run.

"},{"location":"components-gallery/steps/dbscan/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[projection]\n        end\n        subgraph New columns\n            OCOL0[cluster_label]\n        end\n    end\n\n    subgraph DBSCAN\n        StepInput[Input Columns: projection]\n        StepOutput[Output Columns: cluster_label]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/dbscan/#inputs","title":"Inputs","text":"
  • projection (List[float]): Vector representation of the text to cluster, normally the output from the UMAP step.
"},{"location":"components-gallery/steps/dbscan/#outputs","title":"Outputs","text":"
  • cluster_label (int): Integer representing the label of a given cluster. -1 means it wasn't clustered.
"},{"location":"components-gallery/steps/dbscan/#references","title":"References","text":"
  • DBSCAN demo of sklearn

  • sklearn dbscan

"},{"location":"components-gallery/steps/umap/","title":"UMAP","text":"

UMAP is a general purpose manifold learning and dimension reduction algorithm.

This is a GlobalStep that reduces the dimensionality of the embeddings using UMAP. Visit the TextClustering step for an example of use. The trained model is saved as an artifact when creating a distiset and pushing it to the Hugging Face Hub.

"},{"location":"components-gallery/steps/umap/#attributes","title":"Attributes","text":"
  • n_components: The dimension of the space to embed into. This defaults to 2 to provide easy visualization (that's probably what you want), but can reasonably be set to any integer value in the range 2 to 100.

  • metric: The metric to use to compute distances in high dimensional space. Visit UMAP's documentation for more information. Defaults to euclidean.

  • n_jobs: The number of parallel jobs to run. Defaults to 8.

  • random_state: The random state to use for the UMAP algorithm.
"},{"location":"components-gallery/steps/umap/#runtime-parameters","title":"Runtime Parameters","text":"
  • n_components: The dimension of the space to embed into. This defaults to 2 to provide easy visualization (that's probably what you want), but can reasonably be set to any integer value in the range 2 to 100.

  • metric: The metric to use to compute distances in high dimensional space. Visit UMAP's documentation for more information. Defaults to euclidean.

  • n_jobs: The number of parallel jobs to run. Defaults to 8.

  • random_state: The random state to use for the UMAP algorithm.

"},{"location":"components-gallery/steps/umap/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[embedding]\n        end\n        subgraph New columns\n            OCOL0[projection]\n        end\n    end\n\n    subgraph UMAP\n        StepInput[Input Columns: embedding]\n        StepOutput[Output Columns: projection]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/umap/#inputs","title":"Inputs","text":"
  • embedding (List[float]): The original embeddings whose dimensionality we want to reduce.
"},{"location":"components-gallery/steps/umap/#outputs","title":"Outputs","text":"
  • projection (List[float]): Embedding reduced to the number of components specified, the size of the new embeddings will be determined by the n_components.
"},{"location":"components-gallery/steps/umap/#references","title":"References","text":"
  • UMAP repository

  • UMAP documentation

"},{"location":"components-gallery/steps/faissnearestneighbour/","title":"FaissNearestNeighbour","text":"

Create a faiss index to get the nearest neighbours.

FaissNearestNeighbour is a GlobalStep that creates a faiss index using the Hugging Face datasets library integration, and then gets the nearest neighbours and the scores or distance of the nearest neighbours for each input row.

"},{"location":"components-gallery/steps/faissnearestneighbour/#attributes","title":"Attributes","text":"
  • device: the CUDA device ID or a list of IDs to be used. If negative integer, it will use all the available GPUs. Defaults to None.

  • string_factory: the name of the factory to be used to build the faiss index. Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes. Defaults to None.

  • metric_type: the metric to be used to measure the distance between the points. It's an integer and the recommended way to pass it is importing faiss and then passing one of faiss.METRIC_x variables. Defaults to None.

  • k: the number of nearest neighbours to search for each input row. Defaults to 1.

  • search_batch_size: the number of rows to include in a search batch. The value can be adjusted to maximize the resources usage or to avoid OOM issues. Defaults to 50.

  • train_size: If the index needs a training step, specifies how many vectors will be used to train the index.

"},{"location":"components-gallery/steps/faissnearestneighbour/#runtime-parameters","title":"Runtime Parameters","text":"
  • device: the CUDA device ID or a list of IDs to be used. If negative integer, it will use all the available GPUs. Defaults to None.

  • string_factory: the name of the factory to be used to build the faiss index. Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes. Defaults to None.

  • metric_type: the metric to be used to measure the distance between the points. It's an integer and the recommended way to pass it is importing faiss and then passing one of faiss.METRIC_x variables. Defaults to None.

  • k: the number of nearest neighbours to search for each input row. Defaults to 1.

  • search_batch_size: the number of rows to include in a search batch. The value can be adjusted to maximize the resources usage or to avoid OOM issues. Defaults to 50.

  • train_size: If the index needs a training step, specifies how many vectors will be used to train the index.

"},{"location":"components-gallery/steps/faissnearestneighbour/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[embedding]\n        end\n        subgraph New columns\n            OCOL0[nn_indices]\n            OCOL1[nn_scores]\n        end\n    end\n\n    subgraph FaissNearestNeighbour\n        StepInput[Input Columns: embedding]\n        StepOutput[Output Columns: nn_indices, nn_scores]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/faissnearestneighbour/#inputs","title":"Inputs","text":"
  • embedding (List[Union[float, int]]): a sentence embedding.
"},{"location":"components-gallery/steps/faissnearestneighbour/#outputs","title":"Outputs","text":"
  • nn_indices (List[int]): a list containing the indices of the k nearest neighbours in the inputs for the row.

  • nn_scores (List[float]): a list containing the score or distance to each k nearest neighbour in the inputs.

"},{"location":"components-gallery/steps/faissnearestneighbour/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/faissnearestneighbour/#generating-embeddings-and-getting-the-nearest-neighbours","title":"Generating embeddings and getting the nearest neighbours","text":"
from distilabel.models import SentenceTransformerEmbeddings\nfrom distilabel.pipeline import Pipeline\nfrom distilabel.steps import EmbeddingGeneration, FaissNearestNeighbour, LoadDataFromHub\n\nwith Pipeline(name=\"hello\") as pipeline:\n    load_data = LoadDataFromHub(output_mappings={\"prompt\": \"text\"})\n\n    embeddings = EmbeddingGeneration(\n        embeddings=SentenceTransformerEmbeddings(\n            model=\"mixedbread-ai/mxbai-embed-large-v1\"\n        )\n    )\n\n    nearest_neighbours = FaissNearestNeighbour()\n\n    load_data >> embeddings >> nearest_neighbours\n\nif __name__ == \"__main__\":\n    distiset = pipeline.run(\n        parameters={\n            load_data.name: {\n                \"repo_id\": \"distilabel-internal-testing/instruction-dataset-mini\",\n                \"split\": \"test\",\n            },\n        },\n        use_cache=False,\n    )\n
"},{"location":"components-gallery/steps/faissnearestneighbour/#references","title":"References","text":"
  • The Faiss library
"},{"location":"components-gallery/steps/embeddinggeneration/","title":"EmbeddingGeneration","text":"

Generate embeddings using an Embeddings model.

EmbeddingGeneration is a Step that uses an Embeddings model to generate sentence embeddings for the provided input texts.

"},{"location":"components-gallery/steps/embeddinggeneration/#attributes","title":"Attributes","text":"
  • embeddings: the Embeddings model used to generate the sentence embeddings.
"},{"location":"components-gallery/steps/embeddinggeneration/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[text]\n        end\n        subgraph New columns\n            OCOL0[embedding]\n        end\n    end\n\n    subgraph EmbeddingGeneration\n        StepInput[Input Columns: text]\n        StepOutput[Output Columns: embedding]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/embeddinggeneration/#inputs","title":"Inputs","text":"
  • text (str): The text for which the sentence embedding has to be generated.
"},{"location":"components-gallery/steps/embeddinggeneration/#outputs","title":"Outputs","text":"
  • embedding (List[Union[float, int]]): the generated sentence embedding.
"},{"location":"components-gallery/steps/embeddinggeneration/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/embeddinggeneration/#generate-sentence-embeddings-with-sentence-transformers","title":"Generate sentence embeddings with Sentence Transformers","text":"
from distilabel.models import SentenceTransformerEmbeddings\nfrom distilabel.steps import EmbeddingGeneration\n\nembedding_generation = EmbeddingGeneration(\n    embeddings=SentenceTransformerEmbeddings(\n        model=\"mixedbread-ai/mxbai-embed-large-v1\",\n    )\n)\n\nembedding_generation.load()\n\nresult = next(embedding_generation.process([{\"text\": \"Hello, how are you?\"}]))\n# [{'text': 'Hello, how are you?', 'embedding': [0.06209656596183777, -0.015797119587659836, ...]}]\n
"},{"location":"components-gallery/steps/rewardmodelscore/","title":"RewardModelScore","text":"

Assign a score to a response using a Reward Model.

RewardModelScore is a Step that uses a Reward Model (RM) loaded with transformers to assign a score to a response generated for an instruction, or a score to a multi-turn conversation.

"},{"location":"components-gallery/steps/rewardmodelscore/#attributes","title":"Attributes","text":"
  • model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

  • revision: if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\".

  • torch_dtype: the torch dtype to use for the model e.g. \"float16\", \"float32\", etc. Defaults to \"auto\".

  • trust_remote_code: whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False.

  • device_map: a dictionary mapping each layer of the model to a device, or a mode like \"sequential\" or \"auto\". Defaults to None.

  • token: the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment or huggingface_hub package local configuration will be used. Defaults to None.

  • truncation: whether to truncate sequences at the maximum length. Defaults to False.

  • max_length: maximum length to use for padding or truncation. Defaults to None.

"},{"location":"components-gallery/steps/rewardmodelscore/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[response]\n            ICOL2[conversation]\n        end\n        subgraph New columns\n            OCOL0[score]\n        end\n    end\n\n    subgraph RewardModelScore\n        StepInput[Input Columns: instruction, response, conversation]\n        StepOutput[Output Columns: score]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/rewardmodelscore/#inputs","title":"Inputs","text":"
  • instruction (str, optional): the instruction used to generate a response. If provided, then response must be provided too.

  • response (str, optional): the response generated for instruction. If provided, then instruction must be provided too.

  • conversation (ChatType, optional): a multi-turn conversation. If not provided, then instruction and response columns must be provided.

"},{"location":"components-gallery/steps/rewardmodelscore/#outputs","title":"Outputs","text":"
  • score (float): the score given by the reward model for the instruction-response pair or the conversation.
"},{"location":"components-gallery/steps/rewardmodelscore/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/rewardmodelscore/#response-pair","title":"response pair","text":"
from distilabel.steps import RewardModelScore\n\nstep = RewardModelScore(\n    model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n)\n\nstep.load()\n\nresult = next(\n    step.process(\n        inputs=[\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"response\": \"The output of 2+2 is 4\",\n            },\n            {\"instruction\": \"How much is 2+2?\", \"response\": \"4\"},\n        ]\n    )\n)\n# [\n#   {'instruction': 'How much is 2+2?', 'response': 'The output of 2+2 is 4', 'score': 0.11690367758274078},\n#   {'instruction': 'How much is 2+2?', 'response': '4', 'score': 0.10300665348768234}\n# ]\n
"},{"location":"components-gallery/steps/rewardmodelscore/#turn-conversation","title":"turn conversation","text":"
from distilabel.steps import RewardModelScore\n\nstep = RewardModelScore(\n    model=\"RLHFlow/ArmoRM-Llama3-8B-v0.1\", device_map=\"auto\", trust_remote_code=True\n)\n\nstep.load()\n\nresult = next(\n    step.process(\n        inputs=[\n            {\n                \"conversation\": [\n                    {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n                    {\"role\": \"assistant\", \"content\": \"The output of 2+2 is 4\"},\n                ],\n            },\n            {\n                \"conversation\": [\n                    {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n                    {\"role\": \"assistant\", \"content\": \"4\"},\n                ],\n            },\n        ]\n    )\n)\n# [\n#   {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': 'The output of 2+2 is 4'}], 'score': 0.11690367758274078},\n#   {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': '4'}], 'score': 0.10300665348768234}\n# ]\n
"},{"location":"components-gallery/steps/truncatetextcolumn/","title":"TruncateTextColumn","text":"

Truncate a row using a tokenizer or the number of characters.

TruncateTextColumn is a Step that truncates a row according to the max length. If the tokenizer is provided, then the row will be truncated using the tokenizer, and the max_length will be used as the maximum number of tokens, otherwise it will be used as the maximum number of characters. The TruncateTextColumn step is useful when one wants to truncate a row to a certain length, to avoid downstream errors in the model due to the length.

"},{"location":"components-gallery/steps/truncatetextcolumn/#attributes","title":"Attributes","text":"
  • column: the column to truncate. Defaults to \"text\".

  • max_length: the maximum length to use for truncation. If a tokenizer is given, corresponds to the number of tokens, otherwise corresponds to the number of characters. Defaults to 8192.

  • tokenizer: the name of the tokenizer to use. If provided, the row will be truncated using the tokenizer. Defaults to None.

"},{"location":"components-gallery/steps/truncatetextcolumn/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[dynamic]\n        end\n        subgraph New columns\n            OCOL0[dynamic]\n        end\n    end\n\n    subgraph TruncateTextColumn\n        StepInput[Input Columns: dynamic]\n        StepOutput[Output Columns: dynamic]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/steps/truncatetextcolumn/#inputs","title":"Inputs","text":"
  • dynamic (determined by column attribute): The columns to be truncated, defaults to \"text\".
"},{"location":"components-gallery/steps/truncatetextcolumn/#outputs","title":"Outputs","text":"
  • dynamic (determined by column attribute): The truncated column.
"},{"location":"components-gallery/steps/truncatetextcolumn/#examples","title":"Examples","text":""},{"location":"components-gallery/steps/truncatetextcolumn/#truncating-a-row-to-a-given-number-of-tokens","title":"Truncating a row to a given number of tokens","text":"
from distilabel.steps import TruncateTextColumn\n\ntrunc = TruncateTextColumn(\n    tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    max_length=4,\n    column=\"text\"\n)\n\ntrunc.load()\n\nresult = next(\n    trunc.process(\n        [\n            {\"text\": \"This is a sample text that is longer than 10 characters\"}\n        ]\n    )\n)\n# result\n# [{'text': 'This is a sample'}]\n
"},{"location":"components-gallery/steps/truncatetextcolumn/#truncating-a-row-to-a-given-number-of-characters","title":"Truncating a row to a given number of characters","text":"
from distilabel.steps import TruncateTextColumn\n\ntrunc = TruncateTextColumn(max_length=10)\n\ntrunc.load()\n\nresult = next(\n    trunc.process(\n        [\n            {\"text\": \"This is a sample text that is longer than 10 characters\"}\n        ]\n    )\n)\n# result\n# [{'text': 'This is a '}]\n
"},{"location":"components-gallery/tasks/","title":"Tasks Gallery","text":"Category Overview

The gallery page showcases the different types of components within distilabel.

Icon Category Description text-generation Text generation steps are used to generate text based on a given prompt. chat-generation Chat generation steps are used to generate text based on a conversation. text-classification Text classification steps are used to classify text into a category. text-manipulation Text manipulation steps are used to manipulate or rewrite an input text. evol Evol steps are used to rewrite input text and evolve it to a higher quality. critique Critique steps are used to provide feedback on the quality of the data with a written explanation. scorer Scorer steps are used to evaluate and score the data with a numerical value. preference Preference steps are used to collect preferences on the data with numerical values or ranks. embedding Embedding steps are used to generate embeddings for the data. clustering Clustering steps are used to group similar data points together. columns Columns steps are used to manipulate columns in the data. filtering Filtering steps are used to filter the data based on some criteria. format Format steps are used to format the data. load Load steps are used to load the data. execution Executes python functions. save Save steps are used to save the data.
  • APIGenGenerator

    Generate queries and answers for the given functions in JSON format.

    APIGenGenerator

  • Genstruct

    Generate a pair of instruction-response from a document using an LLM.

    Genstruct

  • Magpie

    Generates conversations using an instruct fine-tuned LLM.

    Magpie

  • SelfInstruct

    Generate instructions based on a given input using an LLM.

    SelfInstruct

  • TextGeneration

    Text generation with an LLM given a prompt.

    TextGeneration

  • URIAL

    Generates a response using a non-instruct fine-tuned model.

    URIAL

  • MagpieGenerator

    Generator task that generates instructions or conversations using Magpie.

    MagpieGenerator

  • ChatGeneration

    Generates text based on a conversation.

    ChatGeneration

  • ArgillaLabeller

    Annotate Argilla records based on input fields, example records and question settings.

    ArgillaLabeller

  • TextClassification

    Classifies text into one or more categories or labels.

    TextClassification

  • EvolInstruct

    Evolve instructions using an LLM.

    EvolInstruct

  • EvolComplexity

    Evolve instructions to make them more complex using an LLM.

    EvolComplexity

  • EvolQuality

    Evolve the quality of the responses using an LLM.

    EvolQuality

  • EvolInstructGenerator

    Generate evolved instructions using an LLM.

    EvolInstructGenerator

  • EvolComplexityGenerator

    Generate evolved instructions with increased complexity using an LLM.

    EvolComplexityGenerator

  • InstructionBacktranslation

    Self-Alignment with Instruction Backtranslation.

    InstructionBacktranslation

  • PrometheusEval

    Critique and rank the quality of generations from an LLM using Prometheus 2.0.

    PrometheusEval

  • ComplexityScorer

    Score instructions based on their complexity using an LLM.

    ComplexityScorer

  • QualityScorer

    Score responses based on their quality using an LLM.

    QualityScorer

  • CLAIR

    Contrastive Learning from AI Revisions (CLAIR).

    CLAIR

  • UltraFeedback

    Rank generations focusing on different aspects using an LLM.

    UltraFeedback

  • PairRM

    Rank the candidates based on the input using the LLM model.

    PairRM

  • GenerateSentencePair

    Generate a positive and (optionally) a negative sentence given an anchor sentence.

    GenerateSentencePair

  • GenerateEmbeddings

    Generate embeddings using the last hidden state of an LLM.

    GenerateEmbeddings

  • TextClustering

    Task that clusters a set of texts and generates summary labels for each cluster.

    TextClustering

  • TextClustering

    Task that clusters a set of texts and generates summary labels for each cluster.

    TextClustering

  • APIGenSemanticChecker

    Generate queries and answers for the given functions in JSON format.

    APIGenSemanticChecker

  • GenerateTextRetrievalData

    Generate text retrieval data with an LLM to later on train an embedding model.

    GenerateTextRetrievalData

  • GenerateShortTextMatchingData

    Generate short text matching data with an LLM to later on train an embedding model.

    GenerateShortTextMatchingData

  • GenerateLongTextMatchingData

    Generate long text matching data with an LLM to later on train an embedding model.

    GenerateLongTextMatchingData

  • GenerateTextClassificationData

    Generate text classification data with an LLM to later on train an embedding model.

    GenerateTextClassificationData

  • StructuredGeneration

    Generate structured content for a given instruction using an LLM.

    StructuredGeneration

  • MonolingualTripletGenerator

    Generate monolingual triplets with an LLM to later on train an embedding model.

    MonolingualTripletGenerator

  • BitextRetrievalGenerator

    Generate bitext retrieval data with an LLM to later on train an embedding model.

    BitextRetrievalGenerator

  • EmbeddingTaskGenerator

    Generate task descriptions for embedding-related tasks using an LLM.

    EmbeddingTaskGenerator

"},{"location":"components-gallery/tasks/apigengenerator/","title":"APIGenGenerator","text":"

Generate queries and answers for the given functions in JSON format.

The APIGenGenerator is inspired by the APIGen pipeline, which was designed to generate verifiable and diverse function-calling datasets. The task generates a set of diverse queries and corresponding answers for the given functions in JSON format.

"},{"location":"components-gallery/tasks/apigengenerator/#attributes","title":"Attributes","text":"
  • system_prompt: The system prompt to guide the user in the generation of queries and answers.

  • use_tools: Whether to use the tools available in the prompt to generate the queries and answers. In case the tools are given in the input, they will be added to the prompt.

  • number: The number of queries to generate. It can be a list, where each number will be chosen randomly, or a dictionary with the number of queries and the probability of each. I.e: number=1, number=[1, 2, 3], number={1: 0.5, 2: 0.3, 3: 0.2} are all valid inputs. It corresponds to the number of parallel queries to generate.

  • use_default_structured_output: Whether to use the default structured output or not.

"},{"location":"components-gallery/tasks/apigengenerator/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[examples]\n            ICOL1[func_name]\n            ICOL2[func_desc]\n            ICOL3[tools]\n        end\n        subgraph New columns\n            OCOL0[query]\n            OCOL1[answers]\n        end\n    end\n\n    subgraph APIGenGenerator\n        StepInput[Input Columns: examples, func_name, func_desc, tools]\n        StepOutput[Output Columns: query, answers]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    ICOL3 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/apigengenerator/#inputs","title":"Inputs","text":"
  • examples (str): Examples used as few shots to guide the model.

  • func_name (str): Name for the function to generate.

  • func_desc (str): Description of what the function should do.

  • tools (str): JSON formatted string containing the tool representation of the function.

"},{"location":"components-gallery/tasks/apigengenerator/#outputs","title":"Outputs","text":"
  • query (str): The list of queries.

  • answers (str): JSON formatted string with the list of answers, containing the info as a dictionary to be passed to the functions.

"},{"location":"components-gallery/tasks/apigengenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/apigengenerator/#generate-without-structured-output-original-implementation","title":"Generate without structured output (original implementation)","text":"
from distilabel.steps.tasks import ApiGenGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    generation_kwargs={\n        \"temperature\": 0.7,\n        \"max_new_tokens\": 1024,\n    },\n)\napigen = ApiGenGenerator(\n    use_default_structured_output=False,\n    llm=llm\n)\napigen.load()\n\nres = next(\n    apigen.process(\n        [\n            {\n                \"examples\": 'QUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n                \"func_name\": \"getrandommovie\",\n                \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\"\n            }\n        ]\n    )\n)\nres\n# [{'examples': 'QUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n# 'number': 1,\n# 'func_name': 'getrandommovie',\n# 'func_desc': 'Returns a list of random movies from a database by calling an external API.',\n# 'queries': ['I want to watch a movie tonight, can you recommend a random one from your database?',\n# 'Give me 5 random movie suggestions from your database to plan my weekend.'],\n# 'answers': [[{'name': 'getrandommovie', 'arguments': {}}],\n# [{'name': 'getrandommovie', 'arguments': {}},\n#     {'name': 'getrandommovie', 'arguments': {}},\n#     {'name': 'getrandommovie', 'arguments': {}},\n#     {'name': 'getrandommovie', 'arguments': {}},\n#     {'name': 'getrandommovie', 'arguments': {}}]],\n# 'raw_input_api_gen_generator_0': [{'role': 'system',\n#     'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\n\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. 
Include in each query specific, plausible values for each parameter. For instance, if the function requires a date, use a typical and reasonable date.\n\nEnsure the query:\n- Is clear and concise\n- Demonstrates typical use cases\n- Includes all necessary parameters in a meaningful way. For numerical parameters, it could be either numbers or words\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\n- The corresponding result's parameter types and ranges match with the function's descriptions\n\nEnsure the answer:\n- Is a list of function calls in JSON format\n- The length of the answer list should be equal to the number of requests in the query\n- Can solve all the requests in the query effectively\"},\n#     {'role': 'user',\n#     'content': 'Here are examples of queries and the corresponding answers for similar functions:\nQUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]\n\nNote that the query could be interpreted as a combination of several independent requests.\nBased on these examples, generate 2 diverse query and answer pairs for the function `getrandommovie`\nThe detailed function description is the following:\nReturns a list of random movies from a database by calling an external API.\n\nThe output MUST strictly adhere to the following JSON format, and NO other text MUST be included:\n
"},{"location":"components-gallery/tasks/apigengenerator/#generate-with-structured-output","title":"Generate with structured output","text":"
from distilabel.steps.tasks import ApiGenGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    tokenizer=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    generation_kwargs={\n        \"temperature\": 0.7,\n        \"max_new_tokens\": 1024,\n    },\n)\napigen = ApiGenGenerator(\n    use_default_structured_output=True,\n    llm=llm\n)\napigen.load()\n\nres_struct = next(\n    apigen.process(\n        [\n            {\n                \"examples\": 'QUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n                \"func_name\": \"getrandommovie\",\n                \"func_desc\": \"Returns a list of random movies from a database by calling an external API.\"\n            }\n        ]\n    )\n)\nres_struct\n# [{'examples': 'QUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]',\n# 'number': 1,\n# 'func_name': 'getrandommovie',\n# 'func_desc': 'Returns a list of random movies from a database by calling an external API.',\n# 'queries': [\"I'm bored and want to watch a movie. Can you suggest some movies?\",\n# \"My family and I are planning a movie night. We can't decide on what to watch. Can you suggest some random movie titles?\"],\n# 'answers': [[{'arguments': {}, 'name': 'getrandommovie'}],\n# [{'arguments': {}, 'name': 'getrandommovie'}]],\n# 'raw_input_api_gen_generator_0': [{'role': 'system',\n#     'content': \"You are a data labeler. Your responsibility is to generate a set of diverse queries and corresponding answers for the given functions in JSON format.\n\nConstruct queries and answers that exemplify how to use these functions in a practical scenario. Include in each query specific, plausible values for each parameter. 
For instance, if the function requires a date, use a typical and reasonable date.\n\nEnsure the query:\n- Is clear and concise\n- Demonstrates typical use cases\n- Includes all necessary parameters in a meaningful way. For numerical parameters, it could be either numbers or words\n- Across a variety level of difficulties, ranging from beginner and advanced use cases\n- The corresponding result's parameter types and ranges match with the function's descriptions\n\nEnsure the answer:\n- Is a list of function calls in JSON format\n- The length of the answer list should be equal to the number of requests in the query\n- Can solve all the requests in the query effectively\"},\n#     {'role': 'user',\n#     'content': 'Here are examples of queries and the corresponding answers for similar functions:\nQUERY:\nWhat is the binary sum of 10010 and 11101?\nANSWER:\n[{\"name\": \"binary_addition\", \"arguments\": {\"a\": \"10010\", \"b\": \"11101\"}}]\n\nNote that the query could be interpreted as a combination of several independent requests.\nBased on these examples, generate 2 diverse query and answer pairs for the function `getrandommovie`\nThe detailed function description is the following:\nReturns a list of random movies from a database by calling an external API.\n\nNow please generate 2 diverse query and answer pairs following the above format.'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/apigengenerator/#references","title":"References","text":"
  • APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets

  • Salesforce/xlam-function-calling-60k

"},{"location":"components-gallery/tasks/genstruct/","title":"Genstruct","text":"

Generate a pair of instruction-response from a document using an LLM.

Genstruct is a pre-defined task designed to generate valid instructions from a given raw document, with the title and the content, enabling the creation of new, partially synthetic instruction finetuning datasets from any raw-text corpus. The task is based on the Genstruct 7B model by Nous Research, which is inspired by the Ada-Instruct paper.

"},{"location":"components-gallery/tasks/genstruct/#note","title":"Note","text":"

The Genstruct prompt i.e. the task, can be used with any model really, but the safest / recommended option is to use NousResearch/Genstruct-7B as the LLM provided to the task, since it was trained for this specific task.

"},{"location":"components-gallery/tasks/genstruct/#attributes","title":"Attributes","text":"
  • _template: a Jinja2 template used to format the input for the LLM.
"},{"location":"components-gallery/tasks/genstruct/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[title]\n            ICOL1[content]\n        end\n        subgraph New columns\n            OCOL0[user]\n            OCOL1[assistant]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph Genstruct\n        StepInput[Input Columns: title, content]\n        StepOutput[Output Columns: user, assistant, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/genstruct/#inputs","title":"Inputs","text":"
  • title (str): The title of the document.

  • content (str): The content of the document.

"},{"location":"components-gallery/tasks/genstruct/#outputs","title":"Outputs","text":"
  • user (str): The user's instruction based on the document.

  • assistant (str): The assistant's response based on the user's instruction.

  • model_name (str): The model name used to generate the feedback and result.

"},{"location":"components-gallery/tasks/genstruct/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/genstruct/#generate-instructions-from-raw-documents-using-the-title-and-content","title":"Generate instructions from raw documents using the title and content","text":"
from distilabel.steps.tasks import Genstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\ngenstruct = Genstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"NousResearch/Genstruct-7B\",\n    ),\n)\n\ngenstruct.load()\n\nresult = next(\n    genstruct.process(\n        [\n            {\"title\": \"common instruction\", \"content\": \"content of the document\"},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'title': 'An instruction',\n#         'content': 'content of the document',\n#         'model_name': 'test',\n#         'user': 'An instruction',\n#         'assistant': 'content of the document',\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/genstruct/#references","title":"References","text":"
  • Genstruct 7B by Nous Research

  • Ada-Instruct: Adapting Instruction Generators for Complex Reasoning

"},{"location":"components-gallery/tasks/magpie/","title":"Magpie","text":"

Generates conversations using an instruct fine-tuned LLM.

Magpie is a neat method that allows generating user instructions with no seed data or specific system prompt thanks to the autoregressive capabilities of the instruct fine-tuned LLMs. As they were fine-tuned using a chat template composed by a user message and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the LLM without any user message, then the LLM will continue generating tokens as if it was the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM. After this instruct is generated, it can be sent again to the LLM to generate this time an assistant response. This process can be repeated N times allowing to build a multi-turn conversation. This method was described in the paper 'Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing'.

"},{"location":"components-gallery/tasks/magpie/#attributes","title":"Attributes","text":"
  • n_turns: the number of turns that the generated conversation will have. Defaults to 1.

  • end_with_user: whether the conversation should end with a user message. Defaults to False.

  • include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False.

  • only_instruction: whether to generate only the instruction. If this argument is True, then n_turns will be ignored. Defaults to False.

  • system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Defaults to None.

"},{"location":"components-gallery/tasks/magpie/#runtime-parameters","title":"Runtime Parameters","text":"
  • n_turns: the number of turns that the generated conversation will have. Defaults to 1.

  • end_with_user: whether the conversation should end with a user message. Defaults to False.

  • include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False.

  • only_instruction: whether to generate only the instruction. If this argument is True, then n_turns will be ignored. Defaults to False.

  • system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic.

"},{"location":"components-gallery/tasks/magpie/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[system_prompt]\n        end\n        subgraph New columns\n            OCOL0[conversation]\n            OCOL1[instruction]\n            OCOL2[response]\n            OCOL3[system_prompt_key]\n            OCOL4[model_name]\n        end\n    end\n\n    subgraph Magpie\n        StepInput[Input Columns: system_prompt]\n        StepOutput[Output Columns: conversation, instruction, response, system_prompt_key, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n    StepOutput --> OCOL4\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/magpie/#inputs","title":"Inputs","text":"
  • system_prompt (str, optional): an optional system prompt that can be provided to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic.
"},{"location":"components-gallery/tasks/magpie/#outputs","title":"Outputs","text":"
  • conversation (ChatType): the generated conversation which is a list of chat items with a role and a message. Only if only_instruction=False.

  • instruction (str): the generated instructions if only_instruction=True or n_turns==1.

  • response (str): the generated response if n_turns==1.

  • system_prompt_key (str, optional): the key of the system prompt used to generate the conversation or instruction. Only if system_prompt is a dictionary.

  • model_name (str): The model name used to generate the conversation or instruction.

"},{"location":"components-gallery/tasks/magpie/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/magpie/#generating-instructions-with-llama-3-8b-instruct-and-transformersllm","title":"Generating instructions with Llama 3 8B Instruct and TransformersLLM","text":"
from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import Magpie\n\nmagpie = Magpie(\n    llm=TransformersLLM(\n        model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 1.0,\n            \"max_new_tokens\": 64,\n        },\n        device=\"mps\",\n    ),\n    only_instruction=True,\n)\n\nmagpie.load()\n\nresult = next(\n    magpie.process(\n        inputs=[\n            {\n                \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n            },\n            {\n                \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n            },\n        ]\n    )\n)\n# [\n#     {'instruction': \"That's me! I'd love some help with solving calculus problems! What kind of calculation are you most effective at? Linear Algebra, derivatives, integrals, optimization?\"},\n#     {'instruction': 'I was wondering if there are certain flowers and plants that can be used for pest control?'}\n# ]\n
"},{"location":"components-gallery/tasks/magpie/#generating-conversations-with-llama-3-8b-instruct-and-transformersllm","title":"Generating conversations with Llama 3 8B Instruct and TransformersLLM","text":"
from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import Magpie\n\nmagpie = Magpie(\n    llm=TransformersLLM(\n        model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 1.0,\n            \"max_new_tokens\": 256,\n        },\n        device=\"mps\",\n    ),\n    n_turns=2,\n)\n\nmagpie.load()\n\nresult = next(\n    magpie.process(\n        inputs=[\n            {\n                \"system_prompt\": \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"\n            },\n            {\n                \"system_prompt\": \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"\n            },\n        ]\n    )\n)\n# [\n#     {\n#         'conversation': [\n#             {'role': 'system', 'content': \"You're a math expert AI assistant that helps students of secondary school to solve calculus problems.\"},\n#             {\n#                 'role': 'user',\n#                 'content': 'I'm having trouble solving the limits of functions in calculus. Could you explain how to work with them? Limits of functions are denoted by lim x\u2192a f(x) or lim x\u2192a [f(x)]. It is read as \"the limit as x approaches a of f\n# of x\".'\n#             },\n#             {\n#                 'role': 'assistant',\n#                 'content': 'Limits are indeed a fundamental concept in calculus, and understanding them can be a bit tricky at first, but don't worry, I'm here to help! The notation lim x\u2192a f(x) indeed means \"the limit as x approaches a of f of\n# x\". 
What it's asking us to do is find the'\n#             }\n#         ]\n#     },\n#     {\n#         'conversation': [\n#             {'role': 'system', 'content': \"You're an expert florist AI assistant that helps user to erradicate pests in their crops.\"},\n#             {\n#                 'role': 'user',\n#                 'content': \"As a flower shop owner, I'm noticing some unusual worm-like creatures causing damage to my roses and other flowers. Can you help me identify what the problem is? Based on your expertise as a florist AI assistant, I think it\n# might be pests or diseases, but I'm not sure which.\"\n#             },\n#             {\n#                 'role': 'assistant',\n#                 'content': \"I'd be delighted to help you investigate the issue! Since you've noticed worm-like creatures damaging your roses and other flowers, I'll take a closer look at the possibilities. Here are a few potential culprits: 1.\n# **Aphids**: These small, soft-bodied insects can secrete a sticky substance called\"\n#             }\n#         ]\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/magpie/#references","title":"References","text":"
  • Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing
"},{"location":"components-gallery/tasks/selfinstruct/","title":"SelfInstruct","text":"

Generate instructions based on a given input using an LLM.

SelfInstruct is a pre-defined task that, given a number of instructions, certain criteria for query generation, an application description, and an input, generates a number of instructions related to the given input and following what is stated in the criteria for query generation and the application description. It is based on the SelfInstruct framework from the paper \"Self-Instruct: Aligning Language Models with Self-Generated Instructions\".

"},{"location":"components-gallery/tasks/selfinstruct/#attributes","title":"Attributes","text":"
  • num_instructions: The number of instructions to be generated. Defaults to 5.

  • criteria_for_query_generation: The criteria for the query generation. Defaults to the criteria defined within the paper.

  • application_description: The description of the AI application that one wants to build with these instructions. Defaults to AI assistant.

"},{"location":"components-gallery/tasks/selfinstruct/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[input]\n        end\n        subgraph New columns\n            OCOL0[instructions]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph SelfInstruct\n        StepInput[Input Columns: input]\n        StepOutput[Output Columns: instructions, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/selfinstruct/#inputs","title":"Inputs","text":"
  • input (str): The input to generate the instructions. It's also called seed in the paper.
"},{"location":"components-gallery/tasks/selfinstruct/#outputs","title":"Outputs","text":"
  • instructions (List[str]): The generated instructions.

  • model_name (str): The model name used to generate the instructions.

"},{"location":"components-gallery/tasks/selfinstruct/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/selfinstruct/#generate-instructions-based-on-a-given-input","title":"Generate instructions based on a given input","text":"
from distilabel.steps.tasks import SelfInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\nself_instruct = SelfInstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_instructions=5,  # This is the default value\n)\n\nself_instruct.load()\n\nresult = next(self_instruct.process([{\"input\": \"instruction\"}]))\n# result\n# [\n#     {\n#         'input': 'instruction',\n#         'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n#         'instructions': [\"instruction 1\", \"instruction 2\", \"instruction 3\", \"instruction 4\", \"instruction 5\"],\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/textgeneration/","title":"TextGeneration","text":"

Text generation with an LLM given a prompt.

TextGeneration is a pre-defined task that allows passing a custom prompt using the Jinja2 syntax. By default, an instruction is expected in the inputs, but using the template and columns attributes one can define a custom prompt and columns expected from the text. This task should be good enough for tasks that don't need post-processing of the responses generated by the LLM.

"},{"location":"components-gallery/tasks/textgeneration/#attributes","title":"Attributes","text":"
  • system_prompt: The system prompt to use in the generation. If not provided, then it will check if the input row has a column named system_prompt and use it. If not, then no system prompt will be used. Defaults to None.

  • template: The template to use for the generation. It must follow the Jinja2 template syntax. If not provided, it will assume the text passed is an instruction and construct the appropriate template.

  • columns: A string with the column, or a list with columns expected in the template. Take a look at the examples for more information. Defaults to instruction.

  • use_system_prompt: DEPRECATED. To be removed in 1.5.0. Whether to use the system prompt in the generation. Defaults to True, which means that if the column system_prompt is defined within the input batch, then the system_prompt will be used, otherwise, it will be ignored.

"},{"location":"components-gallery/tasks/textgeneration/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[dynamic]\n        end\n        subgraph New columns\n            OCOL0[generation]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph TextGeneration\n        StepInput[Input Columns: dynamic]\n        StepOutput[Output Columns: generation, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/textgeneration/#inputs","title":"Inputs","text":"
  • dynamic (determined by columns attribute): By default will be set to instruction. The columns can point both to a str or a List[str] to be used in the template.
"},{"location":"components-gallery/tasks/textgeneration/#outputs","title":"Outputs","text":"
  • generation (str): The generated text.

  • model_name (str): The name of the model used to generate the text.

"},{"location":"components-gallery/tasks/textgeneration/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/textgeneration/#generate-text-from-an-instruction","title":"Generate text from an instruction","text":"
from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\ntext_gen = TextGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    )\n)\n\ntext_gen.load()\n\nresult = next(\n    text_gen.process(\n        [{\"instruction\": \"your instruction\"}]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'your instruction',\n#         'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n#         'generation': 'generation',\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/textgeneration/#use-a-custom-template-to-generate-text","title":"Use a custom template to generate text","text":"
from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nCUSTOM_TEMPLATE = '''Document:\n{{ document }}\n\nQuestion: {{ question }}\n\nPlease provide a clear and concise answer to the question based on the information in the document and your general knowledge:\n'''.rstrip()\n\ntext_gen = TextGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    system_prompt=\"You are a helpful AI assistant. Your task is to answer the following question based on the provided document. If the answer is not explicitly stated in the document, use your knowledge to provide the most relevant and accurate answer possible. If you cannot answer the question based on the given information, state that clearly.\",\n    template=CUSTOM_TEMPLATE,\n    columns=[\"document\", \"question\"],\n)\n\ntext_gen.load()\n\nresult = next(\n    text_gen.process(\n        [\n            {\n                \"document\": \"The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.\",\n                \"question\": \"What is the main threat to the Great Barrier Reef mentioned in the document?\"\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'document': 'The Great Barrier Reef, located off the coast of Australia, is the world's largest coral reef system. It stretches over 2,300 kilometers and is home to a diverse array of marine life, including over 1,500 species of fish. 
However, in recent years, the reef has faced significant challenges due to climate change, with rising sea temperatures causing coral bleaching events.',\n#         'question': 'What is the main threat to the Great Barrier Reef mentioned in the document?',\n#         'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n#         'generation': 'According to the document, the main threat to the Great Barrier Reef is climate change, specifically rising sea temperatures causing coral bleaching events.',\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/textgeneration/#few-shot-learning-with-different-system-prompts","title":"Few shot learning with different system prompts","text":"
from distilabel.steps.tasks import TextGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nCUSTOM_TEMPLATE = '''Generate a clear, single-sentence instruction based on the following examples:\n\n{% for example in examples %}\nExample {{ loop.index }}:\nInstruction: {{ example }}\n\n{% endfor %}\nNow, generate a new instruction in a similar style:\n'''.rstrip()\n\ntext_gen = TextGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    template=CUSTOM_TEMPLATE,\n    columns=\"examples\",\n)\n\ntext_gen.load()\n\nresult = next(\n    text_gen.process(\n        [\n            {\n                \"examples\": [\"This is an example\", \"Another relevant example\"],\n                \"system_prompt\": \"You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.\"\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'examples': ['This is an example', 'Another relevant example'],\n#         'system_prompt': 'You are an AI assistant specialised in cybersecurity and computing in general, you make your point clear without any explanations.',\n#         'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct',\n#         'generation': 'Disable the firewall on the router',\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/textgeneration/#references","title":"References","text":"
  • Jinja2 Template Designer Documentation
"},{"location":"components-gallery/tasks/urial/","title":"URIAL","text":"

Generates a response using a non-instruct fine-tuned model.

URIAL is a pre-defined task that generates a response using a non-instruct fine-tuned model. This task is used to generate a response based on the conversation provided as input.

"},{"location":"components-gallery/tasks/urial/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[conversation]\n        end\n        subgraph New columns\n            OCOL0[generation]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph URIAL\n        StepInput[Input Columns: instruction, conversation]\n        StepOutput[Output Columns: generation, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/urial/#inputs","title":"Inputs","text":"
  • instruction (str, optional): The instruction to generate a response from.

  • conversation (List[Dict[str, str]], optional): The conversation to generate a response from (the last message must be from the user).

"},{"location":"components-gallery/tasks/urial/#outputs","title":"Outputs","text":"
  • generation (str): The generated response.

  • model_name (str): The name of the model used to generate the response.

"},{"location":"components-gallery/tasks/urial/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/urial/#generate-text-from-an-instruction","title":"Generate text from an instruction","text":"
from distilabel.models import vLLM\nfrom distilabel.steps.tasks import URIAL\n\nstep = URIAL(\n    llm=vLLM(\n        model=\"meta-llama/Meta-Llama-3.1-8B\",\n        generation_kwargs={\"temperature\": 0.7},\n    ),\n)\n\nstep.load()\n\nresults = next(\n    step.process(inputs=[{\"instruction\": \"What's the most most common type of cloud?\"}])\n)\n# [\n#     {\n#         'instruction': \"What's the most most common type of cloud?\",\n#         'generation': 'Clouds are classified into three main types, high, middle, and low. The most common type of cloud is the middle cloud.',\n#         'distilabel_metadata': {...},\n#         'model_name': 'meta-llama/Meta-Llama-3.1-8B'\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/urial/#references","title":"References","text":"
  • The Unlocking Spell on Base LLMs: Rethinking Alignment via In-Context Learning
"},{"location":"components-gallery/tasks/magpiegenerator/","title":"MagpieGenerator","text":"

Generator task that generates instructions or conversations using Magpie.

Magpie is a neat method that allows generating user instructions with no seed data or specific system prompt thanks to the autoregressive capabilities of the instruct fine-tuned LLMs. As they were fine-tuned using a chat template composed of a user message and a desired assistant output, the instruct fine-tuned LLM learns that after the pre-query or pre-instruct tokens comes an instruction. If these pre-query tokens are sent to the LLM without any user message, then the LLM will continue generating tokens as if it were the user. This trick allows \"extracting\" instructions from the instruct fine-tuned LLM. After this instruction is generated, it can be sent again to the LLM, this time to generate an assistant response. This process can be repeated N times, allowing a multi-turn conversation to be built. This method was described in the paper 'Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing'.

"},{"location":"components-gallery/tasks/magpiegenerator/#attributes","title":"Attributes","text":"
  • n_turns: the number of turns that the generated conversation will have. Defaults to 1.

  • end_with_user: whether the conversation should end with a user message. Defaults to False.

  • include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False.

  • only_instruction: whether to generate only the instruction. If this argument is True, then n_turns will be ignored. Defaults to False.

  • system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic. Defaults to None.

  • num_rows: the number of rows to be generated.

"},{"location":"components-gallery/tasks/magpiegenerator/#runtime-parameters","title":"Runtime Parameters","text":"
  • n_turns: the number of turns that the generated conversation will have. Defaults to 1.

  • end_with_user: whether the conversation should end with a user message. Defaults to False.

  • include_system_prompt: whether to include the system prompt used in the generated conversation. Defaults to False.

  • only_instruction: whether to generate only the instruction. If this argument is True, then n_turns will be ignored. Defaults to False.

  • system_prompt: an optional system prompt, or a list of system prompts from which a random one will be chosen, or a dictionary of system prompts from which a random one will be chosen, or a dictionary of system prompts with their probability of being chosen. The random system prompt will be chosen per input/output batch. This system prompt can be used to guide the generation of the instruct LLM and steer it to generate instructions of a certain topic.

  • num_rows: the number of rows to be generated.

"},{"location":"components-gallery/tasks/magpiegenerator/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[conversation]\n            OCOL1[instruction]\n            OCOL2[response]\n            OCOL3[system_prompt_key]\n            OCOL4[model_name]\n        end\n    end\n\n    subgraph MagpieGenerator\n        StepOutput[Output Columns: conversation, instruction, response, system_prompt_key, model_name]\n    end\n\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n    StepOutput --> OCOL4\n
"},{"location":"components-gallery/tasks/magpiegenerator/#outputs","title":"Outputs","text":"
  • conversation (ChatType): the generated conversation which is a list of chat items with a role and a message.

  • instruction (str): the generated instructions if only_instruction=True.

  • response (str): the generated response if n_turns==1.

  • system_prompt_key (str, optional): the key of the system prompt used to generate the conversation or instruction. Only if system_prompt is a dictionary.

  • model_name (str): The model name used to generate the conversation or instruction.

"},{"location":"components-gallery/tasks/magpiegenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/magpiegenerator/#generating-instructions-with-llama-3-8b-instruct-and-transformersllm","title":"Generating instructions with Llama 3 8B Instruct and TransformersLLM","text":"
from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\ngenerator = MagpieGenerator(\n    llm=TransformersLLM(\n        model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 1.0,\n            \"max_new_tokens\": 256,\n        },\n        device=\"mps\",\n    ),\n    only_instruction=True,\n    num_rows=5,\n)\n\ngenerator.load()\n\nresult = next(generator.process())\n# (\n#       [\n#           {\"instruction\": \"I've just bought a new phone and I're excited to start using it.\"},\n#           {\"instruction\": \"What are the most common types of companies that use digital signage?\"}\n#       ],\n#       True\n# )\n
"},{"location":"components-gallery/tasks/magpiegenerator/#generating-a-conversation-with-llama-3-8b-instruct-and-transformersllm","title":"Generating a conversation with Llama 3 8B Instruct and TransformersLLM","text":"
from distilabel.models import TransformersLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\ngenerator = MagpieGenerator(\n    llm=TransformersLLM(\n        model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 1.0,\n            \"max_new_tokens\": 64,\n        },\n        device=\"mps\",\n    ),\n    n_turns=3,\n    num_rows=5,\n)\n\ngenerator.load()\n\nresult = next(generator.process())\n# (\n#     [\n#         {\n#             'conversation': [\n#                 {\n#                     'role': 'system',\n#                     'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n# insightful responses to help the user with their queries.'\n#                 },\n#                 {'role': 'user', 'content': \"I'm considering starting a social media campaign for my small business and I're not sure where to start. Can you help?\"},\n#                 {\n#                     'role': 'assistant',\n#                     'content': \"Exciting endeavor! Creating a social media campaign can be a great way to increase brand awareness, drive website traffic, and ultimately boost sales. I'd be happy to guide you through the process. To get started,\n# let's break down the basics. First, we need to identify your goals and target audience. What do\"\n#                 },\n#                 {\n#                     'role': 'user',\n#                     'content': \"Before I start a social media campaign, what kind of costs ammol should I expect to pay? There are several factors that contribute to the total cost of running a social media campaign. Let me outline some of the main\n# expenses you might encounter: 1. 
Time: As the business owner, you'll likely spend time creating\"\n#                 },\n#                 {\n#                     'role': 'assistant',\n#                     'content': 'Time is indeed one of the biggest investments when it comes to running a social media campaign! Besides time, you may also incur costs associated with: 2. Content creation: You might need to hire freelancers or\n# agencies to create high-quality content (images, videos, captions) for your social media platforms. 3. Advertising'\n#                 }\n#             ]\n#         },\n#         {\n#             'conversation': [\n#                 {\n#                     'role': 'system',\n#                     'content': 'You are a helpful Al assistant. The user will engage in a multi\u2212round conversation with you,asking initial questions and following up with additional related questions. Your goal is to provide thorough, relevant and\n# insightful responses to help the user with their queries.'\n#                 },\n#                 {'role': 'user', 'content': \"I am thinking of buying a new laptop or computer. What are some important factors I should consider when making your decision? I'll make sure to let you know if any other favorites or needs come up!\"},\n#                 {\n#                     'role': 'assistant',\n#                     'content': 'Exciting times ahead! When considering a new laptop or computer, there are several key factors to think about to ensure you find the right one for your needs. Here are some crucial ones to get you started: 1.\n# **Purpose**: How will you use your laptop or computer? For work, gaming, video editing,'\n#                 },\n#                 {\n#                     'role': 'user',\n#                     'content': 'Let me stop you there. Let's explore this \"purpose\" factor that you mentioned earlier. Can you elaborate more on what type of devices would be suitable for different purposes? 
For example, if I're primarily using my\n# laptop for general usage like browsing, email, and word processing, would a budget-friendly laptop be sufficient'\n#                 },\n#                 {\n#                     'role': 'assistant',\n#                     'content': \"Understanding your purpose can greatly impact the type of device you'll need. **General Usage (Browsing, Email, Word Processing)**: For casual users who mainly use their laptop for daily tasks, a budget-friendly\n# option can be sufficient. Look for laptops with: * Intel Core i3 or i5 processor* \"\n#                 }\n#             ]\n#         }\n#     ],\n#     True\n# )\n
"},{"location":"components-gallery/tasks/magpiegenerator/#generating-with-system-prompts-with-probabilities","title":"Generating with system prompts with probabilities","text":"
from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import MagpieGenerator\n\nmagpie = MagpieGenerator(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n        magpie_pre_query_template=\"llama3\",\n        generation_kwargs={\n            \"temperature\": 0.8,\n            \"max_new_tokens\": 256,\n        },\n    ),\n    n_turns=2,\n    system_prompt={\n        \"math\": (\"You're an expert AI assistant.\", 0.8),\n        \"writing\": (\"You're an expert writing assistant.\", 0.2),\n    },\n)\n\nmagpie.load()\n\nresult = next(magpie.process())\n
"},{"location":"components-gallery/tasks/magpiegenerator/#references","title":"References","text":"
  • Magpie: Alignment Data Synthesis from Scratch by Prompting Aligned LLMs with Nothing
"},{"location":"components-gallery/tasks/chatgeneration/","title":"ChatGeneration","text":"

Generates text based on a conversation.

ChatGeneration is a pre-defined task that defines the messages as the input and generation as the output. This task is used to generate text based on a conversation. The model_name is also returned as part of the output in order to enhance it.

"},{"location":"components-gallery/tasks/chatgeneration/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[messages]\n        end\n        subgraph New columns\n            OCOL0[generation]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph ChatGeneration\n        StepInput[Input Columns: messages]\n        StepOutput[Output Columns: generation, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/chatgeneration/#inputs","title":"Inputs","text":"
  • messages (List[Dict[Literal[\"role\", \"content\"], str]]): The messages to generate the follow up completion from.
"},{"location":"components-gallery/tasks/chatgeneration/#outputs","title":"Outputs","text":"
  • generation (str): The generated text from the assistant.

  • model_name (str): The model name used to generate the text.

"},{"location":"components-gallery/tasks/chatgeneration/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/chatgeneration/#generate-text-from-a-conversation-in-openai-chat-format","title":"Generate text from a conversation in OpenAI chat format","text":"
from distilabel.steps.tasks import ChatGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nchat = ChatGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    )\n)\n\nchat.load()\n\nresult = next(\n    chat.process(\n        [\n            {\n                \"messages\": [\n                    {\"role\": \"user\", \"content\": \"How much is 2+2?\"},\n                ]\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'messages': [{'role': 'user', 'content': 'How much is 2+2?'}],\n#         'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n#         'generation': '4',\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/argillalabeller/","title":"ArgillaLabeller","text":"

Annotate Argilla records based on input fields, example records and question settings.

This task is designed to facilitate the annotation of Argilla records by leveraging a pre-trained LLM. It uses a system prompt that guides the LLM to understand the input fields, the question type, and the question settings. The task then formats the input data and generates a response based on the question. The response is validated against the question's value model, and the final suggestion is prepared for annotation.

"},{"location":"components-gallery/tasks/argillalabeller/#attributes","title":"Attributes","text":"
  • _template: a Jinja2 template used to format the input for the LLM.
"},{"location":"components-gallery/tasks/argillalabeller/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[record]\n            ICOL1[fields]\n            ICOL2[question]\n            ICOL3[example_records]\n            ICOL4[guidelines]\n        end\n        subgraph New columns\n            OCOL0[suggestion]\n        end\n    end\n\n    subgraph ArgillaLabeller\n        StepInput[Input Columns: record, fields, question, example_records, guidelines]\n        StepOutput[Output Columns: suggestion]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    ICOL3 --> StepInput\n    ICOL4 --> StepInput\n    StepOutput --> OCOL0\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/argillalabeller/#inputs","title":"Inputs","text":"
  • record (argilla.Record): The record to be annotated.

  • fields (Optional[List[Dict[str, Any]]]): The list of field settings for the input fields.

  • question (Optional[Dict[str, Any]]): The question settings for the question to be answered.

  • example_records (Optional[List[Dict[str, Any]]]): The few shot example records with responses to be used to answer the question.

  • guidelines (Optional[str]): The guidelines for the annotation task.

"},{"location":"components-gallery/tasks/argillalabeller/#outputs","title":"Outputs","text":"
  • suggestion (Dict[str, Any]): The final suggestion for annotation.
"},{"location":"components-gallery/tasks/argillalabeller/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/argillalabeller/#annotate-a-record-with-the-same-dataset-and-question","title":"Annotate a record with the same dataset and question","text":"
import argilla as rg\nfrom argilla import Suggestion\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Get information from Argilla dataset definition\ndataset = rg.Dataset(\"my_dataset\")\npending_records_filter = rg.Filter((\"status\", \"==\", \"pending\"))\ncompleted_records_filter = rg.Filter((\"status\", \"==\", \"completed\"))\npending_records = list(\n    dataset.records(\n        query=rg.Query(filter=pending_records_filter),\n        limit=5,\n    )\n)\nexample_records = list(\n    dataset.records(\n        query=rg.Query(filter=completed_records_filter),\n        limit=5,\n    )\n)\nfield = dataset.settings.fields[\"text\"]\nquestion = dataset.settings.questions[\"label\"]\n\n# Initialize the labeller with the model and fields\nlabeller = ArgillaLabeller(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    fields=[field],\n    question=question,\n    example_records=example_records,\n    guidelines=dataset.guidelines\n)\nlabeller.load()\n\n# Process the pending records\nresult = next(\n    labeller.process(\n        [\n            {\n                \"record\": record\n            } for record in pending_records\n        ]\n    )\n)\n\n# Add the suggestions to the records\nfor record, suggestion in zip(pending_records, result):\n    record.suggestions.add(Suggestion(**suggestion[\"suggestion\"]))\n\n# Log the updated records\ndataset.records.log(pending_records)\n
"},{"location":"components-gallery/tasks/argillalabeller/#annotate-a-record-with-alternating-datasets-and-questions","title":"Annotate a record with alternating datasets and questions","text":"
import argilla as rg\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Get information from Argilla dataset definition\ndataset = rg.Dataset(\"my_dataset\")\nfield = dataset.settings.fields[\"text\"]\nquestion = dataset.settings.questions[\"label\"]\nquestion2 = dataset.settings.questions[\"label2\"]\n\n# Initialize the labeller with the model and fields\nlabeller = ArgillaLabeller(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    )\n)\nlabeller.load()\n\n# Process the record\nrecord = next(dataset.records())\nresult = next(\n    labeller.process(\n        [\n            {\n                \"record\": record,\n                \"fields\": [field],\n                \"question\": question,\n            },\n            {\n                \"record\": record,\n                \"fields\": [field],\n                \"question\": question2,\n            }\n        ]\n    )\n)\n\n# Add the suggestions to the record\nfor suggestion in result:\n    record.suggestions.add(rg.Suggestion(**suggestion[\"suggestion\"]))\n\n# Log the updated record\ndataset.records.log([record])\n
"},{"location":"components-gallery/tasks/argillalabeller/#overwrite-default-prompts-and-instructions","title":"Overwrite default prompts and instructions","text":"
import argilla as rg\nfrom distilabel.steps.tasks import ArgillaLabeller\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Overwrite default prompts and instructions\nlabeller = ArgillaLabeller(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    system_prompt=\"You are an expert annotator and labelling assistant that understands complex domains and natural language processing.\",\n    question_to_label_instruction={\n        \"label_selection\": \"Select the appropriate label from the list of provided labels.\",\n        \"multi_label_selection\": \"Select none, one or multiple labels from the list of provided labels.\",\n        \"text\": \"Provide a text response to the question.\",\n        \"rating\": \"Provide a rating for the question.\",\n    },\n)\nlabeller.load()\n
"},{"location":"components-gallery/tasks/argillalabeller/#references","title":"References","text":"
  • Argilla: Argilla is a collaboration tool for AI engineers and domain experts to build high-quality datasets
"},{"location":"components-gallery/tasks/textclassification/","title":"TextClassification","text":"

Classifies text into one or more categories or labels.

This task can be used for text classification problems, where the goal is to assign one or multiple labels to a given text. By default it uses structured generation as per the reference paper, which can help to generate more concise labels. See section 4.1 in the reference.

"},{"location":"components-gallery/tasks/textclassification/#attributes","title":"Attributes","text":"
  • system_prompt: A prompt to display to the user before the task starts. Contains a default message to make the model behave like a classifier specialist.

  • n: Number of labels to generate. If only 1 is required, it corresponds to a single-label classification problem; if >1, it will try to return the \"n\" labels most representative of the text. Defaults to 1.

  • context: Context to use when generating the labels. By default contains a generic message, but can be used to customize the context for the task.

  • examples: List of examples to help the model understand the task, few shots.

  • available_labels: List of available labels to choose from when classifying the text, or a dictionary with the labels and their descriptions.

  • default_label: Default label to use when the text is ambiguous or lacks sufficient information for classification. Can be a list in case of multiple labels (n>1).

"},{"location":"components-gallery/tasks/textclassification/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[text]\n        end\n        subgraph New columns\n            OCOL0[labels]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph TextClassification\n        StepInput[Input Columns: text]\n        StepOutput[Output Columns: labels, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/textclassification/#inputs","title":"Inputs","text":"
  • text (str): The reference text we want to obtain labels for.
"},{"location":"components-gallery/tasks/textclassification/#outputs","title":"Outputs","text":"
  • labels (Union[str, List[str]]): The label or list of labels for the text.

  • model_name (str): The name of the model used to generate the label/s.

"},{"location":"components-gallery/tasks/textclassification/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/textclassification/#assigning-a-sentiment-to-a-text","title":"Assigning a sentiment to a text","text":"
from distilabel.steps.tasks import TextClassification\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n)\n\ntext_classification = TextClassification(\n    llm=llm,\n    context=\"You are an AI system specialized in assigning sentiment to movies.\",\n    available_labels=[\"positive\", \"negative\"],\n)\n\ntext_classification.load()\n\nresult = next(\n    text_classification.process(\n        [{\"text\": \"This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\"}]\n    )\n)\n# result\n# [{'text': 'This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.',\n# 'labels': 'positive',\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n    \"labels\": \"positive\"\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n#     'content': 'You are an AI system specialized in generating labels to classify pieces of text. Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n#     {'role': 'user',\n#     'content': '# Instruction\\nPlease classify the user query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nYou are an AI system specialized in assigning sentiment to movie the user queries.\\n## Labeling the user input\\nUse the available labels to classify the user query. 
Analyze the context of each label specifically:\\navailable_labels = [\\n    \"positive\",  # The text shows positive sentiment\\n    \"negative\",  # The text shows negative sentiment\\n]\\n\\n\\n## User Query\\n```\\nThis was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n    \"labels\": \"label\"\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/textclassification/#assigning-predefined-labels-with-specified-descriptions","title":"Assigning predefined labels with specified descriptions","text":"
from distilabel.steps.tasks import TextClassification\n\ntext_classification = TextClassification(\n    llm=llm,\n    n=1,\n    context=\"Determine the intent of the text.\",\n    available_labels={\n        \"complaint\": \"A statement expressing dissatisfaction or annoyance about a product, service, or experience. It's a negative expression of discontent, often with the intention of seeking a resolution or compensation.\",\n        \"inquiry\": \"A question or request for information about a product, service, or situation. It's a neutral or curious expression seeking clarification or details.\",\n        \"feedback\": \"A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\",\n        \"praise\": \"A statement expressing admiration, approval, or appreciation for a product, service, or experience. It's a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\"\n    },\n    query_title=\"Customer Query\",\n)\n\ntext_classification.load()\n\nresult = next(\n    text_classification.process(\n        [{\"text\": \"Can you tell me more about your return policy?\"}]\n    )\n)\n# result\n# [{'text': 'Can you tell me more about your return policy?',\n# 'labels': 'inquiry',\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n    \"labels\": \"inquiry\"\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n#     'content': 'You are an AI system specialized in generating labels to classify pieces of text. 
Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n#     {'role': 'user',\n#     'content': '# Instruction\\nPlease classify the customer query by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide the label that best describes the text.\\nDetermine the intent of the text.\\n## Labeling the user input\\nUse the available labels to classify the user query. Analyze the context of each label specifically:\\navailable_labels = [\\n    \"complaint\",  # A statement expressing dissatisfaction or annoyance about a product, service, or experience. It\\'s a negative expression of discontent, often with the intention of seeking a resolution or compensation.\\n    \"inquiry\",  # A question or request for information about a product, service, or situation. It\\'s a neutral or curious expression seeking clarification or details.\\n    \"feedback\",  # A statement providing evaluation, opinion, or suggestion about a product, service, or experience. It can be positive, negative, or neutral, and is often intended to help improve or inform.\\n    \"praise\",  # A statement expressing admiration, approval, or appreciation for a product, service, or experience. It\\'s a positive expression of satisfaction or delight, often with the intention of encouraging or recommending.\\n]\\n\\n\\n## Customer Query\\n```\\nCan you tell me more about your return policy?\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n    \"labels\": \"label\"\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/textclassification/#free-multi-label-classification-without-predefined-labels","title":"Free multi label classification without predefined labels","text":"
from distilabel.steps.tasks import TextClassification\n\ntext_classification = TextClassification(\n    llm=llm,\n    n=3,\n    context=(\n        \"Describe the main themes, topics, or categories that could describe the \"\n        \"following type of persona.\"\n    ),\n    query_title=\"Example of Persona\",\n)\n\ntext_classification.load()\n\nresult = next(\n    text_classification.process(\n        [{\"text\": \"A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\"}]\n    )\n)\n# result\n# [{'text': 'A historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.',\n# 'labels': ['Historical Researcher',\n# 'Cultural Specialist',\n# 'Ethnic Studies Expert'],\n# 'distilabel_metadata': {'raw_output_text_classification_0': '{\\n    \"labels\": [\"Historical Researcher\", \"Cultural Specialist\", \"Ethnic Studies Expert\"]\\n}',\n# 'raw_input_text_classification_0': [{'role': 'system',\n#     'content': 'You are an AI system specialized in generating labels to classify pieces of text. 
Your sole purpose is to analyze the given text and provide appropriate classification labels.'},\n#     {'role': 'user',\n#     'content': '# Instruction\\nPlease classify the example of persona by assigning the most appropriate labels.\\nDo not explain your reasoning or provide any additional commentary.\\nIf the text is ambiguous or lacks sufficient information for classification, respond with \"Unclassified\".\\nProvide a list of 3 labels that best describe the text.\\nDescribe the main themes, topics, or categories that could describe the following type of persona.\\nUse clear, widely understood terms for labels.Avoid overly specific or obscure labels unless the text demands it.\\n\\n\\n## Example of Persona\\n```\\nA historian or curator of Mexican-American history and culture focused on the cultural, social, and historical impact of the Mexican presence in the United States.\\n```\\n\\n## Output Format\\nNow, please give me the labels in JSON format, do not include any other text in your response:\\n```\\n{\\n    \"labels\": [\"label_0\", \"label_1\", \"label_2\"]\\n}\\n```'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/textclassification/#references","title":"References","text":"
  • Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models
"},{"location":"components-gallery/tasks/evolinstruct/","title":"EvolInstruct","text":"

Evolve instructions using an LLM.

WizardLM: Empowering Large Language Models to Follow Complex Instructions

"},{"location":"components-gallery/tasks/evolinstruct/#attributes","title":"Attributes","text":"
  • num_evolutions: The number of evolutions to be performed.

  • store_evolutions: Whether to store all the evolutions or just the last one. Defaults to False.

  • generate_answers: Whether to generate answers for the evolved instructions. Defaults to False.

  • include_original_instruction: Whether to include the original instruction in the evolved_instructions output column. Defaults to False.

  • mutation_templates: The mutation templates to be used for evolving the instructions. Defaults to the ones provided in the utils.py file.

  • seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

"},{"location":"components-gallery/tasks/evolinstruct/#runtime-parameters","title":"Runtime Parameters","text":"
  • seed: The seed to be set for numpy in order to randomly pick a mutation method.
"},{"location":"components-gallery/tasks/evolinstruct/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n        end\n        subgraph New columns\n            OCOL0[evolved_instruction]\n            OCOL1[evolved_instructions]\n            OCOL2[model_name]\n            OCOL3[answer]\n            OCOL4[answers]\n        end\n    end\n\n    subgraph EvolInstruct\n        StepInput[Input Columns: instruction]\n        StepOutput[Output Columns: evolved_instruction, evolved_instructions, model_name, answer, answers]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n    StepOutput --> OCOL4\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/evolinstruct/#inputs","title":"Inputs","text":"
  • instruction (str): The instruction to evolve.
"},{"location":"components-gallery/tasks/evolinstruct/#outputs","title":"Outputs","text":"
  • evolved_instruction (str): The evolved instruction if store_evolutions=False.

  • evolved_instructions (List[str]): The evolved instructions if store_evolutions=True.

  • model_name (str): The name of the LLM used to evolve the instructions.

  • answer (str): The answer to the evolved instruction if generate_answers=True and store_evolutions=False.

  • answers (List[str]): The answers to the evolved instructions if generate_answers=True and store_evolutions=True.

"},{"location":"components-gallery/tasks/evolinstruct/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolinstruct/#evolve-an-instruction-using-an-llm","title":"Evolve an instruction using an LLM","text":"
from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n
"},{"location":"components-gallery/tasks/evolinstruct/#keep-the-iterations-of-the-evolutions","title":"Keep the iterations of the evolutions","text":"
from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n    store_evolutions=True,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [\n#     {\n#         'instruction': 'common instruction',\n#         'evolved_instructions': ['initial evolution', 'final evolution'],\n#         'model_name': 'model_name'\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/evolinstruct/#generate-answers-for-the-instructions-in-a-single-step","title":"Generate answers for the instructions in a single step","text":"
from distilabel.steps.tasks import EvolInstruct\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct = EvolInstruct(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n    generate_answers=True,\n)\n\nevol_instruct.load()\n\nresult = next(evol_instruct.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [\n#     {\n#         'instruction': 'common instruction',\n#         'evolved_instruction': 'evolved instruction',\n#         'answer': 'answer to the instruction',\n#         'model_name': 'model_name'\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/evolinstruct/#references","title":"References","text":"
  • WizardLM: Empowering Large Language Models to Follow Complex Instructions

  • GitHub: h2oai/h2o-wizardlm

"},{"location":"components-gallery/tasks/evolcomplexity/","title":"EvolComplexity","text":"

Evolve instructions to make them more complex using an LLM.

EvolComplexity is a task that evolves instructions to make them more complex, and it is based on the EvolInstruct task, using slightly different prompts, but the exact same evolutionary approach.

"},{"location":"components-gallery/tasks/evolcomplexity/#attributes","title":"Attributes","text":"
  • num_instructions: The number of instructions to be generated.

  • generate_answers: Whether to generate answers for the instructions or not. Defaults to False.

  • mutation_templates: The mutation templates to be used for the generation of the instructions.

  • min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512.

  • max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024.

  • seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

"},{"location":"components-gallery/tasks/evolcomplexity/#runtime-parameters","title":"Runtime Parameters","text":"
  • min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.

  • max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.

  • seed: The seed to be set for numpy in order to randomly pick a mutation method.

"},{"location":"components-gallery/tasks/evolcomplexity/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n        end\n        subgraph New columns\n            OCOL0[evolved_instruction]\n            OCOL1[answer]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph EvolComplexity\n        StepInput[Input Columns: instruction]\n        StepOutput[Output Columns: evolved_instruction, answer, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/evolcomplexity/#inputs","title":"Inputs","text":"
  • instruction (str): The instruction to evolve.
"},{"location":"components-gallery/tasks/evolcomplexity/#outputs","title":"Outputs","text":"
  • evolved_instruction (str): The evolved instruction.

  • answer (str, optional): The answer to the instruction if generate_answers=True.

  • model_name (str): The name of the LLM used to evolve the instructions.

"},{"location":"components-gallery/tasks/evolcomplexity/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolcomplexity/#evolve-an-instruction-using-an-llm","title":"Evolve an instruction using an LLM","text":"
from distilabel.steps.tasks import EvolComplexity\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_complexity = EvolComplexity(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n)\n\nevol_complexity.load()\n\nresult = next(evol_complexity.process([{\"instruction\": \"common instruction\"}]))\n# result\n# [{'instruction': 'common instruction', 'evolved_instruction': 'evolved instruction', 'model_name': 'model_name'}]\n
"},{"location":"components-gallery/tasks/evolcomplexity/#references","title":"References","text":"
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning

  • WizardLM: Empowering Large Language Models to Follow Complex Instructions

"},{"location":"components-gallery/tasks/evolquality/","title":"EvolQuality","text":"

Evolve the quality of the responses using an LLM.

EvolQuality task is used to evolve the quality of the responses given a prompt, by generating a new response with a language model. This step implements the evolution quality task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.

"},{"location":"components-gallery/tasks/evolquality/#attributes","title":"Attributes","text":"
  • num_evolutions: The number of evolutions to be performed on the responses.

  • store_evolutions: Whether to store all the evolved responses or just the last one. Defaults to False.

  • include_original_response: Whether to include the original response within the evolved responses. Defaults to False.

  • mutation_templates: The mutation templates to be used to evolve the responses.

  • seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

"},{"location":"components-gallery/tasks/evolquality/#runtime-parameters","title":"Runtime Parameters","text":"
  • seed: The seed to be set for numpy in order to randomly pick a mutation method.
"},{"location":"components-gallery/tasks/evolquality/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[response]\n        end\n        subgraph New columns\n            OCOL0[evolved_response]\n            OCOL1[evolved_responses]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph EvolQuality\n        StepInput[Input Columns: instruction, response]\n        StepOutput[Output Columns: evolved_response, evolved_responses, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/evolquality/#inputs","title":"Inputs","text":"
  • instruction (str): The instruction that was used to generate the responses.

  • response (str): The response to be rewritten.

"},{"location":"components-gallery/tasks/evolquality/#outputs","title":"Outputs","text":"
  • evolved_response (str): The evolved response if store_evolutions=False.

  • evolved_responses (List[str]): The evolved responses if store_evolutions=True.

  • model_name (str): The name of the LLM used to evolve the responses.

"},{"location":"components-gallery/tasks/evolquality/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolquality/#evolve-the-quality-of-the-responses-given-a-prompt","title":"Evolve the quality of the responses given a prompt","text":"
from distilabel.steps.tasks import EvolQuality\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_quality = EvolQuality(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_evolutions=2,\n)\n\nevol_quality.load()\n\nresult = next(\n    evol_quality.process(\n        [\n            {\"instruction\": \"common instruction\", \"response\": \"a response\"},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'common instruction',\n#         'response': 'a response',\n#         'evolved_response': 'evolved response',\n#         'model_name': '\"mistralai/Mistral-7B-Instruct-v0.2\"'\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/evolquality/#references","title":"References","text":"
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
"},{"location":"components-gallery/tasks/evolinstructgenerator/","title":"EvolInstructGenerator","text":"

Generate evolved instructions using an LLM.

WizardLM: Empowering Large Language Models to Follow Complex Instructions

"},{"location":"components-gallery/tasks/evolinstructgenerator/#attributes","title":"Attributes","text":"
  • num_instructions: The number of instructions to be generated.

  • generate_answers: Whether to generate answers for the instructions or not. Defaults to False.

  • mutation_templates: The mutation templates to be used for the generation of the instructions.

  • min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512.

  • max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024.

  • seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

"},{"location":"components-gallery/tasks/evolinstructgenerator/#runtime-parameters","title":"Runtime Parameters","text":"
  • min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.

  • max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.

  • seed: The seed to be set for numpy in order to randomly pick a mutation method.

"},{"location":"components-gallery/tasks/evolinstructgenerator/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[instruction]\n            OCOL1[answer]\n            OCOL2[instructions]\n            OCOL3[model_name]\n        end\n    end\n\n    subgraph EvolInstructGenerator\n        StepOutput[Output Columns: instruction, answer, instructions, model_name]\n    end\n\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n
"},{"location":"components-gallery/tasks/evolinstructgenerator/#outputs","title":"Outputs","text":"
  • instruction (str): The generated instruction if generate_answers=False.

  • answer (str): The generated answer if generate_answers=True.

  • instructions (List[str]): The generated instructions if generate_answers=True.

  • model_name (str): The name of the LLM used to generate and evolve the instructions.

"},{"location":"components-gallery/tasks/evolinstructgenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolinstructgenerator/#generate-evolved-instructions-without-initial-instructions","title":"Generate evolved instructions without initial instructions","text":"
from distilabel.steps.tasks import EvolInstructGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_instruct_generator = EvolInstructGenerator(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_instructions=2,\n)\n\nevol_instruct_generator.load()\n\nresult = next(evol_instruct_generator.process())\n# result\n# [{'instruction': 'generated instruction', 'model_name': 'test'}]\n
"},{"location":"components-gallery/tasks/evolinstructgenerator/#references","title":"References","text":"
  • WizardLM: Empowering Large Language Models to Follow Complex Instructions

  • GitHub: h2oai/h2o-wizardlm

"},{"location":"components-gallery/tasks/evolcomplexitygenerator/","title":"EvolComplexityGenerator","text":"

Generate evolved instructions with increased complexity using an LLM.

EvolComplexityGenerator is a generation task that evolves instructions to make them more complex, and it is based on the EvolInstruct task, using slightly different prompts, but the exact same evolutionary approach.

"},{"location":"components-gallery/tasks/evolcomplexitygenerator/#attributes","title":"Attributes","text":"
  • num_instructions: The number of instructions to be generated.

  • generate_answers: Whether to generate answers for the instructions or not. Defaults to False.

  • mutation_templates: The mutation templates to be used for the generation of the instructions.

  • min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid. Defaults to 512.

  • max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid. Defaults to 1024.

  • seed: The seed to be set for numpy in order to randomly pick a mutation method. Defaults to 42.

"},{"location":"components-gallery/tasks/evolcomplexitygenerator/#runtime-parameters","title":"Runtime Parameters","text":"
  • min_length: Defines the length (in bytes) that the generated instruction needs to be higher than, to be considered valid.

  • max_length: Defines the length (in bytes) that the generated instruction needs to be lower than, to be considered valid.

  • seed: The seed to be set for numpy in order to randomly pick a mutation method.

"},{"location":"components-gallery/tasks/evolcomplexitygenerator/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[instruction]\n            OCOL1[answer]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph EvolComplexityGenerator\n        StepOutput[Output Columns: instruction, answer, model_name]\n    end\n\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n
"},{"location":"components-gallery/tasks/evolcomplexitygenerator/#outputs","title":"Outputs","text":"
  • instruction (str): The evolved instruction.

  • answer (str, optional): The answer to the instruction if generate_answers=True.

  • model_name (str): The name of the LLM used to evolve the instructions.

"},{"location":"components-gallery/tasks/evolcomplexitygenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/evolcomplexitygenerator/#generate-evolved-instructions-without-initial-instructions","title":"Generate evolved instructions without initial instructions","text":"
from distilabel.steps.tasks import EvolComplexityGenerator\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nevol_complexity_generator = EvolComplexityGenerator(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    num_instructions=2,\n)\n\nevol_complexity_generator.load()\n\nresult = next(evol_complexity_generator.process())\n# result\n# [{'instruction': 'generated instruction', 'model_name': 'test'}]\n
"},{"location":"components-gallery/tasks/evolcomplexitygenerator/#references","title":"References","text":"
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning

  • WizardLM: Empowering Large Language Models to Follow Complex Instructions

"},{"location":"components-gallery/tasks/instructionbacktranslation/","title":"InstructionBacktranslation","text":"

Self-Alignment with Instruction Backtranslation.

"},{"location":"components-gallery/tasks/instructionbacktranslation/#attributes","title":"Attributes","text":"
  • _template: the Jinja2 template to use for the Instruction Backtranslation task.
"},{"location":"components-gallery/tasks/instructionbacktranslation/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[generation]\n        end\n        subgraph New columns\n            OCOL0[score]\n            OCOL1[reason]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph InstructionBacktranslation\n        StepInput[Input Columns: instruction, generation]\n        StepOutput[Output Columns: score, reason, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/instructionbacktranslation/#inputs","title":"Inputs","text":"
  • instruction (str): The reference instruction to evaluate the text output.

  • generation (str): The text output to evaluate for the given instruction.

"},{"location":"components-gallery/tasks/instructionbacktranslation/#outputs","title":"Outputs","text":"
  • score (str): The score for the generation based on the given instruction.

  • reason (str): The reason for the provided score.

  • model_name (str): The model name used to score the generation.

"},{"location":"components-gallery/tasks/instructionbacktranslation/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/instructionbacktranslation/#generate-a-score-and-reason-for-a-given-instruction-and-generation","title":"Generate a score and reason for a given instruction and generation","text":"
from distilabel.steps.tasks import InstructionBacktranslation\n\ninstruction_backtranslation = InstructionBacktranslation(\n        name=\"instruction_backtranslation\",\n        llm=llm,\n        input_batch_size=10,\n        output_mappings={\"model_name\": \"scoring_model\"},\n    )\ninstruction_backtranslation.load()\n\nresult = next(\n    instruction_backtranslation.process(\n        [\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"generation\": \"4\",\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         \"instruction\": \"How much is 2+2?\",\n#         \"generation\": \"4\",\n#         \"score\": 3,\n#         \"reason\": \"Reason for the generation.\",\n#         \"model_name\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/instructionbacktranslation/#references","title":"References","text":"
  • Self-Alignment with Instruction Backtranslation
"},{"location":"components-gallery/tasks/prometheuseval/","title":"PrometheusEval","text":"

Critique and rank the quality of generations from an LLM using Prometheus 2.0.

PrometheusEval is a task created for Prometheus 2.0, covering both the absolute and relative evaluations. The absolute evaluation i.e. mode=\"absolute\" is used to evaluate a single generation from an LLM for a given instruction. The relative evaluation i.e. mode=\"relative\" is used to evaluate two generations from an LLM for a given instruction. Both evaluations can be run with or without a reference answer to compare against, as controlled by the reference attribute, and both are based on a score rubric that critiques the generation/s based on the following default aspects: helpfulness, harmlessness, honesty, factual-validity, and reasoning, that can be overridden via rubrics, and the selected rubric is set via the attribute rubric.

"},{"location":"components-gallery/tasks/prometheuseval/#note","title":"Note","text":"

The PrometheusEval task is better suited and intended to be used with any of the Prometheus 2.0 models released by Kaist AI, being: https://huggingface.co/prometheus-eval/prometheus-7b-v2.0, and https://huggingface.co/prometheus-eval/prometheus-8x7b-v2.0. The critique assessment formatting and quality is not guaranteed if using another model, even though some other models may be able to correctly follow the formatting and generate insightful critiques too.

"},{"location":"components-gallery/tasks/prometheuseval/#attributes","title":"Attributes","text":"
  • mode: the evaluation mode to use, either absolute or relative. It defines whether the task will evaluate one or two generations.

  • rubric: the score rubric to use within the prompt to run the critique based on different aspects. Can be any existing key in the rubrics attribute, which by default means that it can be: helpfulness, harmlessness, honesty, factual-validity, or reasoning. Those will only work if using the default rubrics, otherwise, the provided rubrics should be used.

  • rubrics: a dictionary containing the different rubrics to use for the critique, where the keys are the rubric names and the values are the rubric descriptions. The default rubrics are the following: helpfulness, harmlessness, honesty, factual-validity, and reasoning.

  • reference: a boolean flag to indicate whether a reference answer / completion will be provided, so that the model critique is based on the comparison with it. It implies that the column reference needs to be provided within the input data in addition to the rest of the inputs.

  • _template: a Jinja2 template used to format the input for the LLM.

"},{"location":"components-gallery/tasks/prometheuseval/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[generation]\n            ICOL2[generations]\n            ICOL3[reference]\n        end\n        subgraph New columns\n            OCOL0[feedback]\n            OCOL1[result]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph PrometheusEval\n        StepInput[Input Columns: instruction, generation, generations, reference]\n        StepOutput[Output Columns: feedback, result, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    ICOL3 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/prometheuseval/#inputs","title":"Inputs","text":"
  • instruction (str): The instruction to use as reference.

  • generation (str, optional): The generated text from the given instruction. This column is required if mode=absolute.

  • generations (List[str], optional): The generated texts from the given instruction. It should contain 2 generations only. This column is required if mode=relative.

  • reference (str, optional): The reference / golden answer for the instruction, to be used by the LLM for comparison against.

"},{"location":"components-gallery/tasks/prometheuseval/#outputs","title":"Outputs","text":"
  • feedback (str): The feedback explaining the result below, as critiqued by the LLM using the pre-defined score rubric, compared against reference if provided.

  • result (Union[int, Literal[\"A\", \"B\"]]): If mode=absolute, then the result contains the score for the generation in a likert-scale from 1-5, otherwise, if mode=relative, then the result contains either \"A\" or \"B\", the \"winning\" one being the generation in the index 0 of generations if result='A' or the index 1 if result='B'.

  • model_name (str): The model name used to generate the feedback and result.

"},{"location":"components-gallery/tasks/prometheuseval/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/prometheuseval/#critique-and-evaluate-llm-generation-quality-using-prometheus-2_0","title":"Critique and evaluate LLM generation quality using Prometheus 2_0","text":"
from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n    llm=vLLM(\n        model=\"prometheus-eval/prometheus-7b-v2.0\",\n        chat_template=\"[INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST]\",\n    ),\n    mode=\"absolute\",\n    rubric=\"factual-validity\"\n)\n\nprometheus.load()\n\nresult = next(\n    prometheus.process(\n        [\n            {\"instruction\": \"make something\", \"generation\": \"something done\"},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'make something',\n#         'generation': 'something done',\n#         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n#         'feedback': 'the feedback',\n#         'result': 5,\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/prometheuseval/#critique-for-relative-evaluation","title":"Critique for relative evaluation","text":"
from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n    llm=vLLM(\n        model=\"prometheus-eval/prometheus-7b-v2.0\",\n        chat_template=\"[INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST]\",\n    ),\n    mode=\"relative\",\n    rubric=\"honesty\"\n)\n\nprometheus.load()\n\nresult = next(\n    prometheus.process(\n        [\n            {\"instruction\": \"make something\", \"generations\": [\"something done\", \"other thing\"]},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'make something',\n#         'generations': ['something done', 'other thing'],\n#         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n#         'feedback': 'the feedback',\n#         'result': 'A',\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/prometheuseval/#critique-with-a-custom-rubric","title":"Critique with a custom rubric","text":"
from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n    llm=vLLM(\n        model=\"prometheus-eval/prometheus-7b-v2.0\",\n        chat_template=\"[INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST]\",\n    ),\n    mode=\"absolute\",\n    rubric=\"custom\",\n    rubrics={\n        \"custom\": \"[A]\\nScore 1: A\\nScore 2: B\\nScore 3: C\\nScore 4: D\\nScore 5: E\"\n    }\n)\n\nprometheus.load()\n\nresult = next(\n    prometheus.process(\n        [\n            {\"instruction\": \"make something\", \"generation\": \"something done\"},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'make something',\n#         'generation': 'something done',\n#         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n#         'feedback': 'the feedback',\n#         'result': 5,\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/prometheuseval/#critique-using-a-reference-answer","title":"Critique using a reference answer","text":"
from distilabel.steps.tasks import PrometheusEval\nfrom distilabel.models import vLLM\n\n# Consider this as a placeholder for your actual LLM.\nprometheus = PrometheusEval(\n    llm=vLLM(\n        model=\"prometheus-eval/prometheus-7b-v2.0\",\n        chat_template=\"[INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST]\",\n    ),\n    mode=\"absolute\",\n    rubric=\"helpfulness\",\n    reference=True,\n)\n\nprometheus.load()\n\nresult = next(\n    prometheus.process(\n        [\n            {\n                \"instruction\": \"make something\",\n                \"generation\": \"something done\",\n                \"reference\": \"this is a reference answer\",\n            },\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'make something',\n#         'generation': 'something done',\n#         'reference': 'this is a reference answer',\n#         'model_name': 'prometheus-eval/prometheus-7b-v2.0',\n#         'feedback': 'the feedback',\n#         'result': 5,\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/prometheuseval/#references","title":"References","text":"
  • Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models

  • prometheus-eval: Evaluate your LLM's response with Prometheus \ud83d\udcaf

"},{"location":"components-gallery/tasks/complexityscorer/","title":"ComplexityScorer","text":"

Score instructions based on their complexity using an LLM.

ComplexityScorer is a pre-defined task used to rank a list of instructions based on their complexity. It's an implementation of the complexity score task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.

"},{"location":"components-gallery/tasks/complexityscorer/#attributes","title":"Attributes","text":"
  • _template: a Jinja2 template used to format the input for the LLM.
"},{"location":"components-gallery/tasks/complexityscorer/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instructions]\n        end\n        subgraph New columns\n            OCOL0[scores]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph ComplexityScorer\n        StepInput[Input Columns: instructions]\n        StepOutput[Output Columns: scores, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/complexityscorer/#inputs","title":"Inputs","text":"
  • instructions (List[str]): The list of instructions to be scored.
"},{"location":"components-gallery/tasks/complexityscorer/#outputs","title":"Outputs","text":"
  • scores (List[float]): The score for each instruction.

  • model_name (str): The model name used to generate the scores.

"},{"location":"components-gallery/tasks/complexityscorer/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/complexityscorer/#evaluate-the-complexity-of-your-instructions","title":"Evaluate the complexity of your instructions","text":"
from distilabel.steps.tasks import ComplexityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = ComplexityScorer(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    )\n)\n\nscorer.load()\n\nresult = next(\n    scorer.process(\n        [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n    )\n)\n# result\n# [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 5], 'distilabel_metadata': {'raw_output_complexity_scorer_0': 'output'}}]\n
"},{"location":"components-gallery/tasks/complexityscorer/#generate-structured-output-with-default-schema","title":"Generate structured output with default schema","text":"
from distilabel.steps.tasks import ComplexityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = ComplexityScorer(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    use_default_structured_output=True\n)\n\nscorer.load()\n\nresult = next(\n    scorer.process(\n        [{\"instructions\": [\"plain instruction\", \"highly complex instruction\"]}]\n    )\n)\n# result\n# [{'instructions': ['plain instruction', 'highly complex instruction'], 'model_name': 'test', 'scores': [1, 2], 'distilabel_metadata': {'raw_output_complexity_scorer_0': '{ \\n  \"scores\": [\\n    1, \\n    2\\n  ]\\n}'}}]\n
"},{"location":"components-gallery/tasks/complexityscorer/#references","title":"References","text":"
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
"},{"location":"components-gallery/tasks/qualityscorer/","title":"QualityScorer","text":"

Score responses based on their quality using an LLM.

QualityScorer is a pre-defined task that defines the instruction as the input and score as the output. This task is used to rate the quality of instructions and responses. It's an implementation of the quality score task from the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'. The task follows the same scheme as the Complexity Scorer, but the instruction-response pairs are scored in terms of quality, obtaining a quality score for each instruction-response pair.

"},{"location":"components-gallery/tasks/qualityscorer/#attributes","title":"Attributes","text":"
  • _template: a Jinja2 template used to format the input for the LLM.
"},{"location":"components-gallery/tasks/qualityscorer/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[responses]\n        end\n        subgraph New columns\n            OCOL0[scores]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph QualityScorer\n        StepInput[Input Columns: instruction, responses]\n        StepOutput[Output Columns: scores, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/qualityscorer/#inputs","title":"Inputs","text":"
  • instruction (str): The instruction that was used to generate the responses.

  • responses (List[str]): The responses to be scored. Each response forms a pair with the instruction.

"},{"location":"components-gallery/tasks/qualityscorer/#outputs","title":"Outputs","text":"
  • scores (List[float]): The score for each response.

  • model_name (str): The model name used to generate the scores.

"},{"location":"components-gallery/tasks/qualityscorer/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/qualityscorer/#evaluate-the-quality-of-your-instructions","title":"Evaluate the quality of your instructions","text":"
from distilabel.steps.tasks import QualityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nscorer = QualityScorer(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    )\n)\n\nscorer.load()\n\nresult = next(\n    scorer.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n            }\n        ]\n    )\n)\n# result\n[\n    {\n        'instruction': 'instruction',\n        'model_name': 'test',\n        'scores': [5, 3, 1],\n    }\n]\n
"},{"location":"components-gallery/tasks/qualityscorer/#generate-structured-output-with-default-schema","title":"Generate structured output with default schema","text":"
from distilabel.steps.tasks import QualityScorer\nfrom distilabel.models import InferenceEndpointsLLM\n\nscorer = QualityScorer(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    use_default_structured_output=True\n)\n\nscorer.load()\n\nresult = next(\n    scorer.process(\n        [\n            {\n                \"instruction\": \"instruction\",\n                \"responses\": [\"good response\", \"weird response\", \"bad response\"]\n            }\n        ]\n    )\n)\n\n# result\n[{'instruction': 'instruction',\n'responses': ['good response', 'weird response', 'bad response'],\n'scores': [1, 2, 3],\n'distilabel_metadata': {'raw_output_quality_scorer_0': '{  \"scores\": [1, 2, 3] }'},\n'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/qualityscorer/#references","title":"References","text":"
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
"},{"location":"components-gallery/tasks/clair/","title":"CLAIR","text":"

Contrastive Learning from AI Revisions (CLAIR).

CLAIR uses an AI system to minimally revise a solution A into A\u00b4 (A\u2192A\u00b4), such that the resulting preference pair, in which A\u00b4 is preferred over A, is much more contrastive and precise.

"},{"location":"components-gallery/tasks/clair/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[task]\n            ICOL1[student_solution]\n        end\n        subgraph New columns\n            OCOL0[revision]\n            OCOL1[rational]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph CLAIR\n        StepInput[Input Columns: task, student_solution]\n        StepOutput[Output Columns: revision, rational, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/clair/#inputs","title":"Inputs","text":"
  • task (str): The task or instruction.

  • student_solution (str): An answer to the task that is to be revised.

"},{"location":"components-gallery/tasks/clair/#outputs","title":"Outputs","text":"
  • revision (str): The revised text.

  • rational (str): The rationale for the provided revision.

  • model_name (str): The name of the model used to generate the revision and rationale.

"},{"location":"components-gallery/tasks/clair/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/clair/#create-contrastive-preference-pairs","title":"Create contrastive preference pairs","text":"
from distilabel.steps.tasks import CLAIR\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    generation_kwargs={\n        \"temperature\": 0.7,\n        \"max_new_tokens\": 4096,\n    },\n)\nclair_task = CLAIR(llm=llm)\n\nclair_task.load()\n\nresult = next(\n    clair_task.process(\n        [\n            {\n                \"task\": \"How many gaps are there between the earth and the moon?\",\n                \"student_solution\": 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon's orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.'\n            }\n        ]\n    )\n)\n# result\n# [{'task': 'How many gaps are there between the earth and the moon?',\n# 'student_solution': 'There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. 
The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.',\n# 'revision': 'There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n# 'rational': 'The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. 
The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.',\n# 'distilabel_metadata': {'raw_output_c_l_a_i_r_0': '{teacher_reasoning}: The student\\'s solution provides a clear and concise answer to the question. However, there are a few areas where it can be improved. Firstly, the term \"gaps\" can be misleading in this context. The student should clarify what they mean by \"gaps.\" Secondly, the student provides some additional information about the Moon\\'s orbit, which is correct but could be more clearly connected to the main point. Lastly, the student\\'s conclusion could be more concise.\\n\\n{corrected_student_solution}: There are no physical gaps or empty spaces between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a significant separation or gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range. This variation in distance is a result of the Moon\\'s orbital path, not the presence of any gaps.\\n\\nIn summary, the Moon\\'s orbit is continuous, with no intervening gaps, and its distance from the Earth varies due to the elliptical shape of its orbit.',\n# 'raw_input_c_l_a_i_r_0': [{'role': 'system',\n#     'content': \"You are a teacher and your task is to minimally improve a student's answer. I will give you a {task} and a {student_solution}. 
Your job is to revise the {student_solution} such that it is clearer, more correct, and more engaging. Copy all non-corrected parts of the student's answer. Do not allude to the {corrected_student_solution} being a revision or a correction in your final solution.\"},\n#     {'role': 'user',\n#     'content': '{task}: How many gaps are there between the earth and the moon?\\n\\n{student_solution}: There are no gaps between the Earth and the Moon. The Moon is actually in a close orbit around the Earth, and it is held in place by gravity. The average distance between the Earth and the Moon is about 384,400 kilometers (238,900 miles), and this distance is known as the \"lunar distance\" or \"lunar mean distance.\"\\n\\nThe Moon does not have a gap between it and the Earth because it is a natural satellite that is gravitationally bound to our planet. The Moon\\'s orbit is elliptical, which means that its distance from the Earth varies slightly over the course of a month, but it always remains within a certain range.\\n\\nSo, to summarize, there are no gaps between the Earth and the Moon. The Moon is simply a satellite that orbits the Earth, and its distance from our planet varies slightly due to the elliptical shape of its orbit.\\n\\n-----------------\\n\\nLet\\'s first think step by step with a {teacher_reasoning} to decide how to improve the {student_solution}, then give the {corrected_student_solution}. Mention the {teacher_reasoning} and {corrected_student_solution} identifiers to structure your answer.'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/clair/#references","title":"References","text":"
  • Anchored Preference Optimization and Contrastive Revisions: Addressing Underspecification in Alignment

  • APO and CLAIR - GitHub Repository

"},{"location":"components-gallery/tasks/ultrafeedback/","title":"UltraFeedback","text":"

Rank generations focusing on different aspects using an LLM.

UltraFeedback: Boosting Language Models with High-quality Feedback.

"},{"location":"components-gallery/tasks/ultrafeedback/#attributes","title":"Attributes","text":"
  • aspect: The aspect to perform with the UltraFeedback model. The available aspects are: - helpfulness: Evaluate text outputs based on helpfulness. - honesty: Evaluate text outputs based on honesty. - instruction-following: Evaluate text outputs based on given instructions. - truthfulness: Evaluate text outputs based on truthfulness. Additionally, a custom aspect has been defined by Argilla, so as to evaluate the overall assessment of the text outputs within a single prompt. The custom aspect is: - overall-rating: Evaluate text outputs based on an overall assessment. Defaults to \"overall-rating\".
"},{"location":"components-gallery/tasks/ultrafeedback/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[generations]\n        end\n        subgraph New columns\n            OCOL0[ratings]\n            OCOL1[rationales]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph UltraFeedback\n        StepInput[Input Columns: instruction, generations]\n        StepOutput[Output Columns: ratings, rationales, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/ultrafeedback/#inputs","title":"Inputs","text":"
  • instruction (str): The reference instruction to evaluate the text outputs.

  • generations (List[str]): The text outputs to evaluate for the given instruction.

"},{"location":"components-gallery/tasks/ultrafeedback/#outputs","title":"Outputs","text":"
  • ratings (List[float]): The ratings for each of the provided text outputs.

  • rationales (List[str]): The rationales for each of the provided text outputs.

  • model_name (str): The name of the model used to generate the ratings and rationales.

"},{"location":"components-gallery/tasks/ultrafeedback/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/ultrafeedback/#rate-generations-from-different-llms-based-on-the-selected-aspect","title":"Rate generations from different LLMs based on the selected aspect","text":"
from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n    llm=InferenceEndpointsLLM(\n        model_id=\"mistralai/Mistral-7B-Instruct-v0.2\",\n    ),\n    use_default_structured_output=False\n)\n\nultrafeedback.load()\n\nresult = next(\n    ultrafeedback.process(\n        [\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"generations\": [\"4\", \"and a car\"],\n            }\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'instruction': 'How much is 2+2?',\n#         'generations': ['4', 'and a car'],\n#         'ratings': [1, 2],\n#         'rationales': ['explanation for 4', 'explanation for and a car'],\n#         'model_name': 'mistralai/Mistral-7B-Instruct-v0.2',\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/ultrafeedback/#rate-generations-from-different-llms-based-on-the-honesty-using-the-default-structured-output","title":"Rate generations from different LLMs based on the honesty, using the default structured output","text":"
from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    aspect=\"honesty\"\n)\n\nultrafeedback.load()\n\nresult = next(\n    ultrafeedback.process(\n        [\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"generations\": [\"4\", \"and a car\"],\n            }\n        ]\n    )\n)\n# result\n# [{'instruction': 'How much is 2+2?',\n# 'generations': ['4', 'and a car'],\n# 'ratings': [5, 1],\n# 'rationales': ['The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.',\n# \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"],\n# 'distilabel_metadata': {'raw_output_ultra_feedback_0': '{\"ratings\": [\\n    5,\\n    1\\n] \\n\\n,\"rationales\": [\\n    \"The response is correct and confident, as it directly answers the question without expressing any uncertainty or doubt.\",\\n    \"The response is confidently incorrect, as it provides unrelated information ('a car') and does not address the question. The model shows no uncertainty or indication that it does not know the answer.\"\\n] }'},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/ultrafeedback/#rate-generations-from-different-llms-based-on-the-helpfulness-using-the-default-structured-output","title":"Rate generations from different LLMs based on the helpfulness, using the default structured output","text":"
from distilabel.steps.tasks import UltraFeedback\nfrom distilabel.models import InferenceEndpointsLLM\n\n# Consider this as a placeholder for your actual LLM.\nultrafeedback = UltraFeedback(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        generation_kwargs={\"max_new_tokens\": 512},\n    ),\n    aspect=\"helpfulness\"\n)\n\nultrafeedback.load()\n\nresult = next(\n    ultrafeedback.process(\n        [\n            {\n                \"instruction\": \"How much is 2+2?\",\n                \"generations\": [\"4\", \"and a car\"],\n            }\n        ]\n    )\n)\n# result\n# [{'instruction': 'How much is 2+2?',\n#   'generations': ['4', 'and a car'],\n#   'ratings': [1, 5],\n#   'rationales': ['Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.',\n#    'Text 2 is neither clear nor relevant to the task. It does not provide any useful information and seems unrelated to the question.'],\n#   'rationales_for_rating': ['Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.',\n#    'Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.'],\n#   'types': [1, 3, 1],\n#   'distilabel_metadata': {'raw_output_ultra_feedback_0': '{ \\n  \"ratings\": [\\n    1,\\n    5\\n  ]\\n ,\\n  \"rationales\": [\\n    \"Text 1 is clear and relevant, providing the correct answer to the question. It is also not lengthy and does not contain repetition. However, it lacks comprehensive information or detailed description.\",\\n    \"Text 2 is neither clear nor relevant to the task. 
It does not provide any useful information and seems unrelated to the question.\"\\n  ]\\n ,\\n  \"rationales_for_rating\": [\\n    \"Text 1 is rated as Correct (3) because it provides the accurate answer to the question, but lacks comprehensive information or detailed description.\",\\n    \"Text 2 is rated as Severely Incorrect (1) because it does not provide any relevant information and seems unrelated to the question.\"\\n  ]\\n ,\\n  \"types\": [\\n    1, 3,\\n    1\\n  ]\\n  }'},\n#   'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/ultrafeedback/#references","title":"References","text":"
  • UltraFeedback: Boosting Language Models with High-quality Feedback

  • UltraFeedback - GitHub Repository

"},{"location":"components-gallery/tasks/pairrm/","title":"PairRM","text":"

Rank the candidates based on the input using the LLM model.

"},{"location":"components-gallery/tasks/pairrm/#note","title":"Note","text":"

This step differs from other tasks in that there is currently a single implementation of this model, so a specific LLM is used.

"},{"location":"components-gallery/tasks/pairrm/#attributes","title":"Attributes","text":"
  • model: The model to use for the ranking. Defaults to \"llm-blender/PairRM\".

  • instructions: The instructions to use for the model. Defaults to None.

"},{"location":"components-gallery/tasks/pairrm/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[inputs]\n            ICOL1[candidates]\n        end\n        subgraph New columns\n            OCOL0[ranks]\n            OCOL1[ranked_candidates]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph PairRM\n        StepInput[Input Columns: inputs, candidates]\n        StepOutput[Output Columns: ranks, ranked_candidates, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/pairrm/#inputs","title":"Inputs","text":"
  • inputs (List[Dict[str, Any]]): The input text or conversation to rank the candidates for.

  • candidates (List[Dict[str, Any]]): The candidates to rank.

"},{"location":"components-gallery/tasks/pairrm/#outputs","title":"Outputs","text":"
  • ranks (List[int]): The ranks of the candidates based on the input.

  • ranked_candidates (List[Dict[str, Any]]): The candidates ranked based on the input.

  • model_name (str): The model name used to rank the candidate responses. Defaults to \"llm-blender/PairRM\".

"},{"location":"components-gallery/tasks/pairrm/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/pairrm/#rank-llm-candidates","title":"Rank LLM candidates","text":"
from distilabel.steps.tasks import PairRM\n\n# Consider this as a placeholder for your actual LLM.\npair_rm = PairRM()\n\npair_rm.load()\n\nresult = next(\n    pair_rm.process(\n        [\n            {\"input\": \"Hello, how are you?\", \"candidates\": [\"fine\", \"good\", \"bad\"]},\n        ]\n    )\n)\n# result\n# [\n#     {\n#         'input': 'Hello, how are you?',\n#         'candidates': ['fine', 'good', 'bad'],\n#         'ranks': [2, 1, 3],\n#         'ranked_candidates': ['good', 'fine', 'bad'],\n#         'model_name': 'llm-blender/PairRM',\n#     }\n# ]\n
"},{"location":"components-gallery/tasks/pairrm/#references","title":"References","text":"
  • LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion

  • Pair Ranking Model

"},{"location":"components-gallery/tasks/generatesentencepair/","title":"GenerateSentencePair","text":"

Generate a positive and (optionally) a negative sentence given an anchor sentence.

GenerateSentencePair is a pre-defined task that, given an anchor sentence, generates a positive sentence related to the anchor and, optionally, a negative sentence that is either unrelated to the anchor or similar to it. Optionally, you can give a context to guide the LLM towards more specific behavior. This task is useful for generating datasets to train embedding models.

"},{"location":"components-gallery/tasks/generatesentencepair/#attributes","title":"Attributes","text":"
  • triplet: a flag to indicate if the task should generate a triplet of sentences (anchor, positive, negative). Defaults to False.

  • action: the action to perform to generate the positive sentence.

  • context: the context to use for the generation. Can be helpful to guide the LLM towards more specific context. Not used by default.

  • hard_negative: A flag to indicate if the negative should be a hard-negative or not. Hard negatives make it hard for the model to distinguish against the positive, with a higher degree of semantic similarity.

"},{"location":"components-gallery/tasks/generatesentencepair/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[anchor]\n        end\n        subgraph New columns\n            OCOL0[positive]\n            OCOL1[negative]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph GenerateSentencePair\n        StepInput[Input Columns: anchor]\n        StepOutput[Output Columns: positive, negative, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/generatesentencepair/#inputs","title":"Inputs","text":"
  • anchor (str): The anchor sentence to generate the positive and negative sentences.
"},{"location":"components-gallery/tasks/generatesentencepair/#outputs","title":"Outputs","text":"
  • positive (str): The positive sentence related to the anchor.

  • negative (str): The negative sentence unrelated to the anchor if triplet=True, or more similar to the positive to make it more challenging for a model to distinguish in case hard_negative=True.

  • model_name (str): The name of the model that was used to generate the sentences.

"},{"location":"components-gallery/tasks/generatesentencepair/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generatesentencepair/#paraphrasing","title":"Paraphrasing","text":"
from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"paraphrase\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n
"},{"location":"components-gallery/tasks/generatesentencepair/#generating-semantically-similar-sentences","title":"Generating semantically similar sentences","text":"
from distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps.tasks import GenerateSentencePair\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"semantically-similar\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"How does 3D printing work?\"}])\n
"},{"location":"components-gallery/tasks/generatesentencepair/#generating-queries","title":"Generating queries","text":"
from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"query\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"Argilla is an open-source data curation platform for LLMs. Using Argilla, ...\"}])\n
"},{"location":"components-gallery/tasks/generatesentencepair/#generating-answers","title":"Generating answers","text":"
from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"answer\",\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"What Game of Thrones villain would be the most likely to give you mercy?\"}])\n
"},{"location":"components-gallery/tasks/generatesentencepair/#_1","title":"Generating structured data with default schema (applies to every action)","text":"
from distilabel.steps.tasks import GenerateSentencePair\nfrom distilabel.models import InferenceEndpointsLLM\n\ngenerate_sentence_pair = GenerateSentencePair(\n    triplet=True, # `False` to generate only positive\n    action=\"query\",\n    context=\"Argilla is an open-source data curation platform for LLMs.\",\n    hard_negative=True,\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    ),\n    input_batch_size=10,\n    use_default_structured_output=True\n)\n\ngenerate_sentence_pair.load()\n\nresult = generate_sentence_pair.process([{\"anchor\": \"I want to generate queries for my LLM.\"}])\n
"},{"location":"components-gallery/tasks/generateembeddings/","title":"GenerateEmbeddings","text":"

Generate embeddings using the last hidden state of an LLM.

Generate embeddings for a text input using the last hidden state of an LLM, as described in the paper 'What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.

"},{"location":"components-gallery/tasks/generateembeddings/#attributes","title":"Attributes","text":"
  • llm: The LLM to use to generate the embeddings.
"},{"location":"components-gallery/tasks/generateembeddings/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[text]\n        end\n        subgraph New columns\n            OCOL0[embedding]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph GenerateEmbeddings\n        StepInput[Input Columns: text]\n        StepOutput[Output Columns: embedding, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/generateembeddings/#inputs","title":"Inputs","text":"
  • text (str, List[Dict[str, str]]): The input text or conversation to generate embeddings for.
"},{"location":"components-gallery/tasks/generateembeddings/#outputs","title":"Outputs","text":"
  • embedding (List[float]): The embedding of the input text or conversation.

  • model_name (str): The model name used to generate the embeddings.

"},{"location":"components-gallery/tasks/generateembeddings/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generateembeddings/#rank-llm-candidates","title":"Generate embeddings for a text input","text":"
from distilabel.steps.tasks import GenerateEmbeddings\nfrom distilabel.models.llms.huggingface import TransformersLLM\n\n# Consider this as a placeholder for your actual LLM.\nembedder = GenerateEmbeddings(\n    llm=TransformersLLM(\n        model=\"TaylorAI/bge-micro-v2\",\n        model_kwargs={\"is_decoder\": True},\n        cuda_devices=[],\n    )\n)\nembedder.load()\n\nresult = next(\n    embedder.process(\n        [\n            {\"text\": \"Hello, how are you?\"},\n        ]\n    )\n)\n
"},{"location":"components-gallery/tasks/generateembeddings/#references","title":"References","text":"
  • What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning
"},{"location":"components-gallery/tasks/textclustering/","title":"TextClustering","text":"

Task that clusters a set of texts and generates summary labels for each cluster.

This is a GlobalTask that inherits from TextClassification, which means that all the attributes from that class are available here. Also, in this case we deal with all the inputs at once, instead of using batches. The input_batch_size is used here to send the examples to the LLM in batches (a subtle difference with the more common Task definitions). The task looks in each cluster for a given number of representative examples (the number is set by the samples_per_cluster attribute), and sends them to the LLM to get one or more labels that represent the cluster. The labels are then assigned to each text in the cluster. The clusters and projections used in the step are assumed to be obtained from the UMAP + DBSCAN steps, but could be generated by similar steps, as long as they represent the same concepts. This step runs a pipeline like the one in this repository: https://github.com/huggingface/text-clustering

"},{"location":"components-gallery/tasks/textclustering/#attributes","title":"Attributes","text":"
  • savefig: Whether to generate and save a figure with the clustering of the texts.

  • samples_per_cluster: The number of examples to use in the LLM as a sample of the cluster.

"},{"location":"components-gallery/tasks/textclustering/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[text]\n            ICOL1[projection]\n            ICOL2[cluster_label]\n        end\n        subgraph New columns\n            OCOL0[summary_label]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph TextClustering\n        StepInput[Input Columns: text, projection, cluster_label]\n        StepOutput[Output Columns: summary_label, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/textclustering/#inputs","title":"Inputs","text":"
  • text (str): The reference text we want to obtain labels for.

  • projection (List[float]): Vector representation of the text to cluster, normally the output from the UMAP step.

  • cluster_label (int): Integer representing the label of a given cluster. -1 means it wasn't clustered.

"},{"location":"components-gallery/tasks/textclustering/#outputs","title":"Outputs","text":"
  • summary_label (str): The label or list of labels for the text.

  • model_name (str): The name of the model used to generate the label/s.

"},{"location":"components-gallery/tasks/textclustering/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/textclustering/#generate-labels-for-a-set-of-texts-using-clustering","title":"Generate labels for a set of texts using clustering","text":"
from datasets import load_dataset\n\nfrom distilabel.models import InferenceEndpointsLLM\nfrom distilabel.steps import UMAP, DBSCAN, TextClustering, make_generator_step\nfrom distilabel.pipeline import Pipeline\n\nds_name = \"argilla-warehouse/personahub-fineweb-edu-4-clustering-100k\"\n\nwith Pipeline(name=\"Text clustering dataset\") as pipeline:\n    batch_size = 500\n\n    ds = load_dataset(ds_name, split=\"train\").select(range(10000))\n    loader = make_generator_step(ds, batch_size=batch_size, repo_id=ds_name)\n\n    umap = UMAP(n_components=2, metric=\"cosine\")\n    dbscan = DBSCAN(eps=0.3, min_samples=30)\n\n    text_clustering = TextClustering(\n        llm=InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n        ),\n        n=3,  # 3 labels per example\n        query_title=\"Examples of Personas\",\n        samples_per_cluster=10,\n        context=(\n            \"Describe the main themes, topics, or categories that could describe the \"\n            \"following types of personas. All the examples of personas must share \"\n            \"the same set of labels.\"\n        ),\n        default_label=\"None\",\n        savefig=True,\n        input_batch_size=8,\n        input_mappings={\"text\": \"persona\"},\n        use_default_structured_output=True,\n    )\n\n    loader >> umap >> dbscan >> text_clustering\n
"},{"location":"components-gallery/tasks/textclustering/#references","title":"References","text":"
  • text-clustering repository
"},{"location":"components-gallery/tasks/apigensemanticchecker/","title":"APIGenSemanticChecker","text":"

Check that generated function calls and their execution results align with the user query.

The APIGenSemanticChecker is inspired by the APIGen pipeline, which was designed to generate verifiable and diverse function-calling datasets. The task assesses whether the generated function calls and their execution results accurately reflect the intentions of the user query, so that rows failing the check can be filtered out afterwards.

"},{"location":"components-gallery/tasks/apigensemanticchecker/#attributes","title":"Attributes","text":"
  • system_prompt: System prompt for the task. Has a default one.

  • exclude_failed_execution: Whether to exclude failed executions (won't run on those rows that have a False in keep_row_after_execution_check column, which comes from running APIGenExecutionChecker). Defaults to True.

"},{"location":"components-gallery/tasks/apigensemanticchecker/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[func_desc]\n            ICOL1[query]\n            ICOL2[answers]\n            ICOL3[execution_result]\n        end\n        subgraph New columns\n            OCOL0[thought]\n            OCOL1[keep_row_after_semantic_check]\n        end\n    end\n\n    subgraph APIGenSemanticChecker\n        StepInput[Input Columns: func_desc, query, answers, execution_result]\n        StepOutput[Output Columns: thought, keep_row_after_semantic_check]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    ICOL2 --> StepInput\n    ICOL3 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/apigensemanticchecker/#inputs","title":"Inputs","text":"
  • func_desc (str): Description of what the function should do.

  • query (str): Instruction from the user.

  • answers (str): JSON encoded list with arguments to be passed to the function/API. Should be loaded using json.loads.

  • execution_result (str): Result of the function/API executed.

"},{"location":"components-gallery/tasks/apigensemanticchecker/#outputs","title":"Outputs","text":"
  • thought (str): Reasoning for the output on whether to keep this output or not.

  • keep_row_after_semantic_check (bool): True or False, can be used to filter afterwards.

"},{"location":"components-gallery/tasks/apigensemanticchecker/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/apigensemanticchecker/#semantic-checker-for-generated-function-calls-original-implementation","title":"Semantic checker for generated function calls (original implementation)","text":"
from distilabel.steps.tasks import APIGenSemanticChecker\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    generation_kwargs={\n        \"temperature\": 0.7,\n        \"max_new_tokens\": 1024,\n    },\n)\nsemantic_checker = APIGenSemanticChecker(\n    use_default_structured_output=False,\n    llm=llm\n)\nsemantic_checker.load()\n\nres = next(\n    semantic_checker.process(\n        [\n            {\n                \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n                \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n                \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n                \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n            }\n        ]\n    )\n)\nres\n# [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n# 'query': 'What information can be obtained about the Maine Coon cat breed?',\n# 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n# 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n# 'thought': '',\n# 'keep_row_after_semantic_check': True,\n# 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n#     'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. 
The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n#     {'role': 'user',\n#     'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n\\nYour response MUST strictly adhere to the following JSON format, and NO other text MUST be included.\\n```\\n{\\n   \"thought\": \"Concisely describe your reasoning here\",\\n   \"pass\": \"yes\" or \"no\"\\n}\\n```\\n'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/apigensemanticchecker/#semantic-checker-for-generated-function-calls-structured-output","title":"Semantic checker for generated function calls (structured output)","text":"
from distilabel.steps.tasks import APIGenSemanticChecker\nfrom distilabel.models import InferenceEndpointsLLM\n\nllm=InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n    generation_kwargs={\n        \"temperature\": 0.7,\n        \"max_new_tokens\": 1024,\n    },\n)\nsemantic_checker = APIGenSemanticChecker(\n    use_default_structured_output=True,\n    llm=llm\n)\nsemantic_checker.load()\n\nres = next(\n    semantic_checker.process(\n        [\n            {\n                \"func_desc\": \"Fetch information about a specific cat breed from the Cat Breeds API.\",\n                \"query\": \"What information can be obtained about the Maine Coon cat breed?\",\n                \"answers\": json.dumps([{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]),\n                \"execution_result\": \"The Maine Coon is a big and hairy breed of cat\",\n            }\n        ]\n    )\n)\nres\n# [{'func_desc': 'Fetch information about a specific cat breed from the Cat Breeds API.',\n# 'query': 'What information can be obtained about the Maine Coon cat breed?',\n# 'answers': [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}],\n# 'execution_result': 'The Maine Coon is a big and hairy breed of cat',\n# 'keep_row_after_semantic_check': True,\n# 'thought': '',\n# 'raw_input_a_p_i_gen_semantic_checker_0': [{'role': 'system',\n#     'content': 'As a data quality evaluator, you must assess the alignment between a user query, corresponding function calls, and their execution results.\\nThese function calls and results are generated by other models, and your task is to ensure these results accurately reflect the user\u2019s intentions.\\n\\nDo not pass if:\\n1. The function call does not align with the query\u2019s objective, or the input arguments appear incorrect.\\n2. The function call and arguments are not properly chosen from the available functions.\\n3. 
The number of function calls does not correspond to the user\u2019s intentions.\\n4. The execution results are irrelevant and do not match the function\u2019s purpose.\\n5. The execution results contain errors or reflect that the function calls were not executed successfully.\\n'},\n#     {'role': 'user',\n#     'content': 'Given Information:\\n- All Available Functions:\\nFetch information about a specific cat breed from the Cat Breeds API.\\n- User Query: What information can be obtained about the Maine Coon cat breed?\\n- Generated Function Calls: [{\"name\": \"get_breed_information\", \"arguments\": {\"breed\": \"Maine Coon\"}}]\\n- Execution Results: The Maine Coon is a big and hairy breed of cat\\n\\nNote: The query may have multiple intentions. Functions may be placeholders, and execution results may be truncated due to length, which is acceptable and should not cause a failure.\\n\\nThe main decision factor is wheather the function calls accurately reflect the query\\'s intentions and the function descriptions.\\nProvide your reasoning in the thought section and decide if the data passes (answer yes or no).\\nIf not passing, concisely explain your reasons in the thought section; otherwise, leave this section blank.\\n'}]},\n# 'model_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'}]\n
"},{"location":"components-gallery/tasks/apigensemanticchecker/#references","title":"References","text":"
  • APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets

  • Salesforce/xlam-function-calling-60k

"},{"location":"components-gallery/tasks/generatetextretrievaldata/","title":"GenerateTextRetrievalData","text":"

Generate text retrieval data with an LLM to later on train an embedding model.

GenerateTextRetrievalData is a Task that generates text retrieval data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

"},{"location":"components-gallery/tasks/generatetextretrievaldata/#note","title":"Note","text":"

Ideally, this task should be used together with EmbeddingTaskGenerator configured with flatten_tasks=True and category=\"text-retrieval\", so that the LLM generates a list of tasks that is flattened, leaving each row with a single task for the text-retrieval category.

"},{"location":"components-gallery/tasks/generatetextretrievaldata/#attributes","title":"Attributes","text":"
  • language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

  • query_type: The type of query to be generated, which can be extremely long-tail, long-tail, or common. Defaults to None, meaning that it will be randomly sampled.

  • query_length: The length of the query to be generated, which can be less than 5 words, 5 to 15 words, or at least 10 words. Defaults to None, meaning that it will be randomly sampled.

  • difficulty: The difficulty of the query to be generated, which can be high school, college, or PhD. Defaults to None, meaning that it will be randomly sampled.

  • clarity: The clarity of the query to be generated, which can be clear, understandable with some effort, or ambiguous. Defaults to None, meaning that it will be randomly sampled.

  • num_words: The number of words in the query to be generated, which can be 50, 100, 200, 300, 400, or 500. Defaults to None, meaning that it will be randomly sampled.

  • seed: The random seed to be set in case there's any sampling within the format_input method.

"},{"location":"components-gallery/tasks/generatetextretrievaldata/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[task]\n        end\n        subgraph New columns\n            OCOL0[user_query]\n            OCOL1[positive_document]\n            OCOL2[hard_negative_document]\n            OCOL3[model_name]\n        end\n    end\n\n    subgraph GenerateTextRetrievalData\n        StepInput[Input Columns: task]\n        StepOutput[Output Columns: user_query, positive_document, hard_negative_document, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/generatetextretrievaldata/#inputs","title":"Inputs","text":"
  • task (str): The task description to be used in the generation.
"},{"location":"components-gallery/tasks/generatetextretrievaldata/#outputs","title":"Outputs","text":"
  • user_query (str): the user query generated by the LLM.

  • positive_document (str): the positive document generated by the LLM.

  • hard_negative_document (str): the hard negative document generated by the LLM.

  • model_name (str): the name of the model used to generate the text retrieval data.

"},{"location":"components-gallery/tasks/generatetextretrievaldata/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generatetextretrievaldata/#generate-synthetic-text-retrieval-data-for-training-embedding-models","title":"Generate synthetic text retrieval data for training embedding models","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextRetrievalData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = EmbeddingTaskGenerator(\n        category=\"text-retrieval\",\n        flatten_tasks=True,\n        llm=...,  # LLM instance\n    )\n\n    generate = GenerateTextRetrievalData(\n        language=\"English\",\n        query_type=\"common\",\n        query_length=\"5 to 15 words\",\n        difficulty=\"high school\",\n        clarity=\"clear\",\n        num_words=100,\n        llm=...,  # LLM instance\n    )\n\n    task >> generate\n
"},{"location":"components-gallery/tasks/generatetextretrievaldata/#references","title":"References","text":"
  • Improving Text Embeddings with Large Language Models
"},{"location":"components-gallery/tasks/generateshorttextmatchingdata/","title":"GenerateShortTextMatchingData","text":"

Generate short text matching data with an LLM to later on train an embedding model.

GenerateShortTextMatchingData is a Task that generates short text matching data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

"},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#note","title":"Note","text":"

Ideally, this task should be used together with EmbeddingTaskGenerator configured with flatten_tasks=True and category=\"text-matching-short\", so that the LLM generates a list of tasks that is flattened, leaving each row with a single task for the text-matching-short category.

"},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#attributes","title":"Attributes","text":"
  • language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

  • seed: The random seed to be set in case there's any sampling within the format_input method. Note that in this task the seed has no effect since there are no sampling params.

"},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[task]\n        end\n        subgraph New columns\n            OCOL0[input]\n            OCOL1[positive_document]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph GenerateShortTextMatchingData\n        StepInput[Input Columns: task]\n        StepOutput[Output Columns: input, positive_document, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#inputs","title":"Inputs","text":"
  • task (str): The task description to be used in the generation.
"},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#outputs","title":"Outputs","text":"
  • input (str): the input generated by the LLM.

  • positive_document (str): the positive document generated by the LLM.

  • model_name (str): the name of the model used to generate the short text matching data.

"},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#generate-synthetic-short-text-matching-data-for-training-embedding-models","title":"Generate synthetic short text matching data for training embedding models","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateShortTextMatchingData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = EmbeddingTaskGenerator(\n        category=\"text-matching-short\",\n        flatten_tasks=True,\n        llm=...,  # LLM instance\n    )\n\n    generate = GenerateShortTextMatchingData(\n        language=\"English\",\n        llm=...,  # LLM instance\n    )\n\n    task >> generate\n
"},{"location":"components-gallery/tasks/generateshorttextmatchingdata/#references","title":"References","text":"
  • Improving Text Embeddings with Large Language Models
"},{"location":"components-gallery/tasks/generatelongtextmatchingdata/","title":"GenerateLongTextMatchingData","text":"

Generate long text matching data with an LLM to later on train an embedding model.

GenerateLongTextMatchingData is a Task that generates long text matching data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

"},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#note","title":"Note","text":"

Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-matching-long\"; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-matching-long category.

"},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#attributes","title":"Attributes","text":"
  • language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

  • seed: The random seed to be set in case there's any sampling within the format_input method. Note that in this task the seed has no effect since there are no sampling params.

"},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[task]\n        end\n        subgraph New columns\n            OCOL0[input]\n            OCOL1[positive_document]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph GenerateLongTextMatchingData\n        StepInput[Input Columns: task]\n        StepOutput[Output Columns: input, positive_document, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#inputs","title":"Inputs","text":"
  • task (str): The task description to be used in the generation.
"},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#outputs","title":"Outputs","text":"
  • input (str): the input generated by the LLM.

  • positive_document (str): the positive document generated by the LLM.

  • model_name (str): the name of the model used to generate the long text matching data.

"},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#generate-synthetic-long-text-matching-data-for-training-embedding-models","title":"Generate synthetic long text matching data for training embedding models","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateLongTextMatchingData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = EmbeddingTaskGenerator(\n        category=\"text-matching-long\",\n        flatten_tasks=True,\n        llm=...,  # LLM instance\n    )\n\n    generate = GenerateLongTextMatchingData(\n        language=\"English\",\n        llm=...,  # LLM instance\n    )\n\n    task >> generate\n
"},{"location":"components-gallery/tasks/generatelongtextmatchingdata/#references","title":"References","text":"
  • Improving Text Embeddings with Large Language Models
"},{"location":"components-gallery/tasks/generatetextclassificationdata/","title":"GenerateTextClassificationData","text":"

Generate text classification data with an LLM to later on train an embedding model.

GenerateTextClassificationData is a Task that generates text classification data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

"},{"location":"components-gallery/tasks/generatetextclassificationdata/#note","title":"Note","text":"

Ideally this task should be used with EmbeddingTaskGenerator with flatten_tasks=True with the category=\"text-classification\"; so that the LLM generates a list of tasks that are flattened so that each row contains a single task for the text-classification category.

"},{"location":"components-gallery/tasks/generatetextclassificationdata/#attributes","title":"Attributes","text":"
  • language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

  • difficulty: The difficulty of the query to be generated, which can be high school, college, or PhD. Defaults to None, meaning that it will be randomly sampled.

  • clarity: The clarity of the query to be generated, which can be clear, understandable with some effort, or ambiguous. Defaults to None, meaning that it will be randomly sampled.

  • seed: The random seed to be set in case there's any sampling within the format_input method.

"},{"location":"components-gallery/tasks/generatetextclassificationdata/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[task]\n        end\n        subgraph New columns\n            OCOL0[input_text]\n            OCOL1[label]\n            OCOL2[misleading_label]\n            OCOL3[model_name]\n        end\n    end\n\n    subgraph GenerateTextClassificationData\n        StepInput[Input Columns: task]\n        StepOutput[Output Columns: input_text, label, misleading_label, model_name]\n    end\n\n    ICOL0 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/generatetextclassificationdata/#inputs","title":"Inputs","text":"
  • task (str): The task description to be used in the generation.
"},{"location":"components-gallery/tasks/generatetextclassificationdata/#outputs","title":"Outputs","text":"
  • input_text (str): the input text generated by the LLM.

  • label (str): the label generated by the LLM.

  • misleading_label (str): the misleading label generated by the LLM.

  • model_name (str): the name of the model used to generate the text classification data.

"},{"location":"components-gallery/tasks/generatetextclassificationdata/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/generatetextclassificationdata/#generate-synthetic-text-classification-data-for-training-embedding-models","title":"Generate synthetic text classification data for training embedding models","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator, GenerateTextClassificationData\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = EmbeddingTaskGenerator(\n        category=\"text-classification\",\n        flatten_tasks=True,\n        llm=...,  # LLM instance\n    )\n\n    generate = GenerateTextClassificationData(\n        language=\"English\",\n        difficulty=\"high school\",\n        clarity=\"clear\",\n        llm=...,  # LLM instance\n    )\n\n    task >> generate\n
"},{"location":"components-gallery/tasks/generatetextclassificationdata/#references","title":"References","text":"
  • Improving Text Embeddings with Large Language Models
"},{"location":"components-gallery/tasks/structuredgeneration/","title":"StructuredGeneration","text":"

Generate structured content for a given instruction using an LLM.

StructuredGeneration is a pre-defined task that defines the instruction and the structured_output as the inputs, and generation as the output. This task is used to generate structured content based on the input instruction and following the schema provided within the structured_output column per each instruction. The model_name is also returned as part of the output in order to enhance it.

"},{"location":"components-gallery/tasks/structuredgeneration/#attributes","title":"Attributes","text":"
  • use_system_prompt: Whether to use the system prompt in the generation. Defaults to True, which means that if the column system_prompt is defined within the input batch, then the system_prompt will be used, otherwise, it will be ignored.
"},{"location":"components-gallery/tasks/structuredgeneration/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph Columns\n            ICOL0[instruction]\n            ICOL1[structured_output]\n        end\n        subgraph New columns\n            OCOL0[generation]\n            OCOL1[model_name]\n        end\n    end\n\n    subgraph StructuredGeneration\n        StepInput[Input Columns: instruction, structured_output]\n        StepOutput[Output Columns: generation, model_name]\n    end\n\n    ICOL0 --> StepInput\n    ICOL1 --> StepInput\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepInput --> StepOutput\n
"},{"location":"components-gallery/tasks/structuredgeneration/#inputs","title":"Inputs","text":"
  • instruction (str): The instruction to generate structured content from.

  • structured_output (Dict[str, Any]): The structured_output to generate structured content from. It should be a Python dictionary with the keys format and schema, where format should be one of json or regex, and the schema should be either the JSON schema or the regex pattern, respectively.

"},{"location":"components-gallery/tasks/structuredgeneration/#outputs","title":"Outputs","text":"
  • generation (str): The generated text matching the provided schema, if possible.

  • model_name (str): The name of the model used to generate the text.

"},{"location":"components-gallery/tasks/structuredgeneration/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/structuredgeneration/#generate-structured-output-from-a-json-schema","title":"Generate structured output from a JSON schema","text":"
from distilabel.steps.tasks import StructuredGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nstructured_gen = StructuredGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    ),\n)\n\nstructured_gen.load()\n\nresult = next(\n    structured_gen.process(\n        [\n            {\n                \"instruction\": \"Create an RPG character\",\n                \"structured_output\": {\n                    \"format\": \"json\",\n                    \"schema\": {\n                        \"properties\": {\n                            \"name\": {\n                                \"title\": \"Name\",\n                                \"type\": \"string\"\n                            },\n                            \"description\": {\n                                \"title\": \"Description\",\n                                \"type\": \"string\"\n                            },\n                            \"role\": {\n                                \"title\": \"Role\",\n                                \"type\": \"string\"\n                            },\n                            \"weapon\": {\n                                \"title\": \"Weapon\",\n                                \"type\": \"string\"\n                            }\n                        },\n                        \"required\": [\n                            \"name\",\n                            \"description\",\n                            \"role\",\n                            \"weapon\"\n                        ],\n                        \"title\": \"Character\",\n                        \"type\": \"object\"\n                    }\n                },\n            }\n        ]\n    )\n)\n
"},{"location":"components-gallery/tasks/structuredgeneration/#generate-structured-output-from-a-regex-pattern-only-works-with-llms-that-support-regex-the-providers-using-outlines","title":"Generate structured output from a regex pattern (only works with LLMs that support regex, the providers using outlines)","text":"
from distilabel.steps.tasks import StructuredGeneration\nfrom distilabel.models import InferenceEndpointsLLM\n\nstructured_gen = StructuredGeneration(\n    llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    ),\n)\n\nstructured_gen.load()\n\nresult = next(\n    structured_gen.process(\n        [\n            {\n                \"instruction\": \"What's the weather like today in Seattle in Celsius degrees?\",\n                \"structured_output\": {\n                    \"format\": \"regex\",\n                    \"schema\": r\"(\\d{1,2})\u00b0C\"\n                },\n\n            }\n        ]\n    )\n)\n
"},{"location":"components-gallery/tasks/monolingualtripletgenerator/","title":"MonolingualTripletGenerator","text":"

Generate monolingual triplets with an LLM to later on train an embedding model.

MonolingualTripletGenerator is a GeneratorTask that generates monolingual triplets with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

"},{"location":"components-gallery/tasks/monolingualtripletgenerator/#attributes","title":"Attributes","text":"
  • language: The language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

  • unit: The unit of the data to be generated, which can be sentence, phrase, or passage. Defaults to None, meaning that it will be randomly sampled.

  • difficulty: The difficulty of the query to be generated, which can be elementary school, high school, or college. Defaults to None, meaning that it will be randomly sampled.

  • high_score: The high score of the query to be generated, which can be 4, 4.5, or 5. Defaults to None, meaning that it will be randomly sampled.

  • low_score: The low score of the query to be generated, which can be 2.5, 3, or 3.5. Defaults to None, meaning that it will be randomly sampled.

  • seed: The random seed to be set in case there's any sampling within the format_input method.

"},{"location":"components-gallery/tasks/monolingualtripletgenerator/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[S1]\n            OCOL1[S2]\n            OCOL2[S3]\n            OCOL3[model_name]\n        end\n    end\n\n    subgraph MonolingualTripletGenerator\n        StepOutput[Output Columns: S1, S2, S3, model_name]\n    end\n\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n
"},{"location":"components-gallery/tasks/monolingualtripletgenerator/#outputs","title":"Outputs","text":"
  • S1 (str): the first sentence generated by the LLM.

  • S2 (str): the second sentence generated by the LLM.

  • S3 (str): the third sentence generated by the LLM.

  • model_name (str): the name of the model used to generate the monolingual triplets.

"},{"location":"components-gallery/tasks/monolingualtripletgenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/monolingualtripletgenerator/#generate-monolingual-triplets-for-training-embedding-models","title":"Generate monolingual triplets for training embedding models","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import MonolingualTripletGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = MonolingualTripletGenerator(\n        language=\"English\",\n        unit=\"sentence\",\n        difficulty=\"elementary school\",\n        high_score=\"4\",\n        low_score=\"2.5\",\n        llm=...,\n    )\n\n    ...\n\n    task >> ...\n
"},{"location":"components-gallery/tasks/bitextretrievalgenerator/","title":"BitextRetrievalGenerator","text":"

Generate bitext retrieval data with an LLM to later on train an embedding model.

BitextRetrievalGenerator is a GeneratorTask that generates bitext retrieval data with an LLM to later on train an embedding model. The task is based on the paper \"Improving Text Embeddings with Large Language Models\" and the data is generated based on the provided attributes, or randomly sampled if not provided.

"},{"location":"components-gallery/tasks/bitextretrievalgenerator/#attributes","title":"Attributes","text":"
  • source_language: The source language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

  • target_language: The target language of the data to be generated, which can be any of the languages retrieved from the list of XLM-R in the Appendix A of https://aclanthology.org/2020.acl-main.747.pdf.

  • unit: The unit of the data to be generated, which can be sentence, phrase, or passage. Defaults to None, meaning that it will be randomly sampled.

  • difficulty: The difficulty of the query to be generated, which can be elementary school, high school, or college. Defaults to None, meaning that it will be randomly sampled.

  • high_score: The high score of the query to be generated, which can be 4, 4.5, or 5. Defaults to None, meaning that it will be randomly sampled.

  • low_score: The low score of the query to be generated, which can be 2.5, 3, or 3.5. Defaults to None, meaning that it will be randomly sampled.

  • seed: The random seed to be set in case there's any sampling within the format_input method.

"},{"location":"components-gallery/tasks/bitextretrievalgenerator/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[S1]\n            OCOL1[S2]\n            OCOL2[S3]\n            OCOL3[model_name]\n        end\n    end\n\n    subgraph BitextRetrievalGenerator\n        StepOutput[Output Columns: S1, S2, S3, model_name]\n    end\n\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n    StepOutput --> OCOL3\n
"},{"location":"components-gallery/tasks/bitextretrievalgenerator/#outputs","title":"Outputs","text":"
  • S1 (str): the first sentence generated by the LLM.

  • S2 (str): the second sentence generated by the LLM.

  • S3 (str): the third sentence generated by the LLM.

  • model_name (str): the name of the model used to generate the bitext retrieval data.

"},{"location":"components-gallery/tasks/bitextretrievalgenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/bitextretrievalgenerator/#generate-bitext-retrieval-data-for-training-embedding-models","title":"Generate bitext retrieval data for training embedding models","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import BitextRetrievalGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = BitextRetrievalGenerator(\n        source_language=\"English\",\n        target_language=\"Spanish\",\n        unit=\"sentence\",\n        difficulty=\"elementary school\",\n        high_score=\"4\",\n        low_score=\"2.5\",\n        llm=...,\n    )\n\n    ...\n\n    task >> ...\n
"},{"location":"components-gallery/tasks/embeddingtaskgenerator/","title":"EmbeddingTaskGenerator","text":"

Generate task descriptions for embedding-related tasks using an LLM.

EmbeddingTaskGenerator is a GeneratorTask that doesn't receive any input besides the provided attributes, and that generates task descriptions for embedding-related tasks using a pre-defined prompt based on the category attribute. The category attribute should be one of the following:

- `text-retrieval`: Generate task descriptions for text retrieval tasks.\n- `text-matching-short`: Generate task descriptions for short text matching tasks.\n- `text-matching-long`: Generate task descriptions for long text matching tasks.\n- `text-classification`: Generate task descriptions for text classification tasks.\n
"},{"location":"components-gallery/tasks/embeddingtaskgenerator/#attributes","title":"Attributes","text":"
  • category: The category of the task to be generated, which can either be text-retrieval, text-matching-short, text-matching-long, or text-classification.

  • flatten_tasks: Whether to flatten the tasks i.e. since a list of tasks is generated by the LLM, this attribute indicates whether to flatten the list or not. Defaults to False, meaning that running this task with num_generations=1 will return a distilabel.Distiset with one row only containing a list with around 20 tasks; otherwise, if set to True, it will return a distilabel.Distiset with around 20 rows, each containing one task.

"},{"location":"components-gallery/tasks/embeddingtaskgenerator/#input-output-columns","title":"Input & Output Columns","text":"
graph TD\n    subgraph Dataset\n        subgraph New columns\n            OCOL0[tasks]\n            OCOL1[task]\n            OCOL2[model_name]\n        end\n    end\n\n    subgraph EmbeddingTaskGenerator\n        StepOutput[Output Columns: tasks, task, model_name]\n    end\n\n    StepOutput --> OCOL0\n    StepOutput --> OCOL1\n    StepOutput --> OCOL2\n
"},{"location":"components-gallery/tasks/embeddingtaskgenerator/#outputs","title":"Outputs","text":"
  • tasks (List[str]): the list of tasks generated by the LLM.

  • task (str): the task generated by the LLM if flatten_tasks=True.

  • model_name (str): the name of the model used to generate the tasks.

"},{"location":"components-gallery/tasks/embeddingtaskgenerator/#examples","title":"Examples","text":""},{"location":"components-gallery/tasks/embeddingtaskgenerator/#generate-embedding-tasks-for-text-retrieval","title":"Generate embedding tasks for text retrieval","text":"
from distilabel.pipeline import Pipeline\nfrom distilabel.steps.tasks import EmbeddingTaskGenerator\n\nwith Pipeline(\"my-pipeline\") as pipeline:\n    task = EmbeddingTaskGenerator(\n        category=\"text-retrieval\",\n        flatten_tasks=True,\n        llm=...,  # LLM instance\n    )\n\n    ...\n\n    task >> ...\n
"},{"location":"components-gallery/tasks/embeddingtaskgenerator/#references","title":"References","text":"
  • Improving Text Embeddings with Large Language Models
"},{"location":"components-gallery/llms/","title":"LLMs Gallery","text":"
  • AnthropicLLM

    Anthropic LLM implementation running the Async API client.

    AnthropicLLM

  • OpenAILLM

    OpenAI LLM implementation running the async API client.

    OpenAILLM

  • AnyscaleLLM

    Anyscale LLM implementation running the async API client of OpenAI.

    AnyscaleLLM

  • AzureOpenAILLM

    Azure OpenAI LLM implementation running the async API client.

    AzureOpenAILLM

  • TogetherLLM

    TogetherLLM LLM implementation running the async API client of OpenAI.

    TogetherLLM

  • ClientvLLM

    A client for the vLLM server implementing the OpenAI API specification.

    ClientvLLM

  • CohereLLM

    Cohere API implementation using the async client for concurrent text generation.

    CohereLLM

  • GroqLLM

    Groq API implementation using the async client for concurrent text generation.

    GroqLLM

  • InferenceEndpointsLLM

    InferenceEndpoints LLM implementation running the async API client.

    InferenceEndpointsLLM

  • LiteLLM

    LiteLLM implementation running the async API client.

    LiteLLM

  • MistralLLM

    Mistral LLM implementation running the async API client.

    MistralLLM

  • MixtureOfAgentsLLM

    Mixture-of-Agents implementation.

    MixtureOfAgentsLLM

  • OllamaLLM

    Ollama LLM implementation running the Async API client.

    OllamaLLM

  • VertexAILLM

    VertexAI LLM implementation running the async API clients for Gemini.

    VertexAILLM

  • TransformersLLM

    Hugging Face transformers library LLM implementation using the text generation pipeline.

    TransformersLLM

  • LlamaCppLLM

    llama.cpp LLM implementation running the Python bindings for the C++ code.

    LlamaCppLLM

  • vLLM

    vLLM library LLM implementation.

    vLLM

"},{"location":"components-gallery/llms/anthropicllm/","title":"AnthropicLLM","text":"

Anthropic LLM implementation running the Async API client.

"},{"location":"components-gallery/llms/anthropicllm/#attributes","title":"Attributes","text":"
  • model: the name of the model to use for the LLM e.g. \"claude-3-opus-20240229\", \"claude-3-sonnet-20240229\", etc. Available models can be checked here: Anthropic: Models overview.

  • api_key: the API key to authenticate the requests to the Anthropic API. If not provided, it will be read from ANTHROPIC_API_KEY environment variable.

  • base_url: the base URL to use for the Anthropic API. Defaults to None which means that https://api.anthropic.com will be used internally.

  • timeout: the maximum time in seconds to wait for a response. Defaults to 600.0.

  • max_retries: The maximum number of times to retry the request before failing. Defaults to 6.

  • http_client: if provided, an alternative HTTP client to use for calling Anthropic API. Defaults to None.

  • structured_output: a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

  • _api_key_env_var: the name of the environment variable to use for the API key. It is meant to be used internally.

  • _aclient: the AsyncAnthropic client to use for the Anthropic API. It is meant to be used internally. Set in the load method.

"},{"location":"components-gallery/llms/anthropicllm/#runtime-parameters","title":"Runtime Parameters","text":"
  • api_key: the API key to authenticate the requests to the Anthropic API. If not provided, it will be read from ANTHROPIC_API_KEY environment variable.

  • base_url: the base URL to use for the Anthropic API. Defaults to \"https://api.anthropic.com\".

  • timeout: the maximum time in seconds to wait for a response. Defaults to 600.0.

  • max_retries: the maximum number of times to retry the request before failing. Defaults to 6.

"},{"location":"components-gallery/llms/anthropicllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/anthropicllm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import AnthropicLLM\n\nllm = AnthropicLLM(model=\"claude-3-opus-20240229\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/anthropicllm/#generate-structured-data","title":"Generate structured data","text":"
from pydantic import BaseModel\nfrom distilabel.models.llms import AnthropicLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = AnthropicLLM(\n    model=\"claude-3-opus-20240229\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
"},{"location":"components-gallery/llms/openaillm/","title":"OpenAILLM","text":"

OpenAI LLM implementation running the async API client.

"},{"location":"components-gallery/llms/openaillm/#attributes","title":"Attributes","text":"
  • model: the model name to use for the LLM e.g. \"gpt-3.5-turbo\", \"gpt-4\", etc. Supported models can be found here.

  • base_url: the base URL to use for the OpenAI API requests. Defaults to None, which means that the value set for the environment variable OPENAI_BASE_URL will be used, or \"https://api.openai.com/v1\" if not set.

  • api_key: the API key to authenticate the requests to the OpenAI API. Defaults to None which means that the value set for the environment variable OPENAI_API_KEY will be used, or None if not set.

  • max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6.

  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.

  • structured_output: a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

"},{"location":"components-gallery/llms/openaillm/#runtime-parameters","title":"Runtime Parameters","text":"
  • base_url: the base URL to use for the OpenAI API requests. Defaults to None.

  • api_key: the API key to authenticate the requests to the OpenAI API. Defaults to None.

  • max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6.

  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.

"},{"location":"components-gallery/llms/openaillm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/openaillm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/openaillm/#generate-text-from-a-custom-endpoint-following-the-openai-api","title":"Generate text from a custom endpoint following the OpenAI API","text":"
from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(\n    model=\"prometheus-eval/prometheus-7b-v2.0\",\n    base_url=r\"http://localhost:8080/v1\"\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/openaillm/#generate-structured-data","title":"Generate structured data","text":"
from pydantic import BaseModel\nfrom distilabel.models.llms import OpenAILLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = OpenAILLM(\n    model=\"gpt-4-turbo\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
"},{"location":"components-gallery/llms/openaillm/#generate-with-batch-api-offline-batch-generation","title":"Generate with Batch API (offline batch generation)","text":"
from distilabel.models.llms import OpenAILLM\n\nllm = OpenAILLM(\n    model=\"gpt-3.5-turbo\",\n    use_offline_batch_generation=True,\n    offline_batch_generation_block_until_done=5,  # poll for results every 5 seconds\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n# [['Hello! How can I assist you today?']]\n
"},{"location":"components-gallery/llms/anyscalellm/","title":"AnyscaleLLM","text":"

Anyscale LLM implementation running the async API client of OpenAI.

"},{"location":"components-gallery/llms/anyscalellm/#attributes","title":"Attributes","text":"
  • model: the model name to use for the LLM, e.g., google/gemma-7b-it. See the supported models under the \"Text Generation -> Supported Models\" section here.

  • base_url: the base URL to use for the Anyscale API requests. Defaults to None, which means that the value set for the environment variable ANYSCALE_BASE_URL will be used, or \"https://api.endpoints.anyscale.com/v1\" if not set.

  • api_key: the API key to authenticate the requests to the Anyscale API. Defaults to None which means that the value set for the environment variable ANYSCALE_API_KEY will be used, or None if not set.

  • _api_key_env_var: the name of the environment variable to use for the API key. It is meant to be used internally.

"},{"location":"components-gallery/llms/anyscalellm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/anyscalellm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import AnyscaleLLM\n\nllm = AnyscaleLLM(model=\"google/gemma-7b-it\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/azureopenaillm/","title":"AzureOpenAILLM","text":"

Azure OpenAI LLM implementation running the async API client.

"},{"location":"components-gallery/llms/azureopenaillm/#attributes","title":"Attributes","text":"
  • model: the model name to use for the LLM i.e. the name of the Azure deployment.

  • base_url: the base URL to use for the Azure OpenAI API, which can be set with AZURE_OPENAI_ENDPOINT. Defaults to None which means that the value set for the environment variable AZURE_OPENAI_ENDPOINT will be used, or None if not set.

  • api_key: the API key to authenticate the requests to the Azure OpenAI API. Defaults to None which means that the value set for the environment variable AZURE_OPENAI_API_KEY will be used, or None if not set.

  • api_version: the API version to use for the Azure OpenAI API. Defaults to None which means that the value set for the environment variable OPENAI_API_VERSION will be used, or None if not set.

"},{"location":"components-gallery/llms/azureopenaillm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/azureopenaillm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import AzureOpenAILLM\n\nllm = AzureOpenAILLM(model=\"gpt-4-turbo\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/azureopenaillm/#generate-text-from-a-custom-endpoint-following-the-openai-api","title":"Generate text from a custom endpoint following the OpenAI API","text":"
from distilabel.models.llms import AzureOpenAILLM\n\nllm = AzureOpenAILLM(\n    model=\"prometheus-eval/prometheus-7b-v2.0\",\n    base_url=r\"http://localhost:8080/v1\"\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/azureopenaillm/#generate-structured-data","title":"Generate structured data","text":"
from pydantic import BaseModel\nfrom distilabel.models.llms import AzureOpenAILLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = AzureOpenAILLM(\n    model=\"gpt-4-turbo\",\n    api_key=\"api.key\",\n    structured_output={\"schema\": User}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
"},{"location":"components-gallery/llms/togetherllm/","title":"TogetherLLM","text":"

TogetherLLM LLM implementation running the async API client of OpenAI.

"},{"location":"components-gallery/llms/togetherllm/#attributes","title":"Attributes","text":"
  • model: the model name to use for the LLM e.g. \"mistralai/Mixtral-8x7B-Instruct-v0.1\". Supported models can be found here.

  • base_url: the base URL to use for the Together API, which can be set with TOGETHER_BASE_URL. Defaults to None which means that the value set for the environment variable TOGETHER_BASE_URL will be used, or \"https://api.together.xyz/v1\" if not set.

  • api_key: the API key to authenticate the requests to the Together API. Defaults to None which means that the value set for the environment variable TOGETHER_API_KEY will be used, or None if not set.

  • _api_key_env_var: the name of the environment variable to use for the API key. It is meant to be used internally.

"},{"location":"components-gallery/llms/togetherllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/togetherllm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import TogetherLLM\n\nllm = TogetherLLM(model=\"mistralai/Mixtral-8x7B-Instruct-v0.1\", api_key=\"api.key\")\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/clientvllm/","title":"ClientvLLM","text":"

A client for the vLLM server implementing the OpenAI API specification.

"},{"location":"components-gallery/llms/clientvllm/#attributes","title":"Attributes","text":"
  • base_url: the base URL of the vLLM server. Defaults to \"http://localhost:8000\".

  • max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6.

  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.

  • httpx_client_kwargs: extra kwargs that will be passed to the httpx.AsyncClient created to communicate with the vLLM server. Defaults to None.

  • tokenizer: the Hugging Face Hub repo id or path of the tokenizer that will be used to apply the chat template and tokenize the inputs before sending it to the server. Defaults to None.

  • tokenizer_revision: the revision of the tokenizer to load. Defaults to None.

  • _aclient: the httpx.AsyncClient used to communicate with the vLLM server. Defaults to None.

"},{"location":"components-gallery/llms/clientvllm/#runtime-parameters","title":"Runtime Parameters","text":"
  • base_url: the base url of the vLLM server. Defaults to \"http://localhost:8000\".

  • max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 6.

  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.

  • httpx_client_kwargs: extra kwargs that will be passed to the httpx.AsyncClient created to communicate with the vLLM server. Defaults to None.

"},{"location":"components-gallery/llms/clientvllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/clientvllm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import ClientvLLM\n\nllm = ClientvLLM(\n    base_url=\"http://localhost:8000/v1\",\n    tokenizer=\"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n)\n\nllm.load()\n\nresults = llm.generate_outputs(\n    inputs=[[{\"role\": \"user\", \"content\": \"Hello, how are you?\"}]],\n    temperature=0.7,\n    top_p=1.0,\n    max_new_tokens=256,\n)\n# [\n#     [\n#         \"I'm functioning properly, thank you for asking. How can I assist you today?\",\n#         \"I'm doing well, thank you for asking. I'm a large language model, so I don't have feelings or emotions like humans do, but I'm here to help answer any questions or provide information you might need. How can I assist you today?\",\n#         \"I'm just a computer program, so I don't have feelings like humans do, but I'm functioning properly and ready to help you with any questions or tasks you have. What's on your mind?\"\n#     ]\n# ]\n
"},{"location":"components-gallery/llms/coherellm/","title":"CohereLLM","text":"

Cohere API implementation using the async client for concurrent text generation.

"},{"location":"components-gallery/llms/coherellm/#attributes","title":"Attributes","text":"
  • model: the name of the model from the Cohere API to use for the generation.

  • base_url: the base URL to use for the Cohere API requests. Defaults to \"https://api.cohere.ai/v1\".

  • api_key: the API key to authenticate the requests to the Cohere API. Defaults to the value of the COHERE_API_KEY environment variable.

  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.

  • client_name: the name of the client to use for the API requests. Defaults to \"distilabel\".

  • structured_output: a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

  • _ChatMessage: the ChatMessage class from the cohere package.

  • _aclient: the AsyncClient client from the cohere package.

"},{"location":"components-gallery/llms/coherellm/#runtime-parameters","title":"Runtime Parameters","text":"
  • base_url: the base URL to use for the Cohere API requests. Defaults to \"https://api.cohere.ai/v1\".

  • api_key: the API key to authenticate the requests to the Cohere API. Defaults to the value of the COHERE_API_KEY environment variable.

  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.

  • client_name: the name of the client to use for the API requests. Defaults to \"distilabel\".

"},{"location":"components-gallery/llms/coherellm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/coherellm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import CohereLLM\n\nllm = CohereLLM(model=\"CohereForAI/c4ai-command-r-plus\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n
"},{"location":"components-gallery/llms/groqllm/","title":"GroqLLM","text":"

Groq API implementation using the async client for concurrent text generation.

"},{"location":"components-gallery/llms/groqllm/#attributes","title":"Attributes","text":"
  • model: the name of the model from the Groq API to use for the generation.

  • base_url: the base URL to use for the Groq API requests. Defaults to \"https://api.groq.com\".

  • api_key: the API key to authenticate the requests to the Groq API. Defaults to the value of the GROQ_API_KEY environment variable.

  • max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 2.

  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.

  • structured_output: a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

  • _api_key_env_var: the name of the environment variable to use for the API key.

  • _aclient: the AsyncGroq client from the groq package.

"},{"location":"components-gallery/llms/groqllm/#runtime-parameters","title":"Runtime Parameters","text":"
  • base_url: the base URL to use for the Groq API requests. Defaults to \"https://api.groq.com\".

  • api_key: the API key to authenticate the requests to the Groq API. Defaults to the value of the GROQ_API_KEY environment variable.

  • max_retries: the maximum number of times to retry the request to the API before failing. Defaults to 2.

  • timeout: the maximum time in seconds to wait for a response from the API. Defaults to 120.

"},{"location":"components-gallery/llms/groqllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/groqllm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import GroqLLM\n\nllm = GroqLLM(model=\"llama3-70b-8192\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n
"},{"location":"components-gallery/llms/inferenceendpointsllm/","title":"InferenceEndpointsLLM","text":"

InferenceEndpoints LLM implementation running the async API client.

This LLM will internally use huggingface_hub.AsyncInferenceClient.

"},{"location":"components-gallery/llms/inferenceendpointsllm/#attributes","title":"Attributes","text":"
  • model_id: the model ID to use for the LLM as available in the Hugging Face Hub, which will be used to resolve the base URL for the serverless Inference Endpoints API requests. Defaults to None.

  • endpoint_name: the name of the Inference Endpoint to use for the LLM. Defaults to None.

  • endpoint_namespace: the namespace of the Inference Endpoint to use for the LLM. Defaults to None.

  • base_url: the base URL to use for the Inference Endpoints API requests.

  • api_key: the API key to authenticate the requests to the Inference Endpoints API.

  • tokenizer_id: the tokenizer ID to use for the LLM as available in the Hugging Face Hub. Defaults to None, but defining one is recommended to properly format the prompt.

  • model_display_name: the model display name to use for the LLM. Defaults to None.

  • use_magpie_template: a flag used to enable/disable applying the Magpie pre-query template. Defaults to False.

  • magpie_pre_query_template: the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None.

  • structured_output: a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput. Defaults to None.

"},{"location":"components-gallery/llms/inferenceendpointsllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/inferenceendpointsllm/#free-serverless-inference-api-set-the-input_batch_size-of-the-task-that-uses-this-to-avoid-model-is-overloaded","title":"Free serverless Inference API, set the input_batch_size of the Task that uses this to avoid Model is overloaded","text":"
from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3.1-70B-Instruct\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/inferenceendpointsllm/#dedicated-inference-endpoints","title":"Dedicated Inference Endpoints","text":"
from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    endpoint_name=\"<ENDPOINT_NAME>\",\n    api_key=\"<HF_API_KEY>\",\n    endpoint_namespace=\"<USER|ORG>\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/inferenceendpointsllm/#dedicated-inference-endpoints-or-tgi","title":"Dedicated Inference Endpoints or TGI","text":"
from distilabel.models.llms.huggingface import InferenceEndpointsLLM\n\nllm = InferenceEndpointsLLM(\n    api_key=\"<HF_API_KEY>\",\n    base_url=\"<BASE_URL>\",\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/inferenceendpointsllm/#generate-structured-data","title":"Generate structured data","text":"
from pydantic import BaseModel\nfrom distilabel.models.llms import InferenceEndpointsLLM\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = InferenceEndpointsLLM(\n    model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    api_key=\"api.key\",\n    structured_output={\"format\": \"json\", \"schema\": User.model_json_schema()}\n)\n\nllm.load()\n\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the Tour De France\"}]])\n
"},{"location":"components-gallery/llms/litellm/","title":"LiteLLM","text":"

LiteLLM implementation running the async API client.

"},{"location":"components-gallery/llms/litellm/#attributes","title":"Attributes","text":"
  • model: the model name to use for the LLM e.g. \"gpt-3.5-turbo\" or \"mistral/mistral-large\", etc.

  • verbose: whether to log the LiteLLM client's logs. Defaults to False.

  • structured_output: a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

"},{"location":"components-gallery/llms/litellm/#runtime-parameters","title":"Runtime Parameters","text":"
  • verbose: whether to log the LiteLLM client's logs. Defaults to False.
"},{"location":"components-gallery/llms/litellm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/litellm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import LiteLLM\n\nllm = LiteLLM(model=\"gpt-3.5-turbo\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n
"},{"location":"components-gallery/llms/mistralllm/","title":"MistralLLM","text":"

Mistral LLM implementation running the async API client.

"},{"location":"components-gallery/llms/mistralllm/#attributes","title":"Attributes","text":"
  • model: the model name to use for the LLM e.g. \"mistral-tiny\", \"mistral-large\", etc.

  • endpoint: the endpoint to use for the Mistral API. Defaults to \"https://api.mistral.ai\".

  • api_key: the API key to authenticate the requests to the Mistral API. Defaults to None which means that the value set for the environment variable MISTRAL_API_KEY will be used, or None if not set.

  • max_retries: the maximum number of retries to attempt when a request fails. Defaults to 5.

  • timeout: the maximum time in seconds to wait for a response. Defaults to 120.

  • max_concurrent_requests: the maximum number of concurrent requests to send. Defaults to 64.

  • structured_output: a dictionary containing the structured output configuration using instructor. You can take a look at the dictionary structure in InstructorStructuredOutputType from distilabel.steps.tasks.structured_outputs.instructor.

  • _api_key_env_var: the name of the environment variable to use for the API key. It is meant to be used internally.

  • _aclient: the Mistral client to use for the Mistral API. It is meant to be used internally. Set in the load method.

"},{"location":"components-gallery/llms/mistralllm/#runtime-parameters","title":"Runtime Parameters","text":"
  • api_key: the API key to authenticate the requests to the Mistral API.

  • max_retries: the maximum number of retries to attempt when a request fails. Defaults to 5.

  • timeout: the maximum time in seconds to wait for a response. Defaults to 120.

  • max_concurrent_requests: the maximum number of concurrent requests to send. Defaults to 64.

"},{"location":"components-gallery/llms/mistralllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/mistralllm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import MistralLLM\n\nllm = MistralLLM(model=\"open-mixtral-8x22b\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n\nGenerate structured data:\n
"},{"location":"components-gallery/llms/mixtureofagentsllm/","title":"MixtureOfAgentsLLM","text":"

Mixture-of-Agents implementation.

An LLM class that leverages LLMs' collective strengths to generate a response, as described in the \"Mixture-of-Agents Enhances Large Language Model Capabilities\" paper. There is a list of LLMs proposing/generating outputs that LLMs from the next round/layer can use as auxiliary information. Finally, there is an LLM that aggregates the outputs to generate the final response.

"},{"location":"components-gallery/llms/mixtureofagentsllm/#attributes","title":"Attributes","text":"
  • aggregator_llm: The LLM that aggregates the outputs of the proposer LLMs.

  • proposers_llms: The list of LLMs that propose outputs to be aggregated.

  • rounds: The number of layers or rounds that the proposers_llms will generate outputs. Defaults to 1.

"},{"location":"components-gallery/llms/mixtureofagentsllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/mixtureofagentsllm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import MixtureOfAgentsLLM, InferenceEndpointsLLM\n\nllm = MixtureOfAgentsLLM(\n    aggregator_llm=InferenceEndpointsLLM(\n        model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n    ),\n    proposers_llms=[\n        InferenceEndpointsLLM(\n            model_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n            tokenizer_id=\"meta-llama/Meta-Llama-3-70B-Instruct\",\n        ),\n        InferenceEndpointsLLM(\n            model_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n            tokenizer_id=\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO\",\n        ),\n        InferenceEndpointsLLM(\n            model_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n            tokenizer_id=\"HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1\",\n        ),\n    ],\n    rounds=2,\n)\n\nllm.load()\n\noutput = llm.generate_outputs(\n    inputs=[\n        [\n            {\n                \"role\": \"user\",\n                \"content\": \"My favorite witty review of The Rings of Power series is this: Input:\",\n            }\n        ]\n    ]\n)\n
"},{"location":"components-gallery/llms/mixtureofagentsllm/#references","title":"References","text":"
  • Mixture-of-Agents Enhances Large Language Model Capabilities
"},{"location":"components-gallery/llms/ollamallm/","title":"OllamaLLM","text":"

Ollama LLM implementation running the Async API client.

"},{"location":"components-gallery/llms/ollamallm/#attributes","title":"Attributes","text":"
  • model: the model name to use for the LLM e.g. \"notus\".

  • host: the Ollama server host.

  • timeout: the timeout for the LLM. Defaults to 120.

  • _aclient: the AsyncClient to use for the Ollama API. It is meant to be used internally. Set in the load method.

"},{"location":"components-gallery/llms/ollamallm/#runtime-parameters","title":"Runtime Parameters","text":"
  • host: the Ollama server host.

  • timeout: the client timeout for the Ollama API. Defaults to 120.

"},{"location":"components-gallery/llms/ollamallm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/ollamallm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import OllamaLLM\n\nllm = OllamaLLM(model=\"llama3\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/vertexaillm/","title":"VertexAILLM","text":"

VertexAI LLM implementation running the async API clients for Gemini.

  • Gemini API: https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini

    To use the VertexAILLM it is necessary to have configured the Google Cloud authentication using one of these methods:

    • Setting GOOGLE_CLOUD_CREDENTIALS environment variable
    • Using gcloud auth application-default login command
    • Using vertexai.init function from the google-cloud-aiplatform library
"},{"location":"components-gallery/llms/vertexaillm/#attributes","title":"Attributes","text":"
  • model: the model name to use for the LLM e.g. \"gemini-1.0-pro\". Supported models.

  • _aclient: the GenerativeModel to use for the Vertex AI Gemini API. It is meant to be used internally. Set in the load method.

"},{"location":"components-gallery/llms/vertexaillm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/vertexaillm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import VertexAILLM\n\nllm = VertexAILLM(model=\"gemini-1.5-pro\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/transformersllm/","title":"TransformersLLM","text":"

Hugging Face transformers library LLM implementation using the text generation pipeline.

"},{"location":"components-gallery/llms/transformersllm/#attributes","title":"Attributes","text":"
  • model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

  • revision: if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\".

  • torch_dtype: the torch dtype to use for the model e.g. \"float16\", \"float32\", etc. Defaults to \"auto\".

  • trust_remote_code: whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False.

  • model_kwargs: additional dictionary of keyword arguments that will be passed to the from_pretrained method of the model.

  • tokenizer: the tokenizer Hugging Face Hub repo id or a path to a directory containing the tokenizer config files. If not provided, the one associated to the model will be used. Defaults to None.

  • use_fast: whether to use a fast tokenizer or not. Defaults to True.

  • chat_template: a chat template that will be used to build the prompts before sending them to the model. If not provided, the chat template defined in the tokenizer config will be used. If not provided and the tokenizer doesn't have a chat template, then ChatML template will be used. Defaults to None.

  • device: the name or index of the device where the model will be loaded. Defaults to None.

  • device_map: a dictionary mapping each layer of the model to a device, or a mode like \"sequential\" or \"auto\". Defaults to None.

  • token: the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment or huggingface_hub package local configuration will be used. Defaults to None.

  • structured_output: a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput. Defaults to None.

  • use_magpie_template: a flag used to enable/disable applying the Magpie pre-query template. Defaults to False.

  • magpie_pre_query_template: the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None.

"},{"location":"components-gallery/llms/transformersllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/transformersllm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import TransformersLLM\n\nllm = TransformersLLM(model=\"microsoft/Phi-3-mini-4k-instruct\")\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/llamacppllm/","title":"LlamaCppLLM","text":"

llama.cpp LLM implementation running the Python bindings for the C++ code.

"},{"location":"components-gallery/llms/llamacppllm/#attributes","title":"Attributes","text":"
  • model_path: contains the path to the GGUF quantized model, compatible with the installed version of the llama.cpp Python bindings.

  • n_gpu_layers: the number of layers to use for the GPU. Defaults to -1, meaning that the available GPU device will be used.

  • chat_format: the chat format to use for the model. Defaults to None, which means the Llama format will be used.

  • n_ctx: the context size to use for the model. Defaults to 512.

  • n_batch: the prompt processing maximum batch size to use for the model. Defaults to 512.

  • seed: random seed to use for the generation. Defaults to 4294967295.

  • verbose: whether to print verbose output. Defaults to False.

  • structured_output: a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput. Defaults to None.

  • extra_kwargs: additional dictionary of keyword arguments that will be passed to the Llama class of llama_cpp library. Defaults to {}.

  • _model: the Llama model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method.

"},{"location":"components-gallery/llms/llamacppllm/#runtime-parameters","title":"Runtime Parameters","text":"
  • model_path: the path to the GGUF quantized model.

  • n_gpu_layers: the number of layers to use for the GPU. Defaults to -1.

  • chat_format: the chat format to use for the model. Defaults to None.

  • verbose: whether to print verbose output. Defaults to False.

  • extra_kwargs: additional dictionary of keyword arguments that will be passed to the Llama class of llama_cpp library. Defaults to {}.

"},{"location":"components-gallery/llms/llamacppllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/llamacppllm/#generate-text","title":"Generate text","text":"
from pathlib import Path\nfrom distilabel.models.llms import LlamaCppLLM\n\n# You can follow along this example downloading the following model running the following\n# command in the terminal, that will download the model to the `Downloads` folder:\n# curl -L -o ~/Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nllm = LlamaCppLLM(\n    model_path=str(Path.home() / model_path),\n    n_gpu_layers=-1,  # To use the GPU if available\n    n_ctx=1024,       # Set the context size\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/llamacppllm/#generate-structured-data","title":"Generate structured data","text":"
from pathlib import Path\nfrom distilabel.models.llms import LlamaCppLLM\nfrom pydantic import BaseModel\n\nmodel_path = \"Downloads/openhermes-2.5-mistral-7b.Q4_K_M.gguf\"\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = LlamaCppLLM(\n    model_path=str(Path.home() / model_path),  # type: ignore\n    n_gpu_layers=-1,\n    n_ctx=1024,\n    structured_output={\"format\": \"json\", \"schema\": User},\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
"},{"location":"components-gallery/llms/llamacppllm/#references","title":"References","text":"
  • llama.cpp

  • llama-cpp-python

"},{"location":"components-gallery/llms/vllm/","title":"vLLM","text":"

vLLM library LLM implementation.

"},{"location":"components-gallery/llms/vllm/#attributes","title":"Attributes","text":"
  • model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

  • dtype: the data type to use for the model. Defaults to auto.

  • trust_remote_code: whether to trust the remote code when loading the model. Defaults to False.

  • quantization: the quantization mode to use for the model. Defaults to None.

  • revision: the revision of the model to load. Defaults to None.

  • tokenizer: the tokenizer Hugging Face Hub repo id or a path to a directory containing the tokenizer files. If not provided, the tokenizer will be loaded from the model directory. Defaults to None.

  • tokenizer_mode: the mode to use for the tokenizer. Defaults to auto.

  • tokenizer_revision: the revision of the tokenizer to load. Defaults to None.

  • skip_tokenizer_init: whether to skip the initialization of the tokenizer. Defaults to False.

  • chat_template: a chat template that will be used to build the prompts before sending them to the model. If not provided, the chat template defined in the tokenizer config will be used. If not provided and the tokenizer doesn't have a chat template, then ChatML template will be used. Defaults to None.

  • structured_output: a dictionary containing the structured output configuration or if more fine-grained control is needed, an instance of OutlinesStructuredOutput. Defaults to None.

  • seed: the seed to use for the random number generator. Defaults to 0.

  • extra_kwargs: additional dictionary of keyword arguments that will be passed to the LLM class of vllm library. Defaults to {}.

  • _model: the vLLM model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method.

  • _tokenizer: the tokenizer instance used to format the prompt before passing it to the LLM. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method.

  • use_magpie_template: a flag used to enable/disable applying the Magpie pre-query template. Defaults to False.

  • magpie_pre_query_template: the pre-query template to be applied to the prompt or sent to the LLM to generate an instruction or a follow up user message. Valid values are \"llama3\", \"qwen2\" or another pre-query template provided. Defaults to None.

"},{"location":"components-gallery/llms/vllm/#runtime-parameters","title":"Runtime Parameters","text":"
  • extra_kwargs: additional dictionary of keyword arguments that will be passed to the LLM class of vllm library.
"},{"location":"components-gallery/llms/vllm/#examples","title":"Examples","text":""},{"location":"components-gallery/llms/vllm/#generate-text","title":"Generate text","text":"
from distilabel.models.llms import vLLM\n\n# You can pass a custom chat_template to the model\nllm = vLLM(\n    model=\"prometheus-eval/prometheus-7b-v2.0\",\n    chat_template=\"[INST] {{ messages[0]['content'] }}\\n{{ messages[1]['content'] }}[/INST]\",\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Hello world!\"}]])\n
"},{"location":"components-gallery/llms/vllm/#generate-structured-data","title":"Generate structured data","text":"
from pathlib import Path\nfrom distilabel.models.llms import vLLM\nfrom pydantic import BaseModel\n\nclass User(BaseModel):\n    name: str\n    last_name: str\n    id: int\n\nllm = vLLM(\n    model=\"prometheus-eval/prometheus-7b-v2.0\",\n    structured_output={\"format\": \"json\", \"schema\": User},\n)\n\nllm.load()\n\n# Call the model\noutput = llm.generate_outputs(inputs=[[{\"role\": \"user\", \"content\": \"Create a user profile for the following marathon\"}]])\n
"},{"location":"components-gallery/embeddings/","title":"Embeddings Gallery","text":"
  • SentenceTransformerEmbeddings

    sentence-transformers library implementation for embedding generation.

    SentenceTransformerEmbeddings

  • vLLMEmbeddings

    vllm library implementation for embedding generation.

    vLLMEmbeddings

"},{"location":"components-gallery/embeddings/sentencetransformerembeddings/","title":"SentenceTransformerEmbeddings","text":"

sentence-transformers library implementation for embedding generation.

"},{"location":"components-gallery/embeddings/sentencetransformerembeddings/#attributes","title":"Attributes","text":"
  • model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

  • device: the name of the device used to load the model e.g. \"cuda\", \"mps\", etc. Defaults to None.

  • prompts: a dictionary containing prompts to be used with the model. Defaults to None.

  • default_prompt_name: the default prompt (in prompts) that will be applied to the inputs. If not provided, then no prompt will be used. Defaults to None.

  • trust_remote_code: whether to allow fetching and executing remote code fetched from the repository in the Hub. Defaults to False.

  • revision: if model refers to a Hugging Face Hub repository, then the revision (e.g. a branch name or a commit id) to use. Defaults to \"main\".

  • token: the Hugging Face Hub token that will be used to authenticate to the Hugging Face Hub. If not provided, the HF_TOKEN environment variable or the huggingface_hub package local configuration will be used. Defaults to None.

  • truncate_dim: the dimension to truncate the sentence embeddings. Defaults to None.

  • model_kwargs: extra kwargs that will be passed to the Hugging Face transformers model class. Defaults to None.

  • tokenizer_kwargs: extra kwargs that will be passed to the Hugging Face transformers tokenizer class. Defaults to None.

  • config_kwargs: extra kwargs that will be passed to the Hugging Face transformers configuration class. Defaults to None.

  • precision: the dtype of the resulting embeddings. Defaults to \"float32\".

  • normalize_embeddings: whether to normalize the embeddings so they have a length of 1. Defaults to None.

"},{"location":"components-gallery/embeddings/sentencetransformerembeddings/#examples","title":"Examples","text":""},{"location":"components-gallery/embeddings/sentencetransformerembeddings/#generating-sentence-embeddings","title":"Generating sentence embeddings","text":"
from distilabel.models import SentenceTransformerEmbeddings\n\nembeddings = SentenceTransformerEmbeddings(model=\"mixedbread-ai/mxbai-embed-large-v1\")\n\nembeddings.load()\n\nresults = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n# [\n#   [-0.05447685346007347, -0.01623094454407692, ...],\n#   [4.4889533455716446e-05, 0.044016145169734955, ...],\n# ]\n
"},{"location":"components-gallery/embeddings/vllmembeddings/","title":"vLLMEmbeddings","text":"

vllm library implementation for embedding generation.

"},{"location":"components-gallery/embeddings/vllmembeddings/#attributes","title":"Attributes","text":"
  • model: the model Hugging Face Hub repo id or a path to a directory containing the model weights and configuration files.

  • dtype: the data type to use for the model. Defaults to auto.

  • trust_remote_code: whether to trust the remote code when loading the model. Defaults to False.

  • quantization: the quantization mode to use for the model. Defaults to None.

  • revision: the revision of the model to load. Defaults to None.

  • enforce_eager: whether to enforce eager execution. Defaults to True.

  • seed: the seed to use for the random number generator. Defaults to 0.

  • extra_kwargs: additional dictionary of keyword arguments that will be passed to the LLM class of vllm library. Defaults to {}.

  • _model: the vLLM model instance. This attribute is meant to be used internally and should not be accessed directly. It will be set in the load method.

"},{"location":"components-gallery/embeddings/vllmembeddings/#examples","title":"Examples","text":""},{"location":"components-gallery/embeddings/vllmembeddings/#generating-sentence-embeddings","title":"Generating sentence embeddings","text":"
from distilabel.models import vLLMEmbeddings\n\nembeddings = vLLMEmbeddings(model=\"intfloat/e5-mistral-7b-instruct\")\n\nembeddings.load()\n\nresults = embeddings.encode(inputs=[\"distilabel is awesome!\", \"and Argilla!\"])\n# [\n#   [-0.05447685346007347, -0.01623094454407692, ...],\n#   [4.4889533455716446e-05, 0.044016145169734955, ...],\n# ]\n
"},{"location":"components-gallery/embeddings/vllmembeddings/#references","title":"References","text":"
  • Offline inference embeddings
"}]} \ No newline at end of file