From d7e61b5219c30a0bd287d46cd85a90ca937f4a45 Mon Sep 17 00:00:00 2001 From: David Berenstein Date: Mon, 23 Sep 2024 12:54:10 +0200 Subject: [PATCH] Fix schema inference structured generation (#994) * fix: converting ModelMetaclass to model_json_schema * fix: allow for adding optional literal format json to instructor to make methods more interchangeable * docs: emphasize usability with any framework * fix: first check if structured_output has been defined * Update docs/sections/how_to_guides/advanced/structured_generation.md Co-authored-by: Agus --------- Co-authored-by: Agus --- .../how_to_guides/advanced/structured_generation.md | 11 ++++++----- .../llms/huggingface/inference_endpoints.py | 7 +++++++ src/distilabel/steps/tasks/typing.py | 2 ++ 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/docs/sections/how_to_guides/advanced/structured_generation.md b/docs/sections/how_to_guides/advanced/structured_generation.md index 02b8cc8e4a..6f907951c1 100644 --- a/docs/sections/how_to_guides/advanced/structured_generation.md +++ b/docs/sections/how_to_guides/advanced/structured_generation.md @@ -129,7 +129,7 @@ These were some simple examples, but one can see the options this opens. ## Instructor -When working with model providers behind an API, there's no direct way of accessing the internal logit processor like `outlines` does, but thanks to [`instructor`](https://python.useinstructor.com/) we can generate structured output from LLM providers based on `pydantic.BaseModel` objects. We have integrated `instructor` to deal with the [`AsyncLLM`][distilabel.llms.AsyncLLM], so you can work with the following LLMs: [`OpenAILLM`][distilabel.llms.OpenAILLM], [`AzureOpenAILLM`][distilabel.llms.AzureOpenAILLM], [`CohereLLM`][distilabel.llms.CohereLLM], [`GroqLLM`][distilabel.llms.GroqLLM], [`LiteLLM`][distilabel.llms.LiteLLM] and [`MistralLLM`][distilabel.llms.MistralLLM]. 
+For other LLM providers behind APIs, there's no direct way of accessing the internal logit processor like `outlines` does, but thanks to [`instructor`](https://python.useinstructor.com/) we can generate structured output from LLM providers based on `pydantic.BaseModel` objects. We have integrated `instructor` to deal with the [`AsyncLLM`][distilabel.llms.AsyncLLM]. !!! Note For `instructor` integration to work you may need to install the corresponding dependencies: @@ -155,14 +155,15 @@ class User(BaseModel): And then we provide that schema to the `structured_output` argument of the LLM: -!!! Note -    In this example we are using *open-mixtral-8x22b*, keep in mind not all the models work with the function calling functionality required for this example to work. +!!! Note +    In this example we are using *Meta Llama 3.1 8B Instruct*; keep in mind that not all models support structured outputs. ```python -from distilabel.llms import MistralLLM +from distilabel.llms import InferenceEndpointsLLM -llm = MistralLLM( -    model="open-mixtral-8x22b", +llm = InferenceEndpointsLLM( +    model_id="meta-llama/Meta-Llama-3.1-8B-Instruct", +    tokenizer_id="meta-llama/Meta-Llama-3.1-8B-Instruct", structured_output={"schema": User} ) llm.load() diff --git a/src/distilabel/llms/huggingface/inference_endpoints.py b/src/distilabel/llms/huggingface/inference_endpoints.py index 42cf1b4345..3566228f56 100644 --- a/src/distilabel/llms/huggingface/inference_endpoints.py +++ b/src/distilabel/llms/huggingface/inference_endpoints.py @@ -26,6 +26,7 @@ model_validator, validate_call, ) +from pydantic._internal._model_construction import ModelMetaclass from typing_extensions import Annotated, override from distilabel.llms.base import AsyncLLM @@ -363,6 +364,12 @@ def _get_structured_output( "the `structured_output` attribute." 
) from e + if structured_output: + if isinstance(structured_output["value"], ModelMetaclass): + structured_output["value"] = structured_output[ + "value" + ].model_json_schema() + return structured_output async def _generate_with_text_generation( diff --git a/src/distilabel/steps/tasks/typing.py b/src/distilabel/steps/tasks/typing.py index ae9fd9519e..920a94c3b9 100644 --- a/src/distilabel/steps/tasks/typing.py +++ b/src/distilabel/steps/tasks/typing.py @@ -49,6 +49,8 @@ class OutlinesStructuredOutputType(TypedDict, total=False): class InstructorStructuredOutputType(TypedDict, total=False): """TypedDict to represent the structured output configuration from `instructor`.""" + format: Optional[Literal["json"]] + """One of "json".""" schema: Union[Type[BaseModel], Dict[str, Any]] """The schema to use for the structured output, a `pydantic.BaseModel` class. """ mode: Optional[str]