From d7e61b5219c30a0bd287d46cd85a90ca937f4a45 Mon Sep 17 00:00:00 2001 From: David Berenstein Date: Mon, 23 Sep 2024 12:54:10 +0200 Subject: [PATCH] Fix schema inference structured generation (#994) * fix: converting ModelMetaclass to model_json_schema * fix: allow for adding optional literal format json to instructor to make methods more interchangeable * docs: emphasize usability with any framework * fix: first check if structured_output has been defined * Update docs/sections/how_to_guides/advanced/structured_generation.md Co-authored-by: Agus --------- Co-authored-by: Agus --- .../how_to_guides/advanced/structured_generation.md | 11 ++++++----- .../llms/huggingface/inference_endpoints.py | 7 +++++++ src/distilabel/steps/tasks/typing.py | 2 ++ 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/docs/sections/how_to_guides/advanced/structured_generation.md b/docs/sections/how_to_guides/advanced/structured_generation.md index 02b8cc8e4a..6f907951c1 100644 --- a/docs/sections/how_to_guides/advanced/structured_generation.md +++ b/docs/sections/how_to_guides/advanced/structured_generation.md @@ -129,7 +129,7 @@ These were some simple examples, but one can see the options this opens. ## Instructor -When working with model providers behind an API, there's no direct way of accessing the internal logit processor like `outlines` does, but thanks to [`instructor`](https://python.useinstructor.com/) we can generate structured output from LLM providers based on `pydantic.BaseModel` objects. We have integrated `instructor` to deal with the [`AsyncLLM`][distilabel.llms.AsyncLLM], so you can work with the following LLMs: [`OpenAILLM`][distilabel.llms.OpenAILLM], [`AzureOpenAILLM`][distilabel.llms.AzureOpenAILLM], [`CohereLLM`][distilabel.llms.CohereLLM], [`GroqLLM`][distilabel.llms.GroqLLM], [`LiteLLM`][distilabel.llms.LiteLLM] and [`MistralLLM`][distilabel.llms.MistralLLM]. 
+For other LLM providers behind APIs, there's no direct way of accessing the internal logit processor like `outlines` does, but thanks to [`instructor`](https://python.useinstructor.com/) we can generate structured output from LLM providers based on `pydantic.BaseModel` objects. We have integrated `instructor` to deal with the [`AsyncLLM`][distilabel.llms.AsyncLLM]. !!! Note For `instructor` integration to work you may need to install the corresponding dependencies: @@ -155,14 +155,15 @@ class User(BaseModel): And then we provide that schema to the `structured_output` argument of the LLM: -!!! Note -    In this example we are using *open-mixtral-8x22b*, keep in mind not all the models work with the function calling functionality required for this example to work. +!!! Note +    In this example we are using *Meta Llama 3.1 8B Instruct*; keep in mind that not all models support structured outputs. ```python -from distilabel.llms import MistralLLM +from distilabel.llms import InferenceEndpointsLLM -llm = MistralLLM( -    model="open-mixtral-8x22b", +llm = InferenceEndpointsLLM( +    model_id="meta-llama/Meta-Llama-3.1-8B-Instruct", +    tokenizer_id="meta-llama/Meta-Llama-3.1-8B-Instruct", structured_output={"schema": User} ) llm.load() diff --git a/src/distilabel/llms/huggingface/inference_endpoints.py b/src/distilabel/llms/huggingface/inference_endpoints.py index 42cf1b4345..3566228f56 100644 --- a/src/distilabel/llms/huggingface/inference_endpoints.py +++ b/src/distilabel/llms/huggingface/inference_endpoints.py @@ -26,6 +26,7 @@ model_validator, validate_call, ) +from pydantic._internal._model_construction import ModelMetaclass from typing_extensions import Annotated, override from distilabel.llms.base import AsyncLLM @@ -363,6 +364,12 @@ def _get_structured_output( "the `structured_output` attribute." 
) from e + if structured_output: + if isinstance(structured_output["value"], ModelMetaclass): + structured_output["value"] = structured_output[ + "value" + ].model_json_schema() + return structured_output async def _generate_with_text_generation( diff --git a/src/distilabel/steps/tasks/typing.py b/src/distilabel/steps/tasks/typing.py index ae9fd9519e..920a94c3b9 100644 --- a/src/distilabel/steps/tasks/typing.py +++ b/src/distilabel/steps/tasks/typing.py @@ -49,6 +49,8 @@ class OutlinesStructuredOutputType(TypedDict, total=False): class InstructorStructuredOutputType(TypedDict, total=False): """TypedDict to represent the structured output configuration from `instructor`.""" + format: Optional[Literal["json"]] + """One of "json".""" schema: Union[Type[BaseModel], Dict[str, Any]] """The schema to use for the structured output, a `pydantic.BaseModel` class. """ mode: Optional[str]