Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix how quotes are parsed in json #2189

Merged
merged 10 commits into from
Feb 21, 2025
41 changes: 5 additions & 36 deletions libs/agno/agno/agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from agno.utils.log import logger, set_log_level_to_debug, set_log_level_to_info
from agno.utils.message import get_text_from_message
from agno.utils.safe_formatter import SafeFormatter
from agno.utils.string import parse_structured_output
from agno.utils.timer import Timer


Expand Down Expand Up @@ -812,23 +813,7 @@ def run(
# Otherwise convert the response to the structured format
if isinstance(run_response.content, str):
try:
from pydantic import ValidationError

structured_output = None
try:
structured_output = self.response_model.model_validate_json(run_response.content)
except ValidationError:
# Check if response starts with ```json
if run_response.content.startswith("```json"):
run_response.content = run_response.content.replace("```json\n", "").replace(
"\n```", ""
)
try:
structured_output = self.response_model.model_validate_json(
run_response.content
)
except Exception as e:
logger.warning(f"Failed to convert response to pydantic model: {e}")
structured_output = parse_structured_output(run_response.content, self.response_model)

# Update RunResponse
if structured_output is not None:
Expand Down Expand Up @@ -1218,23 +1203,7 @@ async def arun(
# Otherwise convert the response to the structured format
if isinstance(run_response.content, str):
try:
from pydantic import ValidationError

structured_output = None
try:
structured_output = self.response_model.model_validate_json(run_response.content)
except ValidationError:
# Check if response starts with ```json
if run_response.content.startswith("```json"):
run_response.content = run_response.content.replace("```json\n", "").replace(
"\n```", ""
)
try:
structured_output = self.response_model.model_validate_json(
run_response.content
)
except Exception as e:
logger.warning(f"Failed to convert response to pydantic model: {e}")
structured_output = parse_structured_output(run_response.content, self.response_model)

# Update RunResponse
if structured_output is not None:
Expand Down Expand Up @@ -2508,9 +2477,9 @@ def convert_context_to_string(self, context: Dict[str, Any]) -> str:
if context is None:
return ""

try:
import json
import json

try:
return json.dumps(context, indent=2, default=str)
except (TypeError, ValueError, OverflowError) as e:
logger.warning(f"Failed to convert context to JSON: {e}")
Expand Down
110 changes: 53 additions & 57 deletions libs/agno/agno/utils/string.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import hashlib
import json
from typing import Any, Dict, Optional
import re
from typing import Optional, Type

from pydantic import BaseModel, ValidationError

from agno.utils.log import logger


def hash_string_sha256(input_string):
Expand All @@ -19,60 +24,51 @@ def hash_string_sha256(input_string):
return hex_digest


def extract_valid_json(content: str) -> Optional[Dict[str, Any]]:
"""
Extract the first valid JSON object from a string and return the JSON object
along with the rest of the string without the JSON.

Args:
content (str): The input string containing potential JSON data.

Returns:
Tuple[Optional[Dict[str, Any]], str]:
- Extracted JSON dictionary if valid, else None.
- The rest of the string without the extracted JSON.
"""
search_start = 0
while True:
# Find the next opening brace
start_idx = content.find("{", search_start)
if start_idx == -1:
# No more '{' found; stop searching
return None

# Track brace depth
brace_depth = 0
# This will store the end of the matching closing brace once found
end_idx = None

for i in range(start_idx, len(content)):
char = content[i]
if char == "{":
brace_depth += 1
elif char == "}":
brace_depth -= 1

# If brace_depth returns to 0, we’ve found a potential JSON substring
if brace_depth == 0:
end_idx = i
break

# If we never returned to depth 0, it means we couldn't find a matching '}'
if end_idx is None:
return None

# Extract the candidate substring
candidate = content[start_idx : end_idx + 1]

# Try to parse it
def parse_structured_output(content: str, response_model: Type[BaseModel]) -> Optional[BaseModel]:
    """Parse a model response string into an instance of ``response_model``.

    The parser gets progressively more lenient and stops at the first attempt
    that validates:

    1. Validate ``content`` as-is.
    2. If ``content`` holds a markdown code fence, validate the fenced payload
       untouched, so well-formed JSON is never altered by the cleanup below.
    3. Apply a lossy cleanup (strip ``*``, backticks and ``#``; flatten
       newlines; drop control characters; escape stray double quotes inside
       string values) and validate the result.
    4. ``json.loads`` the cleaned text and validate the resulting object.

    Args:
        content: Raw text returned by the model.
        response_model: Pydantic model class to validate against.

    Returns:
        The validated model instance, or ``None`` if every attempt failed.
        Failures of attempts 3 and 4 are logged as warnings, never raised.
    """
    try:
        # First attempt: direct JSON validation
        return response_model.model_validate_json(content)
    except (ValidationError, json.JSONDecodeError):
        pass

    # Second attempt: extract JSON from markdown code blocks
    if "```json" in content:
        candidate = content.split("```json")[-1].split("```")[0].strip()
    elif "```" in content:
        candidate = content.split("```")[1].strip()
    else:
        candidate = content

    if candidate != content:
        try:
            # Validate the fenced payload before any lossy cleanup touches it
            return response_model.model_validate_json(candidate)
        except (ValidationError, json.JSONDecodeError):
            pass

    # Remove markdown formatting. Underscores are deliberately kept: they are
    # part of snake_case JSON keys and stripping them breaks field matching.
    cleaned = re.sub(r"[*`#]", "", candidate)

    # Handle newlines and control characters
    cleaned = cleaned.replace("\n", " ").replace("\r", "")
    cleaned = re.sub(r"[\x00-\x1F\x7F]", "", cleaned)

    # Escape quotes only in values, not keys
    def escape_quotes_in_values(match):
        key = match.group("key")
        value = match.group("value")
        # Skip quotes that are already escaped so valid JSON is not corrupted
        escaped_value = re.sub(r'(?<!\\)"', r'\\"', value)
        # The closing quote is left to the lookahead in the pattern below
        return f'"{key}": "{escaped_value}'

    # Find and escape quotes in field values
    cleaned = re.sub(
        r'"(?P<key>[^"]+)"\s*:\s*"(?P<value>.*?)(?="\s*(?:,|\}))',
        escape_quotes_in_values,
        cleaned,
    )

    try:
        # Third attempt: validate the cleaned JSON
        return response_model.model_validate_json(cleaned)
    except (ValidationError, json.JSONDecodeError) as json_error:
        logger.warning(f"Failed to parse cleaned JSON: {json_error}")

    try:
        # Final attempt: Try parsing as Python dict
        data = json.loads(cleaned)
        return response_model.model_validate(data)
    except (ValidationError, json.JSONDecodeError) as dict_error:
        logger.warning(f"Failed to parse as Python dict: {dict_error}")

    return None
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from typing import List

from pydantic import BaseModel, Field

from agno.agent import Agent, RunResponse # noqa
from agno.models.openai.chat import OpenAIChat # noqa


def test_structured_output_parsing_with_quotes():
    """Check that an agent response containing double-quoted dialog still
    parses into the requested pydantic response model.

    NOTE(review): presumably needs access to the configured OpenAI model to
    run - confirm before adding to an offline test suite.
    """

    class MovieScript(BaseModel):
        script: str = Field(..., description="The script of the movie.")
        name: str = Field(..., description="Give a name to this movie")
        characters: List[str] = Field(..., description="Name of characters for this movie.")

    agent = Agent(
        model=OpenAIChat(id="gpt-4o-mini"),
        description="You help people write movie scripts. Always add some example dialog in your scripts in double quotes.",
        response_model=MovieScript,
    )

    # Run the agent and keep the full run response
    result: RunResponse = agent.run("New York")
    parsed = result.content

    # The content must have been converted into the structured model
    assert isinstance(parsed, MovieScript)
    for field_name in ("script", "name", "characters"):
        assert getattr(parsed, field_name) is not None
Loading