Skip to content

Commit

Permalink
Fix how quotes are parsed in json (#2189)
Browse files Browse the repository at this point in the history
## Description

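When an agent responds with JSON whose field values contain unescaped double
quotes, parsing the response into the structured output model fails. This
change moves the parsing into a shared `parse_structured_output` helper that
extracts the JSON from markdown code blocks, cleans it, and escapes quotes
inside field values before validating it.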

Fixes #2128
  • Loading branch information
dirkbrnd authored Feb 21, 2025
1 parent f0affe2 commit a2070cb
Show file tree
Hide file tree
Showing 4 changed files with 213 additions and 158 deletions.
41 changes: 5 additions & 36 deletions libs/agno/agno/agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from agno.utils.log import logger, set_log_level_to_debug, set_log_level_to_info
from agno.utils.message import get_text_from_message
from agno.utils.safe_formatter import SafeFormatter
from agno.utils.string import parse_structured_output
from agno.utils.timer import Timer


Expand Down Expand Up @@ -812,23 +813,7 @@ def run(
# Otherwise convert the response to the structured format
if isinstance(run_response.content, str):
try:
from pydantic import ValidationError

structured_output = None
try:
structured_output = self.response_model.model_validate_json(run_response.content)
except ValidationError:
# Check if response starts with ```json
if run_response.content.startswith("```json"):
run_response.content = run_response.content.replace("```json\n", "").replace(
"\n```", ""
)
try:
structured_output = self.response_model.model_validate_json(
run_response.content
)
except Exception as e:
logger.warning(f"Failed to convert response to pydantic model: {e}")
structured_output = parse_structured_output(run_response.content, self.response_model)

# Update RunResponse
if structured_output is not None:
Expand Down Expand Up @@ -1218,23 +1203,7 @@ async def arun(
# Otherwise convert the response to the structured format
if isinstance(run_response.content, str):
try:
from pydantic import ValidationError

structured_output = None
try:
structured_output = self.response_model.model_validate_json(run_response.content)
except ValidationError:
# Check if response starts with ```json
if run_response.content.startswith("```json"):
run_response.content = run_response.content.replace("```json\n", "").replace(
"\n```", ""
)
try:
structured_output = self.response_model.model_validate_json(
run_response.content
)
except Exception as e:
logger.warning(f"Failed to convert response to pydantic model: {e}")
structured_output = parse_structured_output(run_response.content, self.response_model)

# Update RunResponse
if structured_output is not None:
Expand Down Expand Up @@ -2509,9 +2478,9 @@ def convert_context_to_string(self, context: Dict[str, Any]) -> str:
if context is None:
return ""

try:
import json
import json

try:
return json.dumps(context, indent=2, default=str)
except (TypeError, ValueError, OverflowError) as e:
logger.warning(f"Failed to convert context to JSON: {e}")
Expand Down
110 changes: 53 additions & 57 deletions libs/agno/agno/utils/string.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import hashlib
import json
from typing import Any, Dict, Optional
import re
from typing import Optional, Type

from pydantic import BaseModel, ValidationError

from agno.utils.log import logger


def hash_string_sha256(input_string):
Expand All @@ -19,60 +24,51 @@ def hash_string_sha256(input_string):
return hex_digest


def extract_valid_json(content: str) -> Optional[Dict[str, Any]]:
    """
    Extract the first valid JSON object embedded in a string.

    Each "{" in ``content`` is tried, from left to right, as the start of a
    JSON document. Parsing is delegated to ``json.JSONDecoder.raw_decode``,
    which stops at the end of the document it decodes, so braces that appear
    inside string values (e.g. ``{"a": "}"}``) do not confuse the search and
    an unbalanced "{" earlier in the text does not hide a valid object that
    follows it.

    Args:
        content (str): The input string containing potential JSON data.

    Returns:
        Optional[Dict[str, Any]]: The first JSON object that parses
        successfully, or None when the string contains no valid JSON object.
    """
    decoder = json.JSONDecoder()

    start_idx = content.find("{")
    while start_idx != -1:
        try:
            parsed, _ = decoder.raw_decode(content, start_idx)
        except json.JSONDecodeError:
            # No valid JSON document starts at this brace; try the next one.
            parsed = None

        if isinstance(parsed, dict):
            return parsed

        # Move just past the current opening brace to look for another candidate
        start_idx = content.find("{", start_idx + 1)

    return None
# Markdown emphasis/code/heading markers wrapped directly around a quoted token, e.g. **"name"**.
_MARKDOWN_WRAPPED_TOKEN_RE = re.compile(r'[*`#]*"([A-Za-z0-9_]+)"[*`#]*')
# ASCII control characters; none of them may appear unescaped inside a JSON string.
_CONTROL_CHARS_RE = re.compile(r"[\x00-\x1F\x7F]")
# A `"key": "value` pair; the lazy value stops at the first quote followed by "," or "}".
_STRING_FIELD_RE = re.compile(r'"(?P<key>[^"]+)"\s*:\s*"(?P<value>.*?)(?="\s*(?:,|\}))')
# Either an existing escape sequence (left alone) or a bare double quote (needs escaping).
_ESCAPE_OR_QUOTE_RE = re.compile(r'\\.|"')


def _strip_code_fence(content: str) -> str:
    """Return the payload of a markdown code block, or ``content`` unchanged if there is none."""
    if "```json" in content:
        return content.split("```json")[-1].split("```")[0].strip()
    if "```" in content:
        return content.split("```")[1].strip()
    return content


def _escape_quotes_in_value(match: "re.Match[str]") -> str:
    """Escape bare double quotes in the value of a matched `"key": "value` pair."""
    key = match.group("key")
    value = match.group("value")
    # Keep sequences that are already escaped (such as \" or \\) so they are
    # not escaped a second time; only bare quotes get a backslash.
    escaped_value = _ESCAPE_OR_QUOTE_RE.sub(
        lambda m: '\\"' if m.group(0) == '"' else m.group(0),
        value,
    )
    return f'"{key}": "{escaped_value}'


def _clean_json_content(content: str) -> str:
    """Repair common defects in model-generated JSON text so that it can be parsed."""
    # Remove markdown markers wrapped around quoted tokens only. Stripping
    # these characters everywhere would corrupt snake_case keys and values.
    content = _MARKDOWN_WRAPPED_TOKEN_RE.sub(r'"\1"', content)

    # Handle newlines and control characters
    content = content.replace("\n", " ").replace("\r", "")
    content = _CONTROL_CHARS_RE.sub("", content)

    # Escape quotes only in values, not keys
    return _STRING_FIELD_RE.sub(_escape_quotes_in_value, content)


def parse_structured_output(content: str, response_model: Type[BaseModel]) -> Optional[BaseModel]:
    """
    Convert a model response string into an instance of ``response_model``.

    Increasingly aggressive strategies are tried, and the first one that
    validates wins:

    1. Validate ``content`` directly as JSON.
    2. Validate the payload of a markdown code block, if one is present.
    3. Validate the payload after cleanup (markdown markers around quoted
       tokens removed, newlines/control characters removed, bare double
       quotes inside string values escaped).
    4. Decode the cleaned payload with ``json.loads`` and validate the
       resulting Python object.

    Args:
        content (str): The raw response text.
        response_model (Type[BaseModel]): The pydantic model to validate against.

    Returns:
        Optional[BaseModel]: The validated model instance, or None when every
        strategy fails. Failures of the last two strategies are logged as
        warnings; no exception is raised for unparsable content.
    """
    # First attempt: direct JSON validation
    try:
        return response_model.model_validate_json(content)
    except (ValidationError, json.JSONDecodeError):
        # Expected for fenced or malformed responses; fall through to the next strategy.
        pass

    # Second attempt: unwrap a markdown code block before any lossy cleanup,
    # so that JSON which is valid apart from the fence is left untouched.
    unfenced = _strip_code_fence(content)
    if unfenced != content:
        try:
            return response_model.model_validate_json(unfenced)
        except (ValidationError, json.JSONDecodeError):
            # Still not valid; fall through to the cleanup strategy.
            pass

    # Third attempt: clean the JSON string and validate it
    cleaned = _clean_json_content(unfenced)
    try:
        return response_model.model_validate_json(cleaned)
    except (ValidationError, json.JSONDecodeError) as e:
        logger.warning(f"Failed to parse cleaned JSON: {e}")

    # Final attempt: Try parsing as Python dict
    try:
        return response_model.model_validate(json.loads(cleaned))
    except (ValidationError, json.JSONDecodeError) as e:
        logger.warning(f"Failed to parse as Python dict: {e}")

    return None
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from typing import List

from pydantic import BaseModel, Field

from agno.agent import Agent, RunResponse # noqa
from agno.models.openai.chat import OpenAIChat # noqa


def test_structured_output_parsing_with_quotes():
    """Check that a response containing double-quoted dialog is parsed into the response model."""

    # Response schema handed to the agent. No docstring is added here on
    # purpose, so the model definition stays exactly as it was.
    class MovieScript(BaseModel):
        script: str = Field(..., description="The script of the movie.")
        name: str = Field(..., description="Give a name to this movie")
        characters: List[str] = Field(..., description="Name of characters for this movie.")

    # The description asks for dialog in double quotes, which is the kind of
    # content that puts quote characters inside the JSON field values.
    agent = Agent(
        model=OpenAIChat(id="gpt-4o-mini"),
        description="You help people write movie scripts. Always add some example dialog in your scripts in double quotes.",
        response_model=MovieScript,
    )

    result: RunResponse = agent.run("New York")

    # The content must have been converted into the structured model ...
    parsed = result.content
    assert isinstance(parsed, MovieScript)
    # ... and each field must be populated.
    assert parsed.script is not None
    assert parsed.name is not None
    assert parsed.characters is not None
Loading

0 comments on commit a2070cb

Please sign in to comment.