Skip to content

Commit

Permalink
Prevent Illegal Look-Around for OneOf in JSONSchema (#897)
Browse files Browse the repository at this point in the history
Fixes #823

This comment details the error behind the issue:
#823 (comment)

The reproduction code provided there produces a JSON schema containing
`OneOf[pets]`:

```
class Model(BaseModel):
    pet: Union[Cat, Dog] = Field(..., discriminator='pet_type')
```

Before this PR: `OneOf` used negative lookaheads to assert that only one
schema member is included. Lookaheads are illegal in `interegular`; more
details are available here:
#456

After this PR: `OneOf` uses or-joined non-capturing groups, which don't
have the same issues with `interegular`.
  • Loading branch information
lapp0 authored May 17, 2024
1 parent 159d1ec commit 499d19d
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 10 deletions.
8 changes: 1 addition & 7 deletions outlines/fsm/json_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,13 +195,7 @@ def to_regex(
to_regex(resolver, t, whitespace_pattern) for t in instance["oneOf"]
]

xor_patterns = []
# json schema validation ensured there is no overlapping schemas in oneOf
for subregex in subregexes:
other_subregexes = filter(lambda r: r != subregex, subregexes)
other_subregexes_str = "|".join([f"{s}" for s in other_subregexes])
negative_lookahead = f"(?!.*({other_subregexes_str}))"
xor_patterns.append(f"({subregex}){negative_lookahead}")
xor_patterns = [f"(?:{subregex})" for subregex in subregexes]

return rf"({'|'.join(xor_patterns)})"

Expand Down
31 changes: 28 additions & 3 deletions tests/fsm/test_json_schema.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import json
import re
from typing import List
from typing import List, Literal, Union

import interegular
import pytest
from pydantic import BaseModel, constr
from pydantic import BaseModel, Field, constr

from outlines.fsm.json_schema import (
BOOLEAN,
Expand Down Expand Up @@ -321,7 +322,7 @@ def test_match_number(pattern, does_match):
"title": "Foo",
"oneOf": [{"type": "string"}, {"type": "number"}, {"type": "boolean"}],
},
rf"(({STRING})(?!.*({NUMBER}|{BOOLEAN}))|({NUMBER})(?!.*({STRING}|{BOOLEAN}))|({BOOLEAN})(?!.*({STRING}|{NUMBER})))",
rf'((?:"{STRING_INNER}*")|(?:{NUMBER})|(?:{BOOLEAN}))',
[
("12.3", True),
("true", True),
Expand Down Expand Up @@ -750,3 +751,27 @@ class MockModel(BaseModel):
assert match_default_ws is None
assert re.fullmatch(pattern, mock_result_maybe_ws)
def test_one_of_doesnt_produce_illegal_lookaround():
    """Reproduces failure in https://github.com/outlines-dev/outlines/issues/823

    Builds a regex from the JSON schema of a model whose ``pet`` field is a
    discriminated union (``Union[Cat, Dog]`` keyed on ``pet_type``) and checks
    that ``interegular`` can convert the resulting pattern to an FSM.
    """

    class Cat(BaseModel):
        pet_type: Literal["cat"]
        meows: int

    class Dog(BaseModel):
        pet_type: Literal["dog"]
        barks: float

    class Model(BaseModel):
        pet: Union[Cat, Dog] = Field(..., discriminator="pet_type")
        n: int

    # Serialize the schema once; the original computed this value twice in a row.
    json_schema = Model.schema_json()
    pattern = build_regex_from_schema(json_schema, whitespace_pattern=None)

    # check if the pattern uses lookarounds incompatible with interegular.Pattern.to_fsm()
    interegular.parse_pattern(pattern).to_fsm()

0 comments on commit 499d19d

Please sign in to comment.