diff --git a/src/json_schema/helpers.rs b/src/json_schema/helpers.rs deleted file mode 100644 index e388089d..00000000 --- a/src/json_schema/helpers.rs +++ /dev/null @@ -1,40 +0,0 @@ -use anyhow::{anyhow, Result}; -use std::num::NonZeroU64; - -pub fn validate_quantifiers( - min_bound: Option, - max_bound: Option, - start_offset: u64, -) -> Result<(Option, Option)> { - let min_bound = min_bound.map(|n| NonZeroU64::new(n.saturating_sub(start_offset))); - let max_bound = max_bound.map(|n| NonZeroU64::new(n.saturating_sub(start_offset))); - - if let (Some(min), Some(max)) = (min_bound, max_bound) { - if max < min { - return Err(anyhow!( - "max bound must be greater than or equal to min bound" - )); - } - } - - Ok((min_bound.flatten(), max_bound.flatten())) -} - -pub fn get_num_items_pattern(min_items: Option, max_items: Option) -> Option { - let min_items = min_items.unwrap_or(0); - - match max_items { - None => Some(format!("{{{},}}", min_items.saturating_sub(1))), - Some(max_items) => { - if max_items < 1 { - None - } else { - Some(format!( - "{{{},{}}}", - min_items.saturating_sub(1), - max_items.saturating_sub(1) - )) - } - } - } -} diff --git a/src/json_schema/mod.rs b/src/json_schema/mod.rs index ec1172f7..f8f3aa21 100644 --- a/src/json_schema/mod.rs +++ b/src/json_schema/mod.rs @@ -1,86 +1,22 @@ -mod helpers; mod parsing; mod types; pub use types::*; -use anyhow::{anyhow, Result}; +use anyhow::Result; use serde_json::Value; -#[derive(Debug, Copy, Clone)] -enum SchemaKeyword { - Properties, - AllOf, - AnyOf, - OneOf, - PrefixItems, - Enum, - Const, - Ref, - Type, - EmptyObject, -} - pub fn build_regex_from_schema(json: &str, whitespace_pattern: Option<&str>) -> Result { let json_value: Value = serde_json::from_str(json)?; - to_regex(&json_value, whitespace_pattern, &json_value) + to_regex(&json_value, whitespace_pattern) } -pub fn to_regex( - json: &Value, - whitespace_pattern: Option<&str>, - full_schema: &Value, -) -> Result { - let whitespace_pattern = whitespace_pattern.unwrap_or(types::WHITESPACE); - - match json { - Value::Object(obj) => { - let keyword = if obj.is_empty() { - SchemaKeyword::EmptyObject - } else { - [ - ("properties", SchemaKeyword::Properties), - ("allOf", SchemaKeyword::AllOf), - ("anyOf", SchemaKeyword::AnyOf), - ("oneOf", SchemaKeyword::OneOf), - ("prefixItems", SchemaKeyword::PrefixItems), - ("enum", SchemaKeyword::Enum), - ("const", SchemaKeyword::Const), - ("$ref", SchemaKeyword::Ref), - ("type", SchemaKeyword::Type), - ] - .iter() - .find_map(|&(key, schema_keyword)| { - if obj.contains_key(key) { - Some(schema_keyword) - } else { - None - } - }) - .ok_or_else(|| anyhow!("Unsupported JSON Schema structure {} \nMake sure it is valid to the JSON Schema specification and check if it's supported by Outlines.\nIf it should be supported, please open an issue.", json))? - }; - - match keyword { - SchemaKeyword::Properties => { - parsing::parse_properties(obj, whitespace_pattern, full_schema) - } - SchemaKeyword::AllOf => parsing::parse_all_of(obj, whitespace_pattern, full_schema), - SchemaKeyword::AnyOf => parsing::parse_any_of(obj, whitespace_pattern, full_schema), - SchemaKeyword::OneOf => parsing::parse_one_of(obj, whitespace_pattern, full_schema), - SchemaKeyword::PrefixItems => { - parsing::parse_prefix_items(obj, whitespace_pattern, full_schema) - } - SchemaKeyword::Enum => parsing::parse_enum(obj, whitespace_pattern), - SchemaKeyword::Const => parsing::parse_const(obj, whitespace_pattern), - SchemaKeyword::Ref => parsing::parse_ref(obj, whitespace_pattern, full_schema), - SchemaKeyword::Type => parsing::parse_type(obj, whitespace_pattern, full_schema), - SchemaKeyword::EmptyObject => { - parsing::parse_empty_object(whitespace_pattern, full_schema) - } - } - } - _ => Err(anyhow!("Invalid JSON Schema: expected an object")), +pub fn to_regex(json: &Value, whitespace_pattern: Option<&str>) -> Result { + let mut parser = parsing::Parser::new(json); + if let Some(pattern) = whitespace_pattern { + parser = parser.with_whitespace_pattern(pattern) } + Ok(parser.to_regex(json)?) } #[cfg(test)] @@ -89,29 +25,31 @@ mod tests { use regex::Regex; #[test] - fn error_on_recursive_ref() { - let json = r##" - { - "type": "object", - "properties": { - "name": { "type": "string" }, - "children": { - "type": "array", - "items": { "$ref": "#" } - } - } + fn recursive_ref() { + let json = r##"{ + "type": "object", + "properties": { + "node": { "$ref": "#/definitions/node" } + }, + "definitions": { + "node": { + "type": "object", + "properties": { + "value": { "type": "integer" }, + "next": { "$ref": "#/definitions/node" } + } + } + } }"##; - let json_value: Value = serde_json::from_str(json).expect("Can't parse json"); - let result = to_regex(&json_value, None, &json_value); - - match result { - Err(e) => { - let message = "Recursive references are not supported for now"; - assert_eq!(message, e.to_string()); - } - _ => unreachable!(), - } + let mut parser = parsing::Parser::new(&json_value).with_max_recursion_depth(1); + let result = parser.to_regex(&json_value); + assert!(result.is_ok(), "{:?}", result); + let regex = result.unwrap(); + assert_eq!( + r#"\{([ ]?"node"[ ]?:[ ]?\{([ ]?"value"[ ]?:[ ]?(-)?(0|[1-9][0-9]*)([ ]?,[ ]?"next"[ ]?:[ ]?\{([ ]?"value"[ ]?:[ ]?(-)?(0|[1-9][0-9]*)([ ]?,[ ]?"next"[ ]?:[ ]?{})?|([ ]?"value"[ ]?:[ ]?(-)?(0|[1-9][0-9]*)[ ]?,)?[ ]?"next"[ ]?:[ ]?{})?[ ]?\})?|([ ]?"value"[ ]?:[ ]?(-)?(0|[1-9][0-9]*)[ ]?,)?[ ]?"next"[ ]?:[ ]?\{([ ]?"value"[ ]?:[ ]?(-)?(0|[1-9][0-9]*)([ ]?,[ ]?"next"[ ]?:[ ]?{})?|([ ]?"value"[ ]?:[ ]?(-)?(0|[1-9][0-9]*)[ ]?,)?[ ]?"next"[ ]?:[ ]?{})?[ ]?\})?[ ]?\})?[ ]?\}"#, + regex, + ); } #[test] @@ -135,7 +73,7 @@ mod tests { }"##; let json_value: Value = serde_json::from_str(json).expect("Can't parse json"); - let result = to_regex(&json_value, None, &json_value); + let result = to_regex(&json_value, None); match result { Ok(r) => { @@ -1007,9 +945,25 @@ mod tests { vec![r#"["a", 1]"#], vec![r#"["a", 1, 1]"#, "[]"], ), + // Unconstrained value (no schema) + // (huge regex, but important test to verify matching it explicitely) + ( + "{}", + "((true|false))|(null)|(((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?)|((-)?(0|[1-9][0-9]*))|(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")|(\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")){0,})?[ ]?\\])([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")){0,})?[ ]?\\])){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")){0,})?[ ]?\\])(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")){0,})?[ ]?\\])){0,})?[ ]?\\])(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")){0,})?[ ]?\\])([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")){0,})?[ ]?\\])){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")){0,})?[ ]?\\])(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")){0,})?[ ]?\\])){0,})?[ ]?\\])){0,})?[ ]?\\])|(\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")){0,})?[ ]?\\])([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")){0,})?[ ]?\\])){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")){0,})?[ ]?\\])(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")){0,})?[ ]?\\])){0,})?[ ]?\\])([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")){0,})?[ ]?\\])([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")){0,})?[ ]?\\])){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")){0,})?[ ]?\\])(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|\\{[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)([ ]?,[ ]?\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"[ ]?:[ ]?(\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\"|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(true|false)|null)){0,})?[ ]?\\}|\\[[ ]?(((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")(,[ ]?((true|false)|null|((-)?(0|[1-9][0-9]*))(\\.[0-9]+)?([eE][+-][0-9]+)?|(-)?(0|[1-9][0-9]*)|\"([^\"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\[\"\\\\])*\")){0,})?[ ]?\\])){0,})?[ ]?\\])){0,})?[ ]?\\})", + vec![ + r#""aaabbuecuh""#, + "5.554", + "true", + "null", + "5999", + r#"["a", "b"]"#, + r#"{"key": {"k2": "value"}}"#, + ], + vec!["this isnt valid json"], + ), ] { let json: Value = serde_json::from_str(schema).expect("Can't parse json"); - let result = to_regex(&json, None, &json).expect("To regex failed"); + let result = to_regex(&json, None).expect("To regex failed"); assert_eq!(result, regex); let re = Regex::new(&result).expect("Regex failed"); @@ -1023,7 +977,7 @@ mod tests { } #[test] - fn test_unconstrained() { + fn test_unconstrained_others() { for (schema, a_match, not_a_match) in [ // Unconstrained Object ( @@ -1060,27 +1014,13 @@ mod tests { r#"{}"#, r#"{"a": 1, "b": null}"#, r#"{"a": {"z": {"g": 4}}, "b": null}"#, - "1234", // not an array - r#"{"a": "a"}"#, // not an array + "1234", + r#"{"a": "a"}"#, ], ), - // Unconstrained value (no schema) - ( - "{}", - vec![ - r#""aaabbuecuh""#, - "5.554", - "true", - "null", - "5999", - r#"["a", "b"]"#, - r#"{"key": {"k2": "value"}}"#, - ], - vec!["this isnt valid json"], - ), ] { let json: Value = serde_json::from_str(schema).expect("Can't parse json"); - let regex = to_regex(&json, None, &json).expect("To regex failed"); + let regex = to_regex(&json, None).expect("To regex failed"); let re = Regex::new(®ex).expect("Regex failed"); for m in a_match { should_match(&re, m); diff --git a/src/json_schema/parsing.rs b/src/json_schema/parsing.rs index 4182ac8e..867c60a1 100644 --- a/src/json_schema/parsing.rs +++ b/src/json_schema/parsing.rs @@ -1,582 +1,649 @@ -use anyhow::{anyhow, Result}; +use std::collections::HashSet; +use std::num::NonZeroU64; + use regex::escape; use serde_json::json; use serde_json::Value; +use thiserror::Error as ThisError; -use crate::json_schema::helpers; -use crate::json_schema::to_regex; use crate::json_schema::types; -pub fn parse_properties( - obj: &serde_json::Map, - whitespace_pattern: &str, - full_schema: &Value, -) -> Result { - let mut regex = String::from(r"\{"); - - let properties = obj - .get("properties") - .and_then(Value::as_object) - .ok_or_else(|| anyhow!("'properties' not found or not an object"))?; - - let required_properties = obj - .get("required") - .and_then(Value::as_array) - .map(|arr| arr.iter().filter_map(Value::as_str).collect::>()) - .unwrap_or_default(); - - let is_required: Vec = properties - .keys() - .map(|item| required_properties.contains(&item.as_str())) - .collect(); - - if is_required.iter().any(|&x| x) { - let last_required_pos = is_required - .iter() - .enumerate() - .filter(|&(_, &value)| value) - .map(|(i, _)| i) - .max() - .unwrap(); - - for (i, (name, value)) in properties.iter().enumerate() { - let mut subregex = format!( - r#"{whitespace_pattern}"{}"{}:{}"#, - escape(name), - whitespace_pattern, - whitespace_pattern - ); - subregex += &to_regex(value, Some(whitespace_pattern), full_schema)?; - - match i { - i if i < last_required_pos => { - subregex = format!("{}{},", subregex, whitespace_pattern) - } - i if i > last_required_pos => { - subregex = format!("{},{}", whitespace_pattern, subregex) - } - _ => (), - } +#[derive(Debug, ThisError)] +pub enum ParserError { + #[error("serde json error")] + SerdeJsonError(#[from] serde_json::Error), + #[error("Unsupported JSON Schema structure {0} \nMake sure it is valid to the JSON Schema specification and check if it's supported by Outlines.\nIf it should be supported, please open an issue.")] + UnsupportedJsonSchema(Box), + #[error("'properties' not found or not an object")] + PropertiesNotFound, + #[error("'allOf' must be an array")] + AllOfMustBeAnArray, + #[error("'anyOf' must be an array")] + AnyOfMustBeAnArray, + #[error("'oneOf' must be an array")] + OneOfMustBeAnArray, + #[error("'prefixItems' must be an array")] + PrefixItemsMustBeAnArray, + #[error("Unsupported data type in enum: {0}")] + UnsupportedEnumDataType(Box), + #[error("'enum' must be an array")] + EnumMustBeAnArray, + #[error("Unsupported data type in const: {0}")] + UnsupportedConstDataType(Box), + #[error("'const' key not found in object")] + ConstKeyNotFound, + #[error("'$ref' must be a string")] + RefMustBeAString, + #[error("External references are not supported: {0}")] + ExternalReferencesNotSupported(Box), + #[error("Invalid reference format: {0}")] + InvalidReferenceFormat(Box), + #[error("'type' must be a string")] + TypeMustBeAString, + #[error("Unsupported type: {0}")] + UnsupportedType(Box), + #[error("maxLength must be greater than or equal to minLength")] + MaxBoundError, + #[error("Format {0} is not supported by Outlines")] + StringTypeUnsupportedFormat(Box), + #[error("Invalid reference path: {0}")] + InvalidRefecencePath(Box), +} - regex += &if is_required[i] { - subregex - } else { - format!("({})?", subregex) - }; +type Result = std::result::Result; + +pub(crate) struct Parser<'a> { + root: &'a Value, + whitespace_pattern: &'a str, + visited: HashSet, + recursion_depth: usize, + max_recursion_depth: usize, +} + +impl<'a> Parser<'a> { + pub fn new(root: &'a Value) -> Self { + Self { + root, + whitespace_pattern: types::WHITESPACE, + visited: HashSet::new(), + recursion_depth: 0, + max_recursion_depth: 5, } - } else { - let mut property_subregexes = Vec::new(); - for (name, value) in properties.iter() { - let mut subregex = format!( - r#"{whitespace_pattern}"{}"{}:{}"#, - escape(name), - whitespace_pattern, - whitespace_pattern - ); - - subregex += &to_regex(value, Some(whitespace_pattern), full_schema)?; - property_subregexes.push(subregex); + } + + pub fn with_whitespace_pattern(self, whitespace_pattern: &'a str) -> Self { + Self { + whitespace_pattern, + ..self } + } - let mut possible_patterns = Vec::new(); - for i in 0..property_subregexes.len() { - let mut pattern = String::new(); - for subregex in &property_subregexes[..i] { - pattern += &format!("({}{},)?", subregex, whitespace_pattern); - } - pattern += &property_subregexes[i]; - for subregex in &property_subregexes[i + 1..] { - pattern += &format!("({},{})?", whitespace_pattern, subregex); - } - possible_patterns.push(pattern); + pub fn with_max_recursion_depth(self, max_recursion_depth: usize) -> Self { + Self { + max_recursion_depth, + ..self } + } - regex += &format!("({})?", possible_patterns.join("|")); + pub fn to_regex(&mut self, json: &Value) -> Result { + match json { + Value::Object(obj) if obj.is_empty() => self.parse_empty_object(), + Value::Object(obj) if obj.contains_key("properties") => self.parse_properties(obj), + Value::Object(obj) if obj.contains_key("allOf") => self.parse_all_of(obj), + Value::Object(obj) if obj.contains_key("anyOf") => self.parse_any_of(obj), + Value::Object(obj) if obj.contains_key("oneOf") => self.parse_one_of(obj), + Value::Object(obj) if obj.contains_key("prefixItems") => self.parse_prefix_items(obj), + Value::Object(obj) if obj.contains_key("enum") => self.parse_enum(obj), + Value::Object(obj) if obj.contains_key("const") => self.parse_const(obj), + Value::Object(obj) if obj.contains_key("$ref") => self.parse_ref(obj), + Value::Object(obj) if obj.contains_key("type") => self.parse_type(obj), + json => Err(ParserError::UnsupportedJsonSchema(Box::new(json.clone()))), + } } - regex += &format!("{}\\}}", whitespace_pattern); + fn parse_empty_object(&mut self) -> Result { + // JSON Schema Spec: Empty object means unconstrained, any json type is legal + let types = vec![ + json!({"type": "boolean"}), + json!({"type": "null"}), + json!({"type": "number"}), + json!({"type": "integer"}), + json!({"type": "string"}), + json!({"type": "array"}), + json!({"type": "object"}), + ]; + let regex = types + .iter() + .try_fold(Vec::with_capacity(types.len()), |mut acc, object| { + self.to_regex(object).map(|string| { + acc.push(format!("({})", string)); + acc + }) + })? + .join("|"); + Ok(regex) + } - Ok(regex) -} + fn parse_properties(&mut self, obj: &serde_json::Map) -> Result { + let mut regex = String::from(r"\{"); + + let properties = obj + .get("properties") + .and_then(Value::as_object) + .ok_or_else(|| ParserError::PropertiesNotFound)?; + + let required_properties = obj + .get("required") + .and_then(Value::as_array) + .map(|arr| arr.iter().filter_map(Value::as_str).collect::>()) + .unwrap_or_default(); + + let is_required: Vec = properties + .keys() + .map(|item| required_properties.contains(&item.as_str())) + .collect(); -pub fn parse_all_of( - obj: &serde_json::Map, - whitespace_pattern: &str, - full_schema: &Value, -) -> Result { - match obj.get("allOf") { - Some(Value::Array(all_of)) => { - let subregexes: Result> = all_of + if is_required.iter().any(|&x| x) { + let last_required_pos = is_required .iter() - .map(|t| to_regex(t, Some(whitespace_pattern), full_schema)) - .collect(); + .enumerate() + .filter(|&(_, &value)| value) + .map(|(i, _)| i) + .max() + .unwrap(); + + for (i, (name, value)) in properties.iter().enumerate() { + let mut subregex = + format!(r#"{0}"{1}"{0}:{0}"#, self.whitespace_pattern, escape(name)); + subregex += &mut self.to_regex(value)?; + match i { + i if i < last_required_pos => { + subregex = format!("{}{},", subregex, self.whitespace_pattern) + } + i if i > last_required_pos => { + subregex = format!("{},{}", self.whitespace_pattern, subregex) + } + _ => (), + } + regex += &if is_required[i] { + subregex + } else { + format!("({})?", subregex) + }; + } + } else { + let mut property_subregexes = Vec::new(); + for (name, value) in properties.iter() { + let mut subregex = + format!(r#"{0}"{1}"{0}:{0}"#, self.whitespace_pattern, escape(name)); + subregex += &mut self.to_regex(value)?; + property_subregexes.push(subregex); + } - let subregexes = subregexes?; - let combined_regex = subregexes.join(""); + let mut possible_patterns = Vec::new(); + for i in 0..property_subregexes.len() { + let mut pattern = String::new(); + for subregex in &property_subregexes[..i] { + pattern += &format!("({}{},)?", subregex, self.whitespace_pattern); + } + pattern += &property_subregexes[i]; + for subregex in &property_subregexes[i + 1..] { + pattern += &format!("({},{})?", self.whitespace_pattern, subregex); + } + possible_patterns.push(pattern); + } - Ok(format!(r"({})", combined_regex)) + regex += &format!("({})?", possible_patterns.join("|")); } - _ => Err(anyhow!("'allOf' must be an array")), + + regex += &format!("{}\\}}", self.whitespace_pattern); + Ok(regex) } -} -pub fn parse_any_of( - obj: &serde_json::Map, - whitespace_pattern: &str, - full_schema: &Value, -) -> Result { - match obj.get("anyOf") { - Some(Value::Array(any_of)) => { - let subregexes: Result> = any_of - .iter() - .map(|t| to_regex(t, Some(whitespace_pattern), full_schema)) - .collect(); + fn parse_all_of(&mut self, obj: &serde_json::Map) -> Result { + match obj.get("allOf") { + Some(Value::Array(all_of)) => { + let subregexes: Result> = + all_of.iter().map(|t| self.to_regex(t)).collect(); - let subregexes = subregexes?; + let subregexes = subregexes?; + let combined_regex = subregexes.join(""); - Ok(format!(r"({})", subregexes.join("|"))) + Ok(format!(r"({})", combined_regex)) + } + _ => Err(ParserError::AllOfMustBeAnArray), } - _ => Err(anyhow!("'anyOf' must be an array")), } -} -pub fn parse_one_of( - obj: &serde_json::Map, - whitespace_pattern: &str, - full_schema: &Value, -) -> Result { - match obj.get("oneOf") { - Some(Value::Array(one_of)) => { - let subregexes: Result> = one_of - .iter() - .map(|t| to_regex(t, Some(whitespace_pattern), full_schema)) - .collect(); + fn parse_any_of(&mut self, obj: &serde_json::Map) -> Result { + match obj.get("anyOf") { + Some(Value::Array(any_of)) => { + let subregexes: Result> = + any_of.iter().map(|t| self.to_regex(t)).collect(); - let subregexes = subregexes?; + let subregexes = subregexes?; - let xor_patterns: Vec = subregexes - .into_iter() - .map(|subregex| format!(r"(?:{})", subregex)) - .collect(); - - Ok(format!(r"({})", xor_patterns.join("|"))) + Ok(format!(r"({})", subregexes.join("|"))) + } + _ => Err(ParserError::AnyOfMustBeAnArray), } - _ => Err(anyhow!("'oneOf' must be an array")), } -} - -pub fn parse_prefix_items( - obj: &serde_json::Map, - whitespace_pattern: &str, - full_schema: &Value, -) -> Result { - match obj.get("prefixItems") { - Some(Value::Array(prefix_items)) => { - let element_patterns: Result> = prefix_items - .iter() - .map(|t| to_regex(t, Some(whitespace_pattern), full_schema)) - .collect(); - let element_patterns = element_patterns?; + fn parse_one_of(&mut self, obj: &serde_json::Map) -> Result { + match obj.get("oneOf") { + Some(Value::Array(one_of)) => { + let subregexes: Result> = + one_of.iter().map(|t| self.to_regex(t)).collect(); - let comma_split_pattern = format!("{},{}", whitespace_pattern, whitespace_pattern); - let tuple_inner = element_patterns.join(&comma_split_pattern); + let subregexes = subregexes?; + let xor_patterns: Vec = subregexes + .into_iter() + .map(|subregex| format!(r"(?:{})", subregex)) + .collect(); - Ok(format!( - r"\[{whitespace_pattern}{tuple_inner}{whitespace_pattern}\]" - )) + Ok(format!(r"({})", xor_patterns.join("|"))) + } + _ => Err(ParserError::OneOfMustBeAnArray), } - _ => Err(anyhow!("'prefixItems' must be an array")), } -} -pub fn parse_enum( - obj: &serde_json::Map, - _whitespace_pattern: &str, -) -> Result { - match obj.get("enum") { - Some(Value::Array(enum_values)) => { - let choices: Result> = enum_values - .iter() - .map(|choice| match choice { - Value::Null | Value::Bool(_) | Value::Number(_) | Value::String(_) => { - let json_string = serde_json::to_string(choice)?; - Ok(regex::escape(&json_string)) - } - _ => Err(anyhow!("Unsupported data type in enum: {:?}", choice)), - }) - .collect(); + fn parse_prefix_items(&mut self, obj: &serde_json::Map) -> Result { + match obj.get("prefixItems") { + Some(Value::Array(prefix_items)) => { + let element_patterns: Result> = + prefix_items.iter().map(|t| self.to_regex(t)).collect(); + + let element_patterns = element_patterns?; - let choices = choices?; - Ok(format!(r"({})", choices.join("|"))) + let comma_split_pattern = format!("{0},{0}", self.whitespace_pattern); + let tuple_inner = element_patterns.join(&comma_split_pattern); + + Ok(format!(r"\[{0}{tuple_inner}{0}\]", self.whitespace_pattern)) + } + _ => Err(ParserError::PrefixItemsMustBeAnArray), } - _ => Err(anyhow!("'enum' must be an array")), } -} -pub fn parse_const( - obj: &serde_json::Map, - _whitespace_pattern: &str, -) -> Result { - match obj.get("const") { - Some(const_value) => match const_value { - Value::Null | Value::Bool(_) | Value::Number(_) | Value::String(_) => { - let json_string = serde_json::to_string(const_value)?; - Ok(regex::escape(&json_string)) + fn parse_enum(&mut self, obj: &serde_json::Map) -> Result { + match obj.get("enum") { + Some(Value::Array(enum_values)) => { + let choices: Result> = enum_values + .iter() + .map(|choice| match choice { + Value::Null | Value::Bool(_) | Value::Number(_) | Value::String(_) => { + let json_string = serde_json::to_string(choice)?; + Ok(regex::escape(&json_string)) + } + _ => Err(ParserError::UnsupportedEnumDataType(Box::new( + choice.clone(), + ))), + }) + .collect(); + + let choices = choices?; + Ok(format!(r"({})", choices.join("|"))) } - _ => Err(anyhow!("Unsupported data type in const: {:?}", const_value)), - }, - None => Err(anyhow!("'const' key not found in object")), + _ => Err(ParserError::EnumMustBeAnArray), + } } -} -pub fn parse_ref( - obj: &serde_json::Map, - whitespace_pattern: &str, - full_schema: &Value, -) -> Result { - let ref_path = obj["$ref"] - .as_str() - .ok_or_else(|| anyhow!("'$ref' must be a string"))?; - - if ref_path == "#" { - return Err(anyhow!("Recursive references are not supported for now")); + fn parse_const(&mut self, obj: &serde_json::Map) -> Result { + match obj.get("const") { + Some(const_value) => match const_value { + Value::Null | Value::Bool(_) | Value::Number(_) | Value::String(_) => { + let json_string = serde_json::to_string(const_value)?; + Ok(regex::escape(&json_string)) + } + _ => Err(ParserError::UnsupportedConstDataType(Box::new( + const_value.clone(), + ))), + }, + None => Err(ParserError::ConstKeyNotFound), + } } - let parts: Vec<&str> = ref_path.split('#').collect(); - - match parts.as_slice() { - [fragment] | ["", fragment] => { - let path_parts: Vec<&str> = fragment.split('/').filter(|&s| !s.is_empty()).collect(); - let referenced_schema = resolve_local_ref(full_schema, &path_parts)?; - to_regex(referenced_schema, Some(whitespace_pattern), full_schema) + fn parse_ref(&mut self, obj: &serde_json::Map) -> Result { + let addr = obj as *const _ as usize; + if !self.visited.insert(addr) { + self.recursion_depth += 1; + } + // TODO: add recursion depth error and handle it higher + if self.recursion_depth >= self.max_recursion_depth { + return Ok("{}".into()); } - [base, fragment] => { - if let Some(id) = full_schema["$id"].as_str() { - if *base == id || base.is_empty() { - let path_parts: Vec<&str> = - fragment.split('/').filter(|&s| !s.is_empty()).collect(); - let referenced_schema = resolve_local_ref(full_schema, &path_parts)?; - return to_regex(referenced_schema, Some(whitespace_pattern), full_schema); + let ref_path = obj["$ref"] + .as_str() + .ok_or_else(|| ParserError::RefMustBeAString)?; + + let parts: Vec<&str> = ref_path.split('#').collect(); + + match parts.as_slice() { + [fragment] | ["", fragment] => { + let path_parts: Vec<&str> = + fragment.split('/').filter(|&s| !s.is_empty()).collect(); + let referenced_schema = resolve_local_ref(self.root, &path_parts)?; + self.to_regex(referenced_schema) + } + [base, fragment] => { + if let Some(id) = self.root["$id"].as_str() { + if *base == id || base.is_empty() { + let path_parts: Vec<&str> = + fragment.split('/').filter(|&s| !s.is_empty()).collect(); + let referenced_schema = resolve_local_ref(self.root, &path_parts)?; + return self.to_regex(referenced_schema); + } } + Err(ParserError::ExternalReferencesNotSupported(Box::from( + ref_path, + ))) } - Err(anyhow!( - "External references are not supported: {}", - ref_path - )) + _ => Err(ParserError::InvalidReferenceFormat(Box::from(ref_path))), } - _ => Err(anyhow!("Invalid reference format: {}", ref_path)), } -} -fn resolve_local_ref<'a>(schema: &'a Value, path_parts: &[&str]) -> Result<&'a Value> { - let mut current = schema; - for &part in path_parts { - current = current - .get(part) - .ok_or_else(|| anyhow!("Invalid reference path: {}", part))?; + fn parse_type(&mut self, obj: &serde_json::Map) -> Result { + let instance_type = obj["type"] + .as_str() + .ok_or_else(|| ParserError::TypeMustBeAString)?; + match instance_type { + "string" => self.parse_string_type(obj), + "number" => self.parse_number_type(obj), + "integer" => self.parse_integer_type(obj), + "array" => self.parse_array_type(obj), + "object" => self.parse_object_type(obj), + "boolean" => self.parse_boolean_type(), + "null" => self.parse_null_type(), + _ => Err(ParserError::UnsupportedType(Box::from(instance_type))), + } } - Ok(current) -} -pub fn parse_type( - obj: &serde_json::Map, - whitespace_pattern: &str, - full_schema: &Value, -) -> Result { - let instance_type = obj["type"] - .as_str() - .ok_or_else(|| anyhow!("'type' must be a string"))?; - match instance_type { - "string" => parse_string_type(obj), - "number" => parse_number_type(obj), - "integer" => parse_integer_type(obj), - "array" => parse_array_type(obj, whitespace_pattern, full_schema), - "object" => parse_object_type(obj, whitespace_pattern, full_schema), - "boolean" => parse_boolean_type(), - "null" => parse_null_type(), - _ => Err(anyhow!("Unsupported type: {}", instance_type)), + fn parse_boolean_type(&mut self) -> Result { + let format_type = types::JsonType::Boolean; + Ok(format_type.to_regex().to_string()) } -} -pub fn parse_empty_object(whitespace_pattern: &str, full_schema: &Value) -> Result { - // JSON Schema Spec: Empty object means unconstrained, any json type is legal - let types = vec![ - json!({"type": "boolean"}), - json!({"type": "null"}), - json!({"type": "number"}), - json!({"type": "integer"}), - json!({"type": "string"}), - json!({"type": "array"}), - json!({"type": "object"}), - ]; + fn parse_null_type(&mut self) -> Result { + let format_type = types::JsonType::Null; + Ok(format_type.to_regex().to_string()) + } - let regexes: Result> = types - .iter() - .map(|t| to_regex(t, Some(whitespace_pattern), full_schema)) - .collect(); + fn parse_string_type(&mut self, obj: &serde_json::Map) -> Result { + if obj.contains_key("maxLength") || obj.contains_key("minLength") { + let max_items = obj.get("maxLength"); + let min_items = obj.get("minLength"); - let regexes = regexes?; + match (min_items, max_items) { + (Some(min), Some(max)) if min.as_f64() > max.as_f64() => { + return Err(ParserError::MaxBoundError) + } + _ => {} + } - let wrapped_regexes: Vec = regexes.into_iter().map(|r| format!("({})", r)).collect(); + let formatted_max = max_items + .and_then(Value::as_u64) + .map_or("".to_string(), |n| format!("{}", n)); + let formatted_min = min_items + .and_then(Value::as_u64) + .map_or("0".to_string(), |n| format!("{}", n)); - Ok(wrapped_regexes.join("|")) -} + Ok(format!( + r#""{}{{{},{}}}""#, + types::STRING_INNER, + formatted_min, + formatted_max, + )) + } else if let Some(pattern) = obj.get("pattern").and_then(Value::as_str) { + if pattern.starts_with('^') && pattern.ends_with('$') { + Ok(format!(r#"("{}")"#, &pattern[1..pattern.len() - 1])) + } else { + Ok(format!(r#"("{}")"#, pattern)) + } + } else if let Some(format) = obj.get("format").and_then(Value::as_str) { + match types::FormatType::from_str(format) { + Some(format_type) => Ok(format_type.to_regex().to_string()), + None => Err(ParserError::StringTypeUnsupportedFormat(Box::from(format))), + } + } else { + Ok(types::JsonType::String.to_regex().to_string()) + } + } -fn parse_boolean_type() -> Result { - let format_type = types::JsonType::Boolean; - Ok(format_type.to_regex().to_string()) -} + fn parse_number_type(&mut self, obj: &serde_json::Map) -> Result { + let bounds = [ + "minDigitsInteger", + "maxDigitsInteger", + "minDigitsFraction", + "maxDigitsFraction", + "minDigitsExponent", + "maxDigitsExponent", + ]; -fn parse_null_type() -> Result { - let format_type = types::JsonType::Null; - Ok(format_type.to_regex().to_string()) -} + let has_bounds = bounds.iter().any(|&key| obj.contains_key(key)); + + if has_bounds { + let (min_digits_integer, max_digits_integer) = validate_quantifiers( + obj.get("minDigitsInteger").and_then(Value::as_u64), + obj.get("maxDigitsInteger").and_then(Value::as_u64), + 1, + )?; + + let (min_digits_fraction, max_digits_fraction) = validate_quantifiers( + obj.get("minDigitsFraction").and_then(Value::as_u64), + obj.get("maxDigitsFraction").and_then(Value::as_u64), + 0, + )?; + + let (min_digits_exponent, max_digits_exponent) = validate_quantifiers( + obj.get("minDigitsExponent").and_then(Value::as_u64), + obj.get("maxDigitsExponent").and_then(Value::as_u64), + 0, + )?; + + let integers_quantifier = match (min_digits_integer, max_digits_integer) { + (Some(min), Some(max)) => format!("{{{},{}}}", min, max), + (Some(min), None) => format!("{{{},}}", min), + (None, Some(max)) => format!("{{1,{}}}", max), + (None, None) => "*".to_string(), + }; -fn parse_string_type(obj: &serde_json::Map) -> Result { - if obj.contains_key("maxLength") || obj.contains_key("minLength") { - let max_items = obj.get("maxLength"); - let min_items = obj.get("minLength"); + let fraction_quantifier = match (min_digits_fraction, max_digits_fraction) { + (Some(min), Some(max)) => format!("{{{},{}}}", min, max), + (Some(min), None) => format!("{{{},}}", min), + (None, Some(max)) => format!("{{0,{}}}", max), + (None, None) => "+".to_string(), + }; - match (min_items, max_items) { - (Some(min), Some(max)) if min.as_f64() > max.as_f64() => { - return Err(anyhow::anyhow!( - "maxLength must be greater than or equal to minLength" - )); - } - _ => {} - } + let exponent_quantifier = match (min_digits_exponent, max_digits_exponent) { + (Some(min), Some(max)) => format!("{{{},{}}}", min, max), + (Some(min), None) => format!("{{{},}}", min), + (None, Some(max)) => format!("{{0,{}}}", max), + (None, None) => "+".to_string(), + }; - let formatted_max = max_items - .and_then(Value::as_u64) - .map_or("".to_string(), |n| format!("{}", n)); - let formatted_min = min_items - .and_then(Value::as_u64) - .map_or("0".to_string(), |n| format!("{}", n)); - - Ok(format!( - r#""{}{{{},{}}}""#, - types::STRING_INNER, - formatted_min, - formatted_max, - )) - } else if let Some(pattern) = obj.get("pattern").and_then(Value::as_str) { - if pattern.starts_with('^') && pattern.ends_with('$') { - Ok(format!(r#"("{}")"#, &pattern[1..pattern.len() - 1])) + Ok(format!( + r"((-)?(0|[1-9][0-9]{}))(\.[0-9]{})?([eE][+-][0-9]{})?", + integers_quantifier, fraction_quantifier, exponent_quantifier + )) } else { - Ok(format!(r#"("{}")"#, pattern)) + let format_type = types::JsonType::Number; + Ok(format_type.to_regex().to_string()) } - } else if let Some(format) = obj.get("format").and_then(Value::as_str) { - match types::FormatType::from_str(format) { - Some(format_type) => Ok(format_type.to_regex().to_string()), - None => Err(anyhow::anyhow!( - "Format {} is not supported by Outlines", - format - )), + } + + fn parse_integer_type(&mut self, obj: &serde_json::Map) -> Result { + if obj.contains_key("minDigits") || obj.contains_key("maxDigits") { + let (min_digits, max_digits) = validate_quantifiers( + obj.get("minDigits").and_then(Value::as_u64), + obj.get("maxDigits").and_then(Value::as_u64), + 1, + )?; + + let quantifier = match (min_digits, max_digits) { + (Some(min), Some(max)) => format!("{{{},{}}}", min, max), + (Some(min), None) => format!("{{{},}}", min), + (None, Some(max)) => format!("{{0,{}}}", max), + (None, None) => "*".to_string(), + }; + + Ok(format!(r"(-)?(0|[1-9][0-9]{})", quantifier)) + } else { + let format_type = types::JsonType::Integer; + Ok(format_type.to_regex().to_string()) } - } else { - Ok(types::JsonType::String.to_regex().to_string()) } -} -fn parse_number_type(obj: &serde_json::Map) -> Result { - let bounds = [ - "minDigitsInteger", - "maxDigitsInteger", - "minDigitsFraction", - "maxDigitsFraction", - "minDigitsExponent", - "maxDigitsExponent", - ]; - - let has_bounds = bounds.iter().any(|&key| obj.contains_key(key)); - - if has_bounds { - let (min_digits_integer, max_digits_integer) = helpers::validate_quantifiers( - obj.get("minDigitsInteger").and_then(Value::as_u64), - obj.get("maxDigitsInteger").and_then(Value::as_u64), - 1, - )?; - - let (min_digits_fraction, max_digits_fraction) = helpers::validate_quantifiers( - obj.get("minDigitsFraction").and_then(Value::as_u64), - obj.get("maxDigitsFraction").and_then(Value::as_u64), - 0, - )?; - - let (min_digits_exponent, max_digits_exponent) = helpers::validate_quantifiers( - obj.get("minDigitsExponent").and_then(Value::as_u64), - obj.get("maxDigitsExponent").and_then(Value::as_u64), - 0, - )?; - - let integers_quantifier = match (min_digits_integer, max_digits_integer) { - (Some(min), Some(max)) => format!("{{{},{}}}", min, max), - (Some(min), None) => format!("{{{},}}", min), - (None, Some(max)) => format!("{{1,{}}}", max), - (None, None) => "*".to_string(), - }; + fn parse_object_type(&mut self, obj: &serde_json::Map) -> Result { + let min_properties = obj.get("minProperties").and_then(|v| v.as_u64()); + let max_properties = obj.get("maxProperties").and_then(|v| v.as_u64()); - let fraction_quantifier = match (min_digits_fraction, max_digits_fraction) { - (Some(min), Some(max)) => format!("{{{},{}}}", min, max), - (Some(min), None) => format!("{{{},}}", min), - (None, Some(max)) => format!("{{0,{}}}", max), - (None, None) => "+".to_string(), - }; + let num_repeats = get_num_items_pattern(min_properties, max_properties); + + if num_repeats.is_none() { + return Ok(format!(r"\{{{}\}}", self.whitespace_pattern)); + } - let exponent_quantifier = match (min_digits_exponent, max_digits_exponent) { - (Some(min), Some(max)) => format!("{{{},{}}}", min, max), - (Some(min), None) => format!("{{{},}}", min), - (None, Some(max)) => format!("{{0,{}}}", max), - (None, None) => "+".to_string(), + let allow_empty = if min_properties.unwrap_or(0) == 0 { + "?" + } else { + "" }; - Ok(format!( - r"((-)?(0|[1-9][0-9]{}))(\.[0-9]{})?([eE][+-][0-9]{})?", - integers_quantifier, fraction_quantifier, exponent_quantifier - )) - } else { - let format_type = types::JsonType::Number; - Ok(format_type.to_regex().to_string()) - } -} + let additional_properties = obj.get("additionalProperties"); + + let value_pattern = match additional_properties { + None | Some(&Value::Bool(true)) => { + let mut legal_types = vec![ + json!({"type": "string"}), + json!({"type": "number"}), + json!({"type": "boolean"}), + json!({"type": "null"}), + ]; + + let depth = obj.get("depth").and_then(|v| v.as_u64()).unwrap_or(2); + if depth > 0 { + legal_types.push(json!({"type": "object", "depth": depth - 1})); + legal_types.push(json!({"type": "array", "depth": depth - 1})); + } -fn parse_integer_type(obj: &serde_json::Map) -> Result { - if obj.contains_key("minDigits") || obj.contains_key("maxDigits") { - let (min_digits, max_digits) = helpers::validate_quantifiers( - obj.get("minDigits").and_then(Value::as_u64), - obj.get("maxDigits").and_then(Value::as_u64), - 1, - )?; - - let quantifier = match (min_digits, max_digits) { - (Some(min), Some(max)) => format!("{{{},{}}}", min, max), - (Some(min), None) => format!("{{{},}}", min), - (None, Some(max)) => format!("{{0,{}}}", max), - (None, None) => "*".to_string(), + let any_of = json!({"anyOf": &legal_types}); + self.to_regex(&any_of)? + } + Some(props) => self.to_regex(props)?, }; - Ok(format!(r"(-)?(0|[1-9][0-9]{})", quantifier)) - } else { - let format_type = types::JsonType::Integer; - Ok(format_type.to_regex().to_string()) + let key_value_pattern = format!( + "{}{1}:{1}{value_pattern}", + types::STRING, + self.whitespace_pattern, + ); + let key_value_successor_pattern = + format!("{0},{0}{key_value_pattern}", self.whitespace_pattern,); + let multiple_key_value_pattern = + format!("({key_value_pattern}({key_value_successor_pattern}){{0,}}){allow_empty}"); + + let res = format!( + r"\{{{0}{1}{0}\}}", + self.whitespace_pattern, multiple_key_value_pattern + ); + + Ok(res) } -} - -fn parse_object_type( - obj: &serde_json::Map, - whitespace_pattern: &str, - full_schema: &Value, -) -> Result { - let min_properties = obj.get("minProperties").and_then(|v| v.as_u64()); - let max_properties = obj.get("maxProperties").and_then(|v| v.as_u64()); - let num_repeats = helpers::get_num_items_pattern(min_properties, max_properties); + fn parse_array_type(&mut self, obj: &serde_json::Map) -> Result { + let num_repeats = get_num_items_pattern( + obj.get("minItems").and_then(Value::as_u64), + obj.get("maxItems").and_then(Value::as_u64), + ) + .unwrap_or_else(|| String::from("")); - if num_repeats.is_none() { - return Ok(format!(r"\{{{}\}}", whitespace_pattern)); - } - - let allow_empty = if min_properties.unwrap_or(0) == 0 { - "?" - } else { - "" - }; + if num_repeats.is_empty() { + return Ok(format!(r"\[{0}\]", self.whitespace_pattern)); + } - let additional_properties = obj.get("additionalProperties"); + let allow_empty = if obj.get("minItems").and_then(Value::as_u64).unwrap_or(0) == 0 { + "?" + } else { + "" + }; - let value_pattern = match additional_properties { - None | Some(&Value::Bool(true)) => { + if let Some(items) = obj.get("items") { + let items_regex = self.to_regex(items)?; + Ok(format!( + r"\[{0}(({1})(,{0}({1})){2}){3}{0}\]", + self.whitespace_pattern, items_regex, num_repeats, allow_empty + )) + } else { // parse unconstrained object case let mut legal_types = vec![ - json!({"type": "string"}), - json!({"type": "number"}), json!({"type": "boolean"}), json!({"type": "null"}), + json!({"type": "number"}), + json!({"type": "integer"}), + json!({"type": "string"}), ]; - let depth = obj.get("depth").and_then(|v| v.as_u64()).unwrap_or(2); + let depth = obj.get("depth").and_then(Value::as_u64).unwrap_or(2); if depth > 0 { legal_types.push(json!({"type": "object", "depth": depth - 1})); legal_types.push(json!({"type": "array", "depth": depth - 1})); } - let any_of = json!({"anyOf": legal_types}); - to_regex(&any_of, Some(whitespace_pattern), full_schema)? + let regexes: Result> = + legal_types.iter().map(|t| self.to_regex(t)).collect(); + + let regexes = regexes?; + let regexes_joined = regexes.join("|"); + + Ok(format!( + r"\[{0}(({1})(,{0}({1})){2}){3}{0}\]", + self.whitespace_pattern, regexes_joined, num_repeats, allow_empty + )) } - Some(props) => to_regex(props, Some(whitespace_pattern), full_schema)?, - }; - - let key_value_pattern = format!( - "{}{whitespace_pattern}:{whitespace_pattern}{value_pattern}", - types::STRING - ); - let key_value_successor_pattern = - format!("{whitespace_pattern},{whitespace_pattern}{key_value_pattern}"); - let multiple_key_value_pattern = - format!("({key_value_pattern}({key_value_successor_pattern}){{0,}}){allow_empty}"); - - let res = format!( - r"\{{{}{}{}\}}", - whitespace_pattern, multiple_key_value_pattern, whitespace_pattern - ); - - Ok(res) + } } -fn parse_array_type( - obj: &serde_json::Map, - whitespace_pattern: &str, - full_schema: &Value, -) -> Result { - let num_repeats = helpers::get_num_items_pattern( - obj.get("minItems").and_then(Value::as_u64), - obj.get("maxItems").and_then(Value::as_u64), - ) - .unwrap_or_else(|| String::from("")); - - if num_repeats.is_empty() { - return Ok(format!(r"\[{0}\]", whitespace_pattern)); +fn resolve_local_ref<'a>(schema: &'a Value, path_parts: &[&str]) -> Result<&'a Value> { + let mut current = schema; + for &part in path_parts { + current = current + .get(part) + .ok_or_else(|| ParserError::InvalidRefecencePath(Box::from(part)))?; } + Ok(current) +} - let allow_empty = if obj.get("minItems").and_then(Value::as_u64).unwrap_or(0) == 0 { - "?" - } else { - "" - }; - - if let Some(items) = obj.get("items") { - let items_regex = to_regex(items, Some(whitespace_pattern), full_schema)?; - Ok(format!( - r"\[{0}(({1})(,{0}({1})){2}){3}{0}\]", - whitespace_pattern, items_regex, num_repeats, allow_empty - )) - } else { - let mut legal_types = vec![ - json!({"type": "boolean"}), - json!({"type": "null"}), - json!({"type": "number"}), - json!({"type": "integer"}), - json!({"type": "string"}), - ]; - - let depth = obj.get("depth").and_then(Value::as_u64).unwrap_or(2); - if depth > 0 { - legal_types.push(json!({"type": "object", "depth": depth - 1})); - legal_types.push(json!({"type": "array", "depth": depth - 1})); +fn validate_quantifiers( + min_bound: Option, + max_bound: Option, + start_offset: u64, +) -> Result<(Option, Option)> { + let min_bound = min_bound.map(|n| NonZeroU64::new(n.saturating_sub(start_offset))); + let max_bound = max_bound.map(|n| NonZeroU64::new(n.saturating_sub(start_offset))); + + if let (Some(min), Some(max)) = (min_bound, max_bound) { + if max < min { + return Err(ParserError::MaxBoundError); } + } - let regexes: Result> = legal_types - .iter() - .map(|t| to_regex(t, Some(whitespace_pattern), full_schema)) - .collect(); + Ok((min_bound.flatten(), max_bound.flatten())) +} - let regexes = regexes?; - let regexes_joined = regexes.join("|"); +fn get_num_items_pattern(min_items: Option, max_items: Option) -> Option { + let min_items = min_items.unwrap_or(0); - Ok(format!( - r"\[{0}(({1})(,{0}({1})){2}){3}{0}\]", - whitespace_pattern, regexes_joined, num_repeats, allow_empty - )) + match max_items { + None => Some(format!("{{{},}}", min_items.saturating_sub(1))), + Some(max_items) => { + if max_items < 1 { + None + } else { + Some(format!( + "{{{},{}}}", + min_items.saturating_sub(1), + max_items.saturating_sub(1) + )) + } + } } } diff --git a/src/python_bindings/mod.rs b/src/python_bindings/mod.rs index 55d979d1..6dc7d544 100644 --- a/src/python_bindings/mod.rs +++ b/src/python_bindings/mod.rs @@ -146,7 +146,7 @@ pub fn build_regex_from_schema_py( #[pyo3(signature = (json, whitespace_pattern=None))] pub fn to_regex_py(json: Bound, whitespace_pattern: Option<&str>) -> PyResult { let json_value: Value = serde_pyobject::from_pyobject(json)?; - json_schema::to_regex(&json_value, whitespace_pattern, &json_value) + json_schema::to_regex(&json_value, whitespace_pattern) .map_err(|e| PyValueError::new_err(e.to_string())) }