diff --git a/query-grammar/Cargo.toml b/query-grammar/Cargo.toml index 4b00cd1554..600fc59d5c 100644 --- a/query-grammar/Cargo.toml +++ b/query-grammar/Cargo.toml @@ -12,6 +12,4 @@ keywords = ["search", "information", "retrieval"] edition = "2021" [dependencies] -combine = {version="4", default-features=false, features=[] } -once_cell = "1.7.2" -regex ={ version = "1.5.4", default-features = false, features = ["std", "unicode"] } +nom = "7" diff --git a/query-grammar/src/infallible.rs b/query-grammar/src/infallible.rs new file mode 100644 index 0000000000..6de085b13c --- /dev/null +++ b/query-grammar/src/infallible.rs @@ -0,0 +1,353 @@ +//! nom combinators for infallible operations + +use std::convert::Infallible; + +use nom::{AsChar, IResult, InputLength, InputTakeAtPosition}; + +pub(crate) type ErrorList = Vec; +pub(crate) type JResult = IResult; + +/// An error, with an end-of-string based offset +#[derive(Debug)] +pub(crate) struct LenientErrorInternal { + pub pos: usize, + pub message: String, +} + +/// A recoverable error and the position it happened at +#[derive(Debug, PartialEq)] +pub struct LenientError { + pub pos: usize, + pub message: String, +} + +impl LenientError { + pub(crate) fn from_internal(internal: LenientErrorInternal, str_len: usize) -> LenientError { + LenientError { + pos: str_len - internal.pos, + message: internal.message, + } + } +} + +fn unwrap_infallible(res: Result>) -> T { + match res { + Ok(val) => val, + Err(_) => unreachable!(), + } +} + +// when rfcs#1733 get stabilized, this can make things clearer +// trait InfallibleParser = nom::Parser; + +/// A variant of the classical `opt` parser, except it returns an infallible error type. +/// +/// It's less generic than the original to ease type resolution in the rest of the code. 
+pub(crate) fn opt_i(mut f: F) -> impl FnMut(I) -> JResult> +where F: nom::Parser> { + move |input: I| { + let i = input.clone(); + match f.parse(input) { + Ok((i, o)) => Ok((i, (Some(o), Vec::new()))), + Err(_) => Ok((i, (None, Vec::new()))), + } + } +} + +pub(crate) fn opt_i_err<'a, I: Clone + InputLength, O, F>( + mut f: F, + message: impl ToString + 'a, +) -> impl FnMut(I) -> JResult> + 'a +where + F: nom::Parser> + 'a, +{ + move |input: I| { + let i = input.clone(); + match f.parse(input) { + Ok((i, o)) => Ok((i, (Some(o), Vec::new()))), + Err(_) => { + let errs = vec![LenientErrorInternal { + pos: i.input_len(), + message: message.to_string(), + }]; + Ok((i, (None, errs))) + } + } + } +} + +pub(crate) fn space0_infallible(input: T) -> JResult +where + T: InputTakeAtPosition + Clone, + ::Item: AsChar + Clone, +{ + opt_i(nom::character::complete::space0)(input) + .map(|(left, (spaces, errors))| (left, (spaces.expect("space0 can't fail"), errors))) +} + +pub(crate) fn space1_infallible(input: T) -> JResult> +where + T: InputTakeAtPosition + Clone + InputLength, + ::Item: AsChar + Clone, +{ + opt_i(nom::character::complete::space1)(input).map(|(left, (spaces, mut errors))| { + if spaces.is_none() { + errors.push(LenientErrorInternal { + pos: left.input_len(), + message: "missing space".to_string(), + }) + } + (left, (spaces, errors)) + }) +} + +pub(crate) fn fallible, F>( + mut f: F, +) -> impl FnMut(I) -> IResult +where F: nom::Parser { + use nom::Err; + move |input: I| match f.parse(input) { + Ok((input, (output, _err))) => Ok((input, output)), + Err(Err::Incomplete(needed)) => Err(Err::Incomplete(needed)), + Err(Err::Error(val)) | Err(Err::Failure(val)) => match val {}, + } +} + +pub(crate) fn delimited_infallible( + mut first: F, + mut second: G, + mut third: H, +) -> impl FnMut(I) -> JResult +where + F: nom::Parser, + G: nom::Parser, + H: nom::Parser, +{ + move |input: I| { + let (input, (_, mut err)) = first.parse(input)?; + let (input, (o2, mut err2)) = 
second.parse(input)?; + err.append(&mut err2); + let (input, (_, mut err3)) = third.parse(input)?; + err.append(&mut err3); + Ok((input, (o2, err))) + } +} + +// Parse nothing. Just a lazy way to not implement terminated/preceded and use delimited instead +pub(crate) fn nothing(i: &str) -> JResult<&str, ()> { + Ok((i, ((), Vec::new()))) +} + +pub(crate) trait TupleInfallible { + /// Parses the input and returns a tuple of results of each parser. + fn parse(&mut self, input: I) -> JResult; +} + +impl> + TupleInfallible for (F,) +{ + fn parse(&mut self, input: Input) -> JResult { + self.0.parse(input).map(|(i, (o, e))| (i, ((o,), e))) + } +} + +// these macros are heavily copied from nom, with some minor adaptations for our type +macro_rules! tuple_trait( + ($name1:ident $ty1:ident, $name2: ident $ty2:ident, $($name:ident $ty:ident),*) => ( + tuple_trait!(__impl $name1 $ty1, $name2 $ty2; $($name $ty),*); + ); + (__impl $($name:ident $ty: ident),+; $name1:ident $ty1:ident, $($name2:ident $ty2:ident),*) => ( + tuple_trait_impl!($($name $ty),+); + tuple_trait!(__impl $($name $ty),+ , $name1 $ty1; $($name2 $ty2),*); + ); + (__impl $($name:ident $ty: ident),+; $name1:ident $ty1:ident) => ( + tuple_trait_impl!($($name $ty),+); + tuple_trait_impl!($($name $ty),+, $name1 $ty1); + ); +); + +macro_rules! tuple_trait_impl( + ($($name:ident $ty: ident),+) => ( + impl< + Input: Clone, $($ty),+ , + $($name: nom::Parser),+ + > TupleInfallible for ( $($name),+ ) { + + fn parse(&mut self, input: Input) -> JResult { + let mut error_list = Vec::new(); + tuple_trait_inner!(0, self, input, (), error_list, $($name)+) + } + } + ); +); + +macro_rules! 
tuple_trait_inner( + ($it:tt, $self:expr, $input:expr, (), $error_list:expr, $head:ident $($id:ident)+) => ({ + let (i, (o, mut err)) = $self.$it.parse($input.clone())?; + $error_list.append(&mut err); + + succ!($it, tuple_trait_inner!($self, i, ( o ), $error_list, $($id)+)) + }); + ($it:tt, $self:expr, $input:expr, ($($parsed:tt)*), $error_list:expr, $head:ident $($id:ident)+) => ({ + let (i, (o, mut err)) = $self.$it.parse($input.clone())?; + $error_list.append(&mut err); + + succ!($it, tuple_trait_inner!($self, i, ($($parsed)* , o), $error_list, $($id)+)) + }); + ($it:tt, $self:expr, $input:expr, ($($parsed:tt)*), $error_list:expr, $head:ident) => ({ + let (i, (o, mut err)) = $self.$it.parse($input.clone())?; + $error_list.append(&mut err); + + Ok((i, (($($parsed)* , o), $error_list))) + }); +); + +macro_rules! succ ( + (0, $submac:ident ! ($($rest:tt)*)) => ($submac!(1, $($rest)*)); + (1, $submac:ident ! ($($rest:tt)*)) => ($submac!(2, $($rest)*)); + (2, $submac:ident ! ($($rest:tt)*)) => ($submac!(3, $($rest)*)); + (3, $submac:ident ! ($($rest:tt)*)) => ($submac!(4, $($rest)*)); + (4, $submac:ident ! ($($rest:tt)*)) => ($submac!(5, $($rest)*)); + (5, $submac:ident ! ($($rest:tt)*)) => ($submac!(6, $($rest)*)); + (6, $submac:ident ! ($($rest:tt)*)) => ($submac!(7, $($rest)*)); + (7, $submac:ident ! ($($rest:tt)*)) => ($submac!(8, $($rest)*)); + (8, $submac:ident ! ($($rest:tt)*)) => ($submac!(9, $($rest)*)); + (9, $submac:ident ! ($($rest:tt)*)) => ($submac!(10, $($rest)*)); + (10, $submac:ident ! ($($rest:tt)*)) => ($submac!(11, $($rest)*)); + (11, $submac:ident ! ($($rest:tt)*)) => ($submac!(12, $($rest)*)); + (12, $submac:ident ! ($($rest:tt)*)) => ($submac!(13, $($rest)*)); + (13, $submac:ident ! ($($rest:tt)*)) => ($submac!(14, $($rest)*)); + (14, $submac:ident ! ($($rest:tt)*)) => ($submac!(15, $($rest)*)); + (15, $submac:ident ! ($($rest:tt)*)) => ($submac!(16, $($rest)*)); + (16, $submac:ident ! 
($($rest:tt)*)) => ($submac!(17, $($rest)*)); + (17, $submac:ident ! ($($rest:tt)*)) => ($submac!(18, $($rest)*)); + (18, $submac:ident ! ($($rest:tt)*)) => ($submac!(19, $($rest)*)); + (19, $submac:ident ! ($($rest:tt)*)) => ($submac!(20, $($rest)*)); + (20, $submac:ident ! ($($rest:tt)*)) => ($submac!(21, $($rest)*)); +); + +tuple_trait!(FnA A, FnB B, FnC C, FnD D, FnE E, FnF F, FnG G, FnH H, FnI I, FnJ J, FnK K, FnL L, + FnM M, FnN N, FnO O, FnP P, FnQ Q, FnR R, FnS S, FnT T, FnU U); + +// Special case: implement `TupleInfallible` for `()`, the unit type. +// This can come up in macros which accept a variable number of arguments. +// Literally, `()` is an empty tuple, so it should simply parse nothing. +impl TupleInfallible for () { + fn parse(&mut self, input: I) -> JResult { + Ok((input, ((), Vec::new()))) + } +} + +pub(crate) fn tuple_infallible>( + mut l: List, +) -> impl FnMut(I) -> JResult { + move |i: I| l.parse(i) +} + +pub(crate) fn separated_list_infallible( + mut sep: G, + mut f: F, +) -> impl FnMut(I) -> JResult> +where + I: Clone + InputLength, + F: nom::Parser, + G: nom::Parser, +{ + move |i: I| { + let mut res: Vec = Vec::new(); + let mut errors: ErrorList = Vec::new(); + + let (mut i, (o, mut err)) = unwrap_infallible(f.parse(i.clone())); + errors.append(&mut err); + res.push(o); + + loop { + let (i_sep_parsed, (_, mut err_sep)) = unwrap_infallible(sep.parse(i.clone())); + let len_before = i_sep_parsed.input_len(); + + let (i_elem_parsed, (o, mut err_elem)) = + unwrap_infallible(f.parse(i_sep_parsed.clone())); + + // infinite loop check: the parser must always consume + // if we consumed nothing here, don't produce an element. 
+ if i_elem_parsed.input_len() == len_before { + return Ok((i, (res, errors))); + } + res.push(o); + errors.append(&mut err_sep); + errors.append(&mut err_elem); + i = i_elem_parsed; + } + } +} + +pub(crate) trait Alt { + /// Tests each parser in the tuple and returns the result of the first one that succeeds + fn choice(&mut self, input: I) -> Option>; +} + +macro_rules! alt_trait( + ($first_cond:ident $first:ident, $($id_cond:ident $id: ident),+) => ( + alt_trait!(__impl $first_cond $first; $($id_cond $id),+); + ); + (__impl $($current_cond:ident $current:ident),*; $head_cond:ident $head:ident, $($id_cond:ident $id:ident),+) => ( + alt_trait_impl!($($current_cond $current),*); + + alt_trait!(__impl $($current_cond $current,)* $head_cond $head; $($id_cond $id),+); + ); + (__impl $($current_cond:ident $current:ident),*; $head_cond:ident $head:ident) => ( + alt_trait_impl!($($current_cond $current),*); + alt_trait_impl!($($current_cond $current,)* $head_cond $head); + ); +); + +macro_rules! alt_trait_impl( + ($($id_cond:ident $id:ident),+) => ( + impl< + Input: Clone, Output, + $( + // () are to make things easier on me, but I'm not entirely sure whether we can do better + // with rule E0207 + $id_cond: nom::Parser, + $id: nom::Parser + ),+ + > Alt for ( $(($id_cond, $id),)+ ) { + + fn choice(&mut self, input: Input) -> Option> { + match self.0.0.parse(input.clone()) { + Err(_) => alt_trait_inner!(1, self, input, $($id_cond $id),+), + Ok((input_left, _)) => Some(self.0.1.parse(input_left)), + } + } + } + ); +); + +macro_rules! 
alt_trait_inner( + ($it:tt, $self:expr, $input:expr, $head_cond:ident $head:ident, $($id_cond:ident $id:ident),+) => ( + match $self.$it.0.parse($input.clone()) { + Err(_) => succ!($it, alt_trait_inner!($self, $input, $($id_cond $id),+)), + Ok((input_left, _)) => Some($self.$it.1.parse(input_left)), + } + ); + ($it:tt, $self:expr, $input:expr, $head_cond:ident $head:ident) => ( + None + ); +); + +alt_trait!(A1 A, B1 B, C1 C, D1 D, E1 E, F1 F, G1 G, H1 H, I1 I, J1 J, K1 K, + L1 L, M1 M, N1 N, O1 O, P1 P, Q1 Q, R1 R, S1 S, T1 T, U1 U); + +/// An alt() like combinator. For each branch, it first tries a fallible parser, which commits to +/// this branch, or tells to check next branch, and the execute the infallible parser which follow. +/// +/// In case no branch match, the default (fallible) parser is executed. +pub(crate) fn alt_infallible>( + mut l: List, + mut default: F, +) -> impl FnMut(I) -> JResult +where + F: nom::Parser, +{ + move |i: I| l.choice(i.clone()).unwrap_or_else(|| default.parse(i)) +} diff --git a/query-grammar/src/lib.rs b/query-grammar/src/lib.rs index 9e8d0a4198..2696bf667f 100644 --- a/query-grammar/src/lib.rs +++ b/query-grammar/src/lib.rs @@ -1,19 +1,26 @@ #![allow(clippy::derive_partial_eq_without_eq)] +mod infallible; mod occur; mod query_grammar; mod user_input_ast; -use combine::parser::Parser; +pub use crate::infallible::LenientError; pub use crate::occur::Occur; -use crate::query_grammar::parse_to_ast; +use crate::query_grammar::{parse_to_ast, parse_to_ast_lenient}; pub use crate::user_input_ast::{ Delimiter, UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral, }; pub struct Error; +/// Parse a query pub fn parse_query(query: &str) -> Result { - let (user_input_ast, _remaining) = parse_to_ast().parse(query).map_err(|_| Error)?; + let (_remaining, user_input_ast) = parse_to_ast(query).map_err(|_| Error)?; Ok(user_input_ast) } + +/// Parse a query, trying to recover from syntax errors, and giving hints toward fixing errors. 
+pub fn parse_query_lenient(query: &str) -> (UserInputAst, Vec) { + parse_to_ast_lenient(query) +} diff --git a/query-grammar/src/query_grammar.rs b/query-grammar/src/query_grammar.rs index c731b08a3c..6545be01b2 100644 --- a/query-grammar/src/query_grammar.rs +++ b/query-grammar/src/query_grammar.rs @@ -1,17 +1,16 @@ -use combine::error::StringStreamError; -use combine::parser::char::{char, digit, space, spaces, string}; -use combine::parser::combinator::recognize; -use combine::parser::range::{take_while, take_while1}; -use combine::parser::repeat::escaped; -use combine::parser::Parser; -use combine::{ - any, attempt, between, choice, eof, many, many1, one_of, optional, parser, satisfy, sep_by, - skip_many1, value, +use nom::branch::alt; +use nom::bytes::complete::tag; +use nom::character::complete::{ + anychar, char, digit1, none_of, one_of, satisfy, space0, space1, u32, }; -use once_cell::sync::Lazy; -use regex::Regex; +use nom::combinator::{eof, map, map_res, opt, peek, recognize, value, verify}; +use nom::error::{Error, ErrorKind}; +use nom::multi::{many0, many1, separated_list0, separated_list1}; +use nom::sequence::{delimited, preceded, separated_pair, terminated, tuple}; +use nom::IResult; use super::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral}; +use crate::infallible::*; use crate::user_input_ast::Delimiter; use crate::Occur; @@ -20,249 +19,398 @@ use crate::Occur; const SPECIAL_CHARS: &[char] = &[ '+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ', ]; -const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|!|\\|\*|\s)"#; -/// Parses a field_name -/// A field name must have at least one character and be followed by a colon. -/// All characters are allowed including special characters `SPECIAL_CHARS`, but these -/// need to be escaped with a backslash character '\'. 
-fn field_name<'a>() -> impl Parser<&'a str, Output = String> { - static ESCAPED_SPECIAL_CHARS_RE: Lazy = - Lazy::new(|| Regex::new(ESCAPED_SPECIAL_CHARS_PATTERN).unwrap()); - - recognize::(escaped( - ( - take_while1(|c| !SPECIAL_CHARS.contains(&c) && c != '-'), - take_while(|c| !SPECIAL_CHARS.contains(&c)), +/// consume a field name followed by colon. Return the field name with escape sequence +/// already interpreted +fn field_name(i: &str) -> IResult<&str, String> { + let simple_char = none_of(SPECIAL_CHARS); + let first_char = verify(none_of(SPECIAL_CHARS), |c| *c != '-'); + let escape_sequence = || preceded(char('\\'), one_of(SPECIAL_CHARS)); + + map( + terminated( + tuple(( + alt((first_char, escape_sequence())), + many0(alt((simple_char, escape_sequence(), char('\\')))), + )), + char(':'), ), - '\\', - satisfy(|_| true), /* if the next character is not a special char, the \ will be treated - * as the \ character. */ - )) - .skip(char(':')) - .map(|s| ESCAPED_SPECIAL_CHARS_RE.replace_all(&s, "$1").to_string()) - .and_then(|s: String| match s.is_empty() { - true => Err(StringStreamError::UnexpectedParse), - _ => Ok(s), - }) -} - -fn word<'a>() -> impl Parser<&'a str, Output = String> { - ( - satisfy(|c: char| { - !c.is_whitespace() - && !['-', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')'].contains(&c) - }), - many(satisfy(|c: char| { - !c.is_whitespace() && ![':', '^', '{', '}', '"', '[', ']', '(', ')'].contains(&c) - })), - ) - .map(|(s1, s2): (char, String)| format!("{s1}{s2}")) - .and_then(|s: String| match s.as_str() { - "OR" | "AND " | "NOT" => Err(StringStreamError::UnexpectedParse), + |(first_char, next)| { + std::iter::once(first_char) + .chain(next.into_iter()) + .collect() + }, + )(i) +} + +/// Consume a word outside of any context. 
+// TODO should support escape sequences +fn word(i: &str) -> IResult<&str, &str> { + map_res( + recognize(tuple(( + satisfy(|c| { + !c.is_whitespace() + && !['-', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')'].contains(&c) + }), + many0(satisfy(|c: char| { + !c.is_whitespace() && ![':', '^', '{', '}', '"', '[', ']', '(', ')'].contains(&c) + })), + ))), + |s| match s { + "OR" | "AND" | "NOT" | "IN" => Err(Error::new(i, ErrorKind::Tag)), _ => Ok(s), - }) + }, + )(i) } -// word variant that allows more characters, e.g. for range queries that don't allow field -// specifier -fn relaxed_word<'a>() -> impl Parser<&'a str, Output = String> { - ( - satisfy(|c: char| { - !c.is_whitespace() && !['`', '{', '}', '"', '[', ']', '(', ')'].contains(&c) - }), - many(satisfy(|c: char| { +fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&str>> + '_ { + |i| { + opt_i_err( + preceded( + space0, + recognize(many1(satisfy(|c| { + !c.is_whitespace() && !delimiter.contains(c) + }))), + ), + "expected word", + )(i) + } +} + +/// Consume a word inside a Range context. More values are allowed as they are +/// not ambiguous in this context. +fn relaxed_word(i: &str) -> IResult<&str, &str> { + recognize(tuple(( + satisfy(|c| !c.is_whitespace() && !['`', '{', '}', '"', '[', ']', '(', ')'].contains(&c)), + many0(satisfy(|c: char| { !c.is_whitespace() && !['{', '}', '"', '[', ']', '(', ')'].contains(&c) })), - ) - .map(|(s1, s2): (char, String)| format!("{s1}{s2}")) + )))(i) } -/// Parses a date time according to rfc3339 -/// 2015-08-02T18:54:42+02 -/// 2021-04-13T19:46:26.266051969+00:00 -/// -/// NOTE: also accepts 999999-99-99T99:99:99.266051969+99:99 -/// We delegate rejecting such invalid dates to the logical AST computation code -/// which invokes `time::OffsetDateTime::parse(..., &Rfc3339)` on the value to actually parse -/// it (instead of merely extracting the datetime value as string as done here). 
-fn date_time<'a>() -> impl Parser<&'a str, Output = String> { - let two_digits = || recognize::((digit(), digit())); - - // Parses a time zone - // -06:30 - // Z - let time_zone = { - let utc = recognize::(char('Z')); - let offset = recognize(( - choice([char('-'), char('+')]), - two_digits(), - char(':'), - two_digits(), - )); +fn negative_number(i: &str) -> IResult<&str, &str> { + recognize(preceded( + char('-'), + tuple((digit1, opt(tuple((char('.'), digit1))))), + ))(i) +} - utc.or(offset) +fn simple_term(i: &str) -> IResult<&str, (Delimiter, String)> { + let escaped_string = |delimiter| { + // we need this because none_of can't accept an owned array of char. + let not_delimiter = verify(anychar, move |parsed| *parsed != delimiter); + map( + delimited( + char(delimiter), + many0(alt((preceded(char('\\'), anychar), not_delimiter))), + char(delimiter), + ), + |res| res.into_iter().collect::(), + ) }; - // Parses a date - // 2010-01-30 - let date = { - recognize::(( - many1::(digit()), - char('-'), - two_digits(), - char('-'), - two_digits(), - )) - }; + let negative_number = map(negative_number, |number| { + (Delimiter::None, number.to_string()) + }); + let double_quotes = map(escaped_string('"'), |phrase| { + (Delimiter::DoubleQuotes, phrase) + }); + let simple_quotes = map(escaped_string('\''), |phrase| { + (Delimiter::SingleQuotes, phrase) + }); + let text_no_delimiter = map(word, |text| (Delimiter::None, text.to_string())); + + alt(( + negative_number, + simple_quotes, + double_quotes, + text_no_delimiter, + ))(i) +} - // Parses a time - // 12:30:02 - // 19:46:26.266051969 - let time = { - recognize::(( - two_digits(), - char(':'), - two_digits(), - char(':'), - two_digits(), - optional((char('.'), many1::(digit()))), - time_zone, - )) - }; +fn simple_term_infallible( + delimiter: &str, +) -> impl Fn(&str) -> JResult<&str, Option<(Delimiter, String)>> + '_ { + |i| { + let escaped_string = |delimiter| { + // we need this because none_of can't accept an owned 
array of char. + let not_delimiter = verify(anychar, move |parsed| *parsed != delimiter); + map( + delimited_infallible( + nothing, + opt_i(many0(alt((preceded(char('\\'), anychar), not_delimiter)))), + opt_i_err(char(delimiter), format!("missing delimiter \\{delimiter}")), + ), + |(res, err)| { + // many0 can't fail + (res.unwrap().into_iter().collect::(), err) + }, + ) + }; - recognize((date, char('T'), time)) + let double_quotes = map(escaped_string('"'), |(phrase, errors)| { + (Some((Delimiter::DoubleQuotes, phrase)), errors) + }); + let simple_quotes = map(escaped_string('\''), |(phrase, errors)| { + (Some((Delimiter::SingleQuotes, phrase)), errors) + }); + + alt_infallible( + ( + (value((), char('"')), double_quotes), + (value((), char('\'')), simple_quotes), + ), + // numbers are parsed with words in this case, as we allow string starting with a - + map(word_infallible(delimiter), |(text, errors)| { + (text.map(|text| (Delimiter::None, text.to_string())), errors) + }), + )(i) + } } -fn escaped_character<'a>() -> impl Parser<&'a str, Output = char> { - (char('\\'), any()).map(|(_, x)| x) +fn term_or_phrase(i: &str) -> IResult<&str, UserInputLeaf> { + map( + tuple((simple_term, fallible(slop_or_prefix_val))), + |((delimiter, phrase), (slop, prefix))| { + UserInputLiteral { + field_name: None, + phrase, + delimiter, + slop, + prefix, + } + .into() + }, + )(i) } -fn escaped_string<'a>(delimiter: char) -> impl Parser<&'a str, Output = String> { - ( - char(delimiter), - many(choice(( - escaped_character(), - satisfy(move |c: char| c != delimiter), - ))), - char(delimiter), - ) - .map(|(_, s, _)| s) -} - -fn term_val<'a>() -> impl Parser<&'a str, Output = (Delimiter, String)> { - let double_quotes = escaped_string('"').map(|phrase| (Delimiter::DoubleQuotes, phrase)); - let single_quotes = escaped_string('\'').map(|phrase| (Delimiter::SingleQuotes, phrase)); - let text_no_delimiter = word().map(|text| (Delimiter::None, text)); - negative_number() - 
.map(|negative_number_str| (Delimiter::None, negative_number_str)) - .or(double_quotes) - .or(single_quotes) - .or(text_no_delimiter) -} - -fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> { - (field_name(), term_val(), slop_or_prefix_val()).map( - |(field_name, (delimiter, phrase), (slop, prefix))| UserInputLiteral { - field_name: Some(field_name), - phrase, - delimiter, - slop, - prefix, +fn term_or_phrase_infallible(i: &str) -> JResult<&str, Option> { + map( + // ~* for slop/prefix, ) inside group or ast tree, ^ if boost + tuple_infallible((simple_term_infallible("*)^"), slop_or_prefix_val)), + |((delimiter_phrase, (slop, prefix)), errors)| { + let leaf = if let Some((delimiter, phrase)) = delimiter_phrase { + Some( + UserInputLiteral { + field_name: None, + phrase, + delimiter, + slop, + prefix, + } + .into(), + ) + } else if slop != 0 { + Some( + UserInputLiteral { + field_name: None, + phrase: "".to_string(), + delimiter: Delimiter::None, + slop, + prefix, + } + .into(), + ) + } else { + None + }; + (leaf, errors) }, - ) + )(i) } -fn slop_or_prefix_val<'a>() -> impl Parser<&'a str, Output = (u32, bool)> { - let prefix_val = char('*').map(|_ast| (0, true)); - let slop_val = slop_val().map(|slop| (slop, false)); +fn term_group(i: &str) -> IResult<&str, UserInputAst> { + let occur_symbol = alt(( + value(Occur::MustNot, char('-')), + value(Occur::Must, char('+')), + )); + + map( + tuple(( + terminated(field_name, space0), + delimited( + tuple((char('('), space0)), + separated_list0(space1, tuple((opt(occur_symbol), term_or_phrase))), + char(')'), + ), + )), + |(field_name, terms)| { + UserInputAst::Clause( + terms + .into_iter() + .map(|(occur, leaf)| (occur, leaf.set_field(Some(field_name.clone())).into())) + .collect(), + ) + }, + )(i) +} - prefix_val.or(slop_val) +// this is a precondition for term_group_infallible. Without it, term_group_infallible can fail +// with a panic. It does not consume its input. 
+fn term_group_precond(i: &str) -> IResult<&str, (), ()> { + value( + (), + peek(tuple(( + field_name, + space0, + char('('), // when we are here, we know it can't be anything but a term group + ))), + )(i) + .map_err(|e| e.map(|_| ())) } -fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> { - let slop = - (char('~'), many1(digit())).and_then(|(_, slop): (_, String)| match slop.parse::() { - Ok(d) => Ok(d), - _ => Err(StringStreamError::UnexpectedParse), - }); - optional(slop).map(|slop| match slop { - Some(d) => d, - _ => 0, - }) +fn term_group_infallible(i: &str) -> JResult<&str, UserInputAst> { + let (mut i, (field_name, _, _, _)) = + tuple((field_name, space0, char('('), space0))(i).expect("precondition failed"); + + let mut terms = Vec::new(); + let mut errs = Vec::new(); + + let mut first_round = true; + loop { + let mut space_error = if first_round { + first_round = false; + Vec::new() + } else { + let (rest, (_, err)) = space1_infallible(i)?; + i = rest; + err + }; + if i.is_empty() { + errs.push(LenientErrorInternal { + pos: i.len(), + message: "missing )".to_string(), + }); + break Ok((i, (UserInputAst::Clause(terms), errs))); + } + if let Some(i) = i.strip_prefix(')') { + break Ok((i, (UserInputAst::Clause(terms), errs))); + } + // only append missing space error if we did not reach the end of group + errs.append(&mut space_error); + + // here we do the assumption term_or_phrase_infallible always consume something if the + // first byte is not `)` or ' '. If it did not, we would end up looping. 
+ + let (rest, ((occur, leaf), mut err)) = + tuple_infallible((occur_symbol, term_or_phrase_infallible))(i)?; + errs.append(&mut err); + if let Some(leaf) = leaf { + terms.push((occur, leaf.set_field(Some(field_name.clone())).into())); + } + i = rest; + } } -fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> { - let term_default_field = - (term_val(), slop_or_prefix_val()).map(|((delimiter, phrase), (slop, prefix))| { - UserInputLiteral { - field_name: None, - phrase, - delimiter, - slop, - prefix, - } - }); +fn literal(i: &str) -> IResult<&str, UserInputAst> { + alt(( + map( + tuple((opt(field_name), alt((range, set, term_or_phrase)))), + |(field_name, leaf): (Option, UserInputLeaf)| leaf.set_field(field_name).into(), + ), + term_group, + ))(i) +} - attempt(term_query()) - .or(term_default_field) - .map(UserInputLeaf::from) +fn literal_no_group_infallible(i: &str) -> JResult<&str, Option> { + map( + tuple_infallible(( + opt_i(field_name), + space0_infallible, + alt_infallible( + ( + ( + value((), tuple((tag("IN"), space0, char('[')))), + map(set_infallible, |(set, errs)| (Some(set), errs)), + ), + ( + value((), peek(one_of("{[><"))), + map(range_infallible, |(range, errs)| (Some(range), errs)), + ), + ), + delimited_infallible(space0_infallible, term_or_phrase_infallible, nothing), + ), + )), + |((field_name, _, leaf), mut errors)| { + ( + leaf.map(|leaf| { + if matches!(&leaf, UserInputLeaf::Literal(literal) + if literal.phrase.contains(':') && literal.delimiter == Delimiter::None) + && field_name.is_none() + { + errors.push(LenientErrorInternal { + pos: i.len(), + message: "parsed possible invalid field as term".to_string(), + }); + } + if matches!(&leaf, UserInputLeaf::Literal(literal) + if literal.phrase == "NOT" && literal.delimiter == Delimiter::None) + && field_name.is_none() + { + errors.push(LenientErrorInternal { + pos: i.len(), + message: "parsed keyword NOT as term. 
It should be quoted".to_string(), + }); + } + leaf.set_field(field_name).into() + }), + errors, + ) + }, + )(i) } -fn negative_number<'a>() -> impl Parser<&'a str, Output = String> { - ( - char('-'), - many1(digit()), - optional((char('.'), many1(digit()))), - ) - .map(|(s1, s2, s3): (char, String, Option<(char, String)>)| { - if let Some(('.', s3)) = s3 { - format!("{s1}{s2}.{s3}") - } else { - format!("{s1}{s2}") - } - }) +fn literal_infallible(i: &str) -> JResult<&str, Option> { + alt_infallible( + (( + term_group_precond, + map(term_group_infallible, |(group, errs)| (Some(group), errs)), + ),), + literal_no_group_infallible, + )(i) } -fn spaces1<'a>() -> impl Parser<&'a str, Output = ()> { - skip_many1(space()) +fn slop_or_prefix_val(i: &str) -> JResult<&str, (u32, bool)> { + map( + opt_i(alt(( + value((0, true), char('*')), + map(preceded(char('~'), u32), |slop| (slop, false)), + ))), + |(slop_or_prefix_opt, err)| (slop_or_prefix_opt.unwrap_or_default(), err), + )(i) } /// Function that parses a range out of a Stream /// Supports ranges like: /// [5 TO 10], {5 TO 10}, [* TO 10], [10 TO *], {10 TO *], >5, <=10 /// [a TO *], [a TO c], [abc TO bcd} -fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> { +fn range(i: &str) -> IResult<&str, UserInputLeaf> { let range_term_val = || { - attempt(date_time()) - .or(negative_number()) - .or(relaxed_word()) - .or(char('*').with(value("*".to_string()))) + map( + alt((negative_number, relaxed_word, tag("*"))), + ToString::to_string, + ) }; // check for unbounded range in the form of <5, <=10, >5, >=5 - let elastic_unbounded_range = ( - choice([ - attempt(string(">=")), - attempt(string("<=")), - attempt(string("<")), - attempt(string(">")), - ]) - .skip(spaces()), - range_term_val(), - ) - .map( - |(comparison_sign, bound): (&str, String)| match comparison_sign { - ">=" => (UserInputBound::Inclusive(bound), UserInputBound::Unbounded), - "<=" => (UserInputBound::Unbounded, UserInputBound::Inclusive(bound)), - "<" 
=> (UserInputBound::Unbounded, UserInputBound::Exclusive(bound)), - ">" => (UserInputBound::Exclusive(bound), UserInputBound::Unbounded), - // default case - _ => (UserInputBound::Unbounded, UserInputBound::Unbounded), - }, - ); - let lower_bound = (one_of("{[".chars()), range_term_val()).map( - |(boundary_char, lower_bound): (char, String)| { + let elastic_unbounded_range = map( + tuple(( + preceded(space0, alt((tag(">="), tag("<="), tag("<"), tag(">")))), + preceded(space0, range_term_val()), + )), + |(comparison_sign, bound)| match comparison_sign { + ">=" => (UserInputBound::Inclusive(bound), UserInputBound::Unbounded), + "<=" => (UserInputBound::Unbounded, UserInputBound::Inclusive(bound)), + "<" => (UserInputBound::Unbounded, UserInputBound::Exclusive(bound)), + ">" => (UserInputBound::Exclusive(bound), UserInputBound::Unbounded), + // unreachable case + _ => (UserInputBound::Unbounded, UserInputBound::Unbounded), + }, + ); + + let lower_bound = map( + separated_pair(one_of("{["), space0, range_term_val()), + |(boundary_char, lower_bound)| { if lower_bound == "*" { UserInputBound::Unbounded } else if boundary_char == '{' { @@ -272,120 +420,328 @@ fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> { } }, ); - let upper_bound = (range_term_val(), one_of("}]".chars())).map( - |(higher_bound, boundary_char): (String, char)| { - if higher_bound == "*" { + + let upper_bound = map( + separated_pair(range_term_val(), space0, one_of("}]")), + |(upper_bound, boundary_char)| { + if upper_bound == "*" { UserInputBound::Unbounded } else if boundary_char == '}' { - UserInputBound::Exclusive(higher_bound) + UserInputBound::Exclusive(upper_bound) } else { - UserInputBound::Inclusive(higher_bound) + UserInputBound::Inclusive(upper_bound) } }, ); - // return only lower and upper - let lower_to_upper = ( - lower_bound.skip((spaces(), string("TO"), spaces())), - upper_bound, - ); - ( - optional(field_name()).skip(spaces()), - // try elastic first, if it matches, the 
range is unbounded - attempt(elastic_unbounded_range).or(lower_to_upper), - ) - .map(|(field, (lower, upper))| - // Construct the leaf from extracted field (optional) - // and bounds - UserInputLeaf::Range { - field, - lower, - upper - }) -} - -/// Function that parses a set out of a Stream -/// Supports ranges like: `IN [val1 val2 val3]` -fn set<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> { - let term_list = between( - char('['), - char(']'), - sep_by(term_val().map(|(_delimiter, text)| text), spaces()), + let lower_to_upper = + separated_pair(lower_bound, tuple((space1, tag("TO"), space1)), upper_bound); + + map( + alt((elastic_unbounded_range, lower_to_upper)), + |(lower, upper)| UserInputLeaf::Range { + field: None, + lower, + upper, + }, + )(i) +} + +fn range_infallible(i: &str) -> JResult<&str, UserInputLeaf> { + let lower_to_upper = map( + tuple_infallible(( + opt_i(anychar), + space0_infallible, + word_infallible("]}"), + space1_infallible, + opt_i_err( + terminated(tag("TO"), alt((value((), space1), value((), eof)))), + "missing keyword TO", + ), + word_infallible("]}"), + opt_i_err(one_of("]}"), "missing range delimiter"), + )), + |((lower_bound_kind, _space0, lower, _space1, to, upper, upper_bound_kind), errs)| { + let lower_bound = match (lower_bound_kind, lower) { + (_, Some("*")) => UserInputBound::Unbounded, + (_, None) => UserInputBound::Unbounded, + // if it is some, TO was actually the bound (i.e. 
[TO TO something]) + (_, Some("TO")) if to.is_none() => UserInputBound::Unbounded, + (Some('['), Some(bound)) => UserInputBound::Inclusive(bound.to_string()), + (Some('{'), Some(bound)) => UserInputBound::Exclusive(bound.to_string()), + _ => unreachable!("precondition failed, range did not start with [ or {{"), + }; + let upper_bound = match (upper_bound_kind, upper) { + (_, Some("*")) => UserInputBound::Unbounded, + (_, None) => UserInputBound::Unbounded, + (Some(']'), Some(bound)) => UserInputBound::Inclusive(bound.to_string()), + (Some('}'), Some(bound)) => UserInputBound::Exclusive(bound.to_string()), + // the end is missing, assume this is an inclusive bound + (_, Some(bound)) => UserInputBound::Inclusive(bound.to_string()), + }; + ((lower_bound, upper_bound), errs) + }, ); - let set_content = ((string("IN"), spaces()), term_list).map(|(_, elements)| elements); + map( + alt_infallible( + ( + ( + value((), tag(">=")), + map(word_infallible(""), |(bound, err)| { + ( + ( + bound + .map(|bound| UserInputBound::Inclusive(bound.to_string())) + .unwrap_or(UserInputBound::Unbounded), + UserInputBound::Unbounded, + ), + err, + ) + }), + ), + ( + value((), tag("<=")), + map(word_infallible(""), |(bound, err)| { + ( + ( + UserInputBound::Unbounded, + bound + .map(|bound| UserInputBound::Inclusive(bound.to_string())) + .unwrap_or(UserInputBound::Unbounded), + ), + err, + ) + }), + ), + ( + value((), tag(">")), + map(word_infallible(""), |(bound, err)| { + ( + ( + bound + .map(|bound| UserInputBound::Exclusive(bound.to_string())) + .unwrap_or(UserInputBound::Unbounded), + UserInputBound::Unbounded, + ), + err, + ) + }), + ), + ( + value((), tag("<")), + map(word_infallible(""), |(bound, err)| { + ( + ( + UserInputBound::Unbounded, + bound + .map(|bound| UserInputBound::Exclusive(bound.to_string())) + .unwrap_or(UserInputBound::Unbounded), + ), + err, + ) + }), + ), + ), + lower_to_upper, + ), + |((lower, upper), errors)| { + ( + UserInputLeaf::Range { + field: None, + 
lower, + upper, + }, + errors, + ) + }, + )(i) +} + +fn set(i: &str) -> IResult<&str, UserInputLeaf> { + map( + preceded( + tuple((space0, tag("IN"), space1)), + delimited( + tuple((char('['), space0)), + separated_list0(space1, map(simple_term, |(_, term)| term)), + char(']'), + ), + ), + |elements| UserInputLeaf::Set { + field: None, + elements, + }, + )(i) +} - (optional(attempt(field_name().skip(spaces()))), set_content) - .map(|(field, elements)| UserInputLeaf::Set { field, elements }) +fn set_infallible(mut i: &str) -> JResult<&str, UserInputLeaf> { + // `IN [` has already been parsed when we enter, we only need to parse simple terms until we + // find a `]` + let mut elements = Vec::new(); + let mut errs = Vec::new(); + let mut first_round = true; + loop { + let mut space_error = if first_round { + first_round = false; + Vec::new() + } else { + let (rest, (_, err)) = space1_infallible(i)?; + i = rest; + err + }; + if i.is_empty() { + // TODO push error about missing ] + // + errs.push(LenientErrorInternal { + pos: i.len(), + message: "missing ]".to_string(), + }); + let res = UserInputLeaf::Set { + field: None, + elements, + }; + return Ok((i, (res, errs))); + } + if let Some(i) = i.strip_prefix(']') { + let res = UserInputLeaf::Set { + field: None, + elements, + }; + return Ok((i, (res, errs))); + } + errs.append(&mut space_error); + // TODO + // here we do the assumption term_or_phrase_infallible always consume something if the + // first byte is not `)` or ' '. If it did not, we would end up looping. 
+ + let (rest, (delim_term, mut err)) = simple_term_infallible("]")(i)?; + errs.append(&mut err); + if let Some((_, term)) = delim_term { + elements.push(term); + } + i = rest; + } } fn negate(expr: UserInputAst) -> UserInputAst { expr.unary(Occur::MustNot) } -fn leaf<'a>() -> impl Parser<&'a str, Output = UserInputAst> { - parser(|input| { - char('(') - .with(ast()) - .skip(char(')')) - .or(char('*').map(|_| UserInputAst::from(UserInputLeaf::All))) - .or(attempt( - string("NOT").skip(spaces1()).with(leaf()).map(negate), - )) - .or(attempt(range().map(UserInputAst::from))) - .or(attempt(set().map(UserInputAst::from))) - .or(literal().map(UserInputAst::from)) - .parse_stream(input) - .into_result() - }) -} - -fn occur_symbol<'a>() -> impl Parser<&'a str, Output = Occur> { - char('-') - .map(|_| Occur::MustNot) - .or(char('+').map(|_| Occur::Must)) -} - -fn occur_leaf<'a>() -> impl Parser<&'a str, Output = (Option, UserInputAst)> { - (optional(occur_symbol()), boosted_leaf()) -} - -fn positive_float_number<'a>() -> impl Parser<&'a str, Output = f64> { - (many1(digit()), optional((char('.'), many1(digit())))).map( - |(int_part, decimal_part_opt): (String, Option<(char, String)>)| { - let mut float_str = int_part; - if let Some((chr, decimal_str)) = decimal_part_opt { - float_str.push(chr); - float_str.push_str(&decimal_str); +fn leaf(i: &str) -> IResult<&str, UserInputAst> { + alt(( + delimited(char('('), ast, char(')')), + map(char('*'), |_| UserInputAst::from(UserInputLeaf::All)), + map(preceded(tuple((tag("NOT"), space1)), leaf), negate), + literal, + ))(i) +} + +fn leaf_infallible(i: &str) -> JResult<&str, Option> { + alt_infallible( + ( + ( + value((), char('(')), + map( + delimited_infallible( + nothing, + ast_infallible, + opt_i_err(char(')'), "expected ')'"), + ), + |(ast, errs)| (Some(ast), errs), + ), + ), + ( + value((), char('*')), + map(nothing, |_| { + (Some(UserInputAst::from(UserInputLeaf::All)), Vec::new()) + }), + ), + ( + value((), tag("NOT ")), + 
delimited_infallible( + space0_infallible, + map(leaf_infallible, |(res, err)| (res.map(negate), err)), + nothing, + ), + ), + ), + literal_infallible, + )(i) +} + +fn positive_float_number(i: &str) -> IResult<&str, f64> { + map( + recognize(tuple((digit1, opt(tuple((char('.'), digit1)))))), + // TODO this is actually dangerous if the number is actually not representable as a f64 + // (too big for instance) + |float_str: &str| float_str.parse::().unwrap(), + )(i) +} + +fn boost(i: &str) -> JResult<&str, Option> { + opt_i(preceded(char('^'), positive_float_number))(i) +} + +fn boosted_leaf(i: &str) -> IResult<&str, UserInputAst> { + map( + tuple((leaf, fallible(boost))), + |(leaf, boost_opt)| match boost_opt { + Some(boost) if (boost - 1.0).abs() > f64::EPSILON => { + UserInputAst::Boost(Box::new(leaf), boost) } - float_str.parse::().unwrap() + _ => leaf, }, - ) + )(i) } -fn boost<'a>() -> impl Parser<&'a str, Output = f64> { - (char('^'), positive_float_number()).map(|(_, boost)| boost) +fn boosted_leaf_infallible(i: &str) -> JResult<&str, Option> { + map( + tuple_infallible((leaf_infallible, boost)), + |((leaf, boost_opt), error)| match boost_opt { + Some(boost) if (boost - 1.0).abs() > f64::EPSILON => ( + leaf.map(|leaf| UserInputAst::Boost(Box::new(leaf), boost)), + error, + ), + _ => (leaf, error), + }, + )(i) } -fn boosted_leaf<'a>() -> impl Parser<&'a str, Output = UserInputAst> { - (leaf(), optional(boost())).map(|(leaf, boost_opt)| match boost_opt { - Some(boost) if (boost - 1.0).abs() > f64::EPSILON => { - UserInputAst::Boost(Box::new(leaf), boost) - } - _ => leaf, - }) +fn occur_symbol(i: &str) -> JResult<&str, Option> { + opt_i(alt(( + value(Occur::MustNot, char('-')), + value(Occur::Must, char('+')), + )))(i) +} + +fn occur_leaf(i: &str) -> IResult<&str, (Option, UserInputAst)> { + tuple((fallible(occur_symbol), boosted_leaf))(i) +} + +#[allow(clippy::type_complexity)] +fn operand_occur_leaf_infallible( + i: &str, +) -> JResult<&str, (Option, Option, 
Option)> { + // TODO maybe this should support multiple chained AND/OR, and "fuse" them? + tuple_infallible(( + delimited_infallible(nothing, opt_i(binary_operand), space0_infallible), + occur_symbol, + boosted_leaf_infallible, + ))(i) } -#[derive(Clone, Copy)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] enum BinaryOperand { Or, And, } -fn binary_operand<'a>() -> impl Parser<&'a str, Output = BinaryOperand> { - string("AND") - .with(value(BinaryOperand::And)) - .or(string("OR").with(value(BinaryOperand::Or))) +fn binary_operand(i: &str) -> IResult<&str, BinaryOperand> { + alt(( + value(BinaryOperand::And, tag("AND ")), + value(BinaryOperand::Or, tag("OR ")), + ))(i) } fn aggregate_binary_expressions( @@ -413,38 +769,197 @@ fn aggregate_binary_expressions( } } -fn operand_leaf<'a>() -> impl Parser<&'a str, Output = (BinaryOperand, UserInputAst)> { - ( - binary_operand().skip(spaces()), - boosted_leaf().skip(spaces()), - ) -} +fn aggregate_infallible_expressions( + input_leafs: Vec<(Option, Option, Option)>, +) -> (UserInputAst, ErrorList) { + let mut err = Vec::new(); + let mut leafs: Vec<(_, _, UserInputAst)> = input_leafs + .into_iter() + .filter_map(|(operand, occur, ast)| ast.map(|ast| (operand, occur, ast))) + .collect(); + if leafs.is_empty() { + return (UserInputAst::empty_query(), err); + } + + let use_operand = leafs.iter().any(|(operand, _, _)| operand.is_some()); + let all_operand = leafs + .iter() + .skip(1) + .all(|(operand, _, _)| operand.is_some()); + let early_operand = leafs + .iter() + .take(1) + .all(|(operand, _, _)| operand.is_some()); + let use_occur = leafs.iter().any(|(_, occur, _)| occur.is_some()); + + if use_operand && use_occur { + err.push(LenientErrorInternal { + pos: 0, + message: "Use of mixed occur and boolean operator".to_string(), + }); + } + + if use_operand && !all_operand { + err.push(LenientErrorInternal { + pos: 0, + message: "Missing boolean operator".to_string(), + }); + } -pub fn ast<'a>() -> impl Parser<&'a str, Output = 
UserInputAst> { - let boolean_expr = (boosted_leaf().skip(spaces()), many1(operand_leaf())) - .map(|(left, right)| aggregate_binary_expressions(left, right)); - let whitespace_separated_leaves = many1(occur_leaf().skip(spaces().silent())).map( - |subqueries: Vec<(Option, UserInputAst)>| { - if subqueries.len() == 1 { - let (occur_opt, ast) = subqueries.into_iter().next().unwrap(); - match occur_opt.unwrap_or(Occur::Should) { - Occur::Must | Occur::Should => ast, - Occur::MustNot => UserInputAst::Clause(vec![(Some(Occur::MustNot), ast)]), + if early_operand { + err.push(LenientErrorInternal { + pos: 0, + message: "Found unexpeted boolean operator before term".to_string(), + }); + } + + let mut clauses: Vec, UserInputAst)>> = vec![]; + for ((prev_operator, occur, ast), (next_operator, _, _)) in + leafs.iter().zip(leafs.iter().skip(1)) + { + match prev_operator { + Some(BinaryOperand::And) => { + if let Some(last) = clauses.last_mut() { + last.push((occur.or(Some(Occur::Must)), ast.clone())); + } else { + let mut last = Vec::new(); + last.push((occur.or(Some(Occur::Must)), ast.clone())); + clauses.push(last); } + } + Some(BinaryOperand::Or) => { + let default_op = match next_operator { + Some(BinaryOperand::And) => Some(Occur::Must), + _ => Some(Occur::Should), + }; + clauses.push(vec![(occur.or(default_op), ast.clone())]); + } + None => { + let default_op = match next_operator { + Some(BinaryOperand::And) => Some(Occur::Must), + Some(BinaryOperand::Or) => Some(Occur::Should), + None => None, + }; + clauses.push(vec![(occur.or(default_op), ast.clone())]) + } + } + } + + // leaf isn't empty, so we can unwrap + let (last_operator, last_occur, last_ast) = leafs.pop().unwrap(); + match last_operator { + Some(BinaryOperand::And) => { + if let Some(last) = clauses.last_mut() { + last.push((last_occur.or(Some(Occur::Must)), last_ast)); } else { - UserInputAst::Clause(subqueries.into_iter().collect()) + let mut last = Vec::new(); + last.push((last_occur.or(Some(Occur::Must)), 
last_ast)); + clauses.push(last); } + } + Some(BinaryOperand::Or) => { + clauses.push(vec![(last_occur.or(Some(Occur::Should)), last_ast)]); + } + None => clauses.push(vec![(last_occur, last_ast)]), + } + + if clauses.len() == 1 { + let mut clause = clauses.pop().unwrap(); + if clause.len() == 1 && clause[0].0 != Some(Occur::MustNot) { + (clause.pop().unwrap().1, err) + } else { + (UserInputAst::Clause(clause), err) + } + } else { + let mut final_clauses: Vec<(Option, UserInputAst)> = Vec::new(); + for mut sub_clauses in clauses { + if sub_clauses.len() == 1 { + final_clauses.push(sub_clauses.pop().unwrap()); + } else { + final_clauses.push((Some(Occur::Should), UserInputAst::Clause(sub_clauses))); + } + } + + (UserInputAst::Clause(final_clauses), err) + } +} + +fn operand_leaf(i: &str) -> IResult<&str, (BinaryOperand, UserInputAst)> { + tuple(( + terminated(binary_operand, space0), + terminated(boosted_leaf, space0), + ))(i) +} + +fn ast(i: &str) -> IResult<&str, UserInputAst> { + let boolean_expr = map( + separated_pair(boosted_leaf, space1, many1(operand_leaf)), + |(left, right)| aggregate_binary_expressions(left, right), + ); + let whitespace_separated_leaves = map(separated_list1(space1, occur_leaf), |subqueries| { + if subqueries.len() == 1 { + let (occur_opt, ast) = subqueries.into_iter().next().unwrap(); + match occur_opt.unwrap_or(Occur::Should) { + Occur::Must | Occur::Should => ast, + Occur::MustNot => UserInputAst::Clause(vec![(Some(Occur::MustNot), ast)]), + } + } else { + UserInputAst::Clause(subqueries.into_iter().collect()) + } + }); + + delimited( + space0, + alt((boolean_expr, whitespace_separated_leaves)), + space0, + )(i) +} + +fn ast_infallible(i: &str) -> JResult<&str, UserInputAst> { + // ast() parse either `term AND term OR term` or `+term term -term` + // both are locally ambiguous, and as we allow error, it's hard to permit backtracking. + // Instead, we allow a mix of both syntaxes, trying to make sense of what a user meant. 
+ // For instance `term OR -term` is interpreted as `*term -term`, but `term AND -term` + // is interpreted as `+term -term`. We also allow `AND term` to make things easier for us, + // even if it's not very sensical. + + let expression = map( + separated_list_infallible(space1_infallible, operand_occur_leaf_infallible), + |(leaf, mut err)| { + let (res, mut err2) = aggregate_infallible_expressions(leaf); + err.append(&mut err2); + (res, err) }, ); - let expr = attempt(boolean_expr).or(whitespace_separated_leaves); - spaces().with(expr).skip(spaces()) + + delimited_infallible(space0_infallible, expression, space0_infallible)(i) } -pub fn parse_to_ast<'a>() -> impl Parser<&'a str, Output = UserInputAst> { - spaces() - .with(optional(ast()).skip(eof())) - .map(|opt_ast| opt_ast.unwrap_or_else(UserInputAst::empty_query)) - .map(rewrite_ast) +pub fn parse_to_ast(i: &str) -> IResult<&str, UserInputAst> { + map(delimited(space0, opt(ast), eof), |opt_ast| { + rewrite_ast(opt_ast.unwrap_or_else(UserInputAst::empty_query)) + })(i) +} + +pub fn parse_to_ast_lenient(query_str: &str) -> (UserInputAst, Vec) { + if query_str.trim().is_empty() { + return (UserInputAst::Clause(Vec::new()), Vec::new()); + } + let (left, (res, mut errors)) = ast_infallible(query_str).unwrap(); + if !left.trim().is_empty() { + errors.push(LenientErrorInternal { + pos: left.len(), + message: "unparsed end of query".to_string(), + }) + } + + // convert end-based index to start-based index. 
+ let errors = errors + .into_iter() + .map(|internal_error| LenientError::from_internal(internal_error, query_str.len())) + .collect(); + + (rewrite_ast(res), errors) } /// Removes unnecessary children clauses in AST @@ -470,11 +985,6 @@ fn rewrite_ast_clause(input: &mut (Option, UserInputAst)) { #[cfg(test)] mod test { - - type TestParseResult = Result<(), StringStreamError>; - - use combine::parser::Parser; - use super::*; pub fn nearly_equals(a: f64, b: f64) -> bool { @@ -488,42 +998,44 @@ mod test { ); } - #[test] - fn test_occur_symbol() -> TestParseResult { - assert_eq!(super::occur_symbol().parse("-")?, (Occur::MustNot, "")); - assert_eq!(super::occur_symbol().parse("+")?, (Occur::Must, "")); - Ok(()) - } + // TODO test as part of occur_leaf + // #[test] + // fn test_occur_symbol() -> TestParseResult { + // assert_eq!(super::occur_symbol("-")?, ("", Occur::MustNot)); + // assert_eq!(super::occur_symbol("+")?, ("", Occur::Must)); + // Ok(()) + // } #[test] fn test_positive_float_number() { fn valid_parse(float_str: &str, expected_val: f64, expected_remaining: &str) { - let (val, remaining) = positive_float_number().parse(float_str).unwrap(); + let (remaining, val) = positive_float_number(float_str).unwrap(); assert_eq!(remaining, expected_remaining); assert_nearly_equals(val, expected_val); } fn error_parse(float_str: &str) { - assert!(positive_float_number().parse(float_str).is_err()); + assert!(positive_float_number(float_str).is_err()); } valid_parse("1.0", 1.0, ""); valid_parse("1", 1.0, ""); valid_parse("0.234234 aaa", 0.234234f64, " aaa"); error_parse(".3332"); - error_parse("1."); + // TODO trinity-1686a: I disagree that it should fail, I think it should succeeed, + // consuming only "1", and leave "." 
for the next thing (which will likely fail then) + // error_parse("1."); error_parse("-1."); } #[test] fn test_date_time() { - let (val, remaining) = date_time() - .parse("2015-08-02T18:54:42+02:30") - .expect("cannot parse date"); + let (remaining, val) = + relaxed_word("2015-08-02T18:54:42+02:30").expect("cannot parse date"); assert_eq!(val, "2015-08-02T18:54:42+02:30"); assert_eq!(remaining, ""); - assert!(date_time().parse("2015-08-02T18:54:42+02").is_err()); + // this isn't a valid date, but relaxed_word allows it. + // assert!(date_time().parse("2015-08-02T18:54:42+02").is_err()); - let (val, remaining) = date_time() - .parse("2021-04-13T19:46:26.266051969+00:00") + let (remaining, val) = relaxed_word("2021-04-13T19:46:26.266051969+00:00") .expect("cannot parse fractional date"); assert_eq!(val, "2021-04-13T19:46:26.266051969+00:00"); assert_eq!(remaining, ""); @@ -531,13 +1043,30 @@ mod test { #[track_caller] fn test_parse_query_to_ast_helper(query: &str, expected: &str) { - let query = parse_to_ast().parse(query).unwrap().0; - let query_str = format!("{query:?}"); - assert_eq!(query_str, expected); + let query_strict = parse_to_ast(query).unwrap().1; + let query_strict_str = format!("{query_strict:?}"); + assert_eq!(query_strict_str, expected, "strict parser failed"); + + let (query_lenient, errs) = parse_to_ast_lenient(query); + let query_lenient_str = format!("{query_lenient:?}"); + assert_eq!(query_lenient_str, expected, "lenient parser failed"); + assert!( + errs.is_empty(), + "lenient parser returned errors on valid query: {errs:?}" + ); } - fn test_is_parse_err(query: &str) { - assert!(parse_to_ast().parse(query).is_err()); + #[track_caller] + fn test_is_parse_err(query: &str, lenient_expected: &str) { + assert!( + parse_to_ast(query).is_err(), + "strict parser succeeded where an error was expected." 
+ ); + + let (query_lenient, errs) = parse_to_ast_lenient(query); + let query_lenient_str = format!("{query_lenient:?}"); + assert_eq!(query_lenient_str, lenient_expected, "lenient parser failed"); + assert!(!errs.is_empty()); } #[test] @@ -553,20 +1082,25 @@ mod test { test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded"); } + #[test] + fn test_parse_query_lenient_unfinished_quote() { + test_is_parse_err("\"www-form-encoded", "\"www-form-encoded\""); + // TODO strict parser default to parsing a normal term, and parse "'www-forme-encoded" (note + // the initial \') + // test_is_parse_err("'www-form-encoded", "'www-form-encoded'"); + } + #[test] fn test_parse_query_to_ast_not_op() { - assert_eq!( - format!("{:?}", parse_to_ast().parse("NOT")), - "Err(UnexpectedParse)" - ); + test_is_parse_err("NOT", "NOT"); test_parse_query_to_ast_helper("NOTa", "NOTa"); test_parse_query_to_ast_helper("NOT a", "(-a)"); } #[test] fn test_boosting() { - assert!(parse_to_ast().parse("a^2^3").is_err()); - assert!(parse_to_ast().parse("a^2^").is_err()); + test_is_parse_err("a^2^3", "(a)^2"); + test_is_parse_err("a^2^", "(a)^2"); test_parse_query_to_ast_helper("a^3", "(a)^3"); test_parse_query_to_ast_helper("a^3 b^2", "(*(a)^3 *(b)^2)"); test_parse_query_to_ast_helper("a^1", "a"); @@ -578,22 +1112,21 @@ mod test { test_parse_query_to_ast_helper("a OR b", "(?a ?b)"); test_parse_query_to_ast_helper("a OR b AND c", "(?a ?(+b +c))"); test_parse_query_to_ast_helper("a AND b AND c", "(+a +b +c)"); - assert_eq!( - format!("{:?}", parse_to_ast().parse("a OR b aaa")), - "Err(UnexpectedParse)" - ); - assert_eq!( - format!("{:?}", parse_to_ast().parse("a AND b aaa")), - "Err(UnexpectedParse)" - ); - assert_eq!( - format!("{:?}", parse_to_ast().parse("aaa a OR b ")), - "Err(UnexpectedParse)" - ); - assert_eq!( - format!("{:?}", parse_to_ast().parse("aaa ccc a OR b ")), - "Err(UnexpectedParse)" - ); + test_is_parse_err("a OR b aaa", "(?a ?b *aaa)"); + test_is_parse_err("a AND b aaa", 
"(?(+a +b) *aaa)"); + test_is_parse_err("aaa a OR b ", "(*aaa ?a ?b)"); + test_is_parse_err("aaa ccc a OR b ", "(*aaa *ccc ?a ?b)"); + test_is_parse_err("aaa a AND b ", "(*aaa ?(+a +b))"); + test_is_parse_err("aaa ccc a AND b ", "(*aaa *ccc ?(+a +b))"); + } + + #[test] + fn test_parse_mixed_bool_occur() { + test_is_parse_err("a OR b +aaa", "(?a ?b +aaa)"); + test_is_parse_err("a AND b -aaa", "(?(+a +b) -aaa)"); + test_is_parse_err("+a OR +b aaa", "(+a +b *aaa)"); + test_is_parse_err("-a AND -b aaa", "(?(-a -b) *aaa)"); + test_is_parse_err("-aaa +ccc -a OR b ", "(-aaa +ccc -a ?b)"); } #[test] @@ -617,7 +1150,7 @@ mod test { #[test] fn test_occur_leaf() { - let ((occur, ast), _) = super::occur_leaf().parse("+abc").unwrap(); + let (_, (occur, ast)) = super::occur_leaf("+abc").unwrap(); assert_eq!(occur, Some(Occur::Must)); assert_eq!(format!("{ast:?}"), "abc"); } @@ -625,61 +1158,54 @@ mod test { #[test] fn test_field_name() { assert_eq!( - super::field_name().parse(".my.field.name:a"), - Ok((".my.field.name".to_string(), "a")) + super::field_name(".my.field.name:a"), + Ok(("a", ".my.field.name".to_string())) + ); + assert_eq!( + super::field_name(r#"にんじん:a"#), + Ok(("a", "にんじん".to_string())) ); assert_eq!( - super::field_name().parse(r#"にんじん:a"#), - Ok(("にんじん".to_string(), "a")) + super::field_name(r#"my\field:a"#), + Ok(("a", r#"my\field"#.to_string())) ); assert_eq!( - super::field_name().parse(r#"my\field:a"#), - Ok((r#"my\field"#.to_string(), "a")) + super::field_name(r#"my\\field:a"#), + Ok(("a", r#"my\field"#.to_string())) ); - assert!(super::field_name().parse("my field:a").is_err()); + assert!(super::field_name("my field:a").is_err()); assert_eq!( - super::field_name().parse("\\(1\\+1\\):2"), - Ok(("(1+1)".to_string(), "2")) + super::field_name("\\(1\\+1\\):2"), + Ok(("2", "(1+1)".to_string())) ); assert_eq!( - super::field_name().parse("my_field_name:a"), - Ok(("my_field_name".to_string(), "a")) + super::field_name("my_field_name:a"), + Ok(("a", 
"my_field_name".to_string())) ); assert_eq!( - super::field_name().parse("myfield.b:hello").unwrap(), - ("myfield.b".to_string(), "hello") + super::field_name("myfield.b:hello").unwrap(), + ("hello", "myfield.b".to_string()) ); assert_eq!( - super::field_name().parse(r#"myfield\.b:hello"#).unwrap(), - (r#"myfield\.b"#.to_string(), "hello") + super::field_name(r#"myfield\.b:hello"#).unwrap(), + ("hello", r#"myfield\.b"#.to_string()) ); - assert!(super::field_name().parse("my_field_name").is_err()); - assert!(super::field_name().parse(":a").is_err()); - assert!(super::field_name().parse("-my_field:a").is_err()); + assert!(super::field_name("my_field_name").is_err()); + assert!(super::field_name(":a").is_err()); + assert!(super::field_name("-my_field:a").is_err()); assert_eq!( - super::field_name().parse("_my_field:a"), - Ok(("_my_field".to_string(), "a")) + super::field_name("_my_field:a"), + Ok(("a", "_my_field".to_string())) ); assert_eq!( - super::field_name().parse("~my~field:a"), - Ok(("~my~field".to_string(), "a")) + super::field_name("~my~field:a"), + Ok(("a", "~my~field".to_string())) ); for special_char in SPECIAL_CHARS.iter() { let query = &format!("\\{special_char}my\\{special_char}field:a"); assert_eq!( - super::field_name().parse(query), - Ok((format!("{special_char}my{special_char}field"), "a")) - ); - } - } - - #[test] - fn test_field_name_re() { - let escaped_special_chars_re = Regex::new(ESCAPED_SPECIAL_CHARS_PATTERN).unwrap(); - for special_char in SPECIAL_CHARS.iter() { - assert_eq!( - escaped_special_chars_re.replace_all(&format!("\\{special_char}"), "$1"), - special_char.to_string() + super::field_name(query), + Ok(("a", format!("{special_char}my{special_char}field"))) ); } } @@ -687,19 +1213,18 @@ mod test { #[test] fn test_range_parser() { // testing the range() parser separately - let res = range() - .parse("title: =71.2") + } + .into(); + let res3 = literal("weight: >=71.2") .expect("Cannot parse flexible bound float") - .0; - let res4 = 
range() - .parse("weight:[71.2 TO *}") + .1; + let res4 = literal("weight:[71.2 TO *}") .expect("Cannot parse float to unbounded") - .0; + .1; assert_eq!(res3, expected_weight); assert_eq!(res4, expected_weight); @@ -723,38 +1247,35 @@ mod test { field: Some("date_field".to_string()), lower: UserInputBound::Exclusive("2015-08-02T18:54:42Z".to_string()), upper: UserInputBound::Inclusive("2021-08-02T18:54:42+02:30".to_string()), - }; - let res5 = range() - .parse("date_field:{2015-08-02T18:54:42Z TO 2021-08-02T18:54:42+02:30]") + } + .into(); + let res5 = literal("date_field:{2015-08-02T18:54:42Z TO 2021-08-02T18:54:42+02:30]") .expect("Cannot parse date range") - .0; + .1; assert_eq!(res5, expected_dates); let expected_flexible_dates = UserInputLeaf::Range { field: Some("date_field".to_string()), lower: UserInputBound::Unbounded, upper: UserInputBound::Inclusive("2021-08-02T18:54:42.12345+02:30".to_string()), - }; + } + .into(); - let res6 = range() - .parse("date_field: <=2021-08-02T18:54:42.12345+02:30") + let res6 = literal("date_field: <=2021-08-02T18:54:42.12345+02:30") .expect("Cannot parse date range") - .0; + .1; assert_eq!(res6, expected_flexible_dates); // IP Range Unbounded let expected_weight = UserInputLeaf::Range { field: Some("ip".to_string()), lower: UserInputBound::Inclusive("::1".to_string()), upper: UserInputBound::Unbounded, - }; - let res1 = range() - .parse("ip: >=::1") - .expect("Cannot parse ip v6 format") - .0; - let res2 = range() - .parse("ip:[::1 TO *}") + } + .into(); + let res1 = literal("ip: >=::1").expect("Cannot parse ip v6 format").1; + let res2 = literal("ip:[::1 TO *}") .expect("Cannot parse ip v6 format") - .0; + .1; assert_eq!(res1, expected_weight); assert_eq!(res2, expected_weight); @@ -763,12 +1284,121 @@ mod test { field: Some("ip".to_string()), lower: UserInputBound::Inclusive("::0.0.0.50".to_string()), upper: UserInputBound::Exclusive("::0.0.0.52".to_string()), - }; - let res1 = range() - .parse("ip:[::0.0.0.50 TO 
::0.0.0.52}") + } + .into(); + let res1 = literal("ip:[::0.0.0.50 TO ::0.0.0.52}") .expect("Cannot parse ip v6 format") - .0; + .1; + assert_eq!(res1, expected_weight); + } + + #[test] + fn test_range_parser_lenient() { + let literal = |query| literal_infallible(query).unwrap().1 .0.unwrap(); + + // same tests as non-lenient + let res = literal("title: =71.2"); + let res4 = literal("weight:[71.2 TO *}"); + assert_eq!(res3, expected_weight); + assert_eq!(res4, expected_weight); + + let expected_dates = UserInputLeaf::Range { + field: Some("date_field".to_string()), + lower: UserInputBound::Exclusive("2015-08-02T18:54:42Z".to_string()), + upper: UserInputBound::Inclusive("2021-08-02T18:54:42+02:30".to_string()), + } + .into(); + let res5 = literal("date_field:{2015-08-02T18:54:42Z TO 2021-08-02T18:54:42+02:30]"); + assert_eq!(res5, expected_dates); + + let expected_flexible_dates = UserInputLeaf::Range { + field: Some("date_field".to_string()), + lower: UserInputBound::Unbounded, + upper: UserInputBound::Inclusive("2021-08-02T18:54:42.12345+02:30".to_string()), + } + .into(); + + let res6 = literal("date_field: <=2021-08-02T18:54:42.12345+02:30"); + assert_eq!(res6, expected_flexible_dates); + // IP Range Unbounded + let expected_weight = UserInputLeaf::Range { + field: Some("ip".to_string()), + lower: UserInputBound::Inclusive("::1".to_string()), + upper: UserInputBound::Unbounded, + } + .into(); + let res1 = literal("ip: >=::1"); + let res2 = literal("ip:[::1 TO *}"); + assert_eq!(res1, expected_weight); + assert_eq!(res2, expected_weight); + + // IP Range Bounded + let expected_weight = UserInputLeaf::Range { + field: Some("ip".to_string()), + lower: UserInputBound::Inclusive("::0.0.0.50".to_string()), + upper: UserInputBound::Exclusive("::0.0.0.52".to_string()), + } + .into(); + let res1 = literal("ip:[::0.0.0.50 TO ::0.0.0.52}"); + assert_eq!(res1, expected_weight); + + // additional tests + let expected_weight = UserInputLeaf::Range { + field: 
Some("ip".to_string()), + lower: UserInputBound::Inclusive("::0.0.0.50".to_string()), + upper: UserInputBound::Inclusive("::0.0.0.52".to_string()), + } + .into(); + let res1 = literal("ip:[::0.0.0.50 TO ::0.0.0.52"); + let res2 = literal("ip:[::0.0.0.50 ::0.0.0.52"); + let res3 = literal("ip:[::0.0.0.50 ::0.0.0.52 AND ..."); + assert_eq!(res1, expected_weight); + assert_eq!(res2, expected_weight); + assert_eq!(res3, expected_weight); + + let expected_weight = UserInputLeaf::Range { + field: Some("ip".to_string()), + lower: UserInputBound::Inclusive("::0.0.0.50".to_string()), + upper: UserInputBound::Unbounded, + } + .into(); + let res1 = literal("ip:[::0.0.0.50 TO "); + let res2 = literal("ip:[::0.0.0.50 TO"); + let res3 = literal("ip:[::0.0.0.50"); assert_eq!(res1, expected_weight); + assert_eq!(res2, expected_weight); + assert_eq!(res3, expected_weight); + + let expected_weight = UserInputLeaf::Range { + field: Some("ip".to_string()), + lower: UserInputBound::Unbounded, + upper: UserInputBound::Unbounded, + } + .into(); + let res1 = literal("ip:[ "); + let res2 = literal("ip:{ "); + let res3 = literal("ip:["); + assert_eq!(res1, expected_weight); + assert_eq!(res2, expected_weight); + assert_eq!(res3, expected_weight); + // we don't test ip: as that is not a valid range request as per percondition } #[test] @@ -781,6 +1411,15 @@ mod test { test_parse_query_to_ast_helper("a OR abc ", "(?a ?abc)"); test_parse_query_to_ast_helper("(a OR abc )", "(?a ?abc)"); test_parse_query_to_ast_helper("(a OR abc) ", "(?a ?abc)"); + test_is_parse_err("(a OR abc ", "(?a ?abc)"); + } + + #[test] + fn test_parse_query_term_group() { + test_parse_query_to_ast_helper(r#"field:(abc)"#, r#"(*"field":abc)"#); + test_parse_query_to_ast_helper(r#"field:(+a -"b c")"#, r#"(+"field":a -"field":"b c")"#); + + test_is_parse_err(r#"field:(+a -"b c""#, r#"(+"field":a -"field":"b c")"#); } #[test] @@ -837,6 +1476,11 @@ mod test { test_parse_query_to_ast_helper("abc: IN [1]", r#""abc": IN ["1"]"#); 
test_parse_query_to_ast_helper("abc: IN []", r#""abc": IN []"#); test_parse_query_to_ast_helper("IN [1 2]", r#"IN ["1" "2"]"#); + test_is_parse_err("IN [1 2", r#"IN ["1" "2"]"#); + + // TODO maybe support these too? + // test_is_parse_err("IN (1 2", r#"IN ["1" "2"]"#); + // test_is_parse_err("IN {1 2", r#"IN ["1" "2"]"#); } #[test] @@ -846,7 +1490,8 @@ mod test { test_parse_query_to_ast_helper("+a\\+b\\+c:toto", "\"a+b+c\":toto"); test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+\"abc\":toto -titi)"); test_parse_query_to_ast_helper("-abc:toto", "(-\"abc\":toto)"); - test_is_parse_err("--abc:toto"); + // TODO not entirely sure about this one (it's seen as a NOT '-abc:toto') + test_is_parse_err("--abc:toto", "(--abc:toto)"); test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":a *b)"); test_parse_query_to_ast_helper("abc:\"a b\"", "\"abc\":\"a b\""); test_parse_query_to_ast_helper("foo:[1 TO 5]", "\"foo\":[\"1\" TO \"5\"]"); @@ -863,16 +1508,20 @@ mod test { "1.2.foo.bar:[1.1 TO *}", "\"1.2.foo.bar\":[\"1.1\" TO \"*\"}", ); - test_is_parse_err("abc + "); + test_is_parse_err("abc + ", "abc"); } #[test] fn test_slop() { - assert!(parse_to_ast().parse("\"a b\"~").is_err()); - assert!(parse_to_ast().parse("foo:\"a b\"~").is_err()); - assert!(parse_to_ast().parse("\"a b\"~a").is_err()); - assert!(parse_to_ast().parse("\"a b\"~100000000000000000").is_err()); - test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *~4)"); + test_is_parse_err("\"a b\"~", "(*\"a b\" *~)"); + test_is_parse_err("foo:\"a b\"~", "(*\"foo\":\"a b\" *~)"); + test_is_parse_err("\"a b\"~a", "(*\"a b\" *~a)"); + test_is_parse_err( + "\"a b\"~100000000000000000", + "(*\"a b\" *~100000000000000000)", + ); + test_parse_query_to_ast_helper("\"a b\"^2 ~4", "(*(\"a b\")^2 *~4)"); + test_parse_query_to_ast_helper("\"a b\"~4^2", "(\"a b\"~4)^2"); test_parse_query_to_ast_helper("\"~Document\"", "\"~Document\""); test_parse_query_to_ast_helper("~Document", "~Document"); 
test_parse_query_to_ast_helper("a~2", "a~2"); diff --git a/query-grammar/src/user_input_ast.rs b/query-grammar/src/user_input_ast.rs index 02d93336cb..9324b24646 100644 --- a/query-grammar/src/user_input_ast.rs +++ b/query-grammar/src/user_input_ast.rs @@ -3,7 +3,7 @@ use std::fmt::{Debug, Formatter}; use crate::Occur; -#[derive(PartialEq)] +#[derive(PartialEq, Clone)] pub enum UserInputLeaf { Literal(UserInputLiteral), All, @@ -18,6 +18,28 @@ pub enum UserInputLeaf { }, } +impl UserInputLeaf { + pub(crate) fn set_field(self, field: Option) -> Self { + match self { + UserInputLeaf::Literal(mut literal) => { + literal.field_name = field; + UserInputLeaf::Literal(literal) + } + UserInputLeaf::All => UserInputLeaf::All, + UserInputLeaf::Range { + field: _, + lower, + upper, + } => UserInputLeaf::Range { + field, + lower, + upper, + }, + UserInputLeaf::Set { field: _, elements } => UserInputLeaf::Set { field, elements }, + } + } +} + impl Debug for UserInputLeaf { fn fmt(&self, formatter: &mut Formatter) -> Result<(), fmt::Error> { match self { @@ -28,6 +50,7 @@ impl Debug for UserInputLeaf { ref upper, } => { if let Some(ref field) = field { + // TODO properly escape field (in case of \") write!(formatter, "\"{field}\":")?; } lower.display_lower(formatter)?; @@ -37,6 +60,7 @@ impl Debug for UserInputLeaf { } UserInputLeaf::Set { field, elements } => { if let Some(ref field) = field { + // TODO properly escape field (in case of \") write!(formatter, "\"{field}\": ")?; } write!(formatter, "IN [")?; @@ -44,6 +68,7 @@ impl Debug for UserInputLeaf { if i != 0 { write!(formatter, " ")?; } + // TODO properly escape element write!(formatter, "\"{text}\"")?; } write!(formatter, "]") @@ -60,7 +85,7 @@ pub enum Delimiter { None, } -#[derive(PartialEq)] +#[derive(PartialEq, Clone)] pub struct UserInputLiteral { pub field_name: Option, pub phrase: String, @@ -72,16 +97,20 @@ pub struct UserInputLiteral { impl fmt::Debug for UserInputLiteral { fn fmt(&self, formatter: &mut 
fmt::Formatter) -> Result<(), fmt::Error> { if let Some(ref field) = self.field_name { + // TODO properly escape field (in case of \") write!(formatter, "\"{field}\":")?; } match self.delimiter { Delimiter::SingleQuotes => { + // TODO properly escape element (in case of \') write!(formatter, "'{}'", self.phrase)?; } Delimiter::DoubleQuotes => { + // TODO properly escape element (in case of \") write!(formatter, "\"{}\"", self.phrase)?; } Delimiter::None => { + // TODO properly escape element write!(formatter, "{}", self.phrase)?; } } @@ -94,7 +123,7 @@ impl fmt::Debug for UserInputLiteral { } } -#[derive(PartialEq)] +#[derive(PartialEq, Debug, Clone)] pub enum UserInputBound { Inclusive(String), Exclusive(String), @@ -104,6 +133,7 @@ pub enum UserInputBound { impl UserInputBound { fn display_lower(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { match *self { + // TODO properly escape word if required UserInputBound::Inclusive(ref word) => write!(formatter, "[\"{word}\""), UserInputBound::Exclusive(ref word) => write!(formatter, "{{\"{word}\""), UserInputBound::Unbounded => write!(formatter, "{{\"*\""), @@ -112,6 +142,7 @@ impl UserInputBound { fn display_upper(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { match *self { + // TODO properly escape word if required UserInputBound::Inclusive(ref word) => write!(formatter, "\"{word}\"]"), UserInputBound::Exclusive(ref word) => write!(formatter, "\"{word}\"}}"), UserInputBound::Unbounded => write!(formatter, "\"*\"}}"), @@ -127,6 +158,7 @@ impl UserInputBound { } } +#[derive(PartialEq, Clone)] pub enum UserInputAst { Clause(Vec<(Option, UserInputAst)>), Leaf(Box), @@ -196,6 +228,7 @@ impl fmt::Debug for UserInputAst { match *self { UserInputAst::Clause(ref subqueries) => { if subqueries.is_empty() { + // TODO this will break ast reserialization, is writing "( )" enough? 
write!(formatter, "")?; } else { write!(formatter, "(")?; diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 4a8b864699..9d895bd62d 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -5,6 +5,7 @@ use std::str::{FromStr, ParseBoolError}; use base64::engine::general_purpose::STANDARD as BASE64; use base64::Engine; +use itertools::Itertools; use query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral}; use rustc_hash::FxHashMap; @@ -227,6 +228,25 @@ fn all_negative(ast: &LogicalAst) -> bool { } } +// Make an all-negative ast into a normal ast. Must not be used on an already okay ast. +fn make_non_negative(ast: &mut LogicalAst) { + match ast { + LogicalAst::Leaf(_) => (), + LogicalAst::Boost(ref mut child_ast, _) => make_non_negative(child_ast), + LogicalAst::Clause(children) => children.push((Occur::Should, LogicalLiteral::All.into())), + } +} + +/// Similar to the try/? macro, but returns a tuple of (None, Vec) instead of Err(Error) +macro_rules! try_tuple { + ($expr:expr) => {{ + match $expr { + Ok(val) => val, + Err(e) => return (None, vec![e.into()]), + } + }}; +} + impl QueryParser { /// Creates a `QueryParser`, given /// * schema - index Schema @@ -308,17 +328,24 @@ impl QueryParser { /// /// Note that `parse_query` returns an error if the input /// is not a valid query. - /// - /// There is currently no lenient mode for the query parser - /// which makes it a bad choice for a public/broad user search engine. - /// - /// Implementing a lenient mode for this query parser is tracked - /// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5) pub fn parse_query(&self, query: &str) -> Result, QueryParserError> { let logical_ast = self.parse_query_to_logical_ast(query)?; Ok(convert_to_query(&self.fuzzy, logical_ast)) } + /// Parse a query leniently + /// + /// This variant parses invalid query on a best effort basis. 
If some part of the query can't + /// reasonably be executed (range query without field, searching on a nonexistent field, + /// searching without specifying a field when no default field is provided...), they may get + /// turned into a "match-nothing" subquery. + /// + /// In case it encountered such issues, they are reported as a Vec of errors. + pub fn parse_query_lenient(&self, query: &str) -> (Box, Vec) { + let (logical_ast, errors) = self.parse_query_to_logical_ast_lenient(query); + (convert_to_query(&self.fuzzy, logical_ast), errors) + } + /// Build a query from an already parsed user input AST /// /// This can be useful if the user input AST parsed using [`query_grammar`] @@ -328,31 +355,70 @@ impl QueryParser { &self, user_input_ast: UserInputAst, ) -> Result, QueryParserError> { - let logical_ast = self.compute_logical_ast(user_input_ast)?; + let (logical_ast, mut err) = self.compute_logical_ast_lenient(user_input_ast); + if !err.is_empty() { + return Err(err.swap_remove(0)); + } Ok(convert_to_query(&self.fuzzy, logical_ast)) } + /// Leniently build a query from an already parsed user input AST. + /// + /// See also [`QueryParser::build_query_from_user_input_ast`] + pub fn build_query_from_user_input_ast_lenient( + &self, + user_input_ast: UserInputAst, + ) -> (Box, Vec) { + let (logical_ast, errors) = self.compute_logical_ast_lenient(user_input_ast); + (convert_to_query(&self.fuzzy, logical_ast), errors) + } + /// Parse the user query into an AST. fn parse_query_to_logical_ast(&self, query: &str) -> Result { let user_input_ast = query_grammar::parse_query(query) .map_err(|_| QueryParserError::SyntaxError(query.to_string()))?; - self.compute_logical_ast(user_input_ast) + let (ast, mut err) = self.compute_logical_ast_lenient(user_input_ast); + if !err.is_empty() { + return Err(err.swap_remove(0)); + } + Ok(ast) } - fn compute_logical_ast( + /// Parse the user query into an AST, collecting errors instead of failing. 
+ fn parse_query_to_logical_ast_lenient( + &self, + query: &str, + ) -> (LogicalAst, Vec) { + let (user_input_ast, errors) = query_grammar::parse_query_lenient(query); + let mut errors: Vec<_> = errors + .into_iter() + .map(|error| { + QueryParserError::SyntaxError(format!( + "{} at position {}", + error.message, error.pos + )) + }) + .collect(); + let (ast, mut ast_errors) = self.compute_logical_ast_lenient(user_input_ast); + errors.append(&mut ast_errors); + (ast, errors) + } + + fn compute_logical_ast_lenient( &self, user_input_ast: UserInputAst, - ) -> Result { - let ast = self.compute_logical_ast_with_occur(user_input_ast)?; + ) -> (LogicalAst, Vec) { + let (mut ast, mut err) = self.compute_logical_ast_with_occur_lenient(user_input_ast); if let LogicalAst::Clause(children) = &ast { if children.is_empty() { - return Ok(ast); + return (ast, err); } } if all_negative(&ast) { - return Err(QueryParserError::AllButQueryForbidden); + err.push(QueryParserError::AllButQueryForbidden); + make_non_negative(&mut ast); } - Ok(ast) + (ast, err) } fn compute_boundary_term( @@ -571,26 +637,37 @@ impl QueryParser { } } - fn compute_logical_ast_with_occur( + fn compute_logical_ast_with_occur_lenient( &self, user_input_ast: UserInputAst, - ) -> Result { + ) -> (LogicalAst, Vec) { match user_input_ast { UserInputAst::Clause(sub_queries) => { let default_occur = self.default_occur(); let mut logical_sub_queries: Vec<(Occur, LogicalAst)> = Vec::new(); + let mut errors = Vec::new(); for (occur_opt, sub_ast) in sub_queries { - let sub_ast = self.compute_logical_ast_with_occur(sub_ast)?; + let (sub_ast, mut sub_errors) = + self.compute_logical_ast_with_occur_lenient(sub_ast); let occur = occur_opt.unwrap_or(default_occur); logical_sub_queries.push((occur, sub_ast)); + errors.append(&mut sub_errors); } - Ok(LogicalAst::Clause(logical_sub_queries)) + (LogicalAst::Clause(logical_sub_queries), errors) } UserInputAst::Boost(ast, boost) => { - let ast = 
self.compute_logical_ast_with_occur(*ast)?; - Ok(ast.boost(boost as Score)) + let (ast, errors) = self.compute_logical_ast_with_occur_lenient(*ast); + (ast.boost(boost as Score), errors) + } + UserInputAst::Leaf(leaf) => { + let (ast, errors) = self.compute_logical_ast_from_leaf_lenient(*leaf); + // if the error is not recoverable, replace it with an empty clause. We will end up + // trimming those later + ( + ast.unwrap_or_else(|| LogicalAst::Clause(Vec::new())), + errors, + ) } - UserInputAst::Leaf(leaf) => self.compute_logical_ast_from_leaf(*leaf), } } @@ -658,23 +735,31 @@ impl QueryParser { Ok(triplets) } - fn compute_logical_ast_from_leaf( + fn compute_logical_ast_from_leaf_lenient( &self, leaf: UserInputLeaf, - ) -> Result { + ) -> (Option, Vec) { match leaf { UserInputLeaf::Literal(literal) => { let term_phrases: Vec<(Field, &str, &str)> = - self.compute_path_triplets_for_literal(&literal)?; + try_tuple!(self.compute_path_triplets_for_literal(&literal)); let mut asts: Vec = Vec::new(); + let mut errors: Vec = Vec::new(); for (field, json_path, phrase) in term_phrases { - for ast in self.compute_logical_ast_for_leaf( + let unboosted_asts = match self.compute_logical_ast_for_leaf( field, json_path, phrase, literal.slop, literal.prefix, - )? { + ) { + Ok(asts) => asts, + Err(e) => { + errors.push(e); + continue; + } + }; + for ast in unboosted_asts { // Apply some field specific boost defined at the query parser level. 
let boost = self.field_boost(field); asts.push(LogicalAst::Leaf(Box::new(ast)).boost(boost)); @@ -685,56 +770,82 @@ impl QueryParser { } else { LogicalAst::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect()) }; - Ok(result_ast) + (Some(result_ast), errors) } - UserInputLeaf::All => Ok(LogicalAst::Leaf(Box::new(LogicalLiteral::All))), + UserInputLeaf::All => ( + Some(LogicalAst::Leaf(Box::new(LogicalLiteral::All))), + Vec::new(), + ), UserInputLeaf::Range { field: full_field_opt, lower, upper, } => { - let full_path = full_field_opt.ok_or_else(|| { - QueryParserError::UnsupportedQuery( - "Range query need to target a specific field.".to_string(), - ) - })?; - let (field, json_path) = self + let Some(full_path) = full_field_opt else { + return ( + None, + vec![QueryParserError::UnsupportedQuery( + "Range query need to target a specific field.".to_string(), + )], + ); + }; + let (field, json_path) = try_tuple!(self .split_full_path(&full_path) - .ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone()))?; + .ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone()))); let field_entry = self.schema.get_field_entry(field); let value_type = field_entry.field_type().value_type(); + let mut errors = Vec::new(); + let lower = match self.resolve_bound(field, json_path, &lower) { + Ok(bound) => bound, + Err(error) => { + errors.push(error); + Bound::Unbounded + } + }; + let upper = match self.resolve_bound(field, json_path, &upper) { + Ok(bound) => bound, + Err(error) => { + errors.push(error); + Bound::Unbounded + } + }; + if lower == Bound::Unbounded && upper == Bound::Unbounded { + // this range is useless, either because a user requested [* TO *], or because + // we failed to parse something. 
Either way, there is no point emitting it + return (None, errors); + } let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range { field: self.schema.get_field_name(field).to_string(), value_type, - lower: self.resolve_bound(field, json_path, &lower)?, - upper: self.resolve_bound(field, json_path, &upper)?, + lower, + upper, })); - Ok(logical_ast) + (Some(logical_ast), errors) } UserInputLeaf::Set { field: full_field_opt, elements, } => { - let full_path = full_field_opt.ok_or_else(|| { + let full_path = try_tuple!(full_field_opt.ok_or_else(|| { QueryParserError::UnsupportedQuery( - "Set query need to target a specific field.".to_string(), + "Set query need to target a specific field.".to_string(), ) - })?; + })); - let (field, json_path) = self + })); + let (field, json_path) = try_tuple!(self .split_full_path(&full_path) - .ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone()))?; + .ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone()))); let field_entry = self.schema.get_field_entry(field); let value_type = field_entry.field_type().value_type(); + let (elements, errors) = elements + .into_iter() + .map(|element| self.compute_boundary_term(field, json_path, &element)) + .partition_result(); let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set { - elements: elements - .into_iter() - .map(|element| self.compute_boundary_term(field, json_path, &element)) - .collect::, _>>()?, - + elements, field, value_type, })); - Ok(logical_ast) + (Some(logical_ast), errors) } } } diff --git a/src/schema/json_object_options.rs b/src/schema/json_object_options.rs index eee3618a8e..ea892b99d5 100644 --- a/src/schema/json_object_options.rs +++ b/src/schema/json_object_options.rs @@ -80,12 +80,12 @@ impl JsonObjectOptions { /// When expand_dots is enabled, json object like /// `{"k8s.node.id": 5}` is processed as if it was /// `{"k8s": {"node": {"id": 5}}}`. 
- /// It option has the merit of allowing users to + /// This option has the merit of allowing users to /// write queries like `k8s.node.id:5`. /// On the other, enabling that feature can lead to /// ambiguity. - /// - /// If disabled, the "." need to be escaped: + /// If disabled, the "." needs to be escaped: /// `k8s\.node\.id:5`. pub fn is_expand_dots_enabled(&self) -> bool { self.expand_dots_enabled diff --git a/src/schema/mod.rs b/src/schema/mod.rs index f8de6dd9e1..8116fde895 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -5,8 +5,10 @@ //! Tantivy has a very strict schema. //! The schema defines information about the fields your index contains, that is, for each field: //! -//! - the field name (may only contain letters `[a-zA-Z]`, number `[0-9]`, and `_`) -//! - the type of the field (currently only `text` and `u64` are supported) +//! - the field name (may contain any character, can't start with a `-` and can't be empty. Some +//! characters may require escaping when using the query parser). +//! - the type of the field (currently `text`, `u64`, `i64`, `f64`, `bool`, `date`, `IpAddr`, +//! facets, bytes and json are supported) //! - how the field should be indexed / stored. //! //! This very last point is critical as it will enable / disable some of the functionality