From ce0c916d607f683c5810657ad28d246ff77ef393 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=96mer=20Sinan=20A=C4=9Facan?=
Date: Mon, 7 Feb 2022 21:13:38 +0300
Subject: [PATCH] Allow initializing lexers with a char iterator (#42)

This fixes #41 in an almost backwards compatible way. Generated lexers now
have an extra constructor:

    impl<I: Iterator<Item = char> + Clone> Lexer<'static, I> {
        fn new_from_iter(iter: I) -> Self {
            Lexer(::lexgen_util::Lexer::new_from_iter(iter))
        }
    }

The API of the generated lexers is exactly the same; however, if a lexer is
constructed with `new_from_iter` instead of `new` or `new_with_state`, the
`match_` method will panic at runtime. This is because in lexers constructed
with `new_from_iter` we don't have the input string, so we cannot return a
slice of it. Use `match_loc` instead to get the start and end locations of
the current match.

The only breaking change is that the generated types now have one more
generic argument, for the iterator type. So for a lexer like:

    lexer! {
        MyLexer -> MyToken;
        ...
    }

instead of

    struct MyLexer<'input>(...);

we now generate

    struct MyLexer<'input, I: Iterator<Item = char> + Clone>(...);

So any code that refers to the lexer type will break. Other than this, the
changes should be backwards compatible.

Fixes #41
---
 CHANGELOG.md                     | 17 ++++++++
 README.md                        | 28 ++++++++++++-
 crates/lexgen/src/dfa/codegen.rs | 53 ++++++++++++++++-------
 crates/lexgen/tests/bugs.rs      |  4 +-
 crates/lexgen/tests/tests.rs     | 72 ++++++++++++++++++++++++++++++++
 crates/lexgen_util/src/lib.rs    | 42 ++++++++++++++++---
 6 files changed, 192 insertions(+), 24 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1a30187..9f53d37 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,20 @@
+# Unreleased
+
+- Generated lexers now have two new constructors:
+
+  - `new_from_iter<I: Iterator<Item = char> + Clone>(iter: I) -> Self`
+  - `new_from_iter_with_state<I: Iterator<Item = char> + Clone, S>(iter: I, user_state: S) -> Self`
+
+  These constructors allow running a lexer on a character iterator instead of
+  a string slice. Generated lexers work exactly the same way, except that the
+  `match_` method panics when called.
+
+  Locations of matches can be obtained with the `match_loc(&self) -> (Loc,
+  Loc)` method.
+
+  These constructors are useful when the input is not a flat Unicode string,
+  but something like a rope, gap array, zipper, etc. (#41)
+
 # 2022/01/31: 0.9.0
 
 - New regex syntax `#` added for character set difference, e.g. `re1 # re2`
diff --git a/README.md b/README.md
index d0b13a9..8229257 100644
--- a/README.md
+++ b/README.md
@@ -262,7 +262,12 @@ A mut reference to this type is passed to semantic action functions. In the
 implementation of a semantic action, you should use one of the methods below
 to drive the lexer and return tokens:
 
-- `fn match_(&self) -> &str`: returns the current match
+- `fn match_(&self) -> &str`: returns the current match. Note that when the
+  lexer is constructed with `new_from_iter` or `new_from_iter_with_state`,
+  this method panics. It should only be called when the lexer is initialized
+  with `new` or `new_with_state`.
+- `fn match_loc(&self) -> (lexgen_util::Loc, lexgen_util::Loc)`: returns the
+  bounds of the current match
 - `fn peek(&mut self) -> Option<char>`: looks ahead one character
 - `fn state(&mut self) -> &mut <user state>`: returns a mutable reference to the user state
@@ -284,6 +289,27 @@ to drive the lexer and return tokens:
 Semantic action functions should return a `SemanticActionResult` value obtained
 from one of the methods listed above.
 
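+For example, a semantic action that parses the matched text into a number can
+be written like this (a minimal sketch; the rule and token type here are
+illustrative):
+
+```rust
+lexer! {
+    Lexer -> u32;
+
+    // The rule only matches digits, so `parse` cannot fail on valid input.
+    ['0'-'9']+ => |lexer| {
+        let n = lexer.match_().parse::<u32>().unwrap();
+        lexer.return_(n)
+    },
+}
+```
+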
+## Initializing lexers
+
+lexgen generates 4 constructors:
+
+- `fn new(input: &str) -> Self`: Used when the lexer does not have user
+  state, or when the user state implements `Default`.
+
+- `fn new_with_state(input: &str, user_state: S) -> Self`: Used when the
+  lexer has user state that does not implement `Default`, or when you want to
+  initialize the state with something other than the default value. `S` is
+  the user state type specified in the lexer definition. See the stateful
+  lexer example below.
+
+- `fn new_from_iter<I: Iterator<Item = char> + Clone>(iter: I) -> Self`: Used
+  when the input isn't a flat string, but something like a rope or zipper.
+  Note that the `match_` method panics when this constructor is used. Instead
+  use `match_loc` to get the location of the current match. See the example
+  after this list.
+
+- `fn new_from_iter_with_state<I: Iterator<Item = char> + Clone, S>(iter: I,
+  user_state: S) -> Self`: Same as above, but doesn't require the user state
+  to implement `Default`.
+
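+For example, iterator-based initialization makes it possible to run a lexer
+on input that is split into chunks, as in a rope (a minimal sketch; `Lexer`
+here stands for any lexer type generated by `lexer!`):
+
+```rust
+let chunks = ["12", "3"];
+
+// The iterator must implement `Clone`; a `flat_map` over `Chars` does.
+let mut lexer = Lexer::new_from_iter(chunks.iter().flat_map(|s| s.chars()));
+```
+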
 ## Stateful lexer example
 
 Here's an example lexer that counts the number of `=`s that appear between two `[`s:
diff --git a/crates/lexgen/src/dfa/codegen.rs b/crates/lexgen/src/dfa/codegen.rs
index c15cac6..149876c 100644
--- a/crates/lexgen/src/dfa/codegen.rs
+++ b/crates/lexgen/src/dfa/codegen.rs
@@ -139,12 +139,19 @@ pub fn reify(
         #(#rule_name_idents,)*
     }
 
-    #visibility struct #lexer_name<'input>(
-        ::lexgen_util::Lexer<'input, #token_type, #user_state_type, #error_type, #lexer_name<'input>>
+    #visibility struct #lexer_name<'input, I: Iterator<Item = char> + Clone>(
+        ::lexgen_util::Lexer<
+            'input,
+            I,
+            #token_type,
+            #user_state_type,
+            #error_type,
+            #lexer_name<'input, I>
+        >
     );
 
     // Methods below are for use in semantic actions
-    impl<'input> #lexer_name<'input> {
+    impl<'input, I: Iterator<Item = char> + Clone> #lexer_name<'input, I> {
         fn switch_and_return<T>(&mut self, rule: #rule_name_enum_name, token: T) -> ::lexgen_util::SemanticActionResult<T> {
             self.switch::<T>(rule);
             ::lexgen_util::SemanticActionResult::Return(token)
         }
@@ -172,12 +179,16 @@ pub fn reify(
             self.0.match_()
         }
 
+        fn match_loc(&self) -> (::lexgen_util::Loc, ::lexgen_util::Loc) {
+            self.0.match_loc()
+        }
+
         fn peek(&mut self) -> Option<char> {
             self.0.peek()
         }
     }
 
-    impl<'input> #lexer_name<'input> {
+    impl<'input> #lexer_name<'input, ::std::str::Chars<'input>> {
         #visibility fn new(input: &'input str) -> Self {
             #lexer_name(::lexgen_util::Lexer::new(input))
         }
@@ -187,12 +198,22 @@ pub fn reify(
         }
     }
 
+    impl<I: Iterator<Item = char> + Clone> #lexer_name<'static, I> {
+        #visibility fn new_from_iter(iter: I) -> Self {
+            #lexer_name(::lexgen_util::Lexer::new_from_iter(iter))
+        }
+
+        #visibility fn new_from_iter_with_state(iter: I, user_state: #user_state_type) -> Self {
+            #lexer_name(::lexgen_util::Lexer::new_from_iter_with_state(iter, user_state))
+        }
+    }
+
     #(#search_tables)*
     #binary_search_fn
 
     #semantic_action_fns
     #(#right_ctx_fns)*
 
-    impl<'input> Iterator for #lexer_name<'input> {
+    impl<'input, I: Iterator<Item = char> + Clone> Iterator for #lexer_name<'input, I> {
         type Item = Result<(::lexgen_util::Loc, #token_type, ::lexgen_util::Loc), ::lexgen_util::LexerError<#error_type>>;
 
         fn next(&mut self) -> Option<Self::Item> {
@@ -558,7 +579,7 @@ fn generate_semantic_action_call(action_fn: &TokenStream) -> TokenStream {
     let map_res = quote!(match res {
         Ok(tok) => Ok((match_start, tok, match_end)),
         Err(err) => Err(::lexgen_util::LexerError {
-            location: self.0.match_loc().0,
+            location: self.match_loc().0,
             kind: ::lexgen_util::LexerErrorKind::Custom(err),
         }),
     });
@@ -569,7 +590,7 @@ fn generate_semantic_action_call(action_fn: &TokenStream) -> TokenStream {
             }
             ::lexgen_util::SemanticActionResult::Return(res) => {
                 self.0.__state = self.0.__initial_state;
-                let (match_start, match_end) = self.0.match_loc();
+                let (match_start, match_end) = self.match_loc();
                 self.0.reset_match();
                 return Some(#map_res);
             }
@@ -590,19 +611,19 @@ fn generate_semantic_action_fns(
         let rhs = match action {
             RuleRhs::None => {
-                quote!(|__lexer: &mut #lexer_name| __lexer.continue_().map_token(Ok))
+                quote!(|__lexer: &mut #lexer_name<'input, I>| __lexer.continue_().map_token(Ok))
             }
 
             RuleRhs::Rhs { expr, kind } => {
                 match kind {
                     RuleKind::Simple => {
-                        quote!(|__lexer: &mut #lexer_name| __lexer.return_(#expr).map_token(Ok))
+                        quote!(|__lexer: &'lexer mut #lexer_name<'input, I>| __lexer.return_(#expr).map_token(Ok))
                     }
 
                     RuleKind::Fallible => quote!(#expr),
 
                     RuleKind::Infallible => {
-                        quote!(|__lexer: &mut #lexer_name| {
+                        quote!(|__lexer: &'lexer mut #lexer_name<'input, I>| {
                             let semantic_action:
-                                for<'lexer, 'input> fn(&'lexer mut #lexer_name<'input>) -> ::lexgen_util::SemanticActionResult<#token_type> =
+                                fn(&'lexer mut #lexer_name<'input, I>) -> ::lexgen_util::SemanticActionResult<#token_type> =
                                 #expr;
 
                             semantic_action(__lexer).map_token(Ok)
@@ -613,9 +634,11 @@ fn generate_semantic_action_fns(
         };
 
         quote!(
-            #[allow(non_upper_case_globals)]
-            static #ident: for<'lexer, 'input> fn(&'lexer mut #lexer_name<'input>) -> #semantic_action_fn_ret_ty =
-                #rhs;
+            #[allow(non_snake_case)]
+            fn #ident<'lexer, 'input, I: Iterator<Item = char> + Clone>(lexer: &'lexer mut #lexer_name<'input, I>) -> #semantic_action_fn_ret_ty {
+                let action: fn(&'lexer mut #lexer_name<'input, I>) -> #semantic_action_fn_ret_ty = #rhs;
+                action(lexer)
+            }
         )
     })
     .collect();
@@ -644,7 +667,7 @@ fn generate_right_ctx_fns(
     let match_arms = generate_right_ctx_state_arms(ctx, dfa);
 
     fns.push(
-        quote!(fn #fn_name(mut input: std::iter::Peekable<std::str::Chars>) -> bool {
+        quote!(fn #fn_name<I: Iterator<Item = char> + Clone>(mut input: I) -> bool {
             let mut state: usize = 0;
             loop {
diff --git a/crates/lexgen/tests/bugs.rs b/crates/lexgen/tests/bugs.rs
index 9eebf63..c10e70f 100644
--- a/crates/lexgen/tests/bugs.rs
+++ b/crates/lexgen/tests/bugs.rs
@@ -256,8 +256,8 @@ fn issue_16_backtracking_1() {
 
 #[test]
 fn issue_16_backtracking_2() {
-    fn return_match<'input>(
-        lexer: &mut Lexer<'input>,
+    fn return_match<'input, I: Iterator<Item = char> + Clone>(
+        lexer: &mut Lexer<'input, I>,
     ) -> lexgen_util::SemanticActionResult<&'input str> {
         let match_ = lexer.match_();
         lexer.return_(match_)
diff --git a/crates/lexgen/tests/tests.rs b/crates/lexgen/tests/tests.rs
index c17f041..a2b162c 100644
--- a/crates/lexgen/tests/tests.rs
+++ b/crates/lexgen/tests/tests.rs
@@ -1065,3 +1065,75 @@ fn diff_4() {
         }))
     );
 }
+
+#[test]
+fn iter_interface_simple() {
+    // Tests `new_from_iter` with simple rules
+    lexer! {
+        Lexer -> usize;
+
+        'a' = 1,
+        'b' = 2,
+    }
+
+    let mut lexer = Lexer::new_from_iter("ab".chars());
+    assert_eq!(next(&mut lexer), Some(Ok(1)));
+    assert_eq!(next(&mut lexer), Some(Ok(2)));
+    assert_eq!(next(&mut lexer), None);
+}
+
+#[test]
+fn iter_interface_infallible() {
+    // Tests `new_from_iter` with infallible rules
+    lexer! {
+        // TODO: Is it possible to have a lifetime in lexer state?
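+        // User state holds a copy of the input so that the semantic action
+        // below can slice it with the locations returned by `match_loc`.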
+        Lexer(String) -> u32;
+
+        ['0'-'9']+ => |lexer| {
+            let (start, end) = lexer.match_loc();
+            let str = lexer.state();
+            let val = str::parse::<u32>(&str[start.byte_idx..end.byte_idx]).unwrap();
+            lexer.return_(val)
+        },
+    }
+
+    let input = "123";
+    let mut lexer = Lexer::new_from_iter_with_state(input.chars(), input.to_owned());
+    assert_eq!(next(&mut lexer), Some(Ok(123)));
+    assert_eq!(next(&mut lexer), None);
+}
+
+#[test]
+fn iter_interface_fallible() {
+    // Tests `new_from_iter` with fallible rules
+    lexer! {
+        // TODO: Is it possible to have a lifetime in lexer state?
+        Lexer(String) -> u32;
+
+        type Error = std::num::ParseIntError;
+
+        $$ascii_alphanumeric+ =? |lexer| {
+            let (start, end) = lexer.match_loc();
+            let str = lexer.state();
+            match str::parse::<u32>(&str[start.byte_idx..end.byte_idx]) {
+                Ok(i) => lexer.return_(Ok(i)),
+                Err(err) => lexer.return_(Err(err)),
+            }
+        },
+    }
+
+    let input = "123";
+    let mut lexer = Lexer::new_from_iter_with_state(input.chars(), input.to_owned());
+    assert_eq!(next(&mut lexer), Some(Ok(123)));
+    assert_eq!(next(&mut lexer), None);
+
+    let input = "a";
+    let mut lexer = Lexer::new_from_iter_with_state(input.chars(), input.to_owned());
+    assert!(matches!(
+        next(&mut lexer),
+        Some(Err(LexerError {
+            kind: LexerErrorKind::Custom(_),
+            ..
+        }))
+    ));
+}
diff --git a/crates/lexgen_util/src/lib.rs b/crates/lexgen_util/src/lib.rs
index bc3c7db..8376751 100644
--- a/crates/lexgen_util/src/lib.rs
+++ b/crates/lexgen_util/src/lib.rs
@@ -1,5 +1,8 @@
 #![allow(clippy::should_implement_trait, clippy::type_complexity)]
 
+use std::iter::Peekable;
+use std::str::Chars;
+
 use unicode_width::UnicodeWidthChar;
 
 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -57,7 +60,7 @@ impl<T> SemanticActionResult<T> {
 /// Common parts in lexers generated by lexgen.
 ///
 /// **Fields are used by lexgen-generated code and should not be used directly.**
-pub struct Lexer<'input, Token, State, Error, Wrapper> {
+pub struct Lexer<'input, Iter: Iterator<Item = char> + Clone, Token, State, Error, Wrapper> {
     // Current lexer state
     pub __state: usize,
 
@@ -78,7 +81,7 @@ pub struct Lexer<'input, Token, State, Error, Wrapper> {
     // Character iterator. `Peekable` is used in the handler's `peek` method. Note that we can't
    // use the byte index returned by this directly, as we re-initialize this field when backtracking.
    // Add `iter_byte_idx` to the byte index before using. When resetting, update `iter_byte_idx`.
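+    // With iterator-based lexers there is no input string to re-slice when
+    // backtracking, so `last_match` below also stores a clone of this iterator.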
-    pub __iter: std::iter::Peekable<std::str::Chars<'input>>,
+    pub __iter: Peekable<Iter>,
 
     // Start of the current match
     current_match_start: Loc,
@@ -93,18 +96,42 @@ pub struct Lexer<'input, Token, State, Error, Wrapper> {
     // - Skipped match end (exclusive, byte index in `input`)
     last_match: Option<(
         Loc,
+        Peekable<Iter>,
        for<'lexer> fn(&'lexer mut Wrapper) -> SemanticActionResult<Token>,
         Loc,
     )>,
 }
 
-impl<'input, T, S: Default, E, W> Lexer<'input, T, S, E, W> {
+impl<I: Iterator<Item = char> + Clone, T, S: Default, E, W> Lexer<'static, I, T, S, E, W> {
+    pub fn new_from_iter(iter: I) -> Self {
+        Self::new_from_iter_with_state(iter, Default::default())
+    }
+}
+
+impl<I: Iterator<Item = char> + Clone, T, S, E, W> Lexer<'static, I, T, S, E, W> {
+    pub fn new_from_iter_with_state(iter: I, state: S) -> Self {
+        Self {
+            __state: 0,
+            __done: false,
+            __initial_state: 0,
+            user_state: state,
+            input: "",
+            iter_loc: Loc::ZERO,
+            __iter: iter.peekable(),
+            current_match_start: Loc::ZERO,
+            current_match_end: Loc::ZERO,
+            last_match: None,
+        }
+    }
+}
+
+impl<'input, T, S: Default, E, W> Lexer<'input, Chars<'input>, T, S, E, W> {
     pub fn new(input: &'input str) -> Self {
         Self::new_with_state(input, Default::default())
     }
 }
 
-impl<'input, T, S, E, W> Lexer<'input, T, S, E, W> {
+impl<'input, T, S, E, W> Lexer<'input, Chars<'input>, T, S, E, W> {
     pub fn new_with_state(input: &'input str, state: S) -> Self {
         Self {
             __state: 0,
@@ -119,7 +146,9 @@ impl<'input, T, S, E, W> Lexer<'input, T, S, E, W> {
             last_match: None,
         }
     }
+}
 
+impl<'input, I: Iterator<Item = char> + Clone, T, S, E, W> Lexer<'input, I, T, S, E, W> {
     // Read the next character
     pub fn next(&mut self) -> Option<char> {
         match self.__iter.next() {
@@ -153,11 +182,11 @@ impl<'input, T, S, E, W> Lexer<'input, T, S, E, W> {
                 location: self.current_match_start,
                 kind: LexerErrorKind::InvalidToken,
             }),
-            Some((match_start, semantic_action, match_end)) => {
+            Some((match_start, iter, semantic_action, match_end)) => {
                 self.__done = false;
                 self.current_match_start = match_start;
                 self.current_match_end = match_end;
-                self.__iter = self.input[match_end.byte_idx..].chars().peekable();
+                self.__iter = iter;
                 self.iter_loc = match_end;
                 Ok(semantic_action)
             }
@@ -174,6 +203,7 @@
     ) {
         self.last_match = Some((
             self.current_match_start,
+            self.__iter.clone(),
            semantic_action_fn,
            self.current_match_end,
        ));