Implement full parsing using PEGs
kneasle committed Sep 25, 2021
1 parent 56b6322 commit 9ab43ce
Showing 10 changed files with 508 additions and 74 deletions.
62 changes: 46 additions & 16 deletions grammar/src/grammar.rs
@@ -1,5 +1,5 @@
use std::{
collections::{HashMap, HashSet},
collections::HashMap,
fmt::{Debug, Formatter},
};

@@ -8,14 +8,17 @@ use itertools::Itertools;
use regex::Regex;
use serde::Deserialize;

use crate::parser::{self, Ast};

/// A complete specification for how to parse files of any particular language.
#[derive(Debug, Clone)]
pub struct Grammar {
root_type: TypeId,
whitespace: Whitespace,
pub(crate) types: TypeVec<Type>,
tokens: IndexVec<TokenId, Token>,
// Look-up tables for the parser

/* LOOK-UP TABLES FOR THE TOKENIZER/PARSER */
/// Mapping from token texts to IDs, stored **in decreasing order** of the text length. This
/// makes sure that the tokenizer always consumes the largest possible token (e.g. `"&&"`
/// should be tokenized into just `&&`, rather than two `&`s).
@@ -46,14 +49,40 @@ impl Grammar {
}
}

/////////////
// GETTERS //
/////////////
/// Construct a concrete AST representing a [`str`]ing of the root type according to this [`Grammar`].
pub fn parse_root<'s, N: Ast>(&self, s: &'s str) -> Result<(&'s str, N), parser::Error> {
parser::parse(self, self.root_type, s)
}

pub fn whitespace(&self) -> &Whitespace {
&self.whitespace
/// Construct a concrete AST representing a [`str`]ing according to this [`Grammar`].
pub fn parse<'s, N: Ast>(
&self,
type_id: TypeId,
s: &'s str,
) -> Result<(&'s str, N), parser::Error> {
parser::parse(self, type_id, s)
}

///////////
// TYPES //
///////////

pub fn root_type(&self) -> TypeId {
self.root_type
}

pub fn get_type(&self, id: TypeId) -> &Type {
&self.types[id]
}

pub fn type_name(&self, id: TypeId) -> &str {
&self.types[id].name
}

////////////
// TOKENS //
////////////

pub fn tokens(&self) -> &IndexSlice<TokenId, [Token]> {
&self.tokens
}
@@ -66,8 +95,8 @@ impl Grammar {
&self.tokens[id].text
}

pub fn type_name(&self, id: TypeId) -> &str {
&self.types[id].name
pub fn whitespace(&self) -> &Whitespace {
&self.whitespace
}

/// Returns the static tokens in `self`, in decreasing order of length
@@ -87,18 +116,19 @@ pub struct Type {
/// example, 'node class' types like expressions (which can never be instantiated directly) or
/// JSON fields (which are only created implicitly to contain other nodes).
pub(crate) keys: Vec<String>,
/// The complete set of types to which this type can be implicitly converted in order of
/// parsing precedence, **including**
/// itself. For [`Stringy`] types, this will only contain `self`.
pub(crate) descendants: Vec<TypeId>,
pub(crate) inner: TypeInner,
}

#[derive(Debug, Clone)]
pub enum TypeInner {
Pattern {
/// The complete set of types to which this type can be implicitly converted, **including**
/// itself. Note that [`Stringy`] types can't have descendant types.
descendants: HashSet<TypeId>,
/// The pattern describing which token sequences are valid instances of this [`Type`].
pattern: Option<Pattern>,
},
/// A [`Type`] which can't be instantiated, but can contain child nodes
Container,
/// The pattern describing which token sequences are valid instances of this [`Type`].
Pattern(Pattern),
/// A node which stores a string value, editable by the user. These nodes always correspond to
/// precisely one token.
///
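The new `Grammar::parse_root` and `Grammar::parse` methods above are the public entry points for this parser. As a rough usage sketch (not part of this commit; the `parse_file` helper and the choice of AST type are illustrative, and the snippet is written as if it lived inside this crate), a caller holding a `Grammar` and an `Ast` implementation would drive them like this:

fn parse_file<N: parser::Ast>(grammar: &Grammar, src: &str) -> Result<N, parser::Error> {
    // `parse_root` starts from the grammar's root type; `grammar.parse(type_id, src)`
    // can target any other type bound in the grammar instead.
    let (_leading_whitespace, root): (&str, N) = grammar.parse_root(src)?;
    Ok(root)
}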
25 changes: 24 additions & 1 deletion grammar/src/lib.rs
@@ -1,5 +1,28 @@
//! Crate for handling language-independent grammars.
//!
//! This includes:
//! - The central [`Grammar`] data structure (in the [`grammar`] module)
//! - A deserializeable schema for the TOML files (the [`spec`] and [`spec::convert`] modules)
//! - Code for tokenizing & parsing arbitrary strings into ASTs of any language for which a
//! [`Grammar`] is provided (in the [`tokenizer`] and [`parser`] modules, respectively).
//!
// TODO: Keep the throughput statistic up-to-date with parser changes
//! The current parsing engine is quite bodged - it works correctly for the few languages currently
//! supported by Sapling, but is relatively slow (throughput is roughly 70MB/s for JSON) and has a
//! few weird edge cases, like being unable to handle left-recursive grammars. However, it has
//! been quite extensively fuzzed (see the `fuzz` crate), so should be fairly stable and accurate.
//!
//! Speed/quality improvements to this crate are very welcome - optimisations are very much
//! encouraged and rewrites of any scale are fine as long as the external behaviour of the parser
//! doesn't change (i.e. the trees produced by [`Grammar::parse`] are still correct).
//! Well-justified use of `unsafe` is also fine, but generally Sapling cares more about safety and
//! correctness than absolutely top-notch performance. Also, I suspect that a reasonably well
//! optimised parser is unlikely to be a major bottleneck of Sapling, so it's worth doing some
//! profiling before committing large amounts of time into optimising the parsing engine.
mod grammar;
mod spec; // AST-like specification of the TOML files consumed by Sapling
pub mod parser;
mod spec; // AST-like specification of the TOML format consumed by Sapling
pub mod tokenizer;

pub use grammar::*;
223 changes: 223 additions & 0 deletions grammar/src/parser.rs
@@ -0,0 +1,223 @@
use std::rc::Rc;

use crate::{
tokenizer::{self, ParsedToken, Tokenizer},
Grammar, PatternElement, TokenId, TypeId, TypeInner,
};

/// `true` if this module should emit debug printing
const DO_DEBUG_PRINT: bool = false;

macro_rules! dbg_println {
($s: literal $(, $arg: expr)* ) => {
if DO_DEBUG_PRINT {
println!($s $(, $arg)*);
}
};
}

type TokenIter<'a, 's> = std::iter::Peekable<std::slice::Iter<'a, (ParsedToken<'s>, &'s str)>>;

pub trait Ast: Sized {
type Builder: Builder<Node = Self>;

fn new_stringy(type_id: TypeId, contents: String, display_str: String, ws: &str) -> Self;
}

// TODO: Not Debug
pub trait Builder: Sized + std::fmt::Debug {
type Node: Ast;

fn new(type_id: TypeId) -> Self;

/// Add a child [`Node`](Self::Node) to the [`Node`](Self::Node) being built
fn add_node(&mut self, type_bound: TypeId, node: Rc<Self::Node>);
/// Add a static token (and its whitespace) to the [`Node`](Self::Node) being built
fn add_token(&mut self, token: TokenId, ws: &str);

/// Start a repeating sequence of sub-patterns
fn seq_start(&mut self);
/// Move on to the next sub-pattern in a sequence, adding a delimiter in the process
fn seq_delim(&mut self, token: TokenId, ws: &str);
/// Finish a repeating sequence of sub-patterns
fn seq_end(&mut self);

/// Build this `Builder` into a corresponding [`Node`](Self::Node)
fn into_node(self) -> Self::Node;
}
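To give a sense of what these traits ask of a consumer, here is a deliberately minimal, hypothetical implementation (not part of this commit): a node type that records only its type, its children and any stringy contents, and ignores static tokens, whitespace and sequence boundaries.

#[derive(Debug)]
struct SketchNode {
    type_id: TypeId,
    children: Vec<Rc<SketchNode>>,
    stringy_contents: Option<String>,
}

#[derive(Debug)]
struct SketchBuilder {
    type_id: TypeId,
    children: Vec<Rc<SketchNode>>,
}

impl Ast for SketchNode {
    type Builder = SketchBuilder;

    fn new_stringy(type_id: TypeId, contents: String, _display_str: String, _ws: &str) -> Self {
        SketchNode { type_id, children: Vec::new(), stringy_contents: Some(contents) }
    }
}

impl Builder for SketchBuilder {
    type Node = SketchNode;

    fn new(type_id: TypeId) -> Self {
        SketchBuilder { type_id, children: Vec::new() }
    }

    // Child nodes are kept; the type bound isn't needed for this sketch
    fn add_node(&mut self, _type_bound: TypeId, node: Rc<SketchNode>) {
        self.children.push(node);
    }

    // Static tokens, whitespace and sequence boundaries carry no information here
    fn add_token(&mut self, _token: TokenId, _ws: &str) {}
    fn seq_start(&mut self) {}
    fn seq_delim(&mut self, _token: TokenId, _ws: &str) {}
    fn seq_end(&mut self) {}

    fn into_node(self) -> SketchNode {
        SketchNode { type_id: self.type_id, children: self.children, stringy_contents: None }
    }
}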

/// Parse a string into an AST node whose [`Type`] is a descendant of a given [`TypeId`], returning
/// an error if that isn't possible.
pub(crate) fn parse<'s, N: Ast>(
grammar: &Grammar,
type_id: TypeId,
s: &'s str,
) -> Result<(&'s str, N), Error> {
// TODO: Check for & handle grammars with left-recursion

let (leading_ws, tokenizer) = Tokenizer::new(grammar, s);
let tokens = tokenizer
.collect::<Result<Vec<_>, _>>()
.map_err(Error::Tokenize)?;
for t in &tokens {
dbg_println!(" {:?}", t);
}

let tree = parse_type_bound::<N>(grammar, type_id, &mut tokens.iter().peekable())
.ok_or(Error::Parse)?;
Ok((leading_ws, tree))
}

/// The different ways that parsing could fail
#[derive(Debug, Clone)]
pub enum Error {
Tokenize(tokenizer::Error),
Parse,
}

/// Parse a token stream into an AST node whose [`Type`] is a descendant of a given [`TypeId`],
/// returning an error if that isn't possible.
fn parse_type_bound<'a, 's, N: Ast>(
grammar: &Grammar,
type_bound: TypeId,
tokens: &mut TokenIter<'a, 's>,
) -> Option<N> {
let ty = grammar.get_type(type_bound);

// Try to parse each descendant type
for &descendant_id in &ty.descendants {
// TODO: Handle left-recursion
//
// We parse each descendant type on a copy of `tokens` so that, in the case of failure, the
// next type can be parsed from the same start point.
let mut tokens_clone = tokens.clone();
match parse_concrete_type::<N>(grammar, descendant_id, &mut tokens_clone) {
Some(node) => {
*tokens = tokens_clone; // This has parsed correctly, so consume its tokens
return Some(node);
}
// If parsing failed, then try other parse rules
None => {}
}
}

// If all the descendant types failed to parse, then this node's parsing fails
None
}
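The backtracking above relies on `Peekable<slice::Iter>` being cheap to clone: the clone is an independent cursor into the same token slice, so a failed attempt costs nothing and a successful one is committed by overwriting the original iterator. A standalone illustration of the idiom (not from this commit, using plain integers in place of tokens):

fn backtracking_demo() {
    let tokens = [1, 2, 3];
    let mut iter = tokens.iter().peekable();

    // Take a save-point: consuming from `attempt` does not advance `iter`
    let mut attempt = iter.clone();
    attempt.next();
    assert_eq!(iter.peek(), Some(&&1)); // original cursor is untouched

    // The attempt succeeded, so commit it by replacing the original cursor
    iter = attempt;
    assert_eq!(iter.peek(), Some(&&2));
}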

/*
/// Parse a token stream into the **shortest** (as in fewest tokens) AST node with a concrete [`Type`].
///
/// This must always terminate, even when given a left-recursive (unambiguous) grammar. Proof
/// sketch:
/// - Rules which apply left-recursion are always longer than their left-child node
/// - The left-child of a left-recursive rule must have the same type as its parent
/// => If a left-recursive node exists, then its left-child must be smaller and be of the same
/// type
/// => The left-recursive node is not the shortest node of its type given its start point (because
/// its left-child is shorter)
/// => This function can't return a left-recursive node
/// => This function doesn't need to consider left-recursive rules
/// => Termination of this function is independent of left-recursion
fn parse_shortest<'a, 's, N: Ast>(
TODO: Handle left-recursion somehow
*/

/// Attempt to parse a node of a concrete type
fn parse_concrete_type<'a, 's, N: Ast>(
grammar: &Grammar,
concrete_type_id: TypeId,
tokens: &mut TokenIter<'a, 's>,
) -> Option<N> {
let ty = grammar.get_type(concrete_type_id);

dbg_println!("Parsing concrete type {:?}", ty.name);
match &ty.inner {
TypeInner::Container => {} // Containers can't be parsed as concrete types
TypeInner::Pattern(pat) => {
let mut bdr = N::Builder::new(concrete_type_id);
if parse_pattern::<N>(grammar, &mut bdr, pat, tokens).is_some() {
return Some(bdr.into_node());
}
}
TypeInner::Stringy(_) => match tokens.next() {
Some((ParsedToken::Stringy(id, contents, display_str), ws)) => {
if *id == concrete_type_id {
return Some(N::new_stringy(
concrete_type_id,
contents.clone(),
(*display_str).to_owned(),
ws,
));
}
}
_ => {}
},
}
dbg_println!("Parsing concrete type {:?} failed", ty.name);
// If a match wasn't returned, then this type fails to parse
None
}

/// Parse a full [`Pattern`] via a [`Builder`]. This returns `Option<()>` (as opposed to a
/// [`bool`]) so that the `?` operator can be used.
#[must_use]
fn parse_pattern<'a, 's, N: Ast>(
grammar: &Grammar,
bdr: &mut N::Builder,
pat: &[PatternElement],
tokens: &mut TokenIter<'a, 's>,
) -> Option<()> {
dbg_println!("Matching {:?}", pat);
for elem in pat {
dbg_println!("Elem: {:?}", elem);
match elem {
// If the next element is a static token, then check for the corresponding
// token
PatternElement::Token(expected_token_id) => match tokens.next() {
Some((ParsedToken::Static(token_id), ws)) => {
if expected_token_id != token_id {
return None;
}
// If we got the token we expected, then consume it and keep parsing
bdr.add_token(*token_id, ws);
}
_ => return None,
},
// If the next element is a type bound, then attempt to parse a node with that type and
// add it to the builder
PatternElement::Type(type_bound) => {
let node = parse_type_bound::<N>(grammar, *type_bound, tokens)?;
bdr.add_node(*type_bound, Rc::new(node));
}
// If the next element is a sequence, then repeatedly parse the patterns until one of
// them doesn't end with the given delimiter. Parsing fails if any of the patterns
// fail
PatternElement::Seq { pattern, delimiter } => {
bdr.seq_start();
loop {
dbg_println!("{{");
parse_pattern::<N>(grammar, bdr, pattern, tokens)?;
dbg_println!("}}");
// Match the delimiter, and continue the loop if it matches
match tokens.peek() {
Some((ParsedToken::Static(token_id), ws)) => {
if token_id == delimiter {
bdr.seq_delim(*token_id, ws);
tokens.next();
continue; // Parse the next element in the sequence
}
}
_ => {}
}
// If matching the delimiter failed, then the sequence is over
bdr.seq_end();
break;
}
}
}
}
// If we successfully matched all the elements, then parsing succeeded
Some(())
}
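For intuition, a delimited construct such as a JSON-style array `[ value , value , ... ]` corresponds to a three-element pattern: an opening token, a `Seq` of value-typed nodes separated by a comma token, and a closing token. A hypothetical constructor sketch (the exact field types of `Seq` and the way the `TokenId`/`TypeId` values are obtained are assumptions here; in practice they would come from the loaded `Grammar`):

fn array_pattern(open: TokenId, comma: TokenId, close: TokenId, value: TypeId) -> Vec<PatternElement> {
    vec![
        PatternElement::Token(open),
        PatternElement::Seq {
            // One value node per repetition, with `comma` consumed between repetitions
            pattern: vec![PatternElement::Type(value)],
            delimiter: comma,
        },
        PatternElement::Token(close),
    ]
}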
