Implement full parsing using PEGs
kneasle committed Sep 25, 2021
1 parent 56b6322 commit 9ab43ce
Showing 10 changed files with 508 additions and 74 deletions.
62 changes: 46 additions & 16 deletions grammar/src/grammar.rs
@@ -1,5 +1,5 @@
use std::{
collections::{HashMap, HashSet},
collections::HashMap,
fmt::{Debug, Formatter},
};

@@ -8,14 +8,17 @@ use itertools::Itertools;
use regex::Regex;
use serde::Deserialize;

use crate::parser::{self, Ast};

/// A complete specification for how to parse files of any particular language.
#[derive(Debug, Clone)]
pub struct Grammar {
root_type: TypeId,
whitespace: Whitespace,
pub(crate) types: TypeVec<Type>,
tokens: IndexVec<TokenId, Token>,
// Look-up tables for the parser

/* LOOK-UP TABLES FOR THE TOKENIZER/PARSER */
/// Mapping from token texts to IDs, stored **in decreasing order** of the text length. This
/// makes sure that the tokenizer always consumes the largest possible token (e.g. `"&&"`
/// should be tokenized into just `&&`, rather than two `&`s).
@@ -46,14 +49,40 @@ impl Grammar {
}
}

/////////////
// GETTERS //
/////////////
/// Construct a concrete AST representing a [`str`]ing of the root type according to this [`Grammar`].
pub fn parse_root<'s, N: Ast>(&self, s: &'s str) -> Result<(&'s str, N), parser::Error> {
parser::parse(self, self.root_type, s)
}

pub fn whitespace(&self) -> &Whitespace {
&self.whitespace
/// Construct a concrete AST representing a [`str`]ing according to this [`Grammar`].
pub fn parse<'s, N: Ast>(
&self,
type_id: TypeId,
s: &'s str,
) -> Result<(&'s str, N), parser::Error> {
parser::parse(self, type_id, s)
}

///////////
// TYPES //
///////////

pub fn root_type(&self) -> TypeId {
self.root_type
}

pub fn get_type(&self, id: TypeId) -> &Type {
&self.types[id]
}

pub fn type_name(&self, id: TypeId) -> &str {
&self.types[id].name
}

////////////
// TOKENS //
////////////

pub fn tokens(&self) -> &IndexSlice<TokenId, [Token]> {
&self.tokens
}
@@ -66,8 +95,8 @@ impl Grammar {
&self.tokens[id].text
}

pub fn type_name(&self, id: TypeId) -> &str {
&self.types[id].name
pub fn whitespace(&self) -> &Whitespace {
&self.whitespace
}

/// Returns the static tokens in `self`, in decreasing order of length
@@ -87,18 +116,19 @@ pub struct Type {
/// example, 'node class' types like expressions (which can never be instantiated directly) or
/// JSON fields (which are only created implicitly to contain other nodes).
pub(crate) keys: Vec<String>,
/// The complete set of types to which this type can be implicitly converted in order of
/// parsing precedence, **including**
/// itself. For [`Stringy`] types, this will only contain `self`.
pub(crate) descendants: Vec<TypeId>,
pub(crate) inner: TypeInner,
}

#[derive(Debug, Clone)]
pub enum TypeInner {
Pattern {
/// The complete set of types to which this type can be implicitly converted, **including**
/// itself. Note that [`Stringy`] types can't have descendant types.
descendants: HashSet<TypeId>,
/// The pattern describing which token sequences are valid instances of this [`Type`].
pattern: Option<Pattern>,
},
/// A [`Type`] which can't be instantiated, but can contain child nodes
Container,
/// The pattern describing which token sequences are valid instances of this [`Type`].
Pattern(Pattern),
/// A node which stores a string value, editable by the user. These nodes always correspond to
/// precisely one token.
///
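The new `Grammar::parse_root` and `Grammar::parse` methods above are the public entry points for this parser. As a rough usage sketch (not part of this commit; the `parse_file` helper and the choice of AST type are illustrative, and the snippet is written as if it lived inside this crate), a caller holding a `Grammar` and an `Ast` implementation would drive them like this:

fn parse_file<N: parser::Ast>(grammar: &Grammar, src: &str) -> Result<N, parser::Error> {
    // `parse_root` starts from the grammar's root type; `grammar.parse(type_id, src)`
    // can target any other type bound in the grammar instead.
    let (_leading_whitespace, root): (&str, N) = grammar.parse_root(src)?;
    Ok(root)
}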
25 changes: 24 additions & 1 deletion grammar/src/lib.rs
@@ -1,5 +1,28 @@
//! Crate for handling language-independent grammars.
//!
//! This includes:
//! - The central [`Grammar`] data structure (in the [`grammar`] module)
//! - A deserializeable schema for the TOML files (the [`spec`] and [`spec::convert`] modules)
//! - Code for tokenizing & parsing arbitrary strings into ASTs of any language for which a
//! [`Grammar`] is provided (in the [`tokenizer`] and [`parser`] modules, respectively).
//!
// TODO: Keep the throughput statistic up-to-date with parser changes
//! The current parsing engine is quite bodged - it works correctly for the few languages currently
//! supported by Sapling, but is relatively slow (throughput is roughly 70MB/s for JSON) and has a
//! few weird edge cases, like being unable to handle left-recursive grammars. However, it has
//! been quite extensively fuzzed (see the `fuzz` crate), so should be fairly stable and accurate.
//!
//! Speed/quality improvements to this crate are very welcome - optimisations are very much
//! encouraged and rewrites of any scale are fine as long as the external behaviour of the parser
//! doesn't change (i.e. the trees produced by [`Grammar::parse`] are still correct).
//! Well-justified use of `unsafe` is also fine, but generally Sapling cares more about safety and
//! correctness than absolutely top-notch performance. Also, I suspect that a reasonably well
//! optimised parser is unlikely to be a major bottleneck of Sapling, so it's worth doing some
//! profiling before committing large amounts of time into optimising the parsing engine.
mod grammar;
mod spec; // AST-like specification of the TOML files consumed by Sapling
pub mod parser;
mod spec; // AST-like specification of the TOML format consumed by Sapling
pub mod tokenizer;

pub use grammar::*;
223 changes: 223 additions & 0 deletions grammar/src/parser.rs
@@ -0,0 +1,223 @@
use std::rc::Rc;

use crate::{
tokenizer::{self, ParsedToken, Tokenizer},
Grammar, PatternElement, TokenId, TypeId, TypeInner,
};

/// `true` if this module should emit debug printing
const DO_DEBUG_PRINT: bool = false;

macro_rules! dbg_println {
($s: literal $(, $arg: expr)* ) => {
if DO_DEBUG_PRINT {
println!($s $(, $arg)*);
}
};
}

type TokenIter<'a, 's> = std::iter::Peekable<std::slice::Iter<'a, (ParsedToken<'s>, &'s str)>>;

pub trait Ast: Sized {
type Builder: Builder<Node = Self>;

fn new_stringy(type_id: TypeId, contents: String, display_str: String, ws: &str) -> Self;
}

// TODO: Not Debug
pub trait Builder: Sized + std::fmt::Debug {
type Node: Ast;

fn new(type_id: TypeId) -> Self;

/// Add a child [`Node`](Self::Node) to the [`Node`](Self::Node) being built
fn add_node(&mut self, type_bound: TypeId, node: Rc<Self::Node>);
/// Add a static token (and its whitespace) to the [`Node`](Self::Node) being built
fn add_token(&mut self, token: TokenId, ws: &str);

/// Start a repeating sequence of sub-patterns
fn seq_start(&mut self);
/// Move on to the next sub-pattern in a sequence, adding a delimiter in the process
fn seq_delim(&mut self, token: TokenId, ws: &str);
/// Finish a repeating sequence of sub-patterns
fn seq_end(&mut self);

/// Build this `Builder` into a corresponding [`Node`](Self::Node)
fn into_node(self) -> Self::Node;
}
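To give a sense of what these traits ask of a consumer, here is a deliberately minimal, hypothetical implementation (not part of this commit): a node type that records only its type, its children and any stringy contents, and ignores static tokens, whitespace and sequence boundaries.

#[derive(Debug)]
struct SketchNode {
    type_id: TypeId,
    children: Vec<Rc<SketchNode>>,
    stringy_contents: Option<String>,
}

#[derive(Debug)]
struct SketchBuilder {
    type_id: TypeId,
    children: Vec<Rc<SketchNode>>,
}

impl Ast for SketchNode {
    type Builder = SketchBuilder;

    fn new_stringy(type_id: TypeId, contents: String, _display_str: String, _ws: &str) -> Self {
        SketchNode { type_id, children: Vec::new(), stringy_contents: Some(contents) }
    }
}

impl Builder for SketchBuilder {
    type Node = SketchNode;

    fn new(type_id: TypeId) -> Self {
        SketchBuilder { type_id, children: Vec::new() }
    }

    // Child nodes are kept; the type bound isn't needed for this sketch
    fn add_node(&mut self, _type_bound: TypeId, node: Rc<SketchNode>) {
        self.children.push(node);
    }

    // Static tokens, whitespace and sequence boundaries carry no information here
    fn add_token(&mut self, _token: TokenId, _ws: &str) {}
    fn seq_start(&mut self) {}
    fn seq_delim(&mut self, _token: TokenId, _ws: &str) {}
    fn seq_end(&mut self) {}

    fn into_node(self) -> SketchNode {
        SketchNode { type_id: self.type_id, children: self.children, stringy_contents: None }
    }
}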

/// Parse a string into an AST node whose [`Type`] is a descendant of a given [`TypeId`], returning
/// an error if that isn't possible.
pub(crate) fn parse<'s, N: Ast>(
grammar: &Grammar,
type_id: TypeId,
s: &'s str,
) -> Result<(&'s str, N), Error> {
// TODO: Check for & handle grammars with left-recursion

let (leading_ws, tokenizer) = Tokenizer::new(grammar, s);
let tokens = tokenizer
.collect::<Result<Vec<_>, _>>()
.map_err(Error::Tokenize)?;
for t in &tokens {
dbg_println!(" {:?}", t);
}

let tree = parse_type_bound::<N>(grammar, type_id, &mut tokens.iter().peekable())
.ok_or(Error::Parse)?;
Ok((leading_ws, tree))
}

/// The different ways that parsing could fail
#[derive(Debug, Clone)]
pub enum Error {
Tokenize(tokenizer::Error),
Parse,
}

/// Parse a token stream into an AST node whose [`Type`] is a descendant of a given [`TypeId`],
/// returning an error if that isn't possible.
fn parse_type_bound<'a, 's, N: Ast>(
grammar: &Grammar,
type_bound: TypeId,
tokens: &mut TokenIter<'a, 's>,
) -> Option<N> {
let ty = grammar.get_type(type_bound);

// Try to parse each descendant type
for &descendant_id in &ty.descendants {
// TODO: Handle left-recursion
//
// We parse each descendant type on a copy of `tokens` so that, in the case of failure, the
// next type can be parsed from the same start point.
let mut tokens_clone = tokens.clone();
match parse_concrete_type::<N>(grammar, descendant_id, &mut tokens_clone) {
Some(node) => {
*tokens = tokens_clone; // This has parsed correctly, so consume its tokens
return Some(node);
}
// If parsing failed, then try other parse rules
None => {}
}
}

// If all the descendant types failed to parse, then this node's parsing fails
None
}
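The backtracking above relies on `Peekable<slice::Iter>` being cheap to clone: the clone is an independent cursor into the same token slice, so a failed attempt costs nothing and a successful one is committed by overwriting the original iterator. A standalone illustration of the idiom (not from this commit, using plain integers in place of tokens):

fn backtracking_demo() {
    let tokens = [1, 2, 3];
    let mut iter = tokens.iter().peekable();

    // Take a save-point: consuming from `attempt` does not advance `iter`
    let mut attempt = iter.clone();
    attempt.next();
    assert_eq!(iter.peek(), Some(&&1)); // original cursor is untouched

    // The attempt succeeded, so commit it by replacing the original cursor
    iter = attempt;
    assert_eq!(iter.peek(), Some(&&2));
}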

/*
/// Parse a token stream into the **shortest** (as in fewest tokens) AST node with a concrete [`Type`].
///
/// This must always terminate, even when given a left-recursive (unambiguous) grammar. Proof
/// sketch:
/// - Rules which apply left-recursion are always longer than their left-child node
/// - The left-child of a left-recursive rule must have the same type as its parent
/// => If a left-recursive node exists, then its left-child must be smaller and be of the same
/// type
/// => The left-recursive node is not the shortest node of its type given its start point (because
/// its left-child is shorter)
/// => This function can't return a left-recursive node
/// => This function doesn't need to consider left-recursive rules
/// => Termination of this function is independent of left-recursion
fn parse_shortest<'a, 's, N: Ast>(
TODO: Handle left-recursion somehow
*/

/// Attempt to parse a node of a concrete type
fn parse_concrete_type<'a, 's, N: Ast>(
grammar: &Grammar,
concrete_type_id: TypeId,
tokens: &mut TokenIter<'a, 's>,
) -> Option<N> {
let ty = grammar.get_type(concrete_type_id);

dbg_println!("Parsing concrete type {:?}", ty.name);
match &ty.inner {
TypeInner::Container => {} // Containers can't be parsed as concrete types
TypeInner::Pattern(pat) => {
let mut bdr = N::Builder::new(concrete_type_id);
if parse_pattern::<N>(grammar, &mut bdr, pat, tokens).is_some() {
return Some(bdr.into_node());
}
}
TypeInner::Stringy(_) => match tokens.next() {
Some((ParsedToken::Stringy(id, contents, display_str), ws)) => {
if *id == concrete_type_id {
return Some(N::new_stringy(
concrete_type_id,
contents.clone(),
(*display_str).to_owned(),
ws,
));
}
}
_ => {}
},
}
dbg_println!("Parsing concrete type {:?} failed", ty.name);
// If a match wasn't returned, then this type fails to parse
None
}

/// Parse a full [`Pattern`] via a [`Builder`]. This returns `Option<()>` (as opposed to a
/// [`bool`]) so that the `?` operator can be used.
#[must_use]
fn parse_pattern<'a, 's, N: Ast>(
grammar: &Grammar,
bdr: &mut N::Builder,
pat: &[PatternElement],
tokens: &mut TokenIter<'a, 's>,
) -> Option<()> {
dbg_println!("Matching {:?}", pat);
for elem in pat {
dbg_println!("Elem: {:?}", elem);
match elem {
// If the next element is a static token, then check for the corresponding
// token
PatternElement::Token(expected_token_id) => match tokens.next() {
Some((ParsedToken::Static(token_id), ws)) => {
if expected_token_id != token_id {
return None;
}
// If we got the token we expected, then consume it and keep parsing
bdr.add_token(*token_id, ws);
}
_ => return None,
},
// If the next element is a type bound, then attempt to parse a node with that type and
// add it to the builder
PatternElement::Type(type_bound) => {
let node = parse_type_bound::<N>(grammar, *type_bound, tokens)?;
bdr.add_node(*type_bound, Rc::new(node));
}
// If the next element is a sequence, then repeatedly parse the patterns until one of
// them doesn't end with the given delimiter. Parsing fails if any of the patterns
// fail
PatternElement::Seq { pattern, delimiter } => {
bdr.seq_start();
loop {
dbg_println!("{{");
parse_pattern::<N>(grammar, bdr, pattern, tokens)?;
dbg_println!("}}");
// Match the delimiter, and continue the loop if it matches
match tokens.peek() {
Some((ParsedToken::Static(token_id), ws)) => {
if token_id == delimiter {
bdr.seq_delim(*token_id, ws);
tokens.next();
continue; // Parse the next element in the sequence
}
}
_ => {}
}
// If matching the delimiter failed, then the sequence is over
bdr.seq_end();
break;
}
}
}
}
// If we successfully matched all the elements, then parsing succeeded
Some(())
}
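For intuition, a delimited construct such as a JSON-style array `[ value , value , ... ]` corresponds to a three-element pattern: an opening token, a `Seq` of value-typed nodes separated by a comma token, and a closing token. A hypothetical constructor sketch (the exact field types of `Seq` and the way the `TokenId`/`TypeId` values are obtained are assumptions here; in practice they would come from the loaded `Grammar`):

fn array_pattern(open: TokenId, comma: TokenId, close: TokenId, value: TypeId) -> Vec<PatternElement> {
    vec![
        PatternElement::Token(open),
        PatternElement::Seq {
            // One value node per repetition, with `comma` consumed between repetitions
            pattern: vec![PatternElement::Type(value)],
            delimiter: comma,
        },
        PatternElement::Token(close),
    ]
}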
