From cec24c0cb4000dee988346dcfb43b18e9701d0f2 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 13 Jul 2024 14:30:07 -0400 Subject: [PATCH] Reorganize parser and lexer to be separate features/modules. --- .github/workflows/cargo-check.yml | 4 +- .github/workflows/cargo-test.yml | 4 +- wright/Cargo.toml | 21 +++++++-- wright/benches/lexer.rs | 2 +- wright/src/ast.rs | 6 +++ wright/src/{parser/ast.rs => ast/astOld.rs} | 0 wright/src/{parser => }/ast/expression.rs | 0 .../{parser => }/ast/expression/primary.rs | 0 .../ast/expression/primary/integer_literal.rs | 0 .../ast/expression/primary/parens.rs | 0 .../src/{parser => }/ast/expression/unary.rs | 0 wright/src/ast/identifier.rs | 14 ++++++ wright/src/ast/path.rs | 20 +++++++++ wright/src/{parser => }/ast/test_utils.rs | 0 wright/src/{parser => }/ast/ty.rs | 0 wright/src/{parser => }/lexer.rs | 1 - wright/src/{parser => }/lexer/comments.rs | 0 wright/src/{parser => }/lexer/identifier.rs | 0 .../src/{parser => }/lexer/integer_literal.rs | 0 wright/src/{parser => }/lexer/quoted.rs | 2 +- wright/src/{parser => }/lexer/token.rs | 0 wright/src/{parser => }/lexer/trivial.rs | 0 wright/src/lib.rs | 10 ++++- wright/src/parser.rs | 44 ++++++++++++++++--- wright/src/parser/ast/identifier.rs | 1 - wright/src/parser/identifier.rs | 9 ++++ 26 files changed, 120 insertions(+), 18 deletions(-) create mode 100644 wright/src/ast.rs rename wright/src/{parser/ast.rs => ast/astOld.rs} (100%) rename wright/src/{parser => }/ast/expression.rs (100%) rename wright/src/{parser => }/ast/expression/primary.rs (100%) rename wright/src/{parser => }/ast/expression/primary/integer_literal.rs (100%) rename wright/src/{parser => }/ast/expression/primary/parens.rs (100%) rename wright/src/{parser => }/ast/expression/unary.rs (100%) create mode 100644 wright/src/ast/identifier.rs create mode 100644 wright/src/ast/path.rs rename wright/src/{parser => }/ast/test_utils.rs (100%) rename wright/src/{parser => }/ast/ty.rs (100%) rename 
wright/src/{parser => }/lexer.rs (99%) rename wright/src/{parser => }/lexer/comments.rs (100%) rename wright/src/{parser => }/lexer/identifier.rs (100%) rename wright/src/{parser => }/lexer/integer_literal.rs (100%) rename wright/src/{parser => }/lexer/quoted.rs (98%) rename wright/src/{parser => }/lexer/token.rs (100%) rename wright/src/{parser => }/lexer/trivial.rs (100%) delete mode 100644 wright/src/parser/ast/identifier.rs create mode 100644 wright/src/parser/identifier.rs diff --git a/.github/workflows/cargo-check.yml b/.github/workflows/cargo-check.yml index b786cada..98886669 100644 --- a/.github/workflows/cargo-check.yml +++ b/.github/workflows/cargo-check.yml @@ -18,9 +18,11 @@ jobs: features: - none - std - - source_tracking + - source-tracking - reporting - file_memmap + - ast-model + - lexer - parser - wright_library_defaults - wright_binary diff --git a/.github/workflows/cargo-test.yml b/.github/workflows/cargo-test.yml index 54d4ead2..2d117899 100644 --- a/.github/workflows/cargo-test.yml +++ b/.github/workflows/cargo-test.yml @@ -18,9 +18,11 @@ jobs: features: - none - std - - source_tracking + - source-tracking - reporting - file_memmap + - ast-model + - lexer - parser - wright_library_defaults - wright_binary diff --git a/wright/Cargo.toml b/wright/Cargo.toml index 6ef648fd..804d0cef 100644 --- a/wright/Cargo.toml +++ b/wright/Cargo.toml @@ -55,12 +55,25 @@ wright_binary = ["wright_library_defaults", "dep:clap"] # Features and dependencies useful when the wright binary is not being built or used. wright_library_defaults = ["file_memmap", "parser"] -# Wright's parser depends on the ability to report parsing errors. +# Wright's parser depends on the ability to report parsing errors and construct AST models. parser = [ "reporting", + "ast-model", + "lexer", "dep:unicode-ident" ] +# Wright's abstract syntax tree model is built on types from the "source_tracking" module. 
+ast-model = [ + "source-tracking", + # "derive_more/from" +] + +# Wright's lexical analyzer is built using types from the "source_tracking" module. +lexer = [ + "source-tracking" +] + # Loading memory mapped files from the disk requires memmap2, fs4, and the reporting feature to correctly and efficiently # read from disk. We also use `anyhow` to make error handling easier. file_memmap = [ @@ -73,13 +86,13 @@ file_memmap = [ # Reporting errors requires source tracking, codespan-reporting (for rendering diagnostics), and # termcolor (for pretty output). reporting = [ - "source_tracking", + "source-tracking", "dep:termcolor", "dep:codespan-reporting" ] # Source tracking requires just a few dependencies and standard library. -source_tracking = [ +source-tracking = [ "std", "dep:dashmap", "derive_more/display", @@ -112,7 +125,7 @@ optional = true # derive_more is used for allowing us to derive additional traits like From and Display. # Currently used by features: -# - "source_tracking" +# - "source-tracking" [dependencies.derive_more] version = "0.99.18" default-features = false diff --git a/wright/benches/lexer.rs b/wright/benches/lexer.rs index 45f2f484..c8dd76f6 100644 --- a/wright/benches/lexer.rs +++ b/wright/benches/lexer.rs @@ -4,7 +4,7 @@ use std::sync::Arc; use criterion::{black_box, criterion_group, criterion_main, Bencher, Criterion}; use wright::{ - parser::lexer::Lexer, + lexer::Lexer, source_tracking::{filename::FileName, source::Source}, }; diff --git a/wright/src/ast.rs b/wright/src/ast.rs new file mode 100644 index 00000000..a8926ad5 --- /dev/null +++ b/wright/src/ast.rs @@ -0,0 +1,6 @@ +//! [Abstract syntax tree] modeling. +//! +//! 
[Abstract syntax tree]: https://en.wikipedia.org/wiki/Abstract_syntax_tree + +pub mod identifier; +pub mod path; diff --git a/wright/src/parser/ast.rs b/wright/src/ast/astOld.rs similarity index 100% rename from wright/src/parser/ast.rs rename to wright/src/ast/astOld.rs diff --git a/wright/src/parser/ast/expression.rs b/wright/src/ast/expression.rs similarity index 100% rename from wright/src/parser/ast/expression.rs rename to wright/src/ast/expression.rs diff --git a/wright/src/parser/ast/expression/primary.rs b/wright/src/ast/expression/primary.rs similarity index 100% rename from wright/src/parser/ast/expression/primary.rs rename to wright/src/ast/expression/primary.rs diff --git a/wright/src/parser/ast/expression/primary/integer_literal.rs b/wright/src/ast/expression/primary/integer_literal.rs similarity index 100% rename from wright/src/parser/ast/expression/primary/integer_literal.rs rename to wright/src/ast/expression/primary/integer_literal.rs diff --git a/wright/src/parser/ast/expression/primary/parens.rs b/wright/src/ast/expression/primary/parens.rs similarity index 100% rename from wright/src/parser/ast/expression/primary/parens.rs rename to wright/src/ast/expression/primary/parens.rs diff --git a/wright/src/parser/ast/expression/unary.rs b/wright/src/ast/expression/unary.rs similarity index 100% rename from wright/src/parser/ast/expression/unary.rs rename to wright/src/ast/expression/unary.rs diff --git a/wright/src/ast/identifier.rs b/wright/src/ast/identifier.rs new file mode 100644 index 00000000..45e01f56 --- /dev/null +++ b/wright/src/ast/identifier.rs @@ -0,0 +1,14 @@ +//! [Identifier]s are used throughout wright as variable names, type names, function names, etc. +//! Their modeling is pretty simple, and is defined here. +//! +//! [Identifier]: https://en.wikipedia.org/wiki/Identifier + +use crate::source_tracking::fragment::Fragment; + +/// Identifiers are used as names for variables, functions, modules, etc. 
+/// These are defined using [Fragment]s of source code, which will contain the identifier itself. +#[derive(Debug, Clone)] +pub struct Identifier { + /// The fragment of source code containing the identifier. + pub fragment: Fragment, +} diff --git a/wright/src/ast/path.rs b/wright/src/ast/path.rs new file mode 100644 index 00000000..709b5a82 --- /dev/null +++ b/wright/src/ast/path.rs @@ -0,0 +1,20 @@ +//! [Path]s are used in import statements, and can take the place of an [Identifier] in many places. + +use crate::source_tracking::fragment::Fragment; +use super::identifier::Identifier; + +/// A double-colon separated path/reference to a module/function. This can be used in an `import` declaration and +/// some other places. [Path]s with length of 1 are just [Identifier]s -- [Identifier]s can be considered paths in some +/// instances. +#[derive(Debug, Clone)] +pub struct Path { + /// The [Fragment] of source code containing the full source of this path (including the double-colon separators). + pub full_path: Fragment, + + /// The first (left-most) identifier in this [Path]. This can also be considered the "root" of the path -- + /// the module that the following item/identifier can be found in. + pub head: Identifier, + + /// The rest of the [Path], following the first separator. 
+ pub tail: Option<Box<Path>> +} diff --git a/wright/src/parser/ast/test_utils.rs b/wright/src/ast/test_utils.rs similarity index 100% rename from wright/src/parser/ast/test_utils.rs rename to wright/src/ast/test_utils.rs diff --git a/wright/src/parser/ast/ty.rs b/wright/src/ast/ty.rs similarity index 100% rename from wright/src/parser/ast/ty.rs rename to wright/src/ast/ty.rs diff --git a/wright/src/parser/lexer.rs b/wright/src/lexer.rs similarity index 99% rename from wright/src/parser/lexer.rs rename to wright/src/lexer.rs index e8f12db1..388d9e6d 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/lexer.rs @@ -6,7 +6,6 @@ use self::comments::{try_match_block_comment, try_match_single_line_comment}; use self::integer_literal::try_consume_integer_literal; use self::quoted::try_consume_quoted_literal; - use crate::source_tracking::fragment::Fragment; use crate::source_tracking::SourceRef; use std::iter::FusedIterator; diff --git a/wright/src/parser/lexer/comments.rs b/wright/src/lexer/comments.rs similarity index 100% rename from wright/src/parser/lexer/comments.rs rename to wright/src/lexer/comments.rs diff --git a/wright/src/parser/lexer/identifier.rs b/wright/src/lexer/identifier.rs similarity index 100% rename from wright/src/parser/lexer/identifier.rs rename to wright/src/lexer/identifier.rs diff --git a/wright/src/parser/lexer/integer_literal.rs b/wright/src/lexer/integer_literal.rs similarity index 100% rename from wright/src/parser/lexer/integer_literal.rs rename to wright/src/lexer/integer_literal.rs diff --git a/wright/src/parser/lexer/quoted.rs b/wright/src/lexer/quoted.rs similarity index 98% rename from wright/src/parser/lexer/quoted.rs rename to wright/src/lexer/quoted.rs index cc2a788e..5cf25fbe 100644 --- a/wright/src/parser/lexer/quoted.rs +++ b/wright/src/lexer/quoted.rs @@ -62,7 +62,7 @@ pub fn try_consume_quoted_literal(lexer: &mut Lexer) -> Option { #[cfg(test)] mod tests { - use crate::parser::lexer::{token::TokenTy, Lexer}; + use 
super::super::{token::TokenTy, Lexer}; #[test] fn string_literal() { diff --git a/wright/src/parser/lexer/token.rs b/wright/src/lexer/token.rs similarity index 100% rename from wright/src/parser/lexer/token.rs rename to wright/src/lexer/token.rs diff --git a/wright/src/parser/lexer/trivial.rs b/wright/src/lexer/trivial.rs similarity index 100% rename from wright/src/parser/lexer/trivial.rs rename to wright/src/lexer/trivial.rs diff --git a/wright/src/lib.rs b/wright/src/lib.rs index c0b13ed9..cd9f3591 100644 --- a/wright/src/lib.rs +++ b/wright/src/lib.rs @@ -36,11 +36,17 @@ pub mod build_info { include!(concat!(env!("OUT_DIR"), "/built.rs")); } +#[cfg(feature = "source-tracking")] +pub mod source_tracking; + #[cfg(feature = "reporting")] pub mod reporting; -#[cfg(feature = "source_tracking")] -pub mod source_tracking; +#[cfg(feature = "lexer")] +pub mod lexer; + +#[cfg(feature = "ast-model")] +pub mod ast; #[cfg(feature = "parser")] pub mod parser; diff --git a/wright/src/parser.rs b/wright/src/parser.rs index 4cfa5954..c101b0a8 100644 --- a/wright/src/parser.rs +++ b/wright/src/parser.rs @@ -1,8 +1,40 @@ -//! The wright lexer, parser, and AST representation. +//! This parser module is responsible for turning the stream of [Token]s from the [Lexer] into a tree of [AST] nodes. +//! +//! [AST]: crate::ast -// pub mod error; -// pub mod state; -// pub mod util; +use super::lexer::{token::{Token, TokenTy}, Lexer}; -// pub mod ast; -pub mod lexer; +mod identifier; + +/// Errors that can arise when parsing a source to an abstract syntax tree node. +#[derive(Debug)] +pub enum ParseError { + /// Expected one type of token, found another + Expected { + /// The expected variant. + expected: TokenTy, + /// The token found from the lexer. + found: Option<Token>, + } +} + +/// Trait implemented by all AST nodes that can be parsed. +pub trait Parse: Sized { + /// Attempt to parse a tree node of this type from a given [Lexer]. 
+ fn parse(lexer: &mut Lexer) -> Result<Self, ParseError>; +} + +impl Lexer { + /// Pull the next token from a lexer, and return an error if it's not of the given variant. + pub fn expect(&mut self, token_ty: TokenTy) -> Result<Token, ParseError> { + let next_token = self + .next_token() + .ok_or(ParseError::Expected { expected: token_ty, found: None })?; + + if next_token.variant != token_ty { + return Err(ParseError::Expected { expected: token_ty, found: Some(next_token) }); + } + + Ok(next_token) + } +} diff --git a/wright/src/parser/ast/identifier.rs b/wright/src/parser/ast/identifier.rs deleted file mode 100644 index 0d0386eb..00000000 --- a/wright/src/parser/ast/identifier.rs +++ /dev/null @@ -1 +0,0 @@ -//! AST node implementation for parsing identifiers, which can be used to name types, variables, functions, etc. diff --git a/wright/src/parser/identifier.rs b/wright/src/parser/identifier.rs new file mode 100644 index 00000000..025b9488 --- /dev/null +++ b/wright/src/parser/identifier.rs @@ -0,0 +1,9 @@ +use crate::{ast::identifier::Identifier, lexer::{token::TokenTy, Lexer}}; +use super::{Parse, ParseError}; + +impl Parse for Identifier { + fn parse(lexer: &mut Lexer) -> Result<Self, ParseError> { + let ident_token = lexer.expect(TokenTy::Identifier)?; + Ok(Identifier { fragment: ident_token.fragment }) + } +}