From 9eab376ecdc125d1ff2afed952471c48141f3ee1 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 20 Jan 2024 03:09:41 -0500 Subject: [PATCH 01/60] WIP reworking lexer to use custom fragments and custom file map with mmap2 --- wright/Cargo.toml | 8 + wright/src/bin/wright.rs | 18 - wright/src/filemap.rs | 228 +++++++- wright/src/parser.rs | 10 +- wright/src/parser/fragment.rs | 64 +++ wright/src/parser/lexer.rs | 513 +----------------- wright/src/parser/{ => old}/ast.rs | 0 .../src/parser/{ => old}/ast/declaration.rs | 0 .../parser/{ => old}/ast/declaration/class.rs | 0 .../parser/{ => old}/ast/declaration/enum.rs | 0 .../{ => old}/ast/declaration/function.rs | 0 .../{ => old}/ast/declaration/generics.rs | 0 .../{ => old}/ast/declaration/import.rs | 0 .../{ => old}/ast/declaration/module.rs | 0 .../parser/{ => old}/ast/declaration/type.rs | 0 .../parser/{ => old}/ast/declaration/union.rs | 0 .../{ => old}/ast/declaration/visibility.rs | 0 .../{ => old}/ast/declaration/where_clause.rs | 0 wright/src/parser/{ => old}/ast/expression.rs | 0 .../parser/{ => old}/ast/expression/block.rs | 0 .../{ => old}/ast/expression/literal.rs | 0 .../ast/expression/literal/boolean.rs | 0 .../ast/expression/literal/character.rs | 0 .../ast/expression/literal/escapes.rs | 0 .../ast/expression/literal/integer.rs | 0 .../ast/expression/literal/string.rs | 0 .../{ => old}/ast/expression/parentheses.rs | 0 .../{ => old}/ast/expression/primary.rs | 0 wright/src/parser/{ => old}/ast/identifier.rs | 0 wright/src/parser/{ => old}/ast/metadata.rs | 0 wright/src/parser/{ => old}/ast/path.rs | 0 wright/src/parser/{ => old}/ast/statement.rs | 0 .../parser/{ => old}/ast/statement/bind.rs | 0 wright/src/parser/{ => old}/ast/types.rs | 0 wright/src/parser/{ => old}/error.rs | 0 wright/src/parser/old/lexer.rs | 511 +++++++++++++++++ wright/src/parser/old/lexer/definition.rs | 72 +++ .../parser/{ => old}/lexer/pretty_print.rs | 0 wright/src/parser/{ => old}/lexer/tokens.rs | 0 wright/src/parser/{ => old}/state.rs | 0 wright/src/parser/{ => old}/util.rs | 0 .../parser/{ => old}/util/discard_error.rs | 0 wright/src/parser/{ => old}/util/erase.rs | 0 .../parser/{ => old}/util/first_successful.rs | 0 wright/src/parser/{ => old}/util/ignore.rs | 0 wright/src/parser/{ => old}/util/map.rs | 0 wright/src/repl.rs | 32 +- wright/tests/lexer.rs | 62 +-- 48 files changed, 927 insertions(+), 591 deletions(-) create mode 100644 wright/src/parser/fragment.rs rename wright/src/parser/{ => old}/ast.rs (100%) rename wright/src/parser/{ => old}/ast/declaration.rs (100%) rename wright/src/parser/{ => old}/ast/declaration/class.rs (100%) rename wright/src/parser/{ => old}/ast/declaration/enum.rs (100%) rename wright/src/parser/{ => old}/ast/declaration/function.rs (100%) rename wright/src/parser/{ => old}/ast/declaration/generics.rs (100%) rename wright/src/parser/{ => old}/ast/declaration/import.rs (100%) rename wright/src/parser/{ => old}/ast/declaration/module.rs (100%) rename wright/src/parser/{ => old}/ast/declaration/type.rs (100%) rename wright/src/parser/{ => old}/ast/declaration/union.rs (100%) rename wright/src/parser/{ => old}/ast/declaration/visibility.rs (100%) rename wright/src/parser/{ => old}/ast/declaration/where_clause.rs (100%) rename wright/src/parser/{ => old}/ast/expression.rs (100%) rename wright/src/parser/{ => old}/ast/expression/block.rs (100%) rename wright/src/parser/{ => old}/ast/expression/literal.rs (100%) rename wright/src/parser/{ => old}/ast/expression/literal/boolean.rs (100%) rename wright/src/parser/{ 
=> old}/ast/expression/literal/character.rs (100%) rename wright/src/parser/{ => old}/ast/expression/literal/escapes.rs (100%) rename wright/src/parser/{ => old}/ast/expression/literal/integer.rs (100%) rename wright/src/parser/{ => old}/ast/expression/literal/string.rs (100%) rename wright/src/parser/{ => old}/ast/expression/parentheses.rs (100%) rename wright/src/parser/{ => old}/ast/expression/primary.rs (100%) rename wright/src/parser/{ => old}/ast/identifier.rs (100%) rename wright/src/parser/{ => old}/ast/metadata.rs (100%) rename wright/src/parser/{ => old}/ast/path.rs (100%) rename wright/src/parser/{ => old}/ast/statement.rs (100%) rename wright/src/parser/{ => old}/ast/statement/bind.rs (100%) rename wright/src/parser/{ => old}/ast/types.rs (100%) rename wright/src/parser/{ => old}/error.rs (100%) create mode 100644 wright/src/parser/old/lexer.rs create mode 100644 wright/src/parser/old/lexer/definition.rs rename wright/src/parser/{ => old}/lexer/pretty_print.rs (100%) rename wright/src/parser/{ => old}/lexer/tokens.rs (100%) rename wright/src/parser/{ => old}/state.rs (100%) rename wright/src/parser/{ => old}/util.rs (100%) rename wright/src/parser/{ => old}/util/discard_error.rs (100%) rename wright/src/parser/{ => old}/util/erase.rs (100%) rename wright/src/parser/{ => old}/util/first_successful.rs (100%) rename wright/src/parser/{ => old}/util/ignore.rs (100%) rename wright/src/parser/{ => old}/util/map.rs (100%) diff --git a/wright/Cargo.toml b/wright/Cargo.toml index 0eb54253..813d6ae3 100644 --- a/wright/Cargo.toml +++ b/wright/Cargo.toml @@ -24,8 +24,16 @@ derive_more = "0.99.17" unicode-ident = "1.0" codespan-reporting = "0.11.1" termcolor = "1.2.0" + +# Integers larger than 128 bits num = "0.4" +# Portable (windows, mac, linux) file locking +fs4 = { version = "0.7.0", features = ["sync"] } + +# Memory mapped files. +memmap2 = "0.9.3" + [[bin]] name = "wright" test = false diff --git a/wright/src/bin/wright.rs b/wright/src/bin/wright.rs index 7fc4f2f8..4f548b1b 100644 --- a/wright/src/bin/wright.rs +++ b/wright/src/bin/wright.rs @@ -47,24 +47,6 @@ fn main() -> Result<()> { let cli = Cli::parse(); match cli.command { - // Printing token debug information. - Some(Commands::Debug { - command: DebugCommands::Tokens { file, pretty }, - }) => { - let source_str = fs::read_to_string(&file)?; - let source = SimpleFile::new(file.to_string_lossy(), &source_str); - - if pretty { - Lexer::debug_pretty_print(&source)?; - } else { - for token in Lexer::new(&source_str) { - println!("{}", token); - } - } - - Ok(()) - } - // Start an interactive repl. Some(Commands::Repl) => repl::start(), diff --git a/wright/src/filemap.rs b/wright/src/filemap.rs index 7f167a4e..20d2a0ab 100644 --- a/wright/src/filemap.rs +++ b/wright/src/filemap.rs @@ -1,8 +1,20 @@ //! Responsible for keeping track of different files added to the Wright build system. -use codespan_reporting::files::{Files, SimpleFiles}; +use codespan_reporting::{files::{Files, SimpleFile}, term::Config, diagnostic::Diagnostic}; use derive_more::Display; -use std::path::PathBuf; +use fs4::FileExt; +use memmap2::Mmap; +use termcolor::{ColorChoice, StandardStream}; +use std::{path::PathBuf, io, fs::File, sync::mpsc, thread, time::Duration}; + +/// Rename import for clarity. +use codespan_reporting::files::Error as CodespanError; + +/// Convenience type alias. +type CodespanResult = Result; + +/// Amount of time before we should warn the user about locking the file taking too long. 
+const FILE_LOCK_WARNING_TIME: Duration = Duration::from_secs(5); /// Used to represent different file names used throughout this crate. #[derive(Debug, Display, Clone)] @@ -20,8 +32,214 @@ pub enum FileName { None, } +/// An immutable string that either references a source file in memory using an `&` reference or using a [Box]. +#[derive(Debug)] +enum ImmutableString<'src> { + /// An immutable reference to an existing string. + Reference(&'src str), + + /// An owned immutable string. + Owned(Box), + + /// A locked, memory mapped file from the OS. + LockedFile { + /// The locked file that needs to be unlocked when this object is dropped. + locked_file: File, + /// The memory locked file -- this is expected to be locked before + /// one creates it in the file + mem_map: Mmap, + } +} + /// The file map that we use throughout the rest of this crate. -pub type FileMap = SimpleFiles; +pub struct FileMap<'src> { + /// This is just a list of files we're keeping track of. + /// This is identical to the current implementation of [codespan_reporting::files::SimpleFiles], + /// but we don't use theirs because we need to iterate over the [SimpleFile]s manually for various + /// parts of the implementation (including the [Drop] implementation). + inner: Vec>> +} + + +impl<'src> FileMap<'src> { + /// Construct a new empty [FileMap]. + pub const fn new() -> Self { + FileMap { inner: Vec::new() } + } + + /// Get a reference to a file from the internal [Vec] or return a [`CodespanError::FileMissing`] error. + fn get(&self, file_id: >::FileId) -> CodespanResult<&SimpleFile>> { + self.inner.get(file_id).ok_or(CodespanError::FileMissing) + } + + /// Internal function to add a file to the vec. Public facing functions will need to do some conversion + /// and then call this. + fn add(&mut self, name: FileName, source: ImmutableString<'src>) -> >::FileId { + // The file id is just the next index in the vec. + let file_id: usize = self.inner.len(); + self.inner.push(SimpleFile::new(name, source)); + file_id + } + + /// Add a file (in the form of an owned string) to the file map. + pub fn add_string(&mut self, name: FileName, source: String) -> >::FileId { + self.add(name, ImmutableString::Owned(source.into_boxed_str())) + } + + /// Add a file (in the form of a string reference) to the file map. + pub fn add_str_ref(&mut self, name: FileName, source: &'src str) -> >::FileId { + self.add(name, ImmutableString::Reference(source)) + } + + /// Add a file from the file system. This file will be + /// opened with read permissions, locked, memory mapped, + /// and then added to the file map. The file name in the memory map will be the [PathBuf] passed to this function. + pub fn add_file(&mut self, path: PathBuf) -> io::Result<>::FileId> { + // Make a one-off enum here to use for channel messages. + enum ChannelMessage { + /// The file was successfully locked. + FileLocked(File), + /// There was an error locking the file. + LockingError(io::Error), + /// File is taking a long time to lock. + FiveSecondWarning, + } + + // Open the file for reading. + let file: File = File::open(&path)?; + + // Create two threads and a mpsc channel for warning the user if + // locking the file takes longer than 5 seconds. 
+ let (tx, rx) = mpsc::sync_channel::(1); + let timout_tx = tx.clone(); + + // Thread to lock the file + thread::spawn(move || { + match file.lock_exclusive() { + Ok(_) => tx.send(ChannelMessage::FileLocked(file)), + Err(err) => tx.send(ChannelMessage::LockingError(err)) + } + }); + + // Thread to warn user if it takes too long. + thread::spawn(move || { + thread::sleep(FILE_LOCK_WARNING_TIME); + timout_tx.send(ChannelMessage::FiveSecondWarning); + }); -/// The file id type used to refer to files in the file map. -pub type FileId = >::FileId; + // Use an infinite loop to make sure we recieve all the messages from the senders. + loop { + match rx.recv() { + // Emit the diagnostic for the 5-second warning. + Ok(ChannelMessage::FiveSecondWarning) => { + // Get a lock on the standard out so that we don't get interrupted here. + let stdout = StandardStream::stdout(ColorChoice::Auto); + let mut stdout = stdout.lock(); + // Make the diagnostic to show to the user. + let message = format!("Getting a file lock on {} has taken more than {} seconds.", path.display(), FILE_LOCK_WARNING_TIME.as_secs()); + let diagnostic: Diagnostic< as Files<'src>>::FileId> = Diagnostic::note().with_message(message); + // Emit the diagnostic to the user. + codespan_reporting::term::emit(&mut stdout, &Config::default(), self, &diagnostic) + // Convert from the potential codespan error to a normal IO err. + .map_err(|cs_err: CodespanError| match cs_err { + CodespanError::Io(io_err) => io_err, + _ => unreachable!("We should not see any other codespan errors here, since we do not reference files in this diagnostic."), + })? + } + + // Handle any io errors locking the file by returning them. + Ok(ChannelMessage::LockingError(io_err)) => return Err(io_err), + + // Handle success by finishing adding the file to the FileMap. + Ok(ChannelMessage::FileLocked(file)) => { + // The file is now locked, we can memmory map it and add it ro the vec. + // SAFETY: The file should be locked at this point so undefined behaviour from concurrent + // modification is avoided. + let mem_map: Mmap = unsafe { + Mmap::map(&file) + // Make sure we unlock the file if there's an issue memory mapping it. + .map_err(|err| { + file.unlock(); + err + }) + }?; + + // Double check that the file is valid utf-8. If not, return an IO error. + let raw_data: &[u8] = mem_map.as_ref(); + let as_str: Result<&str, std::str::Utf8Error> = std::str::from_utf8(raw_data); + if as_str.is_err() { + // The file is not valid for us so we should unlock it and return an error. + file.unlock(); + return Err(io::Error::new(io::ErrorKind::InvalidData, as_str.unwrap_err())); + } + + // The file's contents are valid utf-8, add them to the file map. + let file_id: usize = self.inner.len(); + self.add(FileName::Real(path), ImmutableString::LockedFile { locked_file: file, mem_map }); + return Ok(file_id); + } + + Err(_) => unreachable!("The reciever should never reach a state where both senders are closed."), + } + } + } +} + +impl<'src> Drop for FileMap<'src> { + fn drop(&mut self) { + // Unlock all files from the file system. + for file in self.inner.iter() { + match file.source() { + // Locked and memory-mapped files need to be unlocked before dropping. + ImmutableString::LockedFile { locked_file, .. } => { + // Unlock the file to give back to the OS. + locked_file.unlock(); + }, + + // All other types of file can drop normally. + _ => {} + } + } + } +} + +/// The implementation here is basically identical to the one for [codespan_reporting::files::SimpleFiles]. 
+impl<'src> Files<'src> for FileMap<'src> { + /// File IDs here are just indices into [FileMap]'s internal [Vec]. + type FileId = usize; + + type Name = FileName; + + type Source = &'src str; + + fn name(&self, id: Self::FileId) -> Result { + Ok(self.get(id)?.name().clone()) + } + + fn source(&'src self, id: Self::FileId) -> Result { + Ok(self.get(id)?.source().as_ref()) + } + + fn line_index(&self, id: Self::FileId, byte_index: usize) -> Result { + self.get(id)?.line_index((), byte_index) + } + + fn line_range(&self, id: Self::FileId, line_index: usize) -> Result, codespan_reporting::files::Error> { + self.get(id)?.line_range((), line_index) + } +} + +impl<'src> AsRef for ImmutableString<'src> { + fn as_ref(&self) -> &str { + match self { + ImmutableString::Reference(str) => str, + ImmutableString::Owned(str) => &str, + ImmutableString::LockedFile { mem_map, .. } => { + // Get a direct reference to the data that is in the memory map. + let raw_data: &[u8] = mem_map.as_ref(); + // SAFETY: UTF-8 validity is checked when the file is added to the file map. + unsafe { std::str::from_utf8_unchecked(raw_data) } + } + } + } +} diff --git a/wright/src/parser.rs b/wright/src/parser.rs index 3779b817..a1c20b2e 100644 --- a/wright/src/parser.rs +++ b/wright/src/parser.rs @@ -1,7 +1,9 @@ //! Parsers module, for all the parsers implemented by wright and necessary to parse wright source code. -pub mod ast; -pub mod error; +// pub mod ast; +// pub mod error; +// pub mod state; +// pub mod util; + +pub mod fragment; pub mod lexer; -pub mod state; -pub mod util; diff --git a/wright/src/parser/fragment.rs b/wright/src/parser/fragment.rs new file mode 100644 index 00000000..49755154 --- /dev/null +++ b/wright/src/parser/fragment.rs @@ -0,0 +1,64 @@ +//! [Fragment] struct and implementation for dealing with fragments of source code. + +use std::str::Chars; + +/// A fragment of source code. +#[derive(Clone, Copy, Debug)] +pub struct Fragment<'src> { + /// Fragments are represented using direct string references into the source file itself. + pub inner: &'src str +} + +impl<'src> Fragment<'src> { + /// Get the length (in bytes) of this fragment. + pub const fn len(&self) -> usize { + self.inner.len() + } + + /// Return true if this fragment overlaps at all with the other (either one contains the start of the other, + /// by pointer). + pub fn overlaps(&self, other: &Self) -> bool { + // Get the pointer to the start of the string. + let (start, len) = (self.inner.as_ptr(), self.len()); + // Get a pointer just past the end of the string. + // SAFETY: the resulting pointer is guarunteed to point at one byte past the end of the string. + let end = unsafe { start.add(len) }; + + // Do the same thing for the other fragment. + let (other_start, len) = (other.inner.as_ptr(), other.len()); + let other_end = unsafe { other_start.add(len) }; + + // Check bounds. + (start <= other_start && other_start < end) || (other_start <= start && start < other_end) + } + + /// Split this fragment into two sub fragments, with the first one being `bytes` long and the second containing the + /// rest of this fragment. + pub fn split(&self, bytes: usize) -> (Self, Self) { + (Self { inner: &self.inner[..bytes] }, Self { inner: &self.inner[bytes..]}) + } + + /// Get an iterator over the characters in this fragment. 
+ pub fn chars(&self) -> Chars<'src> { + self.inner.chars() + } +} + + +#[cfg(test)] +mod tests { + use crate::parser::fragment::Fragment; + + #[test] + fn test_overlap() { + let a = Fragment { inner: "Test string" }; + let b = Fragment { inner: &a.inner[3..] }; + let c = Fragment { inner: &a.inner[..a.len()-3] }; + let d = Fragment { inner: "other string" }; + + assert!(a.overlaps(&b)); + assert!(b.overlaps(&c)); + assert!(c.overlaps(&a)); + assert!(!a.overlaps(&d)); + } +} diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index 2d34d58b..41ff4f60 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -1,507 +1,14 @@ -//! The wright lexer. This module is responsible for lexical analysis and initial processing of source code. +//! First pass lexer that gets run on the source code and returns a series of tokens with their associated [Fragment]s. +//! +//! Note that this will strip out comments and whitespace, returning only fragments that match one of the paterns +//! defined for tokens. -mod pretty_print; -pub mod tokens; +use super::fragment::Fragment; -use std::{ - iter::{FusedIterator, Peekable}, - str::CharIndices, -}; - -use self::tokens::{CommentTy, Token, TokenTy}; - -/// Lexical analyzer for wright code. This struct host functions that produce tokens from wright source. -#[derive(Debug, Clone)] -pub struct Lexer<'a> { - /// Iterator over the indexed input characters tied to the lifetime of the source code. - iterator: Peekable>, - /// The source code passed to the lexer. This is used to check for keywords. - source: &'a str, -} - -impl<'a> Lexer<'a> { - /// Create a new lexer that iterates on a given source string. - pub fn new(source: &'a str) -> Self { - Lexer { - iterator: source.char_indices().peekable(), - source, - } - } -} - -impl<'a> Iterator for Lexer<'a> { - type Item = Token; - - fn next(&mut self) -> Option { - // Get the next character out of the iterator. - let (start_index, next) = self.iterator.next()?; - - // Handle single character tokens first. - let single_char_tokens = [ - ('(', TokenTy::LeftParen), - (')', TokenTy::RightParen), - ('[', TokenTy::LeftSquare), - (']', TokenTy::RightSquare), - ('{', TokenTy::LeftBracket), - ('}', TokenTy::RightBracket), - ('@', TokenTy::At), - (';', TokenTy::Semi), - ('?', TokenTy::Question), - (',', TokenTy::Comma), - ('#', TokenTy::Pound), - ('$', TokenTy::Dollar), - ]; - - for (c, variant) in single_char_tokens { - if next == c { - return Some(Token { variant, length: 1 }); - } - } - - // Next handle tokens that can possibly be followed by an equal sign. - let possible_eq_upgrades = [ - ('!', TokenTy::Bang, TokenTy::BangEq), - ('%', TokenTy::Mod, TokenTy::ModEq), - ('^', TokenTy::Xor, TokenTy::XorEq), - ('*', TokenTy::Star, TokenTy::StarEq), - ('+', TokenTy::Plus, TokenTy::PlusEq), - ]; - - for (c, no_eq, with_eq) in possible_eq_upgrades { - if next == c { - return match self.iterator.next_if(|&(_, x)| x == '=') { - Some(_) => Some(Token { - variant: with_eq, - length: 2, - }), - None => Some(Token { - variant: no_eq, - length: 1, - }), - }; - } - } - - // Next handle tokens that can be doubled or have an equals sign. 
- let possible_eq_or_double = [ - ('&', TokenTy::And, TokenTy::AndEq, TokenTy::AndAnd), - ('|', TokenTy::Or, TokenTy::OrEq, TokenTy::OrOr), - ('<', TokenTy::Lt, TokenTy::LtEq, TokenTy::ShiftLeft), - ('>', TokenTy::Gt, TokenTy::GtEq, TokenTy::ShiftRight), - (':', TokenTy::Colon, TokenTy::ColonEq, TokenTy::ColonColon), - ('/', TokenTy::Div, TokenTy::DivEq, TokenTy::DivDiv), - ]; - - for (c, alone, with_eq, doubled) in possible_eq_or_double { - if next == c { - return match self.iterator.next_if(|&(_, x)| x == '=' || x == c) { - // Followed by `=` - Some((_, '=')) => Some(Token { - variant: with_eq, - length: 2, - }), - - // Followed by itself. - Some(_) => Some(Token { - variant: doubled, - length: 2, - }), - - // Single char token - None => Some(Token { - variant: alone, - length: 1, - }), - }; - } - } - - // Next deal with arrows - let arrows = [ - ('-', TokenTy::Minus, TokenTy::MinusEq, TokenTy::SingleArrow), - ('=', TokenTy::Eq, TokenTy::EqEq, TokenTy::DoubleArrow), - ('~', TokenTy::Tilde, TokenTy::TildeEq, TokenTy::TildeArrow), - ]; - - for (c, alone, with_eq, as_arrow) in arrows { - if next == c { - return match self.iterator.next_if(|&(_, x)| x == '=' || x == '>') { - Some((_, '=')) => Some(Token { - variant: with_eq, - length: 2, - }), - Some((_, '>')) => Some(Token { - variant: as_arrow, - length: 2, - }), - None => Some(Token { - variant: alone, - length: 1, - }), - _ => unreachable!(), - }; - } - } - - // Dot and range operators. - if next == '.' { - return match self.iterator.next_if(|&(_, x)| x == '.') { - None => Some(Token { - variant: TokenTy::Dot, - length: 1, - }), - Some(_) => match self.iterator.next_if(|&(_, x)| x == '=') { - None => Some(Token { - variant: TokenTy::Range, - length: 2, - }), - Some(_) => Some(Token { - variant: TokenTy::RangeInclusive, - length: 3, - }), - }, - }; - } - - // Whitespace. - if next.is_whitespace() { - // Accumulate the number of bytes of whitespace consumed. - let mut acc = next.len_utf8(); - // Use while-let instead of take-while to avoid consuming the whole iterator. - while let Some((_, consumed)) = self.iterator.next_if(|&(_, x)| x.is_whitespace()) { - acc += consumed.len_utf8(); - } - - return Some(Token { - variant: TokenTy::Whitespace, - length: acc, - }); - } - - // Identifiers - if unicode_ident::is_xid_start(next) || next == '_' { - // Accumulate the number of bytes consumed in the identifier. - let mut acc = next.len_utf8(); - // Consume the rest of the identifier. - while let Some((_, consumed)) = self - .iterator - .next_if(|&(_, x)| unicode_ident::is_xid_continue(x)) - { - acc += consumed.len_utf8(); - } - - // Get the matching source code to check for reserved words. - let range = start_index..start_index + acc; - let matching_source = &self.source[range]; - - // Match on reserved words. - let variant: TokenTy = match matching_source { - // Declaration keywords - "class" => TokenTy::Class, - "struct" => TokenTy::Struct, - "record" => TokenTy::Record, - "trait" => TokenTy::Trait, - "func" => TokenTy::Func, - "enum" => TokenTy::Enum, - "union" => TokenTy::Union, - "module" => TokenTy::Module, - "import" => TokenTy::Import, - "implement" => TokenTy::Implement, - "represent" => TokenTy::Represent, - - // Visibility keywords - "public" => TokenTy::Public, - "package" => TokenTy::Package, - "private" => TokenTy::Private, - - // Boolean literals - "true" => TokenTy::True, - "false" => TokenTy::False, - - // Other keywords. 
- "constraint" => TokenTy::Constraint, - "constrain" => TokenTy::Constrain, - "relation" => TokenTy::Relation, - "unsafe" => TokenTy::Unsafe, - "unchecked" => TokenTy::Unchecked, - "lifetime" => TokenTy::Lifetime, - "outlives" => TokenTy::Outlives, - "Self" => TokenTy::SelfUpper, - "self" => TokenTy::SelfLower, - "type" => TokenTy::Type, - "const" => TokenTy::Const, - "var" => TokenTy::Var, - "if" => TokenTy::If, - "else" => TokenTy::Else, - "match" => TokenTy::Match, - "is" => TokenTy::Is, - "as" => TokenTy::As, - "on" => TokenTy::On, - "in" => TokenTy::In, - "not" => TokenTy::Not, - "dyn" => TokenTy::Dyn, - "try" => TokenTy::Try, - - _ => TokenTy::Identifier, - }; - - return Some(Token { - variant, - length: acc, - }); - } - - // Numerical literals. - if next.is_ascii_digit() { - // Accumulate the number of bytes consumed in the numeric literal. - // All ascii is 1 byte wide so avoid the extra call to `.len_utf8()`. - let mut acc = 1; - // Track the radix - let mut radix = 10; - - // Change the radix if necessary - if next == '0' { - if let Some((_, prefix)) = self - .iterator - .next_if(|(_, x)| ['x', 'o', 'b', 'X', 'B'].contains(x)) - { - acc += 1; - - radix = match prefix { - 'x' | 'X' => 16, - 'b' | 'B' => 2, - 'o' => 8, - _ => unreachable!(), - }; - } - } - - // Consume the rest of the integer literal. - while self - .iterator - .next_if(|&(_, x)| x.is_digit(radix) || x == '_') - .is_some() - { - // All accepted characters should be ascii, so we can just simplify `.len_utf8()` to 1. - acc += 1; - } - - return Some(Token { - variant: TokenTy::IntegerLit, - length: acc, - }); - } - - // String and Character literals. - if ['\'', '"', '`'].contains(&next) { - // Accumulator to track number of bytes consumed. - let mut acc: usize = 1; - let mut is_terminated = false; - - // Consume characters until the end of the literal - while let Some((_, consumed)) = self.iterator.next() { - acc += consumed.len_utf8(); - - match consumed { - // Ending character is the same as starting character. - // Escapes should all be handled, so don't worry about this being escaped. - _ if consumed == next => { - is_terminated = true; - break; - } - - // Escaped pattern. - // Only worry about escaped terminators here, since all other escaped - // patterns can be dealt with later. - '\\' => { - // Consume the escaped character regardless of what it is. - // It will always be part of the quoted literal. - if let Some((_, escaped)) = self.iterator.next() { - acc += escaped.len_utf8(); - } - } - - // Do nothing for non-escaped chars since the quoted literal continues - // and we have already recorded the consumed bytes. - _ => {} - } - } - - // We have finished consuming the literal -- make sure we produce the - // right variant - return match next { - '\'' => Some(Token { - variant: TokenTy::CharLit { is_terminated }, - length: acc, - }), - _ => Some(Token { - variant: TokenTy::StringLit { - is_format: next == '`', - is_terminated, - }, - length: acc, - }), - }; - } - - // Comments. - if next == '#' { - // Use accumulator to track number of bytes consumed. - let mut acc = 1; - - // There are a few variants as follows. - // `#...` - single line comment - // `#*...*#` - multiline comment - // `##...` - single line inner doc comment - // `##!...` - single line outer doc comment - // `#**...*#` - multiline inner doc comment - // `#*!...*#` - multiline outer doc comment - // If a multiline comment is not terminated by the end of the file then just mark it as such in the - // produced token. 
A seperate token error handling layer will raise that outside of this function. - - // Handle multiline comments - if self.iterator.next_if(|&(_, x)| x == '*').is_some() { - acc += 1; - - // Check if it's a doc comment. - let comment_type = match self.iterator.next_if(|&(_, x)| x == '*' || x == '!') { - Some((_, '*')) => { - acc += 1; - CommentTy::InnerDoc - } - - Some((_, '!')) => { - acc += 1; - CommentTy::OuterDoc - } - - None => CommentTy::Normal, - - _ => unreachable!(), - }; - - // Read the rest of the multi-line comment - while let Some((_, consumed)) = self.iterator.next() { - acc += consumed.len_utf8(); - if consumed == '*' && matches!(self.iterator.peek(), Some((_, '#'))) { - acc += 1; - return Some(Token { - variant: TokenTy::MultilineComment { - comment_type, - is_terminated: true, - }, - length: acc, - }); - } - } - - // If we hit the end, the comment is not terminated. - return Some(Token { - variant: TokenTy::MultilineComment { - comment_type, - is_terminated: false, - }, - length: acc, - }); - } - - // Handle single line comment. - let mut comment_type = CommentTy::Normal; - - // Check for inner doc comment - if self.iterator.next_if(|&(_, x)| x == '#').is_some() { - acc += 1; - comment_type = CommentTy::InnerDoc; - - // Check for outer doc comment - if self.iterator.next_if(|&(_, x)| x == '!').is_some() { - acc += 1; - comment_type = CommentTy::OuterDoc; - } - } - - // Read to end of line/file for rest of comment. Include line ending in consumed bytes. - for (_, consumed) in self.iterator.by_ref() { - acc += consumed.len_utf8(); - if consumed == '\n' { - break; - } - } - - return Some(Token { - variant: TokenTy::SingleLineComment { comment_type }, - length: acc, - }); - } - - // If we haven't matched by this point, return an unknown token. - Some(Token { - variant: TokenTy::Unknown, - length: next.len_utf8(), - }) - } - - fn size_hint(&self) -> (usize, Option) { - // Get the size hint of the internal iterator. - let (inner_lower, upper) = self.iterator.size_hint(); - // If there are any characters left, then there is at least one token remaining. - ((inner_lower > 0) as usize, upper) - } -} - -impl<'a> FusedIterator for Lexer<'a> {} - -/// A token with an index in a piece of source code. -#[derive(Copy, Clone, Debug)] -pub struct IndexedToken { - /// The byte index into the source code that this token starts on. - pub index: usize, - /// The token itself. - pub token: Token, -} - -/// An iterator over the tokens in the source code with byte indices attached. -#[derive(Debug, Clone)] -pub struct IndexedLexer<'src> { - /// The current index in source code -- the number of bytes currently consumed by the iterator. - pub index: usize, - /// The underlying lexer iterator. - lexer: Lexer<'src>, -} - -impl<'src> IndexedLexer<'src> { - /// Construct a new indexed lexer. - pub fn new(source: &'src str) -> Self { - Self { - index: 0, - lexer: Lexer::new(source), - } - } -} - -impl<'a> Iterator for IndexedLexer<'a> { - type Item = IndexedToken; - - fn next(&mut self) -> Option { - // Pull a token from the iterator. - let token = self.lexer.next()?; - - // If available, add the current index to it to return. - let indexed_token = IndexedToken { - index: self.index, - token, - }; - - // Update the current index with the length of the token. 
- self.index += token.length; - - // Return indexed token - Some(indexed_token) - } - - fn size_hint(&self) -> (usize, Option) { - self.lexer.size_hint() - } +/// The +#[derive(Debug)] +pub struct Lexer<'src> { + /// The remaining source code that has not been processed and returned as a token from the iterator yet. + pub remaining: Fragment<'src>, } -impl<'a> FusedIterator for IndexedLexer<'a> {} diff --git a/wright/src/parser/ast.rs b/wright/src/parser/old/ast.rs similarity index 100% rename from wright/src/parser/ast.rs rename to wright/src/parser/old/ast.rs diff --git a/wright/src/parser/ast/declaration.rs b/wright/src/parser/old/ast/declaration.rs similarity index 100% rename from wright/src/parser/ast/declaration.rs rename to wright/src/parser/old/ast/declaration.rs diff --git a/wright/src/parser/ast/declaration/class.rs b/wright/src/parser/old/ast/declaration/class.rs similarity index 100% rename from wright/src/parser/ast/declaration/class.rs rename to wright/src/parser/old/ast/declaration/class.rs diff --git a/wright/src/parser/ast/declaration/enum.rs b/wright/src/parser/old/ast/declaration/enum.rs similarity index 100% rename from wright/src/parser/ast/declaration/enum.rs rename to wright/src/parser/old/ast/declaration/enum.rs diff --git a/wright/src/parser/ast/declaration/function.rs b/wright/src/parser/old/ast/declaration/function.rs similarity index 100% rename from wright/src/parser/ast/declaration/function.rs rename to wright/src/parser/old/ast/declaration/function.rs diff --git a/wright/src/parser/ast/declaration/generics.rs b/wright/src/parser/old/ast/declaration/generics.rs similarity index 100% rename from wright/src/parser/ast/declaration/generics.rs rename to wright/src/parser/old/ast/declaration/generics.rs diff --git a/wright/src/parser/ast/declaration/import.rs b/wright/src/parser/old/ast/declaration/import.rs similarity index 100% rename from wright/src/parser/ast/declaration/import.rs rename to wright/src/parser/old/ast/declaration/import.rs diff --git a/wright/src/parser/ast/declaration/module.rs b/wright/src/parser/old/ast/declaration/module.rs similarity index 100% rename from wright/src/parser/ast/declaration/module.rs rename to wright/src/parser/old/ast/declaration/module.rs diff --git a/wright/src/parser/ast/declaration/type.rs b/wright/src/parser/old/ast/declaration/type.rs similarity index 100% rename from wright/src/parser/ast/declaration/type.rs rename to wright/src/parser/old/ast/declaration/type.rs diff --git a/wright/src/parser/ast/declaration/union.rs b/wright/src/parser/old/ast/declaration/union.rs similarity index 100% rename from wright/src/parser/ast/declaration/union.rs rename to wright/src/parser/old/ast/declaration/union.rs diff --git a/wright/src/parser/ast/declaration/visibility.rs b/wright/src/parser/old/ast/declaration/visibility.rs similarity index 100% rename from wright/src/parser/ast/declaration/visibility.rs rename to wright/src/parser/old/ast/declaration/visibility.rs diff --git a/wright/src/parser/ast/declaration/where_clause.rs b/wright/src/parser/old/ast/declaration/where_clause.rs similarity index 100% rename from wright/src/parser/ast/declaration/where_clause.rs rename to wright/src/parser/old/ast/declaration/where_clause.rs diff --git a/wright/src/parser/ast/expression.rs b/wright/src/parser/old/ast/expression.rs similarity index 100% rename from wright/src/parser/ast/expression.rs rename to wright/src/parser/old/ast/expression.rs diff --git a/wright/src/parser/ast/expression/block.rs 
b/wright/src/parser/old/ast/expression/block.rs similarity index 100% rename from wright/src/parser/ast/expression/block.rs rename to wright/src/parser/old/ast/expression/block.rs diff --git a/wright/src/parser/ast/expression/literal.rs b/wright/src/parser/old/ast/expression/literal.rs similarity index 100% rename from wright/src/parser/ast/expression/literal.rs rename to wright/src/parser/old/ast/expression/literal.rs diff --git a/wright/src/parser/ast/expression/literal/boolean.rs b/wright/src/parser/old/ast/expression/literal/boolean.rs similarity index 100% rename from wright/src/parser/ast/expression/literal/boolean.rs rename to wright/src/parser/old/ast/expression/literal/boolean.rs diff --git a/wright/src/parser/ast/expression/literal/character.rs b/wright/src/parser/old/ast/expression/literal/character.rs similarity index 100% rename from wright/src/parser/ast/expression/literal/character.rs rename to wright/src/parser/old/ast/expression/literal/character.rs diff --git a/wright/src/parser/ast/expression/literal/escapes.rs b/wright/src/parser/old/ast/expression/literal/escapes.rs similarity index 100% rename from wright/src/parser/ast/expression/literal/escapes.rs rename to wright/src/parser/old/ast/expression/literal/escapes.rs diff --git a/wright/src/parser/ast/expression/literal/integer.rs b/wright/src/parser/old/ast/expression/literal/integer.rs similarity index 100% rename from wright/src/parser/ast/expression/literal/integer.rs rename to wright/src/parser/old/ast/expression/literal/integer.rs diff --git a/wright/src/parser/ast/expression/literal/string.rs b/wright/src/parser/old/ast/expression/literal/string.rs similarity index 100% rename from wright/src/parser/ast/expression/literal/string.rs rename to wright/src/parser/old/ast/expression/literal/string.rs diff --git a/wright/src/parser/ast/expression/parentheses.rs b/wright/src/parser/old/ast/expression/parentheses.rs similarity index 100% rename from wright/src/parser/ast/expression/parentheses.rs rename to wright/src/parser/old/ast/expression/parentheses.rs diff --git a/wright/src/parser/ast/expression/primary.rs b/wright/src/parser/old/ast/expression/primary.rs similarity index 100% rename from wright/src/parser/ast/expression/primary.rs rename to wright/src/parser/old/ast/expression/primary.rs diff --git a/wright/src/parser/ast/identifier.rs b/wright/src/parser/old/ast/identifier.rs similarity index 100% rename from wright/src/parser/ast/identifier.rs rename to wright/src/parser/old/ast/identifier.rs diff --git a/wright/src/parser/ast/metadata.rs b/wright/src/parser/old/ast/metadata.rs similarity index 100% rename from wright/src/parser/ast/metadata.rs rename to wright/src/parser/old/ast/metadata.rs diff --git a/wright/src/parser/ast/path.rs b/wright/src/parser/old/ast/path.rs similarity index 100% rename from wright/src/parser/ast/path.rs rename to wright/src/parser/old/ast/path.rs diff --git a/wright/src/parser/ast/statement.rs b/wright/src/parser/old/ast/statement.rs similarity index 100% rename from wright/src/parser/ast/statement.rs rename to wright/src/parser/old/ast/statement.rs diff --git a/wright/src/parser/ast/statement/bind.rs b/wright/src/parser/old/ast/statement/bind.rs similarity index 100% rename from wright/src/parser/ast/statement/bind.rs rename to wright/src/parser/old/ast/statement/bind.rs diff --git a/wright/src/parser/ast/types.rs b/wright/src/parser/old/ast/types.rs similarity index 100% rename from wright/src/parser/ast/types.rs rename to wright/src/parser/old/ast/types.rs diff --git 
a/wright/src/parser/error.rs b/wright/src/parser/old/error.rs similarity index 100% rename from wright/src/parser/error.rs rename to wright/src/parser/old/error.rs diff --git a/wright/src/parser/old/lexer.rs b/wright/src/parser/old/lexer.rs new file mode 100644 index 00000000..baf58553 --- /dev/null +++ b/wright/src/parser/old/lexer.rs @@ -0,0 +1,511 @@ +//! The wright lexer. This module is responsible for lexical analysis and initial processing of source code. +//! +//! This is implemented here using an iterator that looks up the next character from the input using a `const`-defined +//! lexer structure definition. This can be found in [definition]. + +pub mod tokens; +mod definition; +// mod pretty_print; + +use std::{ + iter::{FusedIterator, Peekable}, + str::CharIndices, +}; + +use self::tokens::{CommentTy, Token, TokenTy}; + +/// Lexical analyzer for wright code. This struct host functions that produce tokens from wright source. +#[derive(Debug, Clone)] +pub struct Lexer<'a> { + /// Iterator over the indexed input characters tied to the lifetime of the source code. + iterator: Peekable>, + /// The source code passed to the lexer. This is used to check for keywords. + source: &'a str, +} + +impl<'a> Lexer<'a> { + /// Create a new lexer that iterates on a given source string. + pub fn new(source: &'a str) -> Self { + Lexer { + iterator: source.char_indices().peekable(), + source, + } + } +} + +impl<'a> Iterator for Lexer<'a> { + type Item = Token; + + fn next(&mut self) -> Option { + // Get the next character out of the iterator. + let (start_index, next) = self.iterator.next()?; + + // Handle single character tokens first. + let single_char_tokens = [ + ('(', TokenTy::LeftParen), + (')', TokenTy::RightParen), + ('[', TokenTy::LeftSquare), + (']', TokenTy::RightSquare), + ('{', TokenTy::LeftBracket), + ('}', TokenTy::RightBracket), + ('@', TokenTy::At), + (';', TokenTy::Semi), + ('?', TokenTy::Question), + (',', TokenTy::Comma), + ('#', TokenTy::Pound), + ('$', TokenTy::Dollar), + ]; + + for (c, variant) in single_char_tokens { + if next == c { + return Some(Token { variant, length: 1 }); + } + } + + // Next handle tokens that can possibly be followed by an equal sign. + let possible_eq_upgrades = [ + ('!', TokenTy::Bang, TokenTy::BangEq), + ('%', TokenTy::Mod, TokenTy::ModEq), + ('^', TokenTy::Xor, TokenTy::XorEq), + ('*', TokenTy::Star, TokenTy::StarEq), + ('+', TokenTy::Plus, TokenTy::PlusEq), + ]; + + for (c, no_eq, with_eq) in possible_eq_upgrades { + if next == c { + return match self.iterator.next_if(|&(_, x)| x == '=') { + Some(_) => Some(Token { + variant: with_eq, + length: 2, + }), + None => Some(Token { + variant: no_eq, + length: 1, + }), + }; + } + } + + // Next handle tokens that can be doubled or have an equals sign. + let possible_eq_or_double = [ + ('&', TokenTy::And, TokenTy::AndEq, TokenTy::AndAnd), + ('|', TokenTy::Or, TokenTy::OrEq, TokenTy::OrOr), + ('<', TokenTy::Lt, TokenTy::LtEq, TokenTy::ShiftLeft), + ('>', TokenTy::Gt, TokenTy::GtEq, TokenTy::ShiftRight), + (':', TokenTy::Colon, TokenTy::ColonEq, TokenTy::ColonColon), + ('/', TokenTy::Div, TokenTy::DivEq, TokenTy::DivDiv), + ]; + + for (c, alone, with_eq, doubled) in possible_eq_or_double { + if next == c { + return match self.iterator.next_if(|&(_, x)| x == '=' || x == c) { + // Followed by `=` + Some((_, '=')) => Some(Token { + variant: with_eq, + length: 2, + }), + + // Followed by itself. 
+ Some(_) => Some(Token { + variant: doubled, + length: 2, + }), + + // Single char token + None => Some(Token { + variant: alone, + length: 1, + }), + }; + } + } + + // Next deal with arrows + let arrows = [ + ('-', TokenTy::Minus, TokenTy::MinusEq, TokenTy::SingleArrow), + ('=', TokenTy::Eq, TokenTy::EqEq, TokenTy::DoubleArrow), + ('~', TokenTy::Tilde, TokenTy::TildeEq, TokenTy::TildeArrow), + ]; + + for (c, alone, with_eq, as_arrow) in arrows { + if next == c { + return match self.iterator.next_if(|&(_, x)| x == '=' || x == '>') { + Some((_, '=')) => Some(Token { + variant: with_eq, + length: 2, + }), + Some((_, '>')) => Some(Token { + variant: as_arrow, + length: 2, + }), + None => Some(Token { + variant: alone, + length: 1, + }), + _ => unreachable!(), + }; + } + } + + // Dot and range operators. + if next == '.' { + return match self.iterator.next_if(|&(_, x)| x == '.') { + None => Some(Token { + variant: TokenTy::Dot, + length: 1, + }), + Some(_) => match self.iterator.next_if(|&(_, x)| x == '=') { + None => Some(Token { + variant: TokenTy::Range, + length: 2, + }), + Some(_) => Some(Token { + variant: TokenTy::RangeInclusive, + length: 3, + }), + }, + }; + } + + // Whitespace. + if next.is_whitespace() { + // Accumulate the number of bytes of whitespace consumed. + let mut acc = next.len_utf8(); + // Use while-let instead of take-while to avoid consuming the whole iterator. + while let Some((_, consumed)) = self.iterator.next_if(|&(_, x)| x.is_whitespace()) { + acc += consumed.len_utf8(); + } + + return Some(Token { + variant: TokenTy::Whitespace, + length: acc, + }); + } + + // Identifiers + if unicode_ident::is_xid_start(next) || next == '_' { + // Accumulate the number of bytes consumed in the identifier. + let mut acc = next.len_utf8(); + // Consume the rest of the identifier. + while let Some((_, consumed)) = self + .iterator + .next_if(|&(_, x)| unicode_ident::is_xid_continue(x)) + { + acc += consumed.len_utf8(); + } + + // Get the matching source code to check for reserved words. + let range = start_index..start_index + acc; + let matching_source = &self.source[range]; + + // Match on reserved words. + let variant: TokenTy = match matching_source { + // Declaration keywords + "class" => TokenTy::Class, + "struct" => TokenTy::Struct, + "record" => TokenTy::Record, + "trait" => TokenTy::Trait, + "func" => TokenTy::Func, + "enum" => TokenTy::Enum, + "union" => TokenTy::Union, + "module" => TokenTy::Module, + "import" => TokenTy::Import, + "implement" => TokenTy::Implement, + "represent" => TokenTy::Represent, + + // Visibility keywords + "public" => TokenTy::Public, + "package" => TokenTy::Package, + "private" => TokenTy::Private, + + // Boolean literals + "true" => TokenTy::True, + "false" => TokenTy::False, + + // Other keywords. 
+ "constraint" => TokenTy::Constraint, + "constrain" => TokenTy::Constrain, + "relation" => TokenTy::Relation, + "unsafe" => TokenTy::Unsafe, + "unchecked" => TokenTy::Unchecked, + "lifetime" => TokenTy::Lifetime, + "outlives" => TokenTy::Outlives, + "Self" => TokenTy::SelfUpper, + "self" => TokenTy::SelfLower, + "type" => TokenTy::Type, + "const" => TokenTy::Const, + "var" => TokenTy::Var, + "if" => TokenTy::If, + "else" => TokenTy::Else, + "match" => TokenTy::Match, + "is" => TokenTy::Is, + "as" => TokenTy::As, + "on" => TokenTy::On, + "in" => TokenTy::In, + "not" => TokenTy::Not, + "dyn" => TokenTy::Dyn, + "try" => TokenTy::Try, + + _ => TokenTy::Identifier, + }; + + return Some(Token { + variant, + length: acc, + }); + } + + // Numerical literals. + if next.is_ascii_digit() { + // Accumulate the number of bytes consumed in the numeric literal. + // All ascii is 1 byte wide so avoid the extra call to `.len_utf8()`. + let mut acc = 1; + // Track the radix + let mut radix = 10; + + // Change the radix if necessary + if next == '0' { + if let Some((_, prefix)) = self + .iterator + .next_if(|(_, x)| ['x', 'o', 'b', 'X', 'B'].contains(x)) + { + acc += 1; + + radix = match prefix { + 'x' | 'X' => 16, + 'b' | 'B' => 2, + 'o' => 8, + _ => unreachable!(), + }; + } + } + + // Consume the rest of the integer literal. + while self + .iterator + .next_if(|&(_, x)| x.is_digit(radix) || x == '_') + .is_some() + { + // All accepted characters should be ascii, so we can just simplify `.len_utf8()` to 1. + acc += 1; + } + + return Some(Token { + variant: TokenTy::IntegerLit, + length: acc, + }); + } + + // String and Character literals. + if ['\'', '"', '`'].contains(&next) { + // Accumulator to track number of bytes consumed. + let mut acc: usize = 1; + let mut is_terminated = false; + + // Consume characters until the end of the literal + while let Some((_, consumed)) = self.iterator.next() { + acc += consumed.len_utf8(); + + match consumed { + // Ending character is the same as starting character. + // Escapes should all be handled, so don't worry about this being escaped. + _ if consumed == next => { + is_terminated = true; + break; + } + + // Escaped pattern. + // Only worry about escaped terminators here, since all other escaped + // patterns can be dealt with later. + '\\' => { + // Consume the escaped character regardless of what it is. + // It will always be part of the quoted literal. + if let Some((_, escaped)) = self.iterator.next() { + acc += escaped.len_utf8(); + } + } + + // Do nothing for non-escaped chars since the quoted literal continues + // and we have already recorded the consumed bytes. + _ => {} + } + } + + // We have finished consuming the literal -- make sure we produce the + // right variant + return match next { + '\'' => Some(Token { + variant: TokenTy::CharLit { is_terminated }, + length: acc, + }), + _ => Some(Token { + variant: TokenTy::StringLit { + is_format: next == '`', + is_terminated, + }, + length: acc, + }), + }; + } + + // Comments. + if next == '#' { + // Use accumulator to track number of bytes consumed. + let mut acc = 1; + + // There are a few variants as follows. + // `#...` - single line comment + // `#*...*#` - multiline comment + // `##...` - single line inner doc comment + // `##!...` - single line outer doc comment + // `#**...*#` - multiline inner doc comment + // `#*!...*#` - multiline outer doc comment + // If a multiline comment is not terminated by the end of the file then just mark it as such in the + // produced token. 
A seperate token error handling layer will raise that outside of this function. + + // Handle multiline comments + if self.iterator.next_if(|&(_, x)| x == '*').is_some() { + acc += 1; + + // Check if it's a doc comment. + let comment_type = match self.iterator.next_if(|&(_, x)| x == '*' || x == '!') { + Some((_, '*')) => { + acc += 1; + CommentTy::InnerDoc + } + + Some((_, '!')) => { + acc += 1; + CommentTy::OuterDoc + } + + None => CommentTy::Normal, + + _ => unreachable!(), + }; + + // Read the rest of the multi-line comment + while let Some((_, consumed)) = self.iterator.next() { + acc += consumed.len_utf8(); + if consumed == '*' && matches!(self.iterator.peek(), Some((_, '#'))) { + acc += 1; + return Some(Token { + variant: TokenTy::MultilineComment { + comment_type, + is_terminated: true, + }, + length: acc, + }); + } + } + + // If we hit the end, the comment is not terminated. + return Some(Token { + variant: TokenTy::MultilineComment { + comment_type, + is_terminated: false, + }, + length: acc, + }); + } + + // Handle single line comment. + let mut comment_type = CommentTy::Normal; + + // Check for inner doc comment + if self.iterator.next_if(|&(_, x)| x == '#').is_some() { + acc += 1; + comment_type = CommentTy::InnerDoc; + + // Check for outer doc comment + if self.iterator.next_if(|&(_, x)| x == '!').is_some() { + acc += 1; + comment_type = CommentTy::OuterDoc; + } + } + + // Read to end of line/file for rest of comment. Include line ending in consumed bytes. + for (_, consumed) in self.iterator.by_ref() { + acc += consumed.len_utf8(); + if consumed == '\n' { + break; + } + } + + return Some(Token { + variant: TokenTy::SingleLineComment { comment_type }, + length: acc, + }); + } + + // If we haven't matched by this point, return an unknown token. + Some(Token { + variant: TokenTy::Unknown, + length: next.len_utf8(), + }) + } + + fn size_hint(&self) -> (usize, Option) { + // Get the size hint of the internal iterator. + let (inner_lower, upper) = self.iterator.size_hint(); + // If there are any characters left, then there is at least one token remaining. + ((inner_lower > 0) as usize, upper) + } +} + +impl<'a> FusedIterator for Lexer<'a> {} + +/// A token with an index in a piece of source code. +#[derive(Copy, Clone, Debug)] +pub struct IndexedToken { + /// The byte index into the source code that this token starts on. + pub index: usize, + /// The token itself. + pub token: Token, +} + +/// An iterator over the tokens in the source code with byte indices attached. +#[derive(Debug, Clone)] +pub struct IndexedLexer<'src> { + /// The current index in source code -- the number of bytes currently consumed by the iterator. + pub index: usize, + /// The underlying lexer iterator. + lexer: Lexer<'src>, +} + +impl<'src> IndexedLexer<'src> { + /// Construct a new indexed lexer. + pub fn new(source: &'src str) -> Self { + Self { + index: 0, + lexer: Lexer::new(source), + } + } +} + +impl<'a> Iterator for IndexedLexer<'a> { + type Item = IndexedToken; + + fn next(&mut self) -> Option { + // Pull a token from the iterator. + let token = self.lexer.next()?; + + // If available, add the current index to it to return. + let indexed_token = IndexedToken { + index: self.index, + token, + }; + + // Update the current index with the length of the token. 
+ self.index += token.length; + + // Return indexed token + Some(indexed_token) + } + + fn size_hint(&self) -> (usize, Option) { + self.lexer.size_hint() + } +} + +impl<'a> FusedIterator for IndexedLexer<'a> {} diff --git a/wright/src/parser/old/lexer/definition.rs b/wright/src/parser/old/lexer/definition.rs new file mode 100644 index 00000000..8dbeb5fa --- /dev/null +++ b/wright/src/parser/old/lexer/definition.rs @@ -0,0 +1,72 @@ +//! The lexer definition in a rust constant that tells us how to handle characters encountered and lists all the +//! possible tokens produced. + +use super::tokens::{TokenTy}; + +/// A single character token matches a single character from the input, and produces a token of the length of the +/// character exactly. +#[derive(Clone, Copy, Debug)] +pub struct SingleCharToken { + /// The character to match. + pub matching_char: char, + /// The token type produced. + pub produces: TokenTy, +} + +impl SingleCharToken { + /// Turn a single character token into a lexer branch. + const fn into_lexer_branch(self) -> LexerBranch { + LexerBranch::SingleCharToken(self) + } +} + +/// A set of posible continuations from a single char token that will form multi char tokens +/// (i.e. going from `&` to `&&` and `&=`). +#[derive(Clone, Copy, Debug)] +pub struct PossibleContinuations { + /// The base single char and the token it produces when not followed by one of the other possible characters. + pub base: SingleCharToken, + /// The characters that can follow this and the tokens they would produce. + pub continuations: &'static [(char, TokenTy)] +} + +impl PossibleContinuations { + /// Convert to a [LexerBranch]. + const fn into_lexer_branch(self) -> LexerBranch { + LexerBranch::PossibleContinuations(self) + } +} + +/// A branch in the lexer, representing options to be tried. +#[derive(Debug)] +pub enum LexerBranch { + /// A single character token (such as '[') with no option for continuation. + SingleCharToken(SingleCharToken), + PossibleContinuations(PossibleContinuations) + +} + +// Below is a variety of `const-fn`s to make generating this structure easier. + +/// Makes a [SingleCharToken]. +const fn single(matching_char: char, produces: TokenTy) -> SingleCharToken { + SingleCharToken { matching_char, produces } +} + +/// Makes a [PossibleContinuations]. +const fn pc(matching_char: char, produces: TokenTy, continuations: &'static [(char, TokenTy)]) -> PossibleContinuations { + PossibleContinuations { base: single(matching_char, produces), continuations } +} + + +/// The lexer's definition, in abstract branching. 
+pub const DEFINITION: &[LexerBranch] = &[ + single('(', TokenTy::LeftParen).into_lexer_branch(), + single(')', TokenTy::RightParen).into_lexer_branch(), + + pc('+', TokenTy::Plus, &[ + ('=', TokenTy::PlusEq), + ]).into_lexer_branch(), + + +]; diff --git a/wright/src/parser/lexer/pretty_print.rs b/wright/src/parser/old/lexer/pretty_print.rs similarity index 100% rename from wright/src/parser/lexer/pretty_print.rs rename to wright/src/parser/old/lexer/pretty_print.rs diff --git a/wright/src/parser/lexer/tokens.rs b/wright/src/parser/old/lexer/tokens.rs similarity index 100% rename from wright/src/parser/lexer/tokens.rs rename to wright/src/parser/old/lexer/tokens.rs diff --git a/wright/src/parser/state.rs b/wright/src/parser/old/state.rs similarity index 100% rename from wright/src/parser/state.rs rename to wright/src/parser/old/state.rs diff --git a/wright/src/parser/util.rs b/wright/src/parser/old/util.rs similarity index 100% rename from wright/src/parser/util.rs rename to wright/src/parser/old/util.rs diff --git a/wright/src/parser/util/discard_error.rs b/wright/src/parser/old/util/discard_error.rs similarity index 100% rename from wright/src/parser/util/discard_error.rs rename to wright/src/parser/old/util/discard_error.rs diff --git a/wright/src/parser/util/erase.rs b/wright/src/parser/old/util/erase.rs similarity index 100% rename from wright/src/parser/util/erase.rs rename to wright/src/parser/old/util/erase.rs diff --git a/wright/src/parser/util/first_successful.rs b/wright/src/parser/old/util/first_successful.rs similarity index 100% rename from wright/src/parser/util/first_successful.rs rename to wright/src/parser/old/util/first_successful.rs diff --git a/wright/src/parser/util/ignore.rs b/wright/src/parser/old/util/ignore.rs similarity index 100% rename from wright/src/parser/util/ignore.rs rename to wright/src/parser/old/util/ignore.rs diff --git a/wright/src/parser/util/map.rs b/wright/src/parser/old/util/map.rs similarity index 100% rename from wright/src/parser/util/map.rs rename to wright/src/parser/old/util/map.rs diff --git a/wright/src/repl.rs b/wright/src/repl.rs index 66582fb5..74560937 100644 --- a/wright/src/repl.rs +++ b/wright/src/repl.rs @@ -2,7 +2,7 @@ use crate::{ filemap::{FileMap, FileName}, - parser::lexer::Lexer, + // parser::lexer::Lexer, WRIGHT_VERSION, }; use derive_more::Display; @@ -120,34 +120,6 @@ pub fn start() -> anyhow::Result<()> { write!(&mut output, "[{}]: << ", input_number)?; output.flush()?; - // Add line to the code map. - let file_handle = code_map.add( - FileName::Repl { - line_number: input_number, - }, - line, - ); - // Get a ref to the line we just added to the code map. - let line_ref: &str = code_map.get(file_handle).unwrap().source().as_str(); - - match repl_mode { - ReplMode::Ast => { - unimplemented!("AST mode is unimplemented."); - } - - ReplMode::Tokens => { - // Make a new lexer and iterate through the tokens generated. - let lexer = Lexer::new(line_ref); - - for token in lexer { - write!(&mut output, "[{}]", token)?; - } - - // Write newline. 
- writeln!(&mut output)?; - } - - ReplMode::Eval => unimplemented!("Eval mode is unimplemented."), - } + unimplemented!("REPL needs to be re-worked a bit."); } } diff --git a/wright/tests/lexer.rs b/wright/tests/lexer.rs index 4dd0acde..42b972a9 100644 --- a/wright/tests/lexer.rs +++ b/wright/tests/lexer.rs @@ -1,33 +1,33 @@ -use wright::parser::lexer::{ - tokens::{Token, TokenTy}, - Lexer, -}; +// use wright::parser::lexer::{ +// tokens::{Token, TokenTy}, +// Lexer, +// }; -/// Test unterminated string literal. -#[test] -fn unterminated_string_literal() { - let tokens: Vec = Lexer::new(r#""this string is not closed"#).collect(); - assert_eq!(tokens.len(), 1); - assert_eq!( - tokens[0].variant, - TokenTy::StringLit { - is_format: false, - is_terminated: false - } - ); -} +// /// Test unterminated string literal. +// #[test] +// fn unterminated_string_literal() { +// let tokens: Vec = Lexer::new(r#""this string is not closed"#).collect(); +// assert_eq!(tokens.len(), 1); +// assert_eq!( +// tokens[0].variant, +// TokenTy::StringLit { +// is_format: false, +// is_terminated: false +// } +// ); +// } -/// Test string literal with escaped terminal. -#[test] -fn string_with_escape() { - let tokens: Vec = - Lexer::new(r#" "this string has an escaped terminator \" " "#).collect(); - assert_eq!(tokens.len(), 3); - assert_eq!( - tokens[1].variant, - TokenTy::StringLit { - is_format: false, - is_terminated: true - } - ); -} +// /// Test string literal with escaped terminal. +// #[test] +// fn string_with_escape() { +// let tokens: Vec = +// Lexer::new(r#" "this string has an escaped terminator \" " "#).collect(); +// assert_eq!(tokens.len(), 3); +// assert_eq!( +// tokens[1].variant, +// TokenTy::StringLit { +// is_format: false, +// is_terminated: true +// } +// ); +// } From e9348626e7e3f71028aecc3094b2ebdcc53fe61d Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 20 Jan 2024 03:19:41 -0500 Subject: [PATCH 02/60] Better error handling on unlocks. --- wright/src/filemap.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/wright/src/filemap.rs b/wright/src/filemap.rs index 20d2a0ab..45d7d21b 100644 --- a/wright/src/filemap.rs +++ b/wright/src/filemap.rs @@ -124,7 +124,7 @@ impl<'src> FileMap<'src> { // Thread to warn user if it takes too long. thread::spawn(move || { thread::sleep(FILE_LOCK_WARNING_TIME); - timout_tx.send(ChannelMessage::FiveSecondWarning); + timout_tx.send(ChannelMessage::FiveSecondWarning) }); // Use an infinite loop to make sure we recieve all the messages from the senders. @@ -159,7 +159,7 @@ impl<'src> FileMap<'src> { Mmap::map(&file) // Make sure we unlock the file if there's an issue memory mapping it. .map_err(|err| { - file.unlock(); + file.unlock().map_err(|err| eprintln!("Error unlocking file: {:?}", err)).ok(); err }) }?; @@ -169,7 +169,7 @@ impl<'src> FileMap<'src> { let as_str: Result<&str, std::str::Utf8Error> = std::str::from_utf8(raw_data); if as_str.is_err() { // The file is not valid for us so we should unlock it and return an error. - file.unlock(); + file.unlock().map_err(|err| eprintln!("Error unlocking file: {:?}", err)).ok(); return Err(io::Error::new(io::ErrorKind::InvalidData, as_str.unwrap_err())); } @@ -193,7 +193,7 @@ impl<'src> Drop for FileMap<'src> { // Locked and memory-mapped files need to be unlocked before dropping. ImmutableString::LockedFile { locked_file, .. } => { // Unlock the file to give back to the OS. 
- locked_file.unlock(); + locked_file.unlock().map_err(|err| eprintln!("Error unlocking file: {:?}", err)).ok(); }, // All other types of file can drop normally. From 1e82b8582f6e694978de19efc8d696677da8d3ce Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 20 Jan 2024 03:20:01 -0500 Subject: [PATCH 03/60] Typo --- wright/src/filemap.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wright/src/filemap.rs b/wright/src/filemap.rs index 45d7d21b..5dbce910 100644 --- a/wright/src/filemap.rs +++ b/wright/src/filemap.rs @@ -196,7 +196,7 @@ impl<'src> Drop for FileMap<'src> { locked_file.unlock().map_err(|err| eprintln!("Error unlocking file: {:?}", err)).ok(); }, - // All other types of file can drop normally. + // All other types of files can drop normally. _ => {} } } From a974b69dbcd38225b2af0770cf6f0426f1078ec1 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 20 Jan 2024 20:59:11 -0500 Subject: [PATCH 04/60] Remove excess file ID calculation --- wright/src/filemap.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/wright/src/filemap.rs b/wright/src/filemap.rs index 5dbce910..b362089b 100644 --- a/wright/src/filemap.rs +++ b/wright/src/filemap.rs @@ -174,9 +174,7 @@ impl<'src> FileMap<'src> { } // The file's contents are valid utf-8, add them to the file map. - let file_id: usize = self.inner.len(); - self.add(FileName::Real(path), ImmutableString::LockedFile { locked_file: file, mem_map }); - return Ok(file_id); + return Ok(self.add(FileName::Real(path), ImmutableString::LockedFile { locked_file: file, mem_map })); } Err(_) => unreachable!("The reciever should never reach a state where both senders are closed."), From 02e284213b362be9a0c33ee14b8456371acbb2d8 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sun, 21 Jan 2024 04:42:52 -0500 Subject: [PATCH 05/60] Additions to fragments and filemaps --- wright/src/filemap.rs | 16 ++++++++++++++++ wright/src/parser/fragment.rs | 34 +++++++++++++++++++++++----------- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/wright/src/filemap.rs b/wright/src/filemap.rs index b362089b..9bad2d2e 100644 --- a/wright/src/filemap.rs +++ b/wright/src/filemap.rs @@ -6,6 +6,7 @@ use fs4::FileExt; use memmap2::Mmap; use termcolor::{ColorChoice, StandardStream}; use std::{path::PathBuf, io, fs::File, sync::mpsc, thread, time::Duration}; +use crate::parser::fragment::Fragment; /// Rename import for clarity. use codespan_reporting::files::Error as CodespanError; @@ -181,6 +182,21 @@ impl<'src> FileMap<'src> { } } } + + /// Find the file ID of a given [Fragment] using the fragment's internal pointer. + pub fn find_fragment(&self, fragment: &Fragment<'src>) -> Option<>::FileId> { + // Iterate on file IDs. + for file_id in 0..self.inner.len() { + // Use expect because all of these file IDs should be fine. + let source: &str = self.source(file_id).expect("All file IDs here are valid"); + if (Fragment { inner: source }).contains(fragment) { + return Some(file_id); + } + } + + // If there was no file containing the given fragment, return none. + None + } } impl<'src> Drop for FileMap<'src> { diff --git a/wright/src/parser/fragment.rs b/wright/src/parser/fragment.rs index 49755154..ae5d73da 100644 --- a/wright/src/parser/fragment.rs +++ b/wright/src/parser/fragment.rs @@ -15,23 +15,35 @@ impl<'src> Fragment<'src> { self.inner.len() } - /// Return true if this fragment overlaps at all with the other (either one contains the start of the other, - /// by pointer). 
- pub fn overlaps(&self, other: &Self) -> bool { - // Get the pointer to the start of the string. - let (start, len) = (self.inner.as_ptr(), self.len()); + /// Get a pair of pointers, the first one being at the beginning of the fragment, the second one pointing + /// to the byte after the end of the fragment. + const fn start_and_end(&self) -> (*const u8, *const u8) { + // Get the pointer to the start of the fragment. + let start: *const u8 = self.inner.as_ptr(); // Get a pointer just past the end of the string. // SAFETY: the resulting pointer is guarunteed to point at one byte past the end of the string. - let end = unsafe { start.add(len) }; - - // Do the same thing for the other fragment. - let (other_start, len) = (other.inner.as_ptr(), other.len()); - let other_end = unsafe { other_start.add(len) }; + (start, unsafe { start.add(self.len()) }) + } - // Check bounds. + /// Return true if this fragment overlaps at all with the other (either one contains the start of the other, + /// by pointer). + pub fn overlaps(&self, other: &Self) -> bool { + // Get start and end pointers for both fragments. + let (start, end) = self.start_and_end(); + let (other_start, other_end) = other.start_and_end(); + // Check if this fragment contains either end of the other fragment. (start <= other_start && other_start < end) || (other_start <= start && start < other_end) } + /// Return true if this fragment entirely contains another fragment using pointers. + pub fn contains(&self, other: &Self) -> bool { + // Get start and end pointers for both fragments. + let (start, end) = self.start_and_end(); + let (other_start, other_end) = other.start_and_end(); + // Check bounds. + start <= other_start && end >= other_end + } + /// Split this fragment into two sub fragments, with the first one being `bytes` long and the second containing the /// rest of this fragment. pub fn split(&self, bytes: usize) -> (Self, Self) { From 277d5bd01429245e40189340ca380fca0605f65b Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sun, 21 Jan 2024 13:01:16 -0500 Subject: [PATCH 06/60] Fix inaccurate Drop implementation --- wright/src/filemap.rs | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/wright/src/filemap.rs b/wright/src/filemap.rs index 9bad2d2e..1ca33687 100644 --- a/wright/src/filemap.rs +++ b/wright/src/filemap.rs @@ -199,20 +199,21 @@ impl<'src> FileMap<'src> { } } -impl<'src> Drop for FileMap<'src> { +/// Implement drop here to make sure that the files get unlocked as they go out of scope/use. +impl<'src> Drop for ImmutableString<'src> { fn drop(&mut self) { - // Unlock all files from the file system. - for file in self.inner.iter() { - match file.source() { - // Locked and memory-mapped files need to be unlocked before dropping. - ImmutableString::LockedFile { locked_file, .. } => { - // Unlock the file to give back to the OS. - locked_file.unlock().map_err(|err| eprintln!("Error unlocking file: {:?}", err)).ok(); - }, - - // All other types of files can drop normally. - _ => {} + match self { + // Unlock locked files. + ImmutableString::LockedFile { locked_file, .. } => { + locked_file.unlock() + // Log the error if there is one, + .map_err(|io_err: io::Error| eprintln!("{}", io_err)) + // Discard value of result + .ok(); } + + // All other types drop trivially. 
+ ImmutableString::Owned(_) | ImmutableString::Reference(_) => {} } } } From ab217f0ad8c9abc8ec2cbf366451be9ff52bb49b Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sun, 21 Jan 2024 13:19:25 -0500 Subject: [PATCH 07/60] Doc tweaks --- wright/src/filemap.rs | 2 +- wright/src/solver.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/wright/src/filemap.rs b/wright/src/filemap.rs index 1ca33687..70d05348 100644 --- a/wright/src/filemap.rs +++ b/wright/src/filemap.rs @@ -57,7 +57,7 @@ pub struct FileMap<'src> { /// This is just a list of files we're keeping track of. /// This is identical to the current implementation of [codespan_reporting::files::SimpleFiles], /// but we don't use theirs because we need to iterate over the [SimpleFile]s manually for various - /// parts of the implementation (including the [Drop] implementation). + /// parts of the implementation. inner: Vec>> } diff --git a/wright/src/solver.rs b/wright/src/solver.rs index 06bf6a66..6679fbf5 100644 --- a/wright/src/solver.rs +++ b/wright/src/solver.rs @@ -1 +1 @@ -//! The logical induction engine for wright -- this manages provingf out types and constraints at compile time. +//! The logical induction engine for wright -- this manages proving out types and constraints at compile time. From 8490ce24bc3d08b08e8b5b8fcd8b8c6b5768755b Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Mon, 22 Jan 2024 01:04:07 -0500 Subject: [PATCH 08/60] Docs clarification --- wright/src/filemap.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wright/src/filemap.rs b/wright/src/filemap.rs index 70d05348..0ce9afa0 100644 --- a/wright/src/filemap.rs +++ b/wright/src/filemap.rs @@ -56,7 +56,7 @@ enum ImmutableString<'src> { pub struct FileMap<'src> { /// This is just a list of files we're keeping track of. /// This is identical to the current implementation of [codespan_reporting::files::SimpleFiles], - /// but we don't use theirs because we need to iterate over the [SimpleFile]s manually for various + /// but we don't use theirs because we need to iterate over each [SimpleFile] manually for various /// parts of the implementation. inner: Vec>> } From 7fb7217e30930fef71e43eaa0afce598a727bec3 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 27 Jan 2024 03:16:51 -0500 Subject: [PATCH 09/60] Start tokens and lexer --- wright/src/parser/lexer.rs | 149 ++++++++++++++++++++++++++++++++++++- 1 file changed, 148 insertions(+), 1 deletion(-) diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index 41ff4f60..aeaccf78 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -5,10 +5,157 @@ use super::fragment::Fragment; -/// The +/// The lexical analyser for wright. This produces a series of tokens that make up the larger elements of the language. #[derive(Debug)] pub struct Lexer<'src> { /// The remaining source code that has not been processed and returned as a token from the iterator yet. pub remaining: Fragment<'src>, } +/// A token in wright source code. +#[derive(Debug)] +pub struct Token<'src> { + /// What type of token this is. + pub variant: TokenTy, + /// The matching fragment of source code -- this contains the location and length data for the token. + pub fragment: Fragment<'src> +} + +/// The different types of tokens in wright source. +#[rustfmt::skip] // Turn off auto reformat. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum TokenTy { + LeftCurly, RightCurly, + LeftBracket, RightBracket, + LeftParen, RightParen, + + Plus, PlusEq, + Minus, MinusEq, + Star, StarEq, + Div, DivEq, + Xor, XorEq, + Mod, ModEq, + Bang, BangEq, + Eq, EqEq, + + Lt, LtEq, LtLt, + Gt, GtEq, GtGt, + And, AndEq, AndAnd, + Or, OrEq, OrOr, + + Colon, ColonColon, + + At, + Tilde, + Underscore, + Semi, + Dot, + Comma, + Hash, + + Identifier, + + OuterDocComment, OuterBlockDocComment, + InnerDocComment, InnerBlockDocComment, + + KwRecord, + KwType, + KwEnum, + KwUnion, + KwFunc, + KwRepr, + KwImpl, + KwConstraint, + KwTrait, + KwUse, + KwAs, + KwConst, + KwMod, + KwIf, + KwElse, + KwFor, + KwIn, + KwWhile, + KwTrue, + KwFalse, + KwLoop, + KwWhere, + + /// Unknown character in lexer fragment. + Unknown +} + +impl<'src> Lexer<'src> { + /// Get the number of bytes remaining that we need to transform into tokens. + pub const fn bytes_remaining(&self) -> usize { + self.remaining.len() + } + + /// Construct a new lexer over a given reference to a source string. + pub const fn new(source: &'src str) -> Self { + Lexer { remaining: Fragment { inner: source } } + } + + /// Try to match a single character to a single character token if possible. + #[rustfmt::skip] + const fn single_char_tokens(c: char) -> Option { + use TokenTy::*; + + match c { + '{' => Some(LeftCurly), + '}' => Some(RightCurly), + '[' => Some(LeftBracket), + ']' => Some(RightBracket), + '(' => Some(LeftParen), + ')' => Some(RightParen), + + '@' => Some(At), + '~' => Some(Tilde), + '_' => Some(Underscore), + '.' => Some(Dot), + ',' => Some(Comma), + ';' => Some(Semi), + '#' => Some(Hash), + + _ => None, + } + } + + /// Try to match a fragment recognized to be an identifier or keyword to + /// a keyword or return [TokenTy::Identifier]. + fn identifier_or_keyword(fragment: Fragment<'src>) -> TokenTy { + use TokenTy::*; + + match fragment.inner { + "record" => KwRecord, + "type" => KwType, + "enum" => KwEnum, + "union" => KwUnion, + "func" => KwFunc, + "repr" => KwRepr, + "impl" => KwImpl, + "constraint" => KwConstraint, + "trait" => KwTrait, + "const" => KwConst, + "where" => KwWhere, + + "use" => KwUse, + "as" => KwAs, + "mod" => KwMod, + + "if" => KwIf, + "else" => KwElse, + + "for" => KwFor, + "in" => KwIn, + "while" => KwWhile, + "loop" => KwLoop, + + "true" => KwTrue, + "false" => KwFalse, + + _ => Identifier + } + } + +} From 66a95d6a3c77af95aceadd224ea3730c6c391c54 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Wed, 31 Jan 2024 01:48:33 -0500 Subject: [PATCH 10/60] Get llvm install working and make some progress on re-creating the lexer. --- wright/Cargo.toml | 85 ++++++++++++++++------ wright/src/parser/fragment.rs | 12 +++- wright/src/parser/lexer.rs | 124 +++++++++++++++++++++++++-------- wright/src/parser/old/lexer.rs | 13 +--- 4 files changed, 170 insertions(+), 64 deletions(-) diff --git a/wright/Cargo.toml b/wright/Cargo.toml index 813d6ae3..a69f80fb 100644 --- a/wright/Cargo.toml +++ b/wright/Cargo.toml @@ -1,8 +1,9 @@ +# PACKAGE METADATA [package] name = "wright" -description = "The wright programming language interpreter and tooling." +description = "The wright programming language compiler and tooling." 
license = "MIT" -version = "0.10.0" +version = "0.9.0" authors = ["Venus Xeon-Blonde "] repository = "https://github.com/Alfriadox/wright-lang" documentation = "https://docs.rs/wright" @@ -11,37 +12,77 @@ keywords = ["wright", "language", "bytecode", "compiler", "interpreter"] edition.workspace = true rust-version.workspace = true +# LIBRARY METADATA [lib] name = "wright" test = true doctest = true doc = true -[dependencies] -clap = { version = "4", features = ["derive"] } -anyhow = "1" -derive_more = "0.99.17" -unicode-ident = "1.0" -codespan-reporting = "0.11.1" -termcolor = "1.2.0" +# BINARIES +[[bin]] +name = "wright" +test = false +doc = false +doctest = false + +# CRATES.IO BADGES +[badges.maintenance] +status = "actively-developed" + +# DEPENDENCIES: + +# Comand-line interface generator +[dependencies.clap] +version = "4" +features = ["derive"] + +# Error handling glue code +[dependencies.anyhow] +version = "1" + +# Derives for various traits +[dependencies.derive_more] +version = "0.99.17" -# Integers larger than 128 bits -num = "0.4" +# Unicode identifier functions +[dependencies.unicode-ident] +version = "1.0" + +# Source code location tracking and cli error rendering +[dependencies.codespan-reporting] +version = "0.11.1" + +# Terminal output colors +[dependencies.termcolor] +version = "1.2.0" + +# Big Integers +[dependencies.num] +version = "0.4" # Portable (windows, mac, linux) file locking -fs4 = { version = "0.7.0", features = ["sync"] } +[dependencies.fs4] +version = "0.7.0" +features = ["sync"] # Memory mapped files. -memmap2 = "0.9.3" +[dependencies.memmap2] +version = "0.9.3" -[[bin]] -name = "wright" -test = false -doc = false -doctest = false +# Unsafe bindings to LLVM +# See https://llvm.org/. +[dependencies.llvm-sys] +version = "170" +features = ["strict-versioning", "force-static"] + +# Safe bindings to llvm +[dependencies.inkwell] +version = "0.3" +features = ["llvm17-0"] -[badges] -maintenance = { status = "actively-developed" } +# TEST DEPENDENCIES -[dev-dependencies] -rayon = "1.8.0" +# Rayon to speed up brute-force testing in some cases. +[dev-dependencies.rayon] +version = "1.8.0" diff --git a/wright/src/parser/fragment.rs b/wright/src/parser/fragment.rs index ae5d73da..8ef6c4ab 100644 --- a/wright/src/parser/fragment.rs +++ b/wright/src/parser/fragment.rs @@ -15,6 +15,11 @@ impl<'src> Fragment<'src> { self.inner.len() } + /// Check if the length of this fragment is zero. + pub const fn is_empty(&self) -> bool { + self.inner.is_empty() + } + /// Get a pair of pointers, the first one being at the beginning of the fragment, the second one pointing /// to the byte after the end of the fragment. const fn start_and_end(&self) -> (*const u8, *const u8) { @@ -46,8 +51,13 @@ impl<'src> Fragment<'src> { /// Split this fragment into two sub fragments, with the first one being `bytes` long and the second containing the /// rest of this fragment. + /// + /// Panics if the byte index is not in the fragment, or if it's on a char boundary. pub fn split(&self, bytes: usize) -> (Self, Self) { - (Self { inner: &self.inner[..bytes] }, Self { inner: &self.inner[bytes..]}) + // Use str's split_at. + let (left, right) = self.inner.split_at(bytes); + + (Self { inner: left }, Self { inner: right }) } /// Get an iterator over the characters in this fragment. 
diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index aeaccf78..a3f41870 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -5,6 +5,49 @@ use super::fragment::Fragment; +/// Constant table of single character tokens and the characters that match them. +pub const SINGLE_CHAR_TOKENS: &[(char, TokenTy)] = &[ + ('(', TokenTy::LeftParen), + (')', TokenTy::RightParen), + ('[', TokenTy::LeftBracket), + (']', TokenTy::RightBracket), + ('{', TokenTy::LeftCurly), + ('}', TokenTy::RightCurly), + ('@', TokenTy::At), + (';', TokenTy::Semi), + ('?', TokenTy::Question), + (',', TokenTy::Comma), + ('#', TokenTy::Hash), + ('$', TokenTy::Dollar), +]; + +/// Tokens that can be either a single character or upgraded with an +/// equals sign. +pub const POSSIBLE_EQ_UPGRADE_TOKENS: &[(char, TokenTy, TokenTy)] = &[ + ('!', TokenTy::Bang, TokenTy::BangEq), + ('%', TokenTy::Mod, TokenTy::ModEq), + ('^', TokenTy::Xor, TokenTy::XorEq), + ('*', TokenTy::Star, TokenTy::StarEq), + ('+', TokenTy::Plus, TokenTy::PlusEq), + ('/', TokenTy::Div, TokenTy::DivEq), +]; + +/// Characters that can produce different tokens when followed by an equals sign or themselves. +pub const POSSIBLE_EQ_OR_DOUBLED_UPGRADE_TOKENS: &[(char, TokenTy, TokenTy, TokenTy)] = &[ + ('&', TokenTy::And, TokenTy::AndEq, TokenTy::AndAnd), + ('|', TokenTy::Or, TokenTy::OrEq, TokenTy::OrOr), + ('<', TokenTy::Lt, TokenTy::LtEq, TokenTy::LtLt), + ('>', TokenTy::Gt, TokenTy::GtEq, TokenTy::GtGt), + (':', TokenTy::Colon, TokenTy::ColonEq, TokenTy::ColonColon), +]; + +/// Characters that can produce different tokens when followed by an equals sign or +/// a `>` for arrows. +pub const POSSIBLE_EQ_OR_ARROW_UPGRADE_TOKENS: &[(char, TokenTy, TokenTy, TokenTy)] = &[ + ('-', TokenTy::Minus, TokenTy::MinusEq, TokenTy::SingleArrow), + ('=', TokenTy::Eq, TokenTy::EqEq, TokenTy::DoubleArrow), +]; + /// The lexical analyser for wright. This produces a series of tokens that make up the larger elements of the language. #[derive(Debug)] pub struct Lexer<'src> { @@ -30,20 +73,20 @@ pub enum TokenTy { LeftParen, RightParen, Plus, PlusEq, - Minus, MinusEq, Star, StarEq, Div, DivEq, Xor, XorEq, Mod, ModEq, Bang, BangEq, - Eq, EqEq, + + Minus, MinusEq, SingleArrow, + Eq, EqEq, DoubleArrow, Lt, LtEq, LtLt, Gt, GtEq, GtGt, And, AndEq, AndAnd, Or, OrEq, OrOr, - - Colon, ColonColon, + Colon, ColonEq, ColonColon, At, Tilde, @@ -52,6 +95,8 @@ pub enum TokenTy { Dot, Comma, Hash, + Question, + Dollar, Identifier, @@ -96,31 +141,6 @@ impl<'src> Lexer<'src> { Lexer { remaining: Fragment { inner: source } } } - /// Try to match a single character to a single character token if possible. - #[rustfmt::skip] - const fn single_char_tokens(c: char) -> Option { - use TokenTy::*; - - match c { - '{' => Some(LeftCurly), - '}' => Some(RightCurly), - '[' => Some(LeftBracket), - ']' => Some(RightBracket), - '(' => Some(LeftParen), - ')' => Some(RightParen), - - '@' => Some(At), - '~' => Some(Tilde), - '_' => Some(Underscore), - '.' => Some(Dot), - ',' => Some(Comma), - ';' => Some(Semi), - '#' => Some(Hash), - - _ => None, - } - } - /// Try to match a fragment recognized to be an identifier or keyword to /// a keyword or return [TokenTy::Identifier]. fn identifier_or_keyword(fragment: Fragment<'src>) -> TokenTy { @@ -158,4 +178,50 @@ impl<'src> Lexer<'src> { } } + /// Make a token by splitting a given number of bytes off of the `self.remaining` fragment + /// and labeling them with the given kind. 
+ fn split_token(&mut self, bytes: usize, kind: TokenTy) -> Token<'src> { + let (token_fragment, new_remaining_fragment) = self.remaining.split(bytes); + self.remaining = new_remaining_fragment; + Token { variant: kind, fragment: token_fragment } + } + + /// Get the next token from the lexer. + pub fn next_token(&mut self) -> Option> { + // If the remaining input is empty, there is no token. + if self.remaining.is_empty() { + return None; + } + + // Otherwise create a char iterator on the fragment. + // This one will be mainly used to check for shorter tokens -- a new one may be created later + // to check for identifiers and strings. + let mut char_indices = self.remaining.inner.chars(); + + // Get the next character from the iterator. + let next_char = char_indices.next().unwrap(); + + // Match a single character if possible. + for (c, kind) in SINGLE_CHAR_TOKENS { + if next_char == *c { + return Some(self.split_token(next_char.len_utf8(), *kind)); + } + } + + // Get the character after the next char if there is one. + let following_char_option = char_indices.next(); + + // Try to match a token that can be augmented with a possible additional equal sign. + for (c, without_eq, with_eq) in POSSIBLE_EQ_UPGRADE_TOKENS { + if next_char == *c { + match following_char_option { + Some('=') => return Some(self.split_token(next_char.len_utf8() + 1, *with_eq)), + _ => return Some(self.split_token(next_char.len_utf8(), *without_eq)), + } + } + } + + unimplemented!() + } + } diff --git a/wright/src/parser/old/lexer.rs b/wright/src/parser/old/lexer.rs index baf58553..051313dc 100644 --- a/wright/src/parser/old/lexer.rs +++ b/wright/src/parser/old/lexer.rs @@ -42,18 +42,7 @@ impl<'a> Iterator for Lexer<'a> { // Handle single character tokens first. let single_char_tokens = [ - ('(', TokenTy::LeftParen), - (')', TokenTy::RightParen), - ('[', TokenTy::LeftSquare), - (']', TokenTy::RightSquare), - ('{', TokenTy::LeftBracket), - ('}', TokenTy::RightBracket), - ('@', TokenTy::At), - (';', TokenTy::Semi), - ('?', TokenTy::Question), - (',', TokenTy::Comma), - ('#', TokenTy::Pound), - ('$', TokenTy::Dollar), + ]; for (c, variant) in single_char_tokens { From b673f8e6a7a68c85230f871f034c439fb9cb1b25 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Wed, 31 Jan 2024 01:57:35 -0500 Subject: [PATCH 11/60] Add llvm packages to github actions --- .github/workflows/cargo-check.yml | 8 +++----- .github/workflows/cargo-clippy.yml | 8 +++----- .github/workflows/cargo-test.yml | 8 +++----- .github/workflows/grcov.yml | 2 ++ .github/workflows/pages.yml | 2 ++ 5 files changed, 13 insertions(+), 15 deletions(-) diff --git a/.github/workflows/cargo-check.yml b/.github/workflows/cargo-check.yml index 6436c6aa..19168728 100644 --- a/.github/workflows/cargo-check.yml +++ b/.github/workflows/cargo-check.yml @@ -1,10 +1,6 @@ name: Cargo Check -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] +on: ["push", "pull_request"] jobs: check: @@ -13,6 +9,8 @@ jobs: working-directory: ./wright steps: - uses: actions/checkout@v4 + - name: Install LLVM package + run: sudo apt-get install llvm-17-dev - name: Check rust code run: cargo check working-directory: ${{env.working-directory}} diff --git a/.github/workflows/cargo-clippy.yml b/.github/workflows/cargo-clippy.yml index 6eb5ce97..a2b2b7a5 100644 --- a/.github/workflows/cargo-clippy.yml +++ b/.github/workflows/cargo-clippy.yml @@ -1,10 +1,6 @@ name: Clippy -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] +on: ["push", 
"pull_request"] jobs: clippy: @@ -13,6 +9,8 @@ jobs: working-directory: ./wright steps: - uses: actions/checkout@v4 + - name: Install LLVM package + run: sudo apt-get install llvm-17-dev - name: Run Clippy run: cargo clippy -- --deny clippy::all --deny warnings working-directory: ${{env.working-directory}} diff --git a/.github/workflows/cargo-test.yml b/.github/workflows/cargo-test.yml index ba6d178a..2e15ce07 100644 --- a/.github/workflows/cargo-test.yml +++ b/.github/workflows/cargo-test.yml @@ -1,10 +1,6 @@ name: Cargo Test -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] +on: ["push", "pull_request"] jobs: test: @@ -13,6 +9,8 @@ jobs: working-directory: ./wright steps: - uses: actions/checkout@v4 + - name: Install LLVM package + run: sudo apt-get install llvm-17-dev - name: Run tests run: cargo test working-directory: ${{env.working-directory}} diff --git a/.github/workflows/grcov.yml b/.github/workflows/grcov.yml index a120d13b..28f0a123 100644 --- a/.github/workflows/grcov.yml +++ b/.github/workflows/grcov.yml @@ -7,6 +7,8 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + - name: Install LLVM package + run: sudo apt-get install llvm-17-dev - uses: actions-rs/toolchain@v1 with: toolchain: nightly diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 497ffbd9..bb7db730 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -26,6 +26,8 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 + - name: Install LLVM package + run: sudo apt-get install llvm-17-dev - name: Install mdBook run: | curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf -y | sh From 64cab9afcaa1f092642289a545bbc19bc59fa557 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Wed, 31 Jan 2024 02:00:42 -0500 Subject: [PATCH 12/60] Fix llvm install --- .github/workflows/cargo-check.yml | 7 +++++-- .github/workflows/cargo-clippy.yml | 7 +++++-- .github/workflows/cargo-test.yml | 7 +++++-- .github/workflows/grcov.yml | 7 +++++-- .github/workflows/pages.yml | 9 ++++++--- 5 files changed, 26 insertions(+), 11 deletions(-) diff --git a/.github/workflows/cargo-check.yml b/.github/workflows/cargo-check.yml index 19168728..8051e204 100644 --- a/.github/workflows/cargo-check.yml +++ b/.github/workflows/cargo-check.yml @@ -4,13 +4,16 @@ on: ["push", "pull_request"] jobs: check: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: working-directory: ./wright steps: - uses: actions/checkout@v4 - name: Install LLVM package - run: sudo apt-get install llvm-17-dev + run: | + deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main + deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main + sudo apt-get install llvm-17-dev - name: Check rust code run: cargo check working-directory: ${{env.working-directory}} diff --git a/.github/workflows/cargo-clippy.yml b/.github/workflows/cargo-clippy.yml index a2b2b7a5..75741485 100644 --- a/.github/workflows/cargo-clippy.yml +++ b/.github/workflows/cargo-clippy.yml @@ -4,13 +4,16 @@ on: ["push", "pull_request"] jobs: clippy: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: working-directory: ./wright steps: - uses: actions/checkout@v4 - name: Install LLVM package - run: sudo apt-get install llvm-17-dev + run: | + deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main + deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main + sudo apt-get install llvm-17-dev - name: Run Clippy run: cargo clippy -- --deny clippy::all --deny warnings working-directory: 
${{env.working-directory}} diff --git a/.github/workflows/cargo-test.yml b/.github/workflows/cargo-test.yml index 2e15ce07..a73e3070 100644 --- a/.github/workflows/cargo-test.yml +++ b/.github/workflows/cargo-test.yml @@ -4,13 +4,16 @@ on: ["push", "pull_request"] jobs: test: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: working-directory: ./wright steps: - uses: actions/checkout@v4 - name: Install LLVM package - run: sudo apt-get install llvm-17-dev + run: | + deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main + deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main + sudo apt-get install llvm-17-dev - name: Run tests run: cargo test working-directory: ${{env.working-directory}} diff --git a/.github/workflows/grcov.yml b/.github/workflows/grcov.yml index 28f0a123..82b032e4 100644 --- a/.github/workflows/grcov.yml +++ b/.github/workflows/grcov.yml @@ -4,11 +4,14 @@ name: Code Coverage jobs: coverage: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 - name: Install LLVM package - run: sudo apt-get install llvm-17-dev + run: | + deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main + deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main + sudo apt-get install llvm-17-dev - uses: actions-rs/toolchain@v1 with: toolchain: nightly diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index bb7db730..545c0963 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -23,11 +23,14 @@ concurrency: jobs: # Build job build: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install LLVM package - run: sudo apt-get install llvm-17-dev + run: | + deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main + deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main + sudo apt-get install llvm-17-dev - name: Install mdBook run: | curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf -y | sh From a0c14e532f20f055dd04af461867ef7fe10322f8 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Wed, 31 Jan 2024 02:04:18 -0500 Subject: [PATCH 13/60] Change llvm installation method --- .github/workflows/cargo-check.yml | 8 ++++---- .github/workflows/cargo-clippy.yml | 8 ++++---- .github/workflows/cargo-test.yml | 8 ++++---- .github/workflows/grcov.yml | 8 ++++---- .github/workflows/pages.yml | 8 ++++---- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/cargo-check.yml b/.github/workflows/cargo-check.yml index 8051e204..b137995c 100644 --- a/.github/workflows/cargo-check.yml +++ b/.github/workflows/cargo-check.yml @@ -9,11 +9,11 @@ jobs: working-directory: ./wright steps: - uses: actions/checkout@v4 - - name: Install LLVM package + - name: Install LLVM run: | - deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main - deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main - sudo apt-get install llvm-17-dev + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 17 - name: Check rust code run: cargo check working-directory: ${{env.working-directory}} diff --git a/.github/workflows/cargo-clippy.yml b/.github/workflows/cargo-clippy.yml index 75741485..64f1401b 100644 --- a/.github/workflows/cargo-clippy.yml +++ b/.github/workflows/cargo-clippy.yml @@ -9,11 +9,11 @@ jobs: working-directory: ./wright steps: - uses: actions/checkout@v4 - - name: Install LLVM package + - name: Install LLVM run: | - deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main - 
deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main - sudo apt-get install llvm-17-dev + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 17 - name: Run Clippy run: cargo clippy -- --deny clippy::all --deny warnings working-directory: ${{env.working-directory}} diff --git a/.github/workflows/cargo-test.yml b/.github/workflows/cargo-test.yml index a73e3070..9a12f561 100644 --- a/.github/workflows/cargo-test.yml +++ b/.github/workflows/cargo-test.yml @@ -9,11 +9,11 @@ jobs: working-directory: ./wright steps: - uses: actions/checkout@v4 - - name: Install LLVM package + - name: Install LLVM run: | - deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main - deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main - sudo apt-get install llvm-17-dev + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 17 - name: Run tests run: cargo test working-directory: ${{env.working-directory}} diff --git a/.github/workflows/grcov.yml b/.github/workflows/grcov.yml index 82b032e4..547b7814 100644 --- a/.github/workflows/grcov.yml +++ b/.github/workflows/grcov.yml @@ -7,11 +7,11 @@ jobs: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 - - name: Install LLVM package + - name: Install LLVM run: | - deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main - deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main - sudo apt-get install llvm-17-dev + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 17 - uses: actions-rs/toolchain@v1 with: toolchain: nightly diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 545c0963..ae6bde09 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -26,11 +26,11 @@ jobs: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 - - name: Install LLVM package + - name: Install LLVM run: | - deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main - deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main - sudo apt-get install llvm-17-dev + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 17 - name: Install mdBook run: | curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf -y | sh From ea8e3ce6f56333893e6d5d1892cd2ee99f2d3871 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Wed, 31 Jan 2024 02:10:10 -0500 Subject: [PATCH 14/60] Add missing libpolly to CI config --- .github/workflows/cargo-check.yml | 1 + .github/workflows/cargo-clippy.yml | 1 + .github/workflows/cargo-test.yml | 1 + .github/workflows/grcov.yml | 1 + .github/workflows/pages.yml | 1 + 5 files changed, 5 insertions(+) diff --git a/.github/workflows/cargo-check.yml b/.github/workflows/cargo-check.yml index b137995c..97b90d89 100644 --- a/.github/workflows/cargo-check.yml +++ b/.github/workflows/cargo-check.yml @@ -14,6 +14,7 @@ jobs: wget https://apt.llvm.org/llvm.sh chmod +x llvm.sh sudo ./llvm.sh 17 + sudo apt install libpolly-17-dev libz-dev - name: Check rust code run: cargo check working-directory: ${{env.working-directory}} diff --git a/.github/workflows/cargo-clippy.yml b/.github/workflows/cargo-clippy.yml index 64f1401b..48c01189 100644 --- a/.github/workflows/cargo-clippy.yml +++ b/.github/workflows/cargo-clippy.yml @@ -14,6 +14,7 @@ jobs: wget https://apt.llvm.org/llvm.sh chmod +x llvm.sh sudo ./llvm.sh 17 + sudo apt install libpolly-17-dev libz-dev - name: Run Clippy run: cargo clippy -- --deny clippy::all --deny warnings working-directory: ${{env.working-directory}} diff --git a/.github/workflows/cargo-test.yml 
b/.github/workflows/cargo-test.yml index 9a12f561..3c4786ca 100644 --- a/.github/workflows/cargo-test.yml +++ b/.github/workflows/cargo-test.yml @@ -14,6 +14,7 @@ jobs: wget https://apt.llvm.org/llvm.sh chmod +x llvm.sh sudo ./llvm.sh 17 + sudo apt install libpolly-17-dev libz-dev - name: Run tests run: cargo test working-directory: ${{env.working-directory}} diff --git a/.github/workflows/grcov.yml b/.github/workflows/grcov.yml index 547b7814..f7c3026d 100644 --- a/.github/workflows/grcov.yml +++ b/.github/workflows/grcov.yml @@ -12,6 +12,7 @@ jobs: wget https://apt.llvm.org/llvm.sh chmod +x llvm.sh sudo ./llvm.sh 17 + sudo apt install libpolly-17-dev libz-dev - uses: actions-rs/toolchain@v1 with: toolchain: nightly diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index ae6bde09..c978cc6d 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -31,6 +31,7 @@ jobs: wget https://apt.llvm.org/llvm.sh chmod +x llvm.sh sudo ./llvm.sh 17 + sudo apt install libpolly-17-dev libz-dev - name: Install mdBook run: | curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf -y | sh From 8cef341f90963ae98254ea774ec1d7b66c6ba1b4 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Wed, 31 Jan 2024 02:14:03 -0500 Subject: [PATCH 15/60] Add comment about llvm installation method --- .github/workflows/cargo-check.yml | 2 ++ .github/workflows/cargo-clippy.yml | 2 ++ .github/workflows/cargo-test.yml | 2 ++ .github/workflows/grcov.yml | 2 ++ .github/workflows/pages.yml | 2 ++ 5 files changed, 10 insertions(+) diff --git a/.github/workflows/cargo-check.yml b/.github/workflows/cargo-check.yml index 97b90d89..2ffd82b9 100644 --- a/.github/workflows/cargo-check.yml +++ b/.github/workflows/cargo-check.yml @@ -10,6 +10,8 @@ jobs: steps: - uses: actions/checkout@v4 - name: Install LLVM + # See: https://apt.llvm.org/ + # Last line: https://gitlab.com/taricorp/llvm-sys.rs/-/issues/13 run: | wget https://apt.llvm.org/llvm.sh chmod +x llvm.sh diff --git a/.github/workflows/cargo-clippy.yml b/.github/workflows/cargo-clippy.yml index 48c01189..3ebff030 100644 --- a/.github/workflows/cargo-clippy.yml +++ b/.github/workflows/cargo-clippy.yml @@ -10,6 +10,8 @@ jobs: steps: - uses: actions/checkout@v4 - name: Install LLVM + # See: https://apt.llvm.org/ + # Last line: https://gitlab.com/taricorp/llvm-sys.rs/-/issues/13 run: | wget https://apt.llvm.org/llvm.sh chmod +x llvm.sh diff --git a/.github/workflows/cargo-test.yml b/.github/workflows/cargo-test.yml index 3c4786ca..1e23b635 100644 --- a/.github/workflows/cargo-test.yml +++ b/.github/workflows/cargo-test.yml @@ -10,6 +10,8 @@ jobs: steps: - uses: actions/checkout@v4 - name: Install LLVM + # See: https://apt.llvm.org/ + # Last line: https://gitlab.com/taricorp/llvm-sys.rs/-/issues/13 run: | wget https://apt.llvm.org/llvm.sh chmod +x llvm.sh diff --git a/.github/workflows/grcov.yml b/.github/workflows/grcov.yml index f7c3026d..a098cbbe 100644 --- a/.github/workflows/grcov.yml +++ b/.github/workflows/grcov.yml @@ -8,6 +8,8 @@ jobs: steps: - uses: actions/checkout@v4 - name: Install LLVM + # See: https://apt.llvm.org/ + # Last line: https://gitlab.com/taricorp/llvm-sys.rs/-/issues/13 run: | wget https://apt.llvm.org/llvm.sh chmod +x llvm.sh diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index c978cc6d..d3aa3832 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -27,6 +27,8 @@ jobs: steps: - uses: actions/checkout@v4 - name: Install LLVM + # See: https://apt.llvm.org/ + # Last 
line: https://gitlab.com/taricorp/llvm-sys.rs/-/issues/13 run: | wget https://apt.llvm.org/llvm.sh chmod +x llvm.sh From 42f96df5567e660474df7befa85e0b3ca59030e1 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Thu, 8 Feb 2024 00:22:40 -0500 Subject: [PATCH 16/60] Use huge compile time table for short symbol token lookup. --- wright/Cargo.toml | 10 ++ wright/benches/lexer.rs | 14 +++ wright/src/parser/fragment.rs | 8 ++ wright/src/parser/lexer.rs | 218 ++++++++++++++++++++++++++++++---- 4 files changed, 225 insertions(+), 25 deletions(-) create mode 100644 wright/benches/lexer.rs diff --git a/wright/Cargo.toml b/wright/Cargo.toml index a69f80fb..5a9b9fdc 100644 --- a/wright/Cargo.toml +++ b/wright/Cargo.toml @@ -26,6 +26,12 @@ test = false doc = false doctest = false +# BENCHMARKS +[[bench]] +name = "lexer" +harness = false + + # CRATES.IO BADGES [badges.maintenance] status = "actively-developed" @@ -86,3 +92,7 @@ features = ["llvm17-0"] # Rayon to speed up brute-force testing in some cases. [dev-dependencies.rayon] version = "1.8.0" + +# Criterion is used for benchmarking. +[dev-dependencies] +criterion = "0.5.1" diff --git a/wright/benches/lexer.rs b/wright/benches/lexer.rs new file mode 100644 index 00000000..81430d39 --- /dev/null +++ b/wright/benches/lexer.rs @@ -0,0 +1,14 @@ +//! Lexer benchmarks. + + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use wright::parser::lexer::Lexer; + +fn bench_lex_plus_eq(c: &mut Criterion) { + c.bench_function("lex +=", |b| b.iter(|| { + Lexer::new(black_box("+=")).next_token(); + })); +} + +criterion_group!(benches, bench_lex_plus_eq); +criterion_main!(benches); diff --git a/wright/src/parser/fragment.rs b/wright/src/parser/fragment.rs index 8ef6c4ab..f67b0265 100644 --- a/wright/src/parser/fragment.rs +++ b/wright/src/parser/fragment.rs @@ -83,4 +83,12 @@ mod tests { assert!(c.overlaps(&a)); assert!(!a.overlaps(&d)); } + + #[test] + fn test_split_single() { + let a = Fragment { inner: "+" }; + let (left, right) = a.split(1); + assert_eq!(left.inner, "+"); + assert_eq!(right.inner, ""); + } } diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index a3f41870..32aed7b7 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -48,6 +48,145 @@ pub const POSSIBLE_EQ_OR_ARROW_UPGRADE_TOKENS: &[(char, TokenTy, TokenTy, TokenT ('=', TokenTy::Eq, TokenTy::EqEq, TokenTy::DoubleArrow), ]; +/// The number of rows of the generated prefix table. +pub const PREFIX_TABLE_ROWS: usize = { + SINGLE_CHAR_TOKENS.len() + + 2 * POSSIBLE_EQ_UPGRADE_TOKENS.len() + + 3 * POSSIBLE_EQ_OR_DOUBLED_UPGRADE_TOKENS.len() + + 3 * POSSIBLE_EQ_OR_ARROW_UPGRADE_TOKENS.len() +}; + +/// A relationship between a prefix and the token that should be generated when that prefix matches. +#[derive(Copy, Clone, Debug)] +pub struct PrefixToToken { + /// An array of two chars. In single char tokens, the second one should be a null character (`'\0'`). + /// the char_length field will be used to slice this buffer to get the actual prefix. + pub char_buffer: [char; 2], + /// The byte length of this prefix and all generated tokens by this prefix. + pub byte_len: usize, + /// The kind of [Token] generated when this prefix matches. + pub kind: TokenTy, +} + +/// A full table generated at compile time using all the token tables +/// ([SINGLE_CHAR_TOKENS], [POSSIBLE_EQ_UPGRADE_TOKENS], [POSSIBLE_EQ_OR_DOUBLED_UPGRADE_TOKENS], +/// [POSSIBLE_EQ_OR_ARROW_UPGRADE_TOKENS]). 
+/// +/// This table can be iterated on in order when trying to match a token at the start of a fragment of source code. +pub const PREFIX_TABLE: [PrefixToToken; PREFIX_TABLE_ROWS] = { + // Make a mutable table with dummy values to replace with actual values. + let mut table: [PrefixToToken; PREFIX_TABLE_ROWS] = + [PrefixToToken { char_buffer: ['\0'; 2], byte_len: 0, kind: TokenTy::Unknown }; PREFIX_TABLE_ROWS]; + + // Current index to insert into table at. + let mut write_index: usize = 0; + + // Index used for reading from various tables. + let mut read_index: usize = 0; + + // Iterate first over all the single char tokens. + while read_index < SINGLE_CHAR_TOKENS.len() { + // Get row from source table. + let (c, token_kind) = SINGLE_CHAR_TOKENS[read_index]; + + // Put row in destination table. + table[write_index] = PrefixToToken { + char_buffer: [c, '\0'], + byte_len: c.len_utf8(), + kind: token_kind, + }; + + // Increment both indices. + read_index += 1; + write_index += 1; + } + + // Then do all the tokens that can be upgraded with an equals sign. + // Add the row for the token with the equals sign first so that when we iterate over this table in order, + // the version without the equals sign does not match prematurely. + read_index = 0; + while read_index < POSSIBLE_EQ_UPGRADE_TOKENS.len() { + let (c, without_eq, with_eq) = POSSIBLE_EQ_UPGRADE_TOKENS[read_index]; + + table[write_index] = PrefixToToken { + char_buffer: [c, '='], + byte_len: c.len_utf8() + '='.len_utf8(), + kind: with_eq, + }; + + write_index += 1; + table[write_index] = PrefixToToken { + char_buffer: [c, '\0'], + byte_len: c.len_utf8(), + kind: without_eq, + }; + + read_index += 1; + write_index += 1; + } + + // Do the same for the tokens that can be upgraded with an equals sign or doubled. + read_index = 0; + while read_index < POSSIBLE_EQ_OR_DOUBLED_UPGRADE_TOKENS.len() { + let (c, without_eq, with_eq, doubled) = POSSIBLE_EQ_OR_DOUBLED_UPGRADE_TOKENS[read_index]; + + table[write_index] = PrefixToToken { + char_buffer: [c, c], + byte_len: 2 * c.len_utf8(), + kind: doubled, + }; + + write_index += 1; + table[write_index] = PrefixToToken { + char_buffer: [c, '='], + byte_len: c.len_utf8() + '='.len_utf8(), + kind: with_eq, + }; + + write_index += 1; + table[write_index] = PrefixToToken { + char_buffer: [c, '\0'], + byte_len: c.len_utf8(), + kind: without_eq, + }; + + read_index += 1; + write_index += 1; + } + + // Do the same for possible eq or arrow upgrades. + read_index = 0; + while read_index < POSSIBLE_EQ_OR_ARROW_UPGRADE_TOKENS.len() { + let (c, without_eq, with_eq, with_arrow) = POSSIBLE_EQ_OR_ARROW_UPGRADE_TOKENS[read_index]; + + table[write_index] = PrefixToToken { + char_buffer: [c, '>'], + byte_len: c.len_utf8() + '>'.len_utf8(), + kind: with_arrow, + }; + + write_index += 1; + table[write_index] = PrefixToToken { + char_buffer: [c, '='], + byte_len: c.len_utf8() + '='.len_utf8(), + kind: with_eq, + }; + + write_index += 1; + table[write_index] = PrefixToToken { + char_buffer: [c, '\0'], + byte_len: c.len_utf8(), + kind: without_eq, + }; + + read_index += 1; + write_index += 1; + } + + table +}; + + /// The lexical analyser for wright. This produces a series of tokens that make up the larger elements of the language. 
#[derive(Debug)] pub struct Lexer<'src> { @@ -186,42 +325,71 @@ impl<'src> Lexer<'src> { Token { variant: kind, fragment: token_fragment } } + /// See if the remaining fragment in this [Lexer] starts with a given [str] prefix and if so, + /// split off a token of the length of this prefix with the given variant. + fn match_str_prefix(&mut self, prefix: &str, token_kind: TokenTy) -> Option> { + if self.remaining.inner.starts_with(prefix) { + Some(self.split_token(prefix.len(), token_kind)) + } else { + None + } + } + + /// Get the next token from the lexer. pub fn next_token(&mut self) -> Option> { // If the remaining input is empty, there is no token. if self.remaining.is_empty() { return None; } - - // Otherwise create a char iterator on the fragment. - // This one will be mainly used to check for shorter tokens -- a new one may be created later - // to check for identifiers and strings. - let mut char_indices = self.remaining.inner.chars(); - - // Get the next character from the iterator. - let next_char = char_indices.next().unwrap(); - - // Match a single character if possible. - for (c, kind) in SINGLE_CHAR_TOKENS { - if next_char == *c { - return Some(self.split_token(next_char.len_utf8(), *kind)); + + // To attempt to match a token from the prefix table, make a char iterator + // and get two chars from it to test equality. None of the tokens start with a + // null character so use that as a single of an unavailable char. + let mut char_iter = self.remaining.chars(); + let char_array: [char; 2] = [ + // Just unwrap here since we know there's at least one char. + char_iter.next().unwrap(), + char_iter.next().unwrap_or('\0') + ]; + + // Next iterate through the prefix table to try to get any tokens that are covered there. + for prefix_meta in PREFIX_TABLE.iter() { + if &prefix_meta.char_buffer == &char_array { + return Some(self.split_token(prefix_meta.byte_len, prefix_meta.kind)); } } - // Get the character after the next char if there is one. - let following_char_option = char_indices.next(); + unimplemented!() + } - // Try to match a token that can be augmented with a possible additional equal sign. - for (c, without_eq, with_eq) in POSSIBLE_EQ_UPGRADE_TOKENS { - if next_char == *c { - match following_char_option { - Some('=') => return Some(self.split_token(next_char.len_utf8() + 1, *with_eq)), - _ => return Some(self.split_token(next_char.len_utf8(), *without_eq)), - } - } - } +} - unimplemented!() +#[cfg(test)] +mod tests { + use crate::parser::lexer::TokenTy; + + use super::Lexer; + use super::PREFIX_TABLE; + + #[test] + #[ignore = "this test is just used for debugging the prefix table"] + /// Run this with `cargo test manual_debug_prefix_table -- --nocapture --ignored`. 
+ fn manual_debug_prefix_table() { + dbg!(PREFIX_TABLE); } + #[test] + fn plus_and_plus_eq_tokens() { + let mut plus = Lexer::new("+"); + let mut plus_eq = Lexer::new("+="); + + let plus_token = plus.next_token().unwrap(); + let plus_eq_token = plus_eq.next_token().unwrap(); + + assert_eq!(plus.bytes_remaining(), 0); + assert_eq!(plus_eq.bytes_remaining(), 0); + assert_eq!(plus_token.variant, TokenTy::Plus); + assert_eq!(plus_eq_token.variant, TokenTy::PlusEq); + } } From f304ef8662e8d25d92bd0039235b05d92665731b Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sun, 11 Feb 2024 00:51:09 -0500 Subject: [PATCH 17/60] Add identifier parsing --- wright/Cargo.toml | 4 + wright/benches/lexer.rs | 22 +++-- wright/src/bin/wright.rs | 5 +- wright/src/parser/lexer.rs | 192 +++++++++++++++++++++---------------- wright/src/repl.rs | 8 +- 5 files changed, 134 insertions(+), 97 deletions(-) diff --git a/wright/Cargo.toml b/wright/Cargo.toml index 5a9b9fdc..8957aebb 100644 --- a/wright/Cargo.toml +++ b/wright/Cargo.toml @@ -87,6 +87,10 @@ features = ["strict-versioning", "force-static"] version = "0.3" features = ["llvm17-0"] +# Fast parsing for integers and floats from source code. +[dependencies.lexical-core] +version = "0.8" + # TEST DEPENDENCIES # Rayon to speed up brute-force testing in some cases. diff --git a/wright/benches/lexer.rs b/wright/benches/lexer.rs index 81430d39..49a95b6a 100644 --- a/wright/benches/lexer.rs +++ b/wright/benches/lexer.rs @@ -1,14 +1,24 @@ //! Lexer benchmarks. -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use criterion::{black_box, criterion_group, criterion_main, Criterion, Bencher}; use wright::parser::lexer::Lexer; -fn bench_lex_plus_eq(c: &mut Criterion) { - c.bench_function("lex +=", |b| b.iter(|| { - Lexer::new(black_box("+=")).next_token(); - })); +fn bench_symbol_tokens(c: &mut Criterion) { + // Make a benchmark group. + let mut group = c.benchmark_group("lexer symbol benchmarks"); + + // Function to make a lexer and get a token from it. + let make_lexer_and_get_token = |b: &mut Bencher, input: &str| { + b.iter(|| Lexer::new(black_box(input)).next_token()) + }; + + let inputs = ["+", "+=", "*", "@", "?"]; + + for i in inputs { + group.bench_with_input(format!("lexer {i}"), i, make_lexer_and_get_token); + } } -criterion_group!(benches, bench_lex_plus_eq); +criterion_group!(benches, bench_symbol_tokens); criterion_main!(benches); diff --git a/wright/src/bin/wright.rs b/wright/src/bin/wright.rs index 4f548b1b..2e589622 100644 --- a/wright/src/bin/wright.rs +++ b/wright/src/bin/wright.rs @@ -2,9 +2,8 @@ use anyhow::Result; use clap::{Parser, Subcommand}; -use codespan_reporting::files::SimpleFile; -use std::{fs, path::PathBuf}; -use wright::{parser::lexer::Lexer, repl}; +use std::path::PathBuf; +use wright::repl; /// The wright cli. #[derive(Parser, Debug)] diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index 32aed7b7..08676d39 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -3,6 +3,9 @@ //! Note that this will strip out comments and whitespace, returning only fragments that match one of the paterns //! defined for tokens. +use std::ptr; +use std::str::Chars; +use unicode_ident::{is_xid_continue, is_xid_start}; use super::fragment::Fragment; /// Constant table of single character tokens and the characters that match them. 
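The `unicode-ident` imports added above feed the identifier recognition introduced later in this diff. A minimal, simplified restatement of that rule (a sketch only; it checks whether a whole string has identifier shape, and ignores the keyword and underscore special-casing the lexer applies afterwards) looks roughly like this:

// Sketch only: simplified form of the identifier rule used later in this diff.
use unicode_ident::{is_xid_continue, is_xid_start};

fn is_identifier_like(s: &str) -> bool {
    let mut chars = s.chars();
    match chars.next() {
        // Identifiers start with an XID_Start char or an underscore,
        // and every following char must be XID_Continue.
        Some(c) if is_xid_start(c) || c == '_' => chars.all(is_xid_continue),
        // An empty string is not an identifier.
        _ => false,
    }
}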
@@ -68,11 +71,24 @@ pub struct PrefixToToken { pub kind: TokenTy, } +impl PrefixToToken { + /// Convenience function to construct a [`PrefixToToken`] by calculating the length of both chars + /// (and ignoring the second one if it's null). + pub const fn new(chars: [char; 2], kind: TokenTy) -> Self { + PrefixToToken { + char_buffer: chars, + byte_len: if chars[1] == '\0' { chars[0].len_utf8() } else { chars[0].len_utf8() + chars[1].len_utf8() }, + kind, + } + } +} + /// A full table generated at compile time using all the token tables /// ([SINGLE_CHAR_TOKENS], [POSSIBLE_EQ_UPGRADE_TOKENS], [POSSIBLE_EQ_OR_DOUBLED_UPGRADE_TOKENS], /// [POSSIBLE_EQ_OR_ARROW_UPGRADE_TOKENS]). /// /// This table can be iterated on in order when trying to match a token at the start of a fragment of source code. +#[rustfmt::skip] pub const PREFIX_TABLE: [PrefixToToken; PREFIX_TABLE_ROWS] = { // Make a mutable table with dummy values to replace with actual values. let mut table: [PrefixToToken; PREFIX_TABLE_ROWS] = @@ -90,11 +106,7 @@ pub const PREFIX_TABLE: [PrefixToToken; PREFIX_TABLE_ROWS] = { let (c, token_kind) = SINGLE_CHAR_TOKENS[read_index]; // Put row in destination table. - table[write_index] = PrefixToToken { - char_buffer: [c, '\0'], - byte_len: c.len_utf8(), - kind: token_kind, - }; + table[write_index] = PrefixToToken::new([c, '\0'], token_kind); // Increment both indices. read_index += 1; @@ -108,21 +120,11 @@ pub const PREFIX_TABLE: [PrefixToToken; PREFIX_TABLE_ROWS] = { while read_index < POSSIBLE_EQ_UPGRADE_TOKENS.len() { let (c, without_eq, with_eq) = POSSIBLE_EQ_UPGRADE_TOKENS[read_index]; - table[write_index] = PrefixToToken { - char_buffer: [c, '='], - byte_len: c.len_utf8() + '='.len_utf8(), - kind: with_eq, - }; - - write_index += 1; - table[write_index] = PrefixToToken { - char_buffer: [c, '\0'], - byte_len: c.len_utf8(), - kind: without_eq, - }; + table[write_index] = PrefixToToken::new([c, '='], with_eq); + table[write_index + 1] = PrefixToToken::new([c, '\0'], without_eq); read_index += 1; - write_index += 1; + write_index += 2; } // Do the same for the tokens that can be upgraded with an equals sign or doubled. @@ -130,28 +132,12 @@ pub const PREFIX_TABLE: [PrefixToToken; PREFIX_TABLE_ROWS] = { while read_index < POSSIBLE_EQ_OR_DOUBLED_UPGRADE_TOKENS.len() { let (c, without_eq, with_eq, doubled) = POSSIBLE_EQ_OR_DOUBLED_UPGRADE_TOKENS[read_index]; - table[write_index] = PrefixToToken { - char_buffer: [c, c], - byte_len: 2 * c.len_utf8(), - kind: doubled, - }; - - write_index += 1; - table[write_index] = PrefixToToken { - char_buffer: [c, '='], - byte_len: c.len_utf8() + '='.len_utf8(), - kind: with_eq, - }; - - write_index += 1; - table[write_index] = PrefixToToken { - char_buffer: [c, '\0'], - byte_len: c.len_utf8(), - kind: without_eq, - }; + table[write_index] = PrefixToToken::new([c, c], doubled); + table[write_index + 1] = PrefixToToken::new([c, '='], with_eq); + table[write_index + 2] = PrefixToToken::new([c, '\0'], without_eq); read_index += 1; - write_index += 1; + write_index += 3; } // Do the same for possible eq or arrow upgrades. 
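A rough illustration of what the new `PrefixToToken::new` const constructor computes for one- and two-character prefixes. Sketch only, assuming it sits alongside the definitions in lexer.rs shown above.

// Sketch only: the byte lengths PrefixToToken::new derives from its char pair.
fn prefix_len_sketch() {
    // Two-character prefix: both chars count toward byte_len ("+=" is 2 bytes).
    let plus_eq = PrefixToToken::new(['+', '='], TokenTy::PlusEq);
    assert_eq!(plus_eq.byte_len, 2);

    // A '\0' second char marks a single-character prefix, so only the first
    // char contributes to byte_len.
    let plus = PrefixToToken::new(['+', '\0'], TokenTy::Plus);
    assert_eq!(plus.byte_len, 1);
}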
@@ -159,28 +145,12 @@ pub const PREFIX_TABLE: [PrefixToToken; PREFIX_TABLE_ROWS] = { while read_index < POSSIBLE_EQ_OR_ARROW_UPGRADE_TOKENS.len() { let (c, without_eq, with_eq, with_arrow) = POSSIBLE_EQ_OR_ARROW_UPGRADE_TOKENS[read_index]; - table[write_index] = PrefixToToken { - char_buffer: [c, '>'], - byte_len: c.len_utf8() + '>'.len_utf8(), - kind: with_arrow, - }; - - write_index += 1; - table[write_index] = PrefixToToken { - char_buffer: [c, '='], - byte_len: c.len_utf8() + '='.len_utf8(), - kind: with_eq, - }; - - write_index += 1; - table[write_index] = PrefixToToken { - char_buffer: [c, '\0'], - byte_len: c.len_utf8(), - kind: without_eq, - }; + table[write_index] = PrefixToToken::new([c, '>'], with_arrow); + table[write_index + 1] = PrefixToToken::new([c, '='], with_eq); + table[write_index + 2] = PrefixToToken::new([c, '\0'], without_eq); read_index += 1; - write_index += 1; + write_index += 3; } table @@ -229,13 +199,15 @@ pub enum TokenTy { At, Tilde, - Underscore, Semi, Dot, Comma, Hash, Question, Dollar, + + // Not in the same group as the other ones there since it can be used at the start of identifiers. + Underscore, Identifier, @@ -313,6 +285,8 @@ impl<'src> Lexer<'src> { "true" => KwTrue, "false" => KwFalse, + "_" => Underscore, + _ => Identifier } } @@ -325,38 +299,76 @@ impl<'src> Lexer<'src> { Token { variant: kind, fragment: token_fragment } } - /// See if the remaining fragment in this [Lexer] starts with a given [str] prefix and if so, - /// split off a token of the length of this prefix with the given variant. - fn match_str_prefix(&mut self, prefix: &str, token_kind: TokenTy) -> Option> { - if self.remaining.inner.starts_with(prefix) { - Some(self.split_token(prefix.len(), token_kind)) - } else { - None - } - } - - /// Get the next token from the lexer. pub fn next_token(&mut self) -> Option> { // If the remaining input is empty, there is no token. if self.remaining.is_empty() { return None; } + + // Use blocks heavily in this function as we don't want to re-use iterators or variables + // after we check them in most cases. + + // If there is whitespace at the start of the remaining fragment, strip it and re-run this + // function to get the next token. + { + let without_whitespace: &str = self.remaining.inner.trim_start(); + + if !ptr::eq(without_whitespace, self.remaining.inner) { + self.remaining.inner = without_whitespace; + return self.next_token(); + } + } // To attempt to match a token from the prefix table, make a char iterator // and get two chars from it to test equality. None of the tokens start with a // null character so use that as a single of an unavailable char. - let mut char_iter = self.remaining.chars(); - let char_array: [char; 2] = [ - // Just unwrap here since we know there's at least one char. - char_iter.next().unwrap(), - char_iter.next().unwrap_or('\0') - ]; - - // Next iterate through the prefix table to try to get any tokens that are covered there. - for prefix_meta in PREFIX_TABLE.iter() { - if &prefix_meta.char_buffer == &char_array { - return Some(self.split_token(prefix_meta.byte_len, prefix_meta.kind)); + { + let mut char_iter: Chars = self.remaining.chars(); + let char_array: [char; 2] = [ + // Unchecked unwrap here since we know there's at least one char. + unsafe { char_iter.next().unwrap_unchecked() }, + char_iter.next().unwrap_or('\0') + ]; + + // Next iterate through the prefix table to try to get any tokens that are covered there. 
+ for prefix_meta in PREFIX_TABLE.iter() { + // If it's a single char comparison, only compare the first chars. + if prefix_meta.char_buffer[1] == '\0' && prefix_meta.char_buffer[0] == char_array[0] { + return Some(self.split_token(prefix_meta.byte_len, prefix_meta.kind)); + } + + // Otherwise compare the whole slices. + if &prefix_meta.char_buffer == &char_array { + return Some(self.split_token(prefix_meta.byte_len, prefix_meta.kind)); + } + } + } + + // Next attempt to match a keyword or identifier. + { + let mut chars: Chars = self.remaining.chars(); + + // The unsafe is fine here -- we've established that this lexer has bytes remaining. + let next: char = unsafe { chars.next().unwrap_unchecked() }; + + if is_xid_start(next) || next == '_' { + let mut bytes_consumed: usize = next.len_utf8(); + + // Take remaining chars and add to sum. + bytes_consumed += chars + .take_while(|c| is_xid_continue(*c)) + .map(char::len_utf8) + .sum::(); + + // Split the number of bytes we consumed. + let (ident_frag, new_remaining) = self.remaining.split(bytes_consumed); + // Get the token kind to produce for this fragment. + let variant = Lexer::identifier_or_keyword(ident_frag); + // Update the lexers remaining fragment. + self.remaining = new_remaining; + // Return the identifier, keyword, or underscore. + return Some(Token { variant, fragment: ident_frag }); } } @@ -368,7 +380,6 @@ impl<'src> Lexer<'src> { #[cfg(test)] mod tests { use crate::parser::lexer::TokenTy; - use super::Lexer; use super::PREFIX_TABLE; @@ -392,4 +403,21 @@ mod tests { assert_eq!(plus_token.variant, TokenTy::Plus); assert_eq!(plus_eq_token.variant, TokenTy::PlusEq); } + + #[test] + fn plus_one_token() { + let mut plus_one = Lexer::new("+1"); + let plus_token = plus_one.next_token().unwrap(); + assert_eq!(plus_one.bytes_remaining(), 1); + assert_eq!(plus_token.variant, TokenTy::Plus); + assert_eq!(plus_token.fragment.len(), 1); + } + + #[test] + fn identifiers_and_keywords() { + let mut lexer = Lexer::new("const TEST"); + + assert_eq!(lexer.next_token().unwrap().variant, TokenTy::KwConst); + assert_eq!(lexer.next_token().unwrap().variant, TokenTy::Identifier); + } } diff --git a/wright/src/repl.rs b/wright/src/repl.rs index 74560937..f0e024b5 100644 --- a/wright/src/repl.rs +++ b/wright/src/repl.rs @@ -1,10 +1,6 @@ //! The Wright interactive REPL. -use crate::{ - filemap::{FileMap, FileName}, - // parser::lexer::Lexer, - WRIGHT_VERSION, -}; +use crate::WRIGHT_VERSION; use derive_more::Display; use std::io::{self, BufRead, Write}; @@ -58,7 +54,7 @@ pub fn start() -> anyhow::Result<()> { let mut repl_mode = ReplMode::Tokens; // Make a file map to track input. - let mut code_map = FileMap::new(); + // let mut code_map = FileMap::new(); // Loop until this returns/exits. loop { From da25ba3c18978c533b87a4c053bf4d7669085c97 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sun, 11 Feb 2024 00:59:02 -0500 Subject: [PATCH 18/60] Reformatting --- wright/benches/lexer.rs | 11 +- wright/src/filemap.rs | 202 +++++++++++++++++++++------------- wright/src/parser/fragment.rs | 54 +++++---- wright/src/parser/lexer.rs | 130 ++++++++++++---------- wright/src/repl.rs | 2 +- 5 files changed, 236 insertions(+), 163 deletions(-) diff --git a/wright/benches/lexer.rs b/wright/benches/lexer.rs index 49a95b6a..35f6a8ee 100644 --- a/wright/benches/lexer.rs +++ b/wright/benches/lexer.rs @@ -1,17 +1,16 @@ //! Lexer benchmarks. 
- -use criterion::{black_box, criterion_group, criterion_main, Criterion, Bencher}; +use criterion::{black_box, criterion_group, criterion_main, Bencher, Criterion}; use wright::parser::lexer::Lexer; fn bench_symbol_tokens(c: &mut Criterion) { // Make a benchmark group. let mut group = c.benchmark_group("lexer symbol benchmarks"); - // Function to make a lexer and get a token from it. - let make_lexer_and_get_token = |b: &mut Bencher, input: &str| { - b.iter(|| Lexer::new(black_box(input)).next_token()) - }; + // Function to make a lexer and get a token from it. + fn make_lexer_and_get_token(b: &mut Bencher, input: &str) { + b.iter(|| Lexer::new(black_box(input)).next_token()); + } let inputs = ["+", "+=", "*", "@", "?"]; diff --git a/wright/src/filemap.rs b/wright/src/filemap.rs index 0ce9afa0..eb9d8984 100644 --- a/wright/src/filemap.rs +++ b/wright/src/filemap.rs @@ -1,20 +1,24 @@ //! Responsible for keeping track of different files added to the Wright build system. -use codespan_reporting::{files::{Files, SimpleFile}, term::Config, diagnostic::Diagnostic}; +use crate::parser::fragment::Fragment; +use codespan_reporting::{ + diagnostic::Diagnostic, + files::{Files, SimpleFile}, + term::Config, +}; use derive_more::Display; use fs4::FileExt; use memmap2::Mmap; +use std::{fs::File, io, path::PathBuf, sync::mpsc, thread, time::Duration}; use termcolor::{ColorChoice, StandardStream}; -use std::{path::PathBuf, io, fs::File, sync::mpsc, thread, time::Duration}; -use crate::parser::fragment::Fragment; -/// Rename import for clarity. +/// Rename import for clarity. use codespan_reporting::files::Error as CodespanError; -/// Convenience type alias. +/// Convenience type alias. type CodespanResult = Result; -/// Amount of time before we should warn the user about locking the file taking too long. +/// Amount of time before we should warn the user about locking the file taking too long. const FILE_LOCK_WARNING_TIME: Duration = Duration::from_secs(5); /// Used to represent different file names used throughout this crate. @@ -33,112 +37,127 @@ pub enum FileName { None, } -/// An immutable string that either references a source file in memory using an `&` reference or using a [Box]. +/// An immutable string that either references a source file in memory using an `&` reference or using a [Box]. #[derive(Debug)] enum ImmutableString<'src> { - /// An immutable reference to an existing string. + /// An immutable reference to an existing string. Reference(&'src str), - /// An owned immutable string. + /// An owned immutable string. Owned(Box), - /// A locked, memory mapped file from the OS. + /// A locked, memory mapped file from the OS. LockedFile { - /// The locked file that needs to be unlocked when this object is dropped. + /// The locked file that needs to be unlocked when this object is dropped. locked_file: File, /// The memory locked file -- this is expected to be locked before - /// one creates it in the file + /// one creates it in the file mem_map: Mmap, - } + }, } /// The file map that we use throughout the rest of this crate. pub struct FileMap<'src> { - /// This is just a list of files we're keeping track of. + /// This is just a list of files we're keeping track of. 
/// This is identical to the current implementation of [codespan_reporting::files::SimpleFiles], - /// but we don't use theirs because we need to iterate over each [SimpleFile] manually for various + /// but we don't use theirs because we need to iterate over each [SimpleFile] manually for various /// parts of the implementation. - inner: Vec>> + inner: Vec>>, } - impl<'src> FileMap<'src> { - /// Construct a new empty [FileMap]. + /// Construct a new empty [FileMap]. pub const fn new() -> Self { FileMap { inner: Vec::new() } } - /// Get a reference to a file from the internal [Vec] or return a [`CodespanError::FileMissing`] error. - fn get(&self, file_id: >::FileId) -> CodespanResult<&SimpleFile>> { + /// Get a reference to a file from the internal [Vec] or return a [`CodespanError::FileMissing`] error. + fn get( + &self, + file_id: >::FileId, + ) -> CodespanResult<&SimpleFile>> { self.inner.get(file_id).ok_or(CodespanError::FileMissing) } /// Internal function to add a file to the vec. Public facing functions will need to do some conversion - /// and then call this. - fn add(&mut self, name: FileName, source: ImmutableString<'src>) -> >::FileId { + /// and then call this. + fn add( + &mut self, + name: FileName, + source: ImmutableString<'src>, + ) -> >::FileId { // The file id is just the next index in the vec. let file_id: usize = self.inner.len(); self.inner.push(SimpleFile::new(name, source)); file_id } - /// Add a file (in the form of an owned string) to the file map. + /// Add a file (in the form of an owned string) to the file map. pub fn add_string(&mut self, name: FileName, source: String) -> >::FileId { self.add(name, ImmutableString::Owned(source.into_boxed_str())) } - /// Add a file (in the form of a string reference) to the file map. - pub fn add_str_ref(&mut self, name: FileName, source: &'src str) -> >::FileId { + /// Add a file (in the form of a string reference) to the file map. + pub fn add_str_ref( + &mut self, + name: FileName, + source: &'src str, + ) -> >::FileId { self.add(name, ImmutableString::Reference(source)) } - /// Add a file from the file system. This file will be - /// opened with read permissions, locked, memory mapped, - /// and then added to the file map. The file name in the memory map will be the [PathBuf] passed to this function. + /// Add a file from the file system. This file will be + /// opened with read permissions, locked, memory mapped, + /// and then added to the file map. The file name in the memory map will be the [PathBuf] passed to this function. pub fn add_file(&mut self, path: PathBuf) -> io::Result<>::FileId> { // Make a one-off enum here to use for channel messages. enum ChannelMessage { /// The file was successfully locked. FileLocked(File), - /// There was an error locking the file. + /// There was an error locking the file. LockingError(io::Error), - /// File is taking a long time to lock. + /// File is taking a long time to lock. FiveSecondWarning, } - // Open the file for reading. + // Open the file for reading. let file: File = File::open(&path)?; - // Create two threads and a mpsc channel for warning the user if - // locking the file takes longer than 5 seconds. + // Create two threads and a mpsc channel for warning the user if + // locking the file takes longer than 5 seconds. 
let (tx, rx) = mpsc::sync_channel::(1); let timout_tx = tx.clone(); - // Thread to lock the file - thread::spawn(move || { - match file.lock_exclusive() { - Ok(_) => tx.send(ChannelMessage::FileLocked(file)), - Err(err) => tx.send(ChannelMessage::LockingError(err)) - } + // Thread to lock the file + thread::spawn(move || match file.lock_exclusive() { + Ok(_) => tx.send(ChannelMessage::FileLocked(file)), + Err(err) => tx.send(ChannelMessage::LockingError(err)), }); - // Thread to warn user if it takes too long. + // Thread to warn user if it takes too long. thread::spawn(move || { thread::sleep(FILE_LOCK_WARNING_TIME); timout_tx.send(ChannelMessage::FiveSecondWarning) }); - // Use an infinite loop to make sure we recieve all the messages from the senders. + // Use an infinite loop to make sure we recieve all the messages from the senders. loop { match rx.recv() { - // Emit the diagnostic for the 5-second warning. + // Emit the diagnostic for the 5-second warning. Ok(ChannelMessage::FiveSecondWarning) => { - // Get a lock on the standard out so that we don't get interrupted here. + // Get a lock on the standard out so that we don't get interrupted here. let stdout = StandardStream::stdout(ColorChoice::Auto); let mut stdout = stdout.lock(); // Make the diagnostic to show to the user. - let message = format!("Getting a file lock on {} has taken more than {} seconds.", path.display(), FILE_LOCK_WARNING_TIME.as_secs()); - let diagnostic: Diagnostic< as Files<'src>>::FileId> = Diagnostic::note().with_message(message); + let message = format!( + "Getting a file lock on {} has taken more than {} seconds.", + path.display(), + FILE_LOCK_WARNING_TIME.as_secs() + ); + + let diagnostic: Diagnostic< as Files<'src>>::FileId> = + Diagnostic::note().with_message(message); + // Emit the diagnostic to the user. codespan_reporting::term::emit(&mut stdout, &Config::default(), self, &diagnostic) // Convert from the potential codespan error to a normal IO err. @@ -148,53 +167,72 @@ impl<'src> FileMap<'src> { })? } - // Handle any io errors locking the file by returning them. + // Handle any io errors locking the file by returning them. Ok(ChannelMessage::LockingError(io_err)) => return Err(io_err), - // Handle success by finishing adding the file to the FileMap. + // Handle success by finishing adding the file to the FileMap. Ok(ChannelMessage::FileLocked(file)) => { - // The file is now locked, we can memmory map it and add it ro the vec. - // SAFETY: The file should be locked at this point so undefined behaviour from concurrent - // modification is avoided. - let mem_map: Mmap = unsafe { + // The file is now locked, we can memmory map it and add it ro the vec. + // SAFETY: The file should be locked at this point so undefined behaviour from concurrent + // modification is avoided. + let mem_map: Mmap = unsafe { Mmap::map(&file) - // Make sure we unlock the file if there's an issue memory mapping it. + // Make sure we unlock the file if there's an issue memory mapping it. .map_err(|err| { - file.unlock().map_err(|err| eprintln!("Error unlocking file: {:?}", err)).ok(); + file.unlock() + .map_err(|err| eprintln!("Error unlocking file: {:?}", err)) + .ok(); err }) }?; - // Double check that the file is valid utf-8. If not, return an IO error. + // Double check that the file is valid utf-8. If not, return an IO error. 
let raw_data: &[u8] = mem_map.as_ref(); let as_str: Result<&str, std::str::Utf8Error> = std::str::from_utf8(raw_data); if as_str.is_err() { - // The file is not valid for us so we should unlock it and return an error. - file.unlock().map_err(|err| eprintln!("Error unlocking file: {:?}", err)).ok(); - return Err(io::Error::new(io::ErrorKind::InvalidData, as_str.unwrap_err())); + // The file is not valid for us so we should unlock it and return an error. + file.unlock() + .map_err(|err| eprintln!("Error unlocking file: {:?}", err)) + .ok(); + + return Err(io::Error::new( + io::ErrorKind::InvalidData, + as_str.unwrap_err(), + )); } - // The file's contents are valid utf-8, add them to the file map. - return Ok(self.add(FileName::Real(path), ImmutableString::LockedFile { locked_file: file, mem_map })); + // The file's contents are valid utf-8, add them to the file map. + return Ok(self.add( + FileName::Real(path), + ImmutableString::LockedFile { + locked_file: file, + mem_map, + }, + )); } - Err(_) => unreachable!("The reciever should never reach a state where both senders are closed."), - } + Err(_) => unreachable!( + "The reciever should never reach a state where both senders are closed." + ), + } } } - /// Find the file ID of a given [Fragment] using the fragment's internal pointer. - pub fn find_fragment(&self, fragment: &Fragment<'src>) -> Option<>::FileId> { - // Iterate on file IDs. + /// Find the file ID of a given [Fragment] using the fragment's internal pointer. + pub fn find_fragment( + &self, + fragment: &Fragment<'src>, + ) -> Option<>::FileId> { + // Iterate on file IDs. for file_id in 0..self.inner.len() { - // Use expect because all of these file IDs should be fine. + // Use expect because all of these file IDs should be fine. let source: &str = self.source(file_id).expect("All file IDs here are valid"); if (Fragment { inner: source }).contains(fragment) { return Some(file_id); } } - // If there was no file containing the given fragment, return none. + // If there was no file containing the given fragment, return none. None } } @@ -205,22 +243,23 @@ impl<'src> Drop for ImmutableString<'src> { match self { // Unlock locked files. ImmutableString::LockedFile { locked_file, .. } => { - locked_file.unlock() - // Log the error if there is one, + locked_file + .unlock() + // Log the error if there is one, .map_err(|io_err: io::Error| eprintln!("{}", io_err)) // Discard value of result .ok(); } - // All other types drop trivially. + // All other types drop trivially. ImmutableString::Owned(_) | ImmutableString::Reference(_) => {} } } } -/// The implementation here is basically identical to the one for [codespan_reporting::files::SimpleFiles]. +/// The implementation here is basically identical to the one for [codespan_reporting::files::SimpleFiles]. impl<'src> Files<'src> for FileMap<'src> { - /// File IDs here are just indices into [FileMap]'s internal [Vec]. + /// File IDs here are just indices into [FileMap]'s internal [Vec]. 
type FileId = usize; type Name = FileName; @@ -231,15 +270,26 @@ impl<'src> Files<'src> for FileMap<'src> { Ok(self.get(id)?.name().clone()) } - fn source(&'src self, id: Self::FileId) -> Result { + fn source( + &'src self, + id: Self::FileId, + ) -> Result { Ok(self.get(id)?.source().as_ref()) } - fn line_index(&self, id: Self::FileId, byte_index: usize) -> Result { + fn line_index( + &self, + id: Self::FileId, + byte_index: usize, + ) -> Result { self.get(id)?.line_index((), byte_index) } - fn line_range(&self, id: Self::FileId, line_index: usize) -> Result, codespan_reporting::files::Error> { + fn line_range( + &self, + id: Self::FileId, + line_index: usize, + ) -> Result, codespan_reporting::files::Error> { self.get(id)?.line_range((), line_index) } } @@ -250,9 +300,9 @@ impl<'src> AsRef for ImmutableString<'src> { ImmutableString::Reference(str) => str, ImmutableString::Owned(str) => &str, ImmutableString::LockedFile { mem_map, .. } => { - // Get a direct reference to the data that is in the memory map. + // Get a direct reference to the data that is in the memory map. let raw_data: &[u8] = mem_map.as_ref(); - // SAFETY: UTF-8 validity is checked when the file is added to the file map. + // SAFETY: UTF-8 validity is checked when the file is added to the file map. unsafe { std::str::from_utf8_unchecked(raw_data) } } } diff --git a/wright/src/parser/fragment.rs b/wright/src/parser/fragment.rs index f67b0265..62fe95a7 100644 --- a/wright/src/parser/fragment.rs +++ b/wright/src/parser/fragment.rs @@ -2,45 +2,45 @@ use std::str::Chars; -/// A fragment of source code. +/// A fragment of source code. #[derive(Clone, Copy, Debug)] pub struct Fragment<'src> { - /// Fragments are represented using direct string references into the source file itself. - pub inner: &'src str + /// Fragments are represented using direct string references into the source file itself. + pub inner: &'src str, } impl<'src> Fragment<'src> { - /// Get the length (in bytes) of this fragment. + /// Get the length (in bytes) of this fragment. pub const fn len(&self) -> usize { self.inner.len() } - /// Check if the length of this fragment is zero. + /// Check if the length of this fragment is zero. pub const fn is_empty(&self) -> bool { self.inner.is_empty() } - /// Get a pair of pointers, the first one being at the beginning of the fragment, the second one pointing + /// Get a pair of pointers, the first one being at the beginning of the fragment, the second one pointing /// to the byte after the end of the fragment. const fn start_and_end(&self) -> (*const u8, *const u8) { - // Get the pointer to the start of the fragment. + // Get the pointer to the start of the fragment. let start: *const u8 = self.inner.as_ptr(); - // Get a pointer just past the end of the string. - // SAFETY: the resulting pointer is guarunteed to point at one byte past the end of the string. + // Get a pointer just past the end of the string. + // SAFETY: the resulting pointer is guarunteed to point at one byte past the end of the string. (start, unsafe { start.add(self.len()) }) } - /// Return true if this fragment overlaps at all with the other (either one contains the start of the other, + /// Return true if this fragment overlaps at all with the other (either one contains the start of the other, /// by pointer). pub fn overlaps(&self, other: &Self) -> bool { // Get start and end pointers for both fragments. 
let (start, end) = self.start_and_end(); let (other_start, other_end) = other.start_and_end(); - // Check if this fragment contains either end of the other fragment. + // Check if this fragment contains either end of the other fragment. (start <= other_start && other_start < end) || (other_start <= start && start < other_end) } - /// Return true if this fragment entirely contains another fragment using pointers. + /// Return true if this fragment entirely contains another fragment using pointers. pub fn contains(&self, other: &Self) -> bool { // Get start and end pointers for both fragments. let (start, end) = self.start_and_end(); @@ -50,33 +50,43 @@ impl<'src> Fragment<'src> { } /// Split this fragment into two sub fragments, with the first one being `bytes` long and the second containing the - /// rest of this fragment. - /// - /// Panics if the byte index is not in the fragment, or if it is not on a char boundary. + /// rest of this fragment. + /// + /// Panics if the byte index is not in the fragment, or if it is not on a char boundary. pub fn split(&self, bytes: usize) -> (Self, Self) { - // Use str's split_at. + // Use str's split_at. let (left, right) = self.inner.split_at(bytes); (Self { inner: left }, Self { inner: right }) } - /// Get an iterator over the characters in this fragment. + /// Get an iterator over the characters in this fragment. pub fn chars(&self) -> Chars<'src> { self.inner.chars() } } - #[cfg(test)] mod tests { use crate::parser::fragment::Fragment; #[test] fn test_overlap() { - let a = Fragment { inner: "Test string" }; - let b = Fragment { inner: &a.inner[3..] }; - let c = Fragment { inner: &a.inner[..a.len()-3] }; - let d = Fragment { inner: "other string" }; + let a = Fragment { + inner: "Test string", + }; + + let b = Fragment { + inner: &a.inner[3..], + }; + + let c = Fragment { + inner: &a.inner[..a.len() - 3], + }; + + let d = Fragment { + inner: "other string", + }; assert!(a.overlaps(&b)); assert!(b.overlaps(&c)); diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index 08676d39..38806906 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -1,14 +1,14 @@ //! First pass lexer that gets run on the source code and returns a series of tokens with their associated [Fragment]s. -//! -//! Note that this will strip out comments and whitespace, returning only fragments that match one of the patterns -//! defined for tokens. +//! +//! Note that this will strip out comments and whitespace, returning only fragments that match one of the patterns +//! defined for tokens. +use super::fragment::Fragment; use std::ptr; use std::str::Chars; use unicode_ident::{is_xid_continue, is_xid_start}; -use super::fragment::Fragment; -/// Constant table of single character tokens and the characters that match them. +/// Constant table of single character tokens and the characters that match them. pub const SINGLE_CHAR_TOKENS: &[(char, TokenTy)] = &[ ('(', TokenTy::LeftParen), (')', TokenTy::RightParen), @@ -25,7 +25,7 @@ pub const SINGLE_CHAR_TOKENS: &[(char, TokenTy)] = &[ ]; /// Tokens that can be either a single character or upgraded with an -/// equals sign. +/// equals sign. pub const POSSIBLE_EQ_UPGRADE_TOKENS: &[(char, TokenTy, TokenTy)] = &[ ('!', TokenTy::Bang, TokenTy::BangEq), ('%', TokenTy::Mod, TokenTy::ModEq), @@ -35,7 +35,7 @@ pub const POSSIBLE_EQ_UPGRADE_TOKENS: &[(char, TokenTy, TokenTy)] = &[ ('/', TokenTy::Div, TokenTy::DivEq), ]; -/// Characters that can produce different tokens when followed by an equals sign or themselves.
+/// Characters that can produce different tokens when followed by an equals sign or themselves. pub const POSSIBLE_EQ_OR_DOUBLED_UPGRADE_TOKENS: &[(char, TokenTy, TokenTy, TokenTy)] = &[ ('&', TokenTy::And, TokenTy::AndEq, TokenTy::AndAnd), ('|', TokenTy::Or, TokenTy::OrEq, TokenTy::OrOr), @@ -44,40 +44,46 @@ pub const POSSIBLE_EQ_OR_DOUBLED_UPGRADE_TOKENS: &[(char, TokenTy, TokenTy, Toke (':', TokenTy::Colon, TokenTy::ColonEq, TokenTy::ColonColon), ]; -/// Characters that can produce different tokens when followed by an equals sign or +/// Characters that can produce different tokens when followed by an equals sign or /// a `>` for arrows. pub const POSSIBLE_EQ_OR_ARROW_UPGRADE_TOKENS: &[(char, TokenTy, TokenTy, TokenTy)] = &[ ('-', TokenTy::Minus, TokenTy::MinusEq, TokenTy::SingleArrow), ('=', TokenTy::Eq, TokenTy::EqEq, TokenTy::DoubleArrow), ]; -/// The number of rows of the generated prefix table. +/// The number of rows of the generated prefix table. pub const PREFIX_TABLE_ROWS: usize = { - SINGLE_CHAR_TOKENS.len() - + 2 * POSSIBLE_EQ_UPGRADE_TOKENS.len() - + 3 * POSSIBLE_EQ_OR_DOUBLED_UPGRADE_TOKENS.len() - + 3 * POSSIBLE_EQ_OR_ARROW_UPGRADE_TOKENS.len() + SINGLE_CHAR_TOKENS.len() + + 2 * POSSIBLE_EQ_UPGRADE_TOKENS.len() + + 3 * POSSIBLE_EQ_OR_DOUBLED_UPGRADE_TOKENS.len() + + 3 * POSSIBLE_EQ_OR_ARROW_UPGRADE_TOKENS.len() }; -/// A relationship between a prefix and the token that should be generated when that prefix matches. +/// A relationship between a prefix and the token that should be generated when that prefix matches. #[derive(Copy, Clone, Debug)] pub struct PrefixToToken { - /// An array of two chars. In single char tokens, the second one should be a null character (`'\0'`). - /// the char_length field will be used to slice this buffer to get the actual prefix. + /// An array of two chars. In single char tokens, the second one should be a null character (`'\0'`). + /// the char_length field will be used to slice this buffer to get the actual prefix. pub char_buffer: [char; 2], - /// The byte length of this prefix and all generated tokens by this prefix. + /// The byte length of this prefix and all generated tokens by this prefix. pub byte_len: usize, - /// The kind of [Token] generated when this prefix matches. + /// The kind of [Token] generated when this prefix matches. pub kind: TokenTy, } impl PrefixToToken { - /// Convenience function to construct a [`PrefixToToken`] by calculating the length of both chars - /// (and ignoring the second one if it's null). + /// Convenience function to construct a [`PrefixToToken`] by calculating the length of both chars + /// (and ignoring the second one if it's null). pub const fn new(chars: [char; 2], kind: TokenTy) -> Self { PrefixToToken { char_buffer: chars, - byte_len: if chars[1] == '\0' { chars[0].len_utf8() } else { chars[0].len_utf8() + chars[1].len_utf8() }, + + byte_len: if chars[1] == '\0' { + chars[0].len_utf8() + } else { + chars[0].len_utf8() + chars[1].len_utf8() + }, + kind, } } @@ -156,21 +162,20 @@ pub const PREFIX_TABLE: [PrefixToToken; PREFIX_TABLE_ROWS] = { table }; - -/// The lexical analyser for wright. This produces a series of tokens that make up the larger elements of the language. +/// The lexical analyser for wright. This produces a series of tokens that make up the larger elements of the language. #[derive(Debug)] pub struct Lexer<'src> { - /// The remaining source code that has not been processed and returned as a token from the iterator yet. 
+ /// The remaining source code that has not been processed and returned as a token from the iterator yet. pub remaining: Fragment<'src>, } -/// A token in wright source code. +/// A token in wright source code. #[derive(Debug)] pub struct Token<'src> { - /// What type of token this is. + /// What type of token this is. pub variant: TokenTy, - /// The matching fragment of source code -- this contains the location and length data for the token. - pub fragment: Fragment<'src> + /// The matching fragment of source code -- this contains the location and length data for the token. + pub fragment: Fragment<'src>, } /// The different types of tokens in wright source. @@ -242,18 +247,20 @@ pub enum TokenTy { } impl<'src> Lexer<'src> { - /// Get the number of bytes remaining that we need to transform into tokens. + /// Get the number of bytes remaining that we need to transform into tokens. pub const fn bytes_remaining(&self) -> usize { self.remaining.len() } - /// Construct a new lexer over a given reference to a source string. + /// Construct a new lexer over a given reference to a source string. pub const fn new(source: &'src str) -> Self { - Lexer { remaining: Fragment { inner: source } } + Lexer { + remaining: Fragment { inner: source }, + } } /// Try to match a fragment recognized to be an identifier or keyword to - /// a keyword or return [TokenTy::Identifier]. + /// a keyword or return [TokenTy::Identifier]. fn identifier_or_keyword(fragment: Fragment<'src>) -> TokenTy { use TokenTy::*; @@ -269,11 +276,11 @@ impl<'src> Lexer<'src> { "trait" => KwTrait, "const" => KwConst, "where" => KwWhere, - + "use" => KwUse, "as" => KwAs, "mod" => KwMod, - + "if" => KwIf, "else" => KwElse, @@ -287,30 +294,34 @@ impl<'src> Lexer<'src> { "_" => Underscore, - _ => Identifier + _ => Identifier, } } /// Make a token by splitting a given number of bytes off of the `self.remaining` fragment - /// and labeling them with the given kind. + /// and labeling them with the given kind. fn split_token(&mut self, bytes: usize, kind: TokenTy) -> Token<'src> { let (token_fragment, new_remaining_fragment) = self.remaining.split(bytes); self.remaining = new_remaining_fragment; - Token { variant: kind, fragment: token_fragment } + + Token { + variant: kind, + fragment: token_fragment, + } } /// Get the next token from the lexer. pub fn next_token(&mut self) -> Option> { - // If the remaining input is empty, there is no token. + // If the remaining input is empty, there is no token. if self.remaining.is_empty() { return None; } - // Use blocks heavily in this function as we don't want to re-use iterators or variables - // after we check them in most cases. + // Use blocks heavily in this function as we don't want to re-use iterators or variables + // after we check them in most cases. - // If there is whitespace at the start of the remaining fragment, strip it and re-run this - // function to get the next token. + // If there is whitespace at the start of the remaining fragment, strip it and re-run this + // function to get the next token. { let without_whitespace: &str = self.remaining.inner.trim_start(); @@ -319,69 +330,72 @@ impl<'src> Lexer<'src> { return self.next_token(); } } - + // To attempt to match a token from the prefix table, make a char iterator // and get two chars from it to test equality. None of the tokens start with a // null character so use that as a single of an unavailable char. 
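// (Illustrative aside, not part of the patch itself: with remaining input "+= x" the
// two-character buffer built below comes out as ['+', '='], which matches the PlusEq row of
// PREFIX_TABLE and splits off a two-byte token; with remaining input "+ x" the buffer is
// ['+', ' '], which fails the PlusEq comparison and instead matches the single-character
// Plus row.)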
{ let mut char_iter: Chars = self.remaining.chars(); let char_array: [char; 2] = [ - // Unchecked unwrap here since we know there's at least one char. - unsafe { char_iter.next().unwrap_unchecked() }, - char_iter.next().unwrap_or('\0') + // Unchecked unwrap here since we know there's at least one char. + unsafe { char_iter.next().unwrap_unchecked() }, + char_iter.next().unwrap_or('\0'), ]; // Next iterate through the prefix table to try to get any tokens that are covered there. for prefix_meta in PREFIX_TABLE.iter() { // If it's a single char comparison, only compare the first chars. - if prefix_meta.char_buffer[1] == '\0' && prefix_meta.char_buffer[0] == char_array[0] { + if prefix_meta.char_buffer[1] == '\0' && prefix_meta.char_buffer[0] == char_array[0] + { return Some(self.split_token(prefix_meta.byte_len, prefix_meta.kind)); } - // Otherwise compare the whole slices. + // Otherwise compare the whole slices. if &prefix_meta.char_buffer == &char_array { return Some(self.split_token(prefix_meta.byte_len, prefix_meta.kind)); } } } - // Next attempt to match a keyword or identifier. + // Next attempt to match a keyword or identifier. { let mut chars: Chars = self.remaining.chars(); - // The unsafe is fine here -- we've established that this lexer has bytes remaining. + // The unsafe is fine here -- we've established that this lexer has bytes remaining. let next: char = unsafe { chars.next().unwrap_unchecked() }; if is_xid_start(next) || next == '_' { let mut bytes_consumed: usize = next.len_utf8(); - // Take remaining chars and add to sum. + // Take remaining chars and add to sum. bytes_consumed += chars .take_while(|c| is_xid_continue(*c)) .map(char::len_utf8) .sum::(); - // Split the number of bytes we consumed. + // Split the number of bytes we consumed. let (ident_frag, new_remaining) = self.remaining.split(bytes_consumed); - // Get the token kind to produce for this fragment. + // Get the token kind to produce for this fragment. let variant = Lexer::identifier_or_keyword(ident_frag); - // Update the lexers remaining fragment. + // Update the lexers remaining fragment. self.remaining = new_remaining; - // Return the identifier, keyword, or underscore. - return Some(Token { variant, fragment: ident_frag }); + // Return the identifier, keyword, or underscore. + return Some(Token { + variant, + fragment: ident_frag, + }); } } unimplemented!() } - } #[cfg(test)] mod tests { - use crate::parser::lexer::TokenTy; use super::Lexer; use super::PREFIX_TABLE; + use crate::parser::lexer::TokenTy; #[test] #[ignore = "this test is just used for debugging the prefix table"] diff --git a/wright/src/repl.rs b/wright/src/repl.rs index f0e024b5..40d16370 100644 --- a/wright/src/repl.rs +++ b/wright/src/repl.rs @@ -116,6 +116,6 @@ pub fn start() -> anyhow::Result<()> { write!(&mut output, "[{}]: << ", input_number)?; output.flush()?; - unimplemented!("REPL needs to be re-worked a bit."); + unimplemented!("REPL needs to be re-worked a bit."); } } From 2f18bb7398f2e5faf93e03c386eb6aef4b8495f2 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sun, 11 Feb 2024 01:01:08 -0500 Subject: [PATCH 19/60] Clippy changes --- wright/src/filemap.rs | 7 ++++--- wright/src/parser/lexer.rs | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/wright/src/filemap.rs b/wright/src/filemap.rs index eb9d8984..d9718b8e 100644 --- a/wright/src/filemap.rs +++ b/wright/src/filemap.rs @@ -189,7 +189,8 @@ impl<'src> FileMap<'src> { // Double check that the file is valid utf-8. If not, return an IO error. 
let raw_data: &[u8] = mem_map.as_ref(); let as_str: Result<&str, std::str::Utf8Error> = std::str::from_utf8(raw_data); - if as_str.is_err() { + + if let Err(utf8_err) = as_str { // The file is not valid for us so we should unlock it and return an error. file.unlock() .map_err(|err| eprintln!("Error unlocking file: {:?}", err)) @@ -197,7 +198,7 @@ impl<'src> FileMap<'src> { return Err(io::Error::new( io::ErrorKind::InvalidData, - as_str.unwrap_err(), + utf8_err, )); } @@ -298,7 +299,7 @@ impl<'src> AsRef for ImmutableString<'src> { fn as_ref(&self) -> &str { match self { ImmutableString::Reference(str) => str, - ImmutableString::Owned(str) => &str, + ImmutableString::Owned(str) => str, ImmutableString::LockedFile { mem_map, .. } => { // Get a direct reference to the data that is in the memory map. let raw_data: &[u8] = mem_map.as_ref(); diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index 38806906..b1294d17 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -303,7 +303,7 @@ impl<'src> Lexer<'src> { fn split_token(&mut self, bytes: usize, kind: TokenTy) -> Token<'src> { let (token_fragment, new_remaining_fragment) = self.remaining.split(bytes); self.remaining = new_remaining_fragment; - + Token { variant: kind, fragment: token_fragment, @@ -351,7 +351,7 @@ impl<'src> Lexer<'src> { } // Otherwise compare the whole slices. - if &prefix_meta.char_buffer == &char_array { + if prefix_meta.char_buffer == char_array { return Some(self.split_token(prefix_meta.byte_len, prefix_meta.kind)); } } From 2acce9def458723a465619da6d6c8df0b4aad6bc Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sun, 11 Feb 2024 01:01:32 -0500 Subject: [PATCH 20/60] cargo fmt --- wright/src/filemap.rs | 7 ++----- wright/src/parser/fragment.rs | 4 ++-- wright/src/parser/lexer.rs | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/wright/src/filemap.rs b/wright/src/filemap.rs index d9718b8e..91d10985 100644 --- a/wright/src/filemap.rs +++ b/wright/src/filemap.rs @@ -195,11 +195,8 @@ impl<'src> FileMap<'src> { file.unlock() .map_err(|err| eprintln!("Error unlocking file: {:?}", err)) .ok(); - - return Err(io::Error::new( - io::ErrorKind::InvalidData, - utf8_err, - )); + + return Err(io::Error::new(io::ErrorKind::InvalidData, utf8_err)); } // The file's contents are valid utf-8, add them to the file map. 
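Between the file-by-file diffs it is worth pausing on how the lexer surface that these patches keep reshaping is meant to be driven. The following sketch is not a file from this patch series; it assumes only the crate paths shown in the diffs (`wright::parser::lexer::{Lexer, TokenTy}`) and the `PartialEq`/`Debug` behaviour that the in-tree tests already rely on.

use wright::parser::lexer::{Lexer, TokenTy};

fn main() {
    // Lex a small in-memory source string and collect the kind of each token.
    let mut lexer = Lexer::new("use wright::lexer;");
    let mut kinds = Vec::new();

    while let Some(token) = lexer.next_token() {
        kinds.push(token.variant);
    }

    // Keywords, identifiers, "::" and ";" all come back as distinct token kinds.
    assert_eq!(
        kinds,
        [
            TokenTy::KwUse,
            TokenTy::Identifier,
            TokenTy::ColonColon,
            TokenTy::Identifier,
            TokenTy::Semi,
        ]
    );
}

Whitespace between the tokens is stripped by the lexer itself, so the caller never has to trim the input.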
diff --git a/wright/src/parser/fragment.rs b/wright/src/parser/fragment.rs index 62fe95a7..6f159ec4 100644 --- a/wright/src/parser/fragment.rs +++ b/wright/src/parser/fragment.rs @@ -79,11 +79,11 @@ mod tests { let b = Fragment { inner: &a.inner[3..], }; - + let c = Fragment { inner: &a.inner[..a.len() - 3], }; - + let d = Fragment { inner: "other string", }; diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index b1294d17..b6b917bd 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -83,7 +83,7 @@ impl PrefixToToken { } else { chars[0].len_utf8() + chars[1].len_utf8() }, - + kind, } } From 7af8d2155ad1ce9cb1bb986b280c8f6e7fe52e69 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Tue, 13 Feb 2024 01:55:51 -0500 Subject: [PATCH 21/60] Integer Literal tokens --- wright/Cargo.toml | 6 ++--- wright/src/parser/lexer.rs | 52 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/wright/Cargo.toml b/wright/Cargo.toml index 8957aebb..df699765 100644 --- a/wright/Cargo.toml +++ b/wright/Cargo.toml @@ -87,9 +87,9 @@ features = ["strict-versioning", "force-static"] version = "0.3" features = ["llvm17-0"] -# Fast parsing for integers and floats from source code. -[dependencies.lexical-core] -version = "0.8" +# # Fast parsing for integers and floats from source code. +# [dependencies.lexical-core] +# version = "0.8" # TEST DEPENDENCIES diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index b6b917bd..f9728314 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -4,8 +4,8 @@ //! defined for tokens. use super::fragment::Fragment; -use std::ptr; use std::str::Chars; +use std::{iter::Peekable, ptr}; use unicode_ident::{is_xid_continue, is_xid_start}; /// Constant table of single character tokens and the characters that match them. @@ -242,6 +242,8 @@ pub enum TokenTy { KwLoop, KwWhere, + IntegerLiteral, + /// Unknown character in lexer fragment. Unknown } @@ -360,7 +362,6 @@ impl<'src> Lexer<'src> { // Next attempt to match a keyword or identifier. { let mut chars: Chars = self.remaining.chars(); - // The unsafe is fine here -- we've established that this lexer has bytes remaining. let next: char = unsafe { chars.next().unwrap_unchecked() }; @@ -387,6 +388,43 @@ impl<'src> Lexer<'src> { } } + // Next attempt to parse a numerical literal. + { + let mut chars: Peekable = self.remaining.chars().peekable(); + // The unsafe is fine here -- we've established that this lexer has bytes remaining. + let next: char = unsafe { chars.next().unwrap_unchecked() }; + + if next.is_ascii_digit() { + // Accumulate the number of bytes consumed in the numeric literal. + let mut acc: usize = 1; + // Track the radix + let mut radix: u32 = 10; + + // Change the radix if necessary + if next == '0' { + if let Some(prefix) = chars.next_if(|x| ['x', 'o', 'b', 'X', 'B'].contains(x)) { + // All the possible prefix chars are 1 byte ascii characters. + acc += 1; + + radix = match prefix { + 'x' | 'X' => 16, + 'b' | 'B' => 2, + 'o' => 8, + _ => unreachable!("the prefix byte is checked above"), + }; + } + } + + // Add the rest of the integer literal. 
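// (Illustrative aside, not part of the patch itself: for a remaining fragment starting with
// "0xDEAD_beef", `next` is '0', the 'x' prefix bumps `acc` to 2 and switches `radix` to 16,
// and the take_while below then adds the nine bytes of "DEAD_beef", so an eleven-byte
// TokenTy::IntegerLiteral token is split off.)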
+ acc += chars + .take_while(|c| c.is_digit(radix) || *c == '_') + .map(char::len_utf8) + .sum::(); + + return Some(self.split_token(acc, TokenTy::IntegerLiteral)); + } + } + unimplemented!() } } @@ -434,4 +472,14 @@ mod tests { assert_eq!(lexer.next_token().unwrap().variant, TokenTy::KwConst); assert_eq!(lexer.next_token().unwrap().variant, TokenTy::Identifier); } + + #[test] + fn intger_literal() { + let mut lexer = Lexer::new("123_456_789."); + + let token = lexer.next_token().unwrap(); + + assert_eq!(token.fragment.inner, "123_456_789"); + assert_eq!(token.variant, TokenTy::IntegerLiteral); + } } From a345cb7bc9c898bc6529a7af90e402598b1dc503 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sun, 18 Feb 2024 02:30:28 -0500 Subject: [PATCH 22/60] Stub AST, add token debugging, refactor FileIds --- wright/src/bin/wright.rs | 22 +++++++++++++++++++--- wright/src/filemap.rs | 29 +++++++++-------------------- wright/src/parser.rs | 1 + wright/src/parser/ast.rs | 3 +++ wright/src/parser/lexer.rs | 29 +++++++++++++++++++++++++++-- 5 files changed, 59 insertions(+), 25 deletions(-) create mode 100644 wright/src/parser/ast.rs diff --git a/wright/src/bin/wright.rs b/wright/src/bin/wright.rs index 2e589622..8ed5c89b 100644 --- a/wright/src/bin/wright.rs +++ b/wright/src/bin/wright.rs @@ -2,12 +2,13 @@ use anyhow::Result; use clap::{Parser, Subcommand}; +use codespan_reporting::files::Files; use std::path::PathBuf; -use wright::repl; +use wright::{filemap::{FileId, FileMap}, parser::lexer::{Lexer, Token}, repl}; /// The wright cli. #[derive(Parser, Debug)] -#[command(author, version, about, long_about = None)] +#[command(author, version, about, long_about = None, arg_required_else_help = true)] struct Cli { /// The subcommand passed to the wright cli. #[command(subcommand)] @@ -49,6 +50,21 @@ fn main() -> Result<()> { // Start an interactive repl. Some(Commands::Repl) => repl::start(), - _ => unimplemented!(), + // Print all the tokens for a given file. + Some(Commands::Debug { command: DebugCommands::Tokens { file, pretty: false } }) => { + let mut file_map: FileMap = FileMap::new(); + // Add the given file to the file map. + let file_id: FileId = file_map.add_file(file)?; + // Make a lexer over the entirety of the given file. + // Use unwrap here, since we know we just added the file. + let lexer: Lexer = Lexer::new(file_map.source(file_id).unwrap()); + // Get all the tokens from the lexer and print them each. + lexer.for_each(|token: Token| println!("{token:?}")); + // Return ok. + Ok(()) + }, + + + _ => unimplemented!() } } diff --git a/wright/src/filemap.rs b/wright/src/filemap.rs index 91d10985..d26d100e 100644 --- a/wright/src/filemap.rs +++ b/wright/src/filemap.rs @@ -65,6 +65,9 @@ pub struct FileMap<'src> { inner: Vec>>, } +/// File Identifier used to refer to files. +pub type FileId = as Files<'static>>::FileId; + impl<'src> FileMap<'src> { /// Construct a new empty [FileMap]. pub const fn new() -> Self { @@ -72,20 +75,13 @@ impl<'src> FileMap<'src> { } /// Get a reference to a file from the internal [Vec] or return a [`CodespanError::FileMissing`] error. - fn get( - &self, - file_id: >::FileId, - ) -> CodespanResult<&SimpleFile>> { + fn get(&self, file_id: FileId) -> CodespanResult<&SimpleFile>> { self.inner.get(file_id).ok_or(CodespanError::FileMissing) } /// Internal function to add a file to the vec. Public facing functions will need to do some conversion /// and then call this. 
- fn add( - &mut self, - name: FileName, - source: ImmutableString<'src>, - ) -> >::FileId { + fn add(&mut self, name: FileName, source: ImmutableString<'src>) -> FileId { // The file id is just the next index in the vec. let file_id: usize = self.inner.len(); self.inner.push(SimpleFile::new(name, source)); @@ -93,23 +89,19 @@ impl<'src> FileMap<'src> { } /// Add a file (in the form of an owned string) to the file map. - pub fn add_string(&mut self, name: FileName, source: String) -> >::FileId { + pub fn add_string(&mut self, name: FileName, source: String) -> FileId { self.add(name, ImmutableString::Owned(source.into_boxed_str())) } /// Add a file (in the form of a string reference) to the file map. - pub fn add_str_ref( - &mut self, - name: FileName, - source: &'src str, - ) -> >::FileId { + pub fn add_str_ref(&mut self, name: FileName, source: &'src str) -> FileId { self.add(name, ImmutableString::Reference(source)) } /// Add a file from the file system. This file will be /// opened with read permissions, locked, memory mapped, /// and then added to the file map. The file name in the memory map will be the [PathBuf] passed to this function. - pub fn add_file(&mut self, path: PathBuf) -> io::Result<>::FileId> { + pub fn add_file(&mut self, path: PathBuf) -> io::Result { // Make a one-off enum here to use for channel messages. enum ChannelMessage { /// The file was successfully locked. @@ -217,10 +209,7 @@ impl<'src> FileMap<'src> { } /// Find the file ID of a given [Fragment] using the fragment's internal pointer. - pub fn find_fragment( - &self, - fragment: &Fragment<'src>, - ) -> Option<>::FileId> { + pub fn find_fragment(&self, fragment: &Fragment<'src>) -> Option { // Iterate on file IDs. for file_id in 0..self.inner.len() { // Use expect because all of these file IDs should be fine. diff --git a/wright/src/parser.rs b/wright/src/parser.rs index a1c20b2e..142bca6b 100644 --- a/wright/src/parser.rs +++ b/wright/src/parser.rs @@ -7,3 +7,4 @@ pub mod fragment; pub mod lexer; +pub mod ast; diff --git a/wright/src/parser/ast.rs b/wright/src/parser/ast.rs new file mode 100644 index 00000000..ca1652f8 --- /dev/null +++ b/wright/src/parser/ast.rs @@ -0,0 +1,3 @@ +//! Abstract syntax tree representation for Wright source code. + + diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index f9728314..7c14b9e8 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -4,9 +4,11 @@ //! defined for tokens. use super::fragment::Fragment; +use std::iter::FusedIterator; use std::str::Chars; use std::{iter::Peekable, ptr}; use unicode_ident::{is_xid_continue, is_xid_start}; +use derive_more::Display; /// Constant table of single character tokens and the characters that match them. pub const SINGLE_CHAR_TOKENS: &[(char, TokenTy)] = &[ @@ -170,7 +172,8 @@ pub struct Lexer<'src> { } /// A token in wright source code. -#[derive(Debug)] +#[derive(Debug, Display)] +#[display(fmt = "\"{}\" ({:?})", "fragment.inner", variant)] pub struct Token<'src> { /// What type of token this is. pub variant: TokenTy, @@ -227,6 +230,7 @@ pub enum TokenTy { KwRepr, KwImpl, KwConstraint, + KwReferences, KwTrait, KwUse, KwAs, @@ -275,6 +279,7 @@ impl<'src> Lexer<'src> { "repr" => KwRepr, "impl" => KwImpl, "constraint" => KwConstraint, + "references" => KwReferences, "trait" => KwTrait, "const" => KwConst, "where" => KwWhere, @@ -425,10 +430,30 @@ impl<'src> Lexer<'src> { } } - unimplemented!() + // If we haven't matched at this point, produce a token marked as "Unknown". 
+ // The unsafe is fine -- we know from above that there are remaining characters. + let unknown_char = unsafe { self.remaining.chars().next().unwrap_unchecked() }; + return Some(self.split_token(unknown_char.len_utf8(), TokenTy::Unknown)); + } +} + +/// Lexers can be considered token iterators. +impl<'src> Iterator for Lexer<'src> { + type Item = Token<'src>; + + fn next(&mut self) -> Option { + self.next_token() + } + + fn size_hint(&self) -> (usize, Option) { + // Lexers cannot return multiple tokens for a single byte. + (0, Some(self.bytes_remaining())) } } +// Lexers are fused -- they cannot generate tokens infinitely. +impl<'src> FusedIterator for Lexer<'src> {} + #[cfg(test)] mod tests { use super::Lexer; From 82187d3659cc446ae9c2aef16f1f3a98e7904b02 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sun, 18 Feb 2024 02:31:10 -0500 Subject: [PATCH 23/60] cargo fmt --- wright/src/bin/wright.rs | 29 +++++++++++++++++++---------- wright/src/filemap.rs | 2 +- wright/src/parser.rs | 2 +- wright/src/parser/ast.rs | 4 +--- wright/src/parser/lexer.rs | 10 +++++----- 5 files changed, 27 insertions(+), 20 deletions(-) diff --git a/wright/src/bin/wright.rs b/wright/src/bin/wright.rs index 8ed5c89b..61ce6bc3 100644 --- a/wright/src/bin/wright.rs +++ b/wright/src/bin/wright.rs @@ -4,7 +4,11 @@ use anyhow::Result; use clap::{Parser, Subcommand}; use codespan_reporting::files::Files; use std::path::PathBuf; -use wright::{filemap::{FileId, FileMap}, parser::lexer::{Lexer, Token}, repl}; +use wright::{ + filemap::{FileId, FileMap}, + parser::lexer::{Lexer, Token}, + repl, +}; /// The wright cli. #[derive(Parser, Debug)] @@ -50,21 +54,26 @@ fn main() -> Result<()> { // Start an interactive repl. Some(Commands::Repl) => repl::start(), - // Print all the tokens for a given file. - Some(Commands::Debug { command: DebugCommands::Tokens { file, pretty: false } }) => { + // Print all the tokens for a given file. + Some(Commands::Debug { + command: + DebugCommands::Tokens { + file, + pretty: false, + }, + }) => { let mut file_map: FileMap = FileMap::new(); - // Add the given file to the file map. + // Add the given file to the file map. let file_id: FileId = file_map.add_file(file)?; - // Make a lexer over the entirety of the given file. - // Use unwrap here, since we know we just added the file. + // Make a lexer over the entirety of the given file. + // Use unwrap here, since we know we just added the file. let lexer: Lexer = Lexer::new(file_map.source(file_id).unwrap()); - // Get all the tokens from the lexer and print them each. + // Get all the tokens from the lexer and print them each. lexer.for_each(|token: Token| println!("{token:?}")); // Return ok. Ok(()) - }, + } - - _ => unimplemented!() + _ => unimplemented!(), } } diff --git a/wright/src/filemap.rs b/wright/src/filemap.rs index d26d100e..5c5612f8 100644 --- a/wright/src/filemap.rs +++ b/wright/src/filemap.rs @@ -65,7 +65,7 @@ pub struct FileMap<'src> { inner: Vec>>, } -/// File Identifier used to refer to files. +/// File Identifier used to refer to files. 
pub type FileId = as Files<'static>>::FileId; impl<'src> FileMap<'src> { diff --git a/wright/src/parser.rs b/wright/src/parser.rs index 142bca6b..6f03f9b7 100644 --- a/wright/src/parser.rs +++ b/wright/src/parser.rs @@ -5,6 +5,6 @@ // pub mod state; // pub mod util; +pub mod ast; pub mod fragment; pub mod lexer; -pub mod ast; diff --git a/wright/src/parser/ast.rs b/wright/src/parser/ast.rs index ca1652f8..9b7bc8c1 100644 --- a/wright/src/parser/ast.rs +++ b/wright/src/parser/ast.rs @@ -1,3 +1 @@ -//! Abstract syntax tree representation for Wright source code. - - +//! Abstract syntax tree representation for Wright source code. diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index 7c14b9e8..07518a29 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -4,11 +4,11 @@ //! defined for tokens. use super::fragment::Fragment; +use derive_more::Display; use std::iter::FusedIterator; use std::str::Chars; use std::{iter::Peekable, ptr}; use unicode_ident::{is_xid_continue, is_xid_start}; -use derive_more::Display; /// Constant table of single character tokens and the characters that match them. pub const SINGLE_CHAR_TOKENS: &[(char, TokenTy)] = &[ @@ -431,13 +431,13 @@ impl<'src> Lexer<'src> { } // If we haven't matched at this point, produce a token marked as "Unknown". - // The unsafe is fine -- we know from above that there are remaining characters. + // The unsafe is fine -- we know from above that there are remaining characters. let unknown_char = unsafe { self.remaining.chars().next().unwrap_unchecked() }; return Some(self.split_token(unknown_char.len_utf8(), TokenTy::Unknown)); } } -/// Lexers can be considered token iterators. +/// Lexers can be considered token iterators. impl<'src> Iterator for Lexer<'src> { type Item = Token<'src>; @@ -446,12 +446,12 @@ impl<'src> Iterator for Lexer<'src> { } fn size_hint(&self) -> (usize, Option) { - // Lexers cannot return multiple tokens for a single byte. + // Lexers cannot return multiple tokens for a single byte. (0, Some(self.bytes_remaining())) } } -// Lexers are fused -- they cannot generate tokens infinitely. +// Lexers are fused -- they cannot generate tokens infinitely. impl<'src> FusedIterator for Lexer<'src> {} #[cfg(test)] From c9a3740b8e6ae71117fc13575f9f530eb8605f56 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Mon, 19 Feb 2024 01:55:06 -0500 Subject: [PATCH 24/60] Ignore single line comments in lexer, print add command-line arg to print time elapsed. --- wright/src/bin/wright.rs | 22 +++++++++++++++++----- wright/src/parser/lexer.rs | 29 +++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/wright/src/bin/wright.rs b/wright/src/bin/wright.rs index 61ce6bc3..5a43cf81 100644 --- a/wright/src/bin/wright.rs +++ b/wright/src/bin/wright.rs @@ -3,7 +3,7 @@ use anyhow::Result; use clap::{Parser, Subcommand}; use codespan_reporting::files::Files; -use std::path::PathBuf; +use std::{path::PathBuf, time::Instant}; use wright::{ filemap::{FileId, FileMap}, parser::lexer::{Lexer, Token}, @@ -17,6 +17,10 @@ struct Cli { /// The subcommand passed to the wright cli. #[command(subcommand)] command: Option, + + /// Output elapsed timing information at the end of the command. + #[arg(short, long)] + timed: bool, } /// Different sub-commands that the wright cli supports. @@ -48,11 +52,14 @@ enum DebugCommands { } fn main() -> Result<()> { - let cli = Cli::parse(); + // Parse the command line arguments. 
+ let cli: Cli = Cli::parse(); + // Get the start time to track duration if asked. + let start: Instant = Instant::now(); match cli.command { // Start an interactive repl. - Some(Commands::Repl) => repl::start(), + Some(Commands::Repl) => { repl::start()?; }, // Print all the tokens for a given file. Some(Commands::Debug { @@ -70,10 +77,15 @@ fn main() -> Result<()> { let lexer: Lexer = Lexer::new(file_map.source(file_id).unwrap()); // Get all the tokens from the lexer and print them each. lexer.for_each(|token: Token| println!("{token:?}")); - // Return ok. - Ok(()) } _ => unimplemented!(), } + + // Handle timing info. + if cli.timed { + println!("\nTime elapsed: {:?}", Instant::now() - start); + } + + Ok(()) } diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index 07518a29..b62b9c92 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -338,6 +338,35 @@ impl<'src> Lexer<'src> { } } + // Discard any single-line comment at the start of this lexer and then re-run this function if there was one. + // Note that this will not detect doc comments or multi-line comments. + { + if let Some(without_comment_prefix) = self.remaining.inner.strip_prefix("//") { + // If the next character is not a slash or exclamation, indicating a doc comment. + if !without_comment_prefix.starts_with(&['/', '!']) { + // Get the number of bytes between the start of the comment and the newline, or end of file. + // Do not include bytes of whitespace at or past the newline -- those are handled above. + let line_bytes: usize = without_comment_prefix + // Make an iterator over the lines after this `//`. + .lines() + // Get only the first line. + .next() + // Map to the length of the line string. + .map(str::len) + // If there is no line after the start of this comment we have zero bytes to read. + .unwrap_or(0); + + // Split this number of bytes from the string and ignore them. + let (_, new_remaining) = without_comment_prefix.split_at(line_bytes); + // Put the split off string in a Fragment, and consider this fragment to be the + // remaining Fragment for this lexer. + self.remaining = Fragment { inner: new_remaining }; + // Restart this function. + return self.next_token(); + } + } + } + // To attempt to match a token from the prefix table, make a char iterator // and get two chars from it to test equality. None of the tokens start with a // null character so use that as a single of an unavailable char. From 9476c53424aae57deb90f2ff41c5f8d42c307020 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Mon, 19 Feb 2024 01:55:36 -0500 Subject: [PATCH 25/60] clippy --- wright/src/parser/lexer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index b62b9c92..f4f00070 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -343,7 +343,7 @@ impl<'src> Lexer<'src> { { if let Some(without_comment_prefix) = self.remaining.inner.strip_prefix("//") { // If the next character is not a slash or exclamation, indicating a doc comment. - if !without_comment_prefix.starts_with(&['/', '!']) { + if !without_comment_prefix.starts_with(['/', '!']) { // Get the number of bytes between the start of the comment and the newline, or end of file. // Do not include bytes of whitespace at or past the newline -- those are handled above. 
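// (Illustrative aside, not part of the patch itself: if the remaining fragment is
// "// note\nconst", stripping the "//" leaves " note\nconst", whose first line " note" is
// five bytes long; those bytes are discarded below and the recursive call then trims the
// newline as ordinary whitespace before returning a KwConst token for `const`.)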
let line_bytes: usize = without_comment_prefix From 3fcf26e5a8eea16b28f3d03a43e4ade85d0225e2 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Mon, 19 Feb 2024 01:56:03 -0500 Subject: [PATCH 26/60] cargo fmt --- wright/src/bin/wright.rs | 6 ++++-- wright/src/parser/lexer.rs | 21 ++++++++++++--------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/wright/src/bin/wright.rs b/wright/src/bin/wright.rs index 5a43cf81..1469488d 100644 --- a/wright/src/bin/wright.rs +++ b/wright/src/bin/wright.rs @@ -54,12 +54,14 @@ enum DebugCommands { fn main() -> Result<()> { // Parse the command line arguments. let cli: Cli = Cli::parse(); - // Get the start time to track duration if asked. + // Get the start time to track duration if asked. let start: Instant = Instant::now(); match cli.command { // Start an interactive repl. - Some(Commands::Repl) => { repl::start()?; }, + Some(Commands::Repl) => { + repl::start()?; + } // Print all the tokens for a given file. Some(Commands::Debug { diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index f4f00070..29d26080 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -338,8 +338,8 @@ impl<'src> Lexer<'src> { } } - // Discard any single-line comment at the start of this lexer and then re-run this function if there was one. - // Note that this will not detect doc comments or multi-line comments. + // Discard any single-line comment at the start of this lexer and then re-run this function if there was one. + // Note that this will not detect doc comments or multi-line comments. { if let Some(without_comment_prefix) = self.remaining.inner.strip_prefix("//") { // If the next character is not a slash or exclamation, indicating a doc comment. @@ -347,21 +347,24 @@ impl<'src> Lexer<'src> { // Get the number of bytes between the start of the comment and the newline, or end of file. // Do not include bytes of whitespace at or past the newline -- those are handled above. let line_bytes: usize = without_comment_prefix - // Make an iterator over the lines after this `//`. + // Make an iterator over the lines after this `//`. .lines() - // Get only the first line. + // Get only the first line. .next() // Map to the length of the line string. .map(str::len) // If there is no line after the start of this comment we have zero bytes to read. .unwrap_or(0); - - // Split this number of bytes from the string and ignore them. + + // Split this number of bytes from the string and ignore them. let (_, new_remaining) = without_comment_prefix.split_at(line_bytes); // Put the split off string in a Fragment, and consider this fragment to be the - // remaining Fragment for this lexer. - self.remaining = Fragment { inner: new_remaining }; - // Restart this function. + // remaining Fragment for this lexer. + self.remaining = Fragment { + inner: new_remaining, + }; + + // Restart this function. return self.next_token(); } } From 71bab84627df829dd76ce257400ab2584a06e910 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Mon, 19 Feb 2024 01:56:17 -0500 Subject: [PATCH 27/60] cargo fmt --- wright/src/parser/lexer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index 29d26080..01864fde 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -363,7 +363,7 @@ impl<'src> Lexer<'src> { self.remaining = Fragment { inner: new_remaining, }; - + // Restart this function. 
return self.next_token(); } From 05570fe97431e34bb31f1378b2f36b1ddf109e12 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Mon, 19 Feb 2024 02:06:09 -0500 Subject: [PATCH 28/60] Remove num dependency for now --- wright/Cargo.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wright/Cargo.toml b/wright/Cargo.toml index df699765..42e13fc8 100644 --- a/wright/Cargo.toml +++ b/wright/Cargo.toml @@ -63,9 +63,9 @@ version = "0.11.1" [dependencies.termcolor] version = "1.2.0" -# Big Integers -[dependencies.num] -version = "0.4" +# # Big Integers +# [dependencies.num] +# version = "0.4" # Portable (windows, mac, linux) file locking [dependencies.fs4] From 70fbeb53dd5fd5e52392455cdc581e6aad23d04d Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Mon, 19 Feb 2024 02:09:33 -0500 Subject: [PATCH 29/60] Tweak timing message --- wright/src/bin/wright.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wright/src/bin/wright.rs b/wright/src/bin/wright.rs index 1469488d..a445a556 100644 --- a/wright/src/bin/wright.rs +++ b/wright/src/bin/wright.rs @@ -86,7 +86,7 @@ fn main() -> Result<()> { // Handle timing info. if cli.timed { - println!("\nTime elapsed: {:?}", Instant::now() - start); + println!("\nTotal time elapsed since parsing arguments: {:?}", Instant::now() - start); } Ok(()) From 48a96e2d01e95890d48a03b44356685cb3632763 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Thu, 22 Feb 2024 21:09:18 -0500 Subject: [PATCH 30/60] Start multi-line comments --- wright/src/parser/lexer.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index 01864fde..93015cf3 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -339,7 +339,7 @@ impl<'src> Lexer<'src> { } // Discard any single-line comment at the start of this lexer and then re-run this function if there was one. - // Note that this will not detect doc comments or multi-line comments. + // Note that this will not detect/discard doc comments or multi-line comments. { if let Some(without_comment_prefix) = self.remaining.inner.strip_prefix("//") { // If the next character is not a slash or exclamation, indicating a doc comment. @@ -370,6 +370,11 @@ impl<'src> Lexer<'src> { } } + // Discard any multi-line comments we encounter, sparing doc comments. + { + + } + // To attempt to match a token from the prefix table, make a char iterator // and get two chars from it to test equality. None of the tokens start with a // null character so use that as a single of an unavailable char. From 6d9001ac2d375a4176a3014f1859fd1fd02b3a88 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Thu, 22 Feb 2024 23:35:22 -0500 Subject: [PATCH 31/60] Refactor single line comment and whitespace lexing --- wright/src/parser/fragment.rs | 29 ++++++ wright/src/parser/lexer.rs | 191 ++++++++++++++++++++++++++-------- 2 files changed, 177 insertions(+), 43 deletions(-) diff --git a/wright/src/parser/fragment.rs b/wright/src/parser/fragment.rs index 6f159ec4..ddc37006 100644 --- a/wright/src/parser/fragment.rs +++ b/wright/src/parser/fragment.rs @@ -30,6 +30,12 @@ impl<'src> Fragment<'src> { (start, unsafe { start.add(self.len()) }) } + /// Return true if both of these [`Fragment`]s point to the exact same slice of source code. + pub fn ptr_eq(&self, other: &Self) -> bool { + // Since std::ptr::eq works for fat pointers, we can use it here. 
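+        // For `&str` (a fat pointer) this compares both the data address and the length, so two
+        // fragments only compare equal when they cover the exact same bytes of the source.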
+ std::ptr::eq(self.inner, other.inner) + } + /// Return true if this fragment overlaps at all with the other (either one contains the start of the other, /// by pointer). pub fn overlaps(&self, other: &Self) -> bool { @@ -64,6 +70,29 @@ impl<'src> Fragment<'src> { pub fn chars(&self) -> Chars<'src> { self.inner.chars() } + + /// Get the number of bytes between the beginning of [`origin`] and the beginning of [`self`]. + /// + /// # Panics: + /// - Panics if [`self`] is not a fragment within [`origin`] according to [`Fragment::contains`]. + pub fn offset_from(&self, origin: &Self) -> usize { + if !origin.contains(self) { + panic!("This fragment must be contained in the original fragment"); + } + + // Get a pointer to the start of the original fragment. + let start: *const u8 = origin.inner.as_ptr(); + // Do the same for the subslice. + let subslice_start: *const u8 = self.inner.as_ptr(); + + // SAFETY: Since the subslice is contained (by pointer) by the origin slice, both of them + // necessarily satisfy the safety requirements of offset_from to be pointers to the same + // allocation. + // + // We can always cast to a usize since this should always be a positive offset, as long + // as the subslice is contained in the origin fragment. + unsafe { subslice_start.offset_from(start) as usize } + } } #[cfg(test)] diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index 93015cf3..15b4b4a5 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -164,8 +164,11 @@ pub const PREFIX_TABLE: [PrefixToToken; PREFIX_TABLE_ROWS] = { table }; +/// The pattern that begins any single line comments (including doc comments). +pub const SINGLE_LINE_COMMENT_PREFIX: &str = "//"; + /// The lexical analyser for wright. This produces a series of tokens that make up the larger elements of the language. -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] pub struct Lexer<'src> { /// The remaining source code that has not been processed and returned as a token from the iterator yet. pub remaining: Fragment<'src>, @@ -317,57 +320,152 @@ impl<'src> Lexer<'src> { } } - /// Get the next token from the lexer. - pub fn next_token(&mut self) -> Option> { - // If the remaining input is empty, there is no token. - if self.remaining.is_empty() { - return None; + /// "Fork" this lexer, creating a new [`Lexer`] at the same position as this one that can be used for + /// failable parsing. This can be compared to the original lexer it was forked from using [Fragment::offset_from] + /// on the underlying `remaining` fragments. + fn fork(&self) -> Self { + *self + } + + /// Remove and ignore any whitespace at the start of the remaining fragment. + fn ignore_whitespace(&mut self) { + // Get a reference to the slice of the string past any whitespace at the start. + let without_whitespace: &str = self.remaining.inner.trim_start(); + + // If the references aren't equal, update the remaining fragment. + if !ptr::eq(without_whitespace, self.remaining.inner) { + self.remaining.inner = without_whitespace; + } + } + + /// Check if a pattern matches at the start of the remaining fragment, and if so return the number of bytes. + fn matches(&self, pattern: &str) -> bool { + self.remaining.inner.starts_with(pattern) + } + + /// If the remaining fragment starts with the given `pattern`, strip it from the remaining fragment and return + /// true. Otherwise return false. 
+ fn consume(&mut self, pattern: &str) -> bool { + if let Some(stripped) = self.remaining.inner.strip_prefix(pattern) { + self.remaining.inner = stripped; + true + } else { + false } + } - // Use blocks heavily in this function as we don't want to re-use iterators or variables - // after we check them in most cases. + /// Remove a character from the start of the `remaining` [`Fragment`], return the character + /// consumed if there was a character available to consume. + fn consume_any(&mut self) -> Option { + // Make a character iterator. + let mut chars: Chars = self.remaining.chars(); + + if let Some(c) = chars.next() { + // Consumed a char, update the remaining fragment of this lexer. + let char_bytes: usize = c.len_utf8(); + // SAFETY: we know that this is not on a char boundary and does not exceed the length of the slice, + // since we just pulled it from a `Chars` iterator. + self.remaining.inner = unsafe { self.remaining.inner.get_unchecked(char_bytes..) }; + // Return the character. + Some(c) + } else { + // No characters available, return nothing. + None + } + } - // If there is whitespace at the start of the remaining fragment, strip it and re-run this - // function to get the next token. - { - let without_whitespace: &str = self.remaining.inner.trim_start(); + // /// Consume characters from the lexer until given pattern matches. Do not consume the pattern or + // /// any characters in it. This will consumed to the end of the lexer if the pattern is not found. + // fn consume_until(&mut self, pattern: &str) { + // while !self.remaining.is_empty() && !self.matches(pattern) { + // self.consume_any(); + // } + // } + + /// Attempt to read/handle a single line comment from the start of the + /// remaining fragment. If there's a doc-style single line comment, return a [`Token`], + /// otherwise return [`None`]. + /// + /// Generally I'm trying to follow the [rust comment spec] here. + /// + /// [rust comment spec]: https://doc.rust-lang.org/reference/comments.html + fn handle_single_line_comment(&mut self) -> Option> { + // Fork the lexer to attempt to consume a single line comment. + let mut fork: Self = self.fork(); + + // Try to consume the single line comment prefix from the fork. + if fork.consume(SINGLE_LINE_COMMENT_PREFIX) { + // We consumed it successfully, read through a newline or the end of the forked lexer if we get there. + + // First determine if this is a doc comment of some kind. + let is_inner_doc_comment: bool = fork.matches("/") && !fork.matches("//"); + let is_outer_doc_comment: bool = fork.matches("!"); - if !ptr::eq(without_whitespace, self.remaining.inner) { - self.remaining.inner = without_whitespace; - return self.next_token(); + // The consume until a newline, carraige return, or the end of the source fragment. + while !fork.remaining.is_empty() && !fork.matches("\r") && !fork.matches("\n") { + fork.consume_any(); } + + // Determine the kind of token to produce (if any). + let variant: Option; + + if is_inner_doc_comment { + variant = Some(TokenTy::InnerDocComment); + } + else if is_outer_doc_comment { + variant = Some(TokenTy::OuterDocComment); + } + else { + variant = None; + } + + // Map the variant to a token to return. + let token: Option = variant.map(|kind| { + // Get the number of bytes we have consumed using `offset_from`. + let bytes_consumed: usize = fork.remaining.offset_from(&self.remaining); + // Split this token from `self` rather than `fork` since self is still in an unmodified position. 
+ self.split_token(bytes_consumed, kind) + }); + + // Update this lexer to match the state of the forked lexer. + *self = fork; + // Consume any outstanding whitespace. + self.ignore_whitespace(); + // Return any token produced. + return token; } - // Discard any single-line comment at the start of this lexer and then re-run this function if there was one. - // Note that this will not detect/discard doc comments or multi-line comments. - { - if let Some(without_comment_prefix) = self.remaining.inner.strip_prefix("//") { - // If the next character is not a slash or exclamation, indicating a doc comment. - if !without_comment_prefix.starts_with(['/', '!']) { - // Get the number of bytes between the start of the comment and the newline, or end of file. - // Do not include bytes of whitespace at or past the newline -- those are handled above. - let line_bytes: usize = without_comment_prefix - // Make an iterator over the lines after this `//`. - .lines() - // Get only the first line. - .next() - // Map to the length of the line string. - .map(str::len) - // If there is no line after the start of this comment we have zero bytes to read. - .unwrap_or(0); - - // Split this number of bytes from the string and ignore them. - let (_, new_remaining) = without_comment_prefix.split_at(line_bytes); - // Put the split off string in a Fragment, and consider this fragment to be the - // remaining Fragment for this lexer. - self.remaining = Fragment { - inner: new_remaining, - }; - - // Restart this function. + // If there was no comment prefix, there is no comment immediately available. + None + } + + /// Get the next token from the lexer. + pub fn next_token(&mut self) -> Option> { + // Ignore any whitespace at the start of the lexer. + self.ignore_whitespace(); + + // Attempt to parse a single line comment. Return it if it's documentation. + // Rerun this function if there was a comment and it was ignored successfully. + let initial_lexer: Self = self.fork(); + match self.handle_single_line_comment() { + // There was a single line comment ignored or no single line comment. + None => { + // Check if the remaining fragment changed. + if !self.remaining.ptr_eq(&initial_lexer.remaining) { + // If so, re-run this function. return self.next_token(); } - } + + // If the lexer was unchanged, then there was no comment -- keep trying to match tokens. + }, + + // If there was some token, return it. + token => return token, + } + + // If the remaining input is empty, there is no token. + if self.remaining.is_empty() { + return None; } // Discard any multi-line comments we encounter, sparing doc comments. 
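[Illustration, not from the patch: the `Copy` lexer together with `Fragment::ptr_eq` and
`Fragment::offset_from` makes lexer progress cheap to observe. A minimal sketch using only the
public items introduced in this patch series; the input string is made up for the example.]

    let mut lexer = Lexer::new("// skipped comment\n+= rest");
    let before = lexer;                  // `Lexer` is `Copy`, so this is a free snapshot.
    let _plus_eq = lexer.next_token();   // Silently skips the comment, then lexes "+=".
    // Total bytes consumed so far, measured with the new `Fragment::offset_from`.
    assert_eq!(lexer.remaining.offset_from(&before.remaining), "// skipped comment\n+=".len());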
@@ -544,4 +642,11 @@ mod tests { assert_eq!(token.fragment.inner, "123_456_789"); assert_eq!(token.variant, TokenTy::IntegerLiteral); } + + #[test] + fn ignored_single_line_comment() { + let mut lexer = Lexer::new("// test comment "); + assert!(lexer.next_token().is_none()); + assert_eq!(lexer.remaining.len(), 0); + } } From 25b72007ff4c0fd0612c9f86b526acb5b6e59bf3 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Thu, 22 Feb 2024 23:36:00 -0500 Subject: [PATCH 32/60] cargo fmt --- wright/src/parser/fragment.rs | 22 ++++----- wright/src/parser/lexer.rs | 86 +++++++++++++++++------------------ 2 files changed, 52 insertions(+), 56 deletions(-) diff --git a/wright/src/parser/fragment.rs b/wright/src/parser/fragment.rs index ddc37006..2bf718a0 100644 --- a/wright/src/parser/fragment.rs +++ b/wright/src/parser/fragment.rs @@ -30,11 +30,11 @@ impl<'src> Fragment<'src> { (start, unsafe { start.add(self.len()) }) } - /// Return true if both of these [`Fragment`]s point to the exact same slice of source code. + /// Return true if both of these [`Fragment`]s point to the exact same slice of source code. pub fn ptr_eq(&self, other: &Self) -> bool { // Since std::ptr::eq works for fat pointers, we can use it here. std::ptr::eq(self.inner, other.inner) - } + } /// Return true if this fragment overlaps at all with the other (either one contains the start of the other, /// by pointer). @@ -71,8 +71,8 @@ impl<'src> Fragment<'src> { self.inner.chars() } - /// Get the number of bytes between the beginning of [`origin`] and the beginning of [`self`]. - /// + /// Get the number of bytes between the beginning of [`origin`] and the beginning of [`self`]. + /// /// # Panics: /// - Panics if [`self`] is not a fragment within [`origin`] according to [`Fragment::contains`]. pub fn offset_from(&self, origin: &Self) -> usize { @@ -85,13 +85,13 @@ impl<'src> Fragment<'src> { // Do the same for the subslice. let subslice_start: *const u8 = self.inner.as_ptr(); - // SAFETY: Since the subslice is contained (by pointer) by the origin slice, both of them - // necessarily satisfy the safety requirements of offset_from to be pointers to the same - // allocation. - // - // We can always cast to a usize since this should always be a positive offset, as long - // as the subslice is contained in the origin fragment. - unsafe { subslice_start.offset_from(start) as usize } + // SAFETY: Since the subslice is contained (by pointer) by the origin slice, both of them + // necessarily satisfy the safety requirements of offset_from to be pointers to the same + // allocation. + // + // We can always cast to a usize since this should always be a positive offset, as long + // as the subslice is contained in the origin fragment. + unsafe { subslice_start.offset_from(start) as usize } } } diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index 15b4b4a5..04296b81 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -164,7 +164,7 @@ pub const PREFIX_TABLE: [PrefixToToken; PREFIX_TABLE_ROWS] = { table }; -/// The pattern that begins any single line comments (including doc comments). +/// The pattern that begins any single line comments (including doc comments). pub const SINGLE_LINE_COMMENT_PREFIX: &str = "//"; /// The lexical analyser for wright. This produces a series of tokens that make up the larger elements of the language. 
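[Illustration, not from the patch: with the single-line comment handling above, plain `//`
comments are skipped entirely, while doc-style prefixes produce tokens under this lexer's
mapping (`//!` becomes `OuterDocComment`, `///` becomes `InnerDocComment`). A small sketch
with a made-up input:]

    let mut lexer = Lexer::new("// ignored\n//! documented\ncode");
    // The plain comment is skipped; the next token covers the whole "//! documented" line.
    let _doc_token = lexer.next_token().expect("`//!` should yield a doc-comment token");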
@@ -320,19 +320,19 @@ impl<'src> Lexer<'src> { } } - /// "Fork" this lexer, creating a new [`Lexer`] at the same position as this one that can be used for + /// "Fork" this lexer, creating a new [`Lexer`] at the same position as this one that can be used for /// failable parsing. This can be compared to the original lexer it was forked from using [Fragment::offset_from] - /// on the underlying `remaining` fragments. + /// on the underlying `remaining` fragments. fn fork(&self) -> Self { *self } - /// Remove and ignore any whitespace at the start of the remaining fragment. + /// Remove and ignore any whitespace at the start of the remaining fragment. fn ignore_whitespace(&mut self) { - // Get a reference to the slice of the string past any whitespace at the start. + // Get a reference to the slice of the string past any whitespace at the start. let without_whitespace: &str = self.remaining.inner.trim_start(); - // If the references aren't equal, update the remaining fragment. + // If the references aren't equal, update the remaining fragment. if !ptr::eq(without_whitespace, self.remaining.inner) { self.remaining.inner = without_whitespace; } @@ -343,8 +343,8 @@ impl<'src> Lexer<'src> { self.remaining.inner.starts_with(pattern) } - /// If the remaining fragment starts with the given `pattern`, strip it from the remaining fragment and return - /// true. Otherwise return false. + /// If the remaining fragment starts with the given `pattern`, strip it from the remaining fragment and return + /// true. Otherwise return false. fn consume(&mut self, pattern: &str) -> bool { if let Some(stripped) = self.remaining.inner.strip_prefix(pattern) { self.remaining.inner = stripped; @@ -354,88 +354,86 @@ impl<'src> Lexer<'src> { } } - /// Remove a character from the start of the `remaining` [`Fragment`], return the character - /// consumed if there was a character available to consume. + /// Remove a character from the start of the `remaining` [`Fragment`], return the character + /// consumed if there was a character available to consume. fn consume_any(&mut self) -> Option { - // Make a character iterator. + // Make a character iterator. let mut chars: Chars = self.remaining.chars(); if let Some(c) = chars.next() { // Consumed a char, update the remaining fragment of this lexer. let char_bytes: usize = c.len_utf8(); // SAFETY: we know that this is not on a char boundary and does not exceed the length of the slice, - // since we just pulled it from a `Chars` iterator. + // since we just pulled it from a `Chars` iterator. self.remaining.inner = unsafe { self.remaining.inner.get_unchecked(char_bytes..) }; - // Return the character. + // Return the character. Some(c) } else { - // No characters available, return nothing. + // No characters available, return nothing. None } } // /// Consume characters from the lexer until given pattern matches. Do not consume the pattern or - // /// any characters in it. This will consumed to the end of the lexer if the pattern is not found. + // /// any characters in it. This will consumed to the end of the lexer if the pattern is not found. // fn consume_until(&mut self, pattern: &str) { // while !self.remaining.is_empty() && !self.matches(pattern) { // self.consume_any(); // } // } - /// Attempt to read/handle a single line comment from the start of the + /// Attempt to read/handle a single line comment from the start of the /// remaining fragment. If there's a doc-style single line comment, return a [`Token`], - /// otherwise return [`None`]. 
- /// + /// otherwise return [`None`]. + /// /// Generally I'm trying to follow the [rust comment spec] here. /// - /// [rust comment spec]: https://doc.rust-lang.org/reference/comments.html + /// [rust comment spec]: https://doc.rust-lang.org/reference/comments.html fn handle_single_line_comment(&mut self) -> Option> { - // Fork the lexer to attempt to consume a single line comment. + // Fork the lexer to attempt to consume a single line comment. let mut fork: Self = self.fork(); - // Try to consume the single line comment prefix from the fork. + // Try to consume the single line comment prefix from the fork. if fork.consume(SINGLE_LINE_COMMENT_PREFIX) { - // We consumed it successfully, read through a newline or the end of the forked lexer if we get there. - - // First determine if this is a doc comment of some kind. + // We consumed it successfully, read through a newline or the end of the forked lexer if we get there. + + // First determine if this is a doc comment of some kind. let is_inner_doc_comment: bool = fork.matches("/") && !fork.matches("//"); let is_outer_doc_comment: bool = fork.matches("!"); - // The consume until a newline, carraige return, or the end of the source fragment. + // The consume until a newline, carraige return, or the end of the source fragment. while !fork.remaining.is_empty() && !fork.matches("\r") && !fork.matches("\n") { fork.consume_any(); } // Determine the kind of token to produce (if any). - let variant: Option; - - if is_inner_doc_comment { + let variant: Option; + + if is_inner_doc_comment { variant = Some(TokenTy::InnerDocComment); - } - else if is_outer_doc_comment { + } else if is_outer_doc_comment { variant = Some(TokenTy::OuterDocComment); - } - else { + } else { variant = None; } - // Map the variant to a token to return. + // Map the variant to a token to return. let token: Option = variant.map(|kind| { - // Get the number of bytes we have consumed using `offset_from`. + // Get the number of bytes we have consumed using `offset_from`. let bytes_consumed: usize = fork.remaining.offset_from(&self.remaining); // Split this token from `self` rather than `fork` since self is still in an unmodified position. self.split_token(bytes_consumed, kind) }); - // Update this lexer to match the state of the forked lexer. + // Update this lexer to match the state of the forked lexer. *self = fork; - // Consume any outstanding whitespace. + // Consume any outstanding whitespace. self.ignore_whitespace(); // Return any token produced. return token; } - // If there was no comment prefix, there is no comment immediately available. + // If there was no comment prefix, there is no comment immediately available. None } @@ -448,18 +446,18 @@ impl<'src> Lexer<'src> { // Rerun this function if there was a comment and it was ignored successfully. let initial_lexer: Self = self.fork(); match self.handle_single_line_comment() { - // There was a single line comment ignored or no single line comment. + // There was a single line comment ignored or no single line comment. None => { // Check if the remaining fragment changed. if !self.remaining.ptr_eq(&initial_lexer.remaining) { - // If so, re-run this function. + // If so, re-run this function. return self.next_token(); } - + // If the lexer was unchanged, then there was no comment -- keep trying to match tokens. - }, + } - // If there was some token, return it. + // If there was some token, return it. 
token => return token, } @@ -469,9 +467,7 @@ impl<'src> Lexer<'src> { } // Discard any multi-line comments we encounter, sparing doc comments. - { - - } + {} // To attempt to match a token from the prefix table, make a char iterator // and get two chars from it to test equality. None of the tokens start with a From 1267d35664c72d7832caa643d62e1975059aca41 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Fri, 23 Feb 2024 23:56:50 -0500 Subject: [PATCH 33/60] Multi-line comment handling --- wright/src/bin/wright.rs | 11 ++- wright/src/parser/fragment.rs | 22 +++++- wright/src/parser/lexer.rs | 133 ++++++++++++++++++++++++++++------ 3 files changed, 136 insertions(+), 30 deletions(-) diff --git a/wright/src/bin/wright.rs b/wright/src/bin/wright.rs index a445a556..ab022fda 100644 --- a/wright/src/bin/wright.rs +++ b/wright/src/bin/wright.rs @@ -44,10 +44,10 @@ enum DebugCommands { /// A file of wright source code. file: PathBuf, - /// Pretty print the source code with the tokens lined under them. - /// If not used, a list of tokens will be printed with their metadata. - #[arg(short, long)] - pretty: bool, + // /// Pretty print the source code with the tokens lined under them. + // /// If not used, a list of tokens will be printed with their metadata. + // #[arg(short, long)] + // pretty: bool, }, } @@ -68,7 +68,6 @@ fn main() -> Result<()> { command: DebugCommands::Tokens { file, - pretty: false, }, }) => { let mut file_map: FileMap = FileMap::new(); @@ -78,7 +77,7 @@ fn main() -> Result<()> { // Use unwrap here, since we know we just added the file. let lexer: Lexer = Lexer::new(file_map.source(file_id).unwrap()); // Get all the tokens from the lexer and print them each. - lexer.for_each(|token: Token| println!("{token:?}")); + lexer.for_each(|token: Token| println!("{token}")); } _ => unimplemented!(), diff --git a/wright/src/parser/fragment.rs b/wright/src/parser/fragment.rs index 2bf718a0..a49561d8 100644 --- a/wright/src/parser/fragment.rs +++ b/wright/src/parser/fragment.rs @@ -22,7 +22,7 @@ impl<'src> Fragment<'src> { /// Get a pair of pointers, the first one being at the beginning of the fragment, the second one pointing /// to the byte after the end of the fragment. - const fn start_and_end(&self) -> (*const u8, *const u8) { + pub const fn start_and_end(&self) -> (*const u8, *const u8) { // Get the pointer to the start of the fragment. let start: *const u8 = self.inner.as_ptr(); // Get a pointer just past the end of the string. @@ -74,7 +74,7 @@ impl<'src> Fragment<'src> { /// Get the number of bytes between the beginning of [`origin`] and the beginning of [`self`]. /// /// # Panics: - /// - Panics if [`self`] is not a fragment within [`origin`] according to [`Fragment::contains`]. + /// - Panics if [`self`] is not a fragment within `origin` according to [`Fragment::contains`]. 
pub fn offset_from(&self, origin: &Self) -> usize { if !origin.contains(self) { panic!("This fragment must be contained in the original fragment"); @@ -130,4 +130,22 @@ mod tests { assert_eq!(left.inner, "+"); assert_eq!(right.inner, ""); } + + #[test] + fn test_offset_from() { + let a = Fragment { inner: "abcde" }; + let (b, c) = a.split(2); + assert_eq!(b.offset_from(&a), 0); + assert_eq!(c.offset_from(&a), 2); + } + + #[test] + #[should_panic] + fn test_offset_panics() { + let a = Fragment { inner: "abc" }; + let b = Fragment { inner: "def" }; + a.offset_from(&b); + } + + } diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index 04296b81..337885d2 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -167,6 +167,12 @@ pub const PREFIX_TABLE: [PrefixToToken; PREFIX_TABLE_ROWS] = { /// The pattern that begins any single line comments (including doc comments). pub const SINGLE_LINE_COMMENT_PREFIX: &str = "//"; +/// The pattern that starts any multi-line comments (including doc comments). +pub const MULTI_LINE_COMMENT_START: &str = "/*"; + +/// The pattern that ends any multi-line comments (including doc comments). +pub const MULTI_LINE_COMMENT_END: &str = "*/"; + /// The lexical analyser for wright. This produces a series of tokens that make up the larger elements of the language. #[derive(Debug, Clone, Copy)] pub struct Lexer<'src> { @@ -224,6 +230,10 @@ pub enum TokenTy { OuterDocComment, OuterBlockDocComment, InnerDocComment, InnerBlockDocComment, + + /// Indicates a block style comment without termination. + UnterminatedBlockComment, + KwRecord, KwType, @@ -374,14 +384,6 @@ impl<'src> Lexer<'src> { } } - // /// Consume characters from the lexer until given pattern matches. Do not consume the pattern or - // /// any characters in it. This will consumed to the end of the lexer if the pattern is not found. - // fn consume_until(&mut self, pattern: &str) { - // while !self.remaining.is_empty() && !self.matches(pattern) { - // self.consume_any(); - // } - // } - /// Attempt to read/handle a single line comment from the start of the /// remaining fragment. If there's a doc-style single line comment, return a [`Token`], /// otherwise return [`None`]. @@ -398,8 +400,8 @@ impl<'src> Lexer<'src> { // We consumed it successfully, read through a newline or the end of the forked lexer if we get there. // First determine if this is a doc comment of some kind. - let is_inner_doc_comment: bool = fork.matches("/") && !fork.matches("//"); - let is_outer_doc_comment: bool = fork.matches("!"); + let is_inner_doc: bool = fork.matches("/") && !fork.matches("//"); + let is_outer_doc: bool = fork.matches("!"); // The consume until a newline, carraige return, or the end of the source fragment. while !fork.remaining.is_empty() && !fork.matches("\r") && !fork.matches("\n") { @@ -407,15 +409,12 @@ impl<'src> Lexer<'src> { } // Determine the kind of token to produce (if any). - let variant: Option; - - if is_inner_doc_comment { - variant = Some(TokenTy::InnerDocComment); - } else if is_outer_doc_comment { - variant = Some(TokenTy::OuterDocComment); - } else { - variant = None; - } + let variant: Option = match (is_inner_doc, is_outer_doc) { + (true, false) => Some(TokenTy::InnerDocComment), + (false, true) => Some(TokenTy::OuterDocComment), + (false, false) => None, + (true, true) => unreachable!("Lexer should not match multiple comment types at once."), + }; // Map the variant to a token to return. 
let token: Option = variant.map(|kind| { @@ -437,14 +436,92 @@ impl<'src> Lexer<'src> { None } + /// Attempt to read/consume a multi-line comment from the start of the `remaining` fragment. + fn handle_multi_line_comment(&mut self) -> Option> { + // Handle corner cases here so we don't have to below. + // These are both considered empty non-documenting comments. + if self.consume("/***/") { + return None; + } + + if self.consume("/**/") { + return None; + } + + // Make a fork of the lexer to avoid modifying this lexer if we fail to parse. + let mut fork: Self = self.fork(); + + // Try to parse the start of a multi-line comment. + if fork.consume(MULTI_LINE_COMMENT_START) { + // Check if this is a doc comment. + let is_outer_doc: bool = fork.matches("!"); + // Use this to indicate that more than one following asterix is not a doc comment. + let is_inner_doc: bool = fork.matches("*") && !fork.matches("**"); + + // Consume until we see the end of the doc comment. If we run out of characters, consider the + // comment unterminated. + while !fork.matches(MULTI_LINE_COMMENT_END) { + // Handle nested comments here: + if fork.matches(MULTI_LINE_COMMENT_START) { + // Discard the output -- don't care about doc comments in other comments. + fork.handle_multi_line_comment(); + continue; + } + + // Handle unterminated comments here. + if fork.remaining.is_empty() { + // If we have not hit a "*/" before the end of the input, return an unterminated block comment. + let bytes_consumed: usize = fork.remaining.offset_from(&self.remaining); + // Split the token and return it. + return Some(self.split_token(bytes_consumed, TokenTy::UnterminatedBlockComment)); + } + + // If there's still input, and not a nested comment, consume it. + fork.consume_any(); + } + + // If we get here, the comment was terminated. Consume the terminating characters, and return. + // Use debug assert here to make sure that the comment is actually terminated. + debug_assert!(fork.consume(MULTI_LINE_COMMENT_END), "comment is actually terminated"); + + // Determine the kind of token to produce (if any). + let variant: Option = match (is_inner_doc, is_outer_doc) { + (true, false) => Some(TokenTy::InnerBlockDocComment), + (false, true) => Some(TokenTy::OuterBlockDocComment), + (false, false) => None, + (true, true) => unreachable!("Lexer should not match multiple comment types at once."), + }; + + // Make the token to return. + let token: Option = variant.map(|kind| { + // Get the number of bytes we have consumed using `offset_from`. + let bytes_consumed: usize = fork.remaining.offset_from(&self.remaining); + // Split this token from `self` rather than `fork` since self is still in an unmodified position. + self.split_token(bytes_consumed, kind) + }); + + // Update this lexer to match the state of the fork. + *self = fork; + // Return token if there was one. + return token; + } + + // If the fork did not consume a multi-line comment start, return None and do + // not update this lexer. + None + } + /// Get the next token from the lexer. pub fn next_token(&mut self) -> Option> { // Ignore any whitespace at the start of the lexer. self.ignore_whitespace(); + // Grab a copy of the initial lexer to compare and check when progress has been made. + let initial_lexer: Self = self.fork(); + // Attempt to parse a single line comment. Return it if it's documentation. // Rerun this function if there was a comment and it was ignored successfully. 
- let initial_lexer: Self = self.fork(); + match self.handle_single_line_comment() { // There was a single line comment ignored or no single line comment. None => { @@ -466,8 +543,20 @@ impl<'src> Lexer<'src> { return None; } - // Discard any multi-line comments we encounter, sparing doc comments. - {} + // Try to handle a multi-line comment if there is one. + match self.handle_multi_line_comment() { + // There was an ignored comment or no comment. + None => { + // If the lexer was changed, restart this function. + if !self.remaining.ptr_eq(&initial_lexer.remaining) { + return self.next_token(); + } + } + + // If there was a block style doc-comment, or an unterminated multi-line comment + // return. + token => return token, + } // To attempt to match a token from the prefix table, make a char iterator // and get two chars from it to test equality. None of the tokens start with a From c92b3cb2f1c6929fad3c1c30c54e6c627d845bbc Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 24 Feb 2024 00:04:46 -0500 Subject: [PATCH 34/60] Add block-style doc-comment benchmark --- wright/benches/lexer.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/wright/benches/lexer.rs b/wright/benches/lexer.rs index 35f6a8ee..3ec4dfc8 100644 --- a/wright/benches/lexer.rs +++ b/wright/benches/lexer.rs @@ -9,7 +9,7 @@ fn bench_symbol_tokens(c: &mut Criterion) { // Function to make a lexer and get a token from it. fn make_lexer_and_get_token(b: &mut Bencher, input: &str) { - b.iter(|| Lexer::new(black_box(input)).next_token()); + b.iter(|| black_box(Lexer::new(input).next_token())); } let inputs = ["+", "+=", "*", "@", "?"]; @@ -19,5 +19,13 @@ fn bench_symbol_tokens(c: &mut Criterion) { } } -criterion_group!(benches, bench_symbol_tokens); +fn bench_block_doc_comment(c: &mut Criterion) { + c.bench_function("lexer block style doc comment", move |b: &mut Bencher| { + b.iter(move || { + black_box(Lexer::new("/*! \n this is a block-style comment \n\n */").next_token()) + }); + }); +} + +criterion_group!(benches, bench_symbol_tokens, bench_block_doc_comment); criterion_main!(benches); From ec2b1f9764efcab5ae7f4daf0674d54e99215e63 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 24 Feb 2024 00:59:28 -0500 Subject: [PATCH 35/60] Simplify lexing of trivial tokens --- wright/Cargo.toml | 12 +- wright/src/parser/lexer.rs | 269 ++++++++++++------------------------- 2 files changed, 88 insertions(+), 193 deletions(-) diff --git a/wright/Cargo.toml b/wright/Cargo.toml index 225dcd5c..8f19ac51 100644 --- a/wright/Cargo.toml +++ b/wright/Cargo.toml @@ -87,16 +87,12 @@ features = ["strict-versioning", "force-static"] version = "0.3" features = ["llvm17-0"] -# # Fast parsing for integers and floats from source code. -# [dependencies.lexical-core] -# version = "0.8" - # TEST DEPENDENCIES -# Rayon to speed up brute-force testing in some cases. -[dev-dependencies.rayon] -version = "1.8.0" - # Criterion is used for benchmarking. [dev-dependencies.criterion] version = "0.5.1" + +# Rayon is used to do various brute-force tests in parallel +[dev-dependencies.rayon] +version = "1.8.0" diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index 337885d2..5d8e4ac7 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -4,166 +4,71 @@ //! defined for tokens. 
use super::fragment::Fragment; -use derive_more::Display; use std::iter::FusedIterator; use std::str::Chars; use std::{iter::Peekable, ptr}; +use derive_more::Display; use unicode_ident::{is_xid_continue, is_xid_start}; -/// Constant table of single character tokens and the characters that match them. -pub const SINGLE_CHAR_TOKENS: &[(char, TokenTy)] = &[ - ('(', TokenTy::LeftParen), - (')', TokenTy::RightParen), - ('[', TokenTy::LeftBracket), - (']', TokenTy::RightBracket), - ('{', TokenTy::LeftCurly), - ('}', TokenTy::RightCurly), - ('@', TokenTy::At), - (';', TokenTy::Semi), - ('?', TokenTy::Question), - (',', TokenTy::Comma), - ('#', TokenTy::Hash), - ('$', TokenTy::Dollar), -]; - -/// Tokens that can be either a single character or upgraded with an -/// equals sign. -pub const POSSIBLE_EQ_UPGRADE_TOKENS: &[(char, TokenTy, TokenTy)] = &[ - ('!', TokenTy::Bang, TokenTy::BangEq), - ('%', TokenTy::Mod, TokenTy::ModEq), - ('^', TokenTy::Xor, TokenTy::XorEq), - ('*', TokenTy::Star, TokenTy::StarEq), - ('+', TokenTy::Plus, TokenTy::PlusEq), - ('/', TokenTy::Div, TokenTy::DivEq), -]; - -/// Characters that can produce different tokens when followed by an equals sign or themselves. -pub const POSSIBLE_EQ_OR_DOUBLED_UPGRADE_TOKENS: &[(char, TokenTy, TokenTy, TokenTy)] = &[ - ('&', TokenTy::And, TokenTy::AndEq, TokenTy::AndAnd), - ('|', TokenTy::Or, TokenTy::OrEq, TokenTy::OrOr), - ('<', TokenTy::Lt, TokenTy::LtEq, TokenTy::LtLt), - ('>', TokenTy::Gt, TokenTy::GtEq, TokenTy::GtGt), - (':', TokenTy::Colon, TokenTy::ColonEq, TokenTy::ColonColon), +/// Trivial tokens that are two ASCII characters and can be matched directly +/// against the input source code. +pub const TWO_ASCII_TRIVIAL_TOKENS: &[(&[u8; 2], TokenTy)] = &[ + (b"->", TokenTy::SingleArrow), + (b"-=", TokenTy::MinusEq), + + (b"=>", TokenTy::DoubleArrow), + (b"==", TokenTy::EqEq), + + (b"&&", TokenTy::AndAnd), + (b"||", TokenTy::OrOr), + (b"<<", TokenTy::LtLt), + (b">>", TokenTy::GtGt), + (b"::", TokenTy::ColonColon), + + (b"|=", TokenTy::OrEq), + (b"&=", TokenTy::AndEq), + (b":=", TokenTy::ColonEq), + (b">=", TokenTy::GtEq), + (b"<=", TokenTy::LtEq), + (b"!=", TokenTy::BangEq), + (b"%=", TokenTy::ModEq), + (b"^=", TokenTy::XorEq), + (b"*=", TokenTy::StarEq), + (b"+=", TokenTy::PlusEq), + (b"/=", TokenTy::DivEq), ]; -/// Characters that can produce different tokens when followed by an equals sign or -/// a `>` for arrows. -pub const POSSIBLE_EQ_OR_ARROW_UPGRADE_TOKENS: &[(char, TokenTy, TokenTy, TokenTy)] = &[ - ('-', TokenTy::Minus, TokenTy::MinusEq, TokenTy::SingleArrow), - ('=', TokenTy::Eq, TokenTy::EqEq, TokenTy::DoubleArrow), +/// Single ASCII character trivial tokens that can be matched directly against +/// the source code. +pub const SINGLE_ASCII_CHAR_TRIVIAL_TOKENS: &[(u8, TokenTy)] = &[ + (b'(', TokenTy::LeftParen), + (b')', TokenTy::RightParen), + (b'[', TokenTy::LeftBracket), + (b']', TokenTy::RightBracket), + (b'{', TokenTy::LeftCurly), + (b'}', TokenTy::RightCurly), + (b'@', TokenTy::At), + (b';', TokenTy::Semi), + (b'?', TokenTy::Question), + (b',', TokenTy::Comma), + (b'#', TokenTy::Hash), + (b'$', TokenTy::Dollar), + + (b'>', TokenTy::Gt), + (b'<', TokenTy::Lt), + (b'-', TokenTy::Minus), + (b':', TokenTy::Colon), + (b'!', TokenTy::Bang), + (b'=', TokenTy::Eq), + (b'&', TokenTy::And), + (b'|', TokenTy::Or), + (b'/', TokenTy::Div), + (b'+', TokenTy::Plus), + (b'^', TokenTy::Xor), + (b'*', TokenTy::Star), + (b'%', TokenTy::Mod), ]; -/// The number of rows of the generated prefix table. 
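// NOTE: when lexing, the two-byte table above must be consulted before the single-byte one, so
// that e.g. ">=" becomes a single `GtEq` token instead of `Gt` followed by `Eq`; the greedy
// matching loops later in this patch rely on that ordering.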
-pub const PREFIX_TABLE_ROWS: usize = { - SINGLE_CHAR_TOKENS.len() - + 2 * POSSIBLE_EQ_UPGRADE_TOKENS.len() - + 3 * POSSIBLE_EQ_OR_DOUBLED_UPGRADE_TOKENS.len() - + 3 * POSSIBLE_EQ_OR_ARROW_UPGRADE_TOKENS.len() -}; - -/// A relationship between a prefix and the token that should be generated when that prefix matches. -#[derive(Copy, Clone, Debug)] -pub struct PrefixToToken { - /// An array of two chars. In single char tokens, the second one should be a null character (`'\0'`). - /// the char_length field will be used to slice this buffer to get the actual prefix. - pub char_buffer: [char; 2], - /// The byte length of this prefix and all generated tokens by this prefix. - pub byte_len: usize, - /// The kind of [Token] generated when this prefix matches. - pub kind: TokenTy, -} - -impl PrefixToToken { - /// Convenience function to construct a [`PrefixToToken`] by calculating the length of both chars - /// (and ignoring the second one if it's null). - pub const fn new(chars: [char; 2], kind: TokenTy) -> Self { - PrefixToToken { - char_buffer: chars, - - byte_len: if chars[1] == '\0' { - chars[0].len_utf8() - } else { - chars[0].len_utf8() + chars[1].len_utf8() - }, - - kind, - } - } -} - -/// A full table generated at compile time using all the token tables -/// ([SINGLE_CHAR_TOKENS], [POSSIBLE_EQ_UPGRADE_TOKENS], [POSSIBLE_EQ_OR_DOUBLED_UPGRADE_TOKENS], -/// [POSSIBLE_EQ_OR_ARROW_UPGRADE_TOKENS]). -/// -/// This table can be iterated on in order when trying to match a token at the start of a fragment of source code. -#[rustfmt::skip] -pub const PREFIX_TABLE: [PrefixToToken; PREFIX_TABLE_ROWS] = { - // Make a mutable table with dummy values to replace with actual values. - let mut table: [PrefixToToken; PREFIX_TABLE_ROWS] = - [PrefixToToken { char_buffer: ['\0'; 2], byte_len: 0, kind: TokenTy::Unknown }; PREFIX_TABLE_ROWS]; - - // Current index to insert into table at. - let mut write_index: usize = 0; - - // Index used for reading from various tables. - let mut read_index: usize = 0; - - // Iterate first over all the single char tokens. - while read_index < SINGLE_CHAR_TOKENS.len() { - // Get row from source table. - let (c, token_kind) = SINGLE_CHAR_TOKENS[read_index]; - - // Put row in destination table. - table[write_index] = PrefixToToken::new([c, '\0'], token_kind); - - // Increment both indices. - read_index += 1; - write_index += 1; - } - - // Then do all the tokens that can be upgraded with an equals sign. - // Add the row for the token with the equals sign first so that when we iterate over this table in order, - // the version without the equals sign does not match prematurely. - read_index = 0; - while read_index < POSSIBLE_EQ_UPGRADE_TOKENS.len() { - let (c, without_eq, with_eq) = POSSIBLE_EQ_UPGRADE_TOKENS[read_index]; - - table[write_index] = PrefixToToken::new([c, '='], with_eq); - table[write_index + 1] = PrefixToToken::new([c, '\0'], without_eq); - - read_index += 1; - write_index += 2; - } - - // Do the same for the tokens that can be upgraded with an equals sign or doubled. - read_index = 0; - while read_index < POSSIBLE_EQ_OR_DOUBLED_UPGRADE_TOKENS.len() { - let (c, without_eq, with_eq, doubled) = POSSIBLE_EQ_OR_DOUBLED_UPGRADE_TOKENS[read_index]; - - table[write_index] = PrefixToToken::new([c, c], doubled); - table[write_index + 1] = PrefixToToken::new([c, '='], with_eq); - table[write_index + 2] = PrefixToToken::new([c, '\0'], without_eq); - - read_index += 1; - write_index += 3; - } - - // Do the same for possible eq or arrow upgrades. 
- read_index = 0; - while read_index < POSSIBLE_EQ_OR_ARROW_UPGRADE_TOKENS.len() { - let (c, without_eq, with_eq, with_arrow) = POSSIBLE_EQ_OR_ARROW_UPGRADE_TOKENS[read_index]; - - table[write_index] = PrefixToToken::new([c, '>'], with_arrow); - table[write_index + 1] = PrefixToToken::new([c, '='], with_eq); - table[write_index + 2] = PrefixToToken::new([c, '\0'], without_eq); - - read_index += 1; - write_index += 3; - } - - table -}; - /// The pattern that begins any single line comments (including doc comments). pub const SINGLE_LINE_COMMENT_PREFIX: &str = "//"; @@ -516,12 +421,16 @@ impl<'src> Lexer<'src> { // Ignore any whitespace at the start of the lexer. self.ignore_whitespace(); + // If the remaining input is empty, there is no token. + if self.remaining.is_empty() { + return None; + } + // Grab a copy of the initial lexer to compare and check when progress has been made. let initial_lexer: Self = self.fork(); // Attempt to parse a single line comment. Return it if it's documentation. // Rerun this function if there was a comment and it was ignored successfully. - match self.handle_single_line_comment() { // There was a single line comment ignored or no single line comment. None => { @@ -538,11 +447,6 @@ impl<'src> Lexer<'src> { token => return token, } - // If the remaining input is empty, there is no token. - if self.remaining.is_empty() { - return None; - } - // Try to handle a multi-line comment if there is one. match self.handle_multi_line_comment() { // There was an ignored comment or no comment. @@ -558,28 +462,31 @@ impl<'src> Lexer<'src> { token => return token, } - // To attempt to match a token from the prefix table, make a char iterator - // and get two chars from it to test equality. None of the tokens start with a - // null character so use that as a single of an unavailable char. - { - let mut char_iter: Chars = self.remaining.chars(); - let char_array: [char; 2] = [ - // Unchecked unwrap here since we know there's at least one char. - unsafe { char_iter.next().unwrap_unchecked() }, - char_iter.next().unwrap_or('\0'), - ]; - - // Next iterate through the prefix table to try to get any tokens that are covered there. - for prefix_meta in PREFIX_TABLE.iter() { - // If it's a single char comparison, only compare the first chars. - if prefix_meta.char_buffer[1] == '\0' && prefix_meta.char_buffer[0] == char_array[0] - { - return Some(self.split_token(prefix_meta.byte_len, prefix_meta.kind)); + // Do all trivial matching after matching comments to avoid matching "/" for "//". + + // Attempt to match any two-byte ASCII trivial tokens. + // This must be done before single-ascii byte tokens since matching is greedy. + if self.remaining.len() >= 2 { + // Get the first two bytes of the remaining fragment. + // SAFETY: We just checked length. + let bytes: &[u8] = unsafe { self.remaining.inner.as_bytes().get_unchecked(0..2) }; + // Match against each possible token pattern. + for (pattern, kind) in TWO_ASCII_TRIVIAL_TOKENS { + if bytes == *pattern { + return Some(self.split_token(2, *kind)); } + } + } + + // Do the same for single byte patterns. + { + // We can assume there is at least one more byte since we check above if the fragment + // is empty and return early if not. + let byte: &u8 = unsafe { self.remaining.inner.as_bytes().get_unchecked(0) }; - // Otherwise compare the whole slices. 
- if prefix_meta.char_buffer == char_array { - return Some(self.split_token(prefix_meta.byte_len, prefix_meta.kind)); + for (pattern, kind) in SINGLE_ASCII_CHAR_TRIVIAL_TOKENS { + if byte == pattern { + return Some(self.split_token(1, *kind)); } } } @@ -677,16 +584,8 @@ impl<'src> FusedIterator for Lexer<'src> {} #[cfg(test)] mod tests { use super::Lexer; - use super::PREFIX_TABLE; use crate::parser::lexer::TokenTy; - #[test] - #[ignore = "this test is just used for debugging the prefix table"] - /// Run this with `cargo test manual_debug_prefix_table -- --nocapture --ignored`. - fn manual_debug_prefix_table() { - dbg!(PREFIX_TABLE); - } - #[test] fn plus_and_plus_eq_tokens() { let mut plus = Lexer::new("+"); From f9b9a9892e9b8dce9580caa1da27a72f9f60f49e Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 24 Feb 2024 01:03:20 -0500 Subject: [PATCH 36/60] Docs --- wright/src/parser/fragment.rs | 3 ++- wright/src/parser/lexer.rs | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/wright/src/parser/fragment.rs b/wright/src/parser/fragment.rs index a49561d8..3afd68d7 100644 --- a/wright/src/parser/fragment.rs +++ b/wright/src/parser/fragment.rs @@ -58,7 +58,8 @@ impl<'src> Fragment<'src> { /// Split this fragment into two sub fragments, with the first one being `bytes` long and the second containing the /// rest of this fragment. /// - /// Panics if the byte index is not in the fragment, or if it's on a char boundary. + /// # Panics: + /// - Panics if the byte index is not in the fragment, or if it's on a char boundary. pub fn split(&self, bytes: usize) -> (Self, Self) { // Use str's split_at. let (left, right) = self.inner.split_at(bytes); diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index 5d8e4ac7..f5c0b035 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -53,7 +53,6 @@ pub const SINGLE_ASCII_CHAR_TRIVIAL_TOKENS: &[(u8, TokenTy)] = &[ (b',', TokenTy::Comma), (b'#', TokenTy::Hash), (b'$', TokenTy::Dollar), - (b'>', TokenTy::Gt), (b'<', TokenTy::Lt), (b'-', TokenTy::Minus), @@ -225,6 +224,9 @@ impl<'src> Lexer<'src> { /// Make a token by splitting a given number of bytes off of the `self.remaining` fragment /// and labeling them with the given kind. + /// + /// # Panics: + /// - Panics if the number of bytes lands out of bounds or in the middle of a character. fn split_token(&mut self, bytes: usize, kind: TokenTy) -> Token<'src> { let (token_fragment, new_remaining_fragment) = self.remaining.split(bytes); self.remaining = new_remaining_fragment; From 13807c78dac2235d073c0f4927705ede82287922 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 24 Feb 2024 01:04:15 -0500 Subject: [PATCH 37/60] rustfmt --- wright/src/bin/wright.rs | 6 +-- wright/src/parser/fragment.rs | 2 - wright/src/parser/lexer.rs | 91 ++++++++++++++++++----------------- 3 files changed, 48 insertions(+), 51 deletions(-) diff --git a/wright/src/bin/wright.rs b/wright/src/bin/wright.rs index ab022fda..cde9308b 100644 --- a/wright/src/bin/wright.rs +++ b/wright/src/bin/wright.rs @@ -43,7 +43,6 @@ enum DebugCommands { Tokens { /// A file of wright source code. file: PathBuf, - // /// Pretty print the source code with the tokens lined under them. // /// If not used, a list of tokens will be printed with their metadata. // #[arg(short, long)] @@ -65,10 +64,7 @@ fn main() -> Result<()> { // Print all the tokens for a given file. 
Some(Commands::Debug { - command: - DebugCommands::Tokens { - file, - }, + command: DebugCommands::Tokens { file }, }) => { let mut file_map: FileMap = FileMap::new(); // Add the given file to the file map. diff --git a/wright/src/parser/fragment.rs b/wright/src/parser/fragment.rs index 3afd68d7..fbec7851 100644 --- a/wright/src/parser/fragment.rs +++ b/wright/src/parser/fragment.rs @@ -147,6 +147,4 @@ mod tests { let b = Fragment { inner: "def" }; a.offset_from(&b); } - - } diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index f5c0b035..edd65cb6 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -4,27 +4,24 @@ //! defined for tokens. use super::fragment::Fragment; +use derive_more::Display; use std::iter::FusedIterator; use std::str::Chars; use std::{iter::Peekable, ptr}; -use derive_more::Display; use unicode_ident::{is_xid_continue, is_xid_start}; -/// Trivial tokens that are two ASCII characters and can be matched directly -/// against the input source code. +/// Trivial tokens that are two ASCII characters and can be matched directly +/// against the input source code. pub const TWO_ASCII_TRIVIAL_TOKENS: &[(&[u8; 2], TokenTy)] = &[ (b"->", TokenTy::SingleArrow), (b"-=", TokenTy::MinusEq), - (b"=>", TokenTy::DoubleArrow), (b"==", TokenTy::EqEq), - (b"&&", TokenTy::AndAnd), (b"||", TokenTy::OrOr), (b"<<", TokenTy::LtLt), (b">>", TokenTy::GtGt), (b"::", TokenTy::ColonColon), - (b"|=", TokenTy::OrEq), (b"&=", TokenTy::AndEq), (b":=", TokenTy::ColonEq), @@ -38,8 +35,8 @@ pub const TWO_ASCII_TRIVIAL_TOKENS: &[(&[u8; 2], TokenTy)] = &[ (b"/=", TokenTy::DivEq), ]; -/// Single ASCII character trivial tokens that can be matched directly against -/// the source code. +/// Single ASCII character trivial tokens that can be matched directly against +/// the source code. pub const SINGLE_ASCII_CHAR_TRIVIAL_TOKENS: &[(u8, TokenTy)] = &[ (b'(', TokenTy::LeftParen), (b')', TokenTy::RightParen), @@ -224,9 +221,9 @@ impl<'src> Lexer<'src> { /// Make a token by splitting a given number of bytes off of the `self.remaining` fragment /// and labeling them with the given kind. - /// + /// /// # Panics: - /// - Panics if the number of bytes lands out of bounds or in the middle of a character. + /// - Panics if the number of bytes lands out of bounds or in the middle of a character. fn split_token(&mut self, bytes: usize, kind: TokenTy) -> Token<'src> { let (token_fragment, new_remaining_fragment) = self.remaining.split(bytes); self.remaining = new_remaining_fragment; @@ -320,7 +317,9 @@ impl<'src> Lexer<'src> { (true, false) => Some(TokenTy::InnerDocComment), (false, true) => Some(TokenTy::OuterDocComment), (false, false) => None, - (true, true) => unreachable!("Lexer should not match multiple comment types at once."), + (true, true) => { + unreachable!("Lexer should not match multiple comment types at once.") + } }; // Map the variant to a token to return. @@ -343,9 +342,9 @@ impl<'src> Lexer<'src> { None } - /// Attempt to read/consume a multi-line comment from the start of the `remaining` fragment. + /// Attempt to read/consume a multi-line comment from the start of the `remaining` fragment. fn handle_multi_line_comment(&mut self) -> Option> { - // Handle corner cases here so we don't have to below. + // Handle corner cases here so we don't have to below. // These are both considered empty non-documenting comments. 
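        // Without these two guards, "/**/" would fall through below and be mis-read as an empty
        // inner block doc comment, since after consuming "/*" the next character is '*'. Treating
        // "/**/" and "/***/" as plain empty comments mirrors the Rust comment grammar cited above.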
if self.consume("/***/") { return None; @@ -355,40 +354,42 @@ impl<'src> Lexer<'src> { return None; } - // Make a fork of the lexer to avoid modifying this lexer if we fail to parse. + // Make a fork of the lexer to avoid modifying this lexer if we fail to parse. let mut fork: Self = self.fork(); - // Try to parse the start of a multi-line comment. + // Try to parse the start of a multi-line comment. if fork.consume(MULTI_LINE_COMMENT_START) { - // Check if this is a doc comment. + // Check if this is a doc comment. let is_outer_doc: bool = fork.matches("!"); - // Use this to indicate that more than one following asterix is not a doc comment. + // Use this to indicate that more than one following asterix is not a doc comment. let is_inner_doc: bool = fork.matches("*") && !fork.matches("**"); - // Consume until we see the end of the doc comment. If we run out of characters, consider the - // comment unterminated. + // Consume until we see the end of the doc comment. If we run out of characters, consider the + // comment unterminated. while !fork.matches(MULTI_LINE_COMMENT_END) { - // Handle nested comments here: - if fork.matches(MULTI_LINE_COMMENT_START) { - // Discard the output -- don't care about doc comments in other comments. + // Handle nested comments here: + if fork.matches(MULTI_LINE_COMMENT_START) { + // Discard the output -- don't care about doc comments in other comments. fork.handle_multi_line_comment(); continue; } // Handle unterminated comments here. if fork.remaining.is_empty() { - // If we have not hit a "*/" before the end of the input, return an unterminated block comment. + // If we have not hit a "*/" before the end of the input, return an unterminated block comment. let bytes_consumed: usize = fork.remaining.offset_from(&self.remaining); - // Split the token and return it. - return Some(self.split_token(bytes_consumed, TokenTy::UnterminatedBlockComment)); + // Split the token and return it. + return Some( + self.split_token(bytes_consumed, TokenTy::UnterminatedBlockComment), + ); } - - // If there's still input, and not a nested comment, consume it. + + // If there's still input, and not a nested comment, consume it. fork.consume_any(); } - // If we get here, the comment was terminated. Consume the terminating characters, and return. - // Use debug assert here to make sure that the comment is actually terminated. + // If we get here, the comment was terminated. Consume the terminating characters, and return. + // Use debug assert here to make sure that the comment is actually terminated. debug_assert!(fork.consume(MULTI_LINE_COMMENT_END), "comment is actually terminated"); // Determine the kind of token to produce (if any). @@ -396,10 +397,12 @@ impl<'src> Lexer<'src> { (true, false) => Some(TokenTy::InnerBlockDocComment), (false, true) => Some(TokenTy::OuterBlockDocComment), (false, false) => None, - (true, true) => unreachable!("Lexer should not match multiple comment types at once."), + (true, true) => { + unreachable!("Lexer should not match multiple comment types at once.") + } }; - // Make the token to return. + // Make the token to return. let token: Option = variant.map(|kind| { // Get the number of bytes we have consumed using `offset_from`. let bytes_consumed: usize = fork.remaining.offset_from(&self.remaining); @@ -413,8 +416,8 @@ impl<'src> Lexer<'src> { return token; } - // If the fork did not consume a multi-line comment start, return None and do - // not update this lexer. 
+ // If the fork did not consume a multi-line comment start, return None and do + // not update this lexer. None } @@ -449,28 +452,28 @@ impl<'src> Lexer<'src> { token => return token, } - // Try to handle a multi-line comment if there is one. + // Try to handle a multi-line comment if there is one. match self.handle_multi_line_comment() { - // There was an ignored comment or no comment. + // There was an ignored comment or no comment. None => { - // If the lexer was changed, restart this function. + // If the lexer was changed, restart this function. if !self.remaining.ptr_eq(&initial_lexer.remaining) { return self.next_token(); } } // If there was a block style doc-comment, or an unterminated multi-line comment - // return. + // return. token => return token, } // Do all trivial matching after matching comments to avoid matching "/" for "//". - - // Attempt to match any two-byte ASCII trivial tokens. - // This must be done before single-ascii byte tokens since matching is greedy. + + // Attempt to match any two-byte ASCII trivial tokens. + // This must be done before single-ascii byte tokens since matching is greedy. if self.remaining.len() >= 2 { - // Get the first two bytes of the remaining fragment. - // SAFETY: We just checked length. + // Get the first two bytes of the remaining fragment. + // SAFETY: We just checked length. let bytes: &[u8] = unsafe { self.remaining.inner.as_bytes().get_unchecked(0..2) }; // Match against each possible token pattern. for (pattern, kind) in TWO_ASCII_TRIVIAL_TOKENS { @@ -481,9 +484,9 @@ impl<'src> Lexer<'src> { } // Do the same for single byte patterns. - { + { // We can assume there is at least one more byte since we check above if the fragment - // is empty and return early if not. + // is empty and return early if not. let byte: &u8 = unsafe { self.remaining.inner.as_bytes().get_unchecked(0) }; for (pattern, kind) in SINGLE_ASCII_CHAR_TRIVIAL_TOKENS { From 450daa34ab241da43cfe225df70714c8b19e0be6 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 24 Feb 2024 04:03:04 -0500 Subject: [PATCH 38/60] Refactor parts of the lexer --- wright/src/bin/wright.rs | 2 +- wright/src/parser/fragment.rs | 23 +++- wright/src/parser/lexer.rs | 188 +++-------------------------- wright/src/parser/lexer/token.rs | 90 ++++++++++++++ wright/src/parser/lexer/trivial.rs | 103 ++++++++++++++++ 5 files changed, 232 insertions(+), 174 deletions(-) create mode 100644 wright/src/parser/lexer/token.rs create mode 100644 wright/src/parser/lexer/trivial.rs diff --git a/wright/src/bin/wright.rs b/wright/src/bin/wright.rs index cde9308b..669a3afb 100644 --- a/wright/src/bin/wright.rs +++ b/wright/src/bin/wright.rs @@ -6,7 +6,7 @@ use codespan_reporting::files::Files; use std::{path::PathBuf, time::Instant}; use wright::{ filemap::{FileId, FileMap}, - parser::lexer::{Lexer, Token}, + parser::lexer::{Lexer, token::Token}, repl, }; diff --git a/wright/src/parser/fragment.rs b/wright/src/parser/fragment.rs index fbec7851..3e2bce43 100644 --- a/wright/src/parser/fragment.rs +++ b/wright/src/parser/fragment.rs @@ -60,19 +60,34 @@ impl<'src> Fragment<'src> { /// /// # Panics: /// - Panics if the byte index is not in the fragment, or if it's on a char boundary. - pub fn split(&self, bytes: usize) -> (Self, Self) { + pub fn split_at(&self, bytes: usize) -> (Self, Self) { // Use str's split_at. let (left, right) = self.inner.split_at(bytes); (Self { inner: left }, Self { inner: right }) } + /// Unsafe version of [`Fragment::split_at`]. 
Splits this [Fragment] into two subfragments, + /// where the left one contains the first `bytes` bytes of the fragment, and the right one + /// contains the rest. + /// + /// # Safety: + /// - Undefined Behavior occurs if `bytes` is greater than the length of the [Fragment]. + /// - Undefined Behavior occurs if `bytes` is not on a UTF-8 character boundary. + /// - See [str::get_unchecked] for more details. + pub unsafe fn split_at_unchecked(&self, bytes: usize) -> (Self, Self) { + let left: &str = self.inner.get_unchecked(..bytes); + let right: &str = self.inner.get_unchecked(bytes..); + + (Fragment { inner: left }, Fragment { inner: right }) + } + /// Get an iterator over the characters in this fragment. pub fn chars(&self) -> Chars<'src> { self.inner.chars() } - /// Get the number of bytes between the beginning of [`origin`] and the beginning of [`self`]. + /// Get the number of bytes between the beginning of `origin` and the beginning of [`self`]. /// /// # Panics: /// - Panics if [`self`] is not a fragment within `origin` according to [`Fragment::contains`]. @@ -127,7 +142,7 @@ mod tests { #[test] fn test_split_single() { let a = Fragment { inner: "+" }; - let (left, right) = a.split(1); + let (left, right) = a.split_at(1); assert_eq!(left.inner, "+"); assert_eq!(right.inner, ""); } @@ -135,7 +150,7 @@ mod tests { #[test] fn test_offset_from() { let a = Fragment { inner: "abcde" }; - let (b, c) = a.split(2); + let (b, c) = a.split_at(2); assert_eq!(b.offset_from(&a), 0); assert_eq!(c.offset_from(&a), 2); } diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index edd65cb6..35f4e9a9 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -4,66 +4,14 @@ //! defined for tokens. use super::fragment::Fragment; -use derive_more::Display; use std::iter::FusedIterator; use std::str::Chars; use std::{iter::Peekable, ptr}; use unicode_ident::{is_xid_continue, is_xid_start}; +use token::{Token, TokenTy}; -/// Trivial tokens that are two ASCII characters and can be matched directly -/// against the input source code. -pub const TWO_ASCII_TRIVIAL_TOKENS: &[(&[u8; 2], TokenTy)] = &[ - (b"->", TokenTy::SingleArrow), - (b"-=", TokenTy::MinusEq), - (b"=>", TokenTy::DoubleArrow), - (b"==", TokenTy::EqEq), - (b"&&", TokenTy::AndAnd), - (b"||", TokenTy::OrOr), - (b"<<", TokenTy::LtLt), - (b">>", TokenTy::GtGt), - (b"::", TokenTy::ColonColon), - (b"|=", TokenTy::OrEq), - (b"&=", TokenTy::AndEq), - (b":=", TokenTy::ColonEq), - (b">=", TokenTy::GtEq), - (b"<=", TokenTy::LtEq), - (b"!=", TokenTy::BangEq), - (b"%=", TokenTy::ModEq), - (b"^=", TokenTy::XorEq), - (b"*=", TokenTy::StarEq), - (b"+=", TokenTy::PlusEq), - (b"/=", TokenTy::DivEq), -]; - -/// Single ASCII character trivial tokens that can be matched directly against -/// the source code. 
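// A standalone sketch (not part of this patch) of the invariant `Fragment::split_at` and
// `Fragment::offset_from` are built around: `split_at` defers to `str::split_at`, so the right
// half always begins exactly `bytes` bytes past the start of the original fragment. Plain
// `&str` values and pointer arithmetic stand in for the `Fragment` type here.
fn main() {
    let original: &str = "abcde";
    let (left, right) = original.split_at(2);

    // Mirrors the `test_offset_from` case above: the left half sits at offset 0 from the
    // original, the right half at offset 2.
    let left_offset = left.as_ptr() as usize - original.as_ptr() as usize;
    let right_offset = right.as_ptr() as usize - original.as_ptr() as usize;

    assert_eq!((left, right), ("ab", "cde"));
    assert_eq!(left_offset, 0);
    assert_eq!(right_offset, 2);
}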
-pub const SINGLE_ASCII_CHAR_TRIVIAL_TOKENS: &[(u8, TokenTy)] = &[ - (b'(', TokenTy::LeftParen), - (b')', TokenTy::RightParen), - (b'[', TokenTy::LeftBracket), - (b']', TokenTy::RightBracket), - (b'{', TokenTy::LeftCurly), - (b'}', TokenTy::RightCurly), - (b'@', TokenTy::At), - (b';', TokenTy::Semi), - (b'?', TokenTy::Question), - (b',', TokenTy::Comma), - (b'#', TokenTy::Hash), - (b'$', TokenTy::Dollar), - (b'>', TokenTy::Gt), - (b'<', TokenTy::Lt), - (b'-', TokenTy::Minus), - (b':', TokenTy::Colon), - (b'!', TokenTy::Bang), - (b'=', TokenTy::Eq), - (b'&', TokenTy::And), - (b'|', TokenTy::Or), - (b'/', TokenTy::Div), - (b'+', TokenTy::Plus), - (b'^', TokenTy::Xor), - (b'*', TokenTy::Star), - (b'%', TokenTy::Mod), -]; +pub mod token; +pub mod trivial; /// The pattern that begins any single line comments (including doc comments). pub const SINGLE_LINE_COMMENT_PREFIX: &str = "//"; @@ -81,91 +29,6 @@ pub struct Lexer<'src> { pub remaining: Fragment<'src>, } -/// A token in wright source code. -#[derive(Debug, Display)] -#[display(fmt = "\"{}\" ({:?})", "fragment.inner", variant)] -pub struct Token<'src> { - /// What type of token this is. - pub variant: TokenTy, - /// The matching fragment of source code -- this contains the location and length data for the token. - pub fragment: Fragment<'src>, -} - -/// The different types of tokens in wright source. -#[rustfmt::skip] // Turn off auto reformat. -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum TokenTy { - LeftCurly, RightCurly, - LeftBracket, RightBracket, - LeftParen, RightParen, - - Plus, PlusEq, - Star, StarEq, - Div, DivEq, - Xor, XorEq, - Mod, ModEq, - Bang, BangEq, - - Minus, MinusEq, SingleArrow, - Eq, EqEq, DoubleArrow, - - Lt, LtEq, LtLt, - Gt, GtEq, GtGt, - And, AndEq, AndAnd, - Or, OrEq, OrOr, - Colon, ColonEq, ColonColon, - - At, - Tilde, - Semi, - Dot, - Comma, - Hash, - Question, - Dollar, - - // Not in the same group as the other ones there since it can be used at the start of identifiers. - Underscore, - - Identifier, - - OuterDocComment, OuterBlockDocComment, - InnerDocComment, InnerBlockDocComment, - - /// Indicates a block style comment without termination. - UnterminatedBlockComment, - - - KwRecord, - KwType, - KwEnum, - KwUnion, - KwFunc, - KwRepr, - KwImpl, - KwConstraint, - KwReferences, - KwTrait, - KwUse, - KwAs, - KwConst, - KwMod, - KwIf, - KwElse, - KwFor, - KwIn, - KwWhile, - KwTrue, - KwFalse, - KwLoop, - KwWhere, - - IntegerLiteral, - - /// Unknown character in lexer fragment. - Unknown -} - impl<'src> Lexer<'src> { /// Get the number of bytes remaining that we need to transform into tokens. pub const fn bytes_remaining(&self) -> usize { @@ -225,7 +88,7 @@ impl<'src> Lexer<'src> { /// # Panics: /// - Panics if the number of bytes lands out of bounds or in the middle of a character. fn split_token(&mut self, bytes: usize, kind: TokenTy) -> Token<'src> { - let (token_fragment, new_remaining_fragment) = self.remaining.split(bytes); + let (token_fragment, new_remaining_fragment) = self.remaining.split_at(bytes); self.remaining = new_remaining_fragment; Token { @@ -234,6 +97,17 @@ impl<'src> Lexer<'src> { } } + /// Unsafe version of [Lexer::split_token]. + /// + /// # Safety: + /// - This function matches the safety guarantees of [Fragment::split_at_unchecked]. 
+ unsafe fn split_token_unchecked(&mut self, bytes: usize, kind: TokenTy) -> Token<'src> { + let (token_fragment, new_remaining_fragment) = self.remaining.split_at_unchecked(bytes); + self.remaining = new_remaining_fragment; + + Token { variant: kind, fragment: token_fragment } + } + /// "Fork" this lexer, creating a new [`Lexer`] at the same position as this one that can be used for /// failable parsing. This can be compared to the original lexer it was forked from using [Fragment::offset_from] /// on the underlying `remaining` fragments. @@ -467,33 +341,9 @@ impl<'src> Lexer<'src> { token => return token, } - // Do all trivial matching after matching comments to avoid matching "/" for "//". - - // Attempt to match any two-byte ASCII trivial tokens. - // This must be done before single-ascii byte tokens since matching is greedy. - if self.remaining.len() >= 2 { - // Get the first two bytes of the remaining fragment. - // SAFETY: We just checked length. - let bytes: &[u8] = unsafe { self.remaining.inner.as_bytes().get_unchecked(0..2) }; - // Match against each possible token pattern. - for (pattern, kind) in TWO_ASCII_TRIVIAL_TOKENS { - if bytes == *pattern { - return Some(self.split_token(2, *kind)); - } - } - } - - // Do the same for single byte patterns. - { - // We can assume there is at least one more byte since we check above if the fragment - // is empty and return early if not. - let byte: &u8 = unsafe { self.remaining.inner.as_bytes().get_unchecked(0) }; - - for (pattern, kind) in SINGLE_ASCII_CHAR_TRIVIAL_TOKENS { - if byte == pattern { - return Some(self.split_token(1, *kind)); - } - } + // Handle a trivial token if there is one. + if let Some(token) = trivial::try_consume_trivial_token(self) { + return Some(token); } // Next attempt to match a keyword or identifier. @@ -512,7 +362,7 @@ impl<'src> Lexer<'src> { .sum::(); // Split the number of bytes we consumed. - let (ident_frag, new_remaining) = self.remaining.split(bytes_consumed); + let (ident_frag, new_remaining) = self.remaining.split_at(bytes_consumed); // Get the token kind to produce for this fragment. let variant = Lexer::identifier_or_keyword(ident_frag); // Update the lexers remaining fragment. diff --git a/wright/src/parser/lexer/token.rs b/wright/src/parser/lexer/token.rs new file mode 100644 index 00000000..e30b6851 --- /dev/null +++ b/wright/src/parser/lexer/token.rs @@ -0,0 +1,90 @@ +//! Token models. + +use derive_more::Display; +use crate::parser::fragment::Fragment; + +/// A token in wright source code. +#[derive(Debug, Display)] +#[display(fmt = "\"{}\" ({:?})", "fragment.inner", variant)] +pub struct Token<'src> { + /// What type of token this is. + pub variant: TokenTy, + /// The matching fragment of source code -- this contains the location and length data for the token. + pub fragment: Fragment<'src>, +} + +/// The different types of tokens in wright source. +#[rustfmt::skip] // Turn off auto reformat. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum TokenTy { + LeftCurly, RightCurly, + LeftBracket, RightBracket, + LeftParen, RightParen, + + Plus, PlusEq, + Star, StarEq, + Div, DivEq, + Xor, XorEq, + Mod, ModEq, + Bang, BangEq, + + Minus, MinusEq, SingleArrow, + Eq, EqEq, DoubleArrow, + + Lt, LtEq, LtLt, + Gt, GtEq, GtGt, + And, AndEq, AndAnd, + Or, OrEq, OrOr, + Colon, ColonEq, ColonColon, + + At, + Tilde, + Semi, + Dot, + Comma, + Hash, + Question, + Dollar, + + // Not in the same group as the other ones there since it can be used at the start of identifiers. 
+ Underscore, + + Identifier, + + OuterDocComment, OuterBlockDocComment, + InnerDocComment, InnerBlockDocComment, + + /// Indicates a block style comment without termination. + UnterminatedBlockComment, + + KwRecord, + KwType, + KwEnum, + KwUnion, + KwFunc, + KwRepr, + KwImpl, + KwConstraint, + KwReferences, + KwTrait, + KwUse, + KwAs, + KwConst, + KwMod, + KwIf, + KwElse, + KwFor, + KwIn, + KwWhile, + KwTrue, + KwFalse, + KwLoop, + KwWhere, + + IntegerLiteral, + StringLiteral, + CharLiteral, + + /// Unknown character in lexer fragment. + Unknown +} diff --git a/wright/src/parser/lexer/trivial.rs b/wright/src/parser/lexer/trivial.rs new file mode 100644 index 00000000..7fb9284d --- /dev/null +++ b/wright/src/parser/lexer/trivial.rs @@ -0,0 +1,103 @@ +//! Trivial tokens and their implementation. + +use super::{token::{Token, TokenTy}, Lexer}; + +/// Trivial tokens that are two ASCII characters and can be matched directly +/// against the input source code. +pub const TWO_ASCII_TRIVIAL_TOKENS: &[(&[u8; 2], TokenTy)] = &[ + (b"->", TokenTy::SingleArrow), + (b"-=", TokenTy::MinusEq), + (b"=>", TokenTy::DoubleArrow), + (b"==", TokenTy::EqEq), + (b"&&", TokenTy::AndAnd), + (b"||", TokenTy::OrOr), + (b"<<", TokenTy::LtLt), + (b">>", TokenTy::GtGt), + (b"::", TokenTy::ColonColon), + (b"|=", TokenTy::OrEq), + (b"&=", TokenTy::AndEq), + (b":=", TokenTy::ColonEq), + (b">=", TokenTy::GtEq), + (b"<=", TokenTy::LtEq), + (b"!=", TokenTy::BangEq), + (b"%=", TokenTy::ModEq), + (b"^=", TokenTy::XorEq), + (b"*=", TokenTy::StarEq), + (b"+=", TokenTy::PlusEq), + (b"/=", TokenTy::DivEq), +]; + +/// Single ASCII character trivial tokens that can be matched directly against +/// the source code. +pub const SINGLE_ASCII_CHAR_TRIVIAL_TOKENS: &[(u8, TokenTy)] = &[ + (b'(', TokenTy::LeftParen), + (b')', TokenTy::RightParen), + (b'[', TokenTy::LeftBracket), + (b']', TokenTy::RightBracket), + (b'{', TokenTy::LeftCurly), + (b'}', TokenTy::RightCurly), + (b'@', TokenTy::At), + (b';', TokenTy::Semi), + (b'?', TokenTy::Question), + (b',', TokenTy::Comma), + (b'#', TokenTy::Hash), + (b'$', TokenTy::Dollar), + (b'>', TokenTy::Gt), + (b'<', TokenTy::Lt), + (b'-', TokenTy::Minus), + (b':', TokenTy::Colon), + (b'!', TokenTy::Bang), + (b'=', TokenTy::Eq), + (b'&', TokenTy::And), + (b'|', TokenTy::Or), + (b'/', TokenTy::Div), + (b'+', TokenTy::Plus), + (b'^', TokenTy::Xor), + (b'*', TokenTy::Star), + (b'%', TokenTy::Mod), +]; + + +/// Attempt to consume a "trivial" token from the start of the [Lexer]'s [Lexer::remaining] fragment. +/// +/// Leave the lexer unmodified if one is not available. +#[inline] +pub fn try_consume_trivial_token<'src>(lexer: &mut Lexer<'src>) -> Option> { + // Get the number of bytes remaining, since we need at least 1 to parse anything. + let bytes_remaining: usize = lexer.bytes_remaining(); + + // No token if there are no bytes of source left. + if bytes_remaining == 0 { return None; } + + // Attempt to match any two-byte ASCII trivial tokens. + // This must be done before single-ascii byte tokens since matching is greedy. + if bytes_remaining >= 2 { + // Get the first two bytes of the remaining fragment. + // SAFETY: We just checked length. + let bytes: &[u8] = unsafe { lexer.remaining.inner.as_bytes().get_unchecked(0..2) }; + + // Match against each possible token pattern. + for (pattern, kind) in TWO_ASCII_TRIVIAL_TOKENS { + if bytes == *pattern { + // SAFETY: We have already done bounds checking, and this cannot be a character + // boundary since we just matched against ASCII characters. 
+ return Some(unsafe { lexer.split_token_unchecked(2, *kind) }); + } + } + } + + // Do the same for single byte patterns. + // SAFETY: We checked that the number of bytes remaining is not 0 above. + let byte: &u8 = unsafe { lexer.remaining.inner.as_bytes().get_unchecked(0) }; + + for (pattern, kind) in SINGLE_ASCII_CHAR_TRIVIAL_TOKENS { + if byte == pattern { + // SAFETTY: If we matched, then the first byte is ASCII, and therefor we don't have to worry + // about bounds or unicode boundaries. + return Some(unsafe { lexer.split_token_unchecked(1, *kind) }); + } + } + + // If nothing else has matched, there is no trivial token available. + None +} From 7f8919dd5cd61fd251389b3d1246e9ac798d8222 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 24 Feb 2024 04:03:30 -0500 Subject: [PATCH 39/60] clippy --- wright/src/parser/fragment.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wright/src/parser/fragment.rs b/wright/src/parser/fragment.rs index 3e2bce43..21ffa775 100644 --- a/wright/src/parser/fragment.rs +++ b/wright/src/parser/fragment.rs @@ -71,7 +71,7 @@ impl<'src> Fragment<'src> { /// where the left one contains the first `bytes` bytes of the fragment, and the right one /// contains the rest. /// - /// # Safety: + /// # Safety /// - Undefined Behavior occurs if `bytes` is greater than the length of the [Fragment]. /// - Undefined Behavior occurs if `bytes` is not on a UTF-8 character boundary. /// - See [str::get_unchecked] for more details. From bea8f047c7dd5b95ece27018d2a407e0e6c38b8c Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 24 Feb 2024 04:04:19 -0500 Subject: [PATCH 40/60] fmt --- wright/src/bin/wright.rs | 2 +- wright/src/parser/fragment.rs | 10 +++++----- wright/src/parser/lexer.rs | 13 ++++++++----- wright/src/parser/lexer/token.rs | 4 ++-- wright/src/parser/lexer/trivial.rs | 28 ++++++++++++++++------------ 5 files changed, 32 insertions(+), 25 deletions(-) diff --git a/wright/src/bin/wright.rs b/wright/src/bin/wright.rs index 669a3afb..16e6ef90 100644 --- a/wright/src/bin/wright.rs +++ b/wright/src/bin/wright.rs @@ -6,7 +6,7 @@ use codespan_reporting::files::Files; use std::{path::PathBuf, time::Instant}; use wright::{ filemap::{FileId, FileMap}, - parser::lexer::{Lexer, token::Token}, + parser::lexer::{token::Token, Lexer}, repl, }; diff --git a/wright/src/parser/fragment.rs b/wright/src/parser/fragment.rs index 21ffa775..01b3eedf 100644 --- a/wright/src/parser/fragment.rs +++ b/wright/src/parser/fragment.rs @@ -68,13 +68,13 @@ impl<'src> Fragment<'src> { } /// Unsafe version of [`Fragment::split_at`]. Splits this [Fragment] into two subfragments, - /// where the left one contains the first `bytes` bytes of the fragment, and the right one - /// contains the rest. - /// + /// where the left one contains the first `bytes` bytes of the fragment, and the right one + /// contains the rest. + /// /// # Safety /// - Undefined Behavior occurs if `bytes` is greater than the length of the [Fragment]. - /// - Undefined Behavior occurs if `bytes` is not on a UTF-8 character boundary. - /// - See [str::get_unchecked] for more details. + /// - Undefined Behavior occurs if `bytes` is not on a UTF-8 character boundary. + /// - See [str::get_unchecked] for more details. 
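// A standalone sketch (not part of this patch) of why the SAFETY comments above hold: an ASCII
// byte is always a complete one-byte UTF-8 character, so the index just past a matched one- or
// two-byte ASCII pattern is necessarily a character boundary, even when multi-byte characters
// follow it. The example string is illustrative.
fn main() {
    let source = "+=π"; // two ASCII bytes followed by a two-byte character

    // The candidate split point sits immediately after the matched ASCII pattern.
    assert!(source.as_bytes().starts_with(b"+="));
    assert!(source.is_char_boundary(2));

    // A checked split at that index therefore cannot panic, which is what makes the
    // unchecked variant sound to call there.
    let (token_text, rest) = source.split_at(2);
    assert_eq!(token_text, "+=");
    assert_eq!(rest, "π");
}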
pub unsafe fn split_at_unchecked(&self, bytes: usize) -> (Self, Self) { let left: &str = self.inner.get_unchecked(..bytes); let right: &str = self.inner.get_unchecked(bytes..); diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index 35f4e9a9..f7b8dc55 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -7,8 +7,8 @@ use super::fragment::Fragment; use std::iter::FusedIterator; use std::str::Chars; use std::{iter::Peekable, ptr}; -use unicode_ident::{is_xid_continue, is_xid_start}; use token::{Token, TokenTy}; +use unicode_ident::{is_xid_continue, is_xid_start}; pub mod token; pub mod trivial; @@ -97,15 +97,18 @@ impl<'src> Lexer<'src> { } } - /// Unsafe version of [Lexer::split_token]. - /// + /// Unsafe version of [Lexer::split_token]. + /// /// # Safety: /// - This function matches the safety guarantees of [Fragment::split_at_unchecked]. unsafe fn split_token_unchecked(&mut self, bytes: usize, kind: TokenTy) -> Token<'src> { let (token_fragment, new_remaining_fragment) = self.remaining.split_at_unchecked(bytes); self.remaining = new_remaining_fragment; - Token { variant: kind, fragment: token_fragment } + Token { + variant: kind, + fragment: token_fragment, + } } /// "Fork" this lexer, creating a new [`Lexer`] at the same position as this one that can be used for @@ -341,7 +344,7 @@ impl<'src> Lexer<'src> { token => return token, } - // Handle a trivial token if there is one. + // Handle a trivial token if there is one. if let Some(token) = trivial::try_consume_trivial_token(self) { return Some(token); } diff --git a/wright/src/parser/lexer/token.rs b/wright/src/parser/lexer/token.rs index e30b6851..f6a9f9e8 100644 --- a/wright/src/parser/lexer/token.rs +++ b/wright/src/parser/lexer/token.rs @@ -1,7 +1,7 @@ -//! Token models. +//! Token models. -use derive_more::Display; use crate::parser::fragment::Fragment; +use derive_more::Display; /// A token in wright source code. #[derive(Debug, Display)] diff --git a/wright/src/parser/lexer/trivial.rs b/wright/src/parser/lexer/trivial.rs index 7fb9284d..1e2f52a1 100644 --- a/wright/src/parser/lexer/trivial.rs +++ b/wright/src/parser/lexer/trivial.rs @@ -1,6 +1,9 @@ //! Trivial tokens and their implementation. -use super::{token::{Token, TokenTy}, Lexer}; +use super::{ + token::{Token, TokenTy}, + Lexer, +}; /// Trivial tokens that are two ASCII characters and can be matched directly /// against the input source code. @@ -57,17 +60,18 @@ pub const SINGLE_ASCII_CHAR_TRIVIAL_TOKENS: &[(u8, TokenTy)] = &[ (b'%', TokenTy::Mod), ]; - -/// Attempt to consume a "trivial" token from the start of the [Lexer]'s [Lexer::remaining] fragment. -/// -/// Leave the lexer unmodified if one is not available. +/// Attempt to consume a "trivial" token from the start of the [Lexer]'s [Lexer::remaining] fragment. +/// +/// Leave the lexer unmodified if one is not available. #[inline] pub fn try_consume_trivial_token<'src>(lexer: &mut Lexer<'src>) -> Option> { // Get the number of bytes remaining, since we need at least 1 to parse anything. let bytes_remaining: usize = lexer.bytes_remaining(); - // No token if there are no bytes of source left. - if bytes_remaining == 0 { return None; } + // No token if there are no bytes of source left. + if bytes_remaining == 0 { + return None; + } // Attempt to match any two-byte ASCII trivial tokens. // This must be done before single-ascii byte tokens since matching is greedy. 
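// A minimal standalone sketch (not part of this patch) of why the two-byte table has to be
// consulted before the one-byte table: matching is greedy, so "+=" must become a single PlusEq
// token rather than Plus followed by Eq. The function and the tiny pattern tables here are
// illustrative stand-ins for `try_consume_trivial_token`.
fn longest_trivial_match(input: &str) -> Option<(&'static str, &str)> {
    const TWO_BYTE: &[&str] = &["+=", "->", "=="];
    const ONE_BYTE: &[&str] = &["+", "-", "="];

    // Longer patterns first: the first hit wins, so the checking order encodes greediness.
    for pattern in TWO_BYTE.iter().chain(ONE_BYTE.iter()) {
        if let Some(rest) = input.strip_prefix(*pattern) {
            return Some((*pattern, rest));
        }
    }

    None
}

fn main() {
    // "+=" is taken as one token, leaving "1" for the next lexing step.
    assert_eq!(longest_trivial_match("+=1"), Some(("+=", "1")));
    // "+1" falls through to the one-byte pattern.
    assert_eq!(longest_trivial_match("+1"), Some(("+", "1")));
    // No trivial token at all: the caller moves on to other token rules.
    assert_eq!(longest_trivial_match("abc"), None);
}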
@@ -79,25 +83,25 @@ pub fn try_consume_trivial_token<'src>(lexer: &mut Lexer<'src>) -> Option Date: Mon, 26 Feb 2024 01:19:44 -0500 Subject: [PATCH 41/60] refactor comments out to their own file --- wright/src/parser/lexer.rs | 220 +++++++--------------------- wright/src/parser/lexer/comments.rs | 133 +++++++++++++++++ 2 files changed, 184 insertions(+), 169 deletions(-) create mode 100644 wright/src/parser/lexer/comments.rs diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index f7b8dc55..601d8233 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -3,6 +3,8 @@ //! Note that this will strip out comments and whitespace, returning only fragments that match one of the paterns //! defined for tokens. +use self::comments::{try_match_block_comment, try_match_single_line_comment}; + use super::fragment::Fragment; use std::iter::FusedIterator; use std::str::Chars; @@ -12,15 +14,7 @@ use unicode_ident::{is_xid_continue, is_xid_start}; pub mod token; pub mod trivial; - -/// The pattern that begins any single line comments (including doc comments). -pub const SINGLE_LINE_COMMENT_PREFIX: &str = "//"; - -/// The pattern that starts any multi-line comments (including doc comments). -pub const MULTI_LINE_COMMENT_START: &str = "/*"; - -/// The pattern that ends any multi-line comments (including doc comments). -pub const MULTI_LINE_COMMENT_END: &str = "*/"; +pub mod comments; /// The lexical analyser for wright. This produces a series of tokens that make up the larger elements of the language. #[derive(Debug, Clone, Copy)] @@ -112,12 +106,24 @@ impl<'src> Lexer<'src> { } /// "Fork" this lexer, creating a new [`Lexer`] at the same position as this one that can be used for - /// failable parsing. This can be compared to the original lexer it was forked from using [Fragment::offset_from] + /// failable parsing. This can be compared to the original lexer it was forked from using [Lexer::offset_from] /// on the underlying `remaining` fragments. fn fork(&self) -> Self { *self } + /// Get the number of bytes between the origin's [remaining](Lexer::remaining) and + /// this [Lexer]'s [remaining](Lexer::remaining) using [`Fragment::offset_from`]. + /// + /// # Panics + /// - This function panics under the same conditions as [`Fragment::offset_from`]. + /// - Generally the best way to avoid panics is to only call this function on + /// [Lexer]s created using [Lexer::fork] on the `origin` lexer. + #[inline] + fn offset_from(&self, origin: &Self) -> usize { + self.remaining.offset_from(&origin.remaining) + } + /// Remove and ignore any whitespace at the start of the remaining fragment. fn ignore_whitespace(&mut self) { // Get a reference to the slice of the string past any whitespace at the start. @@ -165,137 +171,25 @@ impl<'src> Lexer<'src> { } } - /// Attempt to read/handle a single line comment from the start of the - /// remaining fragment. If there's a doc-style single line comment, return a [`Token`], - /// otherwise return [`None`]. - /// - /// Generally I'm trying to follow the [rust comment spec] here. - /// - /// [rust comment spec]: https://doc.rust-lang.org/reference/comments.html - fn handle_single_line_comment(&mut self) -> Option> { - // Fork the lexer to attempt to consume a single line comment. - let mut fork: Self = self.fork(); - - // Try to consume the single line comment prefix from the fork. - if fork.consume(SINGLE_LINE_COMMENT_PREFIX) { - // We consumed it successfully, read through a newline or the end of the forked lexer if we get there. 
- - // First determine if this is a doc comment of some kind. - let is_inner_doc: bool = fork.matches("/") && !fork.matches("//"); - let is_outer_doc: bool = fork.matches("!"); - - // The consume until a newline, carraige return, or the end of the source fragment. - while !fork.remaining.is_empty() && !fork.matches("\r") && !fork.matches("\n") { - fork.consume_any(); - } - - // Determine the kind of token to produce (if any). - let variant: Option = match (is_inner_doc, is_outer_doc) { - (true, false) => Some(TokenTy::InnerDocComment), - (false, true) => Some(TokenTy::OuterDocComment), - (false, false) => None, - (true, true) => { - unreachable!("Lexer should not match multiple comment types at once.") - } - }; - - // Map the variant to a token to return. - let token: Option = variant.map(|kind| { - // Get the number of bytes we have consumed using `offset_from`. - let bytes_consumed: usize = fork.remaining.offset_from(&self.remaining); - // Split this token from `self` rather than `fork` since self is still in an unmodified position. - self.split_token(bytes_consumed, kind) - }); - - // Update this lexer to match the state of the forked lexer. - *self = fork; - // Consume any outstanding whitespace. - self.ignore_whitespace(); - // Return any token produced. - return token; - } - - // If there was no comment prefix, there is no comment immediately available. - None + /// Advance this lexer by the specified number of bytes. + /// + /// # Panics + /// - If the lexer is not on a unicode character boundary after advancing. + /// - If the number of bytes is greater than the length of the [remaining](Lexer::remaining) fragment. + fn advance(&mut self, bytes: usize) { + self.remaining.inner = &self.remaining.inner[bytes..]; } - /// Attempt to read/consume a multi-line comment from the start of the `remaining` fragment. - fn handle_multi_line_comment(&mut self) -> Option> { - // Handle corner cases here so we don't have to below. - // These are both considered empty non-documenting comments. - if self.consume("/***/") { - return None; - } - - if self.consume("/**/") { - return None; - } - - // Make a fork of the lexer to avoid modifying this lexer if we fail to parse. - let mut fork: Self = self.fork(); - - // Try to parse the start of a multi-line comment. - if fork.consume(MULTI_LINE_COMMENT_START) { - // Check if this is a doc comment. - let is_outer_doc: bool = fork.matches("!"); - // Use this to indicate that more than one following asterix is not a doc comment. - let is_inner_doc: bool = fork.matches("*") && !fork.matches("**"); - - // Consume until we see the end of the doc comment. If we run out of characters, consider the - // comment unterminated. - while !fork.matches(MULTI_LINE_COMMENT_END) { - // Handle nested comments here: - if fork.matches(MULTI_LINE_COMMENT_START) { - // Discard the output -- don't care about doc comments in other comments. - fork.handle_multi_line_comment(); - continue; - } - - // Handle unterminated comments here. - if fork.remaining.is_empty() { - // If we have not hit a "*/" before the end of the input, return an unterminated block comment. - let bytes_consumed: usize = fork.remaining.offset_from(&self.remaining); - // Split the token and return it. - return Some( - self.split_token(bytes_consumed, TokenTy::UnterminatedBlockComment), - ); - } - - // If there's still input, and not a nested comment, consume it. - fork.consume_any(); - } - - // If we get here, the comment was terminated. Consume the terminating characters, and return. 
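// A standalone sketch (not part of this patch) of the prefix checks behind the doc-comment
// classification being moved around here: once "//" has been consumed, a lone third '/' marks
// one documentation style, a '!' marks the other, and anything else (including "////") is an
// ordinary ignored comment. The enum and function names are illustrative, not the crate's.
#[derive(Debug, PartialEq)]
enum LineCommentKind {
    SlashDoc, // "///...", but not "////..."
    BangDoc,  // "//!..."
    Plain,    // any other "//..." comment
}

fn classify_line_comment(after_slashes: &str) -> LineCommentKind {
    // Same two checks as the lexer: `matches("/") && !matches("//")` and `matches("!")`.
    let slash_doc = after_slashes.starts_with('/') && !after_slashes.starts_with("//");
    let bang_doc = after_slashes.starts_with('!');

    match (slash_doc, bang_doc) {
        (true, false) => LineCommentKind::SlashDoc,
        (false, true) => LineCommentKind::BangDoc,
        // The (true, true) case cannot occur: one character cannot be both '/' and '!'.
        _ => LineCommentKind::Plain,
    }
}

fn main() {
    // The argument is the text remaining after the leading "//" has been consumed.
    assert_eq!(classify_line_comment("/ doc text"), LineCommentKind::SlashDoc);
    assert_eq!(classify_line_comment("! module docs"), LineCommentKind::BangDoc);
    assert_eq!(classify_line_comment(" plain comment"), LineCommentKind::Plain);
    assert_eq!(classify_line_comment("// four slashes total"), LineCommentKind::Plain);
}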
- // Use debug assert here to make sure that the comment is actually terminated. - debug_assert!(fork.consume(MULTI_LINE_COMMENT_END), "comment is actually terminated"); - - // Determine the kind of token to produce (if any). - let variant: Option = match (is_inner_doc, is_outer_doc) { - (true, false) => Some(TokenTy::InnerBlockDocComment), - (false, true) => Some(TokenTy::OuterBlockDocComment), - (false, false) => None, - (true, true) => { - unreachable!("Lexer should not match multiple comment types at once.") - } - }; - - // Make the token to return. - let token: Option = variant.map(|kind| { - // Get the number of bytes we have consumed using `offset_from`. - let bytes_consumed: usize = fork.remaining.offset_from(&self.remaining); - // Split this token from `self` rather than `fork` since self is still in an unmodified position. - self.split_token(bytes_consumed, kind) - }); - - // Update this lexer to match the state of the fork. - *self = fork; - // Return token if there was one. - return token; - } - - // If the fork did not consume a multi-line comment start, return None and do - // not update this lexer. - None + /// Unsafe version of [Lexer::advance]. + /// Advances this lexer by the specified number of bytes. + /// + /// # Safety + /// - This lexer will be left in an invalid/undefined state if the number of bytes is greater than the length + /// of the [Lexer::remaining] fragment. + /// - This lexer will be left in an invalid/undefined state if after advancing, the next byte in the + /// [Lexer::remaining] fragment is not the start of a unicode code point. + unsafe fn advance_unchecked(&mut self, bytes: usize) { + self.remaining.inner = self.remaining.inner.get_unchecked(bytes..); } /// Get the next token from the lexer. @@ -308,40 +202,28 @@ impl<'src> Lexer<'src> { return None; } - // Grab a copy of the initial lexer to compare and check when progress has been made. - let initial_lexer: Self = self.fork(); - - // Attempt to parse a single line comment. Return it if it's documentation. - // Rerun this function if there was a comment and it was ignored successfully. - match self.handle_single_line_comment() { - // There was a single line comment ignored or no single line comment. - None => { - // Check if the remaining fragment changed. - if !self.remaining.ptr_eq(&initial_lexer.remaining) { - // If so, re-run this function. + // Attempt to parse a single line comment and then attempt a multi-line comment. + for comment_match_fn in [try_match_single_line_comment, try_match_block_comment] { + // Attempt to parse a comment using the given match function. Return it if it's documentation or unterminated. + // Get a new token and return that if there was a comment and it was ignored successfully. + match (comment_match_fn)(self) { + // A comment was parsed, consume and return it. + (bytes, Some(comment_variant)) => { + // Split the token. + let token: Token = self.split_token(bytes, comment_variant); + // Return it. + return Some(token); + }, + + // There was a comment, advance the lexer and ignore it. Re-start this function. + (bytes @ 1.., None) => { + self.advance(bytes); return self.next_token(); } - // If the lexer was unchanged, then there was no comment -- keep trying to match tokens. - } - - // If there was some token, return it. - token => return token, - } - - // Try to handle a multi-line comment if there is one. - match self.handle_multi_line_comment() { - // There was an ignored comment or no comment. - None => { - // If the lexer was changed, restart this function. 
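// A standalone sketch (not part of this patch) of the fork-and-measure pattern that
// `Lexer::fork` and the new `Lexer::offset_from` support: copy the cursor, consume
// speculatively on the copy, measure how many bytes the copy moved, and only then commit that
// many bytes on the original. `MiniCursor` is an illustrative stand-in for the real lexer,
// using lengths rather than pointers to measure the offset.
#[derive(Clone, Copy)]
struct MiniCursor<'src> {
    remaining: &'src str,
}

impl<'src> MiniCursor<'src> {
    fn fork(&self) -> Self {
        *self
    }

    // Byte distance this cursor has advanced relative to the cursor it was forked from.
    fn offset_from(&self, origin: &Self) -> usize {
        origin.remaining.len() - self.remaining.len()
    }
}

fn main() {
    let mut cursor = MiniCursor { remaining: "// comment\nrest" };

    // Speculatively consume a line comment on the fork only.
    let mut fork = cursor.fork();
    while !fork.remaining.is_empty() && !fork.remaining.starts_with('\n') {
        let ch = fork.remaining.chars().next().unwrap();
        fork.remaining = &fork.remaining[ch.len_utf8()..];
    }

    // Commit by advancing the original cursor by the measured number of bytes.
    let consumed = fork.offset_from(&cursor);
    cursor.remaining = &cursor.remaining[consumed..];

    assert_eq!(consumed, "// comment".len());
    assert_eq!(cursor.remaining, "\nrest");
}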
- if !self.remaining.ptr_eq(&initial_lexer.remaining) { - return self.next_token(); - } + // There was no comment, keep trying to match other tokens. + (0, None) => {}, } - - // If there was a block style doc-comment, or an unterminated multi-line comment - // return. - token => return token, } // Handle a trivial token if there is one. diff --git a/wright/src/parser/lexer/comments.rs b/wright/src/parser/lexer/comments.rs new file mode 100644 index 00000000..3045ba28 --- /dev/null +++ b/wright/src/parser/lexer/comments.rs @@ -0,0 +1,133 @@ +//! Implementation of comment token lexing. + +use super::{token::TokenTy, Lexer}; + +/// The pattern that begins any single line comments (including doc comments). +pub const SINGLE_LINE_COMMENT_PREFIX: &str = "//"; + +/// The pattern that starts any multi-line comments (including doc comments). +pub const MULTI_LINE_COMMENT_START: &str = "/*"; + +/// The pattern that ends any multi-line comments (including doc comments). +pub const MULTI_LINE_COMMENT_END: &str = "*/"; + +/// Attempt to match a sinlgle line comment from the start of the [Lexer::remaining] fragment. +/// Return a [usize] and optionally a [TokenTy]. The [usize] indicates how many bytes were in the comment. +/// The [TokenTy] (if it's not [None]) should be either [TokenTy::InnerDocComment] or [TokenTy::OuterDocComment]. +/// +/// If the [TokenTy] is not [None], the lexer should consume the specified number of bytes (by the [usize]) and +/// Produce a token with the [variant](super::token::Token::variant) from this function. +/// +/// Generally I'm trying to follow the [rust comment spec] here. +/// +/// [rust comment spec]: https://doc.rust-lang.org/reference/comments.html +pub fn try_match_single_line_comment(lexer: &Lexer) -> (usize, Option) { + // Fork the lexer so we can do all the parsing on the fork without worrying about modifying the original + // unnecessarily. + let mut fork: Lexer = lexer.fork(); + + // Try to consume the single line comment prefix from the fork. + if fork.consume(SINGLE_LINE_COMMENT_PREFIX) { + // We consumed it successfully, read through a newline or the end of the forked lexer if we get there. + + // First determine if this is a doc comment of some kind. + let is_inner_doc: bool = fork.matches("/") && !fork.matches("//"); + let is_outer_doc: bool = fork.matches("!"); + + // The consume until a newline, carraige return, or the end of the source fragment. + while !fork.remaining.is_empty() && !fork.matches("\r") && !fork.matches("\n") { + fork.consume_any(); + } + + // Determine the kind of token to produce (if any). + let variant: Option = match (is_inner_doc, is_outer_doc) { + (true, false) => Some(TokenTy::InnerDocComment), + (false, true) => Some(TokenTy::OuterDocComment), + (false, false) => None, + (true, true) => unreachable!("It is impossible for the `remaining` fragment to start with an `!` and a `/` simultaneously.") + }; + + // Return the number of bytes consumed and the type of token to + // produce if any. + return (fork.offset_from(lexer), variant); + } + + // If the single line comment prefix was not immediately available, there is no comment. + (0, None) +} + +/// Attempt to match a block comment from the start of the [Lexer::remaining] fragment. +/// Return a [usize] and optionally a [TokenTy]. The [usize] indicates how many bytes were in the comment. +/// The [TokenTy] (if it's not [None]) should be [TokenTy::InnerBlockDocComment], [TokenTy::OuterBlockDocComment], or +/// [TokenTy::UnterminatedBlockComment]. 
+/// +/// If the [TokenTy] is not [None], the lexer should consume the specified number of bytes (by the [usize]) and +/// Produce a token with the [variant](super::token::Token::variant) from this function. +pub fn try_match_block_comment(lexer: &Lexer) -> (usize, Option) { + // Handle corner cases here so we don't have to below. + // These are both considered empty non-documenting comments. + if lexer.matches("/***/") { + return (5, None); + } + + if lexer.matches("/**/") { + return (4, None); + } + + // Make a fork of the lexer to avoid modifying this lexer if we fail to parse. + let mut fork: Lexer = lexer.fork(); + + // Try to parse the start of a multi-line comment. + if fork.consume(MULTI_LINE_COMMENT_START) { + // Check if this is a doc comment. + let is_outer_doc: bool = fork.matches("!"); + // Use this to indicate that more than one following asterix is not a doc comment. + let is_inner_doc: bool = fork.matches("*") && !fork.matches("**"); + + // Consume until we see the end of the doc comment. If we run out of characters, consider the + // comment unterminated. + while !fork.matches(MULTI_LINE_COMMENT_END) { + // Handle nested comments here: + if fork.matches(MULTI_LINE_COMMENT_START) { + // Discard the output -- don't care about doc comments in other comments. + let (nested_comment_bytes, _) = try_match_block_comment(&fork); + + // SAFETY: the return from this function should never be on a char boundary or out of bounds. + // This is because the return value is always either 0 or calculated using `offset_from`. + unsafe { fork.advance_unchecked(nested_comment_bytes) }; + + // Restart the loop to keep consuming this comment. + continue; + } + + // Handle unterminated comments here. + if fork.remaining.is_empty() { + // If we have not hit a "*/" before the end of the input, return an unterminated block comment. + let bytes_consumed: usize = fork.offset_from(lexer); + return (bytes_consumed, Some(TokenTy::UnterminatedBlockComment)); + } + + // If there's still input, and not a nested comment, consume it. + fork.consume_any(); + } + + // If we get here, the comment was terminated. Consume the terminating characters, and return. + // Use debug assert here to make sure that the comment is actually terminated. + let consumed_comment_terminator: bool = fork.consume(MULTI_LINE_COMMENT_END); + debug_assert!(consumed_comment_terminator, "comment is actually terminated"); + + // Determine the kind of token to produce (if any). 
+ let variant: Option = match (is_inner_doc, is_outer_doc) { + (true, false) => Some(TokenTy::InnerBlockDocComment), + (false, true) => Some(TokenTy::OuterBlockDocComment), + (false, false) => None, + (true, true) => { + unreachable!("Lexer should not match multiple comment types at once.") + } + }; + + return (fork.offset_from(lexer), variant); + } + + (0, None) +} From 99f19c81cea9d222fa65f817eff8f9f594ae722e Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Mon, 26 Feb 2024 01:22:13 -0500 Subject: [PATCH 42/60] cargo fmt --- wright/src/parser/lexer.rs | 40 ++++++++++++++--------------- wright/src/parser/lexer/comments.rs | 4 +-- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index 601d8233..baa354a2 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -12,9 +12,9 @@ use std::{iter::Peekable, ptr}; use token::{Token, TokenTy}; use unicode_ident::{is_xid_continue, is_xid_start}; +pub mod comments; pub mod token; pub mod trivial; -pub mod comments; /// The lexical analyser for wright. This produces a series of tokens that make up the larger elements of the language. #[derive(Debug, Clone, Copy)] @@ -112,13 +112,13 @@ impl<'src> Lexer<'src> { *self } - /// Get the number of bytes between the origin's [remaining](Lexer::remaining) and - /// this [Lexer]'s [remaining](Lexer::remaining) using [`Fragment::offset_from`]. - /// + /// Get the number of bytes between the origin's [remaining](Lexer::remaining) and + /// this [Lexer]'s [remaining](Lexer::remaining) using [`Fragment::offset_from`]. + /// /// # Panics /// - This function panics under the same conditions as [`Fragment::offset_from`]. - /// - Generally the best way to avoid panics is to only call this function on - /// [Lexer]s created using [Lexer::fork] on the `origin` lexer. + /// - Generally the best way to avoid panics is to only call this function on + /// [Lexer]s created using [Lexer::fork] on the `origin` lexer. #[inline] fn offset_from(&self, origin: &Self) -> usize { self.remaining.offset_from(&origin.remaining) @@ -171,23 +171,23 @@ impl<'src> Lexer<'src> { } } - /// Advance this lexer by the specified number of bytes. - /// + /// Advance this lexer by the specified number of bytes. + /// /// # Panics - /// - If the lexer is not on a unicode character boundary after advancing. - /// - If the number of bytes is greater than the length of the [remaining](Lexer::remaining) fragment. + /// - If the lexer is not on a unicode character boundary after advancing. + /// - If the number of bytes is greater than the length of the [remaining](Lexer::remaining) fragment. fn advance(&mut self, bytes: usize) { self.remaining.inner = &self.remaining.inner[bytes..]; } - /// Unsafe version of [Lexer::advance]. + /// Unsafe version of [Lexer::advance]. /// Advances this lexer by the specified number of bytes. - /// + /// /// # Safety /// - This lexer will be left in an invalid/undefined state if the number of bytes is greater than the length /// of the [Lexer::remaining] fragment. - /// - This lexer will be left in an invalid/undefined state if after advancing, the next byte in the - /// [Lexer::remaining] fragment is not the start of a unicode code point. + /// - This lexer will be left in an invalid/undefined state if after advancing, the next byte in the + /// [Lexer::remaining] fragment is not the start of a unicode code point. 
unsafe fn advance_unchecked(&mut self, bytes: usize) { self.remaining.inner = self.remaining.inner.get_unchecked(bytes..); } @@ -202,27 +202,27 @@ impl<'src> Lexer<'src> { return None; } - // Attempt to parse a single line comment and then attempt a multi-line comment. + // Attempt to parse a single line comment and then attempt a multi-line comment. for comment_match_fn in [try_match_single_line_comment, try_match_block_comment] { // Attempt to parse a comment using the given match function. Return it if it's documentation or unterminated. // Get a new token and return that if there was a comment and it was ignored successfully. match (comment_match_fn)(self) { - // A comment was parsed, consume and return it. + // A comment was parsed, consume and return it. (bytes, Some(comment_variant)) => { // Split the token. let token: Token = self.split_token(bytes, comment_variant); // Return it. return Some(token); - }, + } - // There was a comment, advance the lexer and ignore it. Re-start this function. + // There was a comment, advance the lexer and ignore it. Re-start this function. (bytes @ 1.., None) => { self.advance(bytes); return self.next_token(); } - // There was no comment, keep trying to match other tokens. - (0, None) => {}, + // There was no comment, keep trying to match other tokens. + (0, None) => {} } } diff --git a/wright/src/parser/lexer/comments.rs b/wright/src/parser/lexer/comments.rs index 3045ba28..dd89856a 100644 --- a/wright/src/parser/lexer/comments.rs +++ b/wright/src/parser/lexer/comments.rs @@ -17,7 +17,7 @@ pub const MULTI_LINE_COMMENT_END: &str = "*/"; /// /// If the [TokenTy] is not [None], the lexer should consume the specified number of bytes (by the [usize]) and /// Produce a token with the [variant](super::token::Token::variant) from this function. -/// +/// /// Generally I'm trying to follow the [rust comment spec] here. /// /// [rust comment spec]: https://doc.rust-lang.org/reference/comments.html @@ -93,7 +93,7 @@ pub fn try_match_block_comment(lexer: &Lexer) -> (usize, Option) { let (nested_comment_bytes, _) = try_match_block_comment(&fork); // SAFETY: the return from this function should never be on a char boundary or out of bounds. - // This is because the return value is always either 0 or calculated using `offset_from`. + // This is because the return value is always either 0 or calculated using `offset_from`. unsafe { fork.advance_unchecked(nested_comment_bytes) }; // Restart the loop to keep consuming this comment. From 92b5d867c966d3ab06d2caa736eb878b3317f0d2 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Wed, 13 Mar 2024 00:36:17 -0400 Subject: [PATCH 43/60] Refactor identifier/keywords to their own file --- wright/src/parser/lexer.rs | 74 ++----------------------- wright/src/parser/lexer/identifier.rs | 78 +++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 69 deletions(-) create mode 100644 wright/src/parser/lexer/identifier.rs diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index baa354a2..e9c69019 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -10,11 +10,11 @@ use std::iter::FusedIterator; use std::str::Chars; use std::{iter::Peekable, ptr}; use token::{Token, TokenTy}; -use unicode_ident::{is_xid_continue, is_xid_start}; pub mod comments; pub mod token; pub mod trivial; +pub mod identifier; /// The lexical analyser for wright. This produces a series of tokens that make up the larger elements of the language. 
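// A standalone sketch (not part of this patch) of the `(usize, Option<TokenTy>)` protocol the
// new comment matchers hand back to `next_token`: the usize says how many bytes the comment
// occupied, and the optional variant says whether those bytes become a token (doc comment or
// unterminated block comment) or are simply skipped. `FakeTokenTy` and `fake_match_comment`
// are illustrative stand-ins.
#[derive(Debug, PartialEq)]
enum FakeTokenTy {
    DocComment,
}

// Pretend matcher: "///..." yields a doc-comment token, a plain "//..." is skipped, and
// anything else matches zero bytes.
fn fake_match_comment(remaining: &str) -> (usize, Option<FakeTokenTy>) {
    if remaining.starts_with("///") {
        (remaining.len(), Some(FakeTokenTy::DocComment))
    } else if remaining.starts_with("//") {
        (remaining.len(), None)
    } else {
        (0, None)
    }
}

fn main() {
    // Doc comment: consume the bytes and produce a token from them.
    assert_eq!(fake_match_comment("/// docs"), (8, Some(FakeTokenTy::DocComment)));
    // Plain comment: bytes were consumed, but no token is produced -- the lexer just advances
    // and restarts `next_token`.
    assert_eq!(fake_match_comment("// plain"), (8, None));
    // No comment at all: zero bytes, so the caller falls through to the other token rules.
    assert_eq!(fake_match_comment("+"), (0, None));
}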
#[derive(Debug, Clone, Copy)] @@ -36,46 +36,6 @@ impl<'src> Lexer<'src> { } } - /// Try to match a fragment recognized to be an identifier or keyword to - /// a keyword or return [TokenTy::Identifier]. - fn identifier_or_keyword(fragment: Fragment<'src>) -> TokenTy { - use TokenTy::*; - - match fragment.inner { - "record" => KwRecord, - "type" => KwType, - "enum" => KwEnum, - "union" => KwUnion, - "func" => KwFunc, - "repr" => KwRepr, - "impl" => KwImpl, - "constraint" => KwConstraint, - "references" => KwReferences, - "trait" => KwTrait, - "const" => KwConst, - "where" => KwWhere, - - "use" => KwUse, - "as" => KwAs, - "mod" => KwMod, - - "if" => KwIf, - "else" => KwElse, - - "for" => KwFor, - "in" => KwIn, - "while" => KwWhile, - "loop" => KwLoop, - - "true" => KwTrue, - "false" => KwFalse, - - "_" => Underscore, - - _ => Identifier, - } - } - /// Make a token by splitting a given number of bytes off of the `self.remaining` fragment /// and labeling them with the given kind. /// @@ -93,7 +53,7 @@ impl<'src> Lexer<'src> { /// Unsafe version of [Lexer::split_token]. /// - /// # Safety: + /// # Safety /// - This function matches the safety guarantees of [Fragment::split_at_unchecked]. unsafe fn split_token_unchecked(&mut self, bytes: usize, kind: TokenTy) -> Token<'src> { let (token_fragment, new_remaining_fragment) = self.remaining.split_at_unchecked(bytes); @@ -232,32 +192,8 @@ impl<'src> Lexer<'src> { } // Next attempt to match a keyword or identifier. - { - let mut chars: Chars = self.remaining.chars(); - // The unsafe is fine here -- we've established that this lexer has bytes remaining. - let next: char = unsafe { chars.next().unwrap_unchecked() }; - - if is_xid_start(next) || next == '_' { - let mut bytes_consumed: usize = next.len_utf8(); - - // Take remaining chars and add to sum. - bytes_consumed += chars - .take_while(|c| is_xid_continue(*c)) - .map(char::len_utf8) - .sum::(); - - // Split the number of bytes we consumed. - let (ident_frag, new_remaining) = self.remaining.split_at(bytes_consumed); - // Get the token kind to produce for this fragment. - let variant = Lexer::identifier_or_keyword(ident_frag); - // Update the lexers remaining fragment. - self.remaining = new_remaining; - // Return the identifier, keyword, or underscore. - return Some(Token { - variant, - fragment: ident_frag, - }); - } + if let Some(token) = identifier::try_consume_keyword_or_identifier(self) { + return Some(token); } // Next attempt to parse a numerical literal. @@ -313,7 +249,7 @@ impl<'src> Iterator for Lexer<'src> { } fn size_hint(&self) -> (usize, Option) { - // Lexers cannot return multiple tokens for a single byte. + // Lexers should not return multiple tokens for a single byte. (0, Some(self.bytes_remaining())) } } diff --git a/wright/src/parser/lexer/identifier.rs b/wright/src/parser/lexer/identifier.rs new file mode 100644 index 00000000..d950e062 --- /dev/null +++ b/wright/src/parser/lexer/identifier.rs @@ -0,0 +1,78 @@ +//! Implementation related to parsing keywords and identifiers. + +use std::str::Chars; +use unicode_ident::{is_xid_continue, is_xid_start}; +use crate::parser::fragment::Fragment; +use super::{token::Token, Lexer, token::TokenTy}; + +/// Try to match a fragment recognized to be an identifier or keyword to +/// a keyword or return [TokenTy::Identifier]. 
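// A usage sketch (not part of this patch) of the lexer after this refactor, assuming the
// public paths already used by the `wright` binary above (`wright::parser::lexer::Lexer` and
// `token::Token`) and that the `Iterator` impl yields `Token` values as `next_token` does.
// The source string is made up for illustration.
use wright::parser::lexer::{token::Token, Lexer};

fn main() {
    let source = "const x := 0xFF; // trailing comment";

    // Whitespace and the plain trailing comment are skipped; everything else becomes a token.
    let tokens: Vec<Token<'_>> = Lexer::new(source).collect();

    for token in &tokens {
        // The derived Display impl prints the fragment and its variant, e.g. "\"const\" (KwConst)".
        println!("{}", token);
    }
}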
+fn identifier_or_keyword<'src>(fragment: Fragment<'src>) -> TokenTy { + use TokenTy::*; + + match fragment.inner { + "record" => KwRecord, + "type" => KwType, + "enum" => KwEnum, + "union" => KwUnion, + "func" => KwFunc, + "repr" => KwRepr, + "impl" => KwImpl, + "constraint" => KwConstraint, + "references" => KwReferences, + "trait" => KwTrait, + "const" => KwConst, + "where" => KwWhere, + + "use" => KwUse, + "as" => KwAs, + "mod" => KwMod, + + "if" => KwIf, + "else" => KwElse, + + "for" => KwFor, + "in" => KwIn, + "while" => KwWhile, + "loop" => KwLoop, + + "true" => KwTrue, + "false" => KwFalse, + + "_" => Underscore, + + _ => Identifier, + } +} + +/// Attempt to consume a keyword/[identifier](TokenTy::Identifier)/[underscore](TokenTy::Underscore) from the lexer. +pub fn try_consume_keyword_or_identifier<'src>(lexer: &mut Lexer<'src>) -> Option> { + // Get a character iterator that we can pull from. + let mut chars: Chars = lexer.remaining.chars(); + // Get the next character from the iterator, consider it the first char of any potential match. + // Make sure it's a valid identifier start (includes start to all keywords) or is an underscore. + // If it does not exist or match predicates, return None. + let next: char = chars.next().filter(|c| is_xid_start(*c) || *c == '_')?; + // Store/track the number of bytes consumed so far. + let mut bytes_consumed: usize = next.len_utf8(); + + // Take remaining chars and add to sum. + bytes_consumed += chars + .take_while(|c| is_xid_continue(*c)) + .map(char::len_utf8) + .sum::(); + + // Split the token and the new remaining fragment. + // SAFETY: The character iterator should guaruntee that we land on a valid character boundary within the bounds + // of the fragment. + let (token_fragment, new_remaining): (Fragment, Fragment) = unsafe { + lexer.remaining.split_at_unchecked(bytes_consumed) + }; + + // Get the variant of token to produce. + let variant: TokenTy = identifier_or_keyword(token_fragment); + // Update the lexer's remaining fragment. + lexer.remaining = new_remaining; + // Return the token. + return Some(Token { variant, fragment: token_fragment }); +} From 161e660c2b13e80ae1dfe3db25dc87b9cd234681 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Wed, 13 Mar 2024 00:54:51 -0400 Subject: [PATCH 44/60] Refactor integer literals and tests --- wright/src/parser/lexer.rs | 96 ++-------------------- wright/src/parser/lexer/comments.rs | 12 +++ wright/src/parser/lexer/identifier.rs | 13 +++ wright/src/parser/lexer/integer_literal.rs | 56 +++++++++++++ wright/src/parser/lexer/trivial.rs | 28 +++++++ 5 files changed, 115 insertions(+), 90 deletions(-) create mode 100644 wright/src/parser/lexer/integer_literal.rs diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index e9c69019..e9967c11 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -4,17 +4,19 @@ //! defined for tokens. use self::comments::{try_match_block_comment, try_match_single_line_comment}; +use self::integer_literal::try_consume_integer_literal; use super::fragment::Fragment; use std::iter::FusedIterator; use std::str::Chars; -use std::{iter::Peekable, ptr}; +use std::ptr; use token::{Token, TokenTy}; pub mod comments; pub mod token; pub mod trivial; pub mod identifier; +pub mod integer_literal; /// The lexical analyser for wright. This produces a series of tokens that make up the larger elements of the language. 
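// A standalone sketch (not part of this patch) of the scan-then-classify approach
// `try_consume_keyword_or_identifier` takes: accept one XID_Start character (or '_'), extend
// through XID_Continue characters, then look the whole word up in a keyword table. It uses the
// same `unicode-ident` crate the lexer already depends on; `scan_word`, `classify`, and the
// shortened keyword table are illustrative.
use unicode_ident::{is_xid_continue, is_xid_start};

fn scan_word(input: &str) -> Option<&str> {
    let mut chars = input.chars();
    let first = chars.next().filter(|c| is_xid_start(*c) || *c == '_')?;

    // Count bytes so the returned slice always ends on a character boundary.
    let mut len = first.len_utf8();
    len += chars
        .take_while(|c| is_xid_continue(*c))
        .map(char::len_utf8)
        .sum::<usize>();

    Some(&input[..len])
}

fn classify(word: &str) -> &'static str {
    match word {
        "const" => "KwConst",
        "true" => "KwTrue",
        "_" => "Underscore",
        _ => "Identifier",
    }
}

fn main() {
    // "const" scans as one word and the table maps it to a keyword variant.
    let word = scan_word("const TEST").expect("starts with an XID_Start character");
    assert_eq!(word, "const");
    assert_eq!(classify(word), "KwConst");

    // An unknown word falls through to the identifier case.
    assert_eq!(classify("TEST"), "Identifier");

    // A leading digit is not XID_Start, so no identifier or keyword starts here.
    assert_eq!(scan_word("1abc"), None);
}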
#[derive(Debug, Clone, Copy)] @@ -196,41 +198,9 @@ impl<'src> Lexer<'src> { return Some(token); } - // Next attempt to parse a numerical literal. - { - let mut chars: Peekable = self.remaining.chars().peekable(); - // The unsafe is fine here -- we've established that this lexer has bytes remaining. - let next: char = unsafe { chars.next().unwrap_unchecked() }; - - if next.is_ascii_digit() { - // Accumulate the number of bytes consumed in the numeric literal. - let mut acc: usize = 1; - // Track the radix - let mut radix: u32 = 10; - - // Change the radix if necessary - if next == '0' { - if let Some(prefix) = chars.next_if(|x| ['x', 'o', 'b', 'X', 'B'].contains(x)) { - // All the possible prefix chars are 1 byte ascii characters. - acc += 1; - - radix = match prefix { - 'x' | 'X' => 16, - 'b' | 'B' => 2, - 'o' => 8, - _ => unreachable!("the prefix byte is checked above"), - }; - } - } - - // Add the rest of the integer literal. - acc += chars - .take_while(|c| c.is_digit(radix) || *c == '_') - .map(char::len_utf8) - .sum::(); - - return Some(self.split_token(acc, TokenTy::IntegerLiteral)); - } + // Next attempt to parse an integer literal. + if let Some(integer_lit) = try_consume_integer_literal(self) { + return Some(integer_lit); } // If we haven't matched at this point, produce a token marked as "Unknown". @@ -256,57 +226,3 @@ impl<'src> Iterator for Lexer<'src> { // Lexers are fused -- they cannot generate tokens infinitely. impl<'src> FusedIterator for Lexer<'src> {} - -#[cfg(test)] -mod tests { - use super::Lexer; - use crate::parser::lexer::TokenTy; - - #[test] - fn plus_and_plus_eq_tokens() { - let mut plus = Lexer::new("+"); - let mut plus_eq = Lexer::new("+="); - - let plus_token = plus.next_token().unwrap(); - let plus_eq_token = plus_eq.next_token().unwrap(); - - assert_eq!(plus.bytes_remaining(), 0); - assert_eq!(plus_eq.bytes_remaining(), 0); - assert_eq!(plus_token.variant, TokenTy::Plus); - assert_eq!(plus_eq_token.variant, TokenTy::PlusEq); - } - - #[test] - fn plus_one_token() { - let mut plus_one = Lexer::new("+1"); - let plus_token = plus_one.next_token().unwrap(); - assert_eq!(plus_one.bytes_remaining(), 1); - assert_eq!(plus_token.variant, TokenTy::Plus); - assert_eq!(plus_token.fragment.len(), 1); - } - - #[test] - fn identifiers_and_keywords() { - let mut lexer = Lexer::new("const TEST"); - - assert_eq!(lexer.next_token().unwrap().variant, TokenTy::KwConst); - assert_eq!(lexer.next_token().unwrap().variant, TokenTy::Identifier); - } - - #[test] - fn intger_literal() { - let mut lexer = Lexer::new("123_456_789."); - - let token = lexer.next_token().unwrap(); - - assert_eq!(token.fragment.inner, "123_456_789"); - assert_eq!(token.variant, TokenTy::IntegerLiteral); - } - - #[test] - fn ignored_single_line_comment() { - let mut lexer = Lexer::new("// test comment "); - assert!(lexer.next_token().is_none()); - assert_eq!(lexer.remaining.len(), 0); - } -} diff --git a/wright/src/parser/lexer/comments.rs b/wright/src/parser/lexer/comments.rs index dd89856a..bca7e23d 100644 --- a/wright/src/parser/lexer/comments.rs +++ b/wright/src/parser/lexer/comments.rs @@ -131,3 +131,15 @@ pub fn try_match_block_comment(lexer: &Lexer) -> (usize, Option) { (0, None) } + +#[cfg(test)] +mod tests { + use super::Lexer; + + #[test] + fn ignored_single_line_comment() { + let mut lexer = Lexer::new("// test comment "); + assert!(lexer.next_token().is_none()); + assert_eq!(lexer.remaining.len(), 0); + } +} diff --git a/wright/src/parser/lexer/identifier.rs b/wright/src/parser/lexer/identifier.rs 
index d950e062..d3fa29d1 100644 --- a/wright/src/parser/lexer/identifier.rs +++ b/wright/src/parser/lexer/identifier.rs @@ -76,3 +76,16 @@ pub fn try_consume_keyword_or_identifier<'src>(lexer: &mut Lexer<'src>) -> Optio // Return the token. return Some(Token { variant, fragment: token_fragment }); } + +#[cfg(test)] +mod tests { + use super::{Lexer, TokenTy}; + + #[test] + fn identifiers_and_keywords() { + let mut lexer = Lexer::new("const TEST"); + + assert_eq!(lexer.next_token().unwrap().variant, TokenTy::KwConst); + assert_eq!(lexer.next_token().unwrap().variant, TokenTy::Identifier); + } +} diff --git a/wright/src/parser/lexer/integer_literal.rs b/wright/src/parser/lexer/integer_literal.rs new file mode 100644 index 00000000..2479565a --- /dev/null +++ b/wright/src/parser/lexer/integer_literal.rs @@ -0,0 +1,56 @@ +//! Implementation for lexing integer literals. + +use std::{iter::Peekable, str::Chars}; +use super::{token::{Token, TokenTy}, Lexer}; + +/// Attempt to lex and consume an [TokenTy::IntegerLiteral] from the lexer. +pub fn try_consume_integer_literal<'src>(lexer: &mut Lexer<'src>) -> Option> { + // Make a peekable character iterator. + let mut chars: Peekable = lexer.remaining.chars().peekable(); + // Get the first character from the iterator. We can only continue lexing if one exists and is an ascii + // decimal digit. + let next: char = chars.next().filter(char::is_ascii_digit)?; + // Track the number of bytes consumed. We use the length of the parsed first char here but we could probably + // assume it to be 1. + let mut bytes_consumed: usize = next.len_utf8(); + // Track the radix + let mut radix: u32 = 10; + + // Change the radix if necessary + if next == '0' { + if let Some(prefix) = chars.next_if(|x| ['x', 'o', 'b', 'X', 'B'].contains(x)) { + // All the possible prefix chars are 1 byte ascii characters. + bytes_consumed += 1; + + radix = match prefix { + 'x' | 'X' => 16, + 'b' | 'B' => 2, + 'o' => 8, + _ => unreachable!("the prefix byte is checked above"), + }; + } + } + + // Add the rest of the integer literal. + bytes_consumed += chars + .take_while(|c| c.is_digit(radix) || *c == '_') + .map(char::len_utf8) + .sum::(); + + return Some(lexer.split_token(bytes_consumed, TokenTy::IntegerLiteral)); +} + +#[cfg(test)] +mod tests { + use super::{TokenTy, Lexer}; + + #[test] + fn integer_literal() { + let mut lexer = Lexer::new("123_456_789."); + + let token = lexer.next_token().unwrap(); + + assert_eq!(token.fragment.inner, "123_456_789"); + assert_eq!(token.variant, TokenTy::IntegerLiteral); + } +} diff --git a/wright/src/parser/lexer/trivial.rs b/wright/src/parser/lexer/trivial.rs index 1e2f52a1..a0c2445f 100644 --- a/wright/src/parser/lexer/trivial.rs +++ b/wright/src/parser/lexer/trivial.rs @@ -105,3 +105,31 @@ pub fn try_consume_trivial_token<'src>(lexer: &mut Lexer<'src>) -> Option Date: Wed, 13 Mar 2024 00:55:28 -0400 Subject: [PATCH 45/60] satisfy clippy --- wright/src/parser/lexer/identifier.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wright/src/parser/lexer/identifier.rs b/wright/src/parser/lexer/identifier.rs index d3fa29d1..fc0cffdc 100644 --- a/wright/src/parser/lexer/identifier.rs +++ b/wright/src/parser/lexer/identifier.rs @@ -7,7 +7,7 @@ use super::{token::Token, Lexer, token::TokenTy}; /// Try to match a fragment recognized to be an identifier or keyword to /// a keyword or return [TokenTy::Identifier]. 
-fn identifier_or_keyword<'src>(fragment: Fragment<'src>) -> TokenTy { +fn identifier_or_keyword(fragment: Fragment) -> TokenTy { use TokenTy::*; match fragment.inner { @@ -74,7 +74,7 @@ pub fn try_consume_keyword_or_identifier<'src>(lexer: &mut Lexer<'src>) -> Optio // Update the lexer's remaining fragment. lexer.remaining = new_remaining; // Return the token. - return Some(Token { variant, fragment: token_fragment }); + Some(Token { variant, fragment: token_fragment }) } #[cfg(test)] From 3ce43b9bc8380bf0989fc45c4482f1285744fed9 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Wed, 13 Mar 2024 00:56:10 -0400 Subject: [PATCH 46/60] cargo fmt --- wright/src/parser/lexer.rs | 6 ++-- wright/src/parser/lexer/comments.rs | 2 +- wright/src/parser/lexer/identifier.rs | 34 ++++++++++++---------- wright/src/parser/lexer/integer_literal.rs | 19 +++++++----- 4 files changed, 33 insertions(+), 28 deletions(-) diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index e9967c11..dd5ddf1c 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -8,15 +8,15 @@ use self::integer_literal::try_consume_integer_literal; use super::fragment::Fragment; use std::iter::FusedIterator; -use std::str::Chars; use std::ptr; +use std::str::Chars; use token::{Token, TokenTy}; pub mod comments; -pub mod token; -pub mod trivial; pub mod identifier; pub mod integer_literal; +pub mod token; +pub mod trivial; /// The lexical analyser for wright. This produces a series of tokens that make up the larger elements of the language. #[derive(Debug, Clone, Copy)] diff --git a/wright/src/parser/lexer/comments.rs b/wright/src/parser/lexer/comments.rs index bca7e23d..cd1559f4 100644 --- a/wright/src/parser/lexer/comments.rs +++ b/wright/src/parser/lexer/comments.rs @@ -135,7 +135,7 @@ pub fn try_match_block_comment(lexer: &Lexer) -> (usize, Option) { #[cfg(test)] mod tests { use super::Lexer; - + #[test] fn ignored_single_line_comment() { let mut lexer = Lexer::new("// test comment "); diff --git a/wright/src/parser/lexer/identifier.rs b/wright/src/parser/lexer/identifier.rs index fc0cffdc..b1df9533 100644 --- a/wright/src/parser/lexer/identifier.rs +++ b/wright/src/parser/lexer/identifier.rs @@ -1,9 +1,9 @@ -//! Implementation related to parsing keywords and identifiers. +//! Implementation related to parsing keywords and identifiers. +use super::{token::Token, token::TokenTy, Lexer}; +use crate::parser::fragment::Fragment; use std::str::Chars; use unicode_ident::{is_xid_continue, is_xid_start}; -use crate::parser::fragment::Fragment; -use super::{token::Token, Lexer, token::TokenTy}; /// Try to match a fragment recognized to be an identifier or keyword to /// a keyword or return [TokenTy::Identifier]. @@ -47,11 +47,11 @@ fn identifier_or_keyword(fragment: Fragment) -> TokenTy { /// Attempt to consume a keyword/[identifier](TokenTy::Identifier)/[underscore](TokenTy::Underscore) from the lexer. pub fn try_consume_keyword_or_identifier<'src>(lexer: &mut Lexer<'src>) -> Option> { - // Get a character iterator that we can pull from. + // Get a character iterator that we can pull from. let mut chars: Chars = lexer.remaining.chars(); // Get the next character from the iterator, consider it the first char of any potential match. - // Make sure it's a valid identifier start (includes start to all keywords) or is an underscore. - // If it does not exist or match predicates, return None. + // Make sure it's a valid identifier start (includes start to all keywords) or is an underscore. 
+ // If it does not exist or match predicates, return None. let next: char = chars.next().filter(|c| is_xid_start(*c) || *c == '_')?; // Store/track the number of bytes consumed so far. let mut bytes_consumed: usize = next.len_utf8(); @@ -62,19 +62,21 @@ pub fn try_consume_keyword_or_identifier<'src>(lexer: &mut Lexer<'src>) -> Optio .map(char::len_utf8) .sum::(); - // Split the token and the new remaining fragment. - // SAFETY: The character iterator should guarantee that we land on a valid character boundary within the bounds - // of the fragment. - let (token_fragment, new_remaining): (Fragment, Fragment) = unsafe { - lexer.remaining.split_at_unchecked(bytes_consumed) - }; + // Split the token and the new remaining fragment. + // SAFETY: The character iterator should guarantee that we land on a valid character boundary within the bounds + // of the fragment. + let (token_fragment, new_remaining): (Fragment, Fragment) = + unsafe { lexer.remaining.split_at_unchecked(bytes_consumed) }; - // Get the variant of token to produce. + // Get the variant of token to produce. let variant: TokenTy = identifier_or_keyword(token_fragment); - // Update the lexer's remaining fragment. + // Update the lexer's remaining fragment. lexer.remaining = new_remaining; - // Return the token. - Some(Token { variant, fragment: token_fragment }) + // Return the token. + Some(Token { + variant, + fragment: token_fragment, + }) } #[cfg(test)] diff --git a/wright/src/parser/lexer/integer_literal.rs b/wright/src/parser/lexer/integer_literal.rs index 2479565a..435bdd40 100644 --- a/wright/src/parser/lexer/integer_literal.rs +++ b/wright/src/parser/lexer/integer_literal.rs @@ -1,17 +1,20 @@ -//! Implementation for lexing integer literals. +//! Implementation for lexing integer literals. +use super::{ + token::{Token, TokenTy}, + Lexer, +}; use std::{iter::Peekable, str::Chars}; -use super::{token::{Token, TokenTy}, Lexer}; /// Attempt to lex and consume an [TokenTy::IntegerLiteral] from the lexer. pub fn try_consume_integer_literal<'src>(lexer: &mut Lexer<'src>) -> Option> { - // Make a peekable character iterator. + // Make a peekable character iterator. let mut chars: Peekable = lexer.remaining.chars().peekable(); - // Get the first character from the iterator. We can only continue lexing if one exists and is an ascii - // decimal digit. + // Get the first character from the iterator. We can only continue lexing if one exists and is an ascii + // decimal digit. let next: char = chars.next().filter(char::is_ascii_digit)?; - // Track the number of bytes consumed. We use the length of the parsed first char here but we could probably - // assume it to be 1. + // Track the number of bytes consumed. We use the length of the parsed first char here but we could probably + // assume it to be 1.
let mut bytes_consumed: usize = next.len_utf8(); // Track the radix let mut radix: u32 = 10; @@ -42,7 +45,7 @@ pub fn try_consume_integer_literal<'src>(lexer: &mut Lexer<'src>) -> Option Date: Wed, 13 Mar 2024 01:05:28 -0400 Subject: [PATCH 47/60] Add new token types and make some lexer fns pub --- wright/src/parser/lexer.rs | 15 +++++++-------- wright/src/parser/lexer/token.rs | 7 +++++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index dd5ddf1c..2e13436d 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -68,9 +68,8 @@ impl<'src> Lexer<'src> { } /// "Fork" this lexer, creating a new [`Lexer`] at the same position as this one that can be used for - /// failable parsing. This can be compared to the original lexer it was forked from using [Lexer::offset_from] - /// on the underlying `remaining` fragments. - fn fork(&self) -> Self { + /// failable parsing. This can be compared to the original lexer it was forked from using [Lexer::offset_from]. + pub fn fork(&self) -> Self { *self } @@ -82,12 +81,12 @@ impl<'src> Lexer<'src> { /// - Generally the best way to avoid panics is to only call this function on /// [Lexer]s created using [Lexer::fork] on the `origin` lexer. #[inline] - fn offset_from(&self, origin: &Self) -> usize { + pub fn offset_from(&self, origin: &Self) -> usize { self.remaining.offset_from(&origin.remaining) } - /// Remove and ignore any whitespace at the start of the remaining fragment. - fn ignore_whitespace(&mut self) { + /// Remove and ignore any whitespace at the start of the [Lexer::remaining] [Fragment]. + pub fn ignore_whitespace(&mut self) { // Get a reference to the slice of the string past any whitespace at the start. let without_whitespace: &str = self.remaining.inner.trim_start(); @@ -97,8 +96,8 @@ impl<'src> Lexer<'src> { } } - /// Check if a pattern matches at the start of the remaining fragment, and if so return the number of bytes. - fn matches(&self, pattern: &str) -> bool { + /// Check if a pattern matches at the start of the [Lexer::remaining] [Fragment]. + pub fn matches(&self, pattern: &str) -> bool { self.remaining.inner.starts_with(pattern) } diff --git a/wright/src/parser/lexer/token.rs b/wright/src/parser/lexer/token.rs index f6a9f9e8..6749aa84 100644 --- a/wright/src/parser/lexer/token.rs +++ b/wright/src/parser/lexer/token.rs @@ -55,6 +55,8 @@ pub enum TokenTy { InnerDocComment, InnerBlockDocComment, /// Indicates a block style comment without termination. + /// Separate from [TokenTy::InnerDocComment] and [TokenTy::OuterDocComment] to indicate that + /// unterminated comments will be handled differently (produce errors eventually). UnterminatedBlockComment, KwRecord, @@ -82,8 +84,9 @@ pub enum TokenTy { KwWhere, IntegerLiteral, - StringLiteral, - CharLiteral, + StringLiteral { terminated: bool }, + FormatStringLiteral { terminated: bool }, + CharLiteral { terminated: bool }, /// Unknown character in lexer fragment. 
Unknown From 07813bc2d017b8960f5da4f2abed7894aca756a6 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Wed, 13 Mar 2024 01:27:52 -0400 Subject: [PATCH 48/60] String/char literals --- wright/src/parser/lexer.rs | 7 ++++ wright/src/parser/lexer/quoted.rs | 65 +++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 wright/src/parser/lexer/quoted.rs diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index 2e13436d..5d502f52 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -5,6 +5,7 @@ use self::comments::{try_match_block_comment, try_match_single_line_comment}; use self::integer_literal::try_consume_integer_literal; +use self::quoted::try_consume_quoted_literal; use super::fragment::Fragment; use std::iter::FusedIterator; @@ -17,6 +18,7 @@ pub mod identifier; pub mod integer_literal; pub mod token; pub mod trivial; +pub mod quoted; /// The lexical analyser for wright. This produces a series of tokens that make up the larger elements of the language. #[derive(Debug, Clone, Copy)] @@ -202,6 +204,11 @@ impl<'src> Lexer<'src> { return Some(integer_lit); } + // Next attempt to parse a quoted literal. + if let Some(quoted_lit) = try_consume_quoted_literal(self) { + return Some(quoted_lit); + } + // If we haven't matched at this point, produce a token marked as "Unknown". // The unsafe is fine -- we know from above that there are remaining characters. let unknown_char = unsafe { self.remaining.chars().next().unwrap_unchecked() }; diff --git a/wright/src/parser/lexer/quoted.rs b/wright/src/parser/lexer/quoted.rs new file mode 100644 index 00000000..2318e0db --- /dev/null +++ b/wright/src/parser/lexer/quoted.rs @@ -0,0 +1,65 @@ +//! Quoted literals. + +use std::str::Chars; +use super::{token::Token, Lexer, token::TokenTy}; + +/// Attempt to parse a quoted literal. This includes [TokenTy::StringLiteral], [TokenTy::CharLiteral], and +/// [TokenTy::FormatStringLiteral]. +pub fn try_consume_quoted_literal<'src>(lexer: &mut Lexer<'src>) -> Option> { + // Make a chars iterator to lex from. + let mut chars: Chars = lexer.remaining.chars(); + // Get the first char from the character iterator. + // Return none if the first character doesn't exist or is not one of the quote terminating characters. + let first: char = chars.next().filter(|c| ['\'', '"', '`'].contains(c))?; + // Track number of bytes consumed. + let mut bytes_consumed: usize = first.len_utf8(); + // Track whether the quoted literal is terminated. + let mut is_terminated: bool = false; + + // Consume from the iterator while possible. + while let Some(consumed) = chars.next() { + // Update the number of bytes consumed. + bytes_consumed += consumed.len_utf8(); + + // Check if the character matches the starting char. + // If so, record the literal as terminated and break this loop. + if consumed == first { + is_terminated = true; + break; + } + + // If the character we just consumed is a backslash. + // We only handle escaped terminators here, rather than parsing actual meaning. + // Consume the next character if there is one, regardless of what it is. + // This prevents an escaped terminator from ending the literal. + if consumed == '\\' { + // If there is no next char, do not add anything to the number of bytes consumed. + bytes_consumed += chars.next().map(char::len_utf8).unwrap_or(0); + } + } + + // Return when we have either reached a terminator or run out of characters. + // First determine the variant to return. 
+ let variant: TokenTy = match first { + '\'' => TokenTy::CharLiteral { terminated: is_terminated }, + '\"' => TokenTy::StringLiteral { terminated: is_terminated }, + '`' => TokenTy::FormatStringLiteral { terminated: is_terminated }, + _ => unreachable!("There are no other quoted literals"), + }; + + // SAFETY: Summing char lengths from the iterator should never give us an invalid or out of bounds index. + Some(unsafe { lexer.split_token_unchecked(bytes_consumed, variant) }) +} + +#[cfg(test)] +mod tests { + use crate::parser::lexer::{token::TokenTy, Lexer}; + + #[test] + fn string_literal() { + let mut lexer = Lexer::new(r#" "Test string literal" "#); + let token = lexer.next_token().unwrap(); + assert_eq!(token.variant, TokenTy::StringLiteral { terminated: true }); + assert_eq!(token.fragment.inner, "\"Test string literal\""); + } +} From 955411a54f588594c7f85991bd9776b7452122ff Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Wed, 13 Mar 2024 01:28:48 -0400 Subject: [PATCH 49/60] cargo fmt --- wright/src/parser/lexer.rs | 4 +-- wright/src/parser/lexer/quoted.rs | 55 ++++++++++++++++++------------- 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/wright/src/parser/lexer.rs b/wright/src/parser/lexer.rs index 5d502f52..387b6d40 100644 --- a/wright/src/parser/lexer.rs +++ b/wright/src/parser/lexer.rs @@ -16,9 +16,9 @@ use token::{Token, TokenTy}; pub mod comments; pub mod identifier; pub mod integer_literal; +pub mod quoted; pub mod token; pub mod trivial; -pub mod quoted; /// The lexical analyser for wright. This produces a series of tokens that make up the larger elements of the language. #[derive(Debug, Clone, Copy)] @@ -204,7 +204,7 @@ impl<'src> Lexer<'src> { return Some(integer_lit); } - // Next attempt to parse a quoted literal. + // Next attempt to parse a quoted literal. if let Some(quoted_lit) = try_consume_quoted_literal(self) { return Some(quoted_lit); } diff --git a/wright/src/parser/lexer/quoted.rs b/wright/src/parser/lexer/quoted.rs index 2318e0db..b26f0481 100644 --- a/wright/src/parser/lexer/quoted.rs +++ b/wright/src/parser/lexer/quoted.rs @@ -1,53 +1,62 @@ -//! Quoted literals. +//! Lexing implementation for quoted literals. +use super::{token::Token, token::TokenTy, Lexer}; use std::str::Chars; -use super::{token::Token, Lexer, token::TokenTy}; -/// Attempt to parse a quoted literal. This includes [TokenTy::StringLiteral], [TokenTy::CharLiteral], and -/// [TokenTy::FormatStringLiteral]. +/// Attempt to parse a quoted literal. This includes [TokenTy::StringLiteral], [TokenTy::CharLiteral], and +/// [TokenTy::FormatStringLiteral]. pub fn try_consume_quoted_literal<'src>(lexer: &mut Lexer<'src>) -> Option> { // Make a chars iterator to lex from. let mut chars: Chars = lexer.remaining.chars(); - // Get the first char from the character iterator. - // Return none if the first character doesn't exist or is not one of the quote terminating characters. + // Get the first char from the character iterator. + // Return none if the first character doesn't exist or is not one of the quote terminating characters. let first: char = chars.next().filter(|c| ['\'', '"', '`'].contains(c))?; // Track number of bytes consumed. let mut bytes_consumed: usize = first.len_utf8(); - // Track whether the quoted literal is terminated. + // Track whether the quoted literal is terminated. let mut is_terminated: bool = false; - // Consume from the iterator while possible. + // Consume from the iterator while possible. 
while let Some(consumed) = chars.next() { - // Update the number of bytes consumed. + // Update the number of bytes consumed. bytes_consumed += consumed.len_utf8(); - // Check if the character matches the starting char. - // If so, record the literal as terminated and break this loop. + // Check if the character matches the starting char. + // If so, record the literal as terminated and break this loop. if consumed == first { is_terminated = true; break; } - // If the character we just consumed is a backslash. - // We only handle escaped terminators here, rather than parsing actual meaning. - // Consume the next character if there is one, regardless of what it is. - // This prevents an escaped terminator from ending the literal. + // If the character we just consumed is a backslash. + // We only handle escaped terminators here, rather than parsing actual meaning. + // Consume the next character if there is one, regardless of what it is. + // This prevents an escaped terminator from ending the literal. if consumed == '\\' { - // If there is no next char, do not add anything to the number of bytes consumed. + // If there is no next char, do not add anything to the number of bytes consumed. bytes_consumed += chars.next().map(char::len_utf8).unwrap_or(0); } } - // Return when we have either reached a terminator or run out of characters. - // First determine the variant to return. + // Return when we have either reached a terminator or run out of characters. + // First determine the variant to return. let variant: TokenTy = match first { - '\'' => TokenTy::CharLiteral { terminated: is_terminated }, - '\"' => TokenTy::StringLiteral { terminated: is_terminated }, - '`' => TokenTy::FormatStringLiteral { terminated: is_terminated }, + '\'' => TokenTy::CharLiteral { + terminated: is_terminated, + }, + + '\"' => TokenTy::StringLiteral { + terminated: is_terminated, + }, + + '`' => TokenTy::FormatStringLiteral { + terminated: is_terminated, + }, + _ => unreachable!("There are no other quoted literals"), }; - // SAFETY: Summing char lengths from the iterator should never give us an invalid or out of bounds index. + // SAFETY: Summing char lengths from the iterator should never give us an invalid or out of bounds index. 
Some(unsafe { lexer.split_token_unchecked(bytes_consumed, variant) }) } @@ -55,7 +64,7 @@ pub fn try_consume_quoted_literal<'src>(lexer: &mut Lexer<'src>) -> Option Date: Sat, 16 Mar 2024 01:05:36 -0400 Subject: [PATCH 50/60] Add codecov.io --- .github/workflows/codecov-io.yml | 29 +++++++++++++++++++++++++++++ wright/src/parser/ast.rs | 1 + 2 files changed, 30 insertions(+) create mode 100644 .github/workflows/codecov-io.yml diff --git a/.github/workflows/codecov-io.yml b/.github/workflows/codecov-io.yml new file mode 100644 index 00000000..890cd8b8 --- /dev/null +++ b/.github/workflows/codecov-io.yml @@ -0,0 +1,29 @@ +on: ["push", "pull_request"] + +name: codecov.io Code Coverage +jobs: + coverage: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - name: Install LLVM + # See: https://apt.llvm.org/ + # Last line: https://gitlab.com/taricorp/llvm-sys.rs/-/issues/13 + run: | + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 17 + sudo apt install libpolly-17-dev libz-dev + - uses: actions-rs/cargo@v1 + with: + command: test + args: --all-features --no-fail-fast + env: + RUSTFLAGS: '-C instrument-coverage' + LLVM_PROFILE_FILE: 'cargo-test-%p-%m.profraw' + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v4.0.1 + with: + token: ${{ secrets.CODECOV_TOKEN }} + slug: vcfxb/wright-lang + diff --git a/wright/src/parser/ast.rs b/wright/src/parser/ast.rs index 9b7bc8c1..fdd36655 100644 --- a/wright/src/parser/ast.rs +++ b/wright/src/parser/ast.rs @@ -1 +1,2 @@ //! Abstract syntax tree representation for Wright source code. + From 839d1a74878231a96c34402be0022734433792bc Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 16 Mar 2024 01:17:10 -0400 Subject: [PATCH 51/60] Fix codecov.io --- .github/workflows/codecov-io.yml | 34 +++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/.github/workflows/codecov-io.yml b/.github/workflows/codecov-io.yml index 890cd8b8..c880abcb 100644 --- a/.github/workflows/codecov-io.yml +++ b/.github/workflows/codecov-io.yml @@ -1,4 +1,13 @@ -on: ["push", "pull_request"] +on: + push: + branches: + # https://stackoverflow.com/questions/64635032/github-actions-run-on-push-to-all-branches + - "**" + pull_request: + branches: + - "main" +env: + CARGO_TERM_COLOR: always name: codecov.io Code Coverage jobs: @@ -6,6 +15,7 @@ jobs: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 + - name: Install LLVM # See: https://apt.llvm.org/ # Last line: https://gitlab.com/taricorp/llvm-sys.rs/-/issues/13 @@ -14,16 +24,26 @@ jobs: chmod +x llvm.sh sudo ./llvm.sh 17 sudo apt install libpolly-17-dev libz-dev - - uses: actions-rs/cargo@v1 + + - uses: actions-rs/toolchain@v1 with: - command: test - args: --all-features --no-fail-fast + toolchain: nightly + override: true + + - name: Run tests + run: cargo test --verbose env: - RUSTFLAGS: '-C instrument-coverage' - LLVM_PROFILE_FILE: 'cargo-test-%p-%m.profraw' + CARGO_INCREMENTAL: '0' + RUSTFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' + RUSTDOCFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' + + - name: rust-grcov + # You may pin to the exact commit or the version. 
+ # uses: actions-rs/grcov@bb47b1ed7883a1502fa6875d562727ace2511248 + uses: actions-rs/grcov@v0.1 + - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v4.0.1 with: token: ${{ secrets.CODECOV_TOKEN }} slug: vcfxb/wright-lang - From 754d6de59fbddd77b4f1dc0a8e6513149339974b Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 16 Mar 2024 01:18:24 -0400 Subject: [PATCH 52/60] Activate for non-main branches --- .github/workflows/codecov-io.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/codecov-io.yml b/.github/workflows/codecov-io.yml index c880abcb..c3e2c2e9 100644 --- a/.github/workflows/codecov-io.yml +++ b/.github/workflows/codecov-io.yml @@ -1,8 +1,6 @@ on: push: branches: - # https://stackoverflow.com/questions/64635032/github-actions-run-on-push-to-all-branches - - "**" pull_request: branches: - "main" From d5fabc2eae9b1f0da95196645d3bcc75a7ced255 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 16 Mar 2024 01:19:05 -0400 Subject: [PATCH 53/60] activate for all branches --- .github/workflows/codecov-io.yml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/codecov-io.yml b/.github/workflows/codecov-io.yml index c3e2c2e9..b69c0243 100644 --- a/.github/workflows/codecov-io.yml +++ b/.github/workflows/codecov-io.yml @@ -1,9 +1,5 @@ -on: - push: - branches: - pull_request: - branches: - - "main" +on: ["push", "pull_request"] + env: CARGO_TERM_COLOR: always From 0554dec01c830453753e30c99ba8a98c4abee199 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 16 Mar 2024 01:20:10 -0400 Subject: [PATCH 54/60] third times a charm --- .github/workflows/codecov-io.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codecov-io.yml b/.github/workflows/codecov-io.yml index b69c0243..0efcda05 100644 --- a/.github/workflows/codecov-io.yml +++ b/.github/workflows/codecov-io.yml @@ -1,5 +1,9 @@ -on: ["push", "pull_request"] - +on: + push: + branches: + pull_request: + branches: + - "main" env: CARGO_TERM_COLOR: always @@ -18,8 +22,8 @@ jobs: chmod +x llvm.sh sudo ./llvm.sh 17 sudo apt install libpolly-17-dev libz-dev - - - uses: actions-rs/toolchain@v1 + + - uses: actions-rs/toolchain@v1 with: toolchain: nightly override: true From facde2ca7a32bb96e8650d4d0513a9fdee9aa728 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 16 Mar 2024 01:27:32 -0400 Subject: [PATCH 55/60] Add codecov badge to readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 23d24c05..00f10754 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,8 @@ | Cargo Check Status | ![Cargo Check status](https://github.com/vcfxb/wright-lang/actions/workflows/cargo-check.yml/badge.svg?branch=master) | | Cargo Test Status | ![Cargo Test status](https://github.com/vcfxb/wright-lang/actions/workflows/cargo-test.yml/badge.svg?branch=master) | | Cargo Clippy Status | ![Cargo Clippy status](https://github.com/vcfxb/wright-lang/actions/workflows/cargo-clippy.yml/badge.svg?branch=master) | -| Code Coverage | [![Coverage Status](https://coveralls.io/repos/github/vcfxb/wright-lang/badge.svg?branch=master&kill_cache=1)](https://coveralls.io/github/vcfxb/wright-lang?branch=master) | +| Code Coverage (Coveralls) | [![Coverage Status](https://coveralls.io/repos/github/vcfxb/wright-lang/badge.svg?branch=master&kill_cache=1)](https://coveralls.io/github/vcfxb/wright-lang?branch=master) | +| Code Coverage (Codecov.io) | 
[![codecov](https://codecov.io/github/vcfxb/wright-lang/graph/badge.svg?token=HO07JEYMIH)](https://codecov.io/github/vcfxb/wright-lang) | Docs.rs | [![Documentation](https://docs.rs/wright/badge.svg)](https://docs.rs/wright) | | Crates.io | [![Crates.io](https://img.shields.io/crates/v/wright.svg)](https://crates.io/crates/wright) | | GitHub release | [![GitHub release](https://img.shields.io/github/release/vcfxb/wright-lang.svg)](https://github.com/vcfxb/wright-lang/releases) | From 9d69484e503458478686456660a835cb9e374f34 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 16 Mar 2024 01:33:32 -0400 Subject: [PATCH 56/60] Update coveralls coverage CI --- .github/actions-rs/grcov.yml | 6 ------ .github/workflows/grcov.yml | 12 +++++------- 2 files changed, 5 insertions(+), 13 deletions(-) delete mode 100644 .github/actions-rs/grcov.yml diff --git a/.github/actions-rs/grcov.yml b/.github/actions-rs/grcov.yml deleted file mode 100644 index 97314290..00000000 --- a/.github/actions-rs/grcov.yml +++ /dev/null @@ -1,6 +0,0 @@ -branch: true -output-type: lcov -output-file: ./lcov.info -ignore-not-existing: true -ignore: - - "/*" diff --git a/.github/workflows/grcov.yml b/.github/workflows/grcov.yml index a098cbbe..e258cf08 100644 --- a/.github/workflows/grcov.yml +++ b/.github/workflows/grcov.yml @@ -1,6 +1,6 @@ on: ["push", "pull_request"] -name: Code Coverage +name: coveralls Code Coverage jobs: coverage: @@ -19,18 +19,16 @@ jobs: with: toolchain: nightly override: true - - uses: actions-rs/cargo@v1 - with: - command: test - args: --all-features --no-fail-fast + - name: Run tests + run: cargo test --verbose env: CARGO_INCREMENTAL: '0' RUSTFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' RUSTDOCFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' - - id: coverage + - name: rust-grcov uses: actions-rs/grcov@v0.1 - name: Coveralls upload uses: coverallsapp/github-action@master with: - github-token: ${{secrets.GITHUB_TOKEN}} + github-token: ${{ secrets.GITHUB_TOKEN }} path-to-lcov: ${{ steps.coverage.outputs.report }} From 000f8e06f2c9a04e2ef3fcb52b32c960defc6c63 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 16 Mar 2024 01:38:52 -0400 Subject: [PATCH 57/60] Needed id for coverage step --- .github/workflows/grcov.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/grcov.yml b/.github/workflows/grcov.yml index e258cf08..979fc7cb 100644 --- a/.github/workflows/grcov.yml +++ b/.github/workflows/grcov.yml @@ -26,6 +26,7 @@ jobs: RUSTFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' RUSTDOCFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests' - name: rust-grcov + id: coverage uses: actions-rs/grcov@v0.1 - name: Coveralls upload uses: coverallsapp/github-action@master From 735cfd42fd8849821e1179d1a2e234fdb4048222 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 16 Mar 2024 01:43:04 -0400 Subject: [PATCH 58/60] Start stripping out old unused code --- README.md | 2 +- wright/src/parser/old/lexer.rs | 500 -------------------- wright/src/parser/old/lexer/definition.rs | 72 --- wright/src/parser/old/lexer/pretty_print.rs | 176 ------- wright/src/parser/old/lexer/tokens.rs | 189 -------- 5 files changed, 1 insertion(+), 938 deletions(-) delete 
mode 100644 wright/src/parser/old/lexer.rs delete mode 100644 wright/src/parser/old/lexer/definition.rs delete mode 100644 wright/src/parser/old/lexer/pretty_print.rs delete mode 100644 wright/src/parser/old/lexer/tokens.rs diff --git a/README.md b/README.md index 00f10754..e250037d 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ | Cargo Test Status | ![Cargo Test status](https://github.com/vcfxb/wright-lang/actions/workflows/cargo-test.yml/badge.svg?branch=master) | | Cargo Clippy Status | ![Cargo Clippy status](https://github.com/vcfxb/wright-lang/actions/workflows/cargo-clippy.yml/badge.svg?branch=master) | | Code Coverage (Coveralls) | [![Coverage Status](https://coveralls.io/repos/github/vcfxb/wright-lang/badge.svg?branch=master&kill_cache=1)](https://coveralls.io/github/vcfxb/wright-lang?branch=master) | -| Code Coverage (Codecov.io) | [![codecov](https://codecov.io/github/vcfxb/wright-lang/graph/badge.svg?token=HO07JEYMIH)](https://codecov.io/github/vcfxb/wright-lang) +| Code Coverage (Codecov.io) | [![codecov](https://codecov.io/github/vcfxb/wright-lang/graph/badge.svg?token=HO07JEYMIH)](https://codecov.io/github/vcfxb/wright-lang) | | Docs.rs | [![Documentation](https://docs.rs/wright/badge.svg)](https://docs.rs/wright) | | Crates.io | [![Crates.io](https://img.shields.io/crates/v/wright.svg)](https://crates.io/crates/wright) | | GitHub release | [![GitHub release](https://img.shields.io/github/release/vcfxb/wright-lang.svg)](https://github.com/vcfxb/wright-lang/releases) | diff --git a/wright/src/parser/old/lexer.rs b/wright/src/parser/old/lexer.rs deleted file mode 100644 index 051313dc..00000000 --- a/wright/src/parser/old/lexer.rs +++ /dev/null @@ -1,500 +0,0 @@ -//! The wright lexer. This module is responsible for lexical analysis and initial processing of source code. -//! -//! This is implemented here using an iterator that looks up the next character from the input using a `const`-defined -//! lexer structure definition. This can be found in [definition]. - -pub mod tokens; -mod definition; -// mod pretty_print; - -use std::{ - iter::{FusedIterator, Peekable}, - str::CharIndices, -}; - -use self::tokens::{CommentTy, Token, TokenTy}; - -/// Lexical analyzer for wright code. This struct host functions that produce tokens from wright source. -#[derive(Debug, Clone)] -pub struct Lexer<'a> { - /// Iterator over the indexed input characters tied to the lifetime of the source code. - iterator: Peekable>, - /// The source code passed to the lexer. This is used to check for keywords. - source: &'a str, -} - -impl<'a> Lexer<'a> { - /// Create a new lexer that iterates on a given source string. - pub fn new(source: &'a str) -> Self { - Lexer { - iterator: source.char_indices().peekable(), - source, - } - } -} - -impl<'a> Iterator for Lexer<'a> { - type Item = Token; - - fn next(&mut self) -> Option { - // Get the next character out of the iterator. - let (start_index, next) = self.iterator.next()?; - - // Handle single character tokens first. - let single_char_tokens = [ - - ]; - - for (c, variant) in single_char_tokens { - if next == c { - return Some(Token { variant, length: 1 }); - } - } - - // Next handle tokens that can possibly be followed by an equal sign. 
- let possible_eq_upgrades = [ - ('!', TokenTy::Bang, TokenTy::BangEq), - ('%', TokenTy::Mod, TokenTy::ModEq), - ('^', TokenTy::Xor, TokenTy::XorEq), - ('*', TokenTy::Star, TokenTy::StarEq), - ('+', TokenTy::Plus, TokenTy::PlusEq), - ]; - - for (c, no_eq, with_eq) in possible_eq_upgrades { - if next == c { - return match self.iterator.next_if(|&(_, x)| x == '=') { - Some(_) => Some(Token { - variant: with_eq, - length: 2, - }), - None => Some(Token { - variant: no_eq, - length: 1, - }), - }; - } - } - - // Next handle tokens that can be doubled or have an equals sign. - let possible_eq_or_double = [ - ('&', TokenTy::And, TokenTy::AndEq, TokenTy::AndAnd), - ('|', TokenTy::Or, TokenTy::OrEq, TokenTy::OrOr), - ('<', TokenTy::Lt, TokenTy::LtEq, TokenTy::ShiftLeft), - ('>', TokenTy::Gt, TokenTy::GtEq, TokenTy::ShiftRight), - (':', TokenTy::Colon, TokenTy::ColonEq, TokenTy::ColonColon), - ('/', TokenTy::Div, TokenTy::DivEq, TokenTy::DivDiv), - ]; - - for (c, alone, with_eq, doubled) in possible_eq_or_double { - if next == c { - return match self.iterator.next_if(|&(_, x)| x == '=' || x == c) { - // Followed by `=` - Some((_, '=')) => Some(Token { - variant: with_eq, - length: 2, - }), - - // Followed by itself. - Some(_) => Some(Token { - variant: doubled, - length: 2, - }), - - // Single char token - None => Some(Token { - variant: alone, - length: 1, - }), - }; - } - } - - // Next deal with arrows - let arrows = [ - ('-', TokenTy::Minus, TokenTy::MinusEq, TokenTy::SingleArrow), - ('=', TokenTy::Eq, TokenTy::EqEq, TokenTy::DoubleArrow), - ('~', TokenTy::Tilde, TokenTy::TildeEq, TokenTy::TildeArrow), - ]; - - for (c, alone, with_eq, as_arrow) in arrows { - if next == c { - return match self.iterator.next_if(|&(_, x)| x == '=' || x == '>') { - Some((_, '=')) => Some(Token { - variant: with_eq, - length: 2, - }), - Some((_, '>')) => Some(Token { - variant: as_arrow, - length: 2, - }), - None => Some(Token { - variant: alone, - length: 1, - }), - _ => unreachable!(), - }; - } - } - - // Dot and range operators. - if next == '.' { - return match self.iterator.next_if(|&(_, x)| x == '.') { - None => Some(Token { - variant: TokenTy::Dot, - length: 1, - }), - Some(_) => match self.iterator.next_if(|&(_, x)| x == '=') { - None => Some(Token { - variant: TokenTy::Range, - length: 2, - }), - Some(_) => Some(Token { - variant: TokenTy::RangeInclusive, - length: 3, - }), - }, - }; - } - - // Whitespace. - if next.is_whitespace() { - // Accumulate the number of bytes of whitespace consumed. - let mut acc = next.len_utf8(); - // Use while-let instead of take-while to avoid consuming the whole iterator. - while let Some((_, consumed)) = self.iterator.next_if(|&(_, x)| x.is_whitespace()) { - acc += consumed.len_utf8(); - } - - return Some(Token { - variant: TokenTy::Whitespace, - length: acc, - }); - } - - // Identifiers - if unicode_ident::is_xid_start(next) || next == '_' { - // Accumulate the number of bytes consumed in the identifier. - let mut acc = next.len_utf8(); - // Consume the rest of the identifier. - while let Some((_, consumed)) = self - .iterator - .next_if(|&(_, x)| unicode_ident::is_xid_continue(x)) - { - acc += consumed.len_utf8(); - } - - // Get the matching source code to check for reserved words. - let range = start_index..start_index + acc; - let matching_source = &self.source[range]; - - // Match on reserved words. 
- let variant: TokenTy = match matching_source { - // Declaration keywords - "class" => TokenTy::Class, - "struct" => TokenTy::Struct, - "record" => TokenTy::Record, - "trait" => TokenTy::Trait, - "func" => TokenTy::Func, - "enum" => TokenTy::Enum, - "union" => TokenTy::Union, - "module" => TokenTy::Module, - "import" => TokenTy::Import, - "implement" => TokenTy::Implement, - "represent" => TokenTy::Represent, - - // Visibility keywords - "public" => TokenTy::Public, - "package" => TokenTy::Package, - "private" => TokenTy::Private, - - // Boolean literals - "true" => TokenTy::True, - "false" => TokenTy::False, - - // Other keywords. - "constraint" => TokenTy::Constraint, - "constrain" => TokenTy::Constrain, - "relation" => TokenTy::Relation, - "unsafe" => TokenTy::Unsafe, - "unchecked" => TokenTy::Unchecked, - "lifetime" => TokenTy::Lifetime, - "outlives" => TokenTy::Outlives, - "Self" => TokenTy::SelfUpper, - "self" => TokenTy::SelfLower, - "type" => TokenTy::Type, - "const" => TokenTy::Const, - "var" => TokenTy::Var, - "if" => TokenTy::If, - "else" => TokenTy::Else, - "match" => TokenTy::Match, - "is" => TokenTy::Is, - "as" => TokenTy::As, - "on" => TokenTy::On, - "in" => TokenTy::In, - "not" => TokenTy::Not, - "dyn" => TokenTy::Dyn, - "try" => TokenTy::Try, - - _ => TokenTy::Identifier, - }; - - return Some(Token { - variant, - length: acc, - }); - } - - // Numerical literals. - if next.is_ascii_digit() { - // Accumulate the number of bytes consumed in the numeric literal. - // All ascii is 1 byte wide so avoid the extra call to `.len_utf8()`. - let mut acc = 1; - // Track the radix - let mut radix = 10; - - // Change the radix if necessary - if next == '0' { - if let Some((_, prefix)) = self - .iterator - .next_if(|(_, x)| ['x', 'o', 'b', 'X', 'B'].contains(x)) - { - acc += 1; - - radix = match prefix { - 'x' | 'X' => 16, - 'b' | 'B' => 2, - 'o' => 8, - _ => unreachable!(), - }; - } - } - - // Consume the rest of the integer literal. - while self - .iterator - .next_if(|&(_, x)| x.is_digit(radix) || x == '_') - .is_some() - { - // All accepted characters should be ascii, so we can just simplify `.len_utf8()` to 1. - acc += 1; - } - - return Some(Token { - variant: TokenTy::IntegerLit, - length: acc, - }); - } - - // String and Character literals. - if ['\'', '"', '`'].contains(&next) { - // Accumulator to track number of bytes consumed. - let mut acc: usize = 1; - let mut is_terminated = false; - - // Consume characters until the end of the literal - while let Some((_, consumed)) = self.iterator.next() { - acc += consumed.len_utf8(); - - match consumed { - // Ending character is the same as starting character. - // Escapes should all be handled, so don't worry about this being escaped. - _ if consumed == next => { - is_terminated = true; - break; - } - - // Escaped pattern. - // Only worry about escaped terminators here, since all other escaped - // patterns can be dealt with later. - '\\' => { - // Consume the escaped character regardless of what it is. - // It will always be part of the quoted literal. - if let Some((_, escaped)) = self.iterator.next() { - acc += escaped.len_utf8(); - } - } - - // Do nothing for non-escaped chars since the quoted literal continues - // and we have already recorded the consumed bytes. 
- _ => {} - } - } - - // We have finished consuming the literal -- make sure we produce the - // right variant - return match next { - '\'' => Some(Token { - variant: TokenTy::CharLit { is_terminated }, - length: acc, - }), - _ => Some(Token { - variant: TokenTy::StringLit { - is_format: next == '`', - is_terminated, - }, - length: acc, - }), - }; - } - - // Comments. - if next == '#' { - // Use accumulator to track number of bytes consumed. - let mut acc = 1; - - // There are a few variants as follows. - // `#...` - single line comment - // `#*...*#` - multiline comment - // `##...` - single line inner doc comment - // `##!...` - single line outer doc comment - // `#**...*#` - multiline inner doc comment - // `#*!...*#` - multiline outer doc comment - // If a multiline comment is not terminated by the end of the file then just mark it as such in the - // produced token. A seperate token error handling layer will raise that outside of this function. - - // Handle multiline comments - if self.iterator.next_if(|&(_, x)| x == '*').is_some() { - acc += 1; - - // Check if it's a doc comment. - let comment_type = match self.iterator.next_if(|&(_, x)| x == '*' || x == '!') { - Some((_, '*')) => { - acc += 1; - CommentTy::InnerDoc - } - - Some((_, '!')) => { - acc += 1; - CommentTy::OuterDoc - } - - None => CommentTy::Normal, - - _ => unreachable!(), - }; - - // Read the rest of the multi-line comment - while let Some((_, consumed)) = self.iterator.next() { - acc += consumed.len_utf8(); - if consumed == '*' && matches!(self.iterator.peek(), Some((_, '#'))) { - acc += 1; - return Some(Token { - variant: TokenTy::MultilineComment { - comment_type, - is_terminated: true, - }, - length: acc, - }); - } - } - - // If we hit the end, the comment is not terminated. - return Some(Token { - variant: TokenTy::MultilineComment { - comment_type, - is_terminated: false, - }, - length: acc, - }); - } - - // Handle single line comment. - let mut comment_type = CommentTy::Normal; - - // Check for inner doc comment - if self.iterator.next_if(|&(_, x)| x == '#').is_some() { - acc += 1; - comment_type = CommentTy::InnerDoc; - - // Check for outer doc comment - if self.iterator.next_if(|&(_, x)| x == '!').is_some() { - acc += 1; - comment_type = CommentTy::OuterDoc; - } - } - - // Read to end of line/file for rest of comment. Include line ending in consumed bytes. - for (_, consumed) in self.iterator.by_ref() { - acc += consumed.len_utf8(); - if consumed == '\n' { - break; - } - } - - return Some(Token { - variant: TokenTy::SingleLineComment { comment_type }, - length: acc, - }); - } - - // If we haven't matched by this point, return an unknown token. - Some(Token { - variant: TokenTy::Unknown, - length: next.len_utf8(), - }) - } - - fn size_hint(&self) -> (usize, Option) { - // Get the size hint of the internal iterator. - let (inner_lower, upper) = self.iterator.size_hint(); - // If there are any characters left, then there is at least one token remaining. - ((inner_lower > 0) as usize, upper) - } -} - -impl<'a> FusedIterator for Lexer<'a> {} - -/// A token with an index in a piece of source code. -#[derive(Copy, Clone, Debug)] -pub struct IndexedToken { - /// The byte index into the source code that this token starts on. - pub index: usize, - /// The token itself. - pub token: Token, -} - -/// An iterator over the tokens in the source code with byte indices attached. 
-#[derive(Debug, Clone)] -pub struct IndexedLexer<'src> { - /// The current index in source code -- the number of bytes currently consumed by the iterator. - pub index: usize, - /// The underlying lexer iterator. - lexer: Lexer<'src>, -} - -impl<'src> IndexedLexer<'src> { - /// Construct a new indexed lexer. - pub fn new(source: &'src str) -> Self { - Self { - index: 0, - lexer: Lexer::new(source), - } - } -} - -impl<'a> Iterator for IndexedLexer<'a> { - type Item = IndexedToken; - - fn next(&mut self) -> Option { - // Pull a token from the iterator. - let token = self.lexer.next()?; - - // If available, add the current index to it to return. - let indexed_token = IndexedToken { - index: self.index, - token, - }; - - // Update the current index with the length of the token. - self.index += token.length; - - // Return indexed token - Some(indexed_token) - } - - fn size_hint(&self) -> (usize, Option) { - self.lexer.size_hint() - } -} - -impl<'a> FusedIterator for IndexedLexer<'a> {} diff --git a/wright/src/parser/old/lexer/definition.rs b/wright/src/parser/old/lexer/definition.rs deleted file mode 100644 index 8dbeb5fa..00000000 --- a/wright/src/parser/old/lexer/definition.rs +++ /dev/null @@ -1,72 +0,0 @@ -//! The lexer definition in a rust constant that tells us how to handle characters encountered and lists all the -//! possible tokens produced. - -use super::tokens::{TokenTy}; - -/// A single character token matches a single character from the input, and produces a token of the length of the -/// character exactly. -#[derive(Clone, Copy, Debug)] -pub struct SingleCharToken { - /// The character to match. - pub matching_char: char, - /// The token type produced. - pub produces: TokenTy, -} - -impl SingleCharToken { - /// Turn a single character token into a lexer branch. - const fn into_lexer_branch(self) -> LexerBranch { - LexerBranch::SingleCharToken(self) - } -} - -/// A set of posible continuations from a single char token that will form multi char tokens -/// (i.e. going from `&` to `&&` and `&=`). -#[derive(Clone, Copy, Debug)] -pub struct PossibleContinuations { - /// The base single char and the token it produces when not followed by one of the other possible characters. - pub base: SingleCharToken, - /// The characters that can follow this and the tokens they would produce. - pub continuations: &'static [(char, TokenTy)] -} - -impl PossibleContinuations { - /// Convert to a [LexerBranch]. - const fn into_lexer_branch(self) -> LexerBranch { - LexerBranch::PossibleContinuations(self) - } -} - -/// A branch in the lexer, representing options to be tried. -#[derive(Debug)] -pub enum LexerBranch { - /// A single character token (such as '[') with no option for continuation. - SingleCharToken(SingleCharToken), - PossibleContinuations(PossibleContinuations) - -} - -// Below is a variety of `const-fn`s to make generating this structure easier. - -/// Makes a [SingleCharToken]. -const fn single(matching_char: char, produces: TokenTy) -> SingleCharToken { - SingleCharToken { matching_char, produces } -} - -/// Makes a [PossibleContinuations]. -const fn pc(matching_char: char, produces: TokenTy, continuations: &'static [(char, TokenTy)]) -> PossibleContinuations { - PossibleContinuations { base: single(matching_char, produces), continuations } -} - - -/// The lexer's definition, in abstract branching. 
-pub const DEFINITION: &[LexerBranch] = &[ - single('(', TokenTy::LeftParen).into_lexer_branch(), - single(')', TokenTy::RightParen).into_lexer_branch(), - - pc('+', TokenTy::Plus, &[ - ('=', TokenTy::PlusEq), - ]).into_lexer_branch(), - - -]; diff --git a/wright/src/parser/old/lexer/pretty_print.rs b/wright/src/parser/old/lexer/pretty_print.rs deleted file mode 100644 index 84629e73..00000000 --- a/wright/src/parser/old/lexer/pretty_print.rs +++ /dev/null @@ -1,176 +0,0 @@ -//! Lexer pretty printer. - -use crate::parser::lexer::{IndexedLexer, IndexedToken}; - -use super::Lexer; -use codespan_reporting::files::{Files, SimpleFile}; -use std::cmp; -use std::io::Write; -use std::{fmt::Display, ops::Range}; -use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor}; - -#[derive(Default)] -struct PrettyPrinter { - print_lines: [String; 2], -} - -impl PrettyPrinter { - fn flush(&mut self) -> anyhow::Result<()> { - // Use termcolor to print in different colors. - let mut out = StandardStream::stdout(ColorChoice::Always); - // Print source code default. - writeln!(&mut out, "{}", self.print_lines[0])?; - // Print token info in cyan - out.set_color(ColorSpec::new().set_fg(Some(Color::Cyan)))?; - writeln!(&mut out, "{}", self.print_lines[1])?; - // Reset after printing is over. - out.set_color(ColorSpec::new().set_reset(true))?; - // Reset the print_lines. - self.print_lines = [Default::default(), Default::default()]; - Ok(()) - } -} - -impl<'a> Lexer<'a> { - /// Print in pretty format the source code and the tokens it matched to under it. - pub fn debug_pretty_print>( - source: &SimpleFile, - ) -> anyhow::Result<()> { - // Create a pretty-printer to use for colored text - let mut pp: PrettyPrinter = Default::default(); - // Print a header line to indicate columns. - println!("line ({:#10})", "byte"); - // Get the source code as a str ref. - let source_str: &str = source.source().as_ref(); - // Get the token iterator for the source code. - let mut token_iter = IndexedLexer::new(source_str) - // Go from byte start indices to byte ranges in the source string - .map(|IndexedToken { index, token }| (index..index + token.length, token)) - // Make it peekable so that we can consume the iterator conditionally - .peekable(); - - // Make an iterator over the line byte-index ranges. - let mut line_range_iter = source_str - .lines() - // Use enumerate to get line indices. - .enumerate() - // Get line byte-index ranges for each line. Use `.unwrap()` beacause - // all the indices out of enumerate should be valid. - .map(|(line_index, _)| (line_index, source.line_range((), line_index).unwrap())) - // Use `.peekable()` to make it conditionally consubable. - .peekable(); - - // Make a utility function to get the matching source for a byte-index range. - let matching_source = |range: Range| -> String { - source_str[range] - // Use `.replace()` to make sure tabs are printed in the - // same width in a predictable way. - .replace('\t', " ") - // Also use replace to avoid double-printing newline characters if they exist. - // Do replace them with a space though, to avoid underflow on subtraction in formatting. - .replace(['\r', '\n'], " ") - }; - - // Iterate on the lines of the source file. - while let Some((line_index, line_range)) = line_range_iter.next() { - // Set the print headers if empty. 
- if pp.print_lines[0].is_empty() { - pp.print_lines[0] = format!("{:04} ({:#010x}): ", line_index, line_range.start); - pp.print_lines[1] = format!("{:04} ({:#010x}): ", line_index, line_range.start); - } - - // Consume all tokens that end (and therefore start also) on this line. - while let Some((token_range, token)) = - token_iter.next_if(|(token_range, _)| token_range.end <= line_range.end) - { - // Get the matching source code for the token. - let matched = matching_source(token_range); - - // Make a string representation of the token to print in the debug. - let token_string: String = token.to_string(); - - // Get the width of the display as the max of the two string character (not byte) lengths. Add two to the - // token length to represent the square brackets added later. - let width: usize = - cmp::max(token_string.chars().count() + 2, matched.chars().count()); - - // Add source to first line and token info to second line as appopriate. Add two to the source with for the - // square brackets. - pp.print_lines[0].push_str(format!("{matched: add_line_range.start); - - // If it does continue on the next line, start the line. - if let Some((add_line_index, add_line_range)) = continues_on_next_line { - // Get the matching source. - let matching_source_range: Range = add_line_range.start..token_range.end; - let matched: String = matching_source(matching_source_range); - - // Calculate the number of spaces to put before the closing bracket. - let space = matched.chars().count() - 1; - - // Add the match and the closing bracket. - pp.print_lines[0] = format!( - "{:04} ({:#010x}): {matched}", - add_line_index, add_line_range.start - ); - pp.print_lines[1] = format!( - "{:04} ({:#010x}): {:space$}]", - add_line_index, add_line_range.start, "" - ); - } - } else { - // The next token is on the next line, just flush the print_lines and move on. - pp.flush()?; - } - } - - Ok(()) - } -} diff --git a/wright/src/parser/old/lexer/tokens.rs b/wright/src/parser/old/lexer/tokens.rs deleted file mode 100644 index ada667b2..00000000 --- a/wright/src/parser/old/lexer/tokens.rs +++ /dev/null @@ -1,189 +0,0 @@ -use derive_more::Display; - -/// Token of Wright source code. -#[derive(Clone, Copy, Debug, Display)] -#[display(fmt = "{} ({}b)", variant, length)] -pub struct Token { - /// What type of token is it? - pub variant: TokenTy, - /// How many bytes of source code long is it? Note this doesn't necessarily mean how many characters long it is. - pub length: usize, -} - -/// All of the reserved words are just upper-case versions of the -/// matching source code unless otherwise stated. -#[derive(Clone, Copy, PartialEq, Eq, Debug, Display)] -pub enum TokenTy { - // Operators and parentheses - LeftParen, // ( - RightParen, // ) - Bang, // ! - BangEq, // != - Tilde, // ~ - TildeArrow, // ~> - TildeEq, // ~= - At, // @ - Pound, // # - Dollar, // $ - Mod, // % - ModEq, // %= - Xor, // ^ - XorEq, // ^= - And, // & - AndEq, // &= - AndAnd, // && - Or, // | - OrEq, // |= - OrOr, // || - Star, // * - StarEq, // *= - Plus, // + - PlusEq, // += - Minus, // - - MinusEq, // -= - SingleArrow, // -> - Gt, // > - GtEq, // >= - ShiftRight, // >> - Lt, // < - LtEq, // <= - ShiftLeft, // << - Eq, // = - EqEq, // == - DoubleArrow, // => - Div, // / - DivEq, // /= - DivDiv, // // - Semi, // ; - Colon, // : - ColonColon, // :: - ColonEq, // := - Question, // ? - Dot, // . - Range, // .. 
- RangeInclusive, // ..= - Comma, // , - LeftSquare, // [ - RightSquare, // ] - LeftBracket, // { - RightBracket, // } - - // Reserved words - Class, - Struct, - Record, - Enum, - Union, - Trait, - Type, - Func, - Module, - Implement, - Represent, - /// Publicly visible. - Public, - /// Visible in the package only. - Package, - /// Visible only in file/module. - Private, - Constraint, - Constrain, - /// Used to constrain relations between variables. - Relation, - Unsafe, - /// May use similar to unsafe in Rust -- call a function or cast without checking any of the constraints. - Unchecked, - Import, - Const, - Var, - If, - Else, - Match, - Is, - As, - On, - In, - Not, - /// Marks functions as dynamic, and not to be executed at compile time. - Dyn, - /// For try { } blocks. - Try, - True, - False, - Lifetime, - Outlives, - - /// `Self` in source code. - #[display(fmt = "Self")] - SelfUpper, - - /// `self` in source code. - #[display(fmt = "self")] - SelfLower, - - /// Whitespace of any kind and length. - #[display(fmt = "W")] - Whitespace, - - /// Single line comment started with `#`. Optionally `## ` or `##! ` for documentation. - #[display(fmt = "Single line {} comment", comment_type)] - SingleLineComment { - comment_type: CommentTy, - }, - - /// Multiline comment between `#*` and `*#`. Starts with `#**` or `#*!` for documentation. - #[display( - fmt = "Multiline {} comment (terminated = {})", - comment_type, - is_terminated - )] - MultilineComment { - comment_type: CommentTy, - /// Is this comment terminated? If not raise an error before parsing the tokens. - is_terminated: bool, - }, - - /// Integer literal. This is a literal integer in source code. May include underscores after the leading digit - /// as visual seperators. May also include a prefix such as `0x`, `0o`, or `0b` for hex, octal, or binary. - IntegerLit, - - /// A string literal in source code. - #[display( - fmt = "StringLit (fmt = {}, terminated = {})", - is_format, - is_terminated - )] - StringLit { - /// For format strings (backticks instead of double quotes) - is_format: bool, - /// Is this string terminated? - is_terminated: bool, - }, - - /// A character literal in source code. - #[display(fmt = "CharLit (terminated = {})", is_terminated)] - CharLit { - /// Is the char lit terminated? - is_terminated: bool, - }, - - /// A identifier in source code (such as a variable name). At this stage keywords (such as 'struct') are - /// also considered identifiers. - #[display(fmt = "ID")] - Identifier, - - /// Unknown character for the lexer. - #[display(fmt = "?")] - Unknown, -} - -/// Different types of comments. -#[derive(Clone, Copy, Debug, PartialEq, Eq, Display)] -pub enum CommentTy { - /// Normal comment that does not get used in documentation. - Normal, - /// Documentation for a declaration in the file. - InnerDoc, - /// Documentation for the file itself. 
- OuterDoc, -} From 6e186224dc573d1e4050b0541bdd562f360c49e6 Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 16 Mar 2024 01:46:01 -0400 Subject: [PATCH 59/60] Put back grcov config see if it fixes things --- .github/actions-rs/grcov.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .github/actions-rs/grcov.yml diff --git a/.github/actions-rs/grcov.yml b/.github/actions-rs/grcov.yml new file mode 100644 index 00000000..97314290 --- /dev/null +++ b/.github/actions-rs/grcov.yml @@ -0,0 +1,6 @@ +branch: true +output-type: lcov +output-file: ./lcov.info +ignore-not-existing: true +ignore: + - "/*" From ab0adf4a2a56d9677fd0d8a9caad88de63d166fd Mon Sep 17 00:00:00 2001 From: Venus Xeon-Blonde Date: Sat, 16 Mar 2024 01:54:50 -0400 Subject: [PATCH 60/60] Update grcov.yml --- .github/actions-rs/grcov.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/actions-rs/grcov.yml b/.github/actions-rs/grcov.yml index 97314290..5a936960 100644 --- a/.github/actions-rs/grcov.yml +++ b/.github/actions-rs/grcov.yml @@ -2,5 +2,7 @@ branch: true output-type: lcov output-file: ./lcov.info ignore-not-existing: true +llvm: true ignore: - "/*" + - "../*"
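
The patches above (44 through 49) leave Lexer::next_token as a chain of try_consume_* helpers tried in order, with a one-character Unknown token as the fallback. The following is a minimal, self-contained sketch of that dispatch shape, not the crate's actual code: it uses plain &str slices in place of Fragment, and the helper bodies are deliberately simplified stand-ins for the real trivial, identifier, and integer_literal modules.

// Simplified sketch of the next_token dispatch order after patches 44-49.
// Types and helper bodies are stand-ins, not the wright crate's real API.

#[derive(Debug)]
enum TokenTy { Plus, Identifier, IntegerLiteral, Unknown }

#[derive(Debug)]
struct Token<'src> { variant: TokenTy, fragment: &'src str }

struct Lexer<'src> { remaining: &'src str }

impl<'src> Lexer<'src> {
    fn split_token(&mut self, bytes: usize, variant: TokenTy) -> Token<'src> {
        // Split the matched bytes off the front of the remaining input.
        let remaining: &'src str = self.remaining;
        let (fragment, rest) = remaining.split_at(bytes);
        self.remaining = rest;
        Token { variant, fragment }
    }

    fn next_token(&mut self) -> Option<Token<'src>> {
        // Skip leading whitespace, as ignore_whitespace does in the patches.
        self.remaining = self.remaining.trim_start();
        if self.remaining.is_empty() {
            return None;
        }
        // Try each sub-lexer in order; the first one that matches wins.
        if let Some(token) = try_consume_plus(self) {
            return Some(token);
        }
        if let Some(token) = try_consume_word(self) {
            return Some(token);
        }
        if let Some(token) = try_consume_integer(self) {
            return Some(token);
        }
        // Otherwise emit a one-character Unknown token, as the real lexer does.
        let len = self.remaining.chars().next().map(char::len_utf8).unwrap_or(1);
        Some(self.split_token(len, TokenTy::Unknown))
    }
}

fn try_consume_plus<'src>(lexer: &mut Lexer<'src>) -> Option<Token<'src>> {
    lexer.remaining.starts_with('+').then(|| lexer.split_token(1, TokenTy::Plus))
}

fn try_consume_word<'src>(lexer: &mut Lexer<'src>) -> Option<Token<'src>> {
    let len: usize = lexer.remaining.chars()
        .take_while(|c| c.is_alphabetic() || *c == '_')
        .map(char::len_utf8)
        .sum();
    (len > 0).then(|| lexer.split_token(len, TokenTy::Identifier))
}

fn try_consume_integer<'src>(lexer: &mut Lexer<'src>) -> Option<Token<'src>> {
    let len: usize = lexer.remaining.chars()
        .take_while(|c| c.is_ascii_digit() || *c == '_')
        .map(char::len_utf8)
        .sum();
    (len > 0).then(|| lexer.split_token(len, TokenTy::IntegerLiteral))
}

fn main() {
    let mut lexer = Lexer { remaining: "count + 42" };
    while let Some(token) = lexer.next_token() {
        println!("{:?} {:?}", token.variant, token.fragment);
    }
}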
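
Patch 44's try_consume_integer_literal accepts a leading ASCII digit, an optional 0x/0X/0b/0B/0o radix prefix after a leading zero, and then digits of that radix plus underscore separators. Below is a standalone restatement of that length-and-radix calculation; the function name match_integer_literal and its return type are chosen for illustration and are not part of the crate.

// Standalone restatement of the radix handling from try_consume_integer_literal
// (patch 44): how many bytes form the literal, and which radix applies.

fn match_integer_literal(input: &str) -> Option<(usize, u32)> {
    let mut chars = input.chars().peekable();
    // The literal must start with an ASCII decimal digit.
    let first = chars.next().filter(char::is_ascii_digit)?;
    let mut bytes = first.len_utf8();
    let mut radix = 10;

    // A leading zero may introduce a radix prefix: 0x/0X hex, 0b/0B binary, 0o octal.
    if first == '0' {
        if let Some(prefix) = chars.next_if(|c| ['x', 'X', 'b', 'B', 'o'].contains(c)) {
            bytes += 1;
            radix = match prefix {
                'x' | 'X' => 16,
                'b' | 'B' => 2,
                _ => 8,
            };
        }
    }

    // Digits of the chosen radix and visual `_` separators extend the literal.
    bytes += chars
        .take_while(|c| c.is_digit(radix) || *c == '_')
        .map(char::len_utf8)
        .sum::<usize>();

    Some((bytes, radix))
}

fn main() {
    assert_eq!(match_integer_literal("123_456_789."), Some((11, 10)));
    assert_eq!(match_integer_literal("0xFF+1"), Some((4, 16)));
    assert_eq!(match_integer_literal("0b1010 rest"), Some((6, 2)));
    assert_eq!(match_integer_literal("abc"), None);
}

As in the patch's own test, lexing 123_456_789. stops before the dot; the token itself only records which bytes of source it covers.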
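
Patches 45 and 46 settle the keyword/identifier rule: the first character must satisfy is_xid_start (or be an underscore), continuation characters must satisfy is_xid_continue, and only the fully matched word is compared against the keyword table. A compact sketch using the same unicode-ident crate the lexer already depends on follows; the keyword table here is a small excerpt, and the match_word and TokenKind names are illustrative rather than the crate's own.

// Sketch of the keyword/identifier rule from patches 45-46.
// Requires the unicode-ident crate, as the lexer itself does.

use unicode_ident::{is_xid_continue, is_xid_start};

#[derive(Debug, PartialEq)]
enum TokenKind { KwConst, KwRecord, Identifier, Underscore }

/// Return the byte length and kind of a leading identifier-or-keyword, if any.
fn match_word(input: &str) -> Option<(usize, TokenKind)> {
    let mut chars = input.chars();
    // The first character must be an XID start character or an underscore.
    let first = chars.next().filter(|c| is_xid_start(*c) || *c == '_')?;
    // Every following character must be an XID continue character.
    let len = first.len_utf8()
        + chars
            .take_while(|c| is_xid_continue(*c))
            .map(char::len_utf8)
            .sum::<usize>();

    // Only after the whole word is matched do we decide keyword vs. identifier.
    let kind = match &input[..len] {
        "const" => TokenKind::KwConst,
        "record" => TokenKind::KwRecord,
        "_" => TokenKind::Underscore,
        _ => TokenKind::Identifier,
    };
    Some((len, kind))
}

fn main() {
    assert_eq!(match_word("const TEST"), Some((5, TokenKind::KwConst)));
    assert_eq!(match_word("TEST;"), Some((4, TokenKind::Identifier)));
    assert_eq!(match_word("_ = x"), Some((1, TokenKind::Underscore)));
    assert_eq!(match_word("123"), None);
}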
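
Patch 47 adds a terminated flag to the quoted-literal token variants, and patch 48 fills in the scan: read until the opening quote character recurs, skip whatever follows a backslash so an escaped quote cannot terminate the literal, and record rather than reject an unterminated literal. A self-contained sketch of that scan, with illustrative names in place of the crate's Token and TokenTy types, is below.

// Sketch of the quoted-literal scan from patch 48: consume to the matching
// quote, skip over backslash escapes, and record whether a terminator was found.

#[derive(Debug, PartialEq)]
enum Quoted {
    Char { terminated: bool },
    Str { terminated: bool },
    FormatStr { terminated: bool },
}

fn match_quoted(input: &str) -> Option<(usize, Quoted)> {
    let mut chars = input.chars();
    // A quoted literal starts with ', " or ` (backtick marks a format string).
    let open = chars.next().filter(|c| ['\'', '"', '`'].contains(c))?;
    let mut bytes = open.len_utf8();
    let mut terminated = false;

    while let Some(c) = chars.next() {
        bytes += c.len_utf8();
        if c == open {
            // Found the matching quote.
            terminated = true;
            break;
        }
        if c == '\\' {
            // Skip whatever follows a backslash so an escaped quote cannot end
            // the literal; the meaning of escapes is resolved in a later phase.
            bytes += chars.next().map(char::len_utf8).unwrap_or(0);
        }
    }

    let variant = match open {
        '\'' => Quoted::Char { terminated },
        '"' => Quoted::Str { terminated },
        _ => Quoted::FormatStr { terminated },
    };
    Some((bytes, variant))
}

fn main() {
    assert_eq!(
        match_quoted(r#""Test string literal" rest"#),
        Some((21, Quoted::Str { terminated: true }))
    );
    assert_eq!(
        match_quoted(r#""ends with escape \" so it never closes"#),
        Some((39, Quoted::Str { terminated: false }))
    );
}

Keeping unterminated literals as ordinary tokens, instead of failing inside the lexer, matches the note added to UnterminatedBlockComment in patch 47: malformed input stays in the token stream and produces errors later.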
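
Patch 47 also makes Lexer::fork and Lexer::offset_from public, which enables speculative parsing: copy the lexer, let the copy consume input, and measure how far it advanced before deciding whether to commit. The sketch below imitates that pattern with a plain &str field and pointer arithmetic; the real methods delegate to the Fragment type and carry the same caveat that both lexers must point into the same source.

// Sketch of the fork / offset_from pattern made public in patch 47. The lexer
// is Copy, so forking is free; offset_from reports how many bytes the fork has
// consumed relative to the original. Stand-in types, not the crate's API.

#[derive(Clone, Copy)]
struct Lexer<'src> {
    remaining: &'src str,
}

impl<'src> Lexer<'src> {
    fn fork(&self) -> Self {
        // Copy semantics make forking a byte-for-byte copy of the cursor.
        *self
    }

    /// How many bytes ahead of `origin` this lexer is.
    /// Assumes both lexers were created over the same source string.
    fn offset_from(&self, origin: &Self) -> usize {
        (self.remaining.as_ptr() as usize) - (origin.remaining.as_ptr() as usize)
    }

    fn advance(&mut self, bytes: usize) {
        self.remaining = &self.remaining[bytes..];
    }
}

fn main() {
    let origin = Lexer { remaining: "let x = 1;" };

    // Speculatively consume the keyword on a fork.
    let mut fork = origin.fork();
    fork.advance(3);

    // The caller can now decide whether to keep the fork or fall back to origin.
    assert_eq!(fork.offset_from(&origin), 3);
}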
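
For contrast, the lexer removed in patches 58 through 60 produced tokens that stored only a byte length, and a separate IndexedLexer wrapper re-attached a running byte index so callers could recover the matching source text. A trimmed-down restatement of that bookkeeping (types reduced to the relevant fields) shows why the fragment-carrying tokens introduced in this series make the wrapper unnecessary.

// Compact restatement of the deleted lexer's index bookkeeping (patch 58):
// tokens carried only a length, and IndexedLexer tracked the running index.
// Types here are trimmed for illustration, not copied from the old module.

#[derive(Clone, Copy, Debug)]
struct Token {
    length: usize,
}

#[derive(Clone, Copy, Debug)]
struct IndexedToken {
    index: usize,
    token: Token,
}

struct IndexedLexer<I> {
    index: usize,
    tokens: I,
}

impl<I: Iterator<Item = Token>> Iterator for IndexedLexer<I> {
    type Item = IndexedToken;

    fn next(&mut self) -> Option<IndexedToken> {
        let token = self.tokens.next()?;
        let indexed = IndexedToken { index: self.index, token };
        // The running index is the only link between a token and its source text.
        self.index += token.length;
        Some(indexed)
    }
}

fn main() {
    let raw = [Token { length: 3 }, Token { length: 1 }, Token { length: 5 }];
    let indexed: Vec<_> = IndexedLexer { index: 0, tokens: raw.into_iter() }.collect();
    assert_eq!(indexed[2].index, 4);
}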