Skip to content

Commit

Permalink
Add identifier parsing test and dedicated whitespace token type.
Browse files — browse the repository at this point in the history
  • Loading branch information
vcfxb committed Jul 13, 2024
1 parent cec24c0 commit 9775b67
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 15 deletions.
30 changes: 16 additions & 14 deletions wright/src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,17 +104,6 @@ impl Lexer {
self.remaining.offset_from(&origin.remaining)
}

/// Advance past any whitespace at the start of the [Lexer::remaining] [Fragment], discarding it.
pub fn ignore_whitespace(&mut self) {
    // Build the fragment that begins just after any leading whitespace.
    let trimmed = self.remaining.clone().trim_start();

    // Only swap in the trimmed fragment when trimming actually consumed something;
    // otherwise leave `remaining` untouched.
    if trimmed.range != self.remaining.range {
        self.remaining = trimmed;
    }
}

/// Check if a pattern matches at the start of the [Lexer::remaining] [Fragment].
pub fn matches(&self, pattern: &str) -> bool {
self.remaining.as_str().starts_with(pattern)
Expand Down Expand Up @@ -183,14 +172,27 @@ impl Lexer {

/// Get the next token from the lexer.
pub fn next_token(&mut self) -> Option<Token> {
// Ignore any whitespace at the start of the lexer.
self.ignore_whitespace();

// If the remaining input is empty, there is no token.
if self.remaining.is_empty() {
return None;
}

// If there is whitespace, it becomes its own token.
// Use a little unsafe here since this check is done every time and needs to be fast.
{
let remaining_str = self.remaining.as_str();
let trimmed = remaining_str.trim_start().as_ptr();

// Calculate the delta by pointer offset.
// SAFETY: In this case, all the requirements of pointer::offset_from are satisfied.
let delta = unsafe { trimmed.offset_from(remaining_str.as_ptr()) };

if delta > 0 {
// SAFETY: trim_start should always return a valid string, and delta is just checked to be > 0.
return unsafe { Some(self.split_token_unchecked(delta as usize, TokenTy::Whitespace)) };
}
}

// Attempt to parse a single line comment and then attempt a multi-line comment.
for comment_match_fn in [try_match_single_line_comment, try_match_block_comment] {
// Attempt to parse a comment using the given match function. Return it if it's documentation or unterminated.
Expand Down
1 change: 1 addition & 0 deletions wright/src/lexer/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ mod tests {
let mut lexer = Lexer::new_test("const TEST");

assert_eq!(lexer.next_token().unwrap().variant, TokenTy::KwConst);
assert_eq!(lexer.next_token().unwrap().variant, TokenTy::Whitespace);
assert_eq!(lexer.next_token().unwrap().variant, TokenTy::Identifier);
}
}
2 changes: 1 addition & 1 deletion wright/src/lexer/quoted.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ mod tests {

#[test]
fn string_literal() {
let mut lexer = Lexer::new_test(r#" "Test string literal" "#);
let mut lexer = Lexer::new_test(r#""Test string literal""#);
let token = lexer.next_token().unwrap();
assert_eq!(token.variant, TokenTy::StringLiteral { terminated: true });
assert_eq!(token.fragment.as_str(), "\"Test string literal\"");
Expand Down
3 changes: 3 additions & 0 deletions wright/src/lexer/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ pub enum TokenTy {
FormatStringLiteral { terminated: bool },
CharLiteral { terminated: bool },

/// Whitespace counts as a token.
Whitespace,

/// Unknown character in lexer fragment.
Unknown
}
22 changes: 22 additions & 0 deletions wright/src/parser/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,25 @@ impl Parse for Identifier {
Ok(Identifier { fragment: ident_token.fragment })
}
}

#[cfg(test)]
mod tests {
    use crate::{ast::identifier::Identifier, lexer::{token::TokenTy, Lexer}, parser::{Parse, ParseError}};

    /// A lone identifier should parse successfully and consume the whole input.
    #[test]
    fn test_parse_ident() {
        let mut lexer = Lexer::new_test("source");
        let ident = Identifier::parse(&mut lexer).unwrap();
        assert_eq!(ident.fragment.as_str(), "source");
        // The parser must leave nothing behind in the lexer.
        assert_eq!(lexer.remaining.len(), 0);
    }

    /// Inputs that do not begin with an identifier token (digits, symbols,
    /// whitespace, a bare underscore, reserved words) must produce an
    /// [ParseError::Expected] for [TokenTy::Identifier].
    #[test]
    fn test_parse_ident_fail() {
        for fail in ["12", "+", " ", " test", "_", "record"] {
            // `fail` is already a `&str`; the previous `&fail` was a needless
            // `&&str` borrow that only compiled via deref coercion.
            let mut lexer = Lexer::new_test(fail);
            let error = Identifier::parse(&mut lexer).unwrap_err();
            assert!(matches!(&error, ParseError::Expected { expected: TokenTy::Identifier, .. }));
        }
    }
}

0 comments on commit 9775b67

Please sign in to comment.