Reimplement humanizer using line-index crate

hurryabit · Dec 23, 2023 · 95008b6
1 parent 430d5f3
commit 95008b6
Show file tree

Hide file tree

Showing 4 changed files with 60 additions and 53 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/compiler/Cargo.toml b/compiler/Cargo.toml
@@ -30,6 +30,7 @@ tower-lsp = "0.20"
 tokio = { version = "1.35", features = ["full"] }
 simple_logger = { version = "4.3", features = ["stderr"] }
 time = "0.3"
+line-index = "0.1"
 
 [dev-dependencies]
 criterion = "0.5"

diff --git a/compiler/src/parser/humanizer.rs b/compiler/src/parser/humanizer.rs
@@ -1,70 +1,53 @@
 use crate::location::SourceLocation;
+use line_index::{LineIndex, TextSize, WideEncoding};
 
 #[derive(Debug, Eq, PartialEq)]
-pub struct Humanizer {
-    line_starts: Vec<usize>,
-}
+pub struct Humanizer(LineIndex);
 
 impl Humanizer {
     pub fn new(input: &str) -> Self {
-        let mut line_starts = Vec::new();
-        let mut index = 0;
-        line_starts.push(index);
-        for line in input.lines() {
-            // FIXME(MH): This assumes a newline character is just one byte,
-            // which is not true on Windows.
-            index += line.len() + 1;
-            line_starts.push(index);
-        }
-        Self { line_starts }
+        Self(LineIndex::new(input))
     }
 
     pub fn run(&self, loc: usize) -> SourceLocation {
-        let line = self.line_starts.binary_search(&loc).unwrap_or_else(|x| x - 1);
-        SourceLocation { line: line as u32, column: (loc - self.line_starts[line]) as u32 }
+        u32::try_from(loc)
+            .ok()
+            .and_then(|loc| self.0.try_line_col(TextSize::new(loc)))
+            .and_then(|utf8_pos| self.0.to_wide(WideEncoding::Utf32, utf8_pos))
+            .map(|utf16_pos| SourceLocation { line: utf16_pos.line, column: utf16_pos.col })
+            .unwrap_or(SourceLocation { line: 0, column: 0 })
     }
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
 
-    #[test]
-    fn test_line_starts() {
-        let cases = vec![
-            ("", vec![0]),
-            ("a", vec![0, 2]),
-            ("a\n", vec![0, 2]),
-            ("aa", vec![0, 3]),
-            ("a\nb", vec![0, 2, 4]),
-            ("a\nb\n", vec![0, 2, 4]),
-            ("ab\ncd\n", vec![0, 3, 6]),
-            ("\na", vec![0, 1, 3]),
-        ];
-        for (input, expected_line_starts) in cases {
-            let humanizer = Humanizer::new(input);
-            let expected_line_starts: Vec<_> = expected_line_starts.into_iter().collect();
-            assert_eq!(humanizer.line_starts, expected_line_starts);
-        }
-    }
-
     #[test]
     fn test_translation() {
-        let humanizer = Humanizer::new("ab\nc\nde\n\nf");
+        let humanizer = Humanizer::new("ab\nc\nde\n\nf\r\ng\näß");
         let cases = vec![
-            (0, 0, 0),
-            (1, 0, 1),
-            (2, 0, 2),
-            (3, 1, 0),
-            (4, 1, 1),
-            (5, 2, 0),
-            (6, 2, 1),
-            (7, 2, 2),
-            (8, 3, 0),
-            (9, 4, 0),
-            (10, 4, 1),
-            (11, 5, 0),
-            (100, 5, 89),
+            (0, 0, 0),  // ^|a
+            (1, 0, 1),  // a|b
+            (2, 0, 2),  // b|\n
+            (3, 1, 0),  // \n|c
+            (4, 1, 1),  // c|\n
+            (5, 2, 0),  // \n|d
+            (6, 2, 1),  // d|e
+            (7, 2, 2),  // e|\n
+            (8, 3, 0),  // \n|\n
+            (9, 4, 0),  // \n|f
+            (10, 4, 1), // f|\r
+            (11, 4, 2), // \r|\n
+            (12, 5, 0), // \n|g
+            (13, 5, 1), // g|\n
+            (14, 6, 0), // \n|ä
+            (15, 0, 0), // in ä
+            (16, 6, 1), // ä|ß
+            (17, 0, 0), // in ß
+            (18, 6, 2), // ß|$
+            (19, 0, 0), // $|
+            (100, 0, 0),
         ];
         for (loc, line, column) in cases {
             assert_eq!(humanizer.run(loc), SourceLocation { line, column });

diff --git a/compiler/src/tests/parser/mod.rs b/compiler/src/tests/parser/mod.rs
@@ -96,13 +96,14 @@ fn location_eol_lf() {
     "###);
 }
 
-// TODO(MH): This should have the same output as `location_eol_lf`.
 #[test]
 fn location_eol_crlf() {
     insta::assert_snapshot!(parse_error("\r\nx"), @r###"
     ---
     --------------------------------------------------
-    2:2-3:1: Unrecognized token `x` found at 2:2:3:1
+      2 | x
+          ~
+    Unrecognized token `x` found at 2:1:2:2
     Expected one of "fn" or "type"
     "###);
 }
@@ -119,15 +120,14 @@ fn location_comment_ascii() {
     "###);
 }
 
-// TODO(MH): This should have the same output as `location_comment_ascii`.
 #[test]
 fn location_comment_unlauts() {
     insta::assert_snapshot!(parse_error("/* äëïöü */ x"), @r###"
     ---
     --------------------------------------------------
       1 | /* äëïöü */ x
-                           ~
-    Unrecognized token `x` found at 1:18:1:19
+                      ~
+    Unrecognized token `x` found at 1:13:1:14
     Expected one of "fn" or "type"
     "###);
 }