Adding escape sequences to char literals (#514)

This change also modifies the escape sequence parsing for Unicode characters to use the logic in the `Char.fromInt` prelude method, now that it exists.
kengorab · Dec 1, 2024 · 4e21e04
1 parent 5043b18
commit 4e21e04
Show file tree

Hide file tree

Showing 25 changed files with 145 additions and 95 deletions.
diff --git a/projects/compiler/example.abra b/projects/compiler/example.abra
@@ -1,13 +1,15 @@
-val chars = "a£￫😀".chars()
+// val chars = "a£￫😀".chars()
 
-/// Expect: a 97 [0b1100001]
-/// Expect: £ 163 [0b11000010, 0b10100011]
-/// Expect: ￫ 65515 [0b11101111, 0b10111111, 0b10101011]
-/// Expect: 😀 128512 [0b11110000, 0b10011111, 0b10011000, 0b10000000]
-for ch in chars {
-  println(ch, ch.asInt(), ch.bytes().map(b => b.binary()))
-}
+// /// Expect: a 97 [0b1100001]
+// /// Expect: £ 163 [0b11000010, 0b10100011]
+// /// Expect: ￫ 65515 [0b11101111, 0b10111111, 0b10101011]
+// /// Expect: 😀 128512 [0b11110000, 0b10011111, 0b10011000, 0b10000000]
+// for ch in chars {
+//   println(ch, ch.asInt(), ch.bytes().map(b => b.binary()))
+// }
 
-val ch = Char.fromInt(0xD800)
-/// Expect: �
-println(ch)
+// val ch = Char.fromInt(0xD800)
+// /// Expect: �
+// println(ch)
+
+println(0.hex())
diff --git a/projects/compiler/src/lexer.abra b/projects/compiler/src/lexer.abra
@@ -165,6 +165,7 @@ export enum StringInterpolationChunk {
 
 export enum LexerErrorKind {
   UnexpectedChar(char: String)
+  UnterminatedCharLiteral
   UnterminatedString(start: Position)
   UnsupportedEscapeSequence(seq: String, isUnicode: Bool)
   UnexpectedEof
@@ -182,6 +183,10 @@ export type LexerError {
         lines.push("Unexpected character '$char':")
         lines.push(self._getCursorLine(self.position, contents))
       }
+      LexerErrorKind.UnterminatedCharLiteral => {
+        lines.push("Unterminated character literal:")
+        lines.push(self._getCursorLine(self.position, contents))
+      }
       LexerErrorKind.UnterminatedString(start) => {
         lines.push("Unterminated string:")
         lines.push("  String begins at (${start.line}:${start.col})")
@@ -445,15 +450,35 @@ export type Lexer {
 
     var ch = self._input[self._cursor]
     if ch == "'" return Err(LexerError(position: self._curPos(), kind: LexerErrorKind.UnexpectedChar(ch)))
-    if ch == "\\" todo("character escape sequences")
+    val intVal = if ch == "\\" {
+      val pos = self._curPos()
+      self._advance() // consume '\'
+      if self._cursor >= self._input.length return Err(LexerError(position: pos, kind: LexerErrorKind.UnexpectedEof))
+      ch = self._input[self._cursor]
+
+      match ch {
+        "0" => 0
+        "n" => 10
+        "\\" => 92
+        "r" => 13
+        "t" => 9
+        "'" => 39
+        "u" => {
+          val ch = try self._parseUnicodeEscape(pos)
+          ch.asInt()
+        }
+        _ => return Err(LexerError(position: pos, kind: LexerErrorKind.UnsupportedEscapeSequence(seq: "\\$ch", isUnicode: false)))
+      }
+    } else {
+      // TODO: once characters are supported, there should be no need for low-level byte manipulation like this
+      ch._buffer.offset(0).load().asInt()
+    }
 
-    // TODO: once characters are supported, there should be no need for low-level byte manipulation like this
-    val intVal = ch._buffer.offset(0).load().asInt()
     self._advance()
 
     ch = self._input[self._cursor]
-    if ch != "'" {
-      return Err(LexerError(position: self._curPos(), kind: LexerErrorKind.UnexpectedChar(ch)))
+    if self._cursor >= self._input.length || ch != "'" {
+      return Err(LexerError(position: self._curPos(), kind: LexerErrorKind.UnterminatedCharLiteral))
     }
     self._advance() // consume "'"
 
@@ -482,13 +507,16 @@ export type Lexer {
         val ch = self._input[self._cursor]
         val escapedCh = match ch {
           "n" => "\n"
-          "\\" => "\\",
-          "r" => "\r",
-          "t" => "\t",
-          "\'" => "\'",
-          "\"" => "\"",
-          "$" => "$",
-          "u" => try self._parseUnicodeEscape(pos)
+          "\\" => "\\"
+          "r" => "\r"
+          "t" => "\t"
+          // "\'" => "\'"
+          "\"" => "\""
+          "$" => "$"
+          "u" => {
+            val ch = try self._parseUnicodeEscape(pos)
+            ch.toString()
+          }
           _ => return Err(LexerError(position: pos, kind: LexerErrorKind.UnsupportedEscapeSequence(seq: "\\$ch", isUnicode: false)))
         }
 
@@ -581,7 +609,7 @@ export type Lexer {
     }
   }
 
-  func _parseUnicodeEscape(self, startPos: Position): Result<String, LexerError> {
+  func _parseUnicodeEscape(self, startPos: Position): Result<Char, LexerError> {
     self._advance() // consume 'u'
 
     var value = 0
@@ -601,45 +629,7 @@ export type Lexer {
     self._advance()
     self._advance()
 
-    // TODO: Clean up this unicode->utf-8 conversion and abstract it away into some stdlib method once Chars exist?
-    val str = if !(value.isBetween(0, 0xD7FF, true) || value.isBetween(0xE000, 0x10FFFF, true)) {
-      "�"
-    } else if value.isBetween(0, 0x007F, true) {
-      val s = String.withLength(1)
-      s._buffer.offset(0).store(value.asByte())
-      s
-    } else if value.isBetween(0x0080, 0x07FF, true) {
-      val s = String.withLength(2)
-      val b1 = 0b11000000 || ((value && 0b11111000000) >> 6)
-      val b2 = 0b10000000 || (value && 0b00000111111)
-      s._buffer.offset(0).store(b1.asByte())
-      s._buffer.offset(1).store(b2.asByte())
-      s
-    } else if value.isBetween(0x0800, 0xFFFF, true) {
-      val s = String.withLength(3)
-      val b1 = 0b11100000 || ((value && 0b1111000000000000) >> 12)
-      val b2 = 0b10000000 || ((value && 0b0000111111000000) >> 6)
-      val b3 = 0b10000000 || (value && 0b0000000000111111)
-      s._buffer.offset(0).store(b1.asByte())
-      s._buffer.offset(1).store(b2.asByte())
-      s._buffer.offset(2).store(b3.asByte())
-      s
-    } else if value.isBetween(0x10000, 0x10FFFF, true) {
-      val s = String.withLength(4)
-      val b1 = 0b11110000 || (value && 0b111000000000000000000)
-      val b2 = 0b10000000 || (value && 0b000111111000000000000)
-      val b3 = 0b10000000 || (value && 0b000000000111111000000)
-      val b4 = 0b10000000 || (value && 0b000000000000000111111)
-      s._buffer.offset(0).store(b1.asByte())
-      s._buffer.offset(1).store(b2.asByte())
-      s._buffer.offset(2).store(b3.asByte())
-      s._buffer.offset(3).store(b4.asByte())
-      s
-    } else {
-      "�"
-    }
-
-    Ok(str)
+    Ok(Char.fromInt(value))
   }
 
   func _tokenizeIdentifier(self, startPos: Position): Token {

diff --git a/projects/compiler/src/test_utils.abra b/projects/compiler/src/test_utils.abra
@@ -12,22 +12,6 @@ export func printTokenAsJson(token: Token, indentLevelStart: Int, currentIndentL
   print("$endIndent}")
 }
 
-export func charToString(charVal: Int): String {
-  // TODO: clean this up
-  val lowestByte = charVal && 0xff
-
-  val higherBytes = [
-    charVal && 0xff00,
-    charVal && 0xff0000,
-    charVal && 0xff000000,
-  ]
-  for b, idx in higherBytes {
-    if b != 0 unreachable("byte $idx of Char($charVal) is not 0")
-  }
-
-  lowestByte.hex()
-}
-
 func printTokenKindAsJson(kind: TokenKind, indentLevelStart: Int, currentIndentLevel: Int) {
   val startIndent = "  ".repeat(indentLevelStart)
   val fieldsIndent = "  ".repeat(currentIndentLevel + 1)
@@ -48,10 +32,8 @@ func printTokenKindAsJson(kind: TokenKind, indentLevelStart: Int, currentIndentL
       println("$fieldsIndent\"value\": $value")
     }
     TokenKind.Char(intVal) => {
-      val charAsString = charToString(intVal)
-
       println("$fieldsIndent\"name\": \"Char\",")
-      println("$fieldsIndent\"value\": \"$charAsString\"")
+      println("$fieldsIndent\"value\": \"${intVal.hex()}\"")
     }
     TokenKind.String(value) => {
       println("$fieldsIndent\"name\": \"String\",")
@@ -364,9 +346,7 @@ func printAstNodeKindAsJson(kind: AstNodeKind, indentLevelStart: Int, currentInd
         LiteralAstNode.Float(value) => ("float", value.toString())
         LiteralAstNode.Bool(value) => ("bool", value.toString())
         LiteralAstNode.Char(value) => {
-          val charAsString = charToString(value)
-
-          ("char", "\"$charAsString\"")
+          ("char", "\"${value.hex()}\"")
         }
         LiteralAstNode.String(value) => ("string", "\"$value\"")
       }
@@ -692,8 +672,7 @@ func printAstNodeKindAsJson(kind: AstNodeKind, indentLevelStart: Int, currentInd
                 LiteralAstNode.Float(value) => ("float", value.toString())
                 LiteralAstNode.Bool(value) => ("bool", value.toString())
                 LiteralAstNode.Char(value) => {
-                  val charAsString = charToString(value)
-                  ("char", "\"$charAsString\"")
+                  ("char", "\"${value.hex()}\"")
                 }
                 LiteralAstNode.String(value) => ("string", "\"$value\"")
               }

diff --git a/projects/compiler/src/typechecker_test_utils.abra b/projects/compiler/src/typechecker_test_utils.abra
@@ -1,6 +1,6 @@
 import LiteralAstNode, IndexingMode from "./parser"
 import Type, TypeKind, TypedModule, TypedAstNode, TypedAstNodeKind, Variable, Function, FunctionKind, Scope, Struct, Enum, StructOrEnum, AccessorPathSegment, TypedInvokee, TypedIndexingNode, TypedAssignmentMode, Field, EnumVariantKind, Export, TypedMatchCaseKind from "./typechecker"
-import printTokenAsJson, printLabelAsJson, printBindingPatternAsJson, charToString from "./test_utils"
+import printTokenAsJson, printLabelAsJson, printBindingPatternAsJson from "./test_utils"
 
 export type Jsonifier {
   indentLevel: Int = 0
@@ -331,7 +331,7 @@ export type Jsonifier {
           LiteralAstNode.Int(value) => self.println("\"value\": $value")
           LiteralAstNode.Float(value) => self.println("\"value\": $value")
           LiteralAstNode.Bool(value) => self.println("\"value\": $value")
-          LiteralAstNode.Char(value) => self.println("\"value\": \"${charToString(value)}\"")
+          LiteralAstNode.Char(value) => self.println("\"value\": \"${value.hex()}\"")
           LiteralAstNode.String(value) => self.println("\"value\": \"$value\"")
         }
       }

diff --git a/projects/compiler/test/compiler/chars.abra b/projects/compiler/test/compiler/chars.abra
@@ -9,12 +9,16 @@ println(a == b, a == 'a')
 /// Expect: false true
 println(b != b, b != 'a')
 
+// Test escape sequences in char literals
+/// Expect: true
+println('\\' == '\u005C')
+
 val m = { 'a': 1, 'b': 2 }
 /// Expect: { a: 1, b: 2 } Option.None Option.Some(value: 1)
 println(m, m['c'], m['a'])
 
+// Test UTF-8 encoding/decoding
 val chars = "a£￫😀".chars()
-
 /// Expect: a 97 [0b1100001]
 /// Expect: £ 163 [0b11000010, 0b10100011]
 /// Expect: ￫ 65515 [0b11101111, 0b10111111, 0b10101011]

diff --git a/projects/compiler/test/compiler/process_callstack.abra b/projects/compiler/test/compiler/process_callstack.abra
@@ -26,7 +26,7 @@ val arr = [1].map((i, _) => {
 /// Expect:   at baz (%TEST_DIR%/compiler/process_callstack.abra:10)
 /// Expect:   at bar (%TEST_DIR%/compiler/process_callstack.abra:5)
 /// Expect:   at foo (%TEST_DIR%/compiler/process_callstack.abra:19)
-/// Expect:   at <expression> (%STD_DIR%/prelude.abra:688)
+/// Expect:   at <expression> (%STD_DIR%/prelude.abra:690)
 /// Expect:   at Array.map (%TEST_DIR%/compiler/process_callstack.abra:18)
 
 type OneTwoThreeIterator {

diff --git a/projects/compiler/test/lexer/chars.abra b/projects/compiler/test/lexer/chars.abra
@@ -1,4 +1,9 @@
 'a'
 ' '
 '{'
-'Z'
+'Z'
+
+'\0'
+'\n'
+'\\'
+'\u00E9'
diff --git a/projects/compiler/test/lexer/chars.out.json b/projects/compiler/test/lexer/chars.out.json
@@ -26,5 +26,33 @@
       "name": "Char",
       "value": "0x5a"
     }
+  },
+  {
+    "position": [6, 1],
+    "kind": {
+      "name": "Char",
+      "value": "0x0"
+    }
+  },
+  {
+    "position": [7, 1],
+    "kind": {
+      "name": "Char",
+      "value": "0xa"
+    }
+  },
+  {
+    "position": [8, 1],
+    "kind": {
+      "name": "Char",
+      "value": "0x5c"
+    }
+  },
+  {
+    "position": [9, 1],
+    "kind": {
+      "name": "Char",
+      "value": "0xe9"
+    }
   }
 ]
diff --git a/projects/compiler/test/lexer/chars_error_invalid_unicode_seq_char.abra b/projects/compiler/test/lexer/chars_error_invalid_unicode_seq_char.abra
@@ -0,0 +1 @@
+'\u04x4'
diff --git a/projects/compiler/test/lexer/chars_error_invalid_unicode_seq_char.out b/projects/compiler/test/lexer/chars_error_invalid_unicode_seq_char.out
@@ -0,0 +1,5 @@
+Error at %FILE_NAME%:1:2
+Unsupported escape sequence:
+  |  '\u04x4'
+      ^^^^
+Unicode escape sequences must be \u followed by 4 hexadecimal characters (between 0000 and 7FFF)
diff --git a/projects/compiler/test/lexer/chars_error_invalid_unicode_seq_eof.abra b/projects/compiler/test/lexer/chars_error_invalid_unicode_seq_eof.abra
@@ -0,0 +1 @@
+'\u04
diff --git a/projects/compiler/test/lexer/chars_error_invalid_unicode_seq_eof.out b/projects/compiler/test/lexer/chars_error_invalid_unicode_seq_eof.out
@@ -0,0 +1,5 @@
+Error at %FILE_NAME%:1:2
+Unsupported escape sequence:
+  |  '\u04
+      ^^^^
+Unicode escape sequences must be \u followed by 4 hexadecimal characters (between 0000 and 7FFF)
diff --git a/projects/compiler/test/lexer/chars_error_invalid_unicode_seq_length.abra b/projects/compiler/test/lexer/chars_error_invalid_unicode_seq_length.abra
@@ -0,0 +1 @@
+'\u04'
diff --git a/projects/compiler/test/lexer/chars_error_invalid_unicode_seq_length.out b/projects/compiler/test/lexer/chars_error_invalid_unicode_seq_length.out
@@ -0,0 +1,5 @@
+Error at %FILE_NAME%:1:2
+Unsupported escape sequence:
+  |  '\u04'
+      ^^^^
+Unicode escape sequences must be \u followed by 4 hexadecimal characters (between 0000 and 7FFF)
diff --git a/projects/compiler/test/lexer/chars_error_too_big.out b/projects/compiler/test/lexer/chars_error_too_big.out
@@ -1,4 +1,4 @@
 Error at %FILE_NAME%:1:3
-Unexpected character 'b':
+Unterminated character literal:
   |  'ab'
        ^
diff --git a/projects/compiler/test/lexer/chars_error_unsupported_escape_sequence.abra b/projects/compiler/test/lexer/chars_error_unsupported_escape_sequence.abra
@@ -0,0 +1 @@
+'\z'
diff --git a/projects/compiler/test/lexer/chars_error_unsupported_escape_sequence.out b/projects/compiler/test/lexer/chars_error_unsupported_escape_sequence.out
@@ -0,0 +1,4 @@
+Error at %FILE_NAME%:1:2
+Unsupported escape sequence:
+  |  '\z'
+      ^^
diff --git a/projects/compiler/test/lexer/chars_error_unterminated_eof.abra b/projects/compiler/test/lexer/chars_error_unterminated_eof.abra
@@ -0,0 +1 @@
+'a
diff --git a/projects/compiler/test/lexer/chars_error_unterminated_eof.out b/projects/compiler/test/lexer/chars_error_unterminated_eof.out
@@ -0,0 +1,4 @@
+Error at %FILE_NAME%:1:3
+Unterminated character literal:
+  |  'a
+       ^
diff --git a/projects/compiler/test/lexer/chars_error_unterminated_newline.abra b/projects/compiler/test/lexer/chars_error_unterminated_newline.abra
@@ -0,0 +1,2 @@
+'a
+'
diff --git a/projects/compiler/test/lexer/chars_error_unterminated_newline.out b/projects/compiler/test/lexer/chars_error_unterminated_newline.out
@@ -0,0 +1,4 @@
+Error at %FILE_NAME%:1:3
+Unterminated character literal:
+  |  'a
+       ^
diff --git a/projects/compiler/test/lexer/strings.abra b/projects/compiler/test/lexer/strings.abra
@@ -1,4 +1,4 @@
 "" "abcd"
 "hello wörld: 1 + 2   ! 👩🏻‍⚕️"
-"a\nb\tc\\\\nd\'e\"f\$$"
+"a\nb\tc\\\\nde\"f\$$"
 "\u007a\u306e\u77e7\ud801"
diff --git a/projects/compiler/test/lexer/strings.out.json b/projects/compiler/test/lexer/strings.out.json
@@ -25,7 +25,7 @@
     "kind": {
       "name": "String",
       "value": "a
-b	c\\nd'e"f$$"
+b	c\\nde"f$$"
     }
   },
   {