Skip to content

Commit

Permalink
Adding escape sequences to char literals (#514)
Browse files Browse the repository at this point in the history
This change also modifies the escape-sequence parsing for Unicode
characters to use the logic in the `Char.fromInt` prelude method, now
that that method exists.
  • Loading branch information
kengorab authored Dec 1, 2024
1 parent 5043b18 commit 4e21e04
Show file tree
Hide file tree
Showing 25 changed files with 145 additions and 95 deletions.
24 changes: 13 additions & 11 deletions projects/compiler/example.abra
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
val chars = "a£→😀".chars()
// val chars = "a£→😀".chars()

/// Expect: a 97 [0b1100001]
/// Expect: £ 163 [0b11000010, 0b10100011]
/// Expect: → 65515 [0b11101111, 0b10111111, 0b10101011]
/// Expect: 😀 128512 [0b11110000, 0b10011111, 0b10011000, 0b10000000]
for ch in chars {
println(ch, ch.asInt(), ch.bytes().map(b => b.binary()))
}
// /// Expect: a 97 [0b1100001]
// /// Expect: £ 163 [0b11000010, 0b10100011]
// /// Expect: → 65515 [0b11101111, 0b10111111, 0b10101011]
// /// Expect: 😀 128512 [0b11110000, 0b10011111, 0b10011000, 0b10000000]
// for ch in chars {
// println(ch, ch.asInt(), ch.bytes().map(b => b.binary()))
// }

val ch = Char.fromInt(0xD800)
/// Expect: �
println(ch)
// val ch = Char.fromInt(0xD800)
// /// Expect: �
// println(ch)

println(0.hex())
94 changes: 42 additions & 52 deletions projects/compiler/src/lexer.abra
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ export enum StringInterpolationChunk {

export enum LexerErrorKind {
UnexpectedChar(char: String)
UnterminatedCharLiteral
UnterminatedString(start: Position)
UnsupportedEscapeSequence(seq: String, isUnicode: Bool)
UnexpectedEof
Expand All @@ -182,6 +183,10 @@ export type LexerError {
lines.push("Unexpected character '$char':")
lines.push(self._getCursorLine(self.position, contents))
}
LexerErrorKind.UnterminatedCharLiteral => {
lines.push("Unterminated character literal:")
lines.push(self._getCursorLine(self.position, contents))
}
LexerErrorKind.UnterminatedString(start) => {
lines.push("Unterminated string:")
lines.push(" String begins at (${start.line}:${start.col})")
Expand Down Expand Up @@ -445,15 +450,35 @@ export type Lexer {

var ch = self._input[self._cursor]
if ch == "'" return Err(LexerError(position: self._curPos(), kind: LexerErrorKind.UnexpectedChar(ch)))
if ch == "\\" todo("character escape sequences")
val intVal = if ch == "\\" {
val pos = self._curPos()
self._advance() // consume '\'
if self._cursor >= self._input.length return Err(LexerError(position: pos, kind: LexerErrorKind.UnexpectedEof))
ch = self._input[self._cursor]

match ch {
"0" => 0
"n" => 10
"\\" => 92
"r" => 13
"t" => 9
"'" => 39
"u" => {
val ch = try self._parseUnicodeEscape(pos)
ch.asInt()
}
_ => return Err(LexerError(position: pos, kind: LexerErrorKind.UnsupportedEscapeSequence(seq: "\\$ch", isUnicode: false)))
}
} else {
// TODO: once characters are supported, there should be no need for low-level byte manipulation like this
ch._buffer.offset(0).load().asInt()
}

// TODO: once characters are supported, there should be no need for low-level byte manipulation like this
val intVal = ch._buffer.offset(0).load().asInt()
self._advance()

ch = self._input[self._cursor]
if ch != "'" {
return Err(LexerError(position: self._curPos(), kind: LexerErrorKind.UnexpectedChar(ch)))
if self._cursor >= self._input.length || ch != "'" {
return Err(LexerError(position: self._curPos(), kind: LexerErrorKind.UnterminatedCharLiteral))
}
self._advance() // consume "'"

Expand Down Expand Up @@ -482,13 +507,16 @@ export type Lexer {
val ch = self._input[self._cursor]
val escapedCh = match ch {
"n" => "\n"
"\\" => "\\",
"r" => "\r",
"t" => "\t",
"\'" => "\'",
"\"" => "\"",
"$" => "$",
"u" => try self._parseUnicodeEscape(pos)
"\\" => "\\"
"r" => "\r"
"t" => "\t"
// "\'" => "\'"
"\"" => "\""
"$" => "$"
"u" => {
val ch = try self._parseUnicodeEscape(pos)
ch.toString()
}
_ => return Err(LexerError(position: pos, kind: LexerErrorKind.UnsupportedEscapeSequence(seq: "\\$ch", isUnicode: false)))
}

Expand Down Expand Up @@ -581,7 +609,7 @@ export type Lexer {
}
}

func _parseUnicodeEscape(self, startPos: Position): Result<String, LexerError> {
func _parseUnicodeEscape(self, startPos: Position): Result<Char, LexerError> {
self._advance() // consume 'u'

var value = 0
Expand All @@ -601,45 +629,7 @@ export type Lexer {
self._advance()
self._advance()

// TODO: Clean up this unicode->utf-8 conversion and abstract it away into some stdlib method once Chars exist?
val str = if !(value.isBetween(0, 0xD7FF, true) || value.isBetween(0xE000, 0x10FFFF, true)) {
"�"
} else if value.isBetween(0, 0x007F, true) {
val s = String.withLength(1)
s._buffer.offset(0).store(value.asByte())
s
} else if value.isBetween(0x0080, 0x07FF, true) {
val s = String.withLength(2)
val b1 = 0b11000000 || ((value && 0b11111000000) >> 6)
val b2 = 0b10000000 || (value && 0b00000111111)
s._buffer.offset(0).store(b1.asByte())
s._buffer.offset(1).store(b2.asByte())
s
} else if value.isBetween(0x0800, 0xFFFF, true) {
val s = String.withLength(3)
val b1 = 0b11100000 || ((value && 0b1111000000000000) >> 12)
val b2 = 0b10000000 || ((value && 0b0000111111000000) >> 6)
val b3 = 0b10000000 || (value && 0b0000000000111111)
s._buffer.offset(0).store(b1.asByte())
s._buffer.offset(1).store(b2.asByte())
s._buffer.offset(2).store(b3.asByte())
s
} else if value.isBetween(0x10000, 0x10FFFF, true) {
val s = String.withLength(4)
val b1 = 0b11110000 || (value && 0b111000000000000000000)
val b2 = 0b10000000 || (value && 0b000111111000000000000)
val b3 = 0b10000000 || (value && 0b000000000111111000000)
val b4 = 0b10000000 || (value && 0b000000000000000111111)
s._buffer.offset(0).store(b1.asByte())
s._buffer.offset(1).store(b2.asByte())
s._buffer.offset(2).store(b3.asByte())
s._buffer.offset(3).store(b4.asByte())
s
} else {
"�"
}

Ok(str)
Ok(Char.fromInt(value))
}

func _tokenizeIdentifier(self, startPos: Position): Token {
Expand Down
27 changes: 3 additions & 24 deletions projects/compiler/src/test_utils.abra
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,6 @@ export func printTokenAsJson(token: Token, indentLevelStart: Int, currentIndentL
print("$endIndent}")
}

// Renders a char's integer value as a hex string for the JSON test output.
//
// Only the lowest byte of `charVal` is rendered; the three masks below
// (0xff00, 0xff0000, 0xff000000) are checked first, and if any of them
// selects a non-zero value the function hits `unreachable(...)` instead of
// returning. Bits above those masks are not inspected.
//
// NOTE(review): `&&` is applied to Int operands here, so it is presumably a
// bitwise AND in this language - confirm against the language reference.
export func charToString(charVal: Int): String {
// TODO: clean this up
// Keep only the low 8 bits; this is the value that gets printed.
val lowestByte = charVal && 0xff

// Each entry isolates one of the next three bytes of `charVal` (left in
// place, not shifted down), so a non-zero entry means that byte is set.
val higherBytes = [
charVal && 0xff00,
charVal && 0xff0000,
charVal && 0xff000000,
]
for b, idx in higherBytes {
// `idx` is the position within `higherBytes`, not the byte's position within
// `charVal` - presumably 0-based, so "byte 0" in the message would refer to
// the second-lowest byte (TODO confirm).
if b != 0 unreachable("byte $idx of Char($charVal) is not 0")
}

// Result is whatever `Int.hex()` produces for the low byte.
lowestByte.hex()
}

func printTokenKindAsJson(kind: TokenKind, indentLevelStart: Int, currentIndentLevel: Int) {
val startIndent = " ".repeat(indentLevelStart)
val fieldsIndent = " ".repeat(currentIndentLevel + 1)
Expand All @@ -48,10 +32,8 @@ func printTokenKindAsJson(kind: TokenKind, indentLevelStart: Int, currentIndentL
println("$fieldsIndent\"value\": $value")
}
TokenKind.Char(intVal) => {
val charAsString = charToString(intVal)

println("$fieldsIndent\"name\": \"Char\",")
println("$fieldsIndent\"value\": \"$charAsString\"")
println("$fieldsIndent\"value\": \"${intVal.hex()}\"")
}
TokenKind.String(value) => {
println("$fieldsIndent\"name\": \"String\",")
Expand Down Expand Up @@ -364,9 +346,7 @@ func printAstNodeKindAsJson(kind: AstNodeKind, indentLevelStart: Int, currentInd
LiteralAstNode.Float(value) => ("float", value.toString())
LiteralAstNode.Bool(value) => ("bool", value.toString())
LiteralAstNode.Char(value) => {
val charAsString = charToString(value)

("char", "\"$charAsString\"")
("char", "\"${value.hex()}\"")
}
LiteralAstNode.String(value) => ("string", "\"$value\"")
}
Expand Down Expand Up @@ -692,8 +672,7 @@ func printAstNodeKindAsJson(kind: AstNodeKind, indentLevelStart: Int, currentInd
LiteralAstNode.Float(value) => ("float", value.toString())
LiteralAstNode.Bool(value) => ("bool", value.toString())
LiteralAstNode.Char(value) => {
val charAsString = charToString(value)
("char", "\"$charAsString\"")
("char", "\"${value.hex()}\"")
}
LiteralAstNode.String(value) => ("string", "\"$value\"")
}
Expand Down
4 changes: 2 additions & 2 deletions projects/compiler/src/typechecker_test_utils.abra
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import LiteralAstNode, IndexingMode from "./parser"
import Type, TypeKind, TypedModule, TypedAstNode, TypedAstNodeKind, Variable, Function, FunctionKind, Scope, Struct, Enum, StructOrEnum, AccessorPathSegment, TypedInvokee, TypedIndexingNode, TypedAssignmentMode, Field, EnumVariantKind, Export, TypedMatchCaseKind from "./typechecker"
import printTokenAsJson, printLabelAsJson, printBindingPatternAsJson, charToString from "./test_utils"
import printTokenAsJson, printLabelAsJson, printBindingPatternAsJson from "./test_utils"

export type Jsonifier {
indentLevel: Int = 0
Expand Down Expand Up @@ -331,7 +331,7 @@ export type Jsonifier {
LiteralAstNode.Int(value) => self.println("\"value\": $value")
LiteralAstNode.Float(value) => self.println("\"value\": $value")
LiteralAstNode.Bool(value) => self.println("\"value\": $value")
LiteralAstNode.Char(value) => self.println("\"value\": \"${charToString(value)}\"")
LiteralAstNode.Char(value) => self.println("\"value\": \"${value.hex()}\"")
LiteralAstNode.String(value) => self.println("\"value\": \"$value\"")
}
}
Expand Down
6 changes: 5 additions & 1 deletion projects/compiler/test/compiler/chars.abra
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,16 @@ println(a == b, a == 'a')
/// Expect: false true
println(b != b, b != 'a')

// Test escape sequences in char literals
/// Expect: true
println('\\' == '\u005C')

val m = { 'a': 1, 'b': 2 }
/// Expect: { a: 1, b: 2 } Option.None Option.Some(value: 1)
println(m, m['c'], m['a'])

// Test UTF-8 encoding/decoding
val chars = "a£→😀".chars()

/// Expect: a 97 [0b1100001]
/// Expect: £ 163 [0b11000010, 0b10100011]
/// Expect: → 65515 [0b11101111, 0b10111111, 0b10101011]
Expand Down
2 changes: 1 addition & 1 deletion projects/compiler/test/compiler/process_callstack.abra
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ val arr = [1].map((i, _) => {
/// Expect: at baz (%TEST_DIR%/compiler/process_callstack.abra:10)
/// Expect: at bar (%TEST_DIR%/compiler/process_callstack.abra:5)
/// Expect: at foo (%TEST_DIR%/compiler/process_callstack.abra:19)
/// Expect: at <expression> (%STD_DIR%/prelude.abra:688)
/// Expect: at <expression> (%STD_DIR%/prelude.abra:690)
/// Expect: at Array.map (%TEST_DIR%/compiler/process_callstack.abra:18)

type OneTwoThreeIterator {
Expand Down
7 changes: 6 additions & 1 deletion projects/compiler/test/lexer/chars.abra
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
'a'
' '
'{'
'Z'
'Z'

'\0'
'\n'
'\\'
'\u00E9'
28 changes: 28 additions & 0 deletions projects/compiler/test/lexer/chars.out.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,33 @@
"name": "Char",
"value": "0x5a"
}
},
{
"position": [6, 1],
"kind": {
"name": "Char",
"value": "0x0"
}
},
{
"position": [7, 1],
"kind": {
"name": "Char",
"value": "0xa"
}
},
{
"position": [8, 1],
"kind": {
"name": "Char",
"value": "0x5c"
}
},
{
"position": [9, 1],
"kind": {
"name": "Char",
"value": "0xe9"
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\u04x4'
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Error at %FILE_NAME%:1:2
Unsupported escape sequence:
| '\u04x4'
^^^^
Unicode escape sequences must be \u followed by 4 hexadecimal characters (between 0000 and 7FFF)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\u04
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Error at %FILE_NAME%:1:2
Unsupported escape sequence:
| '\u04
^^^^
Unicode escape sequences must be \u followed by 4 hexadecimal characters (between 0000 and 7FFF)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\u04'
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Error at %FILE_NAME%:1:2
Unsupported escape sequence:
| '\u04'
^^^^
Unicode escape sequences must be \u followed by 4 hexadecimal characters (between 0000 and 7FFF)
2 changes: 1 addition & 1 deletion projects/compiler/test/lexer/chars_error_too_big.out
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Error at %FILE_NAME%:1:3
Unexpected character 'b':
Unterminated character literal:
| 'ab'
^
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\z'
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Error at %FILE_NAME%:1:2
Unsupported escape sequence:
| '\z'
^^
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'a
4 changes: 4 additions & 0 deletions projects/compiler/test/lexer/chars_error_unterminated_eof.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Error at %FILE_NAME%:1:3
Unterminated character literal:
| 'a
^
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
'a
'
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Error at %FILE_NAME%:1:3
Unterminated character literal:
| 'a
^
2 changes: 1 addition & 1 deletion projects/compiler/test/lexer/strings.abra
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"" "abcd"
"hello wörld: 1 + 2 ! 👩🏻‍⚕️"
"a\nb\tc\\\\nd\'e\"f\$$"
"a\nb\tc\\\\nde\"f\$$"
"\u007a\u306e\u77e7\ud801"
2 changes: 1 addition & 1 deletion projects/compiler/test/lexer/strings.out.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"kind": {
"name": "String",
"value": "a
b c\\nd'e"f$$"
b c\\nde"f$$"
}
},
{
Expand Down
Loading

0 comments on commit 4e21e04

Please sign in to comment.