From c629777d91594f0e21004db2d757335770949d7b Mon Sep 17 00:00:00 2001 From: "Matthew \"strager\" Glazar" Date: Sat, 2 Mar 2024 21:20:22 -0500 Subject: [PATCH] =?UTF-8?q?feat(fe):=20error=20on=20more=20confusables=20l?= =?UTF-8?q?ike=20=C7=83=20and=20=EF=B4=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also remove some assumptions that certain symbols in expressions are only one byte. --- docs/CHANGELOG.md | 6 +- src/quick-lint-js/fe/expression.h | 35 +++------- src/quick-lint-js/fe/lex.cpp | 107 ++++++++++++++++++++++++++++-- test/test-parse-expression.cpp | 32 +++++++++ 4 files changed, 149 insertions(+), 31 deletions(-) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index ea615a3d8c..d669398317 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -13,8 +13,10 @@ Semantic Versioning. * VS Code: You can now make quick-lint-js messages fun and insulting with the `quick-lint-js.snarky` setting (disabled by default). (Implemented by [vegerot][].) -* Using Greek question mark (;, U+037E) instead of a semicolon (;, U+003B) now - reports [E0457][] ("this is a Greek Question Mark, not a semicolon (';')"). +* Using Greek question mark (`;`, U+037E) instead of a semicolon (`;`, U+003B) + now reports [E0457][] ("this is a Greek Question Mark, not a semicolon + (';')"). This diagnostic is also reported for similar-looking characters like + `ǃ` (which should be `!`) and `﴾` (which should be `(`). * TypeScript: Decorators on abstract classes are now parsed. 
([#1194][]) ### Fixed diff --git a/src/quick-lint-js/fe/expression.h b/src/quick-lint-js/fe/expression.h index 14486be95b..fca2870188 100644 --- a/src/quick-lint-js/fe/expression.h +++ b/src/quick-lint-js/fe/expression.h @@ -615,23 +615,19 @@ class Expression::Call final : public Expression { Source_Code_Span left_paren_span, const Char8 *span_end, std::optional<Source_Code_Span> optional_chaining_operator) : Expression(kind), - call_left_paren_begin_(left_paren_span.begin()), + call_left_paren_(left_paren_span), span_end_(span_end), children_(children), optional_chaining_operator_begin_( optional_chaining_operator.has_value() ? optional_chaining_operator->begin() : nullptr) { - QLJS_ASSERT(left_paren_span.size() == 1); if (optional_chaining_operator.has_value()) { QLJS_ASSERT(optional_chaining_operator->size() == 2); } } - Source_Code_Span left_paren_span() const { - return Source_Code_Span(this->call_left_paren_begin_, - this->call_left_paren_begin_ + 1); - } + Source_Code_Span left_paren_span() const { return this->call_left_paren_; } std::optional<Source_Code_Span> optional_chaining_operator_span() const { if (this->optional_chaining_operator_begin_ == nullptr) { @@ -641,7 +637,7 @@ class Expression::Call final : public Expression { this->optional_chaining_operator_begin_ + 2); } - const Char8 *call_left_paren_begin_; + Source_Code_Span call_left_paren_; const Char8 *span_end_; Expression_Arena::Array_Ptr children_; const Char8 *optional_chaining_operator_begin_ = nullptr; @@ -895,16 +891,12 @@ class Expression::Optional final : public Expression { static constexpr Expression_Kind kind = Expression_Kind::Optional; explicit Optional(Expression *child, Source_Code_Span question_span) - : Expression(kind), child_(child), question_end_(question_span.end()) { - QLJS_ASSERT(question_span.end() - question_span.begin() == 1); - } + : Expression(kind), child_(child), question_(question_span) {} - Source_Code_Span question_span() const { - return Source_Code_Span(this->question_end_ - 1, 
this->question_end_); - } + Source_Code_Span question_span() const { return this->question_; } Expression *child_; - const Char8 *question_end_; + Source_Code_Span question_; }; static_assert(Expression_Arena::is_allocatable); @@ -1093,23 +1085,18 @@ class Expression::Type_Annotated final : public Expression { const Char8 *span_end) : Expression(kind), child_(child), - colon_(colon_span.begin()), + colon_(colon_span), type_visits_(std::move(type_visits)), - span_end_(span_end) { - QLJS_ASSERT(*colon_span.begin() == u8':'); - QLJS_ASSERT(colon_span.size() == 1); - } + span_end_(span_end) {} - Source_Code_Span colon_span() const { - return Source_Code_Span(this->colon_, this->colon_ + 1); - } + Source_Code_Span colon_span() const { return this->colon_; } void visit_type_annotation(Parse_Visitor_Base &v) { std::move(this->type_visits_).move_into(v); } Expression *child_; - const Char8 *colon_; + Source_Code_Span colon_; Buffering_Visitor type_visits_{nullptr}; const Char8 *span_end_; }; @@ -1458,7 +1445,7 @@ inline Source_Code_Span Expression::span() const { case Expression_Kind::Optional: { auto *optional = expression_cast<const Expression::Optional *>(this); return Source_Code_Span(optional->child_->span().begin(), - optional->question_end_); + optional->question_.end()); } case Expression_Kind::Paren: return expression_cast<const Expression::Paren *>(this)->span_; diff --git a/src/quick-lint-js/fe/lex.cpp b/src/quick-lint-js/fe/lex.cpp index bc24d7a90c..95799d41c6 100644 --- a/src/quick-lint-js/fe/lex.cpp +++ b/src/quick-lint-js/fe/lex.cpp @@ -93,16 +93,113 @@ constexpr char32_t right_double_quote = U'\u201d'; struct Confusable_Symbol { char32_t confusable; - Char8 confusable_name[20]; + Char8 confusable_name[51]; Char8 symbol; - Char8 symbol_name[20]; + Char8 symbol_name[21]; Token_Type symbol_token_type; }; Confusable_Symbol confusable_symbols[] = { - {0x037e, u8"Greek Question Mark", u8';', u8"semicolon", - Token_Type::semicolon}, - // TODO(strager): Add more. 
+ // clang-format off + { 0x037e, u8"Greek Question Mark", u8';', u8"semicolon", Token_Type::semicolon}, + + { 0x02d0, u8"Modifier Letter Triangular Colon", u8':', u8"colon", Token_Type::colon}, + { 0x02f8, u8"Modifier Letter Raised Colon", u8':', u8"colon", Token_Type::colon}, + { 0x0589, u8"Armenian Full Stop", u8':', u8"colon", Token_Type::colon}, + { 0x05c3, u8"Hebrew Punctuation Sof Pasuq", u8':', u8"colon", Token_Type::colon}, + { 0x0703, u8"Syriac Supralinear Colon", u8':', u8"colon", Token_Type::colon}, + { 0x0704, u8"Syriac Sublinear Colon", u8':', u8"colon", Token_Type::colon}, + { 0x0903, u8"Devanagari Sign Visarga", u8':', u8"colon", Token_Type::colon}, + { 0x0a83, u8"Gujarati Sign Visarga", u8':', u8"colon", Token_Type::colon}, + { 0x16ec, u8"Runic Multiple Punctuation", u8':', u8"colon", Token_Type::colon}, + { 0x1803, u8"Mongolian Full Stop", u8':', u8"colon", Token_Type::colon}, + { 0x1809, u8"Mongolian Manchu Full Stop", u8':', u8"colon", Token_Type::colon}, + { 0x205a, u8"Two Dot Punctuation", u8':', u8"colon", Token_Type::colon}, + { 0x2236, u8"Ratio", u8':', u8"colon", Token_Type::colon}, + { 0xa4fd, u8"Lisu Letter Tone Mya Jeu", u8':', u8"colon", Token_Type::colon}, + { 0xa789, u8"Modifier Letter Colon", u8':', u8"colon", Token_Type::colon}, + { 0xfe30, u8"Presentation Form For Vertical Two Dot Leader", u8':', u8"colon", Token_Type::colon}, + { 0xff1a, u8"Fullwidth Colon", u8':', u8"colon", Token_Type::colon}, + + { 0x00b8, u8"Cedilla", u8',', u8"comma", Token_Type::comma}, + { 0x060d, u8"Arabic Date Separator", u8',', u8"comma", Token_Type::comma}, + { 0x066b, u8"Arabic Decimal Separator", u8',', u8"comma", Token_Type::comma}, + { 0x201a, u8"Single Low-9 Quotation Mark", u8',', u8"comma", Token_Type::comma}, + { 0xa4f9, u8"Lisu Letter Tone Na Po", u8',', u8"comma", Token_Type::comma}, + + { 0x01c3, u8"Latin Letter Retroflex Click", u8'!', u8"exclamation mark", Token_Type::bang}, + { 0x2d51, u8"Tifinagh Letter Tuareg Yang", u8'!', 
u8"exclamation mark", Token_Type::bang}, + { 0xff01, u8"Fullwidth Exclamation Mark", u8'!', u8"exclamation mark", Token_Type::bang}, + + // TODO(strager): Also match symbols like "․․․". + { 0x0660, u8"Arabic-Indic Digit Zero", u8'.', u8"dot", Token_Type::dot}, + { 0x06f0, u8"Extended Arabic-Indic Digit Zero", u8'.', u8"dot", Token_Type::dot}, + { 0x0701, u8"Syriac Supralinear Full Stop", u8'.', u8"dot", Token_Type::dot}, + { 0x0702, u8"Syriac Sublinear Full Stop", u8'.', u8"dot", Token_Type::dot}, + { 0x2024, u8"One Dot Leader", u8'.', u8"dot", Token_Type::dot}, + { 0xa4f8, u8"Lisu Letter Tone Mya Ti", u8'.', u8"dot", Token_Type::dot}, + { 0xa60e, u8"Vai Full Stop", u8'.', u8"dot", Token_Type::dot}, + {0x10a50, u8"Kharoshthi Punctuation Dot", u8'.', u8"dot", Token_Type::dot}, + {0x1d16d, u8"Musical Symbol Combining Augmentation Dot", u8'.', u8"dot", Token_Type::dot}, + + // NOTE(strager): We diverge from Unicode here. Unicode considers a few of these as parentheses, but we report all of them as square brackets ('[' and ']'). + { 0x2772, u8"Light Left Tortoise Shell Bracket Ornament", u8'[', u8"left square bracket", Token_Type::left_square}, + { 0x2773, u8"Light Right Tortoise Shell Bracket Ornament", u8']', u8"right square bracket", Token_Type::right_square}, + { 0x3014, u8"Left Tortoise Shell Bracket", u8'[', u8"left square bracket", Token_Type::left_square}, + { 0x3015, u8"Right Tortoise Shell Bracket", u8']', u8"right square bracket", Token_Type::right_square}, + { 0xff3b, u8"Fullwidth Left Square Bracket", u8'[', u8"left square bracket", Token_Type::left_square}, + { 0xff3d, u8"Fullwidth Right Square Bracket", u8']', u8"right square bracket", Token_Type::right_square}, + + { 0x2768, u8"Medium Left Parenthesis Ornament", u8'(', u8"left parenthesis", Token_Type::left_paren}, + { 0x2769, u8"Medium Right Parenthesis Ornament", u8')', u8"right parenthesis", Token_Type::right_paren}, + { 0xfd3e, u8"Ornate Left Parenthesis", u8'(', u8"left parenthesis", Token_Type::left_paren}, + { 0xfd3f, u8"Ornate Right Parenthesis", u8')', 
u8"right parenthesis", Token_Type::right_paren}, + + { 0x2774, u8"Medium Left Curly Bracket Ornament", u8'{', u8"left curly bracket", Token_Type::left_curly}, + { 0x2775, u8"Medium Right Curly Bracket Ornament", u8'}', u8"right curly bracket", Token_Type::right_curly}, + {0x1d114, u8"Musical Symbol Brace", u8'{', u8"left curly bracket", Token_Type::left_curly}, + + // TODO(strager): Also match symbols like "ꝸ=" and "᐀᐀". + // NOTE(strager): 0x0294 is legal in identifiers. + { 0x0294, u8"Latin Letter Glottal Stop", u8'?', u8"question mark", Token_Type::question}, + { 0x0241, u8"Latin Capital Letter Glottal Stop", u8'?', u8"question mark", Token_Type::question}, + { 0x097d, u8"Devanagari Letter Glottal Stop", u8'?', u8"question mark", Token_Type::question}, + { 0x13ae, u8"Cherokee Letter He", u8'?', u8"question mark", Token_Type::question}, + { 0xa6eb, u8"Bamum Letter Ntuu", u8'?', u8"question mark", Token_Type::question}, + + { 0xa778, u8"Latin Small Letter Um", u8'&', u8"ampersand", Token_Type::ampersand}, + + { 0x066d, u8"Arabic Five Pointed Star", u8'*', u8"asterisk", Token_Type::star}, + { 0x204e, u8"Low Asterisk", u8'*', u8"asterisk", Token_Type::star}, + { 0x2217, u8"Asterisk Operator", u8'*', u8"asterisk", Token_Type::star}, + {0x1031f, u8"Old Italic Letter Ess", u8'*', u8"asterisk", Token_Type::star}, + + { 0x02c4, u8"Modifier Letter Up Arrowhead", u8'^', u8"circumflex", Token_Type::circumflex}, + { 0x02c6, u8"Modifier Letter Circumflex Accent", u8'^', u8"circumflex", Token_Type::circumflex}, + + { 0x02c2, u8"Modifier Letter Left Arrowhead", u8'<', u8"less than", Token_Type::less}, + { 0x1438, u8"Canadian Syllabics Pa", u8'<', u8"less than", Token_Type::less}, + { 0x16b2, u8"Runic Letter Kauna", u8'<', u8"less than", Token_Type::less}, + { 0x2039, u8"Single Left-Pointing Angle Quotation Mark", u8'<', u8"less than", Token_Type::less}, + { 0x276e, u8"Heavy Left-Pointing Angle Quotation Mark Ornament", u8'<', u8"less than", Token_Type::less}, + {0x1d236, 
u8"Greek Instrumental Notation Symbol-40", u8'<', u8"less than", Token_Type::less}, + + { 0x02c3, u8"Modifier Letter Right Arrowhead", u8'>', u8"greater than", Token_Type::greater}, + { 0x1433, u8"Canadian Syllabics Po", u8'>', u8"greater than", Token_Type::greater}, + { 0x203a, u8"Single Right-Pointing Angle Quotation Mark", u8'>', u8"greater than", Token_Type::greater}, + { 0x276f, u8"Heavy Right-Pointing Angle Quotation Mark Ornament", u8'>', u8"greater than", Token_Type::greater}, + {0x16f3f, u8"Miao Letter Archaic Zza", u8'>', u8"greater than", Token_Type::greater}, + {0x1d237, u8"Greek Instrumental Notation Symbol-42", u8'>', u8"greater than", Token_Type::greater}, + + { 0x02dc, u8"Small Tilde", u8'~', u8"tilde", Token_Type::tilde}, + { 0x1fc0, u8"Greek Perispomeni", u8'~', u8"tilde", Token_Type::tilde}, + { 0x2053, u8"Swung Dash", u8'~', u8"tilde", Token_Type::tilde}, + { 0x223c, u8"Tilde Operator", u8'~', u8"tilde", Token_Type::tilde}, + + { 0x1400, u8"Canadian Syllabics Hyphen", u8'=', u8"equals", Token_Type::equal}, + { 0x2e40, u8"Double Hyphen", u8'=', u8"equals", Token_Type::equal}, + { 0x30a0, u8"Katakana-Hiragana Double Hyphen", u8'=', u8"equals", Token_Type::equal}, + { 0xa4ff, u8"Lisu Punctuation Full Stop", u8'=', u8"equals", Token_Type::equal}, }; bool look_up_in_unicode_table(const std::uint8_t* table, std::size_t table_size, diff --git a/test/test-parse-expression.cpp b/test/test-parse-expression.cpp index 4710e1cd4b..ff40b1a126 100644 --- a/test/test-parse-expression.cpp +++ b/test/test-parse-expression.cpp @@ -3834,6 +3834,38 @@ TEST_F(Test_Parse_Expression, precedence) { } } } + +TEST_F(Test_Parse_Expression, confusable_symbols) { + // In a previous version of quick-lint-js, confusable symbols would cause + // assertion failures during parsing. 
+ + { + Test_Parser p(u8"f﴾)"_sv, capture_diags); + Expression* ast = p.parse_expression(); + EXPECT_EQ(ast->kind(), Expression_Kind::Call); + p.assert_offsets(static_cast<Expression::Call*>(ast)->left_paren_span(), + u8"f"_sv.size(), u8"f﴾"_sv.size()); + } + + { + Test_Parser p(u8"(foo ʔ)"_sv, capture_diags); + Expression* ast = p.parse_expression(); + EXPECT_EQ(ast->without_paren()->kind(), Expression_Kind::Optional); + p.assert_offsets(static_cast<Expression::Optional*>(ast->without_paren()) + ->question_span(), + u8"(foo "_sv.size(), u8"(foo ʔ"_sv.size()); + } + + { + Test_Parser p(u8"(x ։ y)"_sv, capture_diags); + Expression* ast = p.parse_expression(); + EXPECT_EQ(ast->without_paren()->kind(), Expression_Kind::Type_Annotated); + p.assert_offsets( + static_cast<Expression::Type_Annotated*>(ast->without_paren()) + ->colon_span(), + u8"(x "_sv.size(), u8"(x ։"_sv.size()); + } +} } }