Skip to content

Commit

Permalink
feat(fe): error on more confusables like ǃ and ﴾
Browse files Browse the repository at this point in the history
Also remove some assumptions that certain symbols in expressions are
only one byte.
  • Loading branch information
strager committed Mar 3, 2024
1 parent b27315e commit c629777
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 31 deletions.
6 changes: 4 additions & 2 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@ Semantic Versioning.
* VS Code: You can now make quick-lint-js messages fun and insulting with the
`quick-lint-js.snarky` setting (disabled by default). (Implemented by
[vegerot][].)
* Using Greek question mark (;, U+037E) instead of a semicolon (;, U+003B) now
reports [E0457][] ("this is a Greek Question Mark, not a semicolon (';')").
* Using Greek question mark (`;`, U+037E) instead of a semicolon (`;`, U+003B)
now reports [E0457][] ("this is a Greek Question Mark, not a semicolon
(';')"). This diagnostic is also reported for similar-looking characters like
`ǃ` (which should be `!`) and `﴾` (which should be `(`).
* TypeScript: Decorators on abstract classes are now parsed. ([#1194][])

### Fixed
Expand Down
35 changes: 11 additions & 24 deletions src/quick-lint-js/fe/expression.h
Original file line number Diff line number Diff line change
Expand Up @@ -615,23 +615,19 @@ class Expression::Call final : public Expression {
Source_Code_Span left_paren_span, const Char8 *span_end,
std::optional<Source_Code_Span> optional_chaining_operator)
: Expression(kind),
call_left_paren_begin_(left_paren_span.begin()),
call_left_paren_(left_paren_span),
span_end_(span_end),
children_(children),
optional_chaining_operator_begin_(
optional_chaining_operator.has_value()
? optional_chaining_operator->begin()
: nullptr) {
QLJS_ASSERT(left_paren_span.size() == 1);
if (optional_chaining_operator.has_value()) {
QLJS_ASSERT(optional_chaining_operator->size() == 2);
}
}

Source_Code_Span left_paren_span() const {
return Source_Code_Span(this->call_left_paren_begin_,
this->call_left_paren_begin_ + 1);
}
Source_Code_Span left_paren_span() const { return this->call_left_paren_; }

std::optional<Source_Code_Span> optional_chaining_operator_span() const {
if (this->optional_chaining_operator_begin_ == nullptr) {
Expand All @@ -641,7 +637,7 @@ class Expression::Call final : public Expression {
this->optional_chaining_operator_begin_ + 2);
}

const Char8 *call_left_paren_begin_;
Source_Code_Span call_left_paren_;
const Char8 *span_end_;
Expression_Arena::Array_Ptr<Expression *> children_;
const Char8 *optional_chaining_operator_begin_ = nullptr;
Expand Down Expand Up @@ -895,16 +891,12 @@ class Expression::Optional final : public Expression {
static constexpr Expression_Kind kind = Expression_Kind::Optional;

explicit Optional(Expression *child, Source_Code_Span question_span)
: Expression(kind), child_(child), question_end_(question_span.end()) {
QLJS_ASSERT(question_span.end() - question_span.begin() == 1);
}
: Expression(kind), child_(child), question_(question_span) {}

Source_Code_Span question_span() const {
return Source_Code_Span(this->question_end_ - 1, this->question_end_);
}
Source_Code_Span question_span() const { return this->question_; }

Expression *child_;
const Char8 *question_end_;
Source_Code_Span question_;
};
static_assert(Expression_Arena::is_allocatable<Expression::Optional>);

Expand Down Expand Up @@ -1093,23 +1085,18 @@ class Expression::Type_Annotated final : public Expression {
const Char8 *span_end)
: Expression(kind),
child_(child),
colon_(colon_span.begin()),
colon_(colon_span),
type_visits_(std::move(type_visits)),
span_end_(span_end) {
QLJS_ASSERT(*colon_span.begin() == u8':');
QLJS_ASSERT(colon_span.size() == 1);
}
span_end_(span_end) {}

Source_Code_Span colon_span() const {
return Source_Code_Span(this->colon_, this->colon_ + 1);
}
Source_Code_Span colon_span() const { return this->colon_; }

void visit_type_annotation(Parse_Visitor_Base &v) {
std::move(this->type_visits_).move_into(v);
}

Expression *child_;
const Char8 *colon_;
Source_Code_Span colon_;
Buffering_Visitor type_visits_{nullptr};
const Char8 *span_end_;
};
Expand Down Expand Up @@ -1458,7 +1445,7 @@ inline Source_Code_Span Expression::span() const {
case Expression_Kind::Optional: {
auto *optional = expression_cast<const Expression::Optional *>(this);
return Source_Code_Span(optional->child_->span().begin(),
optional->question_end_);
optional->question_.end());
}
case Expression_Kind::Paren:
return expression_cast<const Paren *>(this)->span_;
Expand Down
107 changes: 102 additions & 5 deletions src/quick-lint-js/fe/lex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,16 +93,113 @@ constexpr char32_t right_double_quote = U'\u201d';

// One row of the confusable-symbol table: a Unicode code point that visually
// resembles an ASCII punctuation symbol, paired with the ASCII symbol it looks
// like, human-readable names for both, and the token type of the ASCII symbol.
//
// The name fields are fixed-size arrays initialized from string literals, so
// each must be large enough for the longest name in the table plus its
// terminating null character; grow them when adding a longer name.
struct Confusable_Symbol {
char32_t confusable;
Char8 confusable_name[20];
Char8 confusable_name[51];
Char8 symbol;
Char8 symbol_name[20];
Char8 symbol_name[21];
Token_Type symbol_token_type;
};

// Table of Unicode code points which are easily mistaken for ASCII punctuation.
//
// Each row is, in order: the confusable code point, its Unicode character name,
// the ASCII symbol it resembles, the name of that ASCII symbol, and the token
// type of that ASCII symbol. Within a row, the symbol, its name, and its token
// type must all describe the same ASCII character.
Confusable_Symbol confusable_symbols[] = {
{0x037e, u8"Greek Question Mark", u8';', u8"semicolon",
Token_Type::semicolon},
// TODO(strager): Add more.
// clang-format off
{ 0x037e, u8"Greek Question Mark", u8';', u8"semicolon", Token_Type::semicolon},

{ 0x02d0, u8"Modifier Letter Triangular Colon", u8':', u8"colon", Token_Type::colon},
{ 0x02f8, u8"Modifier Letter Raised Colon", u8':', u8"colon", Token_Type::colon},
{ 0x0589, u8"Armenian Full Stop", u8':', u8"colon", Token_Type::colon},
{ 0x05c3, u8"Hebrew Punctuation Sof Pasuq", u8':', u8"colon", Token_Type::colon},
{ 0x0703, u8"Syriac Supralinear Colon", u8':', u8"colon", Token_Type::colon},
{ 0x0704, u8"Syriac Sublinear Colon", u8':', u8"colon", Token_Type::colon},
{ 0x0903, u8"Devanagari Sign Visarga", u8':', u8"colon", Token_Type::colon},
{ 0x0a83, u8"Gujarati Sign Visarga", u8':', u8"colon", Token_Type::colon},
{ 0x16ec, u8"Runic Multiple Punctuation", u8':', u8"colon", Token_Type::colon},
{ 0x1803, u8"Mongolian Full Stop", u8':', u8"colon", Token_Type::colon},
{ 0x1809, u8"Mongolian Manchu Full Stop", u8':', u8"colon", Token_Type::colon},
{ 0x205a, u8"Two Dot Punctuation", u8':', u8"colon", Token_Type::colon},
{ 0x2236, u8"Ratio", u8':', u8"colon", Token_Type::colon},
{ 0xa4fd, u8"Lisu Letter Tone Mya Jeu", u8':', u8"colon", Token_Type::colon},
{ 0xa789, u8"Modifier Letter Colon", u8':', u8"colon", Token_Type::colon},
{ 0xfe30, u8"Presentation Form For Vertical Two Dot Leader", u8':', u8"colon", Token_Type::colon},
{ 0xff1a, u8"Fullwidth Colon", u8':', u8"colon", Token_Type::colon},

{ 0x00b8, u8"Cedilla", u8',', u8"comma", Token_Type::comma},
{ 0x060d, u8"Arabic Date Separator", u8',', u8"comma", Token_Type::comma},
{ 0x066b, u8"Arabic Decimal Separator", u8',', u8"comma", Token_Type::comma},
{ 0x201a, u8"Single Low-9 Quotation Mark", u8',', u8"comma", Token_Type::comma},
{ 0xa4f9, u8"Lisu Letter Tone Na Po", u8',', u8"comma", Token_Type::comma},

{ 0x01c3, u8"Latin Letter Retroflex Click", u8'!', u8"exclamation mark", Token_Type::bang},
{ 0x2d51, u8"Tifinagh Letter Tuareg Yang", u8'!', u8"exclamation mark", Token_Type::bang},
{ 0xff01, u8"Fullwidth Exclamation Mark", u8'!', u8"exclamation mark", Token_Type::bang},

// TODO(strager): Also match symbols like "․․․".
{ 0x0660, u8"Arabic-Indic Digit Zero", u8'.', u8"dot", Token_Type::dot},
{ 0x06f0, u8"Extended Arabic-Indic Digit Zero", u8'.', u8"dot", Token_Type::dot},
{ 0x0701, u8"Syriac Supralinear Full Stop", u8'.', u8"dot", Token_Type::dot},
{ 0x0702, u8"Syriac Sublinear Full Stop", u8'.', u8"dot", Token_Type::dot},
{ 0x2024, u8"One Dot Leader", u8'.', u8"dot", Token_Type::dot},
{ 0xa4f8, u8"Lisu Letter Tone Mya Ti", u8'.', u8"dot", Token_Type::dot},
{ 0xa60e, u8"Vai Full Stop", u8'.', u8"dot", Token_Type::dot},
{0x10a50, u8"Kharoshthi Punctuation Dot", u8'.', u8"dot", Token_Type::dot},
{0x1d16d, u8"Musical Symbol Combining Augmentation Dot", u8'.', u8"dot", Token_Type::dot},

// NOTE(strager): We diverge from Unicode here. Unicode considers a few of these as parentheses.
// Because these rows are treated as square brackets, the suggested symbol is
// '[' or ']' so that it agrees with the symbol name and token type.
{ 0x2772, u8"Light Left Tortoise Shell Bracket Ornament", u8'[', u8"left square bracket", Token_Type::left_square},
{ 0x2773, u8"Light Right Tortoise Shell Bracket Ornament", u8']', u8"right square bracket", Token_Type::right_square},
{ 0x3014, u8"Left Tortoise Shell Bracket", u8'[', u8"left square bracket", Token_Type::left_square},
{ 0x3015, u8"Right Tortoise Shell Bracket", u8']', u8"right square bracket", Token_Type::right_square},
{ 0xff3b, u8"Fullwidth Left Square Bracket", u8'[', u8"left square bracket", Token_Type::left_square},
{ 0xff3d, u8"Fullwidth Right Square Bracket", u8']', u8"right square bracket", Token_Type::right_square},

{ 0x2768, u8"Medium Left Parenthesis Ornament", u8'(', u8"left parenthesis", Token_Type::left_paren},
{ 0x2769, u8"Medium Right Parenthesis Ornament", u8')', u8"right parenthesis", Token_Type::right_paren},
{ 0xfd3e, u8"Ornate Left Parenthesis", u8'(', u8"left parenthesis", Token_Type::left_paren},
{ 0xfd3f, u8"Ornate Right Parenthesis", u8')', u8"right parenthesis", Token_Type::right_paren},

{ 0x2774, u8"Medium Left Curly Bracket Ornament", u8'{', u8"left curly bracket", Token_Type::left_curly},
{ 0x2775, u8"Medium Right Curly Bracket Ornament", u8'}', u8"right curly bracket", Token_Type::right_curly},
{0x1d114, u8"Musical Symbol Brace", u8'{', u8"left curly bracket", Token_Type::left_curly},

// TODO(strager): Also match symbols like "ꝸ=" and "᐀᐀".
// NOTE(strager): 0x0294 is legal in identifiers.
{ 0x0294, u8"Latin Letter Glottal Stop", u8'?', u8"question mark", Token_Type::question},
{ 0x0241, u8"Latin Capital Letter Glottal Stop", u8'?', u8"question mark", Token_Type::question},
{ 0x097d, u8"Devanagari Letter Glottal Stop", u8'?', u8"question mark", Token_Type::question},
{ 0x13ae, u8"Cherokee Letter He", u8'?', u8"question mark", Token_Type::question},
{ 0xa6eb, u8"Bamum Letter Ntuu", u8'?', u8"question mark", Token_Type::question},

{ 0xa778, u8"Latin Small Letter Um", u8'&', u8"ampersand", Token_Type::ampersand},

{ 0x066d, u8"Arabic Five Pointed Star", u8'*', u8"asterisk", Token_Type::star},
{ 0x204e, u8"Low Asterisk", u8'*', u8"asterisk", Token_Type::star},
{ 0x2217, u8"Asterisk Operator", u8'*', u8"asterisk", Token_Type::star},
{0x1031f, u8"Old Italic Letter Ess", u8'*', u8"asterisk", Token_Type::star},

{ 0x02c4, u8"Modifier Letter Up Arrowhead", u8'^', u8"circumflex", Token_Type::circumflex},
{ 0x02c6, u8"Modifier Letter Circumflex Accent", u8'^', u8"circumflex", Token_Type::circumflex},

{ 0x02c2, u8"Modifier Letter Left Arrowhead", u8'<', u8"less than", Token_Type::less},
{ 0x1438, u8"Canadian Syllabics Pa", u8'<', u8"less than", Token_Type::less},
{ 0x16b2, u8"Runic Letter Kauna", u8'<', u8"less than", Token_Type::less},
{ 0x2039, u8"Single Left-Pointing Angle Quotation Mark", u8'<', u8"less than", Token_Type::less},
{ 0x276e, u8"Heavy Left-Pointing Angle Quotation Mark Ornament", u8'<', u8"less than", Token_Type::less},
{0x1d236, u8"Greek Instrumental Notation Symbol-40", u8'<', u8"less than", Token_Type::less},

{ 0x02c3, u8"Modifier Letter Right Arrowhead", u8'>', u8"greater than", Token_Type::greater},
{ 0x1433, u8"Canadian Syllabics Po", u8'>', u8"greater than", Token_Type::greater},
{ 0x203a, u8"Single Right-Pointing Angle Quotation Mark", u8'>', u8"greater than", Token_Type::greater},
{ 0x276f, u8"Heavy Right-Pointing Angle Quotation Mark Ornament", u8'>', u8"greater than", Token_Type::greater},
{0x16f3f, u8"Miao Letter Archaic Zza", u8'>', u8"greater than", Token_Type::greater},
{0x1d237, u8"Greek Instrumental Notation Symbol-42", u8'>', u8"greater than", Token_Type::greater},

{ 0x02dc, u8"Small Tilde", u8'~', u8"tilde", Token_Type::tilde},
{ 0x1fc0, u8"Greek Perispomeni", u8'~', u8"tilde", Token_Type::tilde},
{ 0x2053, u8"Swung Dash", u8'~', u8"tilde", Token_Type::tilde},
{ 0x223c, u8"Tilde Operator", u8'~', u8"tilde", Token_Type::tilde},

{ 0x1400, u8"Canadian Syllabics Hyphen", u8'=', u8"equals", Token_Type::equal},
{ 0x2e40, u8"Double Hyphen", u8'=', u8"equals", Token_Type::equal},
{ 0x30a0, u8"Katakana-Hiragana Double Hyphen", u8'=', u8"equals", Token_Type::equal},
{ 0xa4ff, u8"Lisu Punctuation Full Stop", u8'=', u8"equals", Token_Type::equal},
};

bool look_up_in_unicode_table(const std::uint8_t* table, std::size_t table_size,
Expand Down
32 changes: 32 additions & 0 deletions test/test-parse-expression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3834,6 +3834,38 @@ TEST_F(Test_Parse_Expression, precedence) {
}
}
}

TEST_F(Test_Parse_Expression, confusable_symbols) {
  // Regression test: the parser once assumed that '(' , '?' and ':' spans are
  // exactly one byte wide, so a multi-byte confusable symbol standing in for
  // one of them tripped an assertion. Each case checks that the reported span
  // covers the whole multi-byte character.

  // Call expression whose left parenthesis is a confusable symbol.
  {
    Test_Parser parser(u8"f﴾)"_sv, capture_diags);
    Expression* root = parser.parse_expression();
    EXPECT_EQ(root->kind(), Expression_Kind::Call);
    auto* call = static_cast<Expression::Call*>(root);
    parser.assert_offsets(call->left_paren_span(),  //
                          u8"f"_sv.size(), u8"f﴾"_sv.size());
  }

  // Optional expression whose question mark is a confusable symbol.
  {
    Test_Parser parser(u8"(foo ʔ)"_sv, capture_diags);
    Expression* root = parser.parse_expression();
    Expression* inner = root->without_paren();
    EXPECT_EQ(inner->kind(), Expression_Kind::Optional);
    auto* optional = static_cast<Expression::Optional*>(root->without_paren());
    parser.assert_offsets(optional->question_span(),  //
                          u8"(foo "_sv.size(), u8"(foo ʔ"_sv.size());
  }

  // Type-annotated expression whose colon is a confusable symbol.
  {
    Test_Parser parser(u8"(x ։ y)"_sv, capture_diags);
    Expression* root = parser.parse_expression();
    Expression* inner = root->without_paren();
    EXPECT_EQ(inner->kind(), Expression_Kind::Type_Annotated);
    auto* annotated =
        static_cast<Expression::Type_Annotated*>(root->without_paren());
    parser.assert_offsets(annotated->colon_span(),  //
                          u8"(x "_sv.size(), u8"(x ։"_sv.size());
  }
}
}
}

Expand Down

0 comments on commit c629777

Please sign in to comment.