From a3d6bc34684f67b9b5b2a419d57dc0afe5af4a67 Mon Sep 17 00:00:00 2001 From: clubby789 Date: Sat, 7 Jan 2023 16:33:05 +0000 Subject: [PATCH] Emit a single error for contiguous sequences of Unicode homoglyphs --- compiler/rustc_parse/src/lexer/mod.rs | 28 +++++++++++++++--- .../rustc_parse/src/lexer/unicode_chars.rs | 10 +++++-- tests/rustdoc-ui/invalid-syntax.stderr | 2 -- tests/ui/parser/issues/issue-66473.stderr | Bin 5260 -> 1061 bytes tests/ui/parser/issues/issue-68629.stderr | Bin 1831 -> 1637 bytes tests/ui/parser/issues/issue-68730.stderr | Bin 1226 -> 1266 bytes tests/ui/parser/unicode-chars.rs | 4 +++ tests/ui/parser/unicode-chars.stderr | 14 ++++++++- 8 files changed, 49 insertions(+), 9 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index f027843e6b43d..8761c23625b21 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -79,7 +79,7 @@ impl<'a> StringReader<'a> { /// preceded by whitespace. fn next_token(&mut self) -> (Token, bool) { let mut preceded_by_whitespace = false; - + let mut swallow_next_invalid = 0; // Skip trivial (whitespace & comments) tokens loop { let token = self.cursor.advance_token(); @@ -232,19 +232,34 @@ impl<'a> StringReader<'a> { rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent), rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => { - let c = self.str_from(start).chars().next().unwrap(); + // Don't emit diagnostics for sequences of the same invalid token + if swallow_next_invalid > 0 { + swallow_next_invalid -= 1; + continue; + } + let mut it = self.str_from_to_end(start).chars(); + let c = it.next().unwrap(); + let repeats = it.take_while(|c1| *c1 == c).count(); let mut err = - self.struct_err_span_char(start, self.pos, "unknown start of token", c); + self.struct_err_span_char(start, self.pos + Pos::from_usize(repeats * c.len_utf8()), "unknown start of token", c); // FIXME: the lexer could be used to turn the ASCII version of unicode // homoglyphs, instead of keeping a table in `check_for_substitution`into the // token. Ideally, this should be inside `rustc_lexer`. However, we should // first remove compound tokens like `<<` from `rustc_lexer`, and then add // fancier error recovery to it, as there will be less overall work to do this // way. - let token = unicode_chars::check_for_substitution(self, start, c, &mut err); + let token = unicode_chars::check_for_substitution(self, start, c, &mut err, repeats+1); if c == '\x00' { err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used"); } + if repeats > 0 { + if repeats == 1 { + err.note(format!("character appears once more")); + } else { + err.note(format!("character appears {repeats} more times")); + } + swallow_next_invalid = repeats; + } err.emit(); if let Some(token) = token { token @@ -486,6 +501,11 @@ impl<'a> StringReader<'a> { &self.src[self.src_index(start)..self.src_index(end)] } + /// Slice of the source text spanning from `start` until the end + fn str_from_to_end(&self, start: BytePos) -> &str { + &self.src[self.src_index(start)..] + } + fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! { match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) { Err(RawStrError::InvalidStarter { bad_char }) => { diff --git a/compiler/rustc_parse/src/lexer/unicode_chars.rs b/compiler/rustc_parse/src/lexer/unicode_chars.rs index f1b50296e2565..65479b341d7a8 100644 --- a/compiler/rustc_parse/src/lexer/unicode_chars.rs +++ b/compiler/rustc_parse/src/lexer/unicode_chars.rs @@ -337,10 +337,11 @@ pub(super) fn check_for_substitution<'a>( pos: BytePos, ch: char, err: &mut Diagnostic, + count: usize, ) -> Option { let &(_u_char, u_name, ascii_char) = UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch)?; - let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8())); + let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8() * count)); let Some((_ascii_char, ascii_name, token)) = ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) else { let msg = format!("substitution character not found for '{}'", ch); @@ -369,7 +370,12 @@ pub(super) fn check_for_substitution<'a>( "Unicode character '{}' ({}) looks like '{}' ({}), but it is not", ch, u_name, ascii_char, ascii_name ); - err.span_suggestion(span, &msg, ascii_char, Applicability::MaybeIncorrect); + err.span_suggestion( + span, + &msg, + ascii_char.to_string().repeat(count), + Applicability::MaybeIncorrect, + ); } token.clone() } diff --git a/tests/rustdoc-ui/invalid-syntax.stderr b/tests/rustdoc-ui/invalid-syntax.stderr index 597d19e748cb7..6140a06c555f2 100644 --- a/tests/rustdoc-ui/invalid-syntax.stderr +++ b/tests/rustdoc-ui/invalid-syntax.stderr @@ -77,8 +77,6 @@ LL | /// ``` | ^^^ | = note: error from rustc: unknown start of token: ` - = note: error from rustc: unknown start of token: ` - = note: error from rustc: unknown start of token: ` warning: could not parse code block as Rust code --> $DIR/invalid-syntax.rs:64:5 diff --git a/tests/ui/parser/issues/issue-66473.stderr b/tests/ui/parser/issues/issue-66473.stderr index 8a16d7f955129811997464d47c4e10238db77340..0e8b0a5da220569b607fe3512ae59ee94e3fcf1b 100644 GIT binary patch delta 122 zcmeCtT*|Q_no&zZLBUobFTW(!N+CHTu_!UQB(+E(v7jI|v8Y(V&|D!mzbI9qBr`X) wc=7~6m(AsZ;f!i{_2lOzr-F4&UL$-)*i50ID77pzzqD8(wWuh+sF;fj0L7&#-~a#s delta 252 zcmZ3=(WAK`nsKrtlLNDfmC3{ntD#I&1k((`G)FKk5KK!blf}f!&;W^Th{QHRC^wuu zky&Hn^%js#AT2OkL2Q_{AU4ci5F2JOhz+wD#745(2x_(R=9P@)ARjp~noVwDnlss* b`Lu|Ui9$h9YFTD}X|Y0TQBi(TF&7sAkD^jK diff --git a/tests/ui/parser/issues/issue-68629.stderr b/tests/ui/parser/issues/issue-68629.stderr index b2c7dddc8011c21e1f3d21185bc48a97dbadf79c..43a903e6c4698e59840cdf18f73ed830c8b50c32 100644 GIT binary patch delta 75 zcmZ3^_mpQtHSR}z f+nfDZjxh;aC=?W>mSyIb7AvF{73CKdb8!Ix $DIR/unicode-chars.rs:5:5 + | +LL |     let x = 0; + | ^^^^ + | + = note: character appears 3 more times +help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not + | +LL | let x = 0; + | ++++ + +error: aborting due to 2 previous errors