Skip to content

Commit

Permalink
syntax: remove all uses of 'as'
Browse files Browse the repository at this point in the history
It turns out that all uses of 'as' in the regex-syntax crate can be
replaced with either explicitly infallible routines (like
'u32::from(char)'), or with routines that will panic on failure. These
panics are strictly better than truncating casts that might otherwise
lead to subtle bugs in the context of this crate. (Namely, we never
really care about the perf effects here, since regex parsing is just
never a bottleneck.)
  • Loading branch information
BurntSushi committed Apr 15, 2023
1 parent 2a45b99 commit 425d6b0
Show file tree
Hide file tree
Showing 9 changed files with 181 additions and 156 deletions.
11 changes: 6 additions & 5 deletions regex-syntax/src/ast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -615,11 +615,12 @@ impl Literal {
/// If this literal was written as a `\x` hex escape, then this returns
/// the corresponding byte value. Otherwise, this returns `None`.
pub fn byte(&self) -> Option<u8> {
let short_hex = LiteralKind::HexFixed(HexLiteralKind::X);
if self.c as u32 <= 255 && self.kind == short_hex {
Some(self.c as u8)
} else {
None
match self.kind {
LiteralKind::HexFixed(HexLiteralKind::X) => {
// MSRV(1.59): Use 'u8::try_from(self.c)' instead.
u8::try_from(u32::from(self.c)).ok()
}
_ => None,
}
}
}
Expand Down
14 changes: 7 additions & 7 deletions regex-syntax/src/ast/print.rs
Original file line number Diff line number Diff line change
Expand Up @@ -213,24 +213,24 @@ impl<W: fmt::Write> Writer<W> {
match ast.kind {
Verbatim => self.wtr.write_char(ast.c),
Punctuation => write!(self.wtr, r"\{}", ast.c),
Octal => write!(self.wtr, r"\{:o}", ast.c as u32),
Octal => write!(self.wtr, r"\{:o}", u32::from(ast.c)),
HexFixed(ast::HexLiteralKind::X) => {
write!(self.wtr, r"\x{:02X}", ast.c as u32)
write!(self.wtr, r"\x{:02X}", u32::from(ast.c))
}
HexFixed(ast::HexLiteralKind::UnicodeShort) => {
write!(self.wtr, r"\u{:04X}", ast.c as u32)
write!(self.wtr, r"\u{:04X}", u32::from(ast.c))
}
HexFixed(ast::HexLiteralKind::UnicodeLong) => {
write!(self.wtr, r"\U{:08X}", ast.c as u32)
write!(self.wtr, r"\U{:08X}", u32::from(ast.c))
}
HexBrace(ast::HexLiteralKind::X) => {
write!(self.wtr, r"\x{{{:X}}}", ast.c as u32)
write!(self.wtr, r"\x{{{:X}}}", u32::from(ast.c))
}
HexBrace(ast::HexLiteralKind::UnicodeShort) => {
write!(self.wtr, r"\u{{{:X}}}", ast.c as u32)
write!(self.wtr, r"\u{{{:X}}}", u32::from(ast.c))
}
HexBrace(ast::HexLiteralKind::UnicodeLong) => {
write!(self.wtr, r"\U{{{:X}}}", ast.c as u32)
write!(self.wtr, r"\U{{{:X}}}", u32::from(ast.c))
}
Special(ast::SpecialLiteralKind::Bell) => {
self.wtr.write_str(r"\a")
Expand Down
8 changes: 4 additions & 4 deletions regex-syntax/src/hir/interval.rs
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ impl Bound for u8 {
u8::MAX
}
fn as_u32(self) -> u32 {
self as u32
u32::from(self)
}
fn increment(self) -> Self {
self.checked_add(1).unwrap()
Expand All @@ -499,20 +499,20 @@ impl Bound for char {
'\u{10FFFF}'
}
fn as_u32(self) -> u32 {
self as u32
u32::from(self)
}

fn increment(self) -> Self {
match self {
'\u{D7FF}' => '\u{E000}',
c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(),
c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(),
}
}

fn decrement(self) -> Self {
match self {
'\u{E000}' => '\u{D7FF}',
c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(),
c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(),
}
}
}
Expand Down
35 changes: 18 additions & 17 deletions regex-syntax/src/hir/literal/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -475,8 +475,8 @@ impl Literals {
base = vec![Literal::empty()];
}
for r in cls.iter() {
let (s, e) = (r.start as u32, r.end as u32 + 1);
for c in (s..e).filter_map(char::from_u32) {
let (s, e) = (u32::from(r.start), u32::from(r.end));
for c in (s..=e).filter_map(char::from_u32) {
for mut lit in base.clone() {
let mut bytes = c.to_string().into_bytes();
if reverse {
Expand All @@ -502,8 +502,7 @@ impl Literals {
base = vec![Literal::empty()];
}
for r in cls.iter() {
let (s, e) = (r.start as u32, r.end as u32 + 1);
for b in (s..e).map(|b| b as u8) {
for b in r.start..=r.end {
for mut lit in base.clone() {
lit.push(b);
self.lits.push(lit);
Expand Down Expand Up @@ -784,7 +783,10 @@ fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
lits: &mut Literals,
mut f: F,
) {
if min == 0 {
// If 'min' somehow overflows usize, then we just treat it as 0, which is
// the most conservative thing we can do.
let umin = usize::try_from(min).unwrap_or(0);
if umin == 0 {
// This is a bit conservative. If `max` is set, then we could
// treat this as a finite set of alternations. For now, we
// just treat it as `e*`.
Expand All @@ -797,11 +799,11 @@ fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
lits,
);
} else {
if min > 0 {
let n = cmp::min(lits.limit_size, min as usize);
if umin > 0 {
let n = cmp::min(lits.limit_size, umin);
let es = iter::repeat(e.clone()).take(n).collect();
f(&Hir::concat(es), lits);
if n < min as usize || lits.contains_empty() {
if n < umin || lits.contains_empty() {
lits.cut();
}
}
Expand Down Expand Up @@ -928,12 +930,13 @@ fn escape_unicode(bytes: &[u8]) -> String {
let mut space_escaped = String::new();
for c in show.chars() {
if c.is_whitespace() {
let escaped = if c as u32 <= 0x7F {
escape_byte(c as u8)
} else if c as u32 <= 0xFFFF {
format!(r"\u{{{:04x}}}", c as u32)
let cp = u32::from(c);
let escaped = if cp <= 0x7F {
escape_byte(u8::try_from(cp).unwrap())
} else if cp <= 0xFFFF {
format!(r"\u{{{:04x}}}", cp)
} else {
format!(r"\U{{{:08x}}}", c as u32)
format!(r"\U{{{:08x}}}", cp)
};
space_escaped.push_str(&escaped);
} else {
Expand All @@ -959,13 +962,11 @@ fn escape_byte(byte: u8) -> String {
}

fn cls_char_count(cls: &hir::ClassUnicode) -> usize {
cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
as usize
cls.iter().map(|&r| r.len()).sum()
}

fn cls_byte_count(cls: &hir::ClassBytes) -> usize {
cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
as usize
cls.iter().map(|&r| r.len()).sum()
}

#[cfg(test)]
Expand Down
41 changes: 34 additions & 7 deletions regex-syntax/src/hir/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,13 @@ pub enum ErrorKind {
__Nonexhaustive,
}

// BREADCRUMBS:
//
// Remove EmptyClassNotAllowed
// Make errors non_exhaustive
// Simplify repetitions (get rid of ZeroOrOne, OneOrMore etc)
// Get rid of deprecated things

impl ErrorKind {
// TODO: Remove this method entirely on the next breaking semver release.
#[allow(deprecated)]
Expand Down Expand Up @@ -1013,12 +1020,12 @@ impl fmt::Debug for ClassUnicodeRange {
{
self.start.to_string()
} else {
format!("0x{:X}", self.start as u32)
format!("0x{:X}", u32::from(self.start))
};
let end = if !self.end.is_whitespace() && !self.end.is_control() {
self.end.to_string()
} else {
format!("0x{:X}", self.end as u32)
format!("0x{:X}", u32::from(self.end))
};
f.debug_struct("ClassUnicodeRange")
.field("start", &start)
Expand Down Expand Up @@ -1058,10 +1065,9 @@ impl Interval for ClassUnicodeRange {
if !unicode::contains_simple_case_mapping(self.start, self.end)? {
return Ok(());
}
let start = self.start as u32;
let end = (self.end as u32).saturating_add(1);
let (start, end) = (u32::from(self.start), u32::from(self.end));
let mut next_simple_cp = None;
for cp in (start..end).filter_map(char::from_u32) {
for cp in (start..=end).filter_map(char::from_u32) {
if next_simple_cp.map_or(false, |next| cp < next) {
continue;
}
Expand Down Expand Up @@ -1104,6 +1110,18 @@ impl ClassUnicodeRange {
pub fn end(&self) -> char {
self.end
}

/// Returns the number of codepoints in this range.
pub fn len(&self) -> usize {
let diff = 1 + u32::from(self.end) - u32::from(self.start);
// This is likely to panic in 16-bit targets since a usize can only fit
// 2^16. It's not clear what to do here, other than to return an error
// when building a Unicode class that contains a range whose length
// overflows usize. (Which, to be honest, is probably quite common on
// 16-bit targets. For example, this would imply that '.' and '\p{any}'
// would be impossible to build.)
usize::try_from(diff).expect("char class len fits in usize")
}
}

/// A set of characters represented by arbitrary bytes (where one byte
Expand Down Expand Up @@ -1291,18 +1309,27 @@ impl ClassBytesRange {
pub fn end(&self) -> u8 {
self.end
}

/// Returns the number of bytes in this range.
pub fn len(&self) -> usize {
usize::from(self.end.checked_sub(self.start).unwrap())
.checked_add(1)
.unwrap()
}
}

impl fmt::Debug for ClassBytesRange {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let mut debug = f.debug_struct("ClassBytesRange");
if self.start <= 0x7F {
debug.field("start", &(self.start as char));
let ch = char::try_from(self.start).unwrap();
debug.field("start", &ch);
} else {
debug.field("start", &self.start);
}
if self.end <= 0x7F {
debug.field("end", &(self.end as char));
let ch = char::try_from(self.start).unwrap();
debug.field("end", &ch);
} else {
debug.field("end", &self.end);
}
Expand Down
10 changes: 4 additions & 6 deletions regex-syntax/src/hir/print.rs
Original file line number Diff line number Diff line change
Expand Up @@ -217,18 +217,16 @@ impl<W: fmt::Write> Writer<W> {
}

fn write_literal_byte(&mut self, b: u8) -> fmt::Result {
let c = b as char;
if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
self.write_literal_char(c)
if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() {
self.write_literal_char(char::try_from(b).unwrap())
} else {
write!(self.wtr, "(?-u:\\x{:02X})", b)
}
}

fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result {
let c = b as char;
if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
self.write_literal_char(c)
if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() {
self.write_literal_char(char::try_from(b).unwrap())
} else {
write!(self.wtr, "\\x{:02X}", b)
}
Expand Down
Loading

0 comments on commit 425d6b0

Please sign in to comment.