diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 9db9afaf1..8ea740735 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -615,11 +615,12 @@ impl Literal { /// If this literal was written as a `\x` hex escape, then this returns /// the corresponding byte value. Otherwise, this returns `None`. pub fn byte(&self) -> Option<u8> { - let short_hex = LiteralKind::HexFixed(HexLiteralKind::X); - if self.c as u32 <= 255 && self.kind == short_hex { - Some(self.c as u8) - } else { - None + match self.kind { + LiteralKind::HexFixed(HexLiteralKind::X) => { + // MSRV(1.59): Use 'u8::try_from(self.c)' instead. + u8::try_from(u32::from(self.c)).ok() + } + _ => None, } } } diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 045de2eaf..f6b2462c0 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -213,24 +213,24 @@ impl Writer { match ast.kind { Verbatim => self.wtr.write_char(ast.c), Punctuation => write!(self.wtr, r"\{}", ast.c), - Octal => write!(self.wtr, r"\{:o}", ast.c as u32), + Octal => write!(self.wtr, r"\{:o}", u32::from(ast.c)), HexFixed(ast::HexLiteralKind::X) => { - write!(self.wtr, r"\x{:02X}", ast.c as u32) + write!(self.wtr, r"\x{:02X}", u32::from(ast.c)) } HexFixed(ast::HexLiteralKind::UnicodeShort) => { - write!(self.wtr, r"\u{:04X}", ast.c as u32) + write!(self.wtr, r"\u{:04X}", u32::from(ast.c)) } HexFixed(ast::HexLiteralKind::UnicodeLong) => { - write!(self.wtr, r"\U{:08X}", ast.c as u32) + write!(self.wtr, r"\U{:08X}", u32::from(ast.c)) } HexBrace(ast::HexLiteralKind::X) => { - write!(self.wtr, r"\x{{{:X}}}", ast.c as u32) + write!(self.wtr, r"\x{{{:X}}}", u32::from(ast.c)) } HexBrace(ast::HexLiteralKind::UnicodeShort) => { - write!(self.wtr, r"\u{{{:X}}}", ast.c as u32) + write!(self.wtr, r"\u{{{:X}}}", u32::from(ast.c)) } HexBrace(ast::HexLiteralKind::UnicodeLong) => { - write!(self.wtr, r"\U{{{:X}}}", ast.c as u32) + write!(self.wtr, r"\U{{{:X}}}", 
u32::from(ast.c)) } Special(ast::SpecialLiteralKind::Bell) => { self.wtr.write_str(r"\a") diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index 56698c53a..d6e83f7b2 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -481,7 +481,7 @@ impl Bound for u8 { u8::MAX } fn as_u32(self) -> u32 { - self as u32 + u32::from(self) } fn increment(self) -> Self { self.checked_add(1).unwrap() @@ -499,20 +499,20 @@ impl Bound for char { '\u{10FFFF}' } fn as_u32(self) -> u32 { - self as u32 + u32::from(self) } fn increment(self) -> Self { match self { '\u{D7FF}' => '\u{E000}', - c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(), + c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(), } } fn decrement(self) -> Self { match self { '\u{E000}' => '\u{D7FF}', - c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(), + c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(), } } } diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs index fbc5d3c97..58b8871ed 100644 --- a/regex-syntax/src/hir/literal/mod.rs +++ b/regex-syntax/src/hir/literal/mod.rs @@ -475,8 +475,8 @@ impl Literals { base = vec![Literal::empty()]; } for r in cls.iter() { - let (s, e) = (r.start as u32, r.end as u32 + 1); - for c in (s..e).filter_map(char::from_u32) { + let (s, e) = (u32::from(r.start), u32::from(r.end)); + for c in (s..=e).filter_map(char::from_u32) { for mut lit in base.clone() { let mut bytes = c.to_string().into_bytes(); if reverse { @@ -502,8 +502,7 @@ impl Literals { base = vec![Literal::empty()]; } for r in cls.iter() { - let (s, e) = (r.start as u32, r.end as u32 + 1); - for b in (s..e).map(|b| b as u8) { + for b in r.start..=r.end { for mut lit in base.clone() { lit.push(b); self.lits.push(lit); @@ -784,7 +783,10 @@ fn repeat_range_literals( lits: &mut Literals, mut f: F, ) { - if min == 0 { + // If 'min' somehow overflows usize, then we just 
treat it as 0, which is + // the most conservative thing we can do. + let umin = usize::try_from(min).unwrap_or(0); + if umin == 0 { // This is a bit conservative. If `max` is set, then we could // treat this as a finite set of alternations. For now, we // just treat it as `e*`. @@ -797,11 +799,11 @@ fn repeat_range_literals( lits, ); } else { - if min > 0 { - let n = cmp::min(lits.limit_size, min as usize); + if umin > 0 { + let n = cmp::min(lits.limit_size, umin); let es = iter::repeat(e.clone()).take(n).collect(); f(&Hir::concat(es), lits); - if n < min as usize || lits.contains_empty() { + if n < umin || lits.contains_empty() { lits.cut(); } } @@ -928,12 +930,13 @@ fn escape_unicode(bytes: &[u8]) -> String { let mut space_escaped = String::new(); for c in show.chars() { if c.is_whitespace() { - let escaped = if c as u32 <= 0x7F { - escape_byte(c as u8) - } else if c as u32 <= 0xFFFF { - format!(r"\u{{{:04x}}}", c as u32) + let cp = u32::from(c); + let escaped = if cp <= 0x7F { + escape_byte(u8::try_from(cp).unwrap()) + } else if cp <= 0xFFFF { + format!(r"\u{{{:04x}}}", cp) } else { - format!(r"\U{{{:08x}}}", c as u32) + format!(r"\U{{{:08x}}}", cp) }; space_escaped.push_str(&escaped); } else { @@ -959,13 +962,11 @@ fn escape_byte(byte: u8) -> String { } fn cls_char_count(cls: &hir::ClassUnicode) -> usize { - cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>() - as usize + cls.iter().map(|&r| r.len()).sum() } fn cls_byte_count(cls: &hir::ClassBytes) -> usize { - cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>() - as usize + cls.iter().map(|&r| r.len()).sum() } #[cfg(test)] diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 55dc95c20..e363e2fb6 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -90,6 +90,13 @@ pub enum ErrorKind { __Nonexhaustive, } +// BREADCRUMBS: +// +// Remove EmptyClassNotAllowed +// Make errors non_exhaustive +// Simplify repetitions (get rid of ZeroOrOne, 
OneOrMore etc) +// Get rid of deprecated things + impl ErrorKind { // TODO: Remove this method entirely on the next breaking semver release. #[allow(deprecated)] @@ -1013,12 +1020,12 @@ impl fmt::Debug for ClassUnicodeRange { { self.start.to_string() } else { - format!("0x{:X}", self.start as u32) + format!("0x{:X}", u32::from(self.start)) }; let end = if !self.end.is_whitespace() && !self.end.is_control() { self.end.to_string() } else { - format!("0x{:X}", self.end as u32) + format!("0x{:X}", u32::from(self.end)) }; f.debug_struct("ClassUnicodeRange") .field("start", &start) @@ -1058,10 +1065,9 @@ impl Interval for ClassUnicodeRange { if !unicode::contains_simple_case_mapping(self.start, self.end)? { return Ok(()); } - let start = self.start as u32; - let end = (self.end as u32).saturating_add(1); + let (start, end) = (u32::from(self.start), u32::from(self.end)); let mut next_simple_cp = None; - for cp in (start..end).filter_map(char::from_u32) { + for cp in (start..=end).filter_map(char::from_u32) { if next_simple_cp.map_or(false, |next| cp < next) { continue; } @@ -1104,6 +1110,18 @@ impl ClassUnicodeRange { pub fn end(&self) -> char { self.end } + + /// Returns the number of codepoints in this range. + pub fn len(&self) -> usize { + let diff = 1 + u32::from(self.end) - u32::from(self.start); + // This is likely to panic in 16-bit targets since a usize can only fit + // 2^16. It's not clear what to do here, other than to return an error + // when building a Unicode class that contains a range whose length + // overflows usize. (Which, to be honest, is probably quite common on + // 16-bit targets. For example, this would imply that '.' and '\p{any}' + // would be impossible to build.) + usize::try_from(diff).expect("char class len fits in usize") + } } /// A set of characters represented by arbitrary bytes (where one byte @@ -1291,18 +1309,27 @@ impl ClassBytesRange { pub fn end(&self) -> u8 { self.end } + + /// Returns the number of bytes in this range. 
+ pub fn len(&self) -> usize { + usize::from(self.end.checked_sub(self.start).unwrap()) + .checked_add(1) + .unwrap() + } } impl fmt::Debug for ClassBytesRange { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut debug = f.debug_struct("ClassBytesRange"); if self.start <= 0x7F { - debug.field("start", &(self.start as char)); + let ch = char::try_from(self.start).unwrap(); + debug.field("start", &ch); } else { debug.field("start", &self.start); } if self.end <= 0x7F { - debug.field("end", &(self.end as char)); + let ch = char::try_from(self.end).unwrap(); + debug.field("end", &ch); } else { debug.field("end", &self.end); } diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index b71f3897c..433f9bf11 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -217,18 +217,16 @@ impl Writer { } fn write_literal_byte(&mut self, b: u8) -> fmt::Result { - let c = b as char; - if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() { - self.write_literal_char(c) + if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { + self.write_literal_char(char::try_from(b).unwrap()) } else { write!(self.wtr, "(?-u:\\x{:02X})", b) } } fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result { - let c = b as char; - if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() { - self.write_literal_char(c) + if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { + self.write_literal_char(char::try_from(b).unwrap()) } else { write!(self.wtr, "\\x{:02X}", b) } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 890e1608b..04409cf95 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -656,7 +656,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { Some(byte) => byte, }; if byte <= 0x7F { - return Ok(hir::Literal::Unicode(byte as char)); + return Ok(hir::Literal::Unicode(char::try_from(byte).unwrap())); } if 
!self.trans().allow_invalid_utf8 { return Err(self.error(lit.span, ErrorKind::InvalidUtf8)); @@ -704,7 +704,12 @@ impl<'t, 'p> TranslatorI<'t, 'p> { } let mut cls = hir::ClassBytes::new(vec![hir::ClassBytesRange::new( - c as u8, c as u8, + // OK because 'c.len_utf8() == 1' which in turn implies + // that 'c' is ASCII. + // + // MSRV(1.59): Use 'u8::try_from(c)' instead. + u8::try_from(u32::from(c)).unwrap(), + u8::try_from(u32::from(c)).unwrap(), )]); cls.case_fold_simple(); Ok(Hir::class(hir::Class::Bytes(cls))) @@ -848,9 +853,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> { ast: &ast::ClassAscii, ) -> Result<hir::ClassUnicode> { let mut cls = hir::ClassUnicode::new( - ascii_class(&ast.kind) - .iter() - .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)), + ascii_class_as_chars(&ast.kind) + .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), ); self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?; Ok(cls) @@ -862,8 +866,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { ) -> Result<hir::ClassBytes> { let mut cls = hir::ClassBytes::new( ascii_class(&ast.kind) - .iter() - .map(|&(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)), + .map(|(s, e)| hir::ClassBytesRange::new(s, e)), ); self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?; Ok(cls) @@ -985,8 +988,9 @@ impl<'t, 'p> TranslatorI<'t, 'p> { match self.literal_to_char(ast)? { hir::Literal::Byte(byte) => Ok(byte), hir::Literal::Unicode(ch) => { - if ch <= 0x7F as char { - Ok(ch as u8) + let cp = u32::from(ch); + if cp <= 0x7F { + Ok(u8::try_from(cp).unwrap()) } else { // We can't feasibly support Unicode in // byte oriented classes. 
Byte classes don't @@ -1085,38 +1089,44 @@ impl Flags { fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { let ranges: Vec<_> = ascii_class(kind) - .iter() - .cloned() - .map(|(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)) + .map(|(s, e)| hir::ClassBytesRange::new(s, e)) .collect(); hir::ClassBytes::new(ranges) } -fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] { +fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator<Item = (u8, u8)> { use crate::ast::ClassAsciiKind::*; - match *kind { - Alnum => &[('0', '9'), ('A', 'Z'), ('a', 'z')], - Alpha => &[('A', 'Z'), ('a', 'z')], - Ascii => &[('\x00', '\x7F')], - Blank => &[('\t', '\t'), (' ', ' ')], - Cntrl => &[('\x00', '\x1F'), ('\x7F', '\x7F')], - Digit => &[('0', '9')], - Graph => &[('!', '~')], - Lower => &[('a', 'z')], - Print => &[(' ', '~')], - Punct => &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')], + + let slice: &'static [(u8, u8)] = match *kind { + Alnum => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')], + Alpha => &[(b'A', b'Z'), (b'a', b'z')], + Ascii => &[(b'\x00', b'\x7F')], + Blank => &[(b'\t', b'\t'), (b' ', b' ')], + Cntrl => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')], + Digit => &[(b'0', b'9')], + Graph => &[(b'!', b'~')], + Lower => &[(b'a', b'z')], + Print => &[(b' ', b'~')], + Punct => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')], Space => &[ - ('\t', '\t'), - ('\n', '\n'), - ('\x0B', '\x0B'), - ('\x0C', '\x0C'), - ('\r', '\r'), - (' ', ' '), + (b'\t', b'\t'), + (b'\n', b'\n'), + (b'\x0B', b'\x0B'), + (b'\x0C', b'\x0C'), + (b'\r', b'\r'), + (b' ', b' '), ], - Upper => &[('A', 'Z')], - Word => &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')], - Xdigit => &[('0', '9'), ('A', 'F'), ('a', 'f')], - } + Upper => &[(b'A', b'Z')], + Word => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')], + Xdigit => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')], + }; + slice.iter().copied() +} + +fn ascii_class_as_chars( + kind: &ast::ClassAsciiKind, +) -> impl 
Iterator<Item = (char, char)> { + ascii_class(kind).map(|(s, e)| (char::from(s), char::from(e))) } #[cfg(test)] @@ -1126,7 +1136,7 @@ mod tests { use crate::hir::{self, Hir, HirKind}; use crate::unicode::{self, ClassQuery}; - use super::{ascii_class, TranslatorBuilder}; + use super::{ascii_class, ascii_class_as_chars, TranslatorBuilder}; // We create these errors to compare with real hir::Errors in the tests. // We define equality between TestError and hir::Error to disregard the @@ -1281,6 +1291,19 @@ mod tests { Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap())) } + fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir { + Hir::class(hir::Class::Unicode(hir::ClassUnicode::new( + ascii_class_as_chars(kind) + .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), + ))) + } + + fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir { + Hir::class(hir::Class::Bytes(hir::ClassBytes::new( + ascii_class(kind).map(|(s, e)| hir::ClassBytesRange::new(s, e)), + ))) + } + fn hir_uclass(ranges: &[(char, char)]) -> Hir { let ranges: Vec<hir::ClassUnicodeRange> = ranges .iter() @@ -1297,18 +1320,6 @@ mod tests { Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) } - fn hir_bclass_from_char(ranges: &[(char, char)]) -> Hir { - let ranges: Vec<hir::ClassBytesRange> = ranges - .iter() - .map(|&(s, e)| { - assert!(s as u32 <= 0x7F); - assert!(e as u32 <= 0x7F); - hir::ClassBytesRange::new(s as u8, e as u8) - }) - .collect(); - Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) - } - fn hir_case_fold(expr: Hir) -> Hir { match expr.into_kind() { HirKind::Class(mut cls) => { @@ -1856,64 +1867,64 @@ mod tests { fn class_ascii() { assert_eq!( t("[[:alnum:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)) + hir_ascii_uclass(&ast::ClassAsciiKind::Alnum) ); assert_eq!( t("[[:alpha:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Alpha)) + hir_ascii_uclass(&ast::ClassAsciiKind::Alpha) ); assert_eq!( t("[[:ascii:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Ascii)) + hir_ascii_uclass(&ast::ClassAsciiKind::Ascii) 
); assert_eq!( t("[[:blank:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Blank)) + hir_ascii_uclass(&ast::ClassAsciiKind::Blank) ); assert_eq!( t("[[:cntrl:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Cntrl)) + hir_ascii_uclass(&ast::ClassAsciiKind::Cntrl) ); assert_eq!( t("[[:digit:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Digit)) + hir_ascii_uclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t("[[:graph:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Graph)) + hir_ascii_uclass(&ast::ClassAsciiKind::Graph) ); assert_eq!( t("[[:lower:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower)) + hir_ascii_uclass(&ast::ClassAsciiKind::Lower) ); assert_eq!( t("[[:print:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Print)) + hir_ascii_uclass(&ast::ClassAsciiKind::Print) ); assert_eq!( t("[[:punct:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Punct)) + hir_ascii_uclass(&ast::ClassAsciiKind::Punct) ); assert_eq!( t("[[:space:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Space)) + hir_ascii_uclass(&ast::ClassAsciiKind::Space) ); assert_eq!( t("[[:upper:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Upper)) + hir_ascii_uclass(&ast::ClassAsciiKind::Upper) ); assert_eq!( t("[[:word:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Word)) + hir_ascii_uclass(&ast::ClassAsciiKind::Word) ); assert_eq!( t("[[:xdigit:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Xdigit)) + hir_ascii_uclass(&ast::ClassAsciiKind::Xdigit) ); assert_eq!( t("[[:^lower:]]"), - hir_negate(hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower))) + hir_negate(hir_ascii_uclass(&ast::ClassAsciiKind::Lower)) ); #[cfg(feature = "unicode-case")] assert_eq!( @@ -1928,13 +1939,11 @@ mod tests { assert_eq!( t("(?-u)[[:lower:]]"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Lower)) + hir_ascii_bclass(&ast::ClassAsciiKind::Lower) ); assert_eq!( t("(?i-u)[[:lower:]]"), - hir_case_fold(hir_bclass_from_char(ascii_class( - 
&ast::ClassAsciiKind::Lower - ))) + hir_case_fold(hir_ascii_bclass(&ast::ClassAsciiKind::Lower)) ); assert_eq!( @@ -1965,14 +1974,14 @@ mod tests { assert_eq!( t("[[:alnum:][:^ascii:]]"), hir_union( - hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)), + hir_ascii_uclass(&ast::ClassAsciiKind::Alnum), hir_uclass(&[('\u{80}', '\u{10FFFF}')]), ), ); assert_eq!( t_bytes("(?-u)[[:alnum:][:^ascii:]]"), hir_union( - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Alnum)), + hir_ascii_bclass(&ast::ClassAsciiKind::Alnum), hir_bclass(&[(0x80, 0xFF)]), ), ); @@ -2024,65 +2033,53 @@ mod tests { // ASCII only assert_eq!( t(r"(?-u)\d"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) + hir_ascii_bclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t(r"(?-u)\s"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space)) + hir_ascii_bclass(&ast::ClassAsciiKind::Space) ); assert_eq!( t(r"(?-u)\w"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word)) + hir_ascii_bclass(&ast::ClassAsciiKind::Word) ); assert_eq!( t(r"(?i-u)\d"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) + hir_ascii_bclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t(r"(?i-u)\s"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space)) + hir_ascii_bclass(&ast::ClassAsciiKind::Space) ); assert_eq!( t(r"(?i-u)\w"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word)) + hir_ascii_bclass(&ast::ClassAsciiKind::Word) ); // ASCII only, negated assert_eq!( t(r"(?-u)\D"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Digit - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( t(r"(?-u)\S"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Space - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) ); assert_eq!( t(r"(?-u)\W"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Word - ))) + 
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) ); assert_eq!( t(r"(?i-u)\D"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Digit - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( t(r"(?i-u)\S"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Space - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) ); assert_eq!( t(r"(?i-u)\W"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Word - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) ); } @@ -2826,9 +2823,7 @@ mod tests { #[cfg(feature = "unicode-perl")] assert_eq!( t_bytes(r"(?-u)[^\w&&\d]"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Digit - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( t_bytes(r"(?-u)[^[a-z&&a-c]]"), @@ -2836,19 +2831,15 @@ mod tests { ); assert_eq!( t_bytes(r"(?-u)[^[\w&&\d]]"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Digit - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( t_bytes(r"(?-u)[^[^\w&&\d]]"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) + hir_ascii_bclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Word - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) ); } diff --git a/regex-syntax/src/unicode.rs b/regex-syntax/src/unicode.rs index 5c22f66ac..0b716f5e6 100644 --- a/regex-syntax/src/unicode.rs +++ b/regex-syntax/src/unicode.rs @@ -419,7 +419,8 @@ pub fn is_word_character(c: char) -> result::Result { use crate::unicode_tables::perl_word::PERL_WORD; use std::cmp::Ordering; - if c <= 0x7F as char && is_word_byte(c as u8) { + // MSRV(1.59): Use 'u8::try_from(c)' instead. 
+ if u8::try_from(u32::from(c)).map_or(false, is_word_byte) { return Ok(true); } Ok(PERL_WORD diff --git a/regex-syntax/src/utf8.rs b/regex-syntax/src/utf8.rs index b9c865532..b00cd7dba 100644 --- a/regex-syntax/src/utf8.rs +++ b/regex-syntax/src/utf8.rs @@ -306,7 +306,7 @@ impl Utf8Sequences { /// given. pub fn new(start: char, end: char) -> Self { let mut it = Utf8Sequences { range_stack: vec![] }; - it.push(start as u32, end as u32); + it.push(u32::from(start), u32::from(end)); it } @@ -317,7 +317,7 @@ impl Utf8Sequences { #[doc(hidden)] pub fn reset(&mut self, start: char, end: char) { self.range_stack.clear(); - self.push(start as u32, end as u32); + self.push(u32::from(start), u32::from(end)); } fn push(&mut self, start: u32, end: u32) { @@ -416,7 +416,9 @@ impl ScalarRange { /// values in this range can be encoded as a single byte. fn as_ascii(&self) -> Option<Utf8Range> { if self.is_ascii() { - Some(Utf8Range::new(self.start as u8, self.end as u8)) + let start = u8::try_from(self.start).unwrap(); + let end = u8::try_from(self.end).unwrap(); + Some(Utf8Range::new(start, end)) } else { None } @@ -472,7 +474,11 @@ mod tests { "Sequence ({:X}, {:X}) contains range {:?}, \ which matches surrogate code point {:X} \ with encoded bytes {:?}", - start as u32, end as u32, r, cp, buf, + u32::from(start), + u32::from(end), + r, + cp, + buf, ); } } @@ -579,9 +585,9 @@ mod tests { assert!(0xD800 <= cp && cp < 0xE000); let mut dst = [0; 3]; - dst[0] = (cp >> 12 & 0x0F) as u8 | TAG_THREE_B; - dst[1] = (cp >> 6 & 0x3F) as u8 | TAG_CONT; - dst[2] = (cp & 0x3F) as u8 | TAG_CONT; + dst[0] = u8::try_from(cp >> 12 & 0x0F).unwrap() | TAG_THREE_B; + dst[1] = u8::try_from(cp >> 6 & 0x3F).unwrap() | TAG_CONT; + dst[2] = u8::try_from(cp & 0x3F).unwrap() | TAG_CONT; dst } }