From 31a317eadd7d0270dea4ac90b54f5f560514ed01 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 27 Mar 2016 20:07:46 -0400 Subject: [PATCH] Major literal optimization refactoring. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The principle change in this commit is a complete rewrite of how literals are detected from a regular expression. In particular, we now traverse the abstract syntax to discover literals instead of the compiled byte code. This permits more tuneable control over which and how many literals are extracted, and is now exposed in the `regex-syntax` crate so that others can benefit from it. Other changes in this commit: * The Boyer-Moore algorithm was rewritten to use my own concoction based on frequency analysis. We end up regressing on a couple benchmarks slightly because of this, but gain in some others and in general should be faster in a broader number of cases. (Principally because we try to run `memchr` on the rarest byte in a literal.) This should also greatly improve handling of non-Western text. * A "reverse suffix" literal optimization was added. That is, if suffix literals exist but no prefix literals exist, then we can quickly scan for suffix matches and then run the DFA in reverse to find matches. (I'm not aware of any other regex engine that does this.) * The mutex-based pool has been replaced with a spinlock-based pool (from the new `mempool` crate). This reduces some amount of constant overhead and improves several benchmarks that either search short haystacks or find many matches in long haystacks. * Search parameters have been refactored. * RegexSet can now contain 0 or more regular expressions (previously, it could only contain 2 or more). The InvalidSet error variant is now deprecated. * A bug in computing start states was fixed. Namely, the DFA assumed the start states was always the first instruction, which is trivially wrong for an expression like `^☃$`. 
This bug persisted because it typically occurred when a literal optimization would otherwise run. * A new CLI tool, regex-debug, has been added as a non-published sub-crate. The CLI tool can answer various facts about regular expressions, such as printing its AST, its compiled byte code or its detected literals. Closes #96, #188, #189 --- Cargo.toml | 2 + HACKING.md | 64 +- benches/Cargo.toml | 1 - benches/src/bench_onig.rs | 1 - benches/src/bench_pcre.rs | 1 - benches/src/bench_rust.rs | 1 - benches/src/bench_rust_bytes.rs | 1 - benches/src/bench_rust_plugin.rs | 1 - benches/src/misc.rs | 8 + benches/src/rust_compile.rs | 21 + regex-debug/Cargo.toml | 19 + regex-debug/src/main.rs | 264 ++++++ regex-syntax/src/lib.rs | 104 +++ regex-syntax/src/literals.rs | 1385 +++++++++++++++++++++++++++++ run-bench | 9 +- scripts/frequencies.py | 82 ++ src/backtrack.rs | 37 +- src/compile.rs | 44 +- src/dfa.rs | 68 +- src/error.rs | 5 +- src/exec.rs | 573 ++++++++---- src/freqs.rs | 271 ++++++ src/input.rs | 31 +- src/lib.rs | 12 +- src/literals.rs | 1016 +++++++-------------- src/nfa.rs | 29 +- src/params.rs | 202 +++++ src/pool.rs | 107 --- src/prog.rs | 91 +- src/re_bytes.rs | 46 +- src/re_unicode.rs | 59 +- src/set.rs | 23 +- src/utf8.rs | 1 + tests/crazy.rs | 2 + tests/macros.rs | 1 + tests/misc.rs | 4 + tests/set.rs | 4 + tests/shortest_match.rs | 14 + tests/suffix_reverse.rs | 17 + tests/test_backtrack.rs | 18 + tests/test_backtrack_bytes.rs | 19 + tests/test_backtrack_utf8bytes.rs | 19 + tests/test_default.rs | 35 +- tests/test_default_bytes.rs | 2 + tests/test_nfa.rs | 15 + tests/test_nfa_bytes.rs | 16 + tests/test_nfa_utf8bytes.rs | 16 + tests/test_plugin.rs | 1 + 48 files changed, 3534 insertions(+), 1228 deletions(-) create mode 100644 regex-debug/Cargo.toml create mode 100644 regex-debug/src/main.rs create mode 100644 regex-syntax/src/literals.rs create mode 100755 scripts/frequencies.py create mode 100644 src/freqs.rs create mode 100644 src/params.rs delete mode 
100644 src/pool.rs create mode 100644 tests/shortest_match.rs create mode 100644 tests/suffix_reverse.rs diff --git a/Cargo.toml b/Cargo.toml index c36b4ddaba..67fa0821d9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,8 @@ finite automata and guarantees linear time matching on all inputs. aho-corasick = "0.5" # For skipping along search text quickly when a leading byte is known. memchr = "0.1" +# For managing regex caches quickly across multiple threads. +mempool = "0.2" # For parsing regular expressions. regex-syntax = { path = "regex-syntax", version = "0.3.0" } # For compiling UTF-8 decoding into automata. diff --git a/HACKING.md b/HACKING.md index bba1f55b2c..132533d000 100644 --- a/HACKING.md +++ b/HACKING.md @@ -32,7 +32,8 @@ The code to find prefixes and search for prefixes is in src/literals.rs. When more than one literal prefix is found, we fall back to an Aho-Corasick DFA using the aho-corasick crate. For one literal, we use a variant of the Boyer-Moore algorithm. Both Aho-Corasick and Boyer-Moore use `memchr` when -appropriate. +appropriate. The Boyer-Moore variant in this library also uses elementary +frequency analysis to choose the write byte to run `memchr` with. Of course, detecting prefix literals can only take us so far. Not all regular expressions have literal prefixes. To remedy this, we try another approach to @@ -53,10 +54,12 @@ text results in at most one new DFA state. It is made fast by caching states. DFAs are susceptible to exponential state blow up (where the worst case is computing a new state for every input byte, regardless of what's in the state cache). To avoid using a lot of memory, the lazy DFA uses a bounded cache. Once -the cache is full, it is wiped and state computation starts over again. +the cache is full, it is wiped and state computation starts over again. If the +cache is wiped too frequently, then the DFA gives up and searching falls back +to one of the aforementioned algorithms. 
-All of the above matching engines expose precisely the matching semantics. This -is indeed tested. (See the section below about testing.) +All of the above matching engines expose precisely the same matching semantics. +This is indeed tested. (See the section below about testing.) The following sub-sections describe the rest of the library and how each of the matching engines are actually used. @@ -70,6 +73,9 @@ encountered. Parsing is done in a separate crate so that others may benefit from its existence, and because it is relatively divorced from the rest of the regex library. +The regex-syntax crate also provides sophisticated support for extracting +prefix and suffix literals from regular expressions. + ### Compilation The compiler is in src/compile.rs. The input to the compiler is some abstract @@ -162,7 +168,7 @@ knows what the caller wants. Using this information, we can determine which engine (or engines) to use. The logic for choosing which engine to execute is in src/exec.rs and is -documented on the Exec type. Exec values collection regular expression +documented on the Exec type. Exec values contain regular expression Programs (defined in src/prog.rs), which contain all the necessary tidbits for actually executing a regular expression on search text. @@ -172,6 +178,14 @@ of src/exec.rs by far is the execution of the lazy DFA, since it requires a forwards and backwards search, and then falls back to either the NFA algorithm or backtracking if the caller requested capture locations. +The parameterization of every search is defined in src/params.rs. Among other +things, search parameters provide storage for recording capture locations and +matches (for regex sets). The existence and nature of storage is itself a +configuration for how each matching engine behaves. For example, if no storage +for capture locations is provided, then the matching engines can give up as +soon as a match is witnessed (which may occur well before the leftmost-first +match). 
+ ### Programs A regular expression program is essentially a sequence of opcodes produced by @@ -268,48 +282,46 @@ N.B. To run tests for the `regex!` macro, use: The benchmarking in this crate is made up of many micro-benchmarks. Currently, there are two primary sets of benchmarks: the benchmarks that were adopted at -this library's inception (in `benches/bench.rs`) and a newer set of benchmarks +this library's inception (in `benches/src/misc.rs`) and a newer set of benchmarks meant to test various optimizations. Specifically, the latter set contain some -analysis and are in `benches/bench_sherlock.rs`. Also, the latter set are all +analysis and are in `benches/src/sherlock.rs`. Also, the latter set are all executed on the same lengthy input whereas the former benchmarks are executed on strings of varying length. There is also a smattering of benchmarks for parsing and compilation. +Benchmarks are in a separate crate so that its dependencies can be managed +separately from the main regex crate. + Benchmarking follows a similarly wonky setup as tests. There are multiple entry points: -* `bench_native.rs` - benchmarks the `regex!` macro -* `bench_dynamic.rs` - benchmarks `Regex::new` -* `bench_dynamic_nfa.rs` benchmarks `Regex::new`, forced to use the NFA - algorithm on every regex. (N.B. This can take a few minutes to run.) +* `bench_rust_plugin.rs` - benchmarks the `regex!` macro +* `bench_rust.rs` - benchmarks `Regex::new` +* `bench_rust_bytes.rs` benchmarks `bytes::Regex::new` * `bench_pcre.rs` - benchmarks PCRE +* `bench_onig.rs` - benchmarks Oniguruma -The PCRE benchmarks exist as a comparison point to a mature regular expression -library. In general, this regex library compares favorably (there are even a -few benchmarks that PCRE simply runs too slowly on or outright can't execute at -all). I would love to add other regular expression library benchmarks -(especially RE2), but PCRE is the only one with reasonable bindings. 
+The PCRE and Oniguruma benchmarks exist as a comparison point to a mature +regular expression library. In general, this regex library compares favorably +(there are even a few benchmarks that PCRE simply runs too slowly on or +outright can't execute at all). I would love to add other regular expression +library benchmarks (especially RE2). If you're hacking on one of the matching engines and just want to see benchmarks, then all you need to run is: - $ cargo bench --bench dynamic + $ ./run-bench rust If you want to compare your results with older benchmarks, then try: - $ cargo bench --bench dynamic | tee old + $ ./run-bench rust | tee old $ ... make it faster - $ cargo bench --bench dynamic | tee new + $ ./run-bench rust | tee new $ cargo-benchcmp old new --improvements The `cargo-benchcmp` utility is available here: https://github.com/BurntSushi/cargo-benchcmp -To run the same benchmarks on PCRE, you'll need to use the sub-crate in -`regex-pcre-benchmark` like so: - - $ cargo bench --manifest-path regex-pcre-benchmark/Cargo.toml - -The PCRE benchmarks are separated from the main regex crate so that its -dependency doesn't break builds in environments without PCRE. +The `run-bench` utility can run benchmarks for PCRE and Oniguruma too. See +`./run-bench --help`. diff --git a/benches/Cargo.toml b/benches/Cargo.toml index 5e3b6a3861..5dd7e7567d 100644 --- a/benches/Cargo.toml +++ b/benches/Cargo.toml @@ -14,7 +14,6 @@ enum-set = "0.0.6" lazy_static = "0.1" onig = { version = "0.4", optional = true } pcre = { version = "0.2", optional = true } -rand = "0.3" regex = { version = "0.1", path = ".." 
} regex_macros = { version = "0.1", path = "../regex_macros", optional = true } regex-syntax = { version = "0.3", path = "../regex-syntax" } diff --git a/benches/src/bench_onig.rs b/benches/src/bench_onig.rs index 577fb955d5..32a93caaf9 100644 --- a/benches/src/bench_onig.rs +++ b/benches/src/bench_onig.rs @@ -12,7 +12,6 @@ #[macro_use] extern crate lazy_static; extern crate onig; -extern crate rand; extern crate test; use std::ops::Deref; diff --git a/benches/src/bench_pcre.rs b/benches/src/bench_pcre.rs index 4441c1d0ff..f959b474d3 100644 --- a/benches/src/bench_pcre.rs +++ b/benches/src/bench_pcre.rs @@ -24,7 +24,6 @@ extern crate enum_set; #[macro_use] extern crate lazy_static; extern crate pcre; -extern crate rand; extern crate test; /// A nominal wrapper around pcre::Pcre to expose an interface similar to diff --git a/benches/src/bench_rust.rs b/benches/src/bench_rust.rs index e19d453cea..259c4a12b6 100644 --- a/benches/src/bench_rust.rs +++ b/benches/src/bench_rust.rs @@ -11,7 +11,6 @@ #![feature(test)] #[macro_use] extern crate lazy_static; -extern crate rand; extern crate regex; extern crate regex_syntax; extern crate test; diff --git a/benches/src/bench_rust_bytes.rs b/benches/src/bench_rust_bytes.rs index 05a5592722..00b600ca73 100644 --- a/benches/src/bench_rust_bytes.rs +++ b/benches/src/bench_rust_bytes.rs @@ -11,7 +11,6 @@ #![feature(test)] #[macro_use] extern crate lazy_static; -extern crate rand; extern crate regex; extern crate regex_syntax; extern crate test; diff --git a/benches/src/bench_rust_plugin.rs b/benches/src/bench_rust_plugin.rs index 11a85e634b..5b428e76ae 100644 --- a/benches/src/bench_rust_plugin.rs +++ b/benches/src/bench_rust_plugin.rs @@ -12,7 +12,6 @@ #![plugin(regex_macros)] #[macro_use] extern crate lazy_static; -extern crate rand; extern crate regex; extern crate regex_syntax; extern crate test; diff --git a/benches/src/misc.rs b/benches/src/misc.rs index 460c4f808a..204b919127 100644 --- a/benches/src/misc.rs +++ 
b/benches/src/misc.rs @@ -130,6 +130,14 @@ bench_match!(one_pass_long_prefix_not, regex!("^.bcdefghijklmnopqrstuvwxyz.*$"), "abcdefghijklmnopqrstuvwxyz".to_owned() }); +bench_match!(long_needle1, regex!("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaab"), { + repeat("a").take(100_000).collect::() + "b" +}); + +bench_match!(long_needle2, regex!("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbba"), { + repeat("b").take(100_000).collect::() + "a" +}); + #[cfg(feature = "re-rust")] #[bench] fn replace_all(b: &mut Bencher) { diff --git a/benches/src/rust_compile.rs b/benches/src/rust_compile.rs index 436e3a7ae9..9a89981fa0 100644 --- a/benches/src/rust_compile.rs +++ b/benches/src/rust_compile.rs @@ -29,6 +29,13 @@ fn compile_simple_bytes(b: &mut Bencher) { }); } +#[bench] +fn compile_simple_full(b: &mut Bencher) { + b.iter(|| { + regex!(r"^bc(d|e)*$") + }); +} + #[bench] fn compile_small(b: &mut Bencher) { b.iter(|| { @@ -45,6 +52,13 @@ fn compile_small_bytes(b: &mut Bencher) { }); } +#[bench] +fn compile_small_full(b: &mut Bencher) { + b.iter(|| { + regex!(r"\p{L}|\p{N}|\s|.|\d") + }); +} + #[bench] fn compile_huge(b: &mut Bencher) { b.iter(|| { @@ -60,3 +74,10 @@ fn compile_huge_bytes(b: &mut Bencher) { Compiler::new().bytes(true).compile(&[re]).unwrap() }); } + +#[bench] +fn compile_huge_full(b: &mut Bencher) { + b.iter(|| { + regex!(r"\p{L}{100}") + }); +} diff --git a/regex-debug/Cargo.toml b/regex-debug/Cargo.toml new file mode 100644 index 0000000000..6645ab60d4 --- /dev/null +++ b/regex-debug/Cargo.toml @@ -0,0 +1,19 @@ +[package] +publish = false +name = "regex-debug" +version = "0.1.0" +authors = ["The Rust Project Developers"] +license = "MIT/Apache-2.0" +repository = "https://github.com/rust-lang/regex" +documentation = "http://doc.rust-lang.org/regex" +homepage = "https://github.com/rust-lang/regex" +description = "A tool useful for debugging regular expressions." + +[dependencies] +docopt = "0.6" +regex = { version = "0.1", path = ".." 
} +regex-syntax = { version = "0.3", path = "../regex-syntax" } +rustc-serialize = "0.3" + +[profile.release] +debug = true diff --git a/regex-debug/src/main.rs b/regex-debug/src/main.rs new file mode 100644 index 0000000000..d19a6c7a99 --- /dev/null +++ b/regex-debug/src/main.rs @@ -0,0 +1,264 @@ +extern crate docopt; +extern crate regex; +extern crate regex_syntax as syntax; +extern crate rustc_serialize; + +use std::error; +use std::io::{self, Write}; +use std::process; +use std::result; + +use docopt::Docopt; +use regex::internal::{Compiler, LiteralSearcher}; +use syntax::{ExprBuilder, Expr, Literals}; + +const USAGE: &'static str = " +Usage: + regex-debug [options] ast + regex-debug [options] prefixes ... + regex-debug [options] suffixes ... + regex-debug [options] anchors + regex-debug [options] captures + regex-debug [options] compile ... + regex-debug --help + +Options: + --help Show this usage message. + --size-limit ARG An approximate size limit on the total size (in bytes) + of a compiled regular expression program. + [default: 10485760] + --bytes Show the instruction codes for byte oriented programs. + (As opposed to Unicode oriented programs.) + --dfa Show the instruction codes for a DFA. + --dfa-reverse Show the instruction codes for a reverse DFA. + This implies --dfa. + -a, --all-literals Shows all literals extracted. + By default, only unambiguous literals are shown. + --literal-limit ARG An approximate limit on the total size (in bytes) + of all literals extracted. [default: 250] + --class-limit ARG A limit on the size of character classes used to + extract literals. [default: 10] + --lcp Show the longest common prefix of all the literals + extracted. + --lcs Show the longest common suffix of all the literals + extracted. + --searcher Show the debug output for the literal searcher + constructed by the literals found. 
+"; + +#[derive(RustcDecodable)] +struct Args { + cmd_ast: bool, + cmd_prefixes: bool, + cmd_suffixes: bool, + cmd_anchors: bool, + cmd_captures: bool, + cmd_compile: bool, + + arg_pattern: String, + arg_patterns: Vec, + + flag_size_limit: usize, + flag_bytes: bool, + flag_dfa: bool, + flag_dfa_reverse: bool, + flag_all_literals: bool, + flag_literal_limit: usize, + flag_class_limit: usize, + flag_lcp: bool, + flag_lcs: bool, + flag_searcher: bool, +} + +type Result = result::Result>; + +fn main() { + let mut args: Args = Docopt::new(USAGE) + .and_then(|d| d.decode()) + .unwrap_or_else(|e| e.exit()); + if args.flag_dfa_reverse { + args.flag_dfa = true; + } + match run(&args) { + Ok(_) => process::exit(0), + Err(err) => { + let _ = writeln!(&mut io::stderr(), "{}", err); + process::exit(1) + } + } +} + +fn run(args: &Args) -> Result<()> { + if args.cmd_ast { + cmd_ast(args) + } else if args.cmd_prefixes { + cmd_literals(args) + } else if args.cmd_suffixes { + cmd_literals(args) + } else if args.cmd_anchors { + cmd_anchors(args) + } else if args.cmd_captures { + cmd_captures(args) + } else if args.cmd_compile { + cmd_compile(args) + } else { + unreachable!() + } +} + +fn cmd_ast(args: &Args) -> Result<()> { + println!("{:#?}", try!(args.parse_one())); + Ok(()) +} + +fn cmd_literals(args: &Args) -> Result<()> { + let exprs = try!(args.parse_many()); + let mut lits = + if args.cmd_prefixes { + args.literals(&exprs, |lits, e| lits.union_prefixes(e)) + } else { + args.literals(&exprs, |lits, e| lits.union_suffixes(e)) + }; + if !args.flag_all_literals { + if args.cmd_prefixes { + lits = lits.unambiguous_prefixes(); + } else { + lits = lits.unambiguous_suffixes(); + } + } + if args.flag_searcher { + if args.cmd_prefixes { + println!("{:?}", LiteralSearcher::prefixes(lits)) + } else { + println!("{:?}", LiteralSearcher::suffixes(lits)) + } + } else if args.flag_lcp { + println!("{}", escape_unicode(lits.longest_common_prefix())); + } else if args.flag_lcs { + 
println!("{}", escape_unicode(lits.longest_common_suffix())); + } else { + for lit in lits.literals() { + println!("{:?}", lit); + } + } + Ok(()) +} + +fn cmd_anchors(args: &Args) -> Result<()> { + let expr = try!(args.parse_one()); + if expr.is_anchored_start() { + println!("start"); + } + if expr.is_anchored_end() { + println!("end"); + } + Ok(()) +} + +fn cmd_captures(args: &Args) -> Result<()> { + let expr = try!(args.parse_one()); + let prog = try!(args.compiler().only_utf8(false).compile(&[expr])); + for (i, name) in prog.captures.iter().enumerate() { + match *name { + None => println!("{}", i), + Some(ref name) => println!("{}:{}", i, name), + } + } + Ok(()) +} + +fn cmd_compile(args: &Args) -> Result<()> { + let exprs = try!(args.parse_many()); + let compiler = + args.compiler() + .bytes(args.flag_bytes) + .only_utf8(!args.flag_bytes) + .dfa(args.flag_dfa) + .reverse(args.flag_dfa_reverse); + let prog = try!(compiler.compile(&exprs)); + print!("{:?}", prog); + Ok(()) +} + +impl Args { + fn parse_one(&self) -> Result { + parse(&self.arg_pattern) + } + + fn parse_many(&self) -> Result> { + self.arg_patterns.iter().map(|s| parse(s)).collect() + } + + fn literals bool>( + &self, + exprs: &[Expr], + get_literals: F, + ) -> Literals { + let mut lits = Some(self.empty_literals()); + for e in exprs { + lits = lits.and_then(|mut lits| { + if !get_literals(&mut lits, e) { + None + } else { + Some(lits) + } + }); + } + lits.unwrap_or(self.empty_literals()) + } + + fn empty_literals(&self) -> Literals { + let mut lits = Literals::empty(); + lits.set_limit_size(self.flag_literal_limit); + lits.set_limit_class(self.flag_class_limit); + lits + } + + fn compiler(&self) -> Compiler { + Compiler::new().size_limit(self.flag_size_limit) + } +} + +fn parse(re: &str) -> Result { + ExprBuilder::new().allow_bytes(true).parse(re).map_err(From::from) +} + +fn escape_unicode(bytes: &[u8]) -> String { + let show = match ::std::str::from_utf8(bytes) { + Ok(v) => v.to_string(), + Err(_) 
=> escape_bytes(bytes), + }; + let mut space_escaped = String::new(); + for c in show.chars() { + if c.is_whitespace() { + let escaped = if c as u32 <= 0x7F { + escape_byte(c as u8) + } else { + if c as u32 <= 0xFFFF { + format!(r"\u{{{:04x}}}", c as u32) + } else { + format!(r"\U{{{:08x}}}", c as u32) + } + }; + space_escaped.push_str(&escaped); + } else { + space_escaped.push(c); + } + } + space_escaped +} + +fn escape_bytes(bytes: &[u8]) -> String { + let mut s = String::new(); + for &b in bytes { + s.push_str(&escape_byte(b)); + } + s +} + +fn escape_byte(byte: u8) -> String { + use std::ascii::escape_default; + + let escaped: Vec = escape_default(byte).collect(); + String::from_utf8_lossy(&escaped).into_owned() +} diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index 1895236070..21a9681458 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -61,10 +61,12 @@ assert_eq!(err.kind(), &ErrorKind::UnclosedParen); */ #![deny(missing_docs)] +#![cfg_attr(test, deny(warnings))] #[cfg(test)] extern crate quickcheck; #[cfg(test)] extern crate rand; +mod literals; mod parser; mod unicode; @@ -86,6 +88,8 @@ use self::Repeater::*; use parser::{Flags, Parser}; +pub use literals::{Literals, Lit}; + /// A regular expression abstract syntax tree. /// /// An `Expr` represents the abstract syntax of a regular expression. @@ -488,6 +492,20 @@ impl Expr { simp(self, 0, nest_limit) } + /// Returns a set of literal prefixes extracted from this expression. + pub fn prefixes(&self) -> Literals { + let mut lits = Literals::empty(); + lits.union_prefixes(self); + lits + } + + /// Returns a set of literal suffixes extracted from this expression. + pub fn suffixes(&self) -> Literals { + let mut lits = Literals::empty(); + lits.union_suffixes(self); + lits + } + /// Returns true if and only if the expression is required to match from /// the beginning of text. 
pub fn is_anchored_start(&self) -> bool { @@ -568,6 +586,41 @@ impl CharClass { self.binary_search_by(|range| c.partial_cmp(range).unwrap()).is_ok() } + /// Removes the given character from the class if it exists. + /// + /// Note that this takes `O(n)` time in the number of ranges. + pub fn remove(&mut self, c: char) { + let mut i = match self.binary_search_by(|r| c.partial_cmp(r).unwrap()) { + Ok(i) => i, + Err(_) => return, + }; + let mut r = self.ranges.remove(i); + if r.start == c { + r.start = inc_char(c); + if r.start > r.end || c == char::MAX { + return; + } + self.ranges.insert(i, r); + } else if r.end == c { + r.end = dec_char(c); + if r.end < r.start || c == '\x00' { + return; + } + self.ranges.insert(0, r); + } else { + let (mut r1, mut r2) = (r.clone(), r.clone()); + r1.end = dec_char(c); + if r1.start <= r1.end { + self.ranges.insert(i, r1); + i += 1; + } + r2.start = inc_char(c); + if r2.start <= r2.end { + self.ranges.insert(i, r2); + } + } + } + /// Create a new empty class from this one. fn to_empty(&self) -> CharClass { CharClass { ranges: Vec::with_capacity(self.len()) } @@ -662,6 +715,14 @@ impl CharClass { } folded.canonicalize() } + + /// Returns the number of characters that match this class. + fn num_chars(&self) -> usize { + self.ranges.iter() + .map(|&r| 1 + (r.end as u32) - (r.start as u32)) + .fold(0, |acc, len| acc + len) + as usize + } } impl ClassRange { @@ -814,6 +875,41 @@ impl ByteClass { self.binary_search_by(|range| b.partial_cmp(range).unwrap()).is_ok() } + /// Removes the given byte from the class if it exists. + /// + /// Note that this takes `O(n)` time in the number of ranges. 
+ pub fn remove(&mut self, b: u8) { + let mut i = match self.binary_search_by(|r| b.partial_cmp(r).unwrap()) { + Ok(i) => i, + Err(_) => return, + }; + let mut r = self.ranges.remove(i); + if r.start == b { + r.start = b.saturating_add(1); + if r.start > r.end || b == u8::MAX { + return; + } + self.ranges.insert(i, r); + } else if r.end == b { + r.end = b.saturating_sub(1); + if r.end < r.start || b == b'\x00' { + return; + } + self.ranges.insert(0, r); + } else { + let (mut r1, mut r2) = (r.clone(), r.clone()); + r1.end = b.saturating_sub(1); + if r1.start <= r1.end { + self.ranges.insert(i, r1); + i += 1; + } + r2.start = b.saturating_add(1); + if r2.start <= r2.end { + self.ranges.insert(i, r2); + } + } + } + /// Create a new empty class from this one. fn to_empty(&self) -> ByteClass { ByteClass { ranges: Vec::with_capacity(self.len()) } @@ -886,6 +982,14 @@ impl ByteClass { } folded.canonicalize() } + + /// Returns the number of bytes that match this class. + fn num_bytes(&self) -> usize { + self.ranges.iter() + .map(|&r| 1 + (r.end as u32) - (r.start as u32)) + .fold(0, |acc, len| acc + len) + as usize + } } impl ByteRange { diff --git a/regex-syntax/src/literals.rs b/regex-syntax/src/literals.rs new file mode 100644 index 0000000000..37204764c8 --- /dev/null +++ b/regex-syntax/src/literals.rs @@ -0,0 +1,1385 @@ +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::cmp; +use std::fmt; +use std::iter; +use std::mem; +use std::ops; + +use {Expr, CharClass, ClassRange, ByteClass, ByteRange, Repeater}; + +/// A set of literal byte strings extracted from a regular expression. 
+/// +/// Every member of the set is a `Lit`, which is represented by a `Vec`. +/// (Notably, it may contain invalid UTF-8.) Every member is said to be either +/// *complete* or *cut*. A complete literal means that it extends until the +/// beginning (or end) of the regular expression. In some circumstances, this +/// can be used to indicate a match in the regular expression. +/// +/// Note that a key aspect of literal extraction is knowing when to stop. It is +/// not feasible to blindly extract all literals from a regular expression, +/// even if there are finitely many. For example, the regular expression +/// `[0-9]{10}` has `10^10` distinct literals. For this reason, literal +/// extraction is bounded to some low number by default using heuristics, but +/// the limits can be tweaked. +#[derive(Clone, Eq, PartialEq)] +pub struct Literals { + lits: Vec, + limit_size: usize, + limit_class: usize, +} + +/// A single member of a set of literals extracted from a regular expression. +/// +/// This type has `Deref` and `DerefMut` impls to `Vec` so that all slice +/// and `Vec` operations are available. +#[derive(Clone, Eq, Ord)] +pub struct Lit { + v: Vec, + cut: bool, +} + +impl Literals { + /// Returns a new empty set of literals using default limits. + pub fn empty() -> Literals { + Literals { + lits: vec![], + limit_size: 250, + limit_class: 10, + } + } + + /// Get the approximate size limit (in bytes) of this set. + pub fn limit_size(&self) -> usize { + self.limit_size + } + + /// Set the approximate size limit (in bytes) of this set. + /// + /// If extracting a literal would put the set over this limit, then + /// extraction stops. + /// + /// The new limits will only apply to additions to this set. Existing + /// members remain unchanged, even if the set exceeds the new limit. + pub fn set_limit_size(&mut self, size: usize) -> &mut Literals { + self.limit_size = size; + self + } + + /// Get the character class size limit for this set. 
+ pub fn limit_class(&self) -> usize { + self.limit_class + } + + /// Limits the size of character(or byte) classes considered. + /// + /// A value of `0` prevents all character classes from being considered. + /// + /// This limit also applies to case insensitive literals, since each + /// character in the case insensitive literal is converted to a class, and + /// then case folded. + /// + /// The new limits will only apply to additions to this set. Existing + /// members remain unchanged, even if the set exceeds the new limit. + pub fn set_limit_class(&mut self, size: usize) -> &mut Literals { + self.limit_class = size; + self + } + + /// Returns the set of literals as a slice. Its order is unspecified. + pub fn literals(&self) -> &[Lit] { + &self.lits + } + + /// Returns true if all members in this set are complete. + pub fn all_complete(&self) -> bool { + !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut()) + } + + /// Returns true if any member in this set is complete. + pub fn any_complete(&self) -> bool { + self.lits.iter().any(|lit| !lit.is_cut()) + } + + /// Returns true if this set contains an empty literal. + pub fn contains_empty(&self) -> bool { + self.lits.iter().any(|lit| lit.is_empty()) + } + + /// Returns true if this set is empty or if all of its members is empty. + pub fn is_empty(&self) -> bool { + self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty()) + } + + /// Returns a new empty set of literals using this set's limits. + pub fn to_empty(&self) -> Literals { + let mut lits = Literals::empty(); + lits.set_limit_size(self.limit_size) + .set_limit_class(self.limit_class); + lits + } + + /// Returns the longest common prefix of all members in this set. + pub fn longest_common_prefix(&self) -> &[u8] { + if self.is_empty() { + return &[]; + } + let lit0 = &*self.lits[0]; + let mut len = lit0.len(); + for lit in &self.lits[1..] 
{ + len = cmp::min( + len, + lit.iter() + .zip(lit0) + .take_while(|&(a, b)| a == b) + .count()); + } + &self.lits[0][..len] + } + + /// Returns the longest common suffix of all members in this set. + pub fn longest_common_suffix(&self) -> &[u8] { + if self.is_empty() { + return &[]; + } + let lit0 = &*self.lits[0]; + let mut len = lit0.len(); + for lit in &self.lits[1..] { + len = cmp::min( + len, + lit.iter() + .rev() + .zip(lit0.iter().rev()) + .take_while(|&(a, b)| a == b) + .count()); + } + &self.lits[0][self.lits[0].len() - len..] + } + + /// Returns a new set of prefixes of this set of literals that are + /// guaranteed to be unambiguous. + /// + /// Any substring match with a member of the set is returned is guaranteed + /// to never overlap with a substring match of another member of the set + /// at the same starting position. + /// + /// Given any two members of the returned set, neither is a substring of + /// the other. + pub fn unambiguous_prefixes(&self) -> Literals { + if self.lits.is_empty() { + return self.to_empty(); + } + let mut new = self.to_empty(); + 'OUTER: + for lit1 in &self.lits { + if new.lits.is_empty() { + new.lits.push(lit1.clone()); + continue; + } + let mut candidate = lit1.clone(); + for lit2 in &mut new.lits { + if lit2.is_empty() { + continue; + } + if &candidate == lit2 { + // If the literal is already in the set, then we can + // just drop it. But make sure that cut literals are + // infectious! + candidate.cut = candidate.cut || lit2.cut; + lit2.cut = candidate.cut; + continue 'OUTER; + } + if candidate.len() <= lit2.len() { + if let Some(i) = position(&candidate, &lit2) { + lit2.truncate(i); + lit2.cut(); + candidate.cut(); + } + } else { + if let Some(i) = position(&lit2, &candidate) { + candidate.truncate(i); + candidate.cut(); + lit2.cut(); + } + } + // Oops, the candidate is already represented in the set. 
+ if candidate.is_empty() { + continue 'OUTER; + } + } + new.lits.push(candidate); + } + new.lits.retain(|lit| !lit.is_empty()); + new.lits.sort(); + new.lits.dedup(); + new + } + + /// Returns a new set of suffixes of this set of literals that are + /// guaranteed to be unambiguous. + /// + /// Any substring match with a member of the set is returned is guaranteed + /// to never overlap with a substring match of another member of the set + /// at the same ending position. + /// + /// Given any two members of the returned set, neither is a substring of + /// the other. + pub fn unambiguous_suffixes(&self) -> Literals { + // This is a touch wasteful... + let mut lits = self.clone(); + lits.reverse(); + let mut unamb = lits.unambiguous_prefixes(); + unamb.reverse(); + unamb + } + + /// Unions the prefixes from the given expression to this set. + /// + /// If prefixes could not be added (for example, this set would exceed its + /// size limits or the set of prefixes from `expr` includes the empty + /// string), then false is returned. + /// + /// Note that prefix literals extracted from `expr` are said to be complete + /// if and only if the literal extends from the beginning of `expr` to the + /// end of `expr`. + pub fn union_prefixes(&mut self, expr: &Expr) -> bool { + let mut lits = self.to_empty(); + prefixes(expr, &mut lits); + !lits.is_empty() && !lits.contains_empty() && self.union(lits) + } + + /// Unions the suffixes from the given expression to this set. + /// + /// If suffixes could not be added (for example, this set would exceed its + /// size limits or the set of suffixes from `expr` includes the empty + /// string), then false is returned. + /// + /// Note that prefix literals extracted from `expr` are said to be complete + /// if and only if the literal extends from the end of `expr` to the + /// beginning of `expr`. 
+ pub fn union_suffixes(&mut self, expr: &Expr) -> bool { + let mut lits = self.to_empty(); + suffixes(expr, &mut lits); + lits.reverse(); + !lits.is_empty() && !lits.contains_empty() && self.union(lits) + } + + /// Unions this set with another set. + /// + /// If the union would cause the set to exceed its limits, then the union + /// is skipped and it returns false. Otherwise, if the union succeeds, it + /// returns true. + pub fn union(&mut self, lits: Literals) -> bool { + if self.num_bytes() + lits.num_bytes() > self.limit_size { + return false; + } + if lits.is_empty() { + self.lits.push(Lit::empty()); + } else { + self.lits.extend(lits.lits); + } + true + } + + /// Extends this set with another set. + /// + /// The set of literals is extended via a cross product. + /// + /// If a cross product would cause this set to exceed its limits, then the + /// cross product is skipped and it returns false. Otherwise, if the cross + /// product succeeds, it returns true. + pub fn cross_product(&mut self, lits: &Literals) -> bool { + if lits.is_empty() { + return true; + } + // Check that we make sure we stay in our limits. + let mut size_after; + if self.is_empty() || !self.any_complete() { + size_after = self.num_bytes(); + for lits_lit in lits.literals() { + size_after += lits_lit.len(); + } + } else { + size_after = self.lits.iter().fold(0, |accum, lit| { + accum + if lit.is_cut() { lit.len() } else { 0 } + }); + for lits_lit in lits.literals() { + for self_lit in self.literals() { + if !self_lit.is_cut() { + size_after += self_lit.len() + lits_lit.len(); + } + } + } + } + if size_after > self.limit_size { + return false; + } + + let mut base = self.remove_complete(); + if base.is_empty() { + base = vec![Lit::empty()]; + } + for lits_lit in lits.literals() { + for mut self_lit in base.clone() { + self_lit.extend(&**lits_lit); + self_lit.cut = lits_lit.cut; + self.lits.push(self_lit); + } + } + true + } + + /// Extends each literal in this set with the bytes given. 
+ /// + /// If the set is empty, then the given literal is added to the set. + /// + /// If adding any number of bytes to all members of this set causes a limit + /// to be exceeded, then no bytes are added and false is returned. If a + /// prefix of `bytes` can be fit into this set, then it is used and all + /// resulting literals are cut. + pub fn cross_add(&mut self, bytes: &[u8]) -> bool { + // N.B. This could be implemented by simply calling cross_product with + // a literal set containing just `bytes`, but we can be smarter about + // taking shorter prefixes of `bytes` if they'll fit. + if bytes.is_empty() { + return true; + } + if self.lits.is_empty() { + let i = cmp::min(self.limit_size, bytes.len()); + self.lits.push(Lit::new(bytes[..i].to_owned())); + self.lits[0].cut = i < bytes.len(); + return !self.lits[0].is_cut(); + } + let size = self.num_bytes(); + if size + self.lits.len() >= self.limit_size { + return false; + } + let mut i = 1; + while size + (i * self.lits.len()) <= self.limit_size + && i <= bytes.len() { + i += 1; + } + for lit in &mut self.lits { + if !lit.is_cut() { + lit.extend(&bytes[..i]); + if i < bytes.len() { + lit.cut(); + } + } + } + true + } + + /// Adds the given literal to this set. + /// + /// Returns false if adding this literal would cause the class to be too + /// big. + pub fn add(&mut self, lit: Lit) -> bool { + if self.num_bytes() + lit.len() > self.limit_size { + return false; + } + self.lits.push(lit); + true + } + + /// Extends each literal in this set with the character class given. + /// + /// Returns false if the character class was too big to add. 
+ pub fn add_char_class(&mut self, cls: &CharClass) -> bool { + use std::char; + + if self.class_exceeds_limits(cls.num_chars()) { + return false; + } + let mut base = self.remove_complete(); + if base.is_empty() { + base = vec![Lit::empty()]; + } + for r in cls { + let (s, e) = (r.start as u32, r.end as u32 + 1); + for c in (s..e).filter_map(char::from_u32) { + for mut lit in base.clone() { + lit.extend(c.to_string().as_bytes()); + self.lits.push(lit); + } + } + } + true + } + + /// Extends each literal in this set with the byte class given. + /// + /// Returns false if the byte class was too big to add. + pub fn add_byte_class(&mut self, cls: &ByteClass) -> bool { + if self.class_exceeds_limits(cls.num_bytes()) { + return false; + } + let mut base = self.remove_complete(); + if base.is_empty() { + base = vec![Lit::empty()]; + } + for r in cls { + let (s, e) = (r.start as u32, r.end as u32 + 1); + for b in (s..e).map(|b| b as u8) { + for mut lit in base.clone() { + lit.push(b); + self.lits.push(lit); + } + } + } + true + } + + /// Cuts every member of this set. When a member is cut, it can never + /// be extended. + pub fn cut(&mut self) { + for lit in &mut self.lits { + lit.cut(); + } + } + + /// Reverses all members in place. + pub fn reverse(&mut self) { + for lit in &mut self.lits { + lit.reverse(); + } + } + + /// Clears this set of all members. + pub fn clear(&mut self) { + self.lits.clear(); + } + + /// Pops all complete literals out of this set. + fn remove_complete(&mut self) -> Vec { + let mut base = vec![]; + for lit in mem::replace(&mut self.lits, vec![]) { + if lit.is_cut() { + self.lits.push(lit); + } else { + base.push(lit); + } + } + base + } + + /// Returns the total number of bytes in this set. + fn num_bytes(&self) -> usize { + self.lits.iter().fold(0, |accum, lit| accum + lit.len()) + } + + /// Returns true if a character class with the given size would cause this + /// set to exceed its limits. 
+ /// + /// The size given should correspond to the number of items in the class. + fn class_exceeds_limits(&self, size: usize) -> bool { + if size > self.limit_class { + return true; + } + // This is an approximation since codepoints in a char class can encode + // to 1-4 bytes. + let new_byte_count = + if self.lits.is_empty() { + size + } else { + self.lits + .iter() + .fold(0, |accum, lit| { + accum + if lit.is_cut() { + // If the literal is cut, then we'll never add + // anything to it, so don't count it. + 0 + } else { + (lit.len() + 1) * size + } + }) + }; + new_byte_count > self.limit_size + } +} + +fn prefixes(expr: &Expr, lits: &mut Literals) { + use Expr::*; + match *expr { + Literal { ref chars, casei: false } => { + let s: String = chars.iter().cloned().collect(); + lits.cross_add(s.as_bytes()); + } + Literal { ref chars, casei: true } => { + for &c in chars { + let cls = CharClass::new(vec![ + ClassRange { start: c, end: c }, + ]).case_fold(); + if !lits.add_char_class(&cls) { + lits.cut(); + return; + } + } + } + LiteralBytes { ref bytes, casei: false } => { + lits.cross_add(bytes); + } + LiteralBytes { ref bytes, casei: true } => { + for &b in bytes { + let cls = ByteClass::new(vec![ + ByteRange { start: b, end: b }, + ]).case_fold(); + if !lits.add_byte_class(&cls) { + lits.cut(); + return; + } + } + } + Class(ref cls) => { + if !lits.add_char_class(cls) { + lits.cut(); + } + } + ClassBytes(ref cls) => { + if !lits.add_byte_class(cls) { + lits.cut(); + } + } + Group { ref e, .. } => { + prefixes(&**e, lits); + } + Repeat { ref e, r: Repeater::ZeroOrOne, .. } => { + repeat_zero_or_one_literals(&**e, lits, prefixes); + } + Repeat { ref e, r: Repeater::ZeroOrMore, .. } => { + repeat_zero_or_more_literals(&**e, lits, prefixes); + } + Repeat { ref e, r: Repeater::OneOrMore, .. 
} => { + repeat_one_or_more_literals(&**e, lits, prefixes); + } + Repeat { ref e, r: Repeater::Range { min, max }, greedy } => { + repeat_range_literals(&**e, min, max, greedy, lits, prefixes); + } + Concat(ref es) if es.is_empty() => {} + Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits), + Concat(ref es) => { + for e in es { + if let StartText = *e { + if !lits.is_empty() { + lits.cut(); + break; + } + lits.add(Lit::empty()); + continue; + } + let mut lits2 = lits.to_empty(); + prefixes(e, &mut lits2); + if !lits.cross_product(&lits2) || !lits2.any_complete() { + // If this expression couldn't yield any literal that + // could be extended, then we need to quit. Since we're + // short-circuiting, we also need to freeze every member. + lits.cut(); + break; + } + } + } + Alternate(ref es) => { + alternate_literals(es, lits, prefixes); + } + _ => lits.cut(), + } +} + +fn suffixes(expr: &Expr, lits: &mut Literals) { + use Expr::*; + match *expr { + Literal { ref chars, casei: false } => { + let s: String = chars.iter().rev().cloned().collect(); + lits.cross_add(s.as_bytes()); + } + Literal { ref chars, casei: true } => { + for &c in chars.iter().rev() { + let cls = CharClass::new(vec![ + ClassRange { start: c, end: c }, + ]).case_fold(); + if !lits.add_char_class(&cls) { + lits.cut(); + return; + } + } + } + LiteralBytes { ref bytes, casei: false } => { + let b: Vec = bytes.iter().rev().cloned().collect(); + lits.cross_add(&b); + } + LiteralBytes { ref bytes, casei: true } => { + for &b in bytes.iter().rev() { + let cls = ByteClass::new(vec![ + ByteRange { start: b, end: b }, + ]).case_fold(); + if !lits.add_byte_class(&cls) { + lits.cut(); + return; + } + } + } + Class(ref cls) => { + if !lits.add_char_class(cls) { + lits.cut(); + } + } + ClassBytes(ref cls) => { + if !lits.add_byte_class(cls) { + lits.cut(); + } + } + Group { ref e, .. } => { + suffixes(&**e, lits); + } + Repeat { ref e, r: Repeater::ZeroOrOne, .. 
} => { + repeat_zero_or_one_literals(&**e, lits, suffixes); + } + Repeat { ref e, r: Repeater::ZeroOrMore, .. } => { + repeat_zero_or_more_literals(&**e, lits, suffixes); + } + Repeat { ref e, r: Repeater::OneOrMore, .. } => { + repeat_one_or_more_literals(&**e, lits, suffixes); + } + Repeat { ref e, r: Repeater::Range { min, max }, greedy } => { + repeat_range_literals(&**e, min, max, greedy, lits, suffixes); + } + Concat(ref es) if es.is_empty() => {} + Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits), + Concat(ref es) => { + for e in es.iter().rev() { + if let EndText = *e { + if !lits.is_empty() { + lits.cut(); + break; + } + lits.add(Lit::empty()); + continue; + } + let mut lits2 = lits.to_empty(); + suffixes(e, &mut lits2); + if !lits.cross_product(&lits2) || !lits2.any_complete() { + // If this expression couldn't yield any literal that + // could be extended, then we need to quit. Since we're + // short-circuiting, we also need to freeze every member. + lits.cut(); + break; + } + } + } + Alternate(ref es) => { + alternate_literals(es, lits, suffixes); + } + _ => lits.cut(), + } +} + +fn repeat_zero_or_one_literals( + e: &Expr, + lits: &mut Literals, + mut f: F, +) { + let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); + lits3.set_limit_size(lits.limit_size() / 2); + f(e, &mut lits3); + + if lits3.is_empty() || !lits2.cross_product(&lits3) { + lits.cut(); + return; + } + lits2.add(Lit::empty()); + if !lits.union(lits2) { + lits.cut(); + } +} + +fn repeat_zero_or_more_literals( + e: &Expr, + lits: &mut Literals, + mut f: F, +) { + let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); + lits3.set_limit_size(lits.limit_size() / 2); + f(e, &mut lits3); + + if lits3.is_empty() || !lits2.cross_product(&lits3) { + lits.cut(); + return; + } + lits2.cut(); + lits2.add(Lit::empty()); + if !lits.union(lits2) { + lits.cut(); + } +} + +fn repeat_one_or_more_literals( + e: &Expr, + lits: &mut Literals, + mut f: F, +) { + f(e, lits); + 
lits.cut(); +} + +fn repeat_range_literals( + e: &Expr, + min: u32, + max: Option, + greedy: bool, + lits: &mut Literals, + mut f: F, +) { + use Expr::*; + + if min == 0 { + // This is a bit conservative. If `max` is set, then we could + // treat this as a finite set of alternations. For now, we + // just treat it as `e*`. + f(&Repeat { + e: Box::new(e.clone()), + r: Repeater::ZeroOrMore, + greedy: greedy, + }, lits); + } else { + if min > 0 { + let n = cmp::min(lits.limit_size, min as usize); + let es = iter::repeat(e.clone()).take(n).collect(); + f(&Concat(es), lits); + if n < min as usize { + lits.cut(); + } + } + if max.map_or(true, |max| min < max) { + lits.cut(); + } + } +} + +fn alternate_literals( + es: &[Expr], + lits: &mut Literals, + mut f: F, +) { + let mut lits2 = lits.to_empty(); + for e in es { + let mut lits3 = lits.to_empty(); + lits3.set_limit_size(lits.limit_size() / 5); + f(e, &mut lits3); + if lits3.is_empty() || !lits2.union(lits3) { + // If we couldn't find suffixes for *any* of the + // alternates, then the entire alternation has to be thrown + // away and any existing members must be frozen. Similarly, + // if the union couldn't complete, stop and freeze. + lits.cut(); + return; + } + } + if !lits.cross_product(&lits2) { + lits.cut(); + } +} + +impl fmt::Debug for Literals { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Literals") + .field("lits", &self.lits) + .field("limit_size", &self.limit_size) + .field("limit_class", &self.limit_class) + .finish() + } +} + +impl Lit { + /// Returns a new complete literal with the bytes given. + pub fn new(bytes: Vec) -> Lit { + Lit { v: bytes, cut: false } + } + + /// Returns a new complete empty literal. + pub fn empty() -> Lit { + Lit { v: vec![], cut: false } + } + + /// Returns true if this literal was "cut." + pub fn is_cut(&self) -> bool { + self.cut + } + + /// Cuts this literal. 
+ pub fn cut(&mut self) { + self.cut = true; + } +} + +impl PartialEq for Lit { + fn eq(&self, other: &Lit) -> bool { + self.v == other.v + } +} + +impl PartialOrd for Lit { + fn partial_cmp(&self, other: &Lit) -> Option { + self.v.partial_cmp(&other.v) + } +} + +impl fmt::Debug for Lit { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if self.is_cut() { + write!(f, "Cut({})", escape_unicode(&self.v)) + } else { + write!(f, "Complete({})", escape_unicode(&self.v)) + } + } +} + +impl AsRef<[u8]> for Lit { + fn as_ref(&self) -> &[u8] { &self.v } +} + +impl ops::Deref for Lit { + type Target = Vec; + fn deref(&self) -> &Vec { &self.v } +} + +impl ops::DerefMut for Lit { + fn deref_mut(&mut self) -> &mut Vec { &mut self.v } +} + +fn position(needle: &[u8], mut haystack: &[u8]) -> Option { + let mut i = 0; + while haystack.len() >= needle.len() { + if needle == &haystack[..needle.len()] { + return Some(i); + } + i += 1; + haystack = &haystack[1..]; + } + None +} + +fn escape_unicode(bytes: &[u8]) -> String { + let show = match ::std::str::from_utf8(bytes) { + Ok(v) => v.to_string(), + Err(_) => escape_bytes(bytes), + }; + let mut space_escaped = String::new(); + for c in show.chars() { + if c.is_whitespace() { + let escaped = if c as u32 <= 0x7F { + escape_byte(c as u8) + } else { + if c as u32 <= 0xFFFF { + format!(r"\u{{{:04x}}}", c as u32) + } else { + format!(r"\U{{{:08x}}}", c as u32) + } + }; + space_escaped.push_str(&escaped); + } else { + space_escaped.push(c); + } + } + space_escaped +} + +fn escape_bytes(bytes: &[u8]) -> String { + let mut s = String::new(); + for &b in bytes { + s.push_str(&escape_byte(b)); + } + s +} + +fn escape_byte(byte: u8) -> String { + use std::ascii::escape_default; + + let escaped: Vec = escape_default(byte).collect(); + String::from_utf8_lossy(&escaped).into_owned() +} + +#[cfg(test)] +mod tests { + use std::fmt; + + use {Expr, ExprBuilder}; + use super::{Literals, Lit, escape_bytes}; + + // To make test failures easier 
to read. + #[derive(Debug, Eq, PartialEq)] + struct Bytes(Vec); + #[derive(Debug, Eq, PartialEq)] + struct Unicode(Vec); + + fn escape_lits(blits: &[Lit]) -> Vec { + let mut ulits = vec![]; + for blit in blits { + ulits.push(ULit { v: escape_bytes(&blit), cut: blit.is_cut() }); + } + ulits + } + + fn create_lits>(it: I) -> Literals { + Literals { + lits: it.into_iter().collect(), + limit_size: 0, + limit_class: 0, + } + } + + // Needs to be pub for 1.3? + #[derive(Clone, Eq, PartialEq)] + pub struct ULit { + v: String, + cut: bool, + } + + impl ULit { + fn is_cut(&self) -> bool { self.cut } + } + + impl fmt::Debug for ULit { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if self.is_cut() { + write!(f, "Cut({})", self.v) + } else { + write!(f, "Complete({})", self.v) + } + } + } + + impl PartialEq for ULit { + fn eq(&self, other: &Lit) -> bool { + self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut() + } + } + + impl PartialEq for Lit { + fn eq(&self, other: &ULit) -> bool { + &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut() + } + } + + #[allow(non_snake_case)] + fn C(s: &'static str) -> ULit { ULit { v: s.to_owned(), cut: true } } + #[allow(non_snake_case)] + fn M(s: &'static str) -> ULit { ULit { v: s.to_owned(), cut: false } } + + fn prefixes(lits: &mut Literals, expr: &Expr) { + lits.union_prefixes(expr); + } + + fn suffixes(lits: &mut Literals, expr: &Expr) { + lits.union_suffixes(expr); + } + + macro_rules! assert_lit_eq { + ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{ + let expected: Vec = vec![$($expected_lit),*]; + let lits = $got_lits; + assert_eq!( + $which(expected.clone()), + $which(escape_lits(lits.literals()))); + assert_eq!( + !expected.is_empty() && expected.iter().all(|l| !l.is_cut()), + lits.all_complete()); + assert_eq!( + expected.iter().any(|l| !l.is_cut()), + lits.any_complete()); + }}; + } + + macro_rules! 
test_lit { + ($name:ident, $which:ident, $re:expr) => { + test_lit!($name, $which, $re,); + }; + ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { + #[test] + fn $name() { + let expr = Expr::parse($re).unwrap(); + let lits = expr.$which(); + assert_lit_eq!(Unicode, lits, $($lit),*); + + let expr = ExprBuilder::new().allow_bytes(true).unicode(false) + .parse($re).unwrap(); + let lits = expr.$which(); + assert_lit_eq!(Bytes, lits, $($lit),*); + } + }; + } + + // ************************************************************************ + // Tests for prefix literal extraction. + // ************************************************************************ + + // Elementary tests. + test_lit!(pfx_one_lit1, prefixes, "a", M("a")); + test_lit!(pfx_one_lit2, prefixes, "abc", M("abc")); + test_lit!(pfx_one_class, prefixes, "[1-4]", + M("1"), M("2"), M("3"), M("4")); + test_lit!(pfx_one_lit_casei1, prefixes, "(?i)a", + M("A"), M("a")); + test_lit!(pfx_one_lit_casei2, prefixes, "(?i)abc", + M("ABC"), M("aBC"), M("AbC"), M("abC"), + M("ABc"), M("aBc"), M("Abc"), M("abc")); + test_lit!(pfx_group1, prefixes, "(a)", M("a")); + test_lit!(pfx_rep_zero_or_one1, prefixes, "a?"); + test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?"); + test_lit!(pfx_rep_zero_or_more1, prefixes, "a*"); + test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*"); + test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a")); + test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc")); + test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a")); + test_lit!(pfx_rep_range1, prefixes, "a{0}"); + test_lit!(pfx_rep_range2, prefixes, "a{0,}"); + test_lit!(pfx_rep_range3, prefixes, "a{0,1}"); + test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a")); + test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa")); + test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a")); + test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa")); + + // Test regexes with concatenations. 
+ test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab")); + test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz")); + test_lit!(pfx_cat3, prefixes, "(?i)[ab]z", + M("AZ"), M("BZ"), M("aZ"), M("bZ"), + M("Az"), M("Bz"), M("az"), M("bz")); + test_lit!(pfx_cat4, prefixes, "[ab][yz]", + M("ay"), M("by"), M("az"), M("bz")); + test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b")); + test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c")); + test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c")); + test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b")); + test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b")); + test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a")); + test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac")); + test_lit!(pfx_cat12, prefixes, "ab+", C("ab")); + test_lit!(pfx_cat13, prefixes, "ab+c", C("ab")); + test_lit!(pfx_cat14, prefixes, "a^", C("a")); + test_lit!(pfx_cat15, prefixes, "$a"); + test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac")); + test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab")); + test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb")); + test_lit!(pfx_cat19, prefixes, "a.z", C("a")); + + // Test regexes with alternations. + test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b")); + test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); + test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz")); + test_lit!(pfx_alt4, prefixes, "a|b*"); + test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b")); + test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)"); + test_lit!(pfx_alt7, prefixes, "(a|b)*c|(a|ab)*c", + C("a"), C("b"), M("c"), C("a"), C("ab"), M("c")); + test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c")); + + // Test regexes with empty assertions. + test_lit!(pfx_empty1, prefixes, "^a", M("a")); + test_lit!(pfx_empty2, prefixes, "^abc", M("abc")); + test_lit!(pfx_empty3, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z")); + + // Make sure some curious regexes have no prefixes. 
+ test_lit!(pfx_nothing1, prefixes, "."); + test_lit!(pfx_nothing2, prefixes, "(?s)."); + test_lit!(pfx_nothing3, prefixes, "^"); + test_lit!(pfx_nothing4, prefixes, "$"); + test_lit!(pfx_nothing6, prefixes, "(?m)$"); + test_lit!(pfx_nothing7, prefixes, r"\b"); + test_lit!(pfx_nothing8, prefixes, r"\B"); + + // Test a few regexes that defeat any prefix literal detection. + test_lit!(pfx_defeated1, prefixes, ".a"); + test_lit!(pfx_defeated2, prefixes, "(?s).a"); + test_lit!(pfx_defeated3, prefixes, "a*b*c*"); + test_lit!(pfx_defeated4, prefixes, "a|."); + test_lit!(pfx_defeated5, prefixes, ".|a"); + test_lit!(pfx_defeated6, prefixes, "a|^"); + test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))"); + test_lit!(pfx_defeated8, prefixes, "$a"); + test_lit!(pfx_defeated9, prefixes, "(?m)$a"); + test_lit!(pfx_defeated10, prefixes, r"\ba"); + test_lit!(pfx_defeated11, prefixes, r"\Ba"); + test_lit!(pfx_defeated12, prefixes, "^*a"); + test_lit!(pfx_defeated13, prefixes, "^+a"); + + test_lit!( + pfx_crazy1, + prefixes, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + C("Mo\\'am"), C("Mu\\'am"), C("Moam"), C("Muam")); + + // ************************************************************************ + // Tests for quiting prefix literal search. + // ************************************************************************ + + macro_rules! 
test_exhausted { + ($name:ident, $which:ident, $re:expr) => { + test_exhausted!($name, $which, $re,); + }; + ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { + #[test] + fn $name() { + let expr = Expr::parse($re).unwrap(); + let mut lits = Literals::empty(); + lits.set_limit_size(20).set_limit_class(10); + $which(&mut lits, &expr); + assert_lit_eq!(Unicode, lits, $($lit),*); + + let expr = ExprBuilder::new().allow_bytes(true).unicode(false) + .parse($re).unwrap(); + let mut lits = Literals::empty(); + lits.set_limit_size(20).set_limit_class(10); + $which(&mut lits, &expr); + assert_lit_eq!(Bytes, lits, $($lit),*); + } + }; + } + + // These test use a much lower limit than the default so that we can + // write test cases of reasonable size. + test_exhausted!(pfx_exhausted1, prefixes, "[a-z]"); + test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A"); + test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A")); + test_exhausted!(pfx_exhausted4, prefixes, "(?i)foobar", + C("FO"), C("fO"), C("Fo"), C("fo")); + test_exhausted!(pfx_exhausted5, prefixes, "(?:ab){100}", + C("abababababababababab")); + test_exhausted!(pfx_exhausted6, prefixes, "(?:(?:ab){100})*cd", + C("ababababab"), M("cd")); + test_exhausted!(pfx_exhausted7, prefixes, "z(?:(?:ab){100})*cd", + C("zababababab"), M("zcd")); + test_exhausted!(pfx_exhausted8, prefixes, "aaaaaaaaaaaaaaaaaaaaz", + C("aaaaaaaaaaaaaaaaaaaa")); + + // ************************************************************************ + // Tests for suffix literal extraction. + // ************************************************************************ + + // Elementary tests. 
+ test_lit!(sfx_one_lit1, suffixes, "a", M("a")); + test_lit!(sfx_one_lit2, suffixes, "abc", M("abc")); + test_lit!(sfx_one_class, suffixes, "[1-4]", + M("1"), M("2"), M("3"), M("4")); + test_lit!(sfx_one_lit_casei1, suffixes, "(?i)a", + M("A"), M("a")); + test_lit!(sfx_one_lit_casei2, suffixes, "(?i)abc", + M("ABC"), M("ABc"), M("AbC"), M("Abc"), + M("aBC"), M("aBc"), M("abC"), M("abc")); + test_lit!(sfx_group1, suffixes, "(a)", M("a")); + test_lit!(sfx_rep_zero_or_one1, suffixes, "a?"); + test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?"); + test_lit!(sfx_rep_zero_or_more1, suffixes, "a*"); + test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*"); + test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a")); + test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc")); + test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a")); + test_lit!(sfx_rep_range1, suffixes, "a{0}"); + test_lit!(sfx_rep_range2, suffixes, "a{0,}"); + test_lit!(sfx_rep_range3, suffixes, "a{0,1}"); + test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a")); + test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa")); + test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a")); + test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa")); + + // Test regexes with concatenations. 
+ test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab")); + test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz")); + test_lit!(sfx_cat3, suffixes, "(?i)[ab]z", + M("AZ"), M("Az"), M("BZ"), M("Bz"), + M("aZ"), M("az"), M("bZ"), M("bz")); + test_lit!(sfx_cat4, suffixes, "[ab][yz]", + M("ay"), M("az"), M("by"), M("bz")); + test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b")); + test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c")); + test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c")); + test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc")); + test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b")); + test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a")); + test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac")); + test_lit!(sfx_cat12, suffixes, "ab+", C("b")); + test_lit!(sfx_cat13, suffixes, "ab+c", C("bc")); + test_lit!(sfx_cat14, suffixes, "a^"); + test_lit!(sfx_cat15, suffixes, "$a", C("a")); + test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac")); + test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc")); + test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb")); + test_lit!(sfx_cat19, suffixes, "a.z", C("z")); + + // Test regexes with alternations. + test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b")); + test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); + test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz")); + test_lit!(sfx_alt4, suffixes, "a|b*"); + test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b")); + test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)"); + test_lit!(sfx_alt7, suffixes, "(a|b)*c|(a|ab)*c", + C("ac"), C("bc"), M("c"), C("ac"), C("abc"), M("c")); + test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c")); + + // Test regexes with empty assertions. + test_lit!(sfx_empty1, suffixes, "a$", M("a")); + + // Make sure some curious regexes have no suffixes. 
+ test_lit!(sfx_nothing1, suffixes, "."); + test_lit!(sfx_nothing2, suffixes, "(?s)."); + test_lit!(sfx_nothing3, suffixes, "^"); + test_lit!(sfx_nothing4, suffixes, "$"); + test_lit!(sfx_nothing6, suffixes, "(?m)$"); + test_lit!(sfx_nothing7, suffixes, r"\b"); + test_lit!(sfx_nothing8, suffixes, r"\B"); + + // Test a few regexes that defeat any suffix literal detection. + test_lit!(sfx_defeated1, suffixes, "a."); + test_lit!(sfx_defeated2, suffixes, "(?s)a."); + test_lit!(sfx_defeated3, suffixes, "a*b*c*"); + test_lit!(sfx_defeated4, suffixes, "a|."); + test_lit!(sfx_defeated5, suffixes, ".|a"); + test_lit!(sfx_defeated6, suffixes, "a|^"); + test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c))."); + test_lit!(sfx_defeated8, suffixes, "a^"); + test_lit!(sfx_defeated9, suffixes, "(?m)a$"); + test_lit!(sfx_defeated10, suffixes, r"a\b"); + test_lit!(sfx_defeated11, suffixes, r"a\B"); + test_lit!(sfx_defeated12, suffixes, "a^*"); + test_lit!(sfx_defeated13, suffixes, "a^+"); + + // These test use a much lower limit than the default so that we can + // write test cases of reasonable size. + test_exhausted!(sfx_exhausted1, suffixes, "[a-z]"); + test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*"); + test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z")); + test_exhausted!(sfx_exhausted4, suffixes, "(?i)foobar", + C("AR"), C("Ar"), C("aR"), C("ar")); + test_exhausted!(sfx_exhausted5, suffixes, "(?:ab){100}", + C("abababababababababab")); + test_exhausted!(sfx_exhausted6, suffixes, "cd(?:(?:ab){100})*", + C("ababababab"), M("cd")); + test_exhausted!(sfx_exhausted7, suffixes, "cd(?:(?:ab){100})*z", + C("abababababz"), M("cdz")); + test_exhausted!(sfx_exhausted8, suffixes, "zaaaaaaaaaaaaaaaaaaaa", + C("aaaaaaaaaaaaaaaaaaaa")); + + // ************************************************************************ + // Tests for generating unambiguous literal sets. + // ************************************************************************ + + macro_rules! 
test_unamb { + ($name:ident, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec = + $given + .into_iter() + .map(|ul| { + let cut = ul.is_cut(); + Lit { v: ul.v.into_bytes(), cut: cut } + }) + .collect(); + let lits = create_lits(given); + let got = lits.unambiguous_prefixes(); + assert_eq!($expected, escape_lits(got.literals())); + } + }; + } + + test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]); + test_unamb!(unambiguous2, + vec![M("zaaaaaa"), M("aa")], vec![C("aa"), C("z")]); + test_unamb!(unambiguous3, + vec![M("Sherlock"), M("Watson")], + vec![M("Sherlock"), M("Watson")]); + test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]); + test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]); + test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]); + test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]); + test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]); + test_unamb!(unambiguous9, + vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")], + vec![C("a"), C("b"), C("c")]); + test_unamb!(unambiguous10, + vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")], + vec![C("Mo"), C("Mu")]); + test_unamb!(unambiguous11, + vec![M("zazb"), M("azb")], vec![C("azb"), C("z")]); + test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]); + + // ************************************************************************ + // Tests for longest common prefix. + // ************************************************************************ + + macro_rules! 
test_lcp { + ($name:ident, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec = + $given + .into_iter() + .map(|s: &str| Lit { + v: s.to_owned().into_bytes(), + cut: false, + }) + .collect(); + let lits = create_lits(given); + let got = lits.longest_common_prefix(); + assert_eq!($expected, escape_bytes(got)); + } + }; + } + + test_lcp!(lcp1, vec!["a"], "a"); + test_lcp!(lcp2, vec![], ""); + test_lcp!(lcp3, vec!["a", "b"], ""); + test_lcp!(lcp4, vec!["ab", "ab"], "ab"); + test_lcp!(lcp5, vec!["ab", "a"], "a"); + test_lcp!(lcp6, vec!["a", "ab"], "a"); + test_lcp!(lcp7, vec!["ab", "b"], ""); + test_lcp!(lcp8, vec!["b", "ab"], ""); + test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba"); + test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], ""); + test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], ""); + test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f"); + + // ************************************************************************ + // Tests for longest common suffix. + // ************************************************************************ + + macro_rules! 
test_lcs { + ($name:ident, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec = + $given + .into_iter() + .map(|s: &str| Lit { + v: s.to_owned().into_bytes(), + cut: false, + }) + .collect(); + let lits = create_lits(given); + let got = lits.longest_common_suffix(); + assert_eq!($expected, escape_bytes(got)); + } + }; + } + + test_lcs!(lcs1, vec!["a"], "a"); + test_lcs!(lcs2, vec![], ""); + test_lcs!(lcs3, vec!["a", "b"], ""); + test_lcs!(lcs4, vec!["ab", "ab"], "ab"); + test_lcs!(lcs5, vec!["ab", "a"], ""); + test_lcs!(lcs6, vec!["a", "ab"], ""); + test_lcs!(lcs7, vec!["ab", "b"], "b"); + test_lcs!(lcs8, vec!["b", "ab"], "b"); + test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo"); + test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], ""); + test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], ""); + test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b"); +} diff --git a/run-bench b/run-bench index 8a7873d693..e9a3be664b 100755 --- a/run-bench +++ b/run-bench @@ -1,8 +1,12 @@ #!/bin/bash -if [ $# = 0 ] || [ $1 = '-h' ]; then +usage() { echo "Usage: $(basename $0) [rust | rust-bytes | rust-plugin | pcre | onig]" >&2 exit 1 +} + +if [ $# = 0 ] || [ $1 = '-h' ] || [ $1 = '--help' ]; then + usage fi which="$1" @@ -43,4 +47,7 @@ case $which in --features re-onig \ "$@" ;; + *) + usage + ;; esac diff --git a/scripts/frequencies.py b/scripts/frequencies.py new file mode 100755 index 0000000000..c800b0bb39 --- /dev/null +++ b/scripts/frequencies.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python + +# This does simple normalized frequency analysis on UTF-8 encoded text. The +# result of the analysis is translated to a ranked list, where every byte is +# assigned a rank. This list is written to src/freqs.rs. 
+# +# Currently, the frequencies are generated from the following corpora: +# +# * The CIA world fact book +# * The source code of rustc +# * Septuaginta + +from __future__ import absolute_import, division, print_function + +import argparse +from collections import Counter +import sys + +preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// NOTE: The following code was generated by "scripts/frequencies.py", do not +// edit directly +''' + + +def eprint(*args, **kwargs): + kwargs['file'] = sys.stderr + print(*args, **kwargs) + + +def main(): + p = argparse.ArgumentParser() + p.add_argument('corpus', metavar='FILE', nargs='+') + args = p.parse_args() + + # Get frequency counts of each byte. + freqs = Counter() + for i in range(0, 256): + freqs[i] = 0 + + eprint('reading entire corpus into memory') + corpus = [] + for fpath in args.corpus: + corpus.append(open(fpath, 'rb').read()) + + eprint('computing byte frequencies') + for c in corpus: + for byte in c: + freqs[byte] += 1.0 / float(len(c)) + + eprint('writing Rust code') + # Get the rank of each byte. A lower rank => lower relative frequency. + rank = [0] * 256 + for i, (byte, _) in enumerate(freqs.most_common()): + # print(byte) + rank[byte] = 255 - i + + # Forcefully set the highest rank possible for bytes that start multi-byte + # UTF-8 sequences. The idea here is that a continuation byte will be more + # discerning in a homogeneous haystack. + for byte in range(0xC0, 0xFF + 1): + rank[byte] = 255 + + # Now write Rust.
+ olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = ['] + for byte in range(256): + olines.append(' %3d, // %r' % (rank[byte], chr(byte))) + olines.append('];') + + print(preamble) + print('\n'.join(olines)) + +if __name__ == '__main__': + main() diff --git a/src/backtrack.rs b/src/backtrack.rs index 88d829313d..592cf9f759 100644 --- a/src/backtrack.rs +++ b/src/backtrack.rs @@ -26,8 +26,8 @@ // the bitset has to be zeroed on each execution, which becomes quite expensive // on large bitsets. -use exec::Search; use input::{Input, InputAt}; +use params::Params; use prog::{Program, InstPtr}; /// Returns true iff the given regex and input should be executed by this @@ -50,10 +50,10 @@ const MAX_INPUT_SIZE: usize = 128 * (1 << 10); /// A backtracking matching engine. #[derive(Debug)] -pub struct Backtrack<'a, 'b, 'c: 'b, 'm: 'b, 'r, I> { +pub struct Backtrack<'a, 'r, 'p, 'c: 'p, 'm: 'p, I> { prog: &'r Program, input: I, - search: &'b mut Search<'m, 'c>, + params: &'p mut Params<'c, 'm>, m: &'a mut BacktrackCache, } @@ -84,24 +84,24 @@ enum Job { SaveRestore { slot: usize, old_pos: Option }, } -impl<'a, 'b, 'c, 'm, 'r, I: Input> Backtrack<'a, 'b, 'c, 'r, 'm, I> { +impl<'a, 'r, 'p, 'c, 'm, I: Input> Backtrack<'a, 'r, 'p, 'c, 'm, I> { /// Execute the backtracking matching engine. /// /// If there's a match, `exec` returns `true` and populates the given /// captures accordingly. 
pub fn exec( prog: &'r Program, - search: &'b mut Search<'c, 'm>, + cache: &mut BacktrackCache, + params: &'p mut Params<'c, 'm>, input: I, start: usize, ) -> bool { let start = input.at(start); - let mut m = prog.cache_backtrack(); let mut b = Backtrack { prog: prog, input: input, - search: search, - m: &mut m, + params: params, + m: cache, }; b.exec_(start) } @@ -155,18 +155,19 @@ impl<'a, 'b, 'c, 'm, 'r, I: Input> Backtrack<'a, 'b, 'c, 'r, 'm, I> { loop { if !self.prog.prefixes.is_empty() { at = match self.input.prefix_at(&self.prog.prefixes, at) { - None => return false, + None => break, Some(at) => at, }; } - if self.backtrack(at) { + if self.backtrack(at) && self.prog.matches.len() == 1 { return true; } if at.is_end() { - return false; + break; } at = self.input.at(at.next_pos()); } + self.params.is_match() } /// The main backtracking loop starting at the given input position. @@ -185,18 +186,18 @@ impl<'a, 'b, 'c, 'm, 'r, I: Input> Backtrack<'a, 'b, 'c, 'r, 'm, I> { if self.step(ip, at) { // Only quit if we're matching one regex. // If we're matching a regex set, then mush on and - // try to find other matches. - if !self.search.find_many_matches() { + // try to find other matches (if we want them). + if self.prog.matches.len() == 1 { return true; } } } Job::SaveRestore { slot, old_pos } => { - self.search.set_capture(slot, old_pos); + self.params.set_capture(slot, old_pos); } } } - false + self.params.is_match() } fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool { @@ -208,11 +209,11 @@ impl<'a, 'b, 'c, 'm, 'r, I: Input> Backtrack<'a, 'b, 'c, 'r, 'm, I> { // in place. match self.prog[ip] { Match(slot) => { - self.search.set_match(slot); + self.params.set_match(slot); return true; } Save(ref inst) => { - if let Some(old_pos) = self.search.capture(inst.slot) { + if let Some(&old_pos) = self.params.captures().get(inst.slot) { // If this path doesn't work out, then we save the old // capture index (if one exists) in an alternate // job. 
If the next path fails, then the alternate @@ -221,7 +222,7 @@ impl<'a, 'b, 'c, 'm, 'r, I: Input> Backtrack<'a, 'b, 'c, 'r, 'm, I> { slot: inst.slot, old_pos: old_pos, }); - self.search.set_capture(inst.slot, Some(at.pos())); + self.params.set_capture(inst.slot, Some(at.pos())); } ip = inst.goto; } diff --git a/src/compile.rs b/src/compile.rs index 8f1efdf2b2..02d54c5634 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -138,14 +138,20 @@ impl Compiler { // add a `.*?` before the first capture group. // Other matching engines handle this by baking the logic into the // matching engine itself. + let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; self.compiled.is_anchored_start = expr.is_anchored_start(); self.compiled.is_anchored_end = expr.is_anchored_end(); if self.compiled.needs_dotstar() { - try!(self.c_dotstar()); + dotstar_patch = try!(self.c_dotstar()); + self.compiled.start = dotstar_patch.entry; } self.compiled.captures = vec![None]; - self.compiled.start = self.insts.len(); let patch = try!(self.c_capture(0, expr)); + if self.compiled.needs_dotstar() { + self.fill(dotstar_patch.hole, patch.entry); + } else { + self.compiled.start = patch.entry; + } self.fill_to_next(patch.hole); self.compiled.matches = vec![self.insts.len()]; self.push_compiled(Inst::Match(0)); @@ -162,11 +168,15 @@ impl Compiler { exprs.iter().all(|e| e.is_anchored_start()); self.compiled.is_anchored_end = exprs.iter().all(|e| e.is_anchored_end()); + let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; if self.compiled.needs_dotstar() { - try!(self.c_dotstar()); + dotstar_patch = try!(self.c_dotstar()); + self.compiled.start = dotstar_patch.entry; + } else { + self.compiled.start = 0; // first instruction is always split } + self.fill_to_next(dotstar_patch.hole); - self.compiled.start = self.insts.len(); for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() { let split = self.push_split_hole(); let Patch { hole, entry } = try!(self.c_capture(0, expr)); @@ 
-182,7 +192,6 @@ impl Compiler { self.fill_to_next(hole); self.compiled.matches.push(self.insts.len()); self.push_compiled(Inst::Match(i)); - self.compile_finish() } @@ -214,11 +223,9 @@ impl Compiler { ]) } AnyByte => { - assert!(!self.compiled.only_utf8()); self.c_class_bytes(&[ByteRange { start: 0, end: 0xFF }]) } AnyByteNoNL => { - assert!(!self.compiled.only_utf8()); self.c_class_bytes(&[ ByteRange { start: 0, end: 0x9 }, ByteRange { start: 0xB, end: 0xFF }, @@ -309,8 +316,8 @@ impl Compiler { } } - fn c_dotstar(&mut self) -> result::Result<(), Error> { - let patch = if !self.compiled.only_utf8() { + fn c_dotstar(&mut self) -> Result { + Ok(if !self.compiled.only_utf8() { try!(self.c(&Expr::Repeat { e: Box::new(Expr::AnyByte), r: Repeater::ZeroOrMore, @@ -322,13 +329,11 @@ impl Compiler { r: Repeater::ZeroOrMore, greedy: false, })) - }; - self.fill_to_next(patch.hole); - Ok(()) + }) } fn c_literal(&mut self, chars: &[char], casei: bool) -> Result { - assert!(!chars.is_empty()); + debug_assert!(!chars.is_empty()); let mut chars: Box> = if self.compiled.is_reverse { Box::new(chars.iter().rev()) @@ -374,7 +379,7 @@ impl Compiler { } fn c_bytes(&mut self, bytes: &[u8], casei: bool) -> Result { - assert!(!bytes.is_empty()); + debug_assert!(!bytes.is_empty()); let mut bytes: Box> = if self.compiled.is_reverse { Box::new(bytes.iter().rev()) @@ -402,7 +407,7 @@ impl Compiler { } fn c_class_bytes(&mut self, ranges: &[ByteRange]) -> Result { - assert!(!ranges.is_empty()); + debug_assert!(!ranges.is_empty()); let first_split_entry = self.insts.len(); let mut holes = vec![]; @@ -451,7 +456,8 @@ impl Compiler { } fn c_alternate(&mut self, exprs: &[Expr]) -> Result { - assert!(exprs.len() >= 2, "alternates must have at least 2 exprs"); + debug_assert!( + exprs.len() >= 2, "alternates must have at least 2 exprs"); // Initial entry point is always the first split. 
let first_split_entry = self.insts.len(); @@ -892,9 +898,9 @@ impl<'a, 'b> CompileClass<'a, 'b> { })); } from_inst = self.c.insts.len().checked_sub(1).unwrap(); - assert!(from_inst < ::std::usize::MAX); + debug_assert!(from_inst < ::std::usize::MAX); } - assert!(from_inst < ::std::usize::MAX); + debug_assert!(from_inst < ::std::usize::MAX); Ok(Patch { hole: last_hole, entry: from_inst }) } } @@ -987,7 +993,7 @@ impl ByteClassSet { } fn set_range(&mut self, start: u8, end: u8) { - assert!(start <= end); + debug_assert!(start <= end); if start > 0 { self.0[start as usize - 1] = true; } diff --git a/src/dfa.rs b/src/dfa.rs index 738f2fa29a..a43b0ba483 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -45,7 +45,7 @@ use std::collections::HashMap; use std::fmt; use std::mem; -use exec::Search; +use params::Params; use prog::{Inst, Program}; use sparse::SparseSet; @@ -157,7 +157,7 @@ pub struct DfaCache { /// N.B. We only use a single lifetime here since all pointers are taken /// from the same cache. #[derive(Debug)] -pub struct Dfa<'a, 'b, 'c: 'b, 'm: 'b> { +pub struct Dfa<'a, 'p, 'c: 'p, 'm: 'p> { /// prog contains the NFA instruction opcodes. DFA execution uses either /// the `dfa` instructions or the `dfa_reverse` instructions from /// `exec::Executor`. (It never uses `Executor.prog`, which may have @@ -169,7 +169,7 @@ pub struct Dfa<'a, 'b, 'c: 'b, 'm: 'b> { /// The search configuration, which includes capture groups. It also /// includes space for indicating which regex matched if executing a /// regex set. - search: &'b mut Search<'c, 'm>, + params: &'p mut Params<'c, 'm>, /// The current position in the input. at: usize, /// The last state that matched. @@ -325,7 +325,7 @@ impl DfaCache { } } -impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { +impl<'a, 'p, 'c, 'm> Dfa<'a, 'p, 'c, 'm> { /// The main entry point to executing a DFA, which returns the *end* of /// a match if one exists, using Perl's "leftmost-first" semantics. 
/// @@ -340,21 +340,20 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { /// at the beginning of `text` or not (i.e., for empty assertions). pub fn exec( prog: &'a Program, - search: &'b mut Search<'c, 'm>, + cache: &mut DfaCache, + params: &'p mut Params<'c, 'm>, text: &[u8], at: usize, ) -> DfaResult { // Retrieve our DFA cache from the program. If another thread tries to // execute this DFA *simultaneously*, then a new independent cache is // created. - let mut _cache = prog.cache_dfa(); - let mut cache = &mut **_cache; cache.resize(prog.len()); let mut dfa = Dfa { prog: prog, start: 0, // filled in below - search: search, + params: params, at: at, last_match_si: STATE_UNKNOWN, last_cache_flush: at, @@ -376,14 +375,14 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text) }; if result.is_match() { - if dfa.search.find_one_match() { - dfa.search.set_match(0); + if prog.matches.len() == 1 { + dfa.params.set_match(0); } else { debug_assert!(dfa.last_match_si != STATE_UNKNOWN); debug_assert!(dfa.last_match_si != STATE_DEAD); for &ip in &dfa.states[dfa.last_match_si as usize].insts { if let Inst::Match(slot) = dfa.prog[ip as usize] { - dfa.search.set_match(slot); + dfa.params.set_match(slot); } } } @@ -473,11 +472,11 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { si = next_si; if self.states[si as usize].is_match { self.last_match_si = si; - if self.search.quit_after_first_match() { - return DfaResult::Match; - } result = DfaResult::Match; - self.search.set_end(Some(self.at)); + self.params.set_end(Some(self.at)); + if self.params.style().quit_after_first_match() { + return result; + } } self.at += 1; } @@ -492,11 +491,11 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { } if self.states[si as usize].is_match { self.last_match_si = si; - if self.search.quit_after_first_match() { - return DfaResult::Match; - } result = DfaResult::Match; - self.search.set_end(Some(text.len())); + self.params.set_end(Some(text.len())); + if 
self.params.style().quit_after_first_match() { + return result; + } } result } @@ -540,11 +539,11 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { si = next_si; if self.states[si as usize].is_match { self.last_match_si = si; - if self.search.quit_after_first_match() { - return DfaResult::NoMatch; - } result = DfaResult::Match; - self.search.set_start(Some(self.at+1)); + self.params.set_start(Some(self.at+1)); + if self.params.style().quit_after_first_match() { + return result; + } } } // Run the DFA once more on the special EOF senitnel value. @@ -558,11 +557,11 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { } if self.states[si as usize].is_match { self.last_match_si = si; - if self.search.quit_after_first_match() { - return DfaResult::Match; - } result = DfaResult::Match; - self.search.set_start(Some(0)); + self.params.set_start(Some(0)); + if self.params.style().quit_after_first_match() { + return result; + } } result } @@ -644,17 +643,11 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { is_match = true; if !self.continue_past_first_match() { break; - } else if !self.search.find_one_match() + } else if self.prog.matches.len() > 1 && !qnext.contains_ip(ip as usize) { // If we are continuing on to find other matches, // then keep a record of the match states we've seen. qnext.add(ip); - // BREADCRUMBS: - // Perhaps we need another sparse set here and track - // these "recorded" matches separately. They should - // still make their way into cached states, but perhaps - // they shouldn't prevent a DEAD state from - // occurring. } } Bytes(ref inst) => { @@ -666,7 +659,7 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { } } let mut cache = true; - if b.is_eof() && !self.search.find_one_match() { + if b.is_eof() && self.prog.matches.len() > 1 { // If we're processing the last byte of the input and we're // matching a regex set, then make the next state contain the // previous states transitions. 
We do this so that the main @@ -1054,7 +1047,8 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { si => return Some(si), } q.clear(); - self.follow_epsilons(0, q, start_flags); + let start = usize_to_u32(self.prog.start); + self.follow_epsilons(start, q, start_flags); // Start states can never be match states because we delay every match // by one byte. Given an empty string and an empty match, the match // won't actually occur until the DFA processes the special EOF @@ -1129,7 +1123,7 @@ impl<'a, 'b, 'c, 'm> Dfa<'a, 'b, 'c, 'm> { /// the longest match (for reverse search) or all possible matches (for /// regex sets). fn continue_past_first_match(&self) -> bool { - self.prog.is_reverse || !self.search.find_one_match() + self.prog.is_reverse || self.prog.matches.len() > 1 } /// Approximate size returns the approximate heap space currently used by diff --git a/src/error.rs b/src/error.rs index 8e1cbaaa2d..e014a37aba 100644 --- a/src/error.rs +++ b/src/error.rs @@ -20,7 +20,10 @@ pub enum Error { /// The compiled program exceeded the set size limit. /// The argument is the size limit imposed. CompiledTooBig(usize), - /// An invalid set is a regex set with fewer than 2 regular expressions. + /// **DEPRECATED:** Will be removed on next major version bump. + /// + /// This error is no longer used. (A `RegexSet` can now contain zero or + /// more regular expressions.) InvalidSet, /// Hints that destructuring should not be exhaustive. /// diff --git a/src/exec.rs b/src/exec.rs index b436a9f92a..502313d0fc 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -9,113 +9,24 @@ // except according to those terms. 
use std::collections::HashMap; +use std::ops::Deref; use std::sync::Arc; -use syntax; +use mempool::Pool; +use syntax::{Expr, ExprBuilder, Literals}; -use backtrack::{self, Backtrack}; +use backtrack::{self, Backtrack, BacktrackCache}; use compile::Compiler; -use dfa::{self, Dfa, DfaResult}; +use dfa::{self, Dfa, DfaCache, DfaResult}; use error::Error; use input::{ByteInput, CharInput}; -use literals::Literals; -use nfa::Nfa; +use literals::LiteralSearcher; +use nfa::{Nfa, NfaCache}; +use params::Params; use prog::{Program, InstPtr}; use re_bytes; use re_unicode; - -pub type CaptureSlots<'a> = &'a mut [CaptureSlot]; - -pub type CaptureSlot = Option; - -/// The parameters to running a regex search over some text. -#[derive(Debug)] -pub struct Search<'caps, 'matches> { - /// The matching engine writes capture locations to this slice. - /// - /// Note that some matching engines, like the DFA, have limited support - /// for this. The DFA can only fill in one capture location (the end - /// location of the match). - pub captures: CaptureSlots<'caps>, - /// The matching engine indicates which match instructions were executed - /// when searching stopped. - /// - /// In standard searches, there is exactly one value in this slice and it - /// should be initialized to `false`. When executing sets of regexes, - /// there should be a location for each regex. - matches: &'matches mut [bool], - /// Whether the matching engine has recorded any match. 
- matched_any: bool, -} - -impl<'caps, 'matches> Search<'caps, 'matches> { - pub fn new( - captures: CaptureSlots<'caps>, - matches: &'matches mut [bool], - ) -> Search<'caps, 'matches> { - Search { - captures: captures, - matches: matches, - matched_any: false, - } - } - - pub fn quit_after_first_match(&self) -> bool { - self.captures.is_empty() && self.matches.len() <= 1 - } - - pub fn find_many_matches(&self) -> bool { - self.matches.len() > 1 - } - - pub fn find_one_match(&self) -> bool { - self.matches.len() == 1 - } - - pub fn matched_all(&self) -> bool { - self.matches.iter().all(|m| *m) - } - - pub fn set_match(&mut self, match_slot: usize) { - self.matched_any = true; - if let Some(old) = self.matches.get_mut(match_slot) { - *old = true; - } - } - - pub fn capture(&self, i: usize) -> Option { - self.captures.get(i).map(|&slot| slot) - } - - pub fn set_start(&mut self, slot: CaptureSlot) { - self.set_capture(0, slot); - } - - pub fn set_end(&mut self, slot: CaptureSlot) { - self.set_capture(1, slot); - } - - pub fn set_capture(&mut self, i: usize, slot: CaptureSlot) { - if let Some(old_slot) = self.captures.get_mut(i) { - *old_slot = slot; - } - } - - pub fn copy_captures_from(&mut self, caps: &[CaptureSlot]) { - for (slot, val) in self.captures.iter_mut().zip(caps.iter()) { - *slot = *val; - } - } - - pub fn reset(&mut self) { - for slot in self.captures.iter_mut() { - *slot = None; - } - for m in self.matches.iter_mut() { - *m = false; - } - } -} +use set; /// Exec manages the execution of a regular expression. /// @@ -131,7 +42,7 @@ pub struct Exec { /// /// N.B. It is not possibly to make this byte-based from the public API. /// It is only used for testing byte based programs in the NFA simulations. - prog: Program, + nfa: Program, /// A compiled byte based program for DFA execution. This is only used /// if a DFA can be executed. (Currently, only word boundary assertions are /// not supported.) 
Note that this program contains an embedded `.*?` @@ -144,6 +55,11 @@ pub struct Exec { dfa_reverse: Program, /// Set to true if and only if the DFA can be executed. can_dfa: bool, + /// A set of suffix literals extracted from the regex. + /// + /// Prefix literals are stored on the `Program`, since they are used inside + /// the matching engines. + suffixes: LiteralSearcher, /// A preference for matching engine selection. /// /// This defaults to Automatic, which means the matching engine is selected @@ -154,6 +70,8 @@ pub struct Exec { /// either is capable of executing every compiled program on any input /// size. match_engine: MatchEngine, + /// Caches for the various matching engines. + cache: ProgramPool, } /// Facilitates the construction of an executor by exposing various knobs @@ -265,51 +183,52 @@ impl ExecBuilder { /// Build an executor that can run a regular expression. pub fn build(self) -> Result { if self.res.is_empty() { - return Err(Error::InvalidSet); + return Ok(Exec { + res: vec![], + nfa: Program::new(), + dfa: Program::new(), + dfa_reverse: Program::new(), + can_dfa: false, + suffixes: LiteralSearcher::empty(), + match_engine: MatchEngine::Automatic, + cache: ProgramPool::new(), + }); } - let mut exprs = vec![]; - for re in &self.res { - let parser = - syntax::ExprBuilder::new() - .allow_bytes(!self.only_utf8) - .unicode(self.only_utf8); - exprs.push(try!(parser.parse(re))); - } - let mut prog = try!( + let parsed = try!(parse(&self.res, self.only_utf8)); + let mut nfa = try!( Compiler::new() .size_limit(self.size_limit) .bytes(self.bytes) .only_utf8(self.only_utf8) - .compile(&exprs)); + .compile(&parsed.exprs)); let mut dfa = try!( Compiler::new() .size_limit(self.size_limit) .dfa(true) .only_utf8(self.only_utf8) - .compile(&exprs)); + .compile(&parsed.exprs)); let dfa_reverse = try!( Compiler::new() .size_limit(self.size_limit) .dfa(true) .only_utf8(self.only_utf8) .reverse(true) - .compile(&exprs)); - - // Compute literal prefixes for only 
`prog`, which is likely a Unicode - // based program. Literal prefix extract currently works better on - // Unicode programs. - prog.prefixes = Literals::prefixes(&prog); - // And give it to the DFA too, which can use Unicode prefixes even - // though the program itself is byte based. - dfa.prefixes = prog.prefixes.clone(); + .compile(&parsed.exprs)); + + let prefixes = parsed.prefixes.unambiguous_prefixes(); + let suffixes = parsed.suffixes.unambiguous_suffixes(); + nfa.prefixes = LiteralSearcher::prefixes(prefixes); + dfa.prefixes = nfa.prefixes.clone(); let can_dfa = dfa::can_exec(&dfa); Ok(Exec { res: self.res, - prog: prog, + nfa: nfa, dfa: dfa, dfa_reverse: dfa_reverse, can_dfa: can_dfa, + suffixes: LiteralSearcher::suffixes(suffixes), match_engine: self.match_engine, + cache: ProgramPool::new(), }) } } @@ -334,61 +253,81 @@ impl Exec { /// choosing the engine to use. If self.match_engine is Nfa or Backtrack, /// then that engine is always used. Otherwise, one is selected /// automatically. - pub fn exec<'c, 'm>( + pub fn exec( &self, - search: &mut Search<'c, 'm>, + params: &mut Params, text: &[u8], start: usize, ) -> bool { + // An empty regular expression never matches. + if self.nfa.insts.is_empty() { + return false; + } + // If we have prefix/suffix literals and the regex is anchored, then + // we should be able to detect certain classes of failed matches + // very quickly. + // + // But don't do this on very short haystacks, since detecting a + // non-match on short haystack should be fast anyway. + if text.len() > 256 && !self.is_anchor_match(text, start) { + return false; + } // Why isn't the DFA or literal engine checked for here? Well, it's // only possible to execute those engines in exec_auto. See comment on // MatchEngine below for more details. 
match self.match_engine { - MatchEngine::Automatic => self.exec_auto(search, text, start), - MatchEngine::Backtrack => self.exec_backtrack(search, text, start), - MatchEngine::Nfa => self.exec_nfa(search, text, start), + MatchEngine::Automatic => self.exec_auto(params, text, start), + MatchEngine::Backtrack => self.exec_backtrack(params, text, start), + MatchEngine::Nfa => self.exec_nfa(params, text, start), } } /// Like exec, but always selects the engine automatically. - fn exec_auto<'c, 'm>( + fn exec_auto( &self, - search: &mut Search<'c, 'm>, + params: &mut Params, text: &[u8], start: usize, ) -> bool { - if search.captures.len() <= 2 && self.prog.prefixes.at_match() { + if params.captures().len() <= 2 && self.nfa.prefixes.complete() { // We should be able to execute the literal engine even if there // are more captures by falling back to the NFA engine after a // match. However, that's effectively what the NFA engine does // already (since it will use the literal engine if it exists). - self.exec_literals(search, text, start) + self.exec_literals(&self.nfa.prefixes, params, text, start) } else if self.can_dfa { - self.exec_dfa(search, text, start) + self.exec_dfa(params, text, start) } else { - self.exec_auto_nfa(search, text, start) + self.exec_auto_nfa(params, text, start) } } /// Like exec, but always tries to execute the lazy DFA. /// /// Note that self.can_dfa must be true. This will panic otherwise. 
- fn exec_dfa<'a, 'c, 'm>( + fn exec_dfa<'a>( &self, - search: &'a mut Search<'c, 'm>, + params: &mut Params, text: &[u8], start: usize, ) -> bool { debug_assert!(self.can_dfa); - match Dfa::exec(&self.dfa, search, text, start) { + if self.should_suffix_scan() { + return self.exec_dfa_reverse_first(params, text, start); + } + let mut cache = self.cache.get_ref(); + match Dfa::exec(&self.dfa, &mut cache.dfa, params, text, start) { DfaResult::Match => {} // fallthrough DfaResult::NoMatch => return false, DfaResult::Quit => { - search.reset(); - return self.exec_auto_nfa(search, text, start); + params.reset(); + return self.exec_auto_nfa(params, text, start); } } - let match_end = match search.captures.get(1) { + if params.style().match_only() { + return true; + } + let match_end = match params.captures().get(1) { Some(&Some(i)) => i, // The DFA returned true for a match, but did not set any capture // location because the caller didn't ask for them. Therefore, we @@ -401,83 +340,188 @@ impl Exec { if start == match_end { // Be careful... If the caller wants sub-captures, than we are // obliged to run the NFA to get them. - if search.captures.len() == 2 { + if params.captures().len() == 2 { // The caller only needs the start/end, so we can avoid the // NFA here. - search.captures[0] = Some(start); - search.captures[1] = Some(start); + params.set_start(Some(start)); + params.set_end(Some(start)); return true; } - return self.exec_auto_nfa(search, text, start); + return self.exec_auto_nfa(params, text, start); } // OK, now we find the start of the match by running the DFA backwards // on the text. We *start* the search at the end of the match. 
let result = Dfa::exec( - &self.dfa_reverse, search, &text[start..], match_end - start); + &self.dfa_reverse, + &mut cache.dfa_reverse, + params, + &text[start..], + match_end - start); match result { DfaResult::Match => {} // fallthrough DfaResult::NoMatch => { - panic!("BUG: forward match implies backward match"); + panic!("BUG: forward match implies reverse match"); } DfaResult::Quit => { - search.reset(); - return self.exec_auto_nfa(search, text, start); + params.reset(); + return self.exec_auto_nfa(params, text, start); } } - let match_start = match search.captures.get(0) { + let match_start = match params.captures().get(0) { Some(&Some(i)) => start + i, _ => panic!("BUG: early match can't happen on reverse search"), }; - if search.captures.len() == 2 { + if params.captures().len() == 2 { // If the caller doesn't care about capture locations, then we can // avoid running the NFA to fill them in. - search.captures[0] = Some(match_start); - search.captures[1] = Some(match_end); + params.set_start(Some(match_start)); + params.set_end(Some(match_end)); return true; } - self.exec_auto_nfa(search, text, match_start) + self.exec_auto_nfa(params, text, match_start) + } + + /// Like exec_dfa, but tries executing the DFA in reverse from suffix + /// literal matches. + /// + /// Note that self.can_dfa must be true. This will panic otherwise. + fn exec_dfa_reverse_first( + &self, + params: &mut Params, + text: &[u8], + start: usize, + ) -> bool { + let mut cache = self.cache.get_ref(); + let lcs = self.suffixes.lcs(); + + let mut end = start; + while end <= text.len() { + end = end + match lcs.find(&text[end..]) { + None => return false, + Some(e) => e + lcs.len(), + }; + params.set_end(Some(end)); // in case we quit early + + // Search in reverse from the end of the suffix match. 
+ let result = Dfa::exec( + &self.dfa_reverse, + &mut cache.dfa_reverse, + params, + &text[start..end], + end - start); + let match_start = match result { + DfaResult::Match => match params.captures().get(0) { + Some(&Some(i)) => start + i, + // We know we have a match, but the caller didn't ask + // for any captures, so we can quit now. + _ => return true, + }, + DfaResult::NoMatch => continue, + DfaResult::Quit => { + params.reset(); + return self.exec_auto_nfa(params, text, start); + } + }; + if params.style().match_only() { + return true; + } + + // Now search forwards from the start of the reverse match. + let result = Dfa::exec( + &self.dfa, + &mut cache.dfa, + params, + text, + match_start); + let match_end = match result { + DfaResult::Match => match params.captures().get(1) { + Some(&Some(i)) => i, + _ => panic!("BUG: early match can't happen"), + }, + DfaResult::NoMatch => { + panic!("BUG: reverse match implies forward match"); + } + DfaResult::Quit => { + params.reset(); + return self.exec_auto_nfa(params, text, start); + } + }; + + // If the caller only requested the start/end of a match, then we + // can quit now. + if params.captures().len() == 2 { + params.set_start(Some(match_start)); + params.set_end(Some(match_end)); + return true; + } + // Otherwise, we have to fall back to NFA to fill in captures. + return self.exec_auto_nfa(params, text, match_start); + } + false } /// This is like exec_auto, except it always chooses between either the /// full NFA simulation or the bounded backtracking engine. 
- fn exec_auto_nfa<'c, 'm>( + fn exec_auto_nfa( &self, - search: &mut Search<'c, 'm>, + params: &mut Params, text: &[u8], start: usize, ) -> bool { - if backtrack::should_exec(self.prog.len(), text.len()) { - self.exec_backtrack(search, text, start) + if backtrack::should_exec(self.nfa.len(), text.len()) { + self.exec_backtrack(params, text, start) } else { - self.exec_nfa(search, text, start) + self.exec_nfa(params, text, start) } } /// Always run the NFA algorithm. - fn exec_nfa<'c, 'm>( + fn exec_nfa( &self, - search: &mut Search<'c, 'm>, + params: &mut Params, text: &[u8], start: usize, ) -> bool { - if self.prog.uses_bytes() { - Nfa::exec(&self.prog, search, ByteInput::new(text), start) + let mut cache = self.cache.get_ref(); + if self.nfa.uses_bytes() { + Nfa::exec( + &self.nfa, + &mut cache.nfa, + params, + ByteInput::new(text), + start) } else { - Nfa::exec(&self.prog, search, CharInput::new(text), start) + Nfa::exec( + &self.nfa, + &mut cache.nfa, + params, + CharInput::new(text), + start) } } /// Always runs the NFA using bounded backtracking. - fn exec_backtrack<'c, 'm>( + fn exec_backtrack( &self, - search: &mut Search<'c, 'm>, + params: &mut Params, text: &[u8], start: usize, ) -> bool { - if self.prog.uses_bytes() { - Backtrack::exec(&self.prog, search, ByteInput::new(text), start) + let mut cache = self.cache.get_ref(); + if self.nfa.uses_bytes() { + Backtrack::exec( + &self.nfa, + &mut cache.backtrack, + params, + ByteInput::new(text), + start) } else { - Backtrack::exec(&self.prog, search, CharInput::new(text), start) + Backtrack::exec( + &self.nfa, + &mut cache.backtrack, + params, + CharInput::new(text), + start) } } @@ -488,36 +532,103 @@ impl Exec { /// regex machinery and use specialized DFAs. /// /// This panics if the set of literals do not correspond to matches. 
- fn exec_literals<'c, 'm>( + fn exec_literals( &self, - search: &mut Search<'c, 'm>, + literals: &LiteralSearcher, + params: &mut Params, text: &[u8], start: usize, ) -> bool { - debug_assert!(self.prog.prefixes.at_match()); - match self.prog.prefixes.find(&text[start..]) { + debug_assert!(literals.complete()); + debug_assert!(self.res.len() == 1); + match literals.find(&text[start..]) { None => false, Some((s, e)) => { - if search.captures.len() == 2 { - search.captures[0] = Some(start + s); - search.captures[1] = Some(start + e); + if s > 0 && self.nfa.is_anchored_start + || e < text.len() && self.nfa.is_anchored_end { + // It seems inefficient to reject the match here, but in + // fact, for large strings this would have been rejected + // earlier. To avoid overhead, we skip that check for + // smaller strings but need to make sure we don't + // accidentally report an errant match. + return false; } + if params.captures().len() == 2 { + params.set_start(Some(start + s)); + params.set_end(Some(start + e)); + } + params.set_match(0); true } } } - /// Build a dynamic Regex from this executor. + /// Returns false if the regex has a start/end anchor, but none of the + /// prefix/suffix literals match. + /// + /// Returns true if there are no anchors, no prefix/suffix literals or if + /// the literals match. + pub fn is_anchor_match(&self, text: &[u8], start: usize) -> bool { + self.is_anchor_start_match(text, start) + && self.is_anchor_end_match(text, start) + } + + fn is_anchor_start_match(&self, text: &[u8], _start: usize) -> bool { + if !self.nfa.is_anchored_start || self.nfa.prefixes.is_empty() { + return true; + } + self.nfa.prefixes.find_start(text).is_some() + } + + fn is_anchor_end_match(&self, text: &[u8], _start: usize) -> bool { + if !self.nfa.is_anchored_end || self.suffixes.is_empty() { + return true; + } + self.suffixes.find_end(text).is_some() + } + + /// Returns true if the program is amenable to suffix scanning. 
+ /// + /// When this is true, as a heuristic, we assume it is OK to quickly scan + /// for suffix literals and then do a *reverse* DFA match from any matches + /// produced by the literal scan. (And then followed by a forward DFA + /// search, since the previously found suffix literal may not actually be + /// the end of a match.) + /// + /// This is a bit of a specialized optimization, but can result in pretty + /// big performance wins if 1) there are no prefix literals and 2) the + /// suffix literals are pretty rare in the text. (1) is obviously easy to + /// account for but (2) is harder. As a proxy, we assume that longer + /// strings are generally rarer, so we only enable this optimization when + /// we have a meaty suffix. + fn should_suffix_scan(&self) -> bool { + if self.suffixes.is_empty() { + return false; + } + let lcs_len = self.suffixes.lcs().char_len(); + lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len() + } + + /// Build a Regex from this executor. pub fn into_regex(self) -> re_unicode::Regex { re_unicode::Regex::from(self) } - /// Build a dynamic Regex from this executor that can match arbitrary - /// bytes. + /// Build a RegexSet from this executor. + pub fn into_regex_set(self) -> set::RegexSet { + set::RegexSet::from(self) + } + + /// Build a Regex from this executor that can match arbitrary bytes. pub fn into_byte_regex(self) -> re_bytes::Regex { re_bytes::Regex::from(self) } + /// Build a RegexSet from this executor that can match arbitrary bytes. + pub fn into_byte_regex_set(self) -> re_bytes::RegexSet { + re_bytes::RegexSet::from(self) + } + /// The original regular expressions given by the caller that were /// compiled. pub fn regex_strings(&self) -> &[String] { @@ -528,20 +639,20 @@ impl Exec { /// /// There is a match slot for every regular expression in this executor. pub fn matches(&self) -> &[InstPtr] { - &self.prog.matches + &self.nfa.matches } /// Return a slice of capture names. 
/// /// Any capture that isn't named is None. pub fn captures(&self) -> &[Option] { - &self.prog.captures + &self.nfa.captures } /// Return a reference to named groups mapping (from group name to /// group position). pub fn capture_name_idx(&self) -> &Arc> { - &self.prog.capture_name_idx + &self.nfa.capture_name_idx } } @@ -568,3 +679,107 @@ enum MatchEngine { /// slowest choice. Nfa, } + +/// ProgramPool is a proxy for mempool::Pool that knows how to impl Clone. +#[derive(Debug)] +struct ProgramPool(Pool>); + +impl ProgramPool { + fn new() -> Self { + ProgramPool(Pool::new(Box::new(|| Box::new(ProgramCache::new())))) + } +} + +impl Deref for ProgramPool { + type Target = Pool>; + fn deref(&self) -> &Self::Target { &self.0 } +} + +impl Clone for ProgramPool { + fn clone(&self) -> ProgramPool { ProgramPool::new() } +} + +/// ProgramCache maintains reusable allocations for each matching engine +/// available to a particular program. +/// +/// The allocations are created lazily, so we don't pay for caches that +/// aren't used. +/// +/// N.B. These are all behind a pointer because it's fewer bytes to memcpy. +/// These caches are pushed/popped from the pool a lot, and a smaller +/// footprint can have an impact on matching small inputs. See, for example, +/// the hard_32 benchmark. 
+#[derive(Debug)] +struct ProgramCache { + nfa: NfaCache, + backtrack: BacktrackCache, + dfa: DfaCache, + dfa_reverse: DfaCache, +} + +impl ProgramCache { + fn new() -> Self { + ProgramCache { + nfa: NfaCache::new(), + backtrack: BacktrackCache::new(), + dfa: DfaCache::new(), + dfa_reverse: DfaCache::new(), + } + } +} + +impl Clone for ProgramCache { + fn clone(&self) -> ProgramCache { + ProgramCache::new() + } +} + +struct Parsed { + exprs: Vec, + prefixes: Literals, + suffixes: Literals, +} + +fn parse(res: &[String], only_utf8: bool) -> Result { + let mut exprs = Vec::with_capacity(res.len()); + let mut prefixes = Some(Literals::empty()); + let mut suffixes = Some(Literals::empty()); + for re in res { + let parser = + ExprBuilder::new() + .allow_bytes(!only_utf8) + .unicode(only_utf8); + let expr = try!(parser.parse(re)); + prefixes = prefixes.and_then(|mut prefixes| { + if !prefixes.union_prefixes(&expr) { + None + } else { + Some(prefixes) + } + }); + suffixes = suffixes.and_then(|mut suffixes| { + if !suffixes.union_suffixes(&expr) { + None + } else { + Some(suffixes) + } + }); + exprs.push(expr); + } + // If this is a set, then we have to force our prefixes/suffixes to all be + // cut so that they don't trigger the literal engine (which doesn't work + // with sets... yet). + if res.len() != 1 { + if let Some(ref mut prefixes) = prefixes { + prefixes.cut(); + } + if let Some(ref mut suffixes) = suffixes { + suffixes.cut(); + } + } + Ok(Parsed { + exprs: exprs, + prefixes: prefixes.unwrap_or(Literals::empty()), + suffixes: suffixes.unwrap_or(Literals::empty()), + }) +} diff --git a/src/freqs.rs b/src/freqs.rs new file mode 100644 index 0000000000..92bafc199f --- /dev/null +++ b/src/freqs.rs @@ -0,0 +1,271 @@ +// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. 
+// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// NOTE: The following code was generated by "scripts/frequencies.py", do not +// edit directly + +pub const BYTE_FREQUENCIES: [u8; 256] = [ + 55, // '\x00' + 52, // '\x01' + 51, // '\x02' + 50, // '\x03' + 49, // '\x04' + 48, // '\x05' + 47, // '\x06' + 46, // '\x07' + 45, // '\x08' + 103, // '\t' + 242, // '\n' + 66, // '\x0b' + 67, // '\x0c' + 229, // '\r' + 44, // '\x0e' + 43, // '\x0f' + 42, // '\x10' + 41, // '\x11' + 40, // '\x12' + 39, // '\x13' + 38, // '\x14' + 37, // '\x15' + 36, // '\x16' + 35, // '\x17' + 34, // '\x18' + 33, // '\x19' + 56, // '\x1a' + 32, // '\x1b' + 31, // '\x1c' + 30, // '\x1d' + 29, // '\x1e' + 28, // '\x1f' + 255, // ' ' + 148, // '!' + 164, // '"' + 149, // '#' + 136, // '$' + 160, // '%' + 155, // '&' + 173, // "'" + 221, // '(' + 222, // ')' + 134, // '*' + 122, // '+' + 232, // ',' + 202, // '-' + 215, // '.' + 224, // '/' + 208, // '0' + 220, // '1' + 204, // '2' + 187, // '3' + 183, // '4' + 179, // '5' + 177, // '6' + 168, // '7' + 178, // '8' + 200, // '9' + 226, // ':' + 195, // ';' + 154, // '<' + 184, // '=' + 174, // '>' + 126, // '?' 
+ 120, // '@' + 191, // 'A' + 157, // 'B' + 194, // 'C' + 170, // 'D' + 189, // 'E' + 162, // 'F' + 161, // 'G' + 150, // 'H' + 193, // 'I' + 142, // 'J' + 137, // 'K' + 171, // 'L' + 176, // 'M' + 185, // 'N' + 167, // 'O' + 186, // 'P' + 112, // 'Q' + 175, // 'R' + 192, // 'S' + 188, // 'T' + 156, // 'U' + 140, // 'V' + 143, // 'W' + 123, // 'X' + 133, // 'Y' + 128, // 'Z' + 147, // '[' + 138, // '\\' + 146, // ']' + 114, // '^' + 223, // '_' + 151, // '`' + 249, // 'a' + 216, // 'b' + 238, // 'c' + 236, // 'd' + 253, // 'e' + 227, // 'f' + 218, // 'g' + 230, // 'h' + 247, // 'i' + 135, // 'j' + 180, // 'k' + 241, // 'l' + 233, // 'm' + 246, // 'n' + 244, // 'o' + 231, // 'p' + 139, // 'q' + 245, // 'r' + 243, // 's' + 251, // 't' + 235, // 'u' + 201, // 'v' + 196, // 'w' + 240, // 'x' + 214, // 'y' + 152, // 'z' + 182, // '{' + 205, // '|' + 181, // '}' + 127, // '~' + 27, // '\x7f' + 212, // '\x80' + 211, // '\x81' + 210, // '\x82' + 213, // '\x83' + 228, // '\x84' + 197, // '\x85' + 169, // '\x86' + 159, // '\x87' + 131, // '\x88' + 172, // '\x89' + 105, // '\x8a' + 80, // '\x8b' + 98, // '\x8c' + 96, // '\x8d' + 97, // '\x8e' + 81, // '\x8f' + 207, // '\x90' + 145, // '\x91' + 116, // '\x92' + 115, // '\x93' + 144, // '\x94' + 130, // '\x95' + 153, // '\x96' + 121, // '\x97' + 107, // '\x98' + 132, // '\x99' + 109, // '\x9a' + 110, // '\x9b' + 124, // '\x9c' + 111, // '\x9d' + 82, // '\x9e' + 108, // '\x9f' + 118, // '\xa0' + 141, // '¡' + 113, // '¢' + 129, // '£' + 119, // '¤' + 125, // '¥' + 165, // '¦' + 117, // '§' + 92, // '¨' + 106, // '©' + 83, // 'ª' + 72, // '«' + 99, // '¬' + 93, // '\xad' + 65, // '®' + 79, // '¯' + 166, // '°' + 237, // '±' + 163, // '²' + 199, // '³' + 190, // '´' + 225, // 'µ' + 209, // '¶' + 203, // '·' + 198, // '¸' + 217, // '¹' + 219, // 'º' + 206, // '»' + 234, // '¼' + 248, // '½' + 158, // '¾' + 239, // '¿' + 255, // 'À' + 255, // 'Á' + 255, // 'Â' + 255, // 'Ã' + 255, // 'Ä' + 255, // 'Å' + 255, // 'Æ' + 255, // 'Ç' + 
255, // 'È' + 255, // 'É' + 255, // 'Ê' + 255, // 'Ë' + 255, // 'Ì' + 255, // 'Í' + 255, // 'Î' + 255, // 'Ï' + 255, // 'Ð' + 255, // 'Ñ' + 255, // 'Ò' + 255, // 'Ó' + 255, // 'Ô' + 255, // 'Õ' + 255, // 'Ö' + 255, // '×' + 255, // 'Ø' + 255, // 'Ù' + 255, // 'Ú' + 255, // 'Û' + 255, // 'Ü' + 255, // 'Ý' + 255, // 'Þ' + 255, // 'ß' + 255, // 'à' + 255, // 'á' + 255, // 'â' + 255, // 'ã' + 255, // 'ä' + 255, // 'å' + 255, // 'æ' + 255, // 'ç' + 255, // 'è' + 255, // 'é' + 255, // 'ê' + 255, // 'ë' + 255, // 'ì' + 255, // 'í' + 255, // 'î' + 255, // 'ï' + 255, // 'ð' + 255, // 'ñ' + 255, // 'ò' + 255, // 'ó' + 255, // 'ô' + 255, // 'õ' + 255, // 'ö' + 255, // '÷' + 255, // 'ø' + 255, // 'ù' + 255, // 'ú' + 255, // 'û' + 255, // 'ü' + 255, // 'ý' + 255, // 'þ' + 255, // 'ÿ' +]; diff --git a/src/input.rs b/src/input.rs index 912646d4a7..956df978bc 100644 --- a/src/input.rs +++ b/src/input.rs @@ -17,7 +17,7 @@ use std::u32; use syntax; use utf8::{decode_utf8, decode_last_utf8}; -use literals::Literals; +use literals::LiteralSearcher; /// Represents a location in the input. #[derive(Clone, Copy, Debug)] @@ -84,7 +84,11 @@ pub trait Input { fn previous_char(&self, at: InputAt) -> Char; /// Scan the input for a matching prefix. - fn prefix_at(&self, prefixes: &Literals, at: InputAt) -> Option; + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option; /// The number of bytes in the input. 
fn len(&self) -> usize; @@ -95,12 +99,21 @@ pub trait Input { impl<'a, T: Input> Input for &'a T { fn at(&self, i: usize) -> InputAt { (**self).at(i) } + fn next_char(&self, at: InputAt) -> Char { (**self).next_char(at) } + fn previous_char(&self, at: InputAt) -> Char { (**self).previous_char(at) } - fn prefix_at(&self, prefixes: &Literals, at: InputAt) -> Option { + + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option { (**self).prefix_at(prefixes, at) } + fn len(&self) -> usize { (**self).len() } + fn as_bytes(&self) -> &[u8] { (**self).as_bytes() } } @@ -148,7 +161,11 @@ impl<'t> Input for CharInput<'t> { decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() } - fn prefix_at(&self, prefixes: &Literals, at: InputAt) -> Option { + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option { prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) } @@ -203,7 +220,11 @@ impl<'t> Input for ByteInput<'t> { decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() } - fn prefix_at(&self, prefixes: &Literals, at: InputAt) -> Option { + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option { prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) } diff --git a/src/lib.rs b/src/lib.rs index 0cd810d3b6..e617b3ee08 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,7 +17,8 @@ //! //! This crate's documentation provides some simple examples, describes Unicode //! support and exhaustively lists the supported syntax. For more specific -//! details on the API, please see the documentation for the `Regex` type. +//! details on the API, please see the documentation for the +//! [`Regex`](struct.Regex.html) type. //! //! # Usage //! 
@@ -486,6 +487,7 @@ extern crate aho_corasick; extern crate memchr; +extern crate mempool; #[cfg(test)] extern crate quickcheck; extern crate regex_syntax as syntax; extern crate utf8_ranges; @@ -596,10 +598,11 @@ mod dfa; mod error; mod exec; mod expand; +mod freqs; mod input; mod literals; mod nfa; -mod pool; +mod params; mod prog; mod re_bytes; mod re_unicode; @@ -607,13 +610,14 @@ mod set; mod sparse; /// The `internal` module exists to support the `regex!` macro and other -/// suspicious activity, such as testing different matching engines. +/// suspicious activity, such as testing different matching engines and +/// supporting the `regex-debug` CLI utility. #[doc(hidden)] pub mod internal { pub use compile::Compiler; pub use exec::{Exec, ExecBuilder}; pub use input::{Char, Input, CharInput, InputAt}; - pub use literals::Literals; + pub use literals::LiteralSearcher; pub use prog::{Program, Inst, EmptyLook, InstRanges}; pub use re_unicode::{_Regex, ExNative}; } diff --git a/src/literals.rs b/src/literals.rs index 5dbd6ef1bc..50c16c18ba 100644 --- a/src/literals.rs +++ b/src/literals.rs @@ -8,365 +8,13 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
-use std::char; -use std::collections::HashSet; -use std::fmt; use std::mem; use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton}; use memchr::{memchr, memchr2, memchr3}; +use syntax; -use utf8::encode_utf8; -use prog::{Program, Inst, InstBytes, InstRanges}; - -#[derive(Clone, Eq, PartialEq)] -pub struct AlternateLiterals { - at_match: bool, - literals: Vec>, -} - -impl AlternateLiterals { - pub fn into_matcher(self) -> Literals { - if self.literals.is_empty() { - Literals::empty() - } else { - let alts = self.unambiguous(); - let at_match = alts.at_match; - Literals { - at_match: at_match, - matcher: LiteralMatcher::new(alts), - } - } - } - - fn empty() -> AlternateLiterals { - AlternateLiterals { at_match: false, literals: vec![] } - } - - fn is_empty(&self) -> bool { - self.literals.is_empty() - } - - fn is_single_byte(&self) -> bool { - self.literals.len() == 1 && self.literals[0].len() == 1 - } - - fn all_single_bytes(&self) -> bool { - self.literals.len() >= 1 && self.literals.iter().all(|s| s.len() == 1) - } - - fn all_same_length(&self) -> bool { - if self.literals.is_empty() { - return true; - } - self.literals.iter().all(|s| s.len() == self.literals[0].len()) - } - - fn is_one_literal(&self) -> bool { - self.literals.len() == 1 - } - - fn distinct_single_bytes(&self) -> usize { - let mut seen = vec![false; 256]; - for lit in &self.literals { - if lit.len() == 1 { - seen[lit[0] as usize] = true; - } - } - seen.iter().filter(|yes| **yes).count() - } - - fn num_bytes(&self) -> usize { - self.literals.iter().map(|lit| lit.len()).fold(0, |acc, len| acc + len) - } - - fn add_alternates(&mut self, alts: AlternateLiterals) { - self.at_match = self.at_match && alts.at_match; - self.literals.extend(alts.literals); - } - - fn add_literal_char(&mut self, c: char) { - let scratch = &mut [0; 4]; - let n = encode_utf8(c, scratch).unwrap(); - for alt in &mut self.literals { - alt.extend(&scratch[0..n]); - } - } - - fn add_literal_char_ranges(&mut self, inst: 
&InstRanges) { - // This is tricky. We need to think of each range as its own set of - // alternations. For example, `[a-cx-z]` is comprised of two ranges: - // `a-c` and `x-z`. This is equivalent to the regex `a|b|c|x|y|z`. If - // we've already found two prefixes, e.g., `foo|bar`, then we need to - // extend all such prefixes with all alternates here. For e.g., `fooa`, - // ..., `fooz`, `bara`, ..., `barz`. - // - // To achieve this, we copy our existing literals for every char! - let scratch = &mut [0; 4]; - let nlits = self.literals.len(); - let orig = mem::replace(&mut self.literals, Vec::with_capacity(nlits)); - for &(s, e) in &inst.ranges { - for c in (s as u32)..(e as u32 + 1) { - for alt in &orig { - let mut alt = alt.clone(); - let ch = char::from_u32(c).unwrap(); - let n = encode_utf8(ch, scratch).unwrap(); - - alt.extend(&scratch[0..n]); - self.literals.push(alt); - } - } - } - } - - fn add_literal_byte_range(&mut self, inst: &InstBytes) { - // Pretty much the same process as for literal char ranges, but we - // only have one range. - let nlits = self.literals.len(); - let orig = mem::replace(&mut self.literals, Vec::with_capacity(nlits)); - for b in inst.start..(inst.end + 1) { - for alt in &orig { - let mut alt = alt.clone(); - alt.push(b); - self.literals.push(alt); - } - } - } - - /// Returns a new set of alternate literals that are guaranteed to be - /// unambiguous. - /// - /// State differently, the set of literals returned is guaranteed to never - /// result in any overlapping matches. - /// - /// Duplicate literals are dropped. Literals that are otherwise distinct - /// will be possibly truncated. 
- fn unambiguous(&self) -> AlternateLiterals { - fn position(needle: &[u8], mut haystack: &[u8]) -> Option { - let mut i = 0; - while haystack.len() >= needle.len() { - if needle == &haystack[..needle.len()] { - return Some(i); - } - i += 1; - haystack = &haystack[1..]; - } - None - } - - // This function is a bit gratuitous and allocation heavy, but in - // general, we limit the number of alternates to be pretty small. - - if self.all_same_length() { - return self.clone(); - } - let mut new = AlternateLiterals { - at_match: self.at_match, - literals: Vec::with_capacity(self.literals.len()), - }; -'OUTER: - for lit1 in &self.literals { - if new.literals.is_empty() { - new.literals.push(lit1.clone()); - continue; - } - let mut candidate = lit1.clone(); - for lit2 in &mut new.literals { - if &candidate == lit2 { - // If the literal is already in the set, then we can - // just drop it. - continue 'OUTER; - } - if lit1.len() <= lit2.len() { - if let Some(i) = position(&candidate, lit2) { - new.at_match = false; - lit2.truncate(i); - } - } else { - if let Some(i) = position(lit2, &candidate) { - new.at_match = false; - candidate.truncate(i); - } - } - } - new.literals.push(candidate); - } - new.literals.retain(|lit| !lit.is_empty()); - // This is OK only because the alternates are unambiguous. 
- new.literals.sort(); - new.literals.dedup(); - new - } -} - -struct BuildPrefixes<'a> { - insts: &'a Program, - limit: usize, - alts: AlternateLiterals, -} - -impl<'a> BuildPrefixes<'a> { - fn new(insts: &'a Program) -> Self { - BuildPrefixes { - insts: insts, - limit: 250, - alts: AlternateLiterals { at_match: true, literals: vec![] }, - } - } - - fn literals(mut self) -> AlternateLiterals { - let mut stack = vec![self.insts.skip(self.insts.start)]; - let mut seen = HashSet::new(); - while let Some(mut pc) = stack.pop() { - seen.insert(pc); - pc = self.insts.skip(pc); - if let Inst::Split(ref inst) = self.insts[pc] { - if !seen.contains(&inst.goto2) { - stack.push(inst.goto2); - } - if !seen.contains(&inst.goto1) { - stack.push(inst.goto1); - } - continue; - } - // When searching for required literals, set the local limit to - // something a bit less than our real limit. This prevents a single - // alternation from blowing our budget in most cases. (If a single - // alt blows the budget, then we can't consume literals from other - // alts, which means we end up with nothing to show for it.) - // - // For example, consider `a?[0-9]{3}`. This splits into two - // regexes `a[0-9]{3}` and `[0-9]{3}`. The latter regex can be - // expanded completely into a set of alternate literals that - // consumes exactly 3000 bytes. This is our entire budget if the - // limit is 3000. Therefore, we're left with no room to add the - // second branch (`a[0-9]{3}`) to our set of literals. If we can't - // represent all required alternates, then we have to give up. - // Therefore, as a heuristic, limit what each alternate is allowed - // to use. In this case, `[0-9]{3}` will only gather literals for - // `[0-9]{2}`, which leaves more than enough room for our second - // branch. 
- let alts = BuildRequiredLiterals::new(self.insts) - .set_limit(self.limit / 10) - .literals(pc); - if alts.is_empty() { - // If we couldn't find any literals required in this path - // through the program, then we can't conclude anything about - // prefix literals for this program. For example, if the regex - // is `a|b*`, then the second alternate has no prefix to search - // for. (`b*` matches the empty string!) - return AlternateLiterals::empty(); - } - if self.alts.num_bytes() + alts.num_bytes() > self.limit { - // We've blown our budget. Give up. - // We could do something a little smarter here and try to trim - // the literals we've got here. (e.g., If every literal is two - // characters, then it would be legal to remove the second char - // from every literal.) - return AlternateLiterals::empty(); - } - self.alts.add_alternates(alts); - } - self.alts - } -} - -pub struct BuildRequiredLiterals<'a> { - insts: &'a Program, - limit: usize, - alts: AlternateLiterals, -} - -impl<'a> BuildRequiredLiterals<'a> { - pub fn new(insts: &'a Program) -> Self { - BuildRequiredLiterals { - insts: insts, - limit: 250, - alts: AlternateLiterals { at_match: true, literals: vec![vec![]] }, - } - } - - pub fn set_limit(mut self, limit: usize) -> Self { - self.limit = limit; - self - } - - fn literals(mut self, mut pc: usize) -> AlternateLiterals { - use prog::Inst::*; - loop { - let inst = &self.insts[pc]; - match *inst { - Save(ref inst) => pc = inst.goto, - Char(ref inst) => { - if !self.add_literal_char(inst.c) { - self.alts.at_match = false; - break; - } - pc = inst.goto; - } - Ranges(ref inst) => { - if !self.add_literal_char_ranges(inst) { - self.alts.at_match = false; - break; - } - pc = inst.goto; - } - Bytes(ref inst) => { - if !self.add_literal_byte_range(inst) { - self.alts.at_match = false; - break; - } - pc = inst.goto; - } - Split(_) | EmptyLook(_) | Match(_) => { - self.alts.at_match = self.insts.leads_to_match(pc); - break; - } - } - } - if 
self.alts.literals.len() == 1 && self.alts.literals[0].is_empty() { - AlternateLiterals::empty() - } else { - self.alts - } - } - - fn add_literal_char(&mut self, c: char) -> bool { - if self.alts.num_bytes() + 1 > self.limit { - return false; - } - self.alts.add_literal_char(c); - true - } - - fn add_literal_char_ranges(&mut self, inst: &InstRanges) -> bool { - // Compute roughly how many bytes will be in our literals following - // the addition of the given ranges. If we blow our limit, then we - // can't add *any* of them. - let nchars = inst.num_chars(); - let new_byte_count = (self.alts.num_bytes() * nchars) - + (self.alts.literals.len() * nchars); - if new_byte_count > self.limit { - return false; - } - self.alts.add_literal_char_ranges(inst); - true - } - - fn add_literal_byte_range(&mut self, inst: &InstBytes) -> bool { - // Compute roughly how many bytes will be in our literals following - // the addition of the given range. If we blow our limit, then we - // can't add anything. - let nbytes = (inst.end as usize) - (inst.start as usize) + 1; - let new_byte_count = (self.alts.num_bytes() * nbytes) - + (self.alts.literals.len() * nbytes); - if new_byte_count > self.limit { - return false; - } - self.alts.add_literal_byte_range(inst); - true - } -} +use freqs::BYTE_FREQUENCIES; /// A prefix extracted from a compiled regular expression. /// @@ -385,75 +33,119 @@ impl<'a> BuildRequiredLiterals<'a> { /// It's possible that there's room here for other substring algorithms, /// such as Boyer-Moore for single-set prefixes greater than 1, or Rabin-Karp /// for small sets of same-length prefixes. -#[derive(Clone)] -pub struct Literals { - at_match: bool, - matcher: LiteralMatcher, +#[derive(Clone, Debug)] +pub struct LiteralSearcher { + complete: bool, + lcp: SingleSearch, + lcs: SingleSearch, + matcher: Matcher, } -#[derive(Clone)] -enum LiteralMatcher { - /// No prefixes. (Never advances through the input.) 
+#[derive(Clone, Debug)] +enum Matcher { + /// No literals. (Never advances through the input.) Empty, - /// A single byte prefix. - Byte(u8), - /// A set of two or more single byte prefixes. - Bytes { - chars: Vec, - sparse: Vec, - }, + /// A set of four or more single byte literals. + Bytes(SingleByteSet), /// A single substring. (Likely using Boyer-Moore with memchr.) Single(SingleSearch), /// An Aho-Corasick automaton. - AC(FullAcAutomaton>), + AC(FullAcAutomaton), } -impl Literals { +impl LiteralSearcher { /// Returns a matcher that never matches and never advances the input. pub fn empty() -> Self { - Literals { at_match: false, matcher: LiteralMatcher::Empty } + Self::new(syntax::Literals::empty(), Matcher::Empty) } - /// Returns a matcher for literal prefixes in the given program. - pub fn prefixes(prog: &Program) -> Self { - BuildPrefixes::new(prog).literals().into_matcher() + /// Returns a matcher for literal prefixes from the given set. + pub fn prefixes(lits: syntax::Literals) -> Self { + let matcher = Matcher::prefixes(&lits); + Self::new(lits, matcher) } - /// Returns true if and only if a literal match corresponds to a match - /// in the regex from which the literal was extracted. - pub fn at_match(&self) -> bool { - self.at_match && self.len() > 0 + /// Returns a matcher for literal suffixes from the given set. + pub fn suffixes(lits: syntax::Literals) -> Self { + let matcher = Matcher::suffixes(&lits); + Self::new(lits, matcher) } - /// Find the position of a prefix in `haystack` if it exists. + fn new(lits: syntax::Literals, matcher: Matcher) -> Self { + let complete = lits.all_complete(); + LiteralSearcher { + complete: complete, + lcp: SingleSearch::new(lits.longest_common_prefix().to_vec()), + lcs: SingleSearch::new(lits.longest_common_suffix().to_vec()), + matcher: matcher, + } + } + + /// Returns true if all matches comprise the entire regular expression. 
/// - /// In the matching engines, we only actually need the starting index - /// because the prefix is used to only skip ahead---the matching engine - /// still needs to run over the prefix input. However, we return the ending - /// location as well in case the prefix corresponds to the entire regex, - /// in which case, you need the end of the match. + /// This does not necessarily mean that a literal match implies a match + /// of the regular expression. For example, the regular expresison `^a` + /// is comprised of a single complete literal `a`, but the regular + /// expression demands that it only match at the beginning of a string. + pub fn complete(&self) -> bool { + self.complete && self.len() > 0 + } + + /// Find the position of a literal in `haystack` if it exists. pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> { - use self::LiteralMatcher::*; + use self::Matcher::*; match self.matcher { Empty => Some((0, 0)), - Byte(b) => memchr(b, haystack).map(|i| (i, i+1)), - Bytes { ref sparse, ref chars } => { - if chars.len() == 2 { - memchr2(chars[0], chars[1], haystack).map(|i| (i, i+1)) - } else if chars.len() == 3 { - let (b0, b1, b2) = (chars[0], chars[1], chars[2]); - memchr3(b0, b1, b2, haystack).map(|i| (i, i+1)) - } else { - find_singles(sparse, haystack) - } + Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)), + Single(ref s) => s.find(haystack).map(|i| (i, i + s.len())), + AC(ref aut) => aut.find(haystack).next().map(|m| (m.start, m.end)), + } + } + + /// Like find, except matches must start at index `0`. + pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> { + for lit in self.iter() { + if lit.len() > haystack.len() { + continue; + } + if lit == &haystack[0..lit.len()] { + return Some((0, lit.len())); } - Single(ref searcher) => { - searcher.find(haystack).map(|i| (i, i + searcher.pat.len())) + } + None + } + + /// Like find, except matches must end at index `haystack.len()`. 
+ pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> { + for lit in self.iter() { + if lit.len() > haystack.len() { + continue; } - AC(ref aut) => { - aut.find(haystack).next().map(|m| (m.start, m.end)) + if lit == &haystack[haystack.len() - lit.len()..] { + return Some((haystack.len() - lit.len(), haystack.len())); } } + None + } + + /// Returns an iterator over all literals to be matched. + pub fn iter(&self) -> LiteralIter { + match self.matcher { + Matcher::Empty => LiteralIter::Empty, + Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense), + Matcher::Single(ref s) => LiteralIter::Single(&s.pat), + Matcher::AC(ref ac) => LiteralIter::AC(ac.patterns()), + } + } + + /// Returns a matcher for the longest common prefix of this matcher. + pub fn lcp(&self) -> &SingleSearch { + &self.lcp + } + + /// Returns a matcher for the longest common suffix of this matcher. + pub fn lcs(&self) -> &SingleSearch { + &self.lcs } /// Returns true iff this prefix is empty. @@ -463,11 +155,10 @@ impl Literals { /// Returns the number of prefixes in this machine. pub fn len(&self) -> usize { - use self::LiteralMatcher::*; + use self::Matcher::*; match self.matcher { Empty => 0, - Byte(_) => 1, - Bytes { ref chars, .. } => chars.len(), + Bytes(ref sset) => sset.dense.len(), Single(_) => 1, AC(ref aut) => aut.len(), } @@ -475,340 +166,295 @@ impl Literals { /// Return the approximate heap usage of literals in bytes. 
pub fn approximate_size(&self) -> usize { - use self::LiteralMatcher::*; + use self::Matcher::*; match self.matcher { - Empty | Byte(_) => 0, - Bytes { ref chars, ref sparse } => { - (chars.len() * mem::size_of::()) - + (sparse.len() * mem::size_of::()) - } - Single(ref single) => { - (single.pat.len() * mem::size_of::()) - + (single.shift.len() * mem::size_of::()) - } + Empty => 0, + Bytes(ref sset) => sset.approximate_size(), + Single(ref single) => single.approximate_size(), AC(ref aut) => aut.heap_bytes(), } } +} - /// Returns all of the literal participating in this machine. - /// - /// For debug/testing only! (It allocates.) - #[allow(dead_code)] - fn strings(&self) -> Vec { - self.byte_strings() - .into_iter() - .map(|p| String::from_utf8(p).unwrap()) - .collect() - } - - #[allow(dead_code)] - fn byte_strings(&self) -> Vec> { - use self::LiteralMatcher::*; - match self.matcher { - Empty => vec![], - Byte(b) => vec![vec![b]], - Bytes { ref chars, .. } => { - chars.iter().map(|&byte| vec![byte]).collect() - } - Single(ref searcher) => vec![searcher.pat.clone()], - AC(ref aut) => aut.patterns().iter().cloned().collect(), +impl Matcher { + fn prefixes(lits: &syntax::Literals) -> Self { + let sset = SingleByteSet::prefixes(&lits); + Matcher::new(lits, sset) + } + + fn suffixes(lits: &syntax::Literals) -> Self { + let sset = SingleByteSet::suffixes(&lits); + Matcher::new(lits, sset) + } + + fn new(lits: &syntax::Literals, sset: SingleByteSet) -> Self { + if lits.literals().is_empty() { + Matcher::Empty + } else if sset.dense.len() >= 26 { + // Avoid trying to match a large number of single bytes. + // This is *very* sensitive to a frequency analysis comparison + // between the bytes in sset and the composition of the haystack. + // No matter the size of sset, if its members all are rare in the + // haystack, then it'd be worth using it. How to tune this... IDK. 
+ // ---AG + Matcher::Empty + } else if sset.complete { + Matcher::Bytes(sset) + } else if lits.literals().len() == 1 { + Matcher::Single(SingleSearch::new(lits.literals()[0].to_vec())) + } else { + let pats = lits.literals().to_owned(); + Matcher::AC(AcAutomaton::new(pats).into_full()) } } } -impl LiteralMatcher { - fn new(mut alts: AlternateLiterals) -> Self { - use self::LiteralMatcher::*; - - if alts.is_empty() { - Empty - } else if alts.distinct_single_bytes() >= 26 { - // Why do we do this? Well, it's a heuristic to prevent thrashing. - // Basically, if our literal matcher has lots of literals that are - // a single byte, then we lose a lot of the benefits of fast - // literal searching. In particular, single bytes have a high - // probability of matching. In a regex that rarely matches, we end - // up ping-ponging between the literal matcher and the regex engine - // for every byte of input. That's bad juju. - // - // Note that we only count distinct starting bytes from literals of - // length 1. For literals longer than that, we assume they have - // a lower probability of matching. - // - // This particular heuristic would be triggered on, e.g., - // `[a-z].+`. The prefix here is a single byte that is very likely - // to match on any given byte in the input, so it's quicker just - // to let the matching engine process it. - // - // TODO(burntsushi): Consider lowering the threshold! 
- Empty - } else if alts.is_single_byte() { - Byte(alts.literals[0][0]) - } else if alts.all_single_bytes() { - let mut set = vec![false; 256]; - let mut bytes = vec![]; - for lit in alts.literals { - bytes.push(lit[0]); - set[lit[0] as usize] = true; +pub enum LiteralIter<'a> { + Empty, + Bytes(&'a [u8]), + Single(&'a [u8]), + AC(&'a [syntax::Lit]), +} + +impl<'a> Iterator for LiteralIter<'a> { + type Item = &'a [u8]; + + fn next(&mut self) -> Option { + match *self { + LiteralIter::Empty => None, + LiteralIter::Bytes(ref mut many) => { + if many.is_empty() { + None + } else { + let next = &many[0..1]; + *many = &many[1..]; + Some(next) + } + } + LiteralIter::Single(ref mut one) => { + if one.is_empty() { + None + } else { + let next = &one[..]; + *one = &[]; + Some(next) + } + } + LiteralIter::AC(ref mut lits) => { + if lits.is_empty() { + None + } else { + let next = &lits[0]; + *lits = &lits[1..]; + Some(&**next) + } } - Bytes { chars: bytes, sparse: set } - } else if alts.is_one_literal() { - Single(SingleSearch::new(alts.literals.pop().unwrap())) - } else { - AC(AcAutomaton::new(alts.literals).into_full()) } } } -/// Provides an implementation of fast subtring search. -/// -/// In particular, this uses Boyer-Moore-Horspool with Tim Raita's twist: -/// https://en.wikipedia.org/wiki/Raita_Algorithm -/// -/// I'm skeptical of the utility here, because benchmarks suggest that it is -/// difficult to beat Aho-Corasick on random text. Namely, both algorithms are -/// dominated by the performance of `memchr` for the leading byte prefix. -/// With that said, BMH does seem to surpass AC when the search text gets -/// longer (see the `easy0_1MB` vs. `easy1_1MB` benchmarks). -/// -/// More analysis needs to be done to test this on different search texts. 
#[derive(Clone, Debug)] -pub struct SingleSearch { - pat: Vec, - shift: Vec, +struct SingleByteSet { + sparse: Vec, + dense: Vec, + complete: bool, } -impl SingleSearch { - fn new(pat: Vec) -> SingleSearch { - assert!(pat.len() >= 1); - let mut shift = vec![pat.len(); 256]; - for i in 0..(pat.len() - 1) { - shift[pat[i] as usize] = pat.len() - i - 1; - } - SingleSearch { - pat: pat, - shift: shift, +impl SingleByteSet { + fn new() -> SingleByteSet { + SingleByteSet { + sparse: vec![false; 256], + dense: vec![], + complete: true, } } - fn find(&self, haystack: &[u8]) -> Option { - let pat = &*self.pat; - if haystack.len() < pat.len() { - return None; - } - let mut i = match memchr(pat[0], haystack) { - None => return None, - Some(i) => i, - }; - while i <= haystack.len() - pat.len() { - let b = haystack[i + pat.len() - 1]; - if b == pat[pat.len() - 1] - && haystack[i] == pat[0] - && haystack[i + (pat.len() / 2)] == pat[pat.len() / 2] - && pat == &haystack[i..i + pat.len()] { - return Some(i); + fn prefixes(lits: &syntax::Literals) -> SingleByteSet { + let mut sset = SingleByteSet::new(); + for lit in lits.literals() { + sset.complete = sset.complete && lit.len() == 1; + if let Some(&b) = lit.get(0) { + if !sset.sparse[b as usize] { + sset.dense.push(b); + sset.sparse[b as usize] = true; + } } - i += self.shift[b as usize]; - i += match memchr(pat[0], &haystack[i..]) { - None => return None, - Some(i) => i, - }; } - None + sset } -} -/// A quick scan for multiple single byte prefixes using a sparse map. 
-fn find_singles( - sparse: &[bool], - text: &[u8], -) -> Option<(usize, usize)> { - for (hi, &b) in text.iter().enumerate() { - if sparse[b as usize] { - return Some((hi, hi+1)); + fn suffixes(lits: &syntax::Literals) -> SingleByteSet { + let mut sset = SingleByteSet::new(); + for lit in lits.literals() { + sset.complete = sset.complete && lit.len() == 1; + if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) { + if !sset.sparse[b as usize] { + sset.dense.push(b); + sset.sparse[b as usize] = true; + } + } } + sset } - None -} -impl fmt::Debug for AlternateLiterals { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let mut strings = vec![]; - for lit in &self.literals { - strings.push(String::from_utf8_lossy(lit).into_owned()); + /// Faster find that special cases certain sizes to use memchr. + fn find(&self, text: &[u8]) -> Option { + match self.dense.len() { + 0 => None, + 1 => memchr(self.dense[0], text), + 2 => memchr2(self.dense[0], self.dense[1], text), + 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text), + _ => self._find(text), } - f.debug_struct("AlternateLiterals") - .field("at_match", &self.at_match) - .field("literals", &strings) - .finish() } -} -impl fmt::Debug for Literals { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - use self::LiteralMatcher::*; - try!(write!(f, "complete? {}, matcher: ", self.at_match)); - match self.matcher { - Empty => write!(f, "Empty"), - Byte(b) => write!(f, "one byte: {:?}", b as char), - Bytes { ref chars, .. } => { - let chars: Vec = - chars.iter() - .map(|&c| format!("{:?}", c as char)) - .collect(); - write!(f, "alternate single bytes: {}", chars.join(", ")) + /// Generic find that works on any sized set. 
+ fn _find(&self, haystack: &[u8]) -> Option { + for (i, &b) in haystack.iter().enumerate() { + if self.sparse[b as usize] { + return Some(i); } - Single(ref searcher) => write!(f, "{:?}", searcher), - AC(ref aut) => write!(f, "{:?}", aut), } + None } -} -#[cfg(test)] -mod tests { - use compile::Compiler; - use super::{AlternateLiterals, BuildPrefixes}; - use syntax::Expr; - - macro_rules! prog { - ($re:expr) => {{ - let expr = Expr::parse($re).unwrap(); - let prog = Compiler::new().compile(&[expr]).unwrap(); - prog - }} + fn approximate_size(&self) -> usize { + (self.dense.len() * mem::size_of::()) + + (self.sparse.len() * mem::size_of::()) } +} - macro_rules! prefixes { - ($re:expr) => {{ - let p = prog!($re); - let prefixes = BuildPrefixes::new(&p).literals().into_matcher(); - assert!(!prefixes.at_match()); - prefixes.strings() - }} - } - macro_rules! prefixes_complete { - ($re:expr) => {{ - let p = prog!($re); - let prefixes = BuildPrefixes::new(&p).literals().into_matcher(); - assert!(prefixes.at_match()); - prefixes.strings() - }} - } +/// Provides an implementation of fast subtring search. +/// +/// This particular implementation is a Boyer-Moore variant, based on the +/// "tuned boyer moore" search from (Hume & Sunday, 1991). It has been tweaked +/// slightly to better use memchr. +/// +/// memchr is so fast that we do everything we can to keep the loop in memchr +/// for as long as possible. The easiest way to do this is to intelligently +/// pick the byte to send to memchr. The best byte is the byte that occurs +/// least frequently in the haystack. Since doing frequency analysis on the +/// haystack is far too expensive, we compute a set of fixed frequencies up +/// front and hard code them in src/freqs.rs. Frequency analysis is done via +/// scripts/frequencies.py. +/// +/// TODO(burntsushi): Add some amount of shifting to this. +#[derive(Clone, Debug)] +pub struct SingleSearch { + /// The pattern. 
+ pat: Vec, + /// The number of Unicode characters in the pattern. This is useful for + /// determining the effective length of a pattern when deciding which + /// optimizations to perform. A trailing incomplete UTF-8 sequence counts + /// as one character. + char_len: usize, + /// The rarest byte in the pattern, according to pre-computed frequency + /// analysis. + rare1: u8, + /// The offset of the rarest byte in `pat`. + rare1i: usize, + /// The second rarest byte in the pattern, according to pre-computed + /// frequency analysis. (This may be equivalent to the rarest byte.) + /// + /// The second rarest byte is used as a type of guard for quickly detecting + /// a mismatch after memchr locates an instance of the rarest byte. This + /// is a hedge against pathological cases where the pre-computed frequency + /// analysis may be off. (But of course, does not prevent pathological + /// cases.) + rare2: u8, + /// The offset of the second rarest byte in `pat`. + rare2i: usize, +} - #[test] - fn single() { - assert_eq!(prefixes_complete!("a"), vec!["a"]); - assert_eq!(prefixes_complete!("[a]"), vec!["a"]); - assert_eq!(prefixes!("a+"), vec!["a"]); - assert_eq!(prefixes!("(?:a)+"), vec!["a"]); - assert_eq!(prefixes!("(a)+"), vec!["a"]); - } +impl SingleSearch { + fn new(pat: Vec) -> SingleSearch { + fn freq_rank(b: u8) -> usize { BYTE_FREQUENCIES[b as usize] as usize } - #[test] - fn single_alt() { - assert_eq!(prefixes_complete!("a|b"), vec!["a", "b"]); - assert_eq!(prefixes_complete!("b|a"), vec!["b", "a"]); - assert_eq!(prefixes_complete!("[a]|[b]"), vec!["a", "b"]); - assert_eq!(prefixes!("a+|b"), vec!["a", "b"]); - assert_eq!(prefixes!("a|b+"), vec!["a", "b"]); - assert_eq!(prefixes!("(?:a+)|b"), vec!["a", "b"]); - assert_eq!(prefixes!("(a+)|b"), vec!["a", "b"]); - } + if pat.is_empty() { + return SingleSearch::empty(); + } - #[test] - fn many() { - assert_eq!(prefixes_complete!("abcdef"), vec!["abcdef"]); - assert_eq!(prefixes!("abcdef+"), vec!["abcdef"]); - 
assert_eq!(prefixes!("(?:abcdef)+"), vec!["abcdef"]); - assert_eq!(prefixes!("(abcdef)+"), vec!["abcdef"]); - } + // Find the rarest two bytes. Try to make them distinct (but it's not + // required). + let mut rare1 = pat[0]; + let mut rare2 = pat[0]; + for b in pat[1..].iter().cloned() { + if freq_rank(b) < freq_rank(rare1) { + rare1 = b; + } + } + for &b in &pat { + if rare1 == rare2 { + rare2 = b + } else if b != rare1 && freq_rank(b) < freq_rank(rare2) { + rare2 = b; + } + } - #[test] - fn many_alt() { - assert_eq!(prefixes_complete!("abc|def"), vec!["abc", "def"]); - assert_eq!(prefixes_complete!("def|abc"), vec!["def", "abc"]); - assert_eq!(prefixes!("abc+|def"), vec!["abc", "def"]); - assert_eq!(prefixes!("abc|def+"), vec!["abc", "def"]); - assert_eq!(prefixes!("(?:abc)+|def"), vec!["abc", "def"]); - assert_eq!(prefixes!("(abc)+|def"), vec!["abc", "def"]); - } + // And find the offsets of their last occurrences. + let rare1i = pat.iter().rposition(|&b| b == rare1).unwrap(); + let rare2i = pat.iter().rposition(|&b| b == rare2).unwrap(); - #[test] - fn class() { - assert_eq!(prefixes_complete!("[0-9]"), vec![ - "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", - ]); - assert_eq!(prefixes!("[0-9]+"), vec![ - "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", - ]); + let char_len = char_len_lossy(&pat); + SingleSearch { + pat: pat, + char_len: char_len, + rare1: rare1, + rare1i: rare1i, + rare2: rare2, + rare2i: rare2i, + } } - #[test] - fn preceding_alt() { - assert_eq!(prefixes!("(?:a|b).+"), vec!["a", "b"]); - assert_eq!(prefixes!("(a|b).+"), vec!["a", "b"]); + fn empty() -> SingleSearch { + SingleSearch { + pat: vec![], + char_len: 0, + rare1: 0, + rare1i: 0, + rare2: 0, + rare2i: 0, + } } - #[test] - fn nested_alt() { - assert_eq!(prefixes_complete!("(a|b|c|d)"), - vec!["a", "b", "c", "d"]); - assert_eq!(prefixes_complete!("((a|b)|(c|d))"), - vec!["a", "b", "c", "d"]); + pub fn find(&self, haystack: &[u8]) -> Option { + let pat = &*self.pat; + if 
haystack.len() < pat.len() || pat.is_empty() { + return None; + } + let mut i = self.rare1i; + while i < haystack.len() { + i += match memchr(self.rare1, &haystack[i..]) { + None => return None, + Some(i) => i, + }; + let start = i - self.rare1i; + let end = start + pat.len(); + if end > haystack.len() { + return None; + } + let aligned = &haystack[start..end]; + if aligned[self.rare2i] == self.rare2 && aligned == &*self.pat { + return Some(start); + } + i += 1; + } + None } - #[test] - fn snowman() { - assert_eq!(prefixes_complete!("☃"), vec!["☃"]); + pub fn len(&self) -> usize { + self.pat.len() } - macro_rules! alts { - ($($s:expr),*) => {{ - AlternateLiterals { - at_match: false, - literals: vec![$($s.as_bytes().to_owned()),*], - } - }} + pub fn char_len(&self) -> usize { + self.char_len } - #[test] - fn unambiguous() { - let given = alts!("z", "azb"); - let expected = alts!("a", "z"); - assert_eq!(expected, given.unambiguous()); - - let given = alts!("zaaaaaaaaaa", "aa"); - let expected = alts!("aa", "z"); - assert_eq!(expected, given.unambiguous()); - - let given = alts!("Sherlock", "Watson"); - let expected = alts!("Sherlock", "Watson"); - assert_eq!(expected, given.unambiguous()); - - let given = alts!("abc", "bc"); - let expected = alts!("a", "bc"); - assert_eq!(expected, given.unambiguous()); - - let given = alts!("bc", "abc"); - let expected = alts!("a", "bc"); - assert_eq!(expected, given.unambiguous()); - - let given = alts!("a", "aa"); - let expected = alts!("a"); - assert_eq!(expected, given.unambiguous()); - - let given = alts!("ab", "a"); - let expected = alts!("a"); - assert_eq!(expected, given.unambiguous()); + fn approximate_size(&self) -> usize { + self.pat.len() * mem::size_of::() } +} - // That this test fails suggests that the literal finder needs to be - // completely rewritten. Ug. It's not that it is wrong currently, but - // it's not as good at finding literals as it should be. 
- /* - #[test] - fn non_contiguous() { - assert_eq!(prefixes_complete!("z(a|c)"), vec!["za", "zc"]); - } - */ +fn char_len_lossy(bytes: &[u8]) -> usize { + String::from_utf8_lossy(bytes).chars().count() } diff --git a/src/nfa.rs b/src/nfa.rs index 508985d1b9..784ea46248 100644 --- a/src/nfa.rs +++ b/src/nfa.rs @@ -27,8 +27,8 @@ use std::mem; -use exec::Search; use input::{Input, InputAt}; +use params::Params; use prog::{Program, InstPtr}; use sparse::SparseSet; @@ -100,12 +100,11 @@ impl<'r, I: Input> Nfa<'r, I> { /// captures accordingly. pub fn exec( prog: &'r Program, - search: &mut Search, + cache: &mut NfaCache, + params: &mut Params, input: I, start: usize, ) -> bool { - let mut _cache = prog.cache_nfa(); - let mut cache = &mut **_cache; cache.clist.resize(prog.len(), prog.captures.len()); cache.nlist.resize(prog.len(), prog.captures.len()); let at = input.at(start); @@ -113,14 +112,14 @@ impl<'r, I: Input> Nfa<'r, I> { prog: prog, stack: &mut cache.stack, input: input, - }.exec_(&mut cache.clist, &mut cache.nlist, search, at) + }.exec_(&mut cache.clist, &mut cache.nlist, params, at) } fn exec_( &mut self, mut clist: &mut Threads, mut nlist: &mut Threads, - mut search: &mut Search, + mut params: &mut Params, mut at: InputAt, ) -> bool { let mut matched = false; @@ -155,7 +154,7 @@ impl<'r, I: Input> Nfa<'r, I> { // a state starting at the current position in the input for the // beginning of the program only if we don't already have a match. if clist.set.is_empty() || (!self.prog.is_anchored_start && !matched) { - self.add(&mut clist, &mut search.captures, 0, at) + self.add(&mut clist, params.captures_mut(), 0, at) } // The previous call to "add" actually inspects the position just // before the current character. 
For stepping through the machine, @@ -166,7 +165,7 @@ impl<'r, I: Input> Nfa<'r, I> { let ip = clist.set[i]; let step = self.step( &mut nlist, - search, + params, clist.caps(ip), ip, at, @@ -174,14 +173,14 @@ impl<'r, I: Input> Nfa<'r, I> { ); if step { if !matched { - matched = search.matched_all(); + matched = params.matches().iter().all(|&m| m); } - if search.quit_after_first_match() { + if params.style().quit_after_first_match() { // If we only care if a match occurs (not its // position), then we can quit right now. break 'LOOP; } - if !search.find_many_matches() { + if self.prog.matches.len() == 1 { // We don't need to check the rest of the threads // in this set because we've matched something // ("leftmost-first"). However, we still need to check @@ -201,7 +200,7 @@ impl<'r, I: Input> Nfa<'r, I> { mem::swap(clist, nlist); nlist.set.clear(); } - matched + params.is_match() } /// Step through the input, one token (byte or codepoint) at a time. @@ -219,7 +218,7 @@ impl<'r, I: Input> Nfa<'r, I> { fn step( &mut self, nlist: &mut Threads, - search: &mut Search, + params: &mut Params, thread_caps: &mut [Option], ip: usize, at: InputAt, @@ -228,8 +227,8 @@ impl<'r, I: Input> Nfa<'r, I> { use prog::Inst::*; match self.prog[ip] { Match(match_slot) => { - search.copy_captures_from(thread_caps); - search.set_match(match_slot); + params.copy_captures_from(thread_caps); + params.set_match(match_slot); true } Char(ref inst) => { diff --git a/src/params.rs b/src/params.rs new file mode 100644 index 0000000000..8573cfc928 --- /dev/null +++ b/src/params.rs @@ -0,0 +1,202 @@ +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +// This module defines the internal parameter interface to matching engines. +// This doesn't actually contain any searching code. + +/// Search is a set of parameters that influence the behavior of matching +/// engines. In general, a given set of search parameters influence all the +/// matching engines in the same way. +#[derive(Debug)] +pub struct Params<'c, 'm> { + /// Storage for capture slots. + captures: &'c mut [Slot], + /// Which regular expression matched. For a `Regex`, this always has + /// length 1, but a `RegexSet` has length equal to the number of regexes + /// in the set. + matches: &'m mut [bool], + /// Whether `true` exists in `matches`. + matched_any: bool, + /// When true, always use "match-short" semantics. + match_short: bool, +} + +/// Slot is a single saved capture location. Note that there are two slots for +/// every capture in a regular expression (one slot each for the start and end +/// of the capture). +pub type Slot = Option; + +/// The matching semantics implied by a given set of search parameters. +/// +/// In general, each semantics is more expensive than the previous one. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum MatchStyle { + /// "match-short" semantics means that the matching engine will do only + /// as much work as needed to discover whether an expression matches + /// the given text or not. Generally, match-short semantics means that + /// capture locations are not tracked. However, if memory is given to track + /// captures, then the ending location of the *shortest match* will be + /// returned. The starting location may not be returned. + Short, + /// "all-match" semantics means that more than one regular expression is + /// being executed and that the matching engine should answer whether each + /// regular expression matches or not. (This is only applicable for regex + /// sets.) 
+ All, + /// "find" semantics means that the matching engine will find the proper + /// leftmost-first match and return the start and end location of that + /// match. Other capture groups are not tracked. + Find, + /// "capture" semantics means that the locations of all captures in the + /// regular expression are tracked. + Capture, +} + +impl MatchStyle { + pub fn match_only(&self) -> bool { + match *self { + MatchStyle::Short | MatchStyle::All => true, + _ => false, + } + } + + pub fn quit_after_first_match(&self) -> bool { + *self == MatchStyle::Short + } +} + +impl<'c, 'm> Params<'c, 'm> { + /// Creates a new set of search parameters. + /// + /// A default set of search parameters results in match-short semantics. + pub fn new(caps: &'c mut [Slot], mats: &'m mut [bool]) -> Params<'c, 'm> { + Params { + captures: caps, + matches: mats, + matched_any: false, + match_short: false, + } + } + + /// Resets the parameters as if no search has happened. + /// + /// This is useful for reusing the same set of parameters for multiple + /// independent searches. + /// + /// This method never changes the match semantics implied by the + /// parameters. + pub fn reset(&mut self) { + for slot in self.captures.iter_mut() { + *slot = None; + } + for m in self.matches.iter_mut() { + *m = false; + } + } + + /// Force match-short semantics. + /// + /// When this is enabled (it's disabled by default), match-short semantics + /// will always be used. + /// + /// Note that if backing storage for captures is provided then some of them + /// may be filled in after a search (but may not represent leftmost-first + /// match locations). + pub fn set_match_short(mut self, yes: bool) -> Params<'c, 'm> { + self.match_short = yes; + self + } + + pub fn alloc_captures(n: usize) -> Vec { + vec![None; 2 * n] + } + + pub fn alloc_matches(n: usize) -> Vec { + vec![false; n] + } + + /// Returns all capture slots. (There are 2 slots for every capture group.) 
+ pub fn captures(&self) -> &[Slot] { + &self.captures + } + + /// Returns mutable capture slots. (There are 2 slots for every capture + /// group.) + pub fn captures_mut(&mut self) -> &mut [Slot] { + &mut self.captures + } + + /// Returns true if one or more regular expressions matched. + pub fn is_match(&self) -> bool { + self.matched_any + } + + /// Returns all matches. + pub fn matches(&self) -> &[bool] { + &self.matches + } + + /// Sets the capture slot at index `i` to the slot given. + /// + /// If `i` does not point to a valid capture slot, then this is a no-op. + #[doc(hidden)] + pub fn set_capture(&mut self, i: usize, slot: Slot) { + if let Some(old_slot) = self.captures.get_mut(i) { + *old_slot = slot; + } + } + + /// Sets the start location of the match. + #[doc(hidden)] + pub fn set_start(&mut self, slot: Slot) { + self.set_capture(0, slot) + } + + /// Set the end location of the match. + #[doc(hidden)] + pub fn set_end(&mut self, slot: Slot) { + self.set_capture(1, slot) + } + + /// Copies the given capture slots into self's capture slots. + #[doc(hidden)] + pub fn copy_captures_from(&mut self, caps: &[Slot]) { + for (slot, val) in self.captures.iter_mut().zip(caps.iter()) { + *slot = *val; + } + } + + /// Indicates that the regular expression at index `i` matches. + #[doc(hidden)] + pub fn set_match(&mut self, i: usize) { + self.matched_any = true; + if let Some(old) = self.matches.get_mut(i) { + *old = true; + } + } + + /// Returns the style of matching implied by these parameters. 
+ #[doc(hidden)] + pub fn style(&self) -> MatchStyle { + if self.match_short { + MatchStyle::Short + } else if self.captures.is_empty() { + if self.matches.len() > 1 { + MatchStyle::All + } else { + MatchStyle::Short + } + } else if self.captures.len() > 2 { + MatchStyle::Capture + } else { + MatchStyle::Find + } + } +} diff --git a/src/pool.rs b/src/pool.rs deleted file mode 100644 index 483c9e6522..0000000000 --- a/src/pool.rs +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -use std::fmt; -use std::ops::{Deref, DerefMut, Drop}; -use std::sync::Mutex; - -/// A very simple memory pool for managing cached state. -/// -/// This was motivated by a singular purpose: reduce the allocation overhead -/// of matching engines. -/// -/// With a pool, the matching engines need to allocate state each time they -/// are invoked. If a regex is used once to check for a match and never again, -/// then this is OK. But if a regex is used many times over, then not -/// re-allocating the engine's state is a huge win. (A regex is commonly -/// used many times, for example, with `find_iter`, `captures_iter` or -/// `replace_all`.) -/// -/// We use inherited mutability and ensure that each thread gets its own -/// state. There is no limit on the number of states that are created. If a -/// thread requests one and one isn't available, a new one is created. -pub struct Pool { - stack: Mutex>, - create: CreateFn, -} - -/// The type of the function used to create resources if none exist. -pub type CreateFn = Box T + Send + Sync>; - -/// A guard the provides access to a value pulled from the pool. 
-#[derive(Debug)] -pub struct PoolGuard<'a, T: 'a> { - pool: &'a Pool, - val: Option, -} - -impl Pool { - /// Create a new pool. - /// - /// When a caller requests a resource from the pool and one does not - /// exist, then `create` is called to allocate a new resource for the - /// caller. - /// - /// It is up to the caller to put the resource back into the pool for - /// future reuse. - /// - /// All resources are created lazily/on-demand. - pub fn new(create: CreateFn) -> Pool { - Pool { - stack: Mutex::new(vec![]), - create: create, - } - } - - /// Request a resource from the pool. - /// - /// If no resources are available, a new one is created. - /// - /// Once the guard is dropped, the resource is returned to the pool. - pub fn get(&self) -> PoolGuard { - let mut stack = self.stack.lock().unwrap(); - match stack.pop() { - None => PoolGuard { pool: self, val: Some((self.create)()) }, - Some(v) => PoolGuard { pool: self, val: Some(v) }, - } - } - - /// Add a resource to the pool. - /// - /// This makes the resource available for use with `get`. 
- fn put(&self, v: T) { - let mut stack = self.stack.lock().unwrap(); - stack.push(v); - } -} - -impl<'a, T> Deref for PoolGuard<'a, T> { - type Target = T; - fn deref(&self) -> &T { self.val.as_ref().unwrap() } -} - -impl<'a, T> DerefMut for PoolGuard<'a, T> { - fn deref_mut(&mut self) -> &mut T { self.val.as_mut().unwrap() } -} - -impl<'a, T> Drop for PoolGuard<'a, T> { - fn drop(&mut self) { - let val = self.val.take().unwrap(); - self.pool.put(val); - } -} - -impl fmt::Debug for Pool { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let stack = self.stack.lock(); - let stack = stack.unwrap(); - stack.fmt(f) - } -} diff --git a/src/prog.rs b/src/prog.rs index f58acc417b..7c8022f394 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -6,12 +6,8 @@ use std::mem; use std::slice; use std::sync::Arc; -use backtrack::BacktrackCache; -use dfa::DfaCache; use input::Char; -use literals::Literals; -use nfa::NfaCache; -use pool::{Pool, PoolGuard}; +use literals::LiteralSearcher; /// InstPtr represents the index of an instruction in a regex program. pub type InstPtr = usize; @@ -57,9 +53,7 @@ pub struct Program { /// Whether the regex must match at the end of the input. pub is_anchored_end: bool, /// A possibly empty machine for very quickly matching prefix literals. - pub prefixes: Literals, - /// Caches for use by the matching engines. - pub cache: EngineCache, + pub prefixes: LiteralSearcher, } impl Program { @@ -79,8 +73,7 @@ impl Program { is_reverse: false, is_anchored_start: false, is_anchored_end: false, - prefixes: Literals::empty(), - cache: EngineCache::new(), + prefixes: LiteralSearcher::empty(), } } @@ -129,21 +122,6 @@ impl Program { self.only_utf8 } - /// Retrieve cached state for NFA execution. - pub fn cache_nfa(&self) -> PoolGuard> { - self.cache.nfa.get() - } - - /// Retrieve cached state for backtracking execution. - pub fn cache_backtrack(&self) -> PoolGuard> { - self.cache.backtrack.get() - } - - /// Retrieve cached state for DFA execution. 
- pub fn cache_dfa(&self) -> PoolGuard> { - self.cache.dfa.get() - } - /// Return the approximate heap usage of this instruction sequence in /// bytes. pub fn approximate_size(&self) -> usize { @@ -180,29 +158,28 @@ impl fmt::Debug for Program { String::from_utf8_lossy(&escaped).into_owned() } - try!(writeln!(f, "--------------------------------")); for (pc, inst) in self.iter().enumerate() { match *inst { Match(slot) => { - try!(writeln!(f, "{:04} Match({:?})", pc, slot)) + try!(write!(f, "{:04} Match({:?})", pc, slot)) } Save(ref inst) => { let s = format!("{:04} Save({})", pc, inst.slot); - try!(writeln!(f, "{}", with_goto(pc, inst.goto, s))); + try!(write!(f, "{}", with_goto(pc, inst.goto, s))); } Split(ref inst) => { - try!(writeln!(f, "{:04} Split({}, {})", - pc, inst.goto1, inst.goto2)); + try!(write!(f, "{:04} Split({}, {})", + pc, inst.goto1, inst.goto2)); } EmptyLook(ref inst) => { let s = format!("{:?}", inst.look); - try!(writeln!(f, "{:04} {}", - pc, with_goto(pc, inst.goto, s))); + try!(write!(f, "{:04} {}", + pc, with_goto(pc, inst.goto, s))); } Char(ref inst) => { let s = format!("{:?}", inst.c); - try!(writeln!(f, "{:04} {}", - pc, with_goto(pc, inst.goto, s))); + try!(write!(f, "{:04} {}", + pc, with_goto(pc, inst.goto, s))); } Ranges(ref inst) => { let ranges = inst.ranges @@ -211,20 +188,23 @@ impl fmt::Debug for Program { .collect::>() .join(", "); let s = format!("{}", ranges); - try!(writeln!(f, "{:04} {}", - pc, with_goto(pc, inst.goto, s))); + try!(write!(f, "{:04} {}", + pc, with_goto(pc, inst.goto, s))); } Bytes(ref inst) => { let s = format!( "Bytes({}, {})", visible_byte(inst.start), visible_byte(inst.end)); - try!(writeln!(f, "{:04} {}", - pc, with_goto(pc, inst.goto, s))); + try!(write!(f, "{:04} {}", + pc, with_goto(pc, inst.goto, s))); } } + if pc == self.start { + try!(write!(f, " (start)")); + } + try!(write!(f, "\n")); } - try!(writeln!(f, "--------------------------------")); Ok(()) } } @@ -235,39 +215,6 @@ impl<'a> IntoIterator 
for &'a Program { fn into_iter(self) -> Self::IntoIter { self.iter() } } -/// EngineCache maintains reusable allocations for each matching engine -/// available to a particular program. -/// -/// The allocations are created lazily, so we don't pay for caches that -/// aren't used. -/// -/// N.B. These are all behind a pointer because it's fewer bytes to memcpy. -/// These caches are pushed/popped from the pool a lot, and a smaller -/// footprint can have an impact on matching small inputs. See, for example, -/// the hard_32 benchmark. -#[derive(Debug)] -pub struct EngineCache { - nfa: Pool>, - backtrack: Pool>, - dfa: Pool>, -} - -impl EngineCache { - fn new() -> Self { - EngineCache { - nfa: Pool::new(Box::new(|| Box::new(NfaCache::new()))), - backtrack: Pool::new(Box::new(|| Box::new(BacktrackCache::new()))), - dfa: Pool::new(Box::new(|| Box::new(DfaCache::new()))), - } - } -} - -impl Clone for EngineCache { - fn clone(&self) -> EngineCache { - EngineCache::new() - } -} - /// Inst is an instruction code in a Regex program. /// /// Regrettably, a regex program either contains Unicode codepoint diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 427cf77011..b20fccad0c 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -18,9 +18,10 @@ use std::sync::Arc; use memchr::memchr; -use exec::{Exec, ExecBuilder, Search, CaptureSlots}; +use exec::{Exec, ExecBuilder}; use expand::expand; use error::Error; +use params::{Params, Slot}; pub use set::RegexSetBytes as RegexSet; pub use set::SetMatchesBytes as SetMatches; @@ -487,6 +488,39 @@ impl Regex { new } + /// Returns the end location of a match in the text given. + /// + /// This method may have the same performance characteristics as + /// `is_match`, except it provides an end location for a match. In + /// particular, the location returned *may be shorter* than the proper end + /// of the leftmost-first match. 
+ /// + /// # Example + /// + /// Typically, `a+` would match the entire first sequence of `a` in some + /// text, but `shortest_match` can give up as soon as it sees the first + /// `a`. + /// + /// ```rust + /// # extern crate regex; use regex::bytes::Regex; + /// # fn main() { + /// let text = b"aaaaa"; + /// let pos = Regex::new(r"a+").unwrap().shortest_match(text); + /// assert_eq!(pos, Some(1)); + /// # } + /// ``` + pub fn shortest_match(&self, text: &[u8]) -> Option { + let mut caps = [None, None]; + let mut _matched = [false]; + let mut params = + Params::new(&mut caps, &mut _matched).set_match_short(true); + if !self.execp(&mut params, text, 0) { + None + } else { + params.captures()[1] + } + } + /// Returns the original string of this regex. pub fn as_str(&self) -> &str { &self.0.regex_strings()[0] @@ -502,10 +536,14 @@ impl Regex { self.0.captures().len() } - fn exec(&self, caps: CaptureSlots, text: &[u8], start: usize) -> bool { + fn exec(&self, caps: &mut [Slot], text: &[u8], start: usize) -> bool { let mut _matches = [false]; - let mut search = Search::new(caps, &mut _matches); - self.0.exec(&mut search, text, start) + let mut params = Params::new(caps, &mut _matches); + self.0.exec(&mut params, text, start) + } + + fn execp(&self, params: &mut Params, text: &[u8], start: usize) -> bool { + self.0.exec(params, text, start) } fn alloc_captures(&self) -> Vec> { diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 40003618f2..5e19495f46 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -19,8 +19,9 @@ use std::sync::Arc; use syntax; -use exec::{CaptureSlots, Exec, ExecBuilder, Search}; +use exec::{Exec, ExecBuilder}; use error::Error; +use params::{Params, Slot}; /// Escapes all regular expression meta characters in `text`. 
/// @@ -121,7 +122,7 @@ pub struct ExNative { #[doc(hidden)] pub groups: &'static &'static [(&'static str, usize)], #[doc(hidden)] - pub prog: fn(CaptureSlots, &str, usize) -> bool, + pub prog: fn(&mut [Option], &str, usize) -> bool, } impl Copy for ExNative {} @@ -339,7 +340,7 @@ impl Regex { /// The `0`th capture group is always unnamed, so it must always be /// accessed with `at(0)` or `[0]`. pub fn captures<'t>(&self, text: &'t str) -> Option> { - let mut caps = self.alloc_captures(); + let mut caps = Params::alloc_captures(self.captures_len()); if !self.exec(&mut caps, text, 0) { None } else { @@ -577,6 +578,39 @@ impl Regex { new } + /// Returns the end location of a match in the text given. + /// + /// This method may have the same performance characteristics as + /// `is_match`, except it provides an end location for a match. In + /// particular, the location returned *may be shorter* than the proper end + /// of the leftmost-first match. + /// + /// # Example + /// + /// Typically, `a+` would match the entire first sequence of `a` in some + /// text, but `shortest_match` can give up as soon as it sees the first + /// `a`. + /// + /// ```rust + /// # extern crate regex; use regex::Regex; + /// # fn main() { + /// let text = "aaaaa"; + /// let pos = Regex::new(r"a+").unwrap().shortest_match(text); + /// assert_eq!(pos, Some(1)); + /// # } + /// ``` + pub fn shortest_match(&self, text: &str) -> Option { + let mut caps = [None, None]; + let mut _matched = [false]; + let mut params = + Params::new(&mut caps, &mut _matched).set_match_short(true); + if !self.execp(&mut params, text, 0) { + None + } else { + params.captures()[1] + } + } + /// Returns the original string of this regex. 
pub fn as_str(&self) -> &str { match self.0 { @@ -603,22 +637,19 @@ impl Regex { } } - fn alloc_captures(&self) -> Vec> { - match self.0 { - _Regex::Native(ref n) => vec![None; 2 * n.names.len()], - _Regex::Dynamic(ref d) => vec![None; 2 * d.captures().len()], - } + fn exec(&self, caps: &mut [Slot], text: &str, start: usize) -> bool { + let mut _matches = [false]; + let mut params = Params::new(caps, &mut _matches); + self.execp(&mut params, text, start) } - fn exec(&self, caps: CaptureSlots, text: &str, start: usize) -> bool { + fn execp(&self, params: &mut Params, text: &str, start: usize) -> bool { match self.0 { _Regex::Native(ExNative { ref prog, .. }) => { - (*prog)(caps, text, start) + (*prog)(params.captures_mut(), text, start) } _Regex::Dynamic(ref exec) => { - let mut _matches = [false]; - let mut search = Search::new(caps, &mut _matches); - exec.exec(&mut search, text.as_bytes(), start) + exec.exec(params, text.as_bytes(), start) } } } @@ -1074,7 +1105,7 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> { return None } - let mut caps = self.re.alloc_captures(); + let mut caps = Params::alloc_captures(self.re.captures_len()); if !self.re.exec(&mut caps, self.text, self.last_end) { return None } diff --git a/src/set.rs b/src/set.rs index 5d3bdfb7b9..e23f44791c 100644 --- a/src/set.rs +++ b/src/set.rs @@ -13,7 +13,8 @@ use std::iter; use std::slice; use std::vec; -use exec::{Exec, ExecBuilder, Search}; +use exec::{Exec, ExecBuilder}; +use params::Params; use Error; macro_rules! 
define_set { @@ -130,9 +131,6 @@ impl $ty { pub fn new(exprs: I) -> Result<$ty, Error> where S: AsRef, I: IntoIterator { let exec = try!($exec_build(exprs)); - if exec.regex_strings().len() < 2 { - return Err(Error::InvalidSet); - } Ok($ty(exec)) } @@ -161,8 +159,8 @@ impl $ty { /// assert!(!set.is_match("☃")); /// ``` pub fn is_match(&self, text: $text_ty) -> bool { - let mut search = Search::new(&mut [], &mut []); - self.0.exec(&mut search, $as_bytes(text), 0) + let mut params = Params::new(&mut [], &mut []); + self.0.exec(&mut params, $as_bytes(text), 0) } /// Returns the set of regular expressions that match in the given text. @@ -202,10 +200,10 @@ impl $ty { /// assert!(matches.matched(6)); /// ``` pub fn matches(&self, text: $text_ty) -> SetMatches { - let mut matches = vec![false; self.0.matches().len()]; + let mut matches = Params::alloc_matches(self.0.matches().len()); let matched_any = { - let mut search = Search::new(&mut [], &mut matches); - self.0.exec(&mut search, $as_bytes(text), 0) + let mut params = Params::new(&mut [], &mut matches); + self.0.exec(&mut params, $as_bytes(text), 0) }; SetMatches { matched_any: matched_any, @@ -310,6 +308,13 @@ impl<'a> Iterator for $ty_set_matches_iter<'a> { } } +#[doc(hidden)] +impl From for $ty { + fn from(exec: Exec) -> Self { + $ty(exec) + } +} + } } diff --git a/src/utf8.rs b/src/utf8.rs index c9b532779f..648a05fd2f 100644 --- a/src/utf8.rs +++ b/src/utf8.rs @@ -23,6 +23,7 @@ const TAG_FOUR: u8 = 0b1111_0000; /// /// If `dst` is not long enough, then `None` is returned. Otherwise, the number /// of bytes written is returned. +#[allow(dead_code)] #[inline] pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option { let code = character as u32; diff --git a/tests/crazy.rs b/tests/crazy.rs index 03ddb2dd95..bed66277e5 100644 --- a/tests/crazy.rs +++ b/tests/crazy.rs @@ -1,3 +1,5 @@ +mat!(ascii_literal, u!(r"a"), "a", Some((0, 1))); + // Some crazy expressions from regular-expressions.info. 
mat!(match_ranges, r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", diff --git a/tests/macros.rs b/tests/macros.rs index 7e73e4fa9e..c3b79e2221 100644 --- a/tests/macros.rs +++ b/tests/macros.rs @@ -18,6 +18,7 @@ macro_rules! mat( let got: Vec> = match r.captures(text) { Some(c) => { assert!(r.is_match(text)); + assert!(r.shortest_match(text).is_some()); c.iter_pos().collect() } None => vec![None], diff --git a/tests/misc.rs b/tests/misc.rs index efd488deef..4fba750359 100644 --- a/tests/misc.rs +++ b/tests/misc.rs @@ -10,6 +10,10 @@ use regex::Regex; +mat!(prefix_literal_match, r"^abc", r"abc", Some((0, 3))); +mat!(prefix_literal_nomatch, r"^abc", r"zabc", None); +mat!(one_literal_edge, r"abc", r"xxxxxab", None); + #[test] fn eq() { assert_eq!(regex!(r"[a-z]+"), Regex::new("[a-z]+").unwrap()); diff --git a/tests/set.rs b/tests/set.rs index a9dd098c1b..c329f2764e 100644 --- a/tests/set.rs +++ b/tests/set.rs @@ -12,9 +12,13 @@ matset!(set11, &[r"[a-z]+$", "foo"], "foo 01234", 1); matset!(set12, &[r".*?", "a"], "zzzzzza", 0, 1); matset!(set13, &[r".*", "a"], "zzzzzza", 0, 1); matset!(set14, &[r".*", "a"], "zzzzzz", 0); +matset!(set15, &[r"\ba\b"], "hello a bye", 0); +matset!(set16, &["a"], "a", 0); +matset!(set17, &[".*a"], "a", 0); nomatset!(nset1, &["a", "a"], "b"); nomatset!(nset2, &["^foo", "bar$"], "bar foo"); +nomatset!(nset3, { let xs: &[&str] = &[]; xs }, "a"); // See: https://github.com/rust-lang-nursery/regex/issues/187 #[test] diff --git a/tests/shortest_match.rs b/tests/shortest_match.rs new file mode 100644 index 0000000000..c964ab9131 --- /dev/null +++ b/tests/shortest_match.rs @@ -0,0 +1,14 @@ +macro_rules! shortmat { + ($name:ident, $re:expr, $text:expr, $shortest_match:expr) => { + #[test] + fn $name() { + let text = text!($text); + let re = regex!($re); + assert_eq!($shortest_match, re.shortest_match(text)); + } + } +} + +shortmat!(t01, r"a+", r"aa", Some(1)); +// Test that the reverse suffix optimization gets it right. 
+shortmat!(t02, r".*(?:abcd)+", r"abcdabcd", Some(4)); diff --git a/tests/suffix_reverse.rs b/tests/suffix_reverse.rs new file mode 100644 index 0000000000..d89143268a --- /dev/null +++ b/tests/suffix_reverse.rs @@ -0,0 +1,17 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +mat!(t01, r".*abcd", r"abcd", Some((0, 4))); +mat!(t02, r".*(?:abcd)+", r"abcd", Some((0, 4))); +mat!(t03, r".*(?:abcd)+", r"abcdabcd", Some((0, 8))); +mat!(t04, r".*(?:abcd)+", r"abcdxabcd", Some((0, 9))); +mat!(t05, r".*x(?:abcd)+", r"abcdxabcd", Some((0, 9))); +mat!(t06, r"[^abcd]*x(?:abcd)+", r"abcdxabcd", Some((4, 9))); +// mat!(t05, r".*(?:abcd)+", r"abcdabcd", Some((0, 4))); diff --git a/tests/test_backtrack.rs b/tests/test_backtrack.rs index 880233096f..f0d6fe79d3 100644 --- a/tests/test_backtrack.rs +++ b/tests/test_backtrack.rs @@ -27,6 +27,22 @@ macro_rules! regex { } } +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re) + .bounded_backtracking() + .build() + .map(|e| e.into_regex_set()) + }} +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + } +} + // Must come before other module definitions. include!("macros_str.rs"); include!("macros.rs"); @@ -40,4 +56,6 @@ mod multiline; mod noparse; mod regression; mod replace; +mod set; +mod suffix_reverse; mod unicode; diff --git a/tests/test_backtrack_bytes.rs b/tests/test_backtrack_bytes.rs index 8b15f79ef0..306447f730 100644 --- a/tests/test_backtrack_bytes.rs +++ b/tests/test_backtrack_bytes.rs @@ -28,6 +28,23 @@ macro_rules! regex { } } +macro_rules! 
regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re) + .bounded_backtracking() + .only_utf8(false) + .build() + .map(|e| e.into_byte_regex_set()) + }} +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + } +} + // Must come before other module definitions. include!("macros_bytes.rs"); include!("macros.rs"); @@ -41,4 +58,6 @@ mod multiline; mod noparse; mod regression; mod replace; +mod set; +mod suffix_reverse; mod unicode; diff --git a/tests/test_backtrack_utf8bytes.rs b/tests/test_backtrack_utf8bytes.rs index 12ed55d364..8667eaa1be 100644 --- a/tests/test_backtrack_utf8bytes.rs +++ b/tests/test_backtrack_utf8bytes.rs @@ -27,6 +27,23 @@ macro_rules! regex { } } +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re) + .bounded_backtracking() + .bytes(true) + .build() + .map(|e| e.into_regex_set()) + }} +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + } +} + // Must come before other module definitions. include!("macros_str.rs"); include!("macros.rs"); @@ -40,4 +57,6 @@ mod multiline; mod noparse; mod regression; mod replace; +mod set; +mod suffix_reverse; mod unicode; diff --git a/tests/test_default.rs b/tests/test_default.rs index 40dbe9e42e..fb2c1013eb 100644 --- a/tests/test_default.rs +++ b/tests/test_default.rs @@ -27,6 +27,12 @@ macro_rules! regex_new { }} } +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + } +} + macro_rules! regex_set_new { ($re:expr) => {{ use regex::RegexSet; @@ -34,12 +40,6 @@ macro_rules! regex_set_new { }} } -macro_rules! regex { - ($re:expr) => { - regex_new!($re).unwrap() - } -} - macro_rules! 
regex_set { ($res:expr) => { regex_set_new!($res).unwrap() @@ -60,34 +60,17 @@ mod api_str; mod crazy; mod flags; mod fowler; +mod misc; mod multiline; mod noparse; mod regression; mod replace; mod searcher; mod set; +mod shortest_match; +mod suffix_reverse; mod unicode; -#[test] -fn set_empty() { - use regex::{Error, RegexSet}; - let err = RegexSet::new::<&[String], &String>(&[]).unwrap_err(); - match err { - Error::InvalidSet => {} - err => panic!("expected Error::InvalidSet but got {:?}", err), - } -} - -#[test] -fn set_one() { - use regex::{Error, RegexSet}; - let err = RegexSet::new(&["foo"]).unwrap_err(); - match err { - Error::InvalidSet => {} - err => panic!("expected Error::InvalidSet but got {:?}", err), - } -} - #[test] fn disallow_unicode_flag() { assert!(regex::Regex::new("(?-u)a").is_err()); diff --git a/tests/test_default_bytes.rs b/tests/test_default_bytes.rs index 9f8dc5701b..da85672c3a 100644 --- a/tests/test_default_bytes.rs +++ b/tests/test_default_bytes.rs @@ -51,4 +51,6 @@ mod noparse; mod regression; mod replace; mod set; +mod shortest_match; +mod suffix_reverse; mod unicode; diff --git a/tests/test_nfa.rs b/tests/test_nfa.rs index 15686cd6fb..2b016032d0 100644 --- a/tests/test_nfa.rs +++ b/tests/test_nfa.rs @@ -26,6 +26,19 @@ macro_rules! regex { } } +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re).nfa().build().map(|e| e.into_regex_set()) + }} +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + } +} + // Must come before other module definitions. include!("macros_str.rs"); include!("macros.rs"); @@ -39,4 +52,6 @@ mod multiline; mod noparse; mod regression; mod replace; +mod set; +mod suffix_reverse; mod unicode; diff --git a/tests/test_nfa_bytes.rs b/tests/test_nfa_bytes.rs index 2f0cb52f0f..265b2d033b 100644 --- a/tests/test_nfa_bytes.rs +++ b/tests/test_nfa_bytes.rs @@ -26,6 +26,20 @@ macro_rules! regex { } } +macro_rules! 
regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re) + .nfa().only_utf8(false).build().map(|e| e.into_byte_regex_set()) + }} +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + } +} + // Must come before other module definitions. include!("macros_bytes.rs"); include!("macros.rs"); @@ -39,4 +53,6 @@ mod multiline; mod noparse; mod regression; mod replace; +mod set; +mod suffix_reverse; mod unicode; diff --git a/tests/test_nfa_utf8bytes.rs b/tests/test_nfa_utf8bytes.rs index d8c45dc7a9..f1c72cff31 100644 --- a/tests/test_nfa_utf8bytes.rs +++ b/tests/test_nfa_utf8bytes.rs @@ -26,6 +26,20 @@ macro_rules! regex { } } +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re) + .nfa().bytes(true).build().map(|e| e.into_regex_set()) + }} +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + } +} + // Must come before other module definitions. include!("macros_str.rs"); include!("macros.rs"); @@ -39,4 +53,6 @@ mod multiline; mod noparse; mod regression; mod replace; +mod set; +mod suffix_reverse; mod unicode; diff --git a/tests/test_plugin.rs b/tests/test_plugin.rs index c58c5a38e6..b4bc973433 100644 --- a/tests/test_plugin.rs +++ b/tests/test_plugin.rs @@ -27,4 +27,5 @@ mod fowler; mod multiline; mod plugin; mod replace; +mod suffix_reverse; mod unicode;