From 2875ce9c1cc42ef1e4b552127693dfdb8365a393 Mon Sep 17 00:00:00 2001 From: Ildar Shaimordanov Date: Sat, 3 Jul 2021 17:05:23 +0300 Subject: [PATCH 01/18] Add option `-B`, `-E`, `-X` for disabling anchors `^` and `$` --- src/main.rs | 30 ++++++++++++++++++++++++++++++ src/regexp/builder.rs | 7 +++++++ src/regexp/config.rs | 4 ++++ src/regexp/regexp.rs | 14 +++++++++++--- 4 files changed, 52 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index 338d01d..1d29be5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -187,6 +187,33 @@ struct Cli { )] is_output_colorized: bool, + #[structopt( + name = "no-match-beginning", + short = "B", + long, + help = "Don't match the beginning of the string (don't prepend \"^\")", + display_order = 14 + )] + no_match_begin: bool, + + #[structopt( + name = "no-match-end", + short = "E", + long, + help = "Don't match the end of the string (don't append \"$\")", + display_order = 15 + )] + no_match_end: bool, + + #[structopt( + name = "no-match-line", + short = "X", + long, + help = "Don't match the whole string (as a shorthand for -B -E)", + display_order = 16 + )] + no_match_line: bool, + // -------------------- // OPTIONS // -------------------- @@ -310,6 +337,9 @@ fn handle_input(cli: &Cli, input: Result, Error>) { } builder + .with_line_borders( + ! cli.no_match_begin && ! cli.no_match_line + , ! cli.no_match_end && ! cli.no_match_line) .with_minimum_repetitions(cli.minimum_repetitions) .with_minimum_substring_length(cli.minimum_substring_length); diff --git a/src/regexp/builder.rs b/src/regexp/builder.rs index 99a8abb..bb2e000 100644 --- a/src/regexp/builder.rs +++ b/src/regexp/builder.rs @@ -143,6 +143,13 @@ impl RegExpBuilder { self } + /// Tells `RegExpBuilder` to concatenate the resulting regular expression with the "^" and "$" anchors. 
+ pub fn with_line_borders(&mut self, match_begin: bool, match_end: bool) -> &mut Self { + self.config.is_match_begin = match_begin; + self.config.is_match_end = match_end; + self + } + /// Builds the actual regular expression using the previously given settings. /// Every generated regular expression is surrounded by the anchors `^` and `$` /// so that substrings not being part of the test cases are not matched accidentally. diff --git a/src/regexp/config.rs b/src/regexp/config.rs index 95ff71e..2d88aa4 100644 --- a/src/regexp/config.rs +++ b/src/regexp/config.rs @@ -25,6 +25,8 @@ pub struct RegExpConfig { pub(crate) is_astral_code_point_converted_to_surrogate: bool, pub(crate) is_verbose_mode_enabled: bool, pub(crate) is_output_colorized: bool, + pub(crate) is_match_begin: bool, + pub(crate) is_match_end: bool, } impl RegExpConfig { @@ -37,6 +39,8 @@ impl RegExpConfig { is_astral_code_point_converted_to_surrogate: false, is_verbose_mode_enabled: false, is_output_colorized: false, + is_match_begin: true, + is_match_end: true, } } diff --git a/src/regexp/regexp.rs b/src/regexp/regexp.rs index a2b0d4d..d0c499b 100644 --- a/src/regexp/regexp.rs +++ b/src/regexp/regexp.rs @@ -87,8 +87,16 @@ impl Display for RegExp { } else { String::new() }; - let caret = Component::Caret.to_repr(self.config.is_output_colorized); - let dollar_sign = Component::DollarSign.to_repr(self.config.is_output_colorized); + let caret = if self.config.is_match_begin { + Component::Caret.to_repr(self.config.is_output_colorized) + } else { + "".to_string() + }; + let dollar_sign = if self.config.is_match_end { + Component::DollarSign.to_repr(self.config.is_output_colorized) + } else { + "".to_string() + }; let mut regexp = match self.ast { Expression::Alternation(_, _) => { format!( @@ -238,7 +246,7 @@ fn apply_verbose_mode(regexp: String, config: &RegExpConfig) -> String { }; let mut verbose_regexp = vec![verbose_mode_flag]; - let mut nesting_level = 0; + let mut nesting_level = if 
config.is_match_begin { 0 } else { 1 }; let regexp_with_replacements = regexp .replace( From 55551c870c83e6e90efea9a86178cf7802c7c367 Mon Sep 17 00:00:00 2001 From: Ildar Shaimordanov Date: Sat, 3 Jul 2021 17:06:09 +0300 Subject: [PATCH 02/18] Add tests for testing options `-B`, `-E`, `-X` --- tests/cli_integration_tests.rs | 77 +++++++++++++++++++++++++++++ tests/lib_integration_tests.rs | 89 ++++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+) diff --git a/tests/cli_integration_tests.rs b/tests/cli_integration_tests.rs index e810208..fccc586 100644 --- a/tests/cli_integration_tests.rs +++ b/tests/cli_integration_tests.rs @@ -23,6 +23,83 @@ use tempfile::NamedTempFile; const TEST_CASE: &str = "I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."; +mod anchor_conversion { + use super::*; + + mod no_verbose { + use super::*; + + #[test] + fn succeeds_with_no_match_beginning_option() { + let mut grex = init_command(); + grex.args(&["--no-match-beginning", TEST_CASE]); + grex.assert() + .success() + .stdout(predicate::eq("I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩\\.$\n")); + } + + #[test] + fn succeeds_with_no_match_end_option() { + let mut grex = init_command(); + grex.args(&["--no-match-end", TEST_CASE]); + grex.assert() + .success() + .stdout(predicate::eq("^I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩\\.\n")); + } + + #[test] + fn succeeds_with_no_match_line_option() { + let mut grex = init_command(); + grex.args(&["--no-match-line", TEST_CASE]); + grex.assert() + .success() + .stdout(predicate::eq("I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩\\.\n")); + } + } + + mod verbose { + use super::*; + + #[test] + fn succeeds_with_verbose_mode_and_no_match_beginning_option() { + let mut grex = init_command(); + grex.args(&["--verbose", "--no-match-beginning", TEST_CASE]); + grex.assert().success().stdout(predicate::eq(indoc!( + r#" + (?x) + I\ \ \ ♥♥♥\ 36\ and\ ٣\ and\ y̆y̆\ and\ 💩💩\. 
+ $ + "#, + ))); + } + + #[test] + fn succeeds_with_verbose_mode_and_no_match_end_option() { + let mut grex = init_command(); + grex.args(&["--verbose", "--no-match-end", TEST_CASE]); + grex.assert().success().stdout(predicate::eq(indoc!( + r#" + (?x) + ^ + I\ \ \ ♥♥♥\ 36\ and\ ٣\ and\ y̆y̆\ and\ 💩💩\. + "#, + ))); + } + + #[test] + fn succeeds_with_verbose_mode_and_no_match_line_option() { + let mut grex = init_command(); + grex.args(&["--verbose", "--no-match-line", TEST_CASE]); + grex.assert().success().stdout(predicate::eq(indoc!( + r#" + (?x) + I\ \ \ ♥♥♥\ 36\ and\ ٣\ and\ y̆y̆\ and\ 💩💩\. + "#, + ))); + } + } +} + mod no_conversion { use super::*; diff --git a/tests/lib_integration_tests.rs b/tests/lib_integration_tests.rs index 1ce6133..cca3ed2 100644 --- a/tests/lib_integration_tests.rs +++ b/tests/lib_integration_tests.rs @@ -21,6 +21,95 @@ use rstest::rstest; use std::io::Write; use tempfile::NamedTempFile; +mod anchor_conversion { + use super::*; + + mod no_verbose { + use super::*; + + #[rstest(test_cases, expected_output, + case(vec!["My ♥♥♥ and 💩💩 is yours."], "My ♥♥♥ and 💩💩 is yours\\.$"), + )] + fn succeeds_with_no_match_beginning_option(test_cases: Vec<&str>, expected_output: &str) { + let regexp = RegExpBuilder::from(&test_cases) + .with_line_borders(false, true) + .build(); + assert_that_regexp_is_correct(regexp, expected_output, &test_cases); + } + + #[rstest(test_cases, expected_output, + case(vec!["My ♥♥♥ and 💩💩 is yours."], "^My ♥♥♥ and 💩💩 is yours\\."), + )] + fn succeeds_with_no_match_end_option(test_cases: Vec<&str>, expected_output: &str) { + let regexp = RegExpBuilder::from(&test_cases) + .with_line_borders(true, false) + .build(); + assert_that_regexp_is_correct(regexp, expected_output, &test_cases); + } + + #[rstest(test_cases, expected_output, + case(vec!["My ♥♥♥ and 💩💩 is yours."], "My ♥♥♥ and 💩💩 is yours\\."), + )] + fn succeeds_with_no_match_line_option(test_cases: Vec<&str>, expected_output: &str) { + let regexp = 
RegExpBuilder::from(&test_cases) + .with_line_borders(false, false) + .build(); + assert_that_regexp_is_correct(regexp, expected_output, &test_cases); + } + } + + mod verbose { + use super::*; + + #[rstest(test_cases, expected_output, + case(vec!["My ♥♥♥ and 💩💩 is yours."], indoc!( + r#" + (?x) + My\ ♥♥♥\ and\ 💩💩\ is\ yours\. + $"# + )) + )] + fn succeeds_with_verbose_and_no_match_beginning_option(test_cases: Vec<&str>, expected_output: &str) { + let regexp = RegExpBuilder::from(&test_cases) + .with_line_borders(false, true) + .with_verbose_mode() + .build(); + assert_that_regexp_is_correct(regexp, expected_output, &test_cases); + } + + #[rstest(test_cases, expected_output, + case(vec!["My ♥♥♥ and 💩💩 is yours."], indoc!( + r#" + (?x) + ^ + My\ ♥♥♥\ and\ 💩💩\ is\ yours\."# + )) + )] + fn succeeds_with_verbose_and_no_match_end_option(test_cases: Vec<&str>, expected_output: &str) { + let regexp = RegExpBuilder::from(&test_cases) + .with_line_borders(true, false) + .with_verbose_mode() + .build(); + assert_that_regexp_is_correct(regexp, expected_output, &test_cases); + } + + #[rstest(test_cases, expected_output, + case(vec!["My ♥♥♥ and 💩💩 is yours."], indoc!( + r#" + (?x) + My\ ♥♥♥\ and\ 💩💩\ is\ yours\."# + )) + )] + fn succeeds_with_verbose_and_no_match_line_option(test_cases: Vec<&str>, expected_output: &str) { + let regexp = RegExpBuilder::from(&test_cases) + .with_line_borders(false, false) + .with_verbose_mode() + .build(); + assert_that_regexp_is_correct(regexp, expected_output, &test_cases); + } + } +} + mod no_conversion { use super::*; From 00975004f8665044f2758d5e4dddb6a54bd46e97 Mon Sep 17 00:00:00 2001 From: Ildar Shaimordanov Date: Mon, 5 Jul 2021 15:44:21 +0300 Subject: [PATCH 03/18] update tests/property_tests.rs --- tests/property_tests.rs | 162 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) diff --git a/tests/property_tests.rs b/tests/property_tests.rs index c8ec407..de8034f 100644 --- a/tests/property_tests.rs +++ 
b/tests/property_tests.rs @@ -121,6 +121,81 @@ proptest! { prop_assert!(compile_regexp(&regexp).is_ok()); } + #[test] + #[ignore] + fn valid_regexes_with_no_match_beginning( + test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + ) { + let test_cases_vec = test_cases.iter().cloned().collect::<Vec<_>>(); + let regexp = RegExpBuilder::from(&test_cases_vec) + .with_line_borders(false, true) + .build(); + prop_assert!(compile_regexp(&regexp).is_ok()); + } + + #[test] + #[ignore] + fn valid_regexes_with_no_match_beginning_and_verbose_mode( + test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + ) { + let test_cases_vec = test_cases.iter().cloned().collect::<Vec<_>>(); + let regexp = RegExpBuilder::from(&test_cases_vec) + .with_line_borders(false, true) + .with_verbose_mode() + .build(); + prop_assert!(compile_regexp(&regexp).is_ok()); + } + + #[test] + #[ignore] + fn valid_regexes_with_no_match_end( + test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + ) { + let test_cases_vec = test_cases.iter().cloned().collect::<Vec<_>>(); + let regexp = RegExpBuilder::from(&test_cases_vec) + .with_line_borders(true, false) + .build(); + prop_assert!(compile_regexp(&regexp).is_ok()); + } + + #[test] + #[ignore] + fn valid_regexes_with_no_match_end_and_verbose_mode( + test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + ) { + let test_cases_vec = test_cases.iter().cloned().collect::<Vec<_>>(); + let regexp = RegExpBuilder::from(&test_cases_vec) + .with_line_borders(true, false) + .with_verbose_mode() + .build(); + prop_assert!(compile_regexp(&regexp).is_ok()); + } + + #[test] + #[ignore] + fn valid_regexes_with_no_match_line( + test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + ) { + let test_cases_vec = test_cases.iter().cloned().collect::<Vec<_>>(); + let regexp = RegExpBuilder::from(&test_cases_vec) + .with_line_borders(false, false) + .build(); + prop_assert!(compile_regexp(&regexp).is_ok()); + } + + #[test] + #[ignore] + fn valid_regexes_with_no_match_line_and_verbose_mode( + test_cases 
in prop::collection::hash_set(".{1,20}", 1..=10) + ) { + let test_cases_vec = test_cases.iter().cloned().collect::<Vec<_>>(); + let regexp = RegExpBuilder::from(&test_cases_vec) + .with_line_borders(false, false) + .with_verbose_mode() + .build(); + prop_assert!(compile_regexp(&regexp).is_ok()); + } + #[test] #[ignore] fn matching_regexes_with_default_settings( @@ -266,6 +341,93 @@ proptest! { } } } + + #[test] + #[ignore] + fn matching_regexes_with_no_match_beginning( + test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + ) { + let test_cases_vec = test_cases.iter().cloned().collect::<Vec<_>>(); + let regexp = RegExpBuilder::from(&test_cases_vec) + .with_line_borders(false, true) + .build(); + if let Ok(compiled_regexp) = compile_regexp(&regexp) { + prop_assert!(test_cases.iter().all(|test_case| compiled_regexp.is_match(&test_case))); + } + } + + #[test] + #[ignore] + fn matching_regexes_with_no_match_beginning_and_verbose_mode( + test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + ) { + let test_cases_vec = test_cases.iter().cloned().collect::<Vec<_>>(); + let regexp = RegExpBuilder::from(&test_cases_vec) + .with_line_borders(false, true) + .with_verbose_mode() + .build(); + if let Ok(compiled_regexp) = compile_regexp(&regexp) { + prop_assert!(test_cases.iter().all(|test_case| compiled_regexp.is_match(&test_case))); + } + } + + #[test] + #[ignore] + fn matching_regexes_with_no_match_end( + test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + ) { + let test_cases_vec = test_cases.iter().cloned().collect::<Vec<_>>(); + let regexp = RegExpBuilder::from(&test_cases_vec) + .with_line_borders(true, false) + .build(); + if let Ok(compiled_regexp) = compile_regexp(&regexp) { + prop_assert!(test_cases.iter().all(|test_case| compiled_regexp.is_match(&test_case))); + } + } + + #[test] + #[ignore] + fn matching_regexes_with_no_match_end_and_verbose_mode( + test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + ) { + let test_cases_vec = test_cases.iter().cloned().collect::<Vec<_>>(); + 
let regexp = RegExpBuilder::from(&test_cases_vec) + .with_line_borders(true, false) + .with_verbose_mode() + .build(); + if let Ok(compiled_regexp) = compile_regexp(&regexp) { + prop_assert!(test_cases.iter().all(|test_case| compiled_regexp.is_match(&test_case))); + } + } + + #[test] + #[ignore] + fn matching_regexes_with_no_match_line( + test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + ) { + let test_cases_vec = test_cases.iter().cloned().collect::<Vec<_>>(); + let regexp = RegExpBuilder::from(&test_cases_vec) + .with_line_borders(false, false) + .build(); + if let Ok(compiled_regexp) = compile_regexp(&regexp) { + prop_assert!(test_cases.iter().all(|test_case| compiled_regexp.is_match(&test_case))); + } + } + + #[test] + #[ignore] + fn matching_regexes_with_no_match_line_and_verbose_mode( + test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + ) { + let test_cases_vec = test_cases.iter().cloned().collect::<Vec<_>>(); + let regexp = RegExpBuilder::from(&test_cases_vec) + .with_line_borders(false, false) + .with_verbose_mode() + .build(); + if let Ok(compiled_regexp) = compile_regexp(&regexp) { + prop_assert!(test_cases.iter().all(|test_case| compiled_regexp.is_match(&test_case))); + } + } } fn conversion_feature_strategy() -> impl Strategy { From c2754ed0718ee8a69409d9abe55727e0a45b04b4 Mon Sep 17 00:00:00 2001 From: Ildar Shaimordanov Date: Mon, 5 Jul 2021 15:45:37 +0300 Subject: [PATCH 04/18] Add feature description to src/lib.rs --- src/lib.rs | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index cc5883c..bdad8ac 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -231,6 +231,46 @@ //! )); //! ``` //! +//! ### 4.8 Optional anchors +//! +//! The regular expressions generated by *grex* are strict and always +//! stick to edges of the line by the anchors "^" and "$", meaning the +//! beginning and the end of the line, respectively. It's possible to +//! avoid printing one or both anchors. +//! +//! 
### 4.8.1. Optional "^" anchor - don't stick to the left edge +//! +//! ``` +//! use grex::RegExpBuilder; +//! +//! let regexp = RegExpBuilder::from(&["abc"]) +//! .with_line_borders(false, true) +//! .build(); +//! assert_eq!(regexp, "abc$"); +//! ``` +//! +//! ### 4.8.2. Optional "$" anchor - don't stick to the right edge +//! +//! ``` +//! use grex::RegExpBuilder; +//! +//! let regexp = RegExpBuilder::from(&["abc"]) +//! .with_line_borders(true, false) +//! .build(); +//! assert_eq!(regexp, "^abc"); +//! ``` +//! +//! ### 4.8.3. Optional "^" and "$" anchors - don't stick to both edges +//! +//! ``` +//! use grex::RegExpBuilder; +//! +//! let regexp = RegExpBuilder::from(&["abc"]) +//! .with_line_borders(false, false) +//! .build(); +//! assert_eq!(regexp, "abc"); +//! ``` +//! //! ### 5. How does it work? //! //! 1. A [deterministic finite automaton](https://en.wikipedia.org/wiki/Deterministic_finite_automaton) (DFA) From 7800056c64a53328293c0b8ad72ae5f629385383 Mon Sep 17 00:00:00 2001 From: "Peter M. Stahl" Date: Thu, 8 Jul 2021 13:15:28 +0200 Subject: [PATCH 05/18] Update dependencies --- Cargo.lock | 85 ++++++++++++++++++++++++++++-------------------------- Cargo.toml | 20 ++++++------- 2 files changed, 54 insertions(+), 51 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 801ef67..3325b29 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,10 +1,12 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
+version = 3 + [[package]] name = "aho-corasick" -version = "0.7.15" +version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" dependencies = [ "memchr", ] @@ -20,9 +22,9 @@ dependencies = [ [[package]] name = "assert_cmd" -version = "1.0.3" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2475b58cd94eb4f70159f4fd8844ba3b807532fe3131b3373fae060bbe30396" +checksum = "3d20831bd004dda4c7c372c19cdabff369f794a95e955b3f13fe460e3e1ae95f" dependencies = [ "bstr", "doc-comment", @@ -109,10 +111,10 @@ dependencies = [ ] [[package]] -name = "difference" -version = "2.0.0" +name = "difflib" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "524cbf6897b527295dff137cec09ecf3a05f4fddffd7dfcd1585403449e74198" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" [[package]] name = "doc-comment" @@ -128,9 +130,9 @@ checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" [[package]] name = "fixedbitset" -version = "0.2.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37ab347416e802de484e4d03c7316c48f1ecb56574dfd4a46a80f173ce1de04d" +checksum = "398ea4fabe40b9b0d885340a2a991a44c8a645624075ad966d21f88688e2b69e" [[package]] name = "float-cmp" @@ -160,7 +162,7 @@ dependencies = [ [[package]] name = "grex" -version = "1.2.0" +version = "1.3.0" dependencies = [ "assert_cmd", "indoc", @@ -181,9 +183,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.9.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" +checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" 
[[package]] name = "heck" @@ -205,9 +207,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.6.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb1fa934250de4de8aef298d81c729a7d33d8c239daa3a7575e6b92bfc7313b" +checksum = "bc633605454125dec4b66843673f01c7df2b89479b32e0ed634e43a91cff62a5" dependencies = [ "autocfg", "hashbrown", @@ -224,9 +226,9 @@ dependencies = [ [[package]] name = "itertools" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319" +checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf" dependencies = [ "either", ] @@ -260,15 +262,15 @@ dependencies = [ [[package]] name = "memchr" -version = "2.3.4" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" +checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" [[package]] name = "ndarray" -version = "0.15.0" +version = "0.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "073c7c76f7b90654996f08db92290e9f300d11de0634493d6f1c4fd11d8a1583" +checksum = "08e854964160a323e65baa19a0b1a027f76d590faba01f05c0cbc3187221a8c9" dependencies = [ "matrixmultiply", "num-complex", @@ -322,9 +324,9 @@ dependencies = [ [[package]] name = "petgraph" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "467d164a6de56270bd7c4d070df81d07beace25012d5103ced4e9ff08d6afdb7" +checksum = "4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f" dependencies = [ "fixedbitset", "indexmap", @@ -338,12 +340,13 @@ checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" [[package]] name = "predicates" -version = "1.0.7" +version = "2.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "eeb433456c1a57cc93554dea3ce40b4c19c4057e41c55d4a0f3d84ea71c325aa" +checksum = "c6e46ca79eb4e21e2ec14430340c71250ab69332abf85521c95d3a8bc336aa76" dependencies = [ - "difference", + "difflib", "float-cmp", + "itertools", "normalize-line-endings", "predicates-core", "regex", @@ -391,9 +394,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.24" +version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71" +checksum = "f0d8caf72986c1a598726adc988bb5984792ef84f5ee5aa50209145ee8077038" dependencies = [ "unicode-xid", ] @@ -505,9 +508,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.4.5" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "957056ecddbeba1b26965114e191d2e8589ce74db242b6ea25fc4062427a5c19" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" dependencies = [ "aho-corasick", "memchr", @@ -525,9 +528,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.22" +version = "0.6.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5eb417147ba9860a96cfe72a0b93bf88fee1744b5636ec99ab20c1aa9376581" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" [[package]] name = "remove_dir_all" @@ -540,9 +543,9 @@ dependencies = [ [[package]] name = "rstest" -version = "0.7.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5056bc1e7cfd438570e8292ef9512774b1d0afc8a50d683fda0ebe74f6233cc6" +checksum = "041bb0202c14f6a158bbbf086afb03d0c6e975c2dec7d4912f8061ed44f290af" dependencies = [ "cfg-if", "proc-macro2", @@ -598,9 +601,9 @@ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" [[package]] name = "structopt" -version = "0.3.21" +version = "0.3.22" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5277acd7ee46e63e5168a80734c9f6ee81b1367a7d8772a2d765df2a3705d28c" +checksum = "69b041cdcb67226aca307e6e7be44c8806423d83e018bd662360a93dabce4d71" dependencies = [ "clap", "lazy_static", @@ -609,9 +612,9 @@ dependencies = [ [[package]] name = "structopt-derive" -version = "0.4.14" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ba9cdfda491b814720b6b06e0cac513d922fc407582032e8706e9f137976f90" +checksum = "7813934aecf5f51a54775e00068c237de98489463968231a51746bbbc03f9c10" dependencies = [ "heck", "proc-macro-error", @@ -622,9 +625,9 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.65" +version = "1.0.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3a1d708c221c5a612956ef9f75b37e454e88d1f7b899fbd3a18d4252012d663" +checksum = "f71489ff30030d2ae598524f61326b902466f72a0fb1a8564c001cc63425bcc7" dependencies = [ "proc-macro2", "quote", @@ -710,9 +713,9 @@ dependencies = [ [[package]] name = "unicode-segmentation" -version = "1.7.1" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb0d2e7be6ae3a5fa87eed5fb451aff96f2573d2694942e40543ae0bbe19c796" +checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" [[package]] name = "unicode-width" diff --git a/Cargo.toml b/Cargo.toml index 1ea0649..1af1b98 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ [package] name = "grex" -version = "1.2.0" +version = "1.3.0" authors = ["Peter M. Stahl "] description = """ grex generates regular expressions from user-provided test cases. 
@@ -29,20 +29,20 @@ categories = ["command-line-utilities"] keywords = ["pattern", "regex", "regexp"] [dependencies] -itertools = "0.10.0" +itertools = "0.10.1" lazy_static = "1.4.0" -ndarray = "0.15.0" -petgraph = {version = "0.5.1", default-features = false, features = ["stable_graph"]} -regex = "1.4.5" -structopt = "0.3.21" +ndarray = "0.15.3" +petgraph = {version = "0.6.0", default-features = false, features = ["stable_graph"]} +regex = "1.5.4" +structopt = "0.3.22" unic-char-range = "0.9.0" unic-ucd-category = "0.9.0" -unicode-segmentation = "1.7.1" +unicode-segmentation = "1.8.0" [dev-dependencies] -assert_cmd = "1.0.3" +assert_cmd = "1.0.7" indoc = "1.0.3" -predicates = "1.0.7" +predicates = "2.0.0" proptest = "1.0.0" -rstest = "0.7.0" +rstest = "0.10.0" tempfile = "3.2.0" From fc582a9cb543157f3fe0c239c74edc551328e8fd Mon Sep 17 00:00:00 2001 From: "Peter M. Stahl" Date: Sun, 25 Jul 2021 23:49:46 +0200 Subject: [PATCH 06/18] Refactor anchor handling (#30 #43) --- src/lib.rs | 40 ------- src/main.rs | 63 +++++++---- src/regexp/builder.rs | 33 ++++-- src/regexp/config.rs | 8 +- src/regexp/regexp.rs | 18 ++-- tests/cli_integration_tests.rs | 154 +++++++++++++-------------- tests/lib_integration_tests.rs | 185 ++++++++++++++++---------------- tests/property_tests.rs | 186 +++++---------------------------- 8 files changed, 282 insertions(+), 405 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index bdad8ac..cc5883c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -231,46 +231,6 @@ //! )); //! ``` //! -//! ### 4.8 Optional anchors -//! -//! The regular expressions generated by *grex* are strict and always -//! stick to edges of the line by the anchors "^" and "$", meaning the -//! beginning and the end of the line, respectively. It's possible to -//! avoid printing one or both anchors. -//! -//! ### 4.8.1. Optional "^" anchor - don't stick to the left edge -//! -//! ``` -//! use grex::RegExpBuilder; -//! -//! let regexp = RegExpBuilder::from(&["abc"]) -//! 
.with_line_borders(false, true) -//! .build(); -//! assert_eq!(regexp, "abc$"); -//! ``` -//! -//! ### 4.8.2. Optional "$" anchor - don't stick to the right edge -//! -//! ``` -//! use grex::RegExpBuilder; -//! -//! let regexp = RegExpBuilder::from(&["abc"]) -//! .with_line_borders(true, false) -//! .build(); -//! assert_eq!(regexp, "^abc"); -//! ``` -//! -//! ### 4.8.3. Optional "^" and "$" anchors - don't stick to both edges -//! -//! ``` -//! use grex::RegExpBuilder; -//! -//! let regexp = RegExpBuilder::from(&["abc"]) -//! .with_line_borders(false, false) -//! .build(); -//! assert_eq!(regexp, "abc"); -//! ``` -//! //! ### 5. How does it work? //! //! 1. A [deterministic finite automaton](https://en.wikipedia.org/wiki/Deterministic_finite_automaton) (DFA) diff --git a/src/main.rs b/src/main.rs index 1d29be5..695616b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -16,6 +16,7 @@ use grex::{Feature, RegExpBuilder}; use itertools::Itertools; +use regex::Regex; use std::io::{Error, ErrorKind}; use std::path::PathBuf; use structopt::clap::AppSettings::{AllowLeadingHyphen, ColoredHelp}; @@ -179,40 +180,55 @@ struct Cli { is_verbose_mode_enabled: bool, #[structopt( - name = "colorize", - short, + name = "no-start-anchor", long, - help = "Provides syntax highlighting for the resulting regular expression", + help = "Removes caret anchor '^' from resulting regular expression", + long_help = "Removes caret anchor '^' from resulting regular expression.\n\n\ + By default, the caret anchor is added to every generated regular\n\ + expression which guarantees that the expression matches the test cases\n\ + given as input only at the start of a string.\n\ + This flag removes the anchor, thereby allowing to match the test cases also\n\ + when they do not occur at the start of a string.", display_order = 13 )] - is_output_colorized: bool, + is_caret_anchor_disabled: bool, #[structopt( - name = "no-match-beginning", - short = "B", + name = "no-end-anchor", long, - help = "Don't 
match the beginning of the string (don't prepend \"^\")", + help = "Removes dollar sign anchor '$' from resulting regular expression", + long_help = "Removes dollar sign anchor '$' from resulting regular expression.\n\n\ + By default, the dollar sign anchor is added to every generated regular\n\ + expression which guarantees that the expression matches the test cases\n\ + given as input only at the end of a string.\n\ + This flag removes the anchor, thereby allowing to match the test cases also\n\ + when they do not occur at the end of a string.", display_order = 14 )] - no_match_begin: bool, + is_dollar_sign_anchor_disabled: bool, #[structopt( - name = "no-match-end", - short = "E", + name = "no-anchors", long, - help = "Don't match the end of the string (don't append \"$\")", + help = "Removes caret and dollar sign anchors from resulting regular expression", + long_help = "Removes caret and dollar sign anchors from resulting regular expression.\n\n\ + By default, anchors are added to every generated regular expression\n\ + which guarantee that the expression exactly matches only the test cases\n\ + given as input and nothing else.\n\ + This flag removes the anchors, thereby allowing to match the test cases also\n\ + when they occur within a larger string that contains other content as well.", display_order = 15 )] - no_match_end: bool, + are_anchors_disabled: bool, #[structopt( - name = "no-match-line", - short = "X", + name = "colorize", + short, long, - help = "Don't match the whole string (as a shorthand for -B -E)", + help = "Provides syntax highlighting for the resulting regular expression", display_order = 16 )] - no_match_line: bool, + is_output_colorized: bool, // -------------------- // OPTIONS @@ -332,14 +348,23 @@ fn handle_input(cli: &Cli, input: Result, Error>) { builder.with_verbose_mode(); } + if cli.is_caret_anchor_disabled { + builder.without_start_anchor(); + } + + if cli.is_dollar_sign_anchor_disabled { + builder.without_end_anchor(); + } + + 
if cli.are_anchors_disabled { + builder.without_anchors(); + } + if cli.is_output_colorized { builder.with_syntax_highlighting(); } builder - .with_line_borders( - ! cli.no_match_begin && ! cli.no_match_line - , ! cli.no_match_end && ! cli.no_match_line) .with_minimum_repetitions(cli.minimum_repetitions) .with_minimum_substring_length(cli.minimum_substring_length); diff --git a/src/regexp/builder.rs b/src/regexp/builder.rs index bb2e000..aab4b1f 100644 --- a/src/regexp/builder.rs +++ b/src/regexp/builder.rs @@ -128,11 +128,37 @@ impl RegExpBuilder { self } + /// Tells `RegExpBuilder` to produce a nicer looking regular expression in verbose mode. pub fn with_verbose_mode(&mut self) -> &mut Self { self.config.is_verbose_mode_enabled = true; self } + /// Tells `RegExpBuilder` to remove the caret anchor '^' from the resulting regular + /// expression, thereby allowing to match the test cases also when they do not occur + /// at the start of a string. + pub fn without_start_anchor(&mut self) -> &mut Self { + self.config.is_start_anchor_disabled = true; + self + } + + /// Tells `RegExpBuilder` to remove the dollar sign anchor '$' from the resulting regular + /// expression, thereby allowing to match the test cases also when they do not occur + /// at the end of a string. + pub fn without_end_anchor(&mut self) -> &mut Self { + self.config.is_end_anchor_disabled = true; + self + } + + /// Tells `RegExpBuilder` to remove the caret and dollar sign anchors from the resulting + /// regular expression, thereby allowing to match the test cases also when they occur + /// within a larger string that contains other content as well. + pub fn without_anchors(&mut self) -> &mut Self { + self.config.is_start_anchor_disabled = true; + self.config.is_end_anchor_disabled = true; + self + } + /// Tells `RegExpBuilder` to provide syntax highlighting for the resulting regular expression. 
/// /// ⚠ This method may only be used if the resulting regular expression is meant to @@ -143,13 +169,6 @@ impl RegExpBuilder { self } - /// Tells `RegExpBuilder` to concatenate the resulting regular expression with the "^" and "$" anchors. - pub fn with_line_borders(&mut self, match_begin: bool, match_end: bool) -> &mut Self { - self.config.is_match_begin = match_begin; - self.config.is_match_end = match_end; - self - } - /// Builds the actual regular expression using the previously given settings. /// Every generated regular expression is surrounded by the anchors `^` and `$` /// so that substrings not being part of the test cases are not matched accidentally. diff --git a/src/regexp/config.rs b/src/regexp/config.rs index 2d88aa4..e31235c 100644 --- a/src/regexp/config.rs +++ b/src/regexp/config.rs @@ -24,9 +24,9 @@ pub struct RegExpConfig { pub(crate) is_non_ascii_char_escaped: bool, pub(crate) is_astral_code_point_converted_to_surrogate: bool, pub(crate) is_verbose_mode_enabled: bool, + pub(crate) is_start_anchor_disabled: bool, + pub(crate) is_end_anchor_disabled: bool, pub(crate) is_output_colorized: bool, - pub(crate) is_match_begin: bool, - pub(crate) is_match_end: bool, } impl RegExpConfig { @@ -38,9 +38,9 @@ impl RegExpConfig { is_non_ascii_char_escaped: false, is_astral_code_point_converted_to_surrogate: false, is_verbose_mode_enabled: false, + is_start_anchor_disabled: false, + is_end_anchor_disabled: false, is_output_colorized: false, - is_match_begin: true, - is_match_end: true, } } diff --git a/src/regexp/regexp.rs b/src/regexp/regexp.rs index d0c499b..6008413 100644 --- a/src/regexp/regexp.rs +++ b/src/regexp/regexp.rs @@ -87,15 +87,15 @@ impl Display for RegExp { } else { String::new() }; - let caret = if self.config.is_match_begin { - Component::Caret.to_repr(self.config.is_output_colorized) + let caret = if self.config.is_start_anchor_disabled { + String::new() } else { - "".to_string() + Component::Caret.to_repr(self.config.is_output_colorized) 
}; - let dollar_sign = if self.config.is_match_end { - Component::DollarSign.to_repr(self.config.is_output_colorized) + let dollar_sign = if self.config.is_end_anchor_disabled { + String::new() } else { - "".to_string() + Component::DollarSign.to_repr(self.config.is_output_colorized) }; let mut regexp = match self.ast { Expression::Alternation(_, _) => { @@ -246,7 +246,11 @@ fn apply_verbose_mode(regexp: String, config: &RegExpConfig) -> String { }; let mut verbose_regexp = vec![verbose_mode_flag]; - let mut nesting_level = if config.is_match_begin { 0 } else { 1 }; + let mut nesting_level = if config.is_start_anchor_disabled { + 1 + } else { + 0 + }; let regexp_with_replacements = regexp .replace( diff --git a/tests/cli_integration_tests.rs b/tests/cli_integration_tests.rs index fccc586..8702aa1 100644 --- a/tests/cli_integration_tests.rs +++ b/tests/cli_integration_tests.rs @@ -23,83 +23,6 @@ use tempfile::NamedTempFile; const TEST_CASE: &str = "I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."; -mod anchor_conversion { - use super::*; - - mod no_verbose { - use super::*; - - #[test] - fn succeeds_with_no_match_beginning_option() { - let mut grex = init_command(); - grex.args(&["--no-match-beginning", TEST_CASE]); - grex.assert() - .success() - .stdout(predicate::eq("I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩\\.$\n")); - } - - #[test] - fn succeeds_with_no_match_end_option() { - let mut grex = init_command(); - grex.args(&["--no-match-end", TEST_CASE]); - grex.assert() - .success() - .stdout(predicate::eq("^I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩\\.\n")); - } - - #[test] - fn succeeds_with_no_match_line_option() { - let mut grex = init_command(); - grex.args(&["--no-match-line", TEST_CASE]); - grex.assert() - .success() - .stdout(predicate::eq("I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩\\.\n")); - } - } - - mod verbose { - use super::*; - - #[test] - fn succeeds_with_verbose_mode_and_no_match_beginning_option() { - let mut grex = init_command(); - grex.args(&["--verbose", "--no-match-beginning", TEST_CASE]); - 
grex.assert().success().stdout(predicate::eq(indoc!( - r#" - (?x) - I\ \ \ ♥♥♥\ 36\ and\ ٣\ and\ y̆y̆\ and\ 💩💩\. - $ - "#, - ))); - } - - #[test] - fn succeeds_with_verbose_mode_and_no_match_end_option() { - let mut grex = init_command(); - grex.args(&["--verbose", "--no-match-end", TEST_CASE]); - grex.assert().success().stdout(predicate::eq(indoc!( - r#" - (?x) - ^ - I\ \ \ ♥♥♥\ 36\ and\ ٣\ and\ y̆y̆\ and\ 💩💩\. - "#, - ))); - } - - #[test] - fn succeeds_with_verbose_mode_and_no_match_line_option() { - let mut grex = init_command(); - grex.args(&["--verbose", "--no-match-line", TEST_CASE]); - grex.assert().success().stdout(predicate::eq(indoc!( - r#" - (?x) - I\ \ \ ♥♥♥\ 36\ and\ ٣\ and\ y̆y̆\ and\ 💩💩\. - "#, - ))); - } - } -} - mod no_conversion { use super::*; @@ -3868,6 +3791,83 @@ mod word_non_word_conversion { } } +mod anchor_conversion { + use super::*; + + mod no_verbose { + use super::*; + + #[test] + fn succeeds_with_no_start_anchor_option() { + let mut grex = init_command(); + grex.args(&["--no-start-anchor", TEST_CASE]); + grex.assert() + .success() + .stdout(predicate::eq("I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩\\.$\n")); + } + + #[test] + fn succeeds_with_no_end_anchor_option() { + let mut grex = init_command(); + grex.args(&["--no-end-anchor", TEST_CASE]); + grex.assert() + .success() + .stdout(predicate::eq("^I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩\\.\n")); + } + + #[test] + fn succeeds_with_no_anchors_option() { + let mut grex = init_command(); + grex.args(&["--no-anchors", TEST_CASE]); + grex.assert() + .success() + .stdout(predicate::eq("I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩\\.\n")); + } + } + + mod verbose { + use super::*; + + #[test] + fn succeeds_with_verbose_mode_and_no_start_anchor_option() { + let mut grex = init_command(); + grex.args(&["--verbose", "--no-start-anchor", TEST_CASE]); + grex.assert().success().stdout(predicate::eq(indoc!( + r#" + (?x) + I\ \ \ ♥♥♥\ 36\ and\ ٣\ and\ y̆y̆\ and\ 💩💩\. 
+ $ + "#, + ))); + } + + #[test] + fn succeeds_with_verbose_mode_and_no_end_anchor_option() { + let mut grex = init_command(); + grex.args(&["--verbose", "--no-end-anchor", TEST_CASE]); + grex.assert().success().stdout(predicate::eq(indoc!( + r#" + (?x) + ^ + I\ \ \ ♥♥♥\ 36\ and\ ٣\ and\ y̆y̆\ and\ 💩💩\. + "#, + ))); + } + + #[test] + fn succeeds_with_verbose_mode_and_no_anchors_option() { + let mut grex = init_command(); + grex.args(&["--verbose", "--no-anchors", TEST_CASE]); + grex.assert().success().stdout(predicate::eq(indoc!( + r#" + (?x) + I\ \ \ ♥♥♥\ 36\ and\ ٣\ and\ y̆y̆\ and\ 💩💩\. + "#, + ))); + } + } +} + fn init_command() -> Command { Command::cargo_bin("grex").unwrap() } diff --git a/tests/lib_integration_tests.rs b/tests/lib_integration_tests.rs index cca3ed2..fcccafb 100644 --- a/tests/lib_integration_tests.rs +++ b/tests/lib_integration_tests.rs @@ -21,95 +21,6 @@ use rstest::rstest; use std::io::Write; use tempfile::NamedTempFile; -mod anchor_conversion { - use super::*; - - mod no_verbose { - use super::*; - - #[rstest(test_cases, expected_output, - case(vec!["My ♥♥♥ and 💩💩 is yours."], "My ♥♥♥ and 💩💩 is yours\\.$"), - )] - fn succeeds_with_no_match_beginning_option(test_cases: Vec<&str>, expected_output: &str) { - let regexp = RegExpBuilder::from(&test_cases) - .with_line_borders(false, true) - .build(); - assert_that_regexp_is_correct(regexp, expected_output, &test_cases); - } - - #[rstest(test_cases, expected_output, - case(vec!["My ♥♥♥ and 💩💩 is yours."], "^My ♥♥♥ and 💩💩 is yours\\."), - )] - fn succeeds_with_no_match_end_option(test_cases: Vec<&str>, expected_output: &str) { - let regexp = RegExpBuilder::from(&test_cases) - .with_line_borders(true, false) - .build(); - assert_that_regexp_is_correct(regexp, expected_output, &test_cases); - } - - #[rstest(test_cases, expected_output, - case(vec!["My ♥♥♥ and 💩💩 is yours."], "My ♥♥♥ and 💩💩 is yours\\."), - )] - fn succeeds_with_no_match_line_option(test_cases: Vec<&str>, expected_output: &str) { - 
let regexp = RegExpBuilder::from(&test_cases) - .with_line_borders(false, false) - .build(); - assert_that_regexp_is_correct(regexp, expected_output, &test_cases); - } - } - - mod verbose { - use super::*; - - #[rstest(test_cases, expected_output, - case(vec!["My ♥♥♥ and 💩💩 is yours."], indoc!( - r#" - (?x) - My\ ♥♥♥\ and\ 💩💩\ is\ yours\. - $"# - )) - )] - fn succeeds_with_verbose_and_no_match_beginning_option(test_cases: Vec<&str>, expected_output: &str) { - let regexp = RegExpBuilder::from(&test_cases) - .with_line_borders(false, true) - .with_verbose_mode() - .build(); - assert_that_regexp_is_correct(regexp, expected_output, &test_cases); - } - - #[rstest(test_cases, expected_output, - case(vec!["My ♥♥♥ and 💩💩 is yours."], indoc!( - r#" - (?x) - ^ - My\ ♥♥♥\ and\ 💩💩\ is\ yours\."# - )) - )] - fn succeeds_with_verbose_and_no_match_end_option(test_cases: Vec<&str>, expected_output: &str) { - let regexp = RegExpBuilder::from(&test_cases) - .with_line_borders(true, false) - .with_verbose_mode() - .build(); - assert_that_regexp_is_correct(regexp, expected_output, &test_cases); - } - - #[rstest(test_cases, expected_output, - case(vec!["My ♥♥♥ and 💩💩 is yours."], indoc!( - r#" - (?x) - My\ ♥♥♥\ and\ 💩💩\ is\ yours\."# - )) - )] - fn succeeds_with_verbose_and_no_match_line_option(test_cases: Vec<&str>, expected_output: &str) { - let regexp = RegExpBuilder::from(&test_cases) - .with_line_borders(false, false) - .with_verbose_mode() - .build(); - assert_that_regexp_is_correct(regexp, expected_output, &test_cases); - } - } -} - mod no_conversion { use super::*; @@ -2030,6 +1941,102 @@ mod word_non_word_conversion { } } +mod anchor_conversion { + use super::*; + + mod no_verbose { + use super::*; + + #[rstest(test_cases, expected_output, + case(vec!["My ♥♥♥ and 💩💩 is yours."], "My ♥♥♥ and 💩💩 is yours\\.$"), + )] + fn succeeds_with_no_start_anchor_option(test_cases: Vec<&str>, expected_output: &str) { + let regexp = RegExpBuilder::from(&test_cases) + .without_start_anchor() + 
.build(); + assert_that_regexp_is_correct(regexp, expected_output, &test_cases); + } + + #[rstest(test_cases, expected_output, + case(vec!["My ♥♥♥ and 💩💩 is yours."], "^My ♥♥♥ and 💩💩 is yours\\."), + )] + fn succeeds_with_no_end_anchor_option(test_cases: Vec<&str>, expected_output: &str) { + let regexp = RegExpBuilder::from(&test_cases) + .without_end_anchor() + .build(); + assert_that_regexp_is_correct(regexp, expected_output, &test_cases); + } + + #[rstest(test_cases, expected_output, + case(vec!["My ♥♥♥ and 💩💩 is yours."], "My ♥♥♥ and 💩💩 is yours\\."), + )] + fn succeeds_with_no_match_line_option(test_cases: Vec<&str>, expected_output: &str) { + let regexp = RegExpBuilder::from(&test_cases).without_anchors().build(); + assert_that_regexp_is_correct(regexp, expected_output, &test_cases); + } + } + + mod verbose { + use super::*; + + #[rstest(test_cases, expected_output, + case(vec!["My ♥♥♥ and 💩💩 is yours."], indoc!( + r#" + (?x) + My\ ♥♥♥\ and\ 💩💩\ is\ yours\. + $"# + )) + )] + fn succeeds_with_verbose_mode_and_no_start_anchor_option( + test_cases: Vec<&str>, + expected_output: &str, + ) { + let regexp = RegExpBuilder::from(&test_cases) + .with_verbose_mode() + .without_start_anchor() + .build(); + assert_that_regexp_is_correct(regexp, expected_output, &test_cases); + } + + #[rstest(test_cases, expected_output, + case(vec!["My ♥♥♥ and 💩💩 is yours."], indoc!( + r#" + (?x) + ^ + My\ ♥♥♥\ and\ 💩💩\ is\ yours\."# + )) + )] + fn succeeds_with_verbose_mode_and_no_end_anchor_option( + test_cases: Vec<&str>, + expected_output: &str, + ) { + let regexp = RegExpBuilder::from(&test_cases) + .with_verbose_mode() + .without_end_anchor() + .build(); + assert_that_regexp_is_correct(regexp, expected_output, &test_cases); + } + + #[rstest(test_cases, expected_output, + case(vec!["My ♥♥♥ and 💩💩 is yours."], indoc!( + r#" + (?x) + My\ ♥♥♥\ and\ 💩💩\ is\ yours\."# + )) + )] + fn succeeds_with_verbose_mode_and_no_anchors_option( + test_cases: Vec<&str>, + expected_output: &str, + ) { 
+ let regexp = RegExpBuilder::from(&test_cases) + .with_verbose_mode() + .without_anchors() + .build(); + assert_that_regexp_is_correct(regexp, expected_output, &test_cases); + } + } +} + fn assert_that_regexp_is_correct(regexp: String, expected_output: &str, test_cases: &[&str]) { assert_eq!( regexp, expected_output, diff --git a/tests/property_tests.rs b/tests/property_tests.rs index de8034f..216779b 100644 --- a/tests/property_tests.rs +++ b/tests/property_tests.rs @@ -121,81 +121,6 @@ proptest! { prop_assert!(compile_regexp(®exp).is_ok()); } - #[test] - #[ignore] - fn valid_regexes_with_no_match_beginning( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) - ) { - let test_cases_vec = test_cases.iter().cloned().collect::>(); - let regexp = RegExpBuilder::from(&test_cases_vec) - .with_line_borders(false, true) - .build(); - prop_assert!(compile_regexp(®exp).is_ok()); - } - - #[test] - #[ignore] - fn valid_regexes_with_no_match_beginning_and_verbose_mode( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) - ) { - let test_cases_vec = test_cases.iter().cloned().collect::>(); - let regexp = RegExpBuilder::from(&test_cases_vec) - .with_line_borders(false, true) - .with_verbose_mode() - .build(); - prop_assert!(compile_regexp(®exp).is_ok()); - } - - #[test] - #[ignore] - fn valid_regexes_with_no_match_end( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) - ) { - let test_cases_vec = test_cases.iter().cloned().collect::>(); - let regexp = RegExpBuilder::from(&test_cases_vec) - .with_line_borders(true, false) - .build(); - prop_assert!(compile_regexp(®exp).is_ok()); - } - - #[test] - #[ignore] - fn valid_regexes_with_no_match_end_and_verbose_mode( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) - ) { - let test_cases_vec = test_cases.iter().cloned().collect::>(); - let regexp = RegExpBuilder::from(&test_cases_vec) - .with_line_borders(true, false) - .with_verbose_mode() - .build(); - 
prop_assert!(compile_regexp(®exp).is_ok()); - } - - #[test] - #[ignore] - fn valid_regexes_with_no_match_line( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) - ) { - let test_cases_vec = test_cases.iter().cloned().collect::>(); - let regexp = RegExpBuilder::from(&test_cases_vec) - .with_line_borders(false, false) - .build(); - prop_assert!(compile_regexp(®exp).is_ok()); - } - - #[test] - #[ignore] - fn valid_regexes_with_no_match_line_and_verbose_mode( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) - ) { - let test_cases_vec = test_cases.iter().cloned().collect::>(); - let regexp = RegExpBuilder::from(&test_cases_vec) - .with_line_borders(false, false) - .with_verbose_mode() - .build(); - prop_assert!(compile_regexp(®exp).is_ok()); - } - #[test] #[ignore] fn matching_regexes_with_default_settings( @@ -310,6 +235,30 @@ proptest! { } } + #[test] + #[ignore] + fn matching_regexes_without_anchors( + test_cases in prop::collection::hash_set("[A-C]{1,10}", 1..=5) + ) { + let test_cases_vec = test_cases.iter().cloned().collect::>(); + let regexp = RegExpBuilder::from(&test_cases_vec).without_anchors().build(); + if let Ok(compiled_regexp) = compile_regexp(®exp) { + for test_case in test_cases_vec { + let matches = compiled_regexp.find_iter(&test_case).collect::>(); + let substrings = matches.iter().map(|m| m.as_str()).collect::>(); + prop_assert_eq!( + matches.len(), + 1, + "expression '{}' does not match test case '{}' entirely but {} of its substrings: {:?}", + regexp, + test_case, + matches.len(), + substrings + ); + } + } + } + #[test] #[ignore] fn regexes_not_matching_other_strings_with_default_settings( @@ -341,93 +290,6 @@ proptest! 
{ } } } - - #[test] - #[ignore] - fn matching_regexes_with_no_match_beginning( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) - ) { - let test_cases_vec = test_cases.iter().cloned().collect::>(); - let regexp = RegExpBuilder::from(&test_cases_vec) - .with_line_borders(false, true) - .build(); - if let Ok(compiled_regexp) = compile_regexp(®exp) { - prop_assert!(test_cases.iter().all(|test_case| compiled_regexp.is_match(&test_case))); - } - } - - #[test] - #[ignore] - fn matching_regexes_with_no_match_beginning_and_verbose_mode( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) - ) { - let test_cases_vec = test_cases.iter().cloned().collect::>(); - let regexp = RegExpBuilder::from(&test_cases_vec) - .with_line_borders(false, true) - .with_verbose_mode() - .build(); - if let Ok(compiled_regexp) = compile_regexp(®exp) { - prop_assert!(test_cases.iter().all(|test_case| compiled_regexp.is_match(&test_case))); - } - } - - #[test] - #[ignore] - fn matching_regexes_with_no_match_end( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) - ) { - let test_cases_vec = test_cases.iter().cloned().collect::>(); - let regexp = RegExpBuilder::from(&test_cases_vec) - .with_line_borders(true, false) - .build(); - if let Ok(compiled_regexp) = compile_regexp(®exp) { - prop_assert!(test_cases.iter().all(|test_case| compiled_regexp.is_match(&test_case))); - } - } - - #[test] - #[ignore] - fn matching_regexes_with_no_match_end_and_verbose_mode( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) - ) { - let test_cases_vec = test_cases.iter().cloned().collect::>(); - let regexp = RegExpBuilder::from(&test_cases_vec) - .with_line_borders(true, false) - .with_verbose_mode() - .build(); - if let Ok(compiled_regexp) = compile_regexp(®exp) { - prop_assert!(test_cases.iter().all(|test_case| compiled_regexp.is_match(&test_case))); - } - } - - #[test] - #[ignore] - fn matching_regexes_with_no_match_line( - test_cases in 
prop::collection::hash_set(".{1,20}", 1..=10) - ) { - let test_cases_vec = test_cases.iter().cloned().collect::>(); - let regexp = RegExpBuilder::from(&test_cases_vec) - .with_line_borders(false, false) - .build(); - if let Ok(compiled_regexp) = compile_regexp(®exp) { - prop_assert!(test_cases.iter().all(|test_case| compiled_regexp.is_match(&test_case))); - } - } - - #[test] - #[ignore] - fn matching_regexes_with_no_match_line_and_verbose_mode( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) - ) { - let test_cases_vec = test_cases.iter().cloned().collect::>(); - let regexp = RegExpBuilder::from(&test_cases_vec) - .with_line_borders(false, false) - .with_verbose_mode() - .build(); - if let Ok(compiled_regexp) = compile_regexp(®exp) { - prop_assert!(test_cases.iter().all(|test_case| compiled_regexp.is_match(&test_case))); - } - } } fn conversion_feature_strategy() -> impl Strategy { From dae0596aeda7b7ccbd8a28bb99ddd244f9cdd121 Mon Sep 17 00:00:00 2001 From: "Peter M. Stahl" Date: Wed, 4 Aug 2021 20:43:50 +0200 Subject: [PATCH 07/18] Disable DFA minimization for disabled anchors (#31) --- src/fsm/dfa.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/fsm/dfa.rs b/src/fsm/dfa.rs index 90e02ed..119d920 100644 --- a/src/fsm/dfa.rs +++ b/src/fsm/dfa.rs @@ -42,7 +42,16 @@ impl Dfa { for cluster in grapheme_clusters { dfa.insert(cluster); } - dfa.minimize(); + + let is_exactly_one_anchor_disabled = + config.is_start_anchor_disabled ^ config.is_end_anchor_disabled; + let is_no_anchor_disabled = + !config.is_start_anchor_disabled && !config.is_end_anchor_disabled; + + if is_exactly_one_anchor_disabled || is_no_anchor_disabled { + dfa.minimize(); + } + dfa } From 91ce13fcc83d9bc1cb9660c422fd27777edf8b0f Mon Sep 17 00:00:00 2001 From: "Peter M. 
Stahl" Date: Wed, 4 Aug 2021 20:54:45 +0200 Subject: [PATCH 08/18] Optimize property tests --- tests/property_tests.rs | 99 ++++++++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 35 deletions(-) diff --git a/tests/property_tests.rs b/tests/property_tests.rs index 216779b..7074fea 100644 --- a/tests/property_tests.rs +++ b/tests/property_tests.rs @@ -22,9 +22,8 @@ proptest! { #![proptest_config(ProptestConfig::with_cases(500))] #[test] - #[ignore] fn valid_regexes_with_default_settings( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + test_cases in prop::collection::hash_set(".{1,10}", 1..=5) ) { let test_cases_vec = test_cases.iter().cloned().collect::>(); let regexp = RegExpBuilder::from(&test_cases_vec).build(); @@ -32,9 +31,8 @@ proptest! { } #[test] - #[ignore] fn valid_regexes_with_escape_sequences( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + test_cases in prop::collection::hash_set(".{1,10}", 1..=5) ) { let test_cases_vec = test_cases.iter().cloned().collect::>(); let regexp = RegExpBuilder::from(&test_cases_vec) @@ -44,9 +42,8 @@ proptest! { } #[test] - #[ignore] fn valid_regexes_with_verbose_mode( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + test_cases in prop::collection::hash_set(".{1,10}", 1..=5) ) { let test_cases_vec = test_cases.iter().cloned().collect::>(); let regexp = RegExpBuilder::from(&test_cases_vec) @@ -56,9 +53,8 @@ proptest! { } #[test] - #[ignore] fn valid_regexes_with_escape_sequences_and_verbose_mode( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + test_cases in prop::collection::hash_set(".{1,10}", 1..=5) ) { let test_cases_vec = test_cases.iter().cloned().collect::>(); let regexp = RegExpBuilder::from(&test_cases_vec) @@ -69,9 +65,8 @@ proptest! 
{ } #[test] - #[ignore] fn valid_regexes_with_conversion_features( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10), + test_cases in prop::collection::hash_set(".{1,10}", 1..=5), conversion_features in prop::collection::hash_set(conversion_feature_strategy(), 1..=9), minimum_repetitions in 1..100u32, minimum_substring_length in 1..100u32 @@ -86,9 +81,8 @@ proptest! { } #[test] - #[ignore] fn valid_regexes_with_conversion_features_and_escape_sequences( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10), + test_cases in prop::collection::hash_set(".{1,10}", 1..=5), conversion_features in prop::collection::hash_set(conversion_feature_strategy(), 1..=9), minimum_repetitions in 1..100u32, minimum_substring_length in 1..100u32 @@ -104,9 +98,8 @@ proptest! { } #[test] - #[ignore] fn valid_regexes_with_conversion_features_and_verbose_mode( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10), + test_cases in prop::collection::hash_set(".{1,10}", 1..=5), conversion_features in prop::collection::hash_set(conversion_feature_strategy(), 1..=9), minimum_repetitions in 1..100u32, minimum_substring_length in 1..100u32 @@ -122,9 +115,8 @@ proptest! { } #[test] - #[ignore] fn matching_regexes_with_default_settings( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + test_cases in prop::collection::hash_set(".{1,10}", 1..=5) ) { let test_cases_vec = test_cases.iter().cloned().collect::>(); let regexp = RegExpBuilder::from(&test_cases_vec).build(); @@ -134,9 +126,8 @@ proptest! { } #[test] - #[ignore] fn matching_regexes_with_escape_sequences( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + test_cases in prop::collection::hash_set(".{1,10}", 1..=5) ) { let test_cases_vec = test_cases.iter().cloned().collect::>(); let regexp = RegExpBuilder::from(&test_cases_vec) @@ -148,9 +139,8 @@ proptest! 
{ } #[test] - #[ignore] fn matching_regexes_with_verbose_mode( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + test_cases in prop::collection::hash_set(".{1,10}", 1..=5) ) { let test_cases_vec = test_cases.iter().cloned().collect::>(); let regexp = RegExpBuilder::from(&test_cases_vec) @@ -162,9 +152,8 @@ proptest! { } #[test] - #[ignore] fn matching_regexes_with_escape_sequences_and_verbose_mode( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10) + test_cases in prop::collection::hash_set(".{1,10}", 1..=5) ) { let test_cases_vec = test_cases.iter().cloned().collect::>(); let regexp = RegExpBuilder::from(&test_cases_vec) @@ -177,9 +166,8 @@ proptest! { } #[test] - #[ignore] fn matching_regexes_with_conversion_features( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10), + test_cases in prop::collection::hash_set(".{1,10}", 1..=5), conversion_features in prop::collection::hash_set(conversion_feature_strategy(), 1..=9), minimum_repetitions in 1..100u32, minimum_substring_length in 1..100u32 @@ -196,9 +184,8 @@ proptest! { } #[test] - #[ignore] fn matching_regexes_with_conversion_features_and_escape_sequences( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10), + test_cases in prop::collection::hash_set(".{1,10}", 1..=5), conversion_features in prop::collection::hash_set(conversion_feature_strategy(), 1..=9), minimum_repetitions in 1..100u32, minimum_substring_length in 1..100u32 @@ -216,9 +203,8 @@ proptest! { } #[test] - #[ignore] fn matching_regexes_with_conversion_features_and_verbose_mode( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10), + test_cases in prop::collection::hash_set(".{1,10}", 1..=5), conversion_features in prop::collection::hash_set(conversion_feature_strategy(), 1..=9), minimum_repetitions in 1..100u32, minimum_substring_length in 1..100u32 @@ -236,7 +222,52 @@ proptest! 
{ } #[test] - #[ignore] + fn matching_regexes_without_start_anchor( + test_cases in prop::collection::hash_set("[A-C]{1,10}", 1..=5) + ) { + let test_cases_vec = test_cases.iter().cloned().collect::>(); + let regexp = RegExpBuilder::from(&test_cases_vec).without_start_anchor().build(); + if let Ok(compiled_regexp) = compile_regexp(®exp) { + for test_case in test_cases_vec { + let matches = compiled_regexp.find_iter(&test_case).collect::>(); + let substrings = matches.iter().map(|m| m.as_str()).collect::>(); + prop_assert_eq!( + matches.len(), + 1, + "expression '{}' does not match test case '{}' entirely but {} of its substrings: {:?}", + regexp, + test_case, + matches.len(), + substrings + ); + } + } + } + + #[test] + fn matching_regexes_without_end_anchor( + test_cases in prop::collection::hash_set("[A-C]{1,10}", 1..=5) + ) { + let test_cases_vec = test_cases.iter().cloned().collect::>(); + let regexp = RegExpBuilder::from(&test_cases_vec).without_end_anchor().build(); + if let Ok(compiled_regexp) = compile_regexp(®exp) { + for test_case in test_cases_vec { + let matches = compiled_regexp.find_iter(&test_case).collect::>(); + let substrings = matches.iter().map(|m| m.as_str()).collect::>(); + prop_assert_eq!( + matches.len(), + 1, + "expression '{}' does not match test case '{}' entirely but {} of its substrings: {:?}", + regexp, + test_case, + matches.len(), + substrings + ); + } + } + } + + #[test] fn matching_regexes_without_anchors( test_cases in prop::collection::hash_set("[A-C]{1,10}", 1..=5) ) { @@ -260,10 +291,9 @@ proptest! 
{ } #[test] - #[ignore] fn regexes_not_matching_other_strings_with_default_settings( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10), - other_strings in prop::collection::hash_set(".{1,20}", 1..=10) + test_cases in prop::collection::hash_set(".{1,10}", 1..=5), + other_strings in prop::collection::hash_set(".{1,10}", 1..=5) ) { if test_cases.is_disjoint(&other_strings) { let test_cases_vec = test_cases.iter().cloned().collect::>(); @@ -275,10 +305,9 @@ proptest! { } #[test] - #[ignore] fn regexes_not_matching_other_strings_with_escape_sequences( - test_cases in prop::collection::hash_set(".{1,20}", 1..=10), - other_strings in prop::collection::hash_set(".{1,20}", 1..=10) + test_cases in prop::collection::hash_set(".{1,10}", 1..=5), + other_strings in prop::collection::hash_set(".{1,10}", 1..=5) ) { if test_cases.is_disjoint(&other_strings) { let test_cases_vec = test_cases.iter().cloned().collect::>(); From c5a740105de0403f22cc4ca07a0745502997c422 Mon Sep 17 00:00:00 2001 From: "Peter M. 
Stahl" Date: Wed, 4 Aug 2021 21:02:21 +0200 Subject: [PATCH 09/18] Fix clippy warnings --- src/ast/expression.rs | 11 ++--------- src/ast/format.rs | 8 ++++---- src/main.rs | 1 - src/regexp/regexp.rs | 4 ++-- 4 files changed, 8 insertions(+), 16 deletions(-) diff --git a/src/ast/expression.rs b/src/ast/expression.rs index 006fa72..494fedf 100644 --- a/src/ast/expression.rs +++ b/src/ast/expression.rs @@ -212,15 +212,8 @@ impl Expression { expr: &Option, config: &RegExpConfig, ) -> Option { - if let Some(value) = expr { - Some(Expression::new_repetition( - value.clone(), - Quantifier::KleeneStar, - config, - )) - } else { - None - } + expr.as_ref() + .map(|value| Expression::new_repetition(value.clone(), Quantifier::KleeneStar, config)) } fn concatenate( diff --git a/src/ast/format.rs b/src/ast/format.rs index 8977f70..7b6468f 100644 --- a/src/ast/format.rs +++ b/src/ast/format.rs @@ -26,17 +26,17 @@ impl Display for Expression { fn fmt(&self, f: &mut Formatter<'_>) -> Result { match self { Expression::Alternation(options, config) => { - format_alternation(f, &self, options, config) + format_alternation(f, self, options, config) } Expression::CharacterClass(char_set, config) => { format_character_class(f, char_set, config) } Expression::Concatenation(expr1, expr2, config) => { - format_concatenation(f, &self, expr1, expr2, config) + format_concatenation(f, self, expr1, expr2, config) } Expression::Literal(cluster, config) => format_literal(f, cluster, config), Expression::Repetition(expr, quantifier, config) => { - format_repetition(f, &self, expr, quantifier, config) + format_repetition(f, self, expr, quantifier, config) } } } @@ -81,7 +81,7 @@ fn format_character_class( let escaped_char_set = char_set .iter() .map(|c| { - if chars_to_escape.contains(&c) { + if chars_to_escape.contains(c) { format!("{}{}", "\\", c) } else if c == &'\n' { "\\n".to_string() diff --git a/src/main.rs b/src/main.rs index 695616b..747465f 100644 --- a/src/main.rs +++ b/src/main.rs @@ 
-16,7 +16,6 @@ use grex::{Feature, RegExpBuilder}; use itertools::Itertools; -use regex::Regex; use std::io::{Error, ErrorKind}; use std::path::PathBuf; use structopt::clap::AppSettings::{AllowLeadingHyphen, ColoredHelp}; diff --git a/src/regexp/regexp.rs b/src/regexp/regexp.rs index 6008413..8853ba0 100644 --- a/src/regexp/regexp.rs +++ b/src/regexp/regexp.rs @@ -36,7 +36,7 @@ impl RegExp { Self::convert_to_lowercase(test_cases); } Self::sort(test_cases); - let grapheme_clusters = Self::grapheme_clusters(&test_cases, config); + let grapheme_clusters = Self::grapheme_clusters(test_cases, config); let dfa = Dfa::from(grapheme_clusters, config); let ast = Expression::from(dfa, config); Self { @@ -53,7 +53,7 @@ impl RegExp { test_cases.sort(); test_cases.dedup(); test_cases.sort_by(|a, b| match a.len().cmp(&b.len()) { - Ordering::Equal => a.cmp(&b), + Ordering::Equal => a.cmp(b), other => other, }); } From 535232e4dc928cd940aabf8c8e18dcddd5f719e4 Mon Sep 17 00:00:00 2001 From: "Peter M. Stahl" Date: Thu, 19 Aug 2021 23:38:54 +0200 Subject: [PATCH 10/18] Fix repetition detection (#36) --- src/char/cluster.rs | 63 ++++----- tests/cli_integration_tests.rs | 242 ++++++++++++++++++++++++++------- tests/lib_integration_tests.rs | 60 ++++---- 3 files changed, 251 insertions(+), 114 deletions(-) diff --git a/src/char/cluster.rs b/src/char/cluster.rs index 29d4941..7a2edbe 100644 --- a/src/char/cluster.rs +++ b/src/char/cluster.rs @@ -167,7 +167,7 @@ fn convert_repetitions( config: &RegExpConfig, ) { let repeated_substrings = collect_repeated_substrings(graphemes); - let ranges_of_repetitions = create_ranges_of_repetitions(repeated_substrings); + let ranges_of_repetitions = create_ranges_of_repetitions(repeated_substrings, config); let coalesced_repetitions = coalesce_repetitions(ranges_of_repetitions); replace_graphemes_with_repetitions(coalesced_repetitions, graphemes, repetitions, config) } @@ -190,43 +190,38 @@ fn collect_repeated_substrings(graphemes: &[Grapheme]) -> 
HashMap, V fn create_ranges_of_repetitions( repeated_substrings: HashMap, Vec>, + config: &RegExpConfig, ) -> Vec<(Range, Vec)> { let mut repetitions = Vec::<(Range, Vec)>::new(); for (prefix_length, group) in &repeated_substrings .iter() - .filter(|&(_, indices)| indices.len() > 1) + .filter(|&(prefix, indices)| { + indices + .iter() + .tuple_windows() + .all(|(first, second)| (second - first) >= prefix.len()) + }) .sorted_by_key(|&(prefix, _)| prefix.len()) .rev() .group_by(|&(prefix, _)| prefix.len()) { for (prefix, indices) in group.sorted_by_key(|&(_, indices)| indices[0]) { - let all_even = indices - .iter() - .all(|it| it % prefix_length == 0 || it % 2 == 0); - let all_odd = indices + indices .iter() - .all(|it| it % prefix_length == 1 || it % 2 == 1); - - if all_even || all_odd { - let ranges = indices - .iter() - .cloned() - .map(|it| it..it + prefix_length) - .coalesce(|x, y| { - if x.end == y.start { - Ok(x.start..y.end) - } else { - Err((x, y)) - } - }) - .filter(|it| (it.end - it.start) > prefix_length) - .collect_vec(); - - for range in ranges { - repetitions.push((range, prefix.clone())); - } - } + .map(|it| *it..it + prefix_length) + .coalesce(|x, y| { + if x.end == y.start { + Ok(x.start..y.end) + } else { + Err((x, y)) + } + }) + .filter(|range| { + let count = ((range.end - range.start) / prefix_length) as u32; + count > config.minimum_repetitions + }) + .for_each(|range| repetitions.push((range, prefix.clone()))); } } repetitions @@ -281,22 +276,10 @@ fn replace_graphemes_with_repetitions( let count = ((range.end - range.start) / substr.len()) as u32; - if count <= config.minimum_repetitions - || substr.len() < config.minimum_substring_length as usize - { + if substr.len() < config.minimum_substring_length as usize { continue; } - let joined_substr = substr.iter().join("").repeat(count as usize); - let graphemes_slice = repetitions[range.clone()] - .iter() - .map(|it| it.value()) - .join(""); - - if graphemes_slice != joined_substr { - break; - 
} - repetitions.splice( range.clone(), [Grapheme::new(substr.clone(), count, count, config)] diff --git a/tests/cli_integration_tests.rs b/tests/cli_integration_tests.rs index 8702aa1..0d1ef59 100644 --- a/tests/cli_integration_tests.rs +++ b/tests/cli_integration_tests.rs @@ -855,7 +855,7 @@ mod word_conversion { let mut grex = init_command(); grex.args(&["--repetitions", "--words", TEST_CASE]); grex.assert().success().stdout(predicate::eq( - "^\\w {3}♥{3} \\w{2} \\w{3} \\w \\w{3} \\w{4} \\w{3} 💩{2}\\.$\n", + "^\\w {3}♥{3} \\w{2}(?: \\w{3} \\w){2}(?:\\w{3} ){2}💩{2}\\.$\n", )); } @@ -864,7 +864,7 @@ mod word_conversion { let mut grex = init_command(); grex.args(&["--repetitions", "--words", "--escape", TEST_CASE]); grex.assert().success().stdout(predicate::eq( - "^\\w {3}\\u{2665}{3} \\w{2} \\w{3} \\w \\w{3} \\w{4} \\w{3} \\u{1f4a9}{2}\\.$\n", + "^\\w {3}\\u{2665}{3} \\w{2}(?: \\w{3} \\w){2}(?:\\w{3} ){2}\\u{1f4a9}{2}\\.$\n", )); } @@ -879,7 +879,7 @@ mod word_conversion { TEST_CASE, ]); grex.assert().success().stdout(predicate::eq( - "^\\w {3}\\u{2665}{3} \\w{2} \\w{3} \\w \\w{3} \\w{4} \\w{3} (?:\\u{d83d}\\u{dca9}){2}\\.$\n", + "^\\w {3}\\u{2665}{3} \\w{2}(?: \\w{3} \\w){2}(?:\\w{3} ){2}(?:\\u{d83d}\\u{dca9}){2}\\.$\n", )); } @@ -891,7 +891,14 @@ mod word_conversion { r#" (?x) ^ - \w\ {3}♥{3}\ \w{2}\ \w{3}\ \w\ \w{3}\ \w{4}\ \w{3}\ 💩{2}\. + \w\ {3}♥{3}\ \w{2} + (?: + \ \w{3}\ \w + ){2} + (?: + \w{3}\ + ){2} + 💩{2}\. $ "# ))); @@ -911,7 +918,14 @@ mod word_conversion { r#" (?x) ^ - \w\ {3}\u{2665}{3}\ \w{2}\ \w{3}\ \w\ \w{3}\ \w{4}\ \w{3}\ \u{1f4a9}{2}\. + \w\ {3}\u{2665}{3}\ \w{2} + (?: + \ \w{3}\ \w + ){2} + (?: + \w{3}\ + ){2} + \u{1f4a9}{2}\. 
$ "# ))); @@ -932,7 +946,13 @@ mod word_conversion { r#" (?x) ^ - \w\ {3}\u{2665}{3}\ \w{2}\ \w{3}\ \w\ \w{3}\ \w{4}\ \w{3}\ + \w\ {3}\u{2665}{3}\ \w{2} + (?: + \ \w{3}\ \w + ){2} + (?: + \w{3}\ + ){2} (?: \u{d83d}\u{dca9} ){2} @@ -1263,7 +1283,7 @@ mod digit_word_conversion { let mut grex = init_command(); grex.args(&["--repetitions", "--digits", "--words", TEST_CASE]); grex.assert().success().stdout(predicate::eq( - "^\\w {3}♥{3} \\d(?:\\d \\w{3} ){2}\\w{4} \\w{3} 💩{2}\\.$\n", + "^\\w {3}♥{3} \\d(?:\\d \\w{3} ){2}\\w(?:\\w{3} ){2}💩{2}\\.$\n", )); } @@ -1278,7 +1298,7 @@ mod digit_word_conversion { TEST_CASE, ]); grex.assert().success().stdout(predicate::eq( - "^\\w {3}\\u{2665}{3} \\d(?:\\d \\w{3} ){2}\\w{4} \\w{3} \\u{1f4a9}{2}\\.$\n", + "^\\w {3}\\u{2665}{3} \\d(?:\\d \\w{3} ){2}\\w(?:\\w{3} ){2}\\u{1f4a9}{2}\\.$\n", )); } @@ -1294,7 +1314,7 @@ mod digit_word_conversion { TEST_CASE, ]); grex.assert().success().stdout(predicate::eq( - "^\\w {3}\\u{2665}{3} \\d(?:\\d \\w{3} ){2}\\w{4} \\w{3} (?:\\u{d83d}\\u{dca9}){2}\\.$\n", + "^\\w {3}\\u{2665}{3} \\d(?:\\d \\w{3} ){2}\\w(?:\\w{3} ){2}(?:\\u{d83d}\\u{dca9}){2}\\.$\n", )); } @@ -1316,7 +1336,11 @@ mod digit_word_conversion { (?: \d\ \w{3}\ ){2} - \w{4}\ \w{3}\ 💩{2}\. + \w + (?: + \w{3}\ + ){2} + 💩{2}\. $ "# ))); @@ -1341,7 +1365,11 @@ mod digit_word_conversion { (?: \d\ \w{3}\ ){2} - \w{4}\ \w{3}\ \u{1f4a9}{2}\. + \w + (?: + \w{3}\ + ){2} + \u{1f4a9}{2}\. 
$ "# ))); @@ -1367,7 +1395,10 @@ mod digit_word_conversion { (?: \d\ \w{3}\ ){2} - \w{4}\ \w{3}\ + \w + (?: + \w{3}\ + ){2} (?: \u{d83d}\u{dca9} ){2} @@ -1476,7 +1507,7 @@ mod space_word_conversion { let mut grex = init_command(); grex.args(&["--repetitions", "--words", "--spaces", TEST_CASE]); grex.assert().success().stdout(predicate::eq( - "^\\w\\s{3}♥{3}\\s\\w{2}\\s\\w{3}\\s\\w\\s\\w{3}\\s\\w{4}\\s\\w{3}\\s💩{2}\\.$\n", + "^\\w\\s{3}♥{3}\\s\\w{2}(?:\\s\\w{3}\\s\\w){2}(?:\\w{3}\\s){2}💩{2}\\.$\n", )); } @@ -1491,7 +1522,7 @@ mod space_word_conversion { TEST_CASE, ]); grex.assert().success().stdout(predicate::eq( - "^\\w\\s{3}\\u{2665}{3}\\s\\w{2}\\s\\w{3}\\s\\w\\s\\w{3}\\s\\w{4}\\s\\w{3}\\s\\u{1f4a9}{2}\\.$\n", + "^\\w\\s{3}\\u{2665}{3}\\s\\w{2}(?:\\s\\w{3}\\s\\w){2}(?:\\w{3}\\s){2}\\u{1f4a9}{2}\\.$\n", )); } @@ -1507,7 +1538,7 @@ mod space_word_conversion { TEST_CASE, ]); grex.assert().success().stdout(predicate::eq( - "^\\w\\s{3}\\u{2665}{3}\\s\\w{2}\\s\\w{3}\\s\\w\\s\\w{3}\\s\\w{4}\\s\\w{3}\\s(?:\\u{d83d}\\u{dca9}){2}\\.$\n", + "^\\w\\s{3}\\u{2665}{3}\\s\\w{2}(?:\\s\\w{3}\\s\\w){2}(?:\\w{3}\\s){2}(?:\\u{d83d}\\u{dca9}){2}\\.$\n", )); } @@ -1525,7 +1556,14 @@ mod space_word_conversion { r#" (?x) ^ - \w\s{3}♥{3}\s\w{2}\s\w{3}\s\w\s\w{3}\s\w{4}\s\w{3}\s💩{2}\. + \w\s{3}♥{3}\s\w{2} + (?: + \s\w{3}\s\w + ){2} + (?: + \w{3}\s + ){2} + 💩{2}\. $ "# ))); @@ -1546,7 +1584,14 @@ mod space_word_conversion { r#" (?x) ^ - \w\s{3}\u{2665}{3}\s\w{2}\s\w{3}\s\w\s\w{3}\s\w{4}\s\w{3}\s\u{1f4a9}{2}\. + \w\s{3}\u{2665}{3}\s\w{2} + (?: + \s\w{3}\s\w + ){2} + (?: + \w{3}\s + ){2} + \u{1f4a9}{2}\. 
$ "# ))); @@ -1568,7 +1613,13 @@ mod space_word_conversion { r#" (?x) ^ - \w\s{3}\u{2665}{3}\s\w{2}\s\w{3}\s\w\s\w{3}\s\w{4}\s\w{3}\s + \w\s{3}\u{2665}{3}\s\w{2} + (?: + \s\w{3}\s\w + ){2} + (?: + \w{3}\s + ){2} (?: \u{d83d}\u{dca9} ){2} @@ -1692,7 +1743,7 @@ mod digit_space_word_conversion { TEST_CASE, ]); grex.assert().success().stdout(predicate::eq( - "^\\w\\s{3}♥{3}\\s\\d(?:\\d\\s\\w{3}\\s){2}\\w{4}\\s\\w{3}\\s💩{2}\\.$\n", + "^\\w\\s{3}♥{3}\\s\\d(?:\\d\\s\\w{3}\\s){2}\\w(?:\\w{3}\\s){2}💩{2}\\.$\n", )); } @@ -1708,7 +1759,7 @@ mod digit_space_word_conversion { TEST_CASE, ]); grex.assert().success().stdout(predicate::eq( - "^\\w\\s{3}\\u{2665}{3}\\s\\d(?:\\d\\s\\w{3}\\s){2}\\w{4}\\s\\w{3}\\s\\u{1f4a9}{2}\\.$\n", + "^\\w\\s{3}\\u{2665}{3}\\s\\d(?:\\d\\s\\w{3}\\s){2}\\w(?:\\w{3}\\s){2}\\u{1f4a9}{2}\\.$\n", )); } @@ -1725,7 +1776,7 @@ mod digit_space_word_conversion { TEST_CASE, ]); grex.assert().success().stdout(predicate::eq( - "^\\w\\s{3}\\u{2665}{3}\\s\\d(?:\\d\\s\\w{3}\\s){2}\\w{4}\\s\\w{3}\\s(?:\\u{d83d}\\u{dca9}){2}\\.$\n", + "^\\w\\s{3}\\u{2665}{3}\\s\\d(?:\\d\\s\\w{3}\\s){2}\\w(?:\\w{3}\\s){2}(?:\\u{d83d}\\u{dca9}){2}\\.$\n", )); } @@ -1748,7 +1799,11 @@ mod digit_space_word_conversion { (?: \d\s\w{3}\s ){2} - \w{4}\s\w{3}\s💩{2}\. + \w + (?: + \w{3}\s + ){2} + 💩{2}\. $ "# ))); @@ -1774,7 +1829,11 @@ mod digit_space_word_conversion { (?: \d\s\w{3}\s ){2} - \w{4}\s\w{3}\s\u{1f4a9}{2}\. + \w + (?: + \w{3}\s + ){2} + \u{1f4a9}{2}\. 
$ "# ))); @@ -1801,7 +1860,10 @@ mod digit_space_word_conversion { (?: \d\s\w{3}\s ){2} - \w{4}\s\w{3}\s + \w + (?: + \w{3}\s + ){2} (?: \u{d83d}\u{dca9} ){2} @@ -2078,7 +2140,7 @@ mod non_space_conversion { let mut grex = init_command(); grex.args(&["--repetitions", "--non-spaces", TEST_CASE]); grex.assert().success().stdout(predicate::eq( - "^\\S {3}\\S{3} \\S{2} \\S{3} \\S \\S{3} \\S{4} \\S{3} \\S{3}$\n", + "^\\S {3}\\S(?:\\S{2} ){2}\\S{3} (?:\\S(?: \\S{3}){2}){2}$\n", )); } @@ -2087,7 +2149,7 @@ mod non_space_conversion { let mut grex = init_command(); grex.args(&["--repetitions", "--non-spaces", "--escape", TEST_CASE]); grex.assert().success().stdout(predicate::eq( - "^\\S {3}\\S{3} \\S{2} \\S{3} \\S \\S{3} \\S{4} \\S{3} \\S{3}$\n", + "^\\S {3}\\S(?:\\S{2} ){2}\\S{3} (?:\\S(?: \\S{3}){2}){2}$\n", )); } @@ -2102,7 +2164,7 @@ mod non_space_conversion { TEST_CASE, ]); grex.assert().success().stdout(predicate::eq( - "^\\S {3}\\S{3} \\S{2} \\S{3} \\S \\S{3} \\S{4} \\S{3} \\S{3}$\n", + "^\\S {3}\\S(?:\\S{2} ){2}\\S{3} (?:\\S(?: \\S{3}){2}){2}$\n", )); } @@ -2114,7 +2176,17 @@ mod non_space_conversion { r#" (?x) ^ - \S\ {3}\S{3}\ \S{2}\ \S{3}\ \S\ \S{3}\ \S{4}\ \S{3}\ \S{3} + \S\ {3}\S + (?: + \S{2}\ + ){2} + \S{3}\ + (?: + \S + (?: + \ \S{3} + ){2} + ){2} $ "# ))); @@ -2134,7 +2206,17 @@ mod non_space_conversion { r#" (?x) ^ - \S\ {3}\S{3}\ \S{2}\ \S{3}\ \S\ \S{3}\ \S{4}\ \S{3}\ \S{3} + \S\ {3}\S + (?: + \S{2}\ + ){2} + \S{3}\ + (?: + \S + (?: + \ \S{3} + ){2} + ){2} $ "# ))); @@ -2155,7 +2237,17 @@ mod non_space_conversion { r#" (?x) ^ - \S\ {3}\S{3}\ \S{2}\ \S{3}\ \S\ \S{3}\ \S{4}\ \S{3}\ \S{3} + \S\ {3}\S + (?: + \S{2}\ + ){2} + \S{3}\ + (?: + \S + (?: + \ \S{3} + ){2} + ){2} $ "# ))); @@ -2859,7 +2951,7 @@ mod non_space_non_word_conversion { let mut grex = init_command(); grex.args(&["--repetitions", "--non-spaces", "--non-words", TEST_CASE]); grex.assert().success().stdout(predicate::eq( - "^\\S\\W{7}\\S{2}\\W\\S{3}\\W\\S\\W\\S{3}\\W\\S{4}\\W\\S{3}\\W{4}$\n", + 
"^\\S\\W{7}\\S(?:\\S\\W\\S{3}\\W){2}\\S{4}\\W\\S{3}\\W{4}$\n", )); } @@ -2874,7 +2966,7 @@ mod non_space_non_word_conversion { TEST_CASE, ]); grex.assert().success().stdout(predicate::eq( - "^\\S\\W{7}\\S{2}\\W\\S{3}\\W\\S\\W\\S{3}\\W\\S{4}\\W\\S{3}\\W{4}$\n", + "^\\S\\W{7}\\S(?:\\S\\W\\S{3}\\W){2}\\S{4}\\W\\S{3}\\W{4}$\n", )); } @@ -2890,7 +2982,7 @@ mod non_space_non_word_conversion { TEST_CASE, ]); grex.assert().success().stdout(predicate::eq( - "^\\S\\W{7}\\S{2}\\W\\S{3}\\W\\S\\W\\S{3}\\W\\S{4}\\W\\S{3}\\W{4}$\n", + "^\\S\\W{7}\\S(?:\\S\\W\\S{3}\\W){2}\\S{4}\\W\\S{3}\\W{4}$\n", )); } @@ -2908,7 +3000,11 @@ mod non_space_non_word_conversion { r#" (?x) ^ - \S\W{7}\S{2}\W\S{3}\W\S\W\S{3}\W\S{4}\W\S{3}\W{4} + \S\W{7}\S + (?: + \S\W\S{3}\W + ){2} + \S{4}\W\S{3}\W{4} $ "# ))); @@ -2929,7 +3025,11 @@ mod non_space_non_word_conversion { r#" (?x) ^ - \S\W{7}\S{2}\W\S{3}\W\S\W\S{3}\W\S{4}\W\S{3}\W{4} + \S\W{7}\S + (?: + \S\W\S{3}\W + ){2} + \S{4}\W\S{3}\W{4} $ "# ))); @@ -2951,7 +3051,11 @@ mod non_space_non_word_conversion { r#" (?x) ^ - \S\W{7}\S{2}\W\S{3}\W\S\W\S{3}\W\S{4}\W\S{3}\W{4} + \S\W{7}\S + (?: + \S\W\S{3}\W + ){2} + \S{4}\W\S{3}\W{4} $ "# ))); @@ -3494,7 +3598,7 @@ mod space_non_space_conversion { let mut grex = init_command(); grex.args(&["--repetitions", "--spaces", "--non-spaces", TEST_CASE]); grex.assert().success().stdout(predicate::eq( - "^\\S\\s{3}\\S{3}\\s\\S{2}\\s\\S{3}\\s\\S\\s\\S{3}\\s\\S{4}\\s\\S{3}\\s\\S{3}$\n", + "^\\S\\s{3}\\S(?:\\S{2}\\s){2}\\S{3}\\s(?:\\S(?:\\s\\S{3}){2}){2}$\n", )); } @@ -3509,7 +3613,7 @@ mod space_non_space_conversion { TEST_CASE, ]); grex.assert().success().stdout(predicate::eq( - "^\\S\\s{3}\\S{3}\\s\\S{2}\\s\\S{3}\\s\\S\\s\\S{3}\\s\\S{4}\\s\\S{3}\\s\\S{3}$\n", + "^\\S\\s{3}\\S(?:\\S{2}\\s){2}\\S{3}\\s(?:\\S(?:\\s\\S{3}){2}){2}$\n", )); } @@ -3525,7 +3629,7 @@ mod space_non_space_conversion { TEST_CASE, ]); grex.assert().success().stdout(predicate::eq( - 
"^\\S\\s{3}\\S{3}\\s\\S{2}\\s\\S{3}\\s\\S\\s\\S{3}\\s\\S{4}\\s\\S{3}\\s\\S{3}$\n", + "^\\S\\s{3}\\S(?:\\S{2}\\s){2}\\S{3}\\s(?:\\S(?:\\s\\S{3}){2}){2}$\n", )); } @@ -3543,7 +3647,17 @@ mod space_non_space_conversion { r#" (?x) ^ - \S\s{3}\S{3}\s\S{2}\s\S{3}\s\S\s\S{3}\s\S{4}\s\S{3}\s\S{3} + \S\s{3}\S + (?: + \S{2}\s + ){2} + \S{3}\s + (?: + \S + (?: + \s\S{3} + ){2} + ){2} $ "# ))); @@ -3564,7 +3678,17 @@ mod space_non_space_conversion { r#" (?x) ^ - \S\s{3}\S{3}\s\S{2}\s\S{3}\s\S\s\S{3}\s\S{4}\s\S{3}\s\S{3} + \S\s{3}\S + (?: + \S{2}\s + ){2} + \S{3}\s + (?: + \S + (?: + \s\S{3} + ){2} + ){2} $ "# ))); @@ -3586,7 +3710,17 @@ mod space_non_space_conversion { r#" (?x) ^ - \S\s{3}\S{3}\s\S{2}\s\S{3}\s\S\s\S{3}\s\S{4}\s\S{3}\s\S{3} + \S\s{3}\S + (?: + \S{2}\s + ){2} + \S{3}\s + (?: + \S + (?: + \s\S{3} + ){2} + ){2} $ "# ))); @@ -3691,7 +3825,7 @@ mod word_non_word_conversion { let mut grex = init_command(); grex.args(&["--repetitions", "--words", "--non-words", TEST_CASE]); grex.assert().success().stdout(predicate::eq( - "^\\w\\W{7}\\w{2}\\W\\w{3}\\W\\w\\W\\w{3}\\W\\w{4}\\W\\w{3}\\W{4}$\n", + "^\\w\\W{7}\\w(?:\\w\\W\\w{3}\\W){2}\\w{4}\\W\\w{3}\\W{4}$\n", )); } @@ -3706,7 +3840,7 @@ mod word_non_word_conversion { TEST_CASE, ]); grex.assert().success().stdout(predicate::eq( - "^\\w\\W{7}\\w{2}\\W\\w{3}\\W\\w\\W\\w{3}\\W\\w{4}\\W\\w{3}\\W{4}$\n", + "^\\w\\W{7}\\w(?:\\w\\W\\w{3}\\W){2}\\w{4}\\W\\w{3}\\W{4}$\n", )); } @@ -3722,7 +3856,7 @@ mod word_non_word_conversion { TEST_CASE, ]); grex.assert().success().stdout(predicate::eq( - "^\\w\\W{7}\\w{2}\\W\\w{3}\\W\\w\\W\\w{3}\\W\\w{4}\\W\\w{3}\\W{4}$\n", + "^\\w\\W{7}\\w(?:\\w\\W\\w{3}\\W){2}\\w{4}\\W\\w{3}\\W{4}$\n", )); } @@ -3740,7 +3874,11 @@ mod word_non_word_conversion { r#" (?x) ^ - \w\W{7}\w{2}\W\w{3}\W\w\W\w{3}\W\w{4}\W\w{3}\W{4} + \w\W{7}\w + (?: + \w\W\w{3}\W + ){2} + \w{4}\W\w{3}\W{4} $ "# ))); @@ -3761,7 +3899,11 @@ mod word_non_word_conversion { r#" (?x) ^ - \w\W{7}\w{2}\W\w{3}\W\w\W\w{3}\W\w{4}\W\w{3}\W{4} + 
\w\W{7}\w + (?: + \w\W\w{3}\W + ){2} + \w{4}\W\w{3}\W{4} $ "# ))); @@ -3783,7 +3925,11 @@ mod word_non_word_conversion { r#" (?x) ^ - \w\W{7}\w{2}\W\w{3}\W\w\W\w{3}\W\w{4}\W\w{3}\W{4} + \w\W{7}\w + (?: + \w\W\w{3}\W + ){2} + \w{4}\W\w{3}\W{4} $ "# ))); diff --git a/tests/lib_integration_tests.rs b/tests/lib_integration_tests.rs index fcccafb..7ec5304 100644 --- a/tests/lib_integration_tests.rs +++ b/tests/lib_integration_tests.rs @@ -318,6 +318,12 @@ mod no_conversion { case(vec!["a"], "^a$"), case(vec!["aa"], "^a{2}$"), case(vec!["aaa"], "^a{3}$"), + case(vec!["aaa aaa"], "^a{3} a{3}$"), + case(vec!["ababab ababab"], "^(?:ab){3} (?:ab){3}$"), + case(vec!["ababab ababab"], "^(?:ab){3} {2}(?:ab){3}$"), + case(vec!["a ababab ababab"], "^a(?: (?:ab){3}){2}$"), + case(vec!["ababab ababab a"], "^a(?:b(?:ab){2} a){2}$"), + case(vec!["ababababab abab ababab"], "^ababab(?:(?:ab){2} ){2}(?:ab){3}$"), case(vec!["a", "aa"], "^a{1,2}$"), case(vec!["aaa", "a", "aa"], "^a{1,3}$"), case(vec!["aaaa", "a", "aa"], "^(?:a{1,2}|a{4})$"), @@ -331,9 +337,9 @@ mod no_conversion { case(vec!["aababab"], "^a(?:ab){3}$"), case(vec!["abababaa"], "^(?:ab){3}a{2}$"), case(vec!["aaaaaabbbbb"], "^a{6}b{5}$"), - case(vec!["aabaababab"], "^(?:a{2}b){2}abab$"), // goal: ^(a{2}b){2}(ab){2}$ + case(vec!["aabaababab"], "^a{2}ba(?:ab){3}$"), case(vec!["aaaaaaabbbbbba"], "^a{7}b{6}a$"), - case(vec!["abaaaabaaba"], "^abaa(?:a{2}b){2}a$"), + case(vec!["abaaaabaaba"], "^abaaa(?:aba){2}$"), case(vec!["bbaababb"], "^b{2}a{2}bab{2}$"), case(vec!["b", "ba"], "^ba?$"), case(vec!["b", "ba", "baa"], "^b(?:a{1,2})?$"), @@ -483,21 +489,20 @@ mod no_conversion { r#" (?x) ^ + a{2}ba (?: - a{2}b - ){2} - abab + ab + ){3} $"# )), case(vec!["abaaaabaaba"], indoc!( r#" (?x) ^ - abaa + abaaa (?: - a{2}b + aba ){2} - a $"# )), case(vec!["xy̆y̆z", "xy̆y̆y̆y̆z"], indoc!( @@ -565,6 +570,7 @@ mod no_conversion { case(vec!["aaa"], "^aaa$"), case(vec!["aaaa"], "^a{4}$"), case(vec!["aaaaa"], "^a{5}$"), + case(vec!["ababababab abab 
ababab"], "^(?:ab){5} abab ababab$"), case(vec!["aabbaaaabbbabbbbba"], "^aabba{4}bbbab{5}a$"), case(vec!["baabaaaaaabb"], "^baaba{6}bb$"), case(vec!["ababab"], "^ababab$"), @@ -595,7 +601,8 @@ mod no_conversion { case(vec!["ababab"], "^ababab$"), case(vec!["abcabcabc"], "^(?:abc){3}$"), case(vec!["abcabcabc", "dede"], "^(?:dede|(?:abc){3})$"), - case(vec!["abcabcabc", "defgdefg"], "^(?:(?:defg){2}|(?:abc){3})$") + case(vec!["abcabcabc", "defgdefg"], "^(?:(?:defg){2}|(?:abc){3})$"), + case(vec!["ababababab abab ababab"], "^ababab(?:abab ){2}ababab$") )] fn succeeds_with_increased_minimum_substring_length( test_cases: Vec<&str>, @@ -614,7 +621,8 @@ mod no_conversion { case(vec!["abcabcabc"], "^abcabcabc$"), case(vec!["abcabcabcabc"], "^(?:abc){4}$"), case(vec!["aaaaaaaaaaaa"], "^aaaaaaaaaaaa$"), - case(vec!["abababab", "abcabcabcabc"], "^(?:abababab|(?:abc){4})$") + case(vec!["abababab", "abcabcabcabc"], "^(?:abababab|(?:abc){4})$"), + case(vec!["ababababab abab ababab"], "^ababababab abab ababab$") )] fn succeeds_with_increased_minimum_repetitions_and_substring_length( test_cases: Vec<&str>, @@ -962,7 +970,7 @@ mod word_conversion { #[rstest(test_cases, expected_output, case( vec!["I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."], - "^\\w {3}♥{3} \\w{2} \\w{3} \\w \\w{3} \\w{4} \\w{3} 💩{2}\\.$" + "^\\w {3}♥{3} \\w{2}(?: \\w{3} \\w){2}(?:\\w{3} ){2}💩{2}\\.$" ) )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { @@ -976,7 +984,7 @@ mod word_conversion { #[rstest(test_cases, expected_output, case( vec!["I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."], - "^\\w {3}\\u{2665}{3} \\w{2} \\w{3} \\w \\w{3} \\w{4} \\w{3} \\u{1f4a9}{2}\\.$" + "^\\w {3}\\u{2665}{3} \\w{2}(?: \\w{3} \\w){2}(?:\\w{3} ){2}\\u{1f4a9}{2}\\.$" ) )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { @@ -991,7 +999,7 @@ mod word_conversion { #[rstest(test_cases, expected_output, case( vec!["I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."], - "^\\w {3}\\u{2665}{3} \\w{2} \\w{3} \\w \\w{3} \\w{4} \\w{3} 
(?:\\u{d83d}\\u{dca9}){2}\\.$" + "^\\w {3}\\u{2665}{3} \\w{2}(?: \\w{3} \\w){2}(?:\\w{3} ){2}(?:\\u{d83d}\\u{dca9}){2}\\.$" ) )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { @@ -1241,7 +1249,7 @@ mod digit_word_conversion { #[rstest(test_cases, expected_output, case( vec!["I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."], - "^\\w {3}♥{3} \\d(?:\\d \\w{3} ){2}\\w{4} \\w{3} 💩{2}\\.$" + "^\\w {3}♥{3} \\d(?:\\d \\w{3} ){2}\\w(?:\\w{3} ){2}💩{2}\\.$" ) )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { @@ -1255,7 +1263,7 @@ mod digit_word_conversion { #[rstest(test_cases, expected_output, case( vec!["I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."], - "^\\w {3}\\u{2665}{3} \\d(?:\\d \\w{3} ){2}\\w{4} \\w{3} \\u{1f4a9}{2}\\.$" + "^\\w {3}\\u{2665}{3} \\d(?:\\d \\w{3} ){2}\\w(?:\\w{3} ){2}\\u{1f4a9}{2}\\.$" ) )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { @@ -1270,7 +1278,7 @@ mod digit_word_conversion { #[rstest(test_cases, expected_output, case( vec!["I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."], - "^\\w {3}\\u{2665}{3} \\d(?:\\d \\w{3} ){2}\\w{4} \\w{3} (?:\\u{d83d}\\u{dca9}){2}\\.$" + "^\\w {3}\\u{2665}{3} \\d(?:\\d \\w{3} ){2}\\w(?:\\w{3} ){2}(?:\\u{d83d}\\u{dca9}){2}\\.$" ) )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { @@ -1339,7 +1347,7 @@ mod space_word_conversion { #[rstest(test_cases, expected_output, case( vec!["I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."], - "^\\w\\s{3}♥{3}\\s\\w{2}\\s\\w{3}\\s\\w\\s\\w{3}\\s\\w{4}\\s\\w{3}\\s💩{2}\\.$" + "^\\w\\s{3}♥{3}\\s\\w{2}(?:\\s\\w{3}\\s\\w){2}(?:\\w{3}\\s){2}💩{2}\\.$" ) )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { @@ -1353,7 +1361,7 @@ mod space_word_conversion { #[rstest(test_cases, expected_output, case( vec!["I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."], - "^\\w\\s{3}\\u{2665}{3}\\s\\w{2}\\s\\w{3}\\s\\w\\s\\w{3}\\s\\w{4}\\s\\w{3}\\s\\u{1f4a9}{2}\\.$" + 
"^\\w\\s{3}\\u{2665}{3}\\s\\w{2}(?:\\s\\w{3}\\s\\w){2}(?:\\w{3}\\s){2}\\u{1f4a9}{2}\\.$" ) )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { @@ -1368,7 +1376,7 @@ mod space_word_conversion { #[rstest(test_cases, expected_output, case( vec!["I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."], - "^\\w\\s{3}\\u{2665}{3}\\s\\w{2}\\s\\w{3}\\s\\w\\s\\w{3}\\s\\w{4}\\s\\w{3}\\s(?:\\u{d83d}\\u{dca9}){2}\\.$" + "^\\w\\s{3}\\u{2665}{3}\\s\\w{2}(?:\\s\\w{3}\\s\\w){2}(?:\\w{3}\\s){2}(?:\\u{d83d}\\u{dca9}){2}\\.$" ) )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { @@ -1437,7 +1445,7 @@ mod digit_space_word_conversion { #[rstest(test_cases, expected_output, case( vec!["I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."], - "^\\w\\s{3}♥{3}\\s\\d(?:\\d\\s\\w{3}\\s){2}\\w{4}\\s\\w{3}\\s💩{2}\\.$" + "^\\w\\s{3}♥{3}\\s\\d(?:\\d\\s\\w{3}\\s){2}\\w(?:\\w{3}\\s){2}💩{2}\\.$" ) )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { @@ -1456,7 +1464,7 @@ mod digit_space_word_conversion { #[rstest(test_cases, expected_output, case( vec!["I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."], - "^\\w\\s{3}\\u{2665}{3}\\s\\d(?:\\d\\s\\w{3}\\s){2}\\w{4}\\s\\w{3}\\s\\u{1f4a9}{2}\\.$" + "^\\w\\s{3}\\u{2665}{3}\\s\\d(?:\\d\\s\\w{3}\\s){2}\\w(?:\\w{3}\\s){2}\\u{1f4a9}{2}\\.$" ) )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { @@ -1476,7 +1484,7 @@ mod digit_space_word_conversion { #[rstest(test_cases, expected_output, case( vec!["I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."], - "^\\w\\s{3}\\u{2665}{3}\\s\\d(?:\\d\\s\\w{3}\\s){2}\\w{4}\\s\\w{3}\\s(?:\\u{d83d}\\u{dca9}){2}\\.$" + "^\\w\\s{3}\\u{2665}{3}\\s\\d(?:\\d\\s\\w{3}\\s){2}\\w(?:\\w{3}\\s){2}(?:\\u{d83d}\\u{dca9}){2}\\.$" ) )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { @@ -1585,7 +1593,7 @@ mod non_space_conversion { #[rstest(test_cases, expected_output, case( vec!["I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."], - "^\\S {3}\\S{3} \\S{2} \\S{3} \\S \\S{3} 
\\S{4} \\S{3} \\S{3}$" + "^\\S {3}\\S(?:\\S{2} ){2}\\S{3} (?:\\S(?: \\S{3}){2}){2}$" ) )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { @@ -1769,7 +1777,7 @@ mod non_space_non_word_conversion { #[rstest(test_cases, expected_output, case( vec!["I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."], - "^\\S\\W{7}\\S{2}\\W\\S{3}\\W\\S\\W\\S{3}\\W\\S{4}\\W\\S{3}\\W{4}$" + "^\\S\\W{7}\\S(?:\\S\\W\\S{3}\\W){2}\\S{4}\\W\\S{3}\\W{4}$" ) )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { @@ -1888,7 +1896,7 @@ mod space_non_space_conversion { #[rstest(test_cases, expected_output, case( vec!["I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."], - "^\\S\\s{3}\\S{3}\\s\\S{2}\\s\\S{3}\\s\\S\\s\\S{3}\\s\\S{4}\\s\\S{3}\\s\\S{3}$" + "^\\S\\s{3}\\S(?:\\S{2}\\s){2}\\S{3}\\s(?:\\S(?:\\s\\S{3}){2}){2}$" ) )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { @@ -1928,7 +1936,7 @@ mod word_non_word_conversion { #[rstest(test_cases, expected_output, case( vec!["I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."], - "^\\w\\W{7}\\w{2}\\W\\w{3}\\W\\w\\W\\w{3}\\W\\w{4}\\W\\w{3}\\W{4}$" + "^\\w\\W{7}\\w(?:\\w\\W\\w{3}\\W){2}\\w{4}\\W\\w{3}\\W{4}$" ) )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { From 81acfb5b2c540ee376646c98fae834f4023ad970 Mon Sep 17 00:00:00 2001 From: "Peter M. 
Stahl" Date: Thu, 26 Aug 2021 15:34:28 +0200 Subject: [PATCH 11/18] Rotate alternations to fix prefix matches (#31) --- src/ast/format.rs | 2 +- src/fsm/dfa.rs | 39 +++++++++++++++++----------------- src/regexp/regexp.rs | 37 ++++++++++++++++++++++++++++++-- tests/lib_integration_tests.rs | 20 +++++++++++++++-- tests/property_tests.rs | 21 ++++++++---------- 5 files changed, 83 insertions(+), 36 deletions(-) diff --git a/src/ast/format.rs b/src/ast/format.rs index 7b6468f..d1ef68e 100644 --- a/src/ast/format.rs +++ b/src/ast/format.rs @@ -77,7 +77,7 @@ fn format_character_class( char_set: &BTreeSet, config: &RegExpConfig, ) -> Result { - let chars_to_escape = ['[', ']', '\\', '-', '^']; + let chars_to_escape = ['[', ']', '\\', '-', '^', '$']; let escaped_char_set = char_set .iter() .map(|c| { diff --git a/src/fsm/dfa.rs b/src/fsm/dfa.rs index 119d920..62b9c30 100644 --- a/src/fsm/dfa.rs +++ b/src/fsm/dfa.rs @@ -37,21 +37,18 @@ pub struct Dfa { } impl Dfa { - pub(crate) fn from(grapheme_clusters: Vec, config: &RegExpConfig) -> Self { + pub(crate) fn from( + grapheme_clusters: &[GraphemeCluster], + is_minimized: bool, + config: &RegExpConfig, + ) -> Self { let mut dfa = Self::new(config); for cluster in grapheme_clusters { dfa.insert(cluster); } - - let is_exactly_one_anchor_disabled = - config.is_start_anchor_disabled ^ config.is_end_anchor_disabled; - let is_no_anchor_disabled = - !config.is_start_anchor_disabled && !config.is_end_anchor_disabled; - - if is_exactly_one_anchor_disabled || is_no_anchor_disabled { + if is_minimized { dfa.minimize(); } - dfa } @@ -88,17 +85,17 @@ impl Dfa { } } - fn insert(&mut self, cluster: GraphemeCluster) { + fn insert(&mut self, cluster: &GraphemeCluster) { let mut current_state = self.initial_state; for grapheme in cluster.graphemes() { self.alphabet.insert(grapheme.clone()); - current_state = self.get_next_state(current_state, grapheme); + current_state = self.return_next_state(current_state, grapheme); } 
self.final_state_indices.insert(current_state.index()); } - fn get_next_state(&mut self, current_state: State, edge_label: &Grapheme) -> State { + fn return_next_state(&mut self, current_state: State, edge_label: &Grapheme) -> State { match self.find_next_state(current_state, edge_label) { Some(next_state) => next_state, None => self.add_new_state(current_state, edge_label), @@ -276,7 +273,7 @@ mod tests { let mut dfa = Dfa::new(&config); assert_eq!(dfa.state_count(), 1); - dfa.insert(GraphemeCluster::from("abcd", &RegExpConfig::new())); + dfa.insert(&GraphemeCluster::from("abcd", &RegExpConfig::new())); assert_eq!(dfa.state_count(), 5); } @@ -284,7 +281,8 @@ mod tests { fn test_is_final_state() { let config = RegExpConfig::new(); let dfa = Dfa::from( - vec![GraphemeCluster::from("abcd", &RegExpConfig::new())], + &vec![GraphemeCluster::from("abcd", &RegExpConfig::new())], + true, &config, ); @@ -299,10 +297,11 @@ mod tests { fn test_outgoing_edges() { let config = RegExpConfig::new(); let dfa = Dfa::from( - vec![ + &vec![ GraphemeCluster::from("abcd", &RegExpConfig::new()), GraphemeCluster::from("abxd", &RegExpConfig::new()), ], + true, &config, ); let state = State::new(2); @@ -330,10 +329,11 @@ mod tests { fn test_states_in_depth_first_order() { let config = RegExpConfig::new(); let dfa = Dfa::from( - vec![ + &vec![ GraphemeCluster::from("abcd", &RegExpConfig::new()), GraphemeCluster::from("axyz", &RegExpConfig::new()), ], + true, &config, ); let states = dfa.states_in_depth_first_order(); @@ -403,11 +403,11 @@ mod tests { assert_eq!(dfa.graph.node_count(), 1); assert_eq!(dfa.graph.edge_count(), 0); - dfa.insert(GraphemeCluster::from("abcd", &RegExpConfig::new())); + dfa.insert(&GraphemeCluster::from("abcd", &RegExpConfig::new())); assert_eq!(dfa.graph.node_count(), 5); assert_eq!(dfa.graph.edge_count(), 4); - dfa.insert(GraphemeCluster::from("abxd", &RegExpConfig::new())); + dfa.insert(&GraphemeCluster::from("abxd", &RegExpConfig::new())); 
assert_eq!(dfa.graph.node_count(), 7); assert_eq!(dfa.graph.edge_count(), 6); @@ -420,10 +420,11 @@ mod tests { fn test_dfa_constructor() { let config = RegExpConfig::new(); let dfa = Dfa::from( - vec![ + &vec![ GraphemeCluster::from("abcd", &RegExpConfig::new()), GraphemeCluster::from("abxd", &RegExpConfig::new()), ], + true, &config, ); assert_eq!(dfa.graph.node_count(), 5); diff --git a/src/regexp/regexp.rs b/src/regexp/regexp.rs index 8853ba0..9a0b305 100644 --- a/src/regexp/regexp.rs +++ b/src/regexp/regexp.rs @@ -37,8 +37,17 @@ impl RegExp { } Self::sort(test_cases); let grapheme_clusters = Self::grapheme_clusters(test_cases, config); - let dfa = Dfa::from(grapheme_clusters, config); - let ast = Expression::from(dfa, config); + let mut dfa = Dfa::from(&grapheme_clusters, true, config); + let mut ast = Expression::from(dfa, config); + + if config.is_start_anchor_disabled + && config.is_end_anchor_disabled + && !Self::is_each_test_case_matched(&mut ast, test_cases) + { + dfa = Dfa::from(&grapheme_clusters, false, config); + ast = Expression::from(dfa, config); + } + Self { ast, config: config.clone(), @@ -78,6 +87,30 @@ impl RegExp { clusters } + + fn is_each_test_case_matched(expr: &mut Expression, test_cases: &[String]) -> bool { + for _ in 1..test_cases.len() { + let regex = Regex::new(&expr.to_string()).unwrap(); + if test_cases + .iter() + .all(|test_case| regex.find_iter(test_case).count() == 1) + { + return true; + } else if let Expression::Alternation(options, _) = expr { + options.rotate_right(1); + } else if let Expression::Concatenation(first, second, _) = expr { + let a: &mut Expression = first; + let b: &mut Expression = second; + + if let Expression::Alternation(options, _) = a { + options.rotate_right(1); + } else if let Expression::Alternation(options, _) = b { + options.rotate_right(1); + } + } + } + false + } } impl Display for RegExp { diff --git a/tests/lib_integration_tests.rs b/tests/lib_integration_tests.rs index 7ec5304..11e8369 100644 
--- a/tests/lib_integration_tests.rs +++ b/tests/lib_integration_tests.rs @@ -306,6 +306,21 @@ mod no_conversion { assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); } + + #[rstest(test_cases, expected_output, + case(vec!["bab", "b", "cb", "bba"], "(?:(?:ba|c)b|b(?:ba)?)"), + case(vec!["a", "aba", "baaa", "aaab"], "(?:baaa|a(?:aab|ba)?)"), + case(vec!["a", "abab", "bbb", "aaac"], "(?:a(?:bab|aac)?|bbb)"), + case( + // https://github.com/pemistahl/grex/issues/31 + vec!["agbhd", "eibcd", "egbcd", "fbjbf", "agbh", "eibc", "egbc", "ebc", "fbc", "cd", "f", "c", "abcd", "ebcd", "fbcd"], + "(?:(?:e(?:[gi])?bc|(?:fb)?c)d?|a(?:gbhd?|bcd)|f(?:bjbf)?)") + )] + fn succeeds_without_anchors(test_cases: Vec<&str>, expected_output: &str) { + let regexp = RegExpBuilder::from(&test_cases).without_anchors().build(); + assert_that_regexp_is_correct(regexp, expected_output, &test_cases); + assert_that_regexp_matches_test_cases(expected_output, test_cases); + } } mod repetition { @@ -2056,8 +2071,9 @@ fn assert_that_regexp_is_correct(regexp: String, expected_output: &str, test_cas fn assert_that_regexp_matches_test_cases(expected_output: &str, test_cases: Vec<&str>) { let re = Regex::new(expected_output).unwrap(); for test_case in test_cases { - assert!( - re.is_match(test_case), + assert_eq!( + re.find_iter(test_case).count(), + 1, "\n\n\"{}\" does not match regex {}\n\n", test_case, expected_output diff --git a/tests/property_tests.rs b/tests/property_tests.rs index 7074fea..13e5b2b 100644 --- a/tests/property_tests.rs +++ b/tests/property_tests.rs @@ -229,15 +229,14 @@ proptest! 
{ let regexp = RegExpBuilder::from(&test_cases_vec).without_start_anchor().build(); if let Ok(compiled_regexp) = compile_regexp(®exp) { for test_case in test_cases_vec { - let matches = compiled_regexp.find_iter(&test_case).collect::>(); - let substrings = matches.iter().map(|m| m.as_str()).collect::>(); + let substrings = compiled_regexp.find_iter(&test_case).map(|m| m.as_str()).collect::>(); prop_assert_eq!( - matches.len(), + substrings.len(), 1, "expression '{}' does not match test case '{}' entirely but {} of its substrings: {:?}", regexp, test_case, - matches.len(), + substrings.len(), substrings ); } @@ -252,15 +251,14 @@ proptest! { let regexp = RegExpBuilder::from(&test_cases_vec).without_end_anchor().build(); if let Ok(compiled_regexp) = compile_regexp(®exp) { for test_case in test_cases_vec { - let matches = compiled_regexp.find_iter(&test_case).collect::>(); - let substrings = matches.iter().map(|m| m.as_str()).collect::>(); + let substrings = compiled_regexp.find_iter(&test_case).map(|m| m.as_str()).collect::>(); prop_assert_eq!( - matches.len(), + substrings.len(), 1, "expression '{}' does not match test case '{}' entirely but {} of its substrings: {:?}", regexp, test_case, - matches.len(), + substrings.len(), substrings ); } @@ -275,15 +273,14 @@ proptest! { let regexp = RegExpBuilder::from(&test_cases_vec).without_anchors().build(); if let Ok(compiled_regexp) = compile_regexp(®exp) { for test_case in test_cases_vec { - let matches = compiled_regexp.find_iter(&test_case).collect::>(); - let substrings = matches.iter().map(|m| m.as_str()).collect::>(); + let substrings = compiled_regexp.find_iter(&test_case).map(|m| m.as_str()).collect::>(); prop_assert_eq!( - matches.len(), + substrings.len(), 1, "expression '{}' does not match test case '{}' entirely but {} of its substrings: {:?}", regexp, test_case, - matches.len(), + substrings.len(), substrings ); } From 8f4d941f6409795f305eaed09a7259daecaf4bee Mon Sep 17 00:00:00 2001 From: "Peter M. 
Stahl" Date: Sat, 28 Aug 2021 00:25:11 +0200 Subject: [PATCH 12/18] Add stdin support to command-line tool (#45) --- Cargo.lock | 5 +++-- Cargo.toml | 3 ++- src/main.rs | 27 ++++++++++++++++++++++++--- tests/cli_integration_tests.rs | 24 ++++++++++++++++++++++-- 4 files changed, 51 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3325b29..d92e3d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -22,9 +22,9 @@ dependencies = [ [[package]] name = "assert_cmd" -version = "1.0.7" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d20831bd004dda4c7c372c19cdabff369f794a95e955b3f13fe460e3e1ae95f" +checksum = "54f002ce7d0c5e809ebb02be78fd503aeed4a511fd0fcaff6e6914cbdabbfa33" dependencies = [ "bstr", "doc-comment", @@ -165,6 +165,7 @@ name = "grex" version = "1.3.0" dependencies = [ "assert_cmd", + "atty", "indoc", "itertools", "lazy_static", diff --git a/Cargo.toml b/Cargo.toml index 1af1b98..2a7c60c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ categories = ["command-line-utilities"] keywords = ["pattern", "regex", "regexp"] [dependencies] +atty = "0.2.14" itertools = "0.10.1" lazy_static = "1.4.0" ndarray = "0.15.3" @@ -40,7 +41,7 @@ unic-ucd-category = "0.9.0" unicode-segmentation = "1.8.0" [dev-dependencies] -assert_cmd = "1.0.7" +assert_cmd = "2.0.0" indoc = "1.0.3" predicates = "2.0.0" proptest = "1.0.0" diff --git a/src/main.rs b/src/main.rs index 747465f..b17ad6a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -16,7 +16,7 @@ use grex::{Feature, RegExpBuilder}; use itertools::Itertools; -use std::io::{Error, ErrorKind}; +use std::io::{BufRead, Error, ErrorKind, Read}; use std::path::PathBuf; use structopt::clap::AppSettings::{AllowLeadingHyphen, ColoredHelp}; use structopt::StructOpt; @@ -276,10 +276,31 @@ fn main() { } fn obtain_input(cli: &Cli) -> Result, Error> { + let is_stdin_available = atty::isnt(atty::Stream::Stdin); + if !cli.input.is_empty() { - Ok(cli.input.clone()) + let 
is_single_item = cli.input.len() == 1; + let is_hyphen = cli.input.get(0).unwrap() == "-"; + + if is_single_item && is_hyphen && is_stdin_available { + Ok(std::io::stdin() + .lock() + .lines() + .map(|line| line.unwrap()) + .collect_vec()) + } else { + Ok(cli.input.clone()) + } } else if let Some(file_path) = &cli.file_path { - match std::fs::read_to_string(&file_path) { + let is_hyphen = file_path.as_os_str() == "-"; + let path = if is_hyphen && is_stdin_available { + let mut stdin_file_path = String::new(); + std::io::stdin().read_to_string(&mut stdin_file_path)?; + PathBuf::from(stdin_file_path.trim()) + } else { + file_path.to_path_buf() + }; + match std::fs::read_to_string(&path) { Ok(file_content) => Ok(file_content.lines().map(|it| it.to_string()).collect_vec()), Err(error) => Err(error), } diff --git a/tests/cli_integration_tests.rs b/tests/cli_integration_tests.rs index 0d1ef59..f9e0463 100644 --- a/tests/cli_integration_tests.rs +++ b/tests/cli_integration_tests.rs @@ -14,11 +14,10 @@ * limitations under the License. 
*/ -use assert_cmd::prelude::*; +use assert_cmd::Command; use indoc::indoc; use predicates::prelude::*; use std::io::Write; -use std::process::Command; use tempfile::NamedTempFile; const TEST_CASE: &str = "I ♥♥♥ 36 and ٣ and y̆y̆ and 💩💩."; @@ -128,6 +127,27 @@ mod no_conversion { .stdout(predicate::eq("^(?:b\\\\n|äöü|[ac♥])$\n")); } + #[test] + fn succeeds_with_test_cases_from_stdin() { + let mut grex = init_command(); + grex.write_stdin("a\nb\\n\n\nc\näöü\n♥") + .arg("-") + .assert() + .stdout(predicate::eq("^(?:b\\\\n|äöü|[ac♥])$\n")); + } + + #[test] + fn succeeds_with_file_from_stdin() { + let mut file = NamedTempFile::new().unwrap(); + writeln!(file, "a\nb\\n\n\nc\näöü\n♥").unwrap(); + + let mut grex = init_command(); + grex.write_stdin(file.path().to_str().unwrap()) + .args(&["-f", "-"]) + .assert() + .stdout(predicate::eq("^(?:b\\\\n|äöü|[ac♥])$\n")); + } + #[test] fn fails_with_surrogate_but_without_escape_option() { let mut grex = init_command(); From 60c8541eb766bc226a4df078589c212d6121934e Mon Sep 17 00:00:00 2001 From: "Peter M. Stahl" Date: Sun, 29 Aug 2021 12:57:46 +0200 Subject: [PATCH 13/18] Fix verbose regex for specific space character --- src/regexp/regexp.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/regexp/regexp.rs b/src/regexp/regexp.rs index 9a0b305..bc6e7d5 100644 --- a/src/regexp/regexp.rs +++ b/src/regexp/regexp.rs @@ -299,6 +299,7 @@ fn apply_verbose_mode(regexp: String, config: &RegExpConfig) -> String { .replace(" ", "\\s") .replace(" ", "\\s") .replace("\u{85}", "\\s") + .replace("\u{2005}", "\\s") .replace("\u{2028}", "\\s") .replace(" ", "\\ "); From 5108cbd5083d5b557cef949c1f03ce86587540cc Mon Sep 17 00:00:00 2001 From: "Peter M. 
Stahl" Date: Tue, 14 Sep 2021 19:08:16 +0200 Subject: [PATCH 14/18] Replace enum `Feature` with builder methods (#47) --- src/lib.rs | 27 ++-- src/main.rs | 25 ++- src/regexp/builder.rs | 106 ++++++++++++- src/regexp/config.rs | 53 +++++-- src/regexp/feature.rs | 21 +-- src/regexp/mod.rs | 2 + tests/lib_integration_tests.rs | 273 ++++++++++++++++++++------------- tests/property_tests.rs | 2 + 8 files changed, 352 insertions(+), 157 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index cc5883c..7776b98 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -99,10 +99,11 @@ //! ### 4.2 Convert to character classes //! //! ``` -//! use grex::{Feature, RegExpBuilder}; +//! use grex::RegExpBuilder; //! //! let regexp = RegExpBuilder::from(&["a", "aa", "123"]) -//! .with_conversion_of(&[Feature::Digit, Feature::Word]) +//! .with_conversion_of_digits() +//! .with_conversion_of_words() //! .build(); //! assert_eq!(regexp, "^(?:\\d\\d\\d|\\w(?:\\w)?)$"); //! ``` @@ -110,10 +111,10 @@ //! ### 4.3 Convert repeated substrings //! //! ``` -//! use grex::{Feature, RegExpBuilder}; +//! use grex::RegExpBuilder; //! //! let regexp = RegExpBuilder::from(&["aa", "bcbc", "defdefdef"]) -//! .with_conversion_of(&[Feature::Repetition]) +//! .with_conversion_of_repetitions() //! .build(); //! assert_eq!(regexp, "^(?:a{2}|(?:bc){2}|(?:def){3})$"); //! ``` @@ -126,10 +127,10 @@ //! substring `a` has a length of 1, but the minimum substring length has been set to 2. //! //! ``` -//! use grex::{Feature, RegExpBuilder}; +//! use grex::RegExpBuilder; //! //! let regexp = RegExpBuilder::from(&["aa", "bcbc", "defdefdef"]) -//! .with_conversion_of(&[Feature::Repetition]) +//! .with_conversion_of_repetitions() //! .with_minimum_substring_length(2) //! .build(); //! assert_eq!(regexp, "^(?:aa|(?:bc){2}|(?:def){3})$"); @@ -139,10 +140,10 @@ //! will be converted because it is the only one that is repeated twice. //! //! ``` -//! use grex::{Feature, RegExpBuilder}; +//! use grex::RegExpBuilder; //! //! 
let regexp = RegExpBuilder::from(&["aa", "bcbc", "defdefdef"]) -//! .with_conversion_of(&[Feature::Repetition]) +//! .with_conversion_of_repetitions() //! .with_minimum_repetitions(2) //! .build(); //! assert_eq!(regexp, "^(?:bcbc|aa|(?:def){3})$"); @@ -180,10 +181,10 @@ //! Case-insensitive matching can be enabled like so: //! //! ``` -//! use grex::{Feature, RegExpBuilder}; +//! use grex::RegExpBuilder; //! //! let regexp = RegExpBuilder::from(&["big", "BIGGER"]) -//! .with_conversion_of(&[Feature::CaseInsensitivity]) +//! .with_case_insensitive_matching() //! .build(); //! assert_eq!(regexp, "(?i)^big(?:ger)?$"); //! ``` @@ -194,10 +195,11 @@ //! Extending the previous example, you can switch to capturing groups instead. //! //! ``` -//! use grex::{Feature, RegExpBuilder}; +//! use grex::RegExpBuilder; //! //! let regexp = RegExpBuilder::from(&["big", "BIGGER"]) -//! .with_conversion_of(&[Feature::CaseInsensitivity, Feature::CapturingGroup]) +//! .with_case_insensitive_matching() +//! .with_capturing_groups() //! .build(); //! assert_eq!(regexp, "(?i)^big(ger)?$"); //! ``` @@ -252,5 +254,6 @@ mod fsm; mod regexp; mod unicode_tables; +#[allow(deprecated)] pub use regexp::Feature; pub use regexp::RegExpBuilder; diff --git a/src/main.rs b/src/main.rs index b17ad6a..bb24848 100644 --- a/src/main.rs +++ b/src/main.rs @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -use grex::{Feature, RegExpBuilder}; +use grex::RegExpBuilder; use itertools::Itertools; use std::io::{BufRead, Error, ErrorKind, Read}; use std::path::PathBuf; @@ -316,46 +316,41 @@ fn handle_input(cli: &Cli, input: Result<Vec<String>, Error>) { match input { Ok(test_cases) => { let mut builder = RegExpBuilder::from(&test_cases); - let mut conversion_features = vec![]; if cli.is_digit_converted { - conversion_features.push(Feature::Digit); + builder.with_conversion_of_digits(); } if cli.is_non_digit_converted { - conversion_features.push(Feature::NonDigit); + builder.with_conversion_of_non_digits(); } if cli.is_space_converted { - conversion_features.push(Feature::Space); + builder.with_conversion_of_whitespace(); } if cli.is_non_space_converted { - conversion_features.push(Feature::NonSpace); + builder.with_conversion_of_non_whitespace(); } if cli.is_word_converted { - conversion_features.push(Feature::Word); + builder.with_conversion_of_words(); } if cli.is_non_word_converted { - conversion_features.push(Feature::NonWord); + builder.with_conversion_of_non_words(); } if cli.is_repetition_converted { - conversion_features.push(Feature::Repetition); + builder.with_conversion_of_repetitions(); } if cli.is_case_ignored { - conversion_features.push(Feature::CaseInsensitivity); + builder.with_case_insensitive_matching(); } if cli.is_group_captured { - conversion_features.push(Feature::CapturingGroup); - } - - if !conversion_features.is_empty() { - builder.with_conversion_of(&conversion_features); + builder.with_capturing_groups(); } if cli.is_non_ascii_char_escaped { diff --git a/src/regexp/builder.rs b/src/regexp/builder.rs index aab4b1f..8a6cdfd 100644 --- a/src/regexp/builder.rs +++ b/src/regexp/builder.rs @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#![allow(deprecated)] + use crate::regexp::feature::Feature; use crate::regexp::{RegExp, RegExpConfig}; use itertools::Itertools; @@ -74,11 +76,107 @@ impl RegExpBuilder { } } + /// Tells `RegExpBuilder` to convert any Unicode decimal digit to character class `\d`. + /// + /// This method takes precedence over + /// [`with_conversion_of_words`](Self::with_conversion_of_words) if both are set. + /// Decimal digits are converted to `\d`, the remaining word characters to `\w`. + /// + /// This method takes precedence over + /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set. + /// Decimal digits are converted to `\d`, the remaining non-whitespace characters to `\S`. + pub fn with_conversion_of_digits(&mut self) -> &mut Self { + self.config.is_digit_converted = true; + self + } + + /// Tells `RegExpBuilder` to convert any character which is not + /// a Unicode decimal digit to character class `\D`. + /// + /// This method takes precedence over + /// [`with_conversion_of_non_words`](Self::with_conversion_of_non_words) if both are set. + /// Non-digits which are also non-word characters are converted to `\D`. + /// + /// This method takes precedence over + /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set. + /// Non-digits which are also non-space characters are converted to `\D`. + pub fn with_conversion_of_non_digits(&mut self) -> &mut Self { + self.config.is_non_digit_converted = true; + self + } + + /// Tells `RegExpBuilder` to convert any Unicode whitespace character to character class `\s`. + /// + /// This method takes precedence over + /// [`with_conversion_of_non_digits`](Self::with_conversion_of_non_digits) if both are set. + /// Whitespace characters are converted to `\s`, the remaining non-digit characters to `\D`. + /// + /// This method takes precedence over + /// [`with_conversion_of_non_words`](Self::with_conversion_of_non_words) if both are set. 
+ /// Whitespace characters are converted to `\s`, the remaining non-word characters to `\W`. + pub fn with_conversion_of_whitespace(&mut self) -> &mut Self { + self.config.is_space_converted = true; + self + } + + /// Tells `RegExpBuilder` to convert any character which is not + /// a Unicode whitespace character to character class `\S`. + pub fn with_conversion_of_non_whitespace(&mut self) -> &mut Self { + self.config.is_non_space_converted = true; + self + } + + /// Tells `RegExpBuilder` to convert any Unicode word character to character class `\w`. + /// + /// This method takes precedence over + /// [`with_conversion_of_non_digits`](Self::with_conversion_of_non_digits) if both are set. + /// Word characters are converted to `\w`, the remaining non-digit characters to `\D`. + /// + /// This method takes precedence over + /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set. + /// Word characters are converted to `\w`, the remaining non-space characters to `\S`. + pub fn with_conversion_of_words(&mut self) -> &mut Self { + self.config.is_word_converted = true; + self + } + + /// Tells `RegExpBuilder` to convert any character which is not + /// a Unicode word character to character class `\W`. + /// + /// This method takes precedence over + /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set. + /// Non-words which are also non-space characters are converted to `\W`. + pub fn with_conversion_of_non_words(&mut self) -> &mut Self { + self.config.is_non_word_converted = true; + self + } + + /// Tells `RegExpBuilder` to detect repeated non-overlapping substrings and + /// to convert them to `{min,max}` quantifier notation. + pub fn with_conversion_of_repetitions(&mut self) -> &mut Self { + self.config.is_repetition_converted = true; + self + } + + /// Tells `RegExpBuilder` to enable case-insensitive matching of test cases + /// so that letters match both upper and lower case. 
+ pub fn with_case_insensitive_matching(&mut self) -> &mut Self { + self.config.is_case_insensitive_matching = true; + self + } + + /// Tells `RegExpBuilder` to replace non-capturing groups by capturing ones. + pub fn with_capturing_groups(&mut self) -> &mut Self { + self.config.is_capturing_group_enabled = true; + self + } + /// Tells `RegExpBuilder` which conversions should be performed during /// regular expression generation. The available conversion features /// are listed in the [`Feature`](./enum.Feature.html#variants) enum. /// /// ⚠ Panics if `features` is empty. + #[deprecated(since = "1.3.0", note = "This method will be removed in 1.4.0.")] pub fn with_conversion_of(&mut self, features: &[Feature]) -> &mut Self { if features.is_empty() { panic!("No conversion features have been provided for regular expression generation"); @@ -88,9 +186,7 @@ impl RegExpBuilder { } /// Specifies the minimum quantity of substring repetitions to be converted if - /// [`Feature::Repetition`](./enum.Feature.html#variant.Repetition) - /// is set as one of the features in method - /// [`with_conversion_of`](./struct.RegExpBuilder.html#method.with_conversion_of). + /// [`with_conversion_of_repetitions`](Self::with_conversion_of_repetitions) is set. /// /// If the quantity is not explicitly set with this method, a default value of 1 will be used. /// @@ -104,9 +200,7 @@ impl RegExpBuilder { } /// Specifies the minimum length a repeated substring must have in order to be converted if - /// [`Feature::Repetition`](./enum.Feature.html#variant.Repetition) - /// is set as one of the features in method - /// [`with_conversion_of`](./struct.RegExpBuilder.html#method.with_conversion_of). + /// [`with_conversion_of_repetitions`](Self::with_conversion_of_repetitions) is set. /// /// If the length is not explicitly set with this method, a default value of 1 will be used. 
/// diff --git a/src/regexp/config.rs b/src/regexp/config.rs index e31235c..2042fe6 100644 --- a/src/regexp/config.rs +++ b/src/regexp/config.rs @@ -14,6 +14,8 @@ * limitations under the License. */ +#![allow(deprecated)] + use crate::regexp::Feature; #[derive(Clone, Debug, Hash, Ord, PartialOrd, Eq, PartialEq)] @@ -21,6 +23,15 @@ pub struct RegExpConfig { pub(crate) conversion_features: Vec<Feature>, pub(crate) minimum_repetitions: u32, pub(crate) minimum_substring_length: u32, + pub(crate) is_digit_converted: bool, + pub(crate) is_non_digit_converted: bool, + pub(crate) is_space_converted: bool, + pub(crate) is_non_space_converted: bool, + pub(crate) is_word_converted: bool, + pub(crate) is_non_word_converted: bool, + pub(crate) is_repetition_converted: bool, + pub(crate) is_case_insensitive_matching: bool, + pub(crate) is_capturing_group_enabled: bool, pub(crate) is_non_ascii_char_escaped: bool, pub(crate) is_astral_code_point_converted_to_surrogate: bool, pub(crate) is_verbose_mode_enabled: bool, @@ -35,6 +46,15 @@ impl RegExpConfig { conversion_features: vec![], minimum_repetitions: 1, minimum_substring_length: 1, + is_digit_converted: false, + is_non_digit_converted: false, + is_space_converted: false, + is_non_space_converted: false, + is_word_converted: false, + is_non_word_converted: false, + is_repetition_converted: false, + is_case_insensitive_matching: false, + is_capturing_group_enabled: false, is_non_ascii_char_escaped: false, is_astral_code_point_converted_to_surrogate: false, is_verbose_mode_enabled: false, @@ -45,43 +65,54 @@ impl RegExpConfig { } pub(crate) fn is_digit_converted(&self) -> bool { - self.conversion_features.contains(&Feature::Digit) + self.is_digit_converted || self.conversion_features.contains(&Feature::Digit) } pub(crate) fn is_non_digit_converted(&self) -> bool { - self.conversion_features.contains(&Feature::NonDigit) + self.is_non_digit_converted || self.conversion_features.contains(&Feature::NonDigit) } pub(crate) fn 
is_space_converted(&self) -> bool { - self.conversion_features.contains(&Feature::Space) + self.is_space_converted || self.conversion_features.contains(&Feature::Space) } pub(crate) fn is_non_space_converted(&self) -> bool { - self.conversion_features.contains(&Feature::NonSpace) + self.is_non_space_converted || self.conversion_features.contains(&Feature::NonSpace) } pub(crate) fn is_word_converted(&self) -> bool { - self.conversion_features.contains(&Feature::Word) + self.is_word_converted || self.conversion_features.contains(&Feature::Word) } pub(crate) fn is_non_word_converted(&self) -> bool { - self.conversion_features.contains(&Feature::NonWord) + self.is_non_word_converted || self.conversion_features.contains(&Feature::NonWord) } pub(crate) fn is_repetition_converted(&self) -> bool { - self.conversion_features.contains(&Feature::Repetition) + self.is_repetition_converted || self.conversion_features.contains(&Feature::Repetition) } pub(crate) fn is_case_insensitive_matching(&self) -> bool { - self.conversion_features - .contains(&Feature::CaseInsensitivity) + self.is_case_insensitive_matching + || self + .conversion_features + .contains(&Feature::CaseInsensitivity) } pub(crate) fn is_capturing_group_enabled(&self) -> bool { - self.conversion_features.contains(&Feature::CapturingGroup) + self.is_capturing_group_enabled + || self.conversion_features.contains(&Feature::CapturingGroup) } pub(crate) fn is_char_class_feature_enabled(&self) -> bool { - self.conversion_features.iter().any(|it| it.is_char_class()) + self.is_digit_converted + || self.is_non_digit_converted + || self.is_space_converted + || self.is_non_space_converted + || self.is_word_converted + || self.is_non_word_converted + || self.is_case_insensitive_matching + || self.is_capturing_group_enabled + || self.conversion_features.iter().any(|it| it.is_char_class()) } } diff --git a/src/regexp/feature.rs b/src/regexp/feature.rs index 1ffc739..e6c9bad 100644 --- a/src/regexp/feature.rs +++ 
b/src/regexp/feature.rs @@ -14,18 +14,21 @@ * limitations under the License. */ +#![allow(deprecated)] + /// This enum specifies the supported conversion features which can be passed to method /// [`RegExpBuilder.with_conversion_of`](./struct.RegExpBuilder.html#method.with_conversion_of). #[derive(Clone, Debug, Hash, Ord, PartialOrd, Eq, PartialEq)] +#[deprecated(since = "1.3.0", note = "This enum will be removed in 1.4.0.")] pub enum Feature { /// This feature converts any Unicode decimal digit to character class `\d`. /// /// It takes precedence over the - /// [`Word`](./enum.Feature.html#variant.Word) feature if both are set. + /// [`Word`](Feature::Word) feature if both are set. /// Decimal digits are converted to `\d`, the remaining word characters to `\w`. /// /// It takes precedence over the - /// [`NonSpace`](./enum.Feature.html#variant.NonSpace) feature if both are set. + /// [`NonSpace`](Feature::NonSpace) feature if both are set. /// Decimal digits are converted to `\d`, the remaining non-whitespace characters to `\S`. Digit, @@ -33,22 +36,22 @@ pub enum Feature { /// a Unicode decimal digit to character class `\D`. /// /// It takes precedence over the - /// [`NonWord`](./enum.Feature.html#variant.NonWord) feature if both are set. + /// [`NonWord`](Feature::NonWord) feature if both are set. /// Non-digits which are also non-word characters are converted to `\D`. /// /// It takes precedence over the - /// [`NonSpace`](./enum.Feature.html#variant.NonSpace) feature if both are set. + /// [`NonSpace`](Feature::NonSpace) feature if both are set. /// Non-digits which are also non-space characters are converted to `\D`. NonDigit, /// This feature converts any Unicode whitespace character to character class `\s`. /// /// It takes precedence over the - /// [`NonDigit`](./enum.Feature.html#variant.NonDigit) feature if both are set. + /// [`NonDigit`](Feature::NonDigit) feature if both are set. 
/// Whitespace characters are converted to `\s`, the remaining non-digit characters to `\D`. /// /// It takes precedence over the - /// [`NonWord`](./enum.Feature.html#variant.NonWord) feature if both are set. + /// [`NonWord`](Feature::NonWord) feature if both are set. /// Whitespace characters are converted to `\s`, the remaining non-word characters to `\W`. Space, @@ -59,11 +62,11 @@ pub enum Feature { /// This feature converts any Unicode word character to character class `\w`. /// /// It takes precedence over the - /// [`NonDigit`](./enum.Feature.html#variant.NonDigit) feature if both are set. + /// [`NonDigit`](Feature::NonDigit) feature if both are set. /// Word characters are converted to `\w`, the remaining non-digit characters to `\D`. /// /// It takes precedence over the - /// [`NonSpace`](./enum.Feature.html#variant.NonSpace) feature if both are set. + /// [`NonSpace`](Feature::NonSpace) feature if both are set. /// Word characters are converted to `\w`, the remaining non-space characters to `\S`. Word, @@ -71,7 +74,7 @@ pub enum Feature { /// a Unicode word character to character class `\W`. /// /// It takes precedence over the - /// [`NonSpace`](./enum.Feature.html#variant.NonSpace) feature if both are set. + /// [`NonSpace`](Feature::NonSpace) feature if both are set. /// Non-words which are also non-space characters are converted to `\W`. NonWord, diff --git a/src/regexp/mod.rs b/src/regexp/mod.rs index a4132bf..9b46185 100644 --- a/src/regexp/mod.rs +++ b/src/regexp/mod.rs @@ -14,6 +14,8 @@ * limitations under the License. */ +#![allow(deprecated)] + mod builder; mod component; mod config; diff --git a/tests/lib_integration_tests.rs b/tests/lib_integration_tests.rs index 11e8369..cf73606 100644 --- a/tests/lib_integration_tests.rs +++ b/tests/lib_integration_tests.rs @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -use grex::{Feature, RegExpBuilder}; +use grex::RegExpBuilder; use indoc::indoc; use regex::Regex; use rstest::rstest; @@ -105,7 +105,7 @@ mod no_conversion { )] fn succeeds_with_ignore_case_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::CaseInsensitivity]) + .with_case_insensitive_matching() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -150,7 +150,7 @@ mod no_conversion { )] fn succeeds_with_capturing_groups_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::CapturingGroup]) + .with_capturing_groups() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -287,7 +287,7 @@ mod no_conversion { expected_output: &str, ) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::CaseInsensitivity]) + .with_case_insensitive_matching() .with_verbose_mode() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -387,7 +387,7 @@ mod no_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition]) + .with_conversion_of_repetitions() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -399,7 +399,8 @@ mod no_conversion { )] fn succeeds_with_ignore_case_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::CaseInsensitivity]) + .with_conversion_of_repetitions() + .with_case_insensitive_matching() .build(); assert_that_regexp_is_correct(regexp, expected_output, 
&test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -415,7 +416,7 @@ mod no_conversion { )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition]) + .with_conversion_of_repetitions() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -432,7 +433,7 @@ mod no_conversion { )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition]) + .with_conversion_of_repetitions() .with_escaping_of_non_ascii_chars(true) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -567,7 +568,7 @@ mod no_conversion { )] fn succeeds_with_verbose_mode_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition]) + .with_conversion_of_repetitions() .with_verbose_mode() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -604,7 +605,7 @@ mod no_conversion { expected_output: &str, ) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition]) + .with_conversion_of_repetitions() .with_minimum_repetitions(3) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -624,7 +625,7 @@ mod no_conversion { expected_output: &str, ) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition]) + .with_conversion_of_repetitions() .with_minimum_substring_length(3) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -644,7 +645,7 @@ mod no_conversion { expected_output: &str, ) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition]) + .with_conversion_of_repetitions() 
.with_minimum_repetitions(3) .with_minimum_substring_length(3) .build(); @@ -682,7 +683,7 @@ mod digit_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Digit]) + .with_conversion_of_digits() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -696,7 +697,7 @@ mod digit_conversion { )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Digit]) + .with_conversion_of_digits() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -711,7 +712,7 @@ mod digit_conversion { )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Digit]) + .with_conversion_of_digits() .with_escaping_of_non_ascii_chars(true) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -729,7 +730,8 @@ mod digit_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Digit]) + .with_conversion_of_repetitions() + .with_conversion_of_digits() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -743,7 +745,8 @@ mod digit_conversion { )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Digit]) + .with_conversion_of_repetitions() + .with_conversion_of_digits() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, 
expected_output, &test_cases); @@ -758,7 +761,8 @@ mod digit_conversion { )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Digit]) + .with_conversion_of_repetitions() + .with_conversion_of_digits() .with_escaping_of_non_ascii_chars(true) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -778,7 +782,8 @@ mod digit_conversion { expected_output: &str, ) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Digit]) + .with_conversion_of_repetitions() + .with_conversion_of_digits() .with_minimum_repetitions(2) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -809,7 +814,7 @@ mod space_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Space]) + .with_conversion_of_whitespace() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -823,7 +828,7 @@ mod space_conversion { )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Space]) + .with_conversion_of_whitespace() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -838,7 +843,7 @@ mod space_conversion { )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Space]) + .with_conversion_of_whitespace() .with_escaping_of_non_ascii_chars(true) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -856,7 +861,8 @@ mod space_conversion { )] fn 
succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Space]) + .with_conversion_of_repetitions() + .with_conversion_of_whitespace() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -870,7 +876,8 @@ mod space_conversion { )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Space]) + .with_conversion_of_repetitions() + .with_conversion_of_whitespace() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -885,7 +892,8 @@ mod space_conversion { )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Space]) + .with_conversion_of_repetitions() + .with_conversion_of_whitespace() .with_escaping_of_non_ascii_chars(true) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -908,7 +916,8 @@ mod space_conversion { expected_output: &str, ) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Space]) + .with_conversion_of_repetitions() + .with_conversion_of_whitespace() .with_minimum_repetitions(2) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -943,7 +952,7 @@ mod word_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Word]) + .with_conversion_of_words() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -957,7 +966,7 @@ mod 
word_conversion { )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Word]) + .with_conversion_of_words() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -972,7 +981,7 @@ mod word_conversion { )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Word]) + .with_conversion_of_words() .with_escaping_of_non_ascii_chars(true) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -990,7 +999,8 @@ mod word_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Word]) + .with_conversion_of_repetitions() + .with_conversion_of_words() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1004,7 +1014,8 @@ mod word_conversion { )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Word]) + .with_conversion_of_repetitions() + .with_conversion_of_words() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1019,7 +1030,8 @@ mod word_conversion { )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Word]) + .with_conversion_of_repetitions() + .with_conversion_of_words() .with_escaping_of_non_ascii_chars(true) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1042,7 
+1054,8 @@ mod word_conversion { expected_output: &str, ) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Word]) + .with_conversion_of_repetitions() + .with_conversion_of_words() .with_minimum_repetitions(2) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1065,7 +1078,8 @@ mod digit_space_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Digit, Feature::Space]) + .with_conversion_of_digits() + .with_conversion_of_whitespace() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1079,7 +1093,8 @@ mod digit_space_conversion { )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Digit, Feature::Space]) + .with_conversion_of_digits() + .with_conversion_of_whitespace() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1094,7 +1109,8 @@ mod digit_space_conversion { )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Digit, Feature::Space]) + .with_conversion_of_digits() + .with_conversion_of_whitespace() .with_escaping_of_non_ascii_chars(true) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1112,7 +1128,9 @@ mod digit_space_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Digit, Feature::Space]) + .with_conversion_of_repetitions() + .with_conversion_of_digits() + .with_conversion_of_whitespace() .build(); 
assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1126,7 +1144,9 @@ mod digit_space_conversion { )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Digit, Feature::Space]) + .with_conversion_of_repetitions() + .with_conversion_of_digits() + .with_conversion_of_whitespace() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1141,7 +1161,9 @@ mod digit_space_conversion { )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Digit, Feature::Space]) + .with_conversion_of_repetitions() + .with_conversion_of_digits() + .with_conversion_of_whitespace() .with_escaping_of_non_ascii_chars(true) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1164,7 +1186,9 @@ mod digit_space_conversion { expected_output: &str, ) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Digit, Feature::Space]) + .with_conversion_of_repetitions() + .with_conversion_of_digits() + .with_conversion_of_whitespace() .with_minimum_repetitions(2) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1180,7 +1204,7 @@ mod digit_space_conversion { expected_output: &str, ) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition]) + .with_conversion_of_repetitions() .with_minimum_substring_length(3) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1198,7 +1222,7 @@ mod digit_space_conversion { expected_output: &str, ) { let regexp = RegExpBuilder::from(&test_cases) - 
.with_conversion_of(&[Feature::Repetition]) + .with_conversion_of_repetitions() .with_minimum_repetitions(2) .with_minimum_substring_length(3) .build(); @@ -1222,7 +1246,8 @@ mod digit_word_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Digit, Feature::Word]) + .with_conversion_of_digits() + .with_conversion_of_words() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1236,7 +1261,8 @@ mod digit_word_conversion { )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Digit, Feature::Word]) + .with_conversion_of_digits() + .with_conversion_of_words() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1251,7 +1277,8 @@ mod digit_word_conversion { )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Digit, Feature::Word]) + .with_conversion_of_digits() + .with_conversion_of_words() .with_escaping_of_non_ascii_chars(true) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1269,7 +1296,9 @@ mod digit_word_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Digit, Feature::Word]) + .with_conversion_of_repetitions() + .with_conversion_of_digits() + .with_conversion_of_words() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1283,7 +1312,9 @@ mod digit_word_conversion { )] fn succeeds_with_escape_option(test_cases: 
Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Digit, Feature::Word]) + .with_conversion_of_repetitions() + .with_conversion_of_digits() + .with_conversion_of_words() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1298,7 +1329,9 @@ mod digit_word_conversion { )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Digit, Feature::Word]) + .with_conversion_of_repetitions() + .with_conversion_of_digits() + .with_conversion_of_words() .with_escaping_of_non_ascii_chars(true) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1320,7 +1353,8 @@ mod space_word_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Space, Feature::Word]) + .with_conversion_of_whitespace() + .with_conversion_of_words() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1334,7 +1368,8 @@ mod space_word_conversion { )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Space, Feature::Word]) + .with_conversion_of_whitespace() + .with_conversion_of_words() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1349,7 +1384,8 @@ mod space_word_conversion { )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Space, Feature::Word]) + .with_conversion_of_whitespace() + 
.with_conversion_of_words() .with_escaping_of_non_ascii_chars(true) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1367,7 +1403,9 @@ mod space_word_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Space, Feature::Word]) + .with_conversion_of_repetitions() + .with_conversion_of_whitespace() + .with_conversion_of_words() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1381,7 +1419,9 @@ mod space_word_conversion { )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Space, Feature::Word]) + .with_conversion_of_repetitions() + .with_conversion_of_whitespace() + .with_conversion_of_words() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1396,7 +1436,9 @@ mod space_word_conversion { )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Space, Feature::Word]) + .with_conversion_of_repetitions() + .with_conversion_of_whitespace() + .with_conversion_of_words() .with_escaping_of_non_ascii_chars(true) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1418,7 +1460,9 @@ mod digit_space_word_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Digit, Feature::Space, Feature::Word]) + .with_conversion_of_digits() + .with_conversion_of_whitespace() + .with_conversion_of_words() .build(); assert_that_regexp_is_correct(regexp, expected_output, 
&test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1432,7 +1476,9 @@ mod digit_space_word_conversion { )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Digit, Feature::Space, Feature::Word]) + .with_conversion_of_digits() + .with_conversion_of_whitespace() + .with_conversion_of_words() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1447,7 +1493,9 @@ mod digit_space_word_conversion { )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Digit, Feature::Space, Feature::Word]) + .with_conversion_of_digits() + .with_conversion_of_whitespace() + .with_conversion_of_words() .with_escaping_of_non_ascii_chars(true) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1465,12 +1513,10 @@ mod digit_space_word_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[ - Feature::Repetition, - Feature::Digit, - Feature::Space, - Feature::Word, - ]) + .with_conversion_of_repetitions() + .with_conversion_of_digits() + .with_conversion_of_whitespace() + .with_conversion_of_words() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1484,12 +1530,10 @@ mod digit_space_word_conversion { )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[ - Feature::Repetition, - Feature::Digit, - Feature::Space, - Feature::Word, - ]) + .with_conversion_of_repetitions() + .with_conversion_of_digits() + .with_conversion_of_whitespace() + 
.with_conversion_of_words() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1504,12 +1548,10 @@ mod digit_space_word_conversion { )] fn succeeds_with_escape_and_surrogate_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[ - Feature::Repetition, - Feature::Digit, - Feature::Space, - Feature::Word, - ]) + .with_conversion_of_repetitions() + .with_conversion_of_digits() + .with_conversion_of_whitespace() + .with_conversion_of_words() .with_escaping_of_non_ascii_chars(true) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1531,7 +1573,7 @@ mod non_digit_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::NonDigit]) + .with_conversion_of_non_digits() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1545,7 +1587,7 @@ mod non_digit_conversion { )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::NonDigit]) + .with_conversion_of_non_digits() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1561,7 +1603,8 @@ mod non_digit_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::NonDigit]) + .with_conversion_of_repetitions() + .with_conversion_of_non_digits() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1572,7 +1615,8 @@ mod non_digit_conversion { )] fn 
succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::NonDigit]) + .with_conversion_of_repetitions() + .with_conversion_of_non_digits() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1595,7 +1639,7 @@ mod non_space_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::NonSpace]) + .with_conversion_of_non_whitespace() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1613,7 +1657,8 @@ mod non_space_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::NonSpace]) + .with_conversion_of_repetitions() + .with_conversion_of_non_whitespace() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1635,7 +1680,7 @@ mod non_word_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::NonWord]) + .with_conversion_of_non_words() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1649,7 +1694,7 @@ mod non_word_conversion { )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::NonWord]) + .with_conversion_of_non_words() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1668,7 +1713,8 @@ mod 
non_word_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::NonWord]) + .with_conversion_of_repetitions() + .with_conversion_of_non_words() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1682,7 +1728,8 @@ mod non_word_conversion { )] fn succeeds_with_escape_option(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::NonWord]) + .with_conversion_of_repetitions() + .with_conversion_of_non_words() .with_escaping_of_non_ascii_chars(false) .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); @@ -1705,7 +1752,8 @@ mod non_digit_non_space_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::NonDigit, Feature::NonSpace]) + .with_conversion_of_non_digits() + .with_conversion_of_non_whitespace() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1720,7 +1768,9 @@ mod non_digit_non_space_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::NonDigit, Feature::NonSpace]) + .with_conversion_of_repetitions() + .with_conversion_of_non_digits() + .with_conversion_of_non_whitespace() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1742,7 +1792,8 @@ mod non_digit_non_word_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - 
.with_conversion_of(&[Feature::NonDigit, Feature::NonWord]) + .with_conversion_of_non_digits() + .with_conversion_of_non_words() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1757,7 +1808,9 @@ mod non_digit_non_word_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::NonDigit, Feature::NonWord]) + .with_conversion_of_repetitions() + .with_conversion_of_non_digits() + .with_conversion_of_non_words() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1779,7 +1832,8 @@ mod non_space_non_word_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::NonSpace, Feature::NonWord]) + .with_conversion_of_non_whitespace() + .with_conversion_of_non_words() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1797,7 +1851,9 @@ mod non_space_non_word_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::NonSpace, Feature::NonWord]) + .with_conversion_of_repetitions() + .with_conversion_of_non_whitespace() + .with_conversion_of_non_words() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1819,7 +1875,9 @@ mod non_digit_non_space_non_word_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::NonDigit, Feature::NonSpace, Feature::NonWord]) + 
.with_conversion_of_non_digits() + .with_conversion_of_non_whitespace() + .with_conversion_of_non_words() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1834,12 +1892,10 @@ mod non_digit_non_space_non_word_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[ - Feature::Repetition, - Feature::NonDigit, - Feature::NonSpace, - Feature::NonWord, - ]) + .with_conversion_of_repetitions() + .with_conversion_of_non_digits() + .with_conversion_of_non_whitespace() + .with_conversion_of_non_words() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1861,7 +1917,8 @@ mod digit_non_digit_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Digit, Feature::NonDigit]) + .with_conversion_of_digits() + .with_conversion_of_non_digits() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1876,7 +1933,9 @@ mod digit_non_digit_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Digit, Feature::NonDigit]) + .with_conversion_of_repetitions() + .with_conversion_of_digits() + .with_conversion_of_non_digits() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1898,7 +1957,8 @@ mod space_non_space_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Space, Feature::NonSpace]) + 
.with_conversion_of_whitespace() + .with_conversion_of_non_whitespace() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1916,7 +1976,9 @@ mod space_non_space_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Space, Feature::NonSpace]) + .with_conversion_of_repetitions() + .with_conversion_of_whitespace() + .with_conversion_of_non_whitespace() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1938,7 +2000,8 @@ mod word_non_word_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Word, Feature::NonWord]) + .with_conversion_of_words() + .with_conversion_of_non_words() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); @@ -1956,7 +2019,9 @@ mod word_non_word_conversion { )] fn succeeds(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases) - .with_conversion_of(&[Feature::Repetition, Feature::Word, Feature::NonWord]) + .with_conversion_of_repetitions() + .with_conversion_of_words() + .with_conversion_of_non_words() .build(); assert_that_regexp_is_correct(regexp, expected_output, &test_cases); assert_that_regexp_matches_test_cases(expected_output, test_cases); diff --git a/tests/property_tests.rs b/tests/property_tests.rs index 13e5b2b..7cce08a 100644 --- a/tests/property_tests.rs +++ b/tests/property_tests.rs @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#![allow(deprecated)] + use grex::{Feature, RegExpBuilder}; use proptest::prelude::*; use regex::{Error, Regex, RegexBuilder}; From 2e025e040704a630e37dd4bf939a0da6cd83104c Mon Sep 17 00:00:00 2001 From: "Peter M. Stahl" Date: Tue, 14 Sep 2021 23:26:11 +0200 Subject: [PATCH 15/18] Update documentation --- README.md | 84 ++++++++++++++++++++++++++++-------------------- RELEASE_NOTES.md | 12 +++++++ src/lib.rs | 16 +++++++++ src/main.rs | 12 +++---- 4 files changed, 84 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index 62285f5..abb111d 100644 --- a/README.md +++ b/README.md @@ -3,19 +3,19 @@
[![build](https://github.com/pemistahl/grex/actions/workflows/build.yml/badge.svg)](https://github.com/pemistahl/grex/actions/workflows/build.yml) -[![dependency status](https://deps.rs/crate/grex/1.2.0/status.svg)](https://deps.rs/crate/grex/1.2.0) +[![dependency status](https://deps.rs/crate/grex/1.3.0/status.svg)](https://deps.rs/crate/grex/1.3.0) [![codecov](https://codecov.io/gh/pemistahl/grex/branch/main/graph/badge.svg)](https://codecov.io/gh/pemistahl/grex) [![lines of code](https://tokei.rs/b1/github/pemistahl/grex?category=code)](https://github.com/XAMPPRocky/tokei) [![Downloads](https://img.shields.io/crates/d/grex.svg)](https://crates.io/crates/grex) [![Docs.rs](https://docs.rs/grex/badge.svg)](https://docs.rs/grex) [![Crates.io](https://img.shields.io/crates/v/grex.svg)](https://crates.io/crates/grex) -[![Lib.rs](https://img.shields.io/badge/lib.rs-v1.2.0-blue)](https://lib.rs/crates/grex) +[![Lib.rs](https://img.shields.io/badge/lib.rs-v1.3.0-blue)](https://lib.rs/crates/grex) [![license](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0) -[![Linux Download](https://img.shields.io/badge/Linux%20Download-v1.2.0-blue?logo=Linux)](https://github.com/pemistahl/grex/releases/download/v1.2.0/grex-v1.2.0-x86_64-unknown-linux-musl.tar.gz) -[![MacOS Download](https://img.shields.io/badge/macOS%20Download-v1.2.0-blue?logo=Apple)](https://github.com/pemistahl/grex/releases/download/v1.2.0/grex-v1.2.0-x86_64-apple-darwin.tar.gz) -[![Windows Download](https://img.shields.io/badge/Windows%20Download-v1.2.0-blue?logo=Windows)](https://github.com/pemistahl/grex/releases/download/v1.2.0/grex-v1.2.0-x86_64-pc-windows-msvc.zip) +[![Linux Download](https://img.shields.io/badge/Linux%20Download-v1.3.0-blue?logo=Linux)](https://github.com/pemistahl/grex/releases/download/v1.3.0/grex-v1.3.0-x86_64-unknown-linux-musl.tar.gz) +[![MacOS 
Download](https://img.shields.io/badge/macOS%20Download-v1.3.0-blue?logo=Apple)](https://github.com/pemistahl/grex/releases/download/v1.3.0/grex-v1.3.0-x86_64-apple-darwin.tar.gz) +[![Windows Download](https://img.shields.io/badge/Windows%20Download-v1.3.0-blue?logo=Windows)](https://github.com/pemistahl/grex/releases/download/v1.3.0/grex-v1.3.0-x86_64-pc-windows-msvc.zip)
@@ -114,7 +114,7 @@ toolchain installed, you can install by compiling from source using So the summary of your installation options is: ``` -( choco | scoop | brew | cargo | huber ) install grex +( brew | cargo | choco | huber | scoop ) install grex ``` ### 4.2 The library [Top ▲](#table-of-contents) @@ -123,7 +123,7 @@ In order to use *grex* as a library, simply add it as a dependency to your `Carg ```toml [dependencies] -grex = "1.2.0" +grex = "1.3.0" ``` ## 5. How to use? [Top ▲](#table-of-contents) @@ -133,10 +133,15 @@ All settings can be freely combined with each other. ### 5.1 The command-line tool [Top ▲](#table-of-contents) +Test cases are passed either directly (`grex a b c`) or from a file (`grex -f test_cases.txt`). +*grex* is able to receive its input from Unix pipelines as well, e.g. `cat test_cases.txt | grex -`. + +The following table shows all available flags and options: + ``` $ grex -h -grex 1.2.0 +grex 1.3.0 © 2019-today Peter M. Stahl Licensed under the Apache License, Version 2.0 Downloadable from https://crates.io/crates/grex @@ -161,6 +166,9 @@ FLAGS: -i, --ignore-case Performs case-insensitive matching, letters match both upper and lower case -g, --capture-groups Replaces non-capturing groups by capturing ones -x, --verbose Produces a nicer looking regular expression in verbose mode + --no-start-anchor Removes the caret anchor '^' from the resulting regular expression + --no-end-anchor Removes the dollar sign anchor '$' from the resulting regular expression + --no-anchors Removes the caret and dollar sign anchors from the resulting regular expression -c, --colorize Provides syntax highlighting for the resulting regular expression -h, --help Prints help information -v, --version Prints version information @@ -180,8 +188,8 @@ ARGS: #### 5.2.1 Default settings -Test cases are passed either from a collection via [`RegExpBuilder::from()`](https://docs.rs/grex/1.2.0/grex/struct.RegExpBuilder.html#method.from) -or from a file via 
[`RegExpBuilder::from_file()`](https://docs.rs/grex/1.2.0/grex/struct.RegExpBuilder.html#method.from_file). +Test cases are passed either from a collection via [`RegExpBuilder::from()`](https://docs.rs/grex/1.3.0/grex/struct.RegExpBuilder.html#method.from) +or from a file via [`RegExpBuilder::from_file()`](https://docs.rs/grex/1.3.0/grex/struct.RegExpBuilder.html#method.from_file). If read from a file, each test case must be on a separate line. Lines may be ended with either a newline `\n` or a carriage return with a line feed `\r\n`. @@ -195,10 +203,11 @@ assert_eq!(regexp, "^a(?:aa?)?$"); #### 5.2.2 Convert to character classes ```rust -use grex::{Feature, RegExpBuilder}; +use grex::RegExpBuilder; let regexp = RegExpBuilder::from(&["a", "aa", "123"]) - .with_conversion_of(&[Feature::Digit, Feature::Word]) + .with_conversion_of_digits() + .with_conversion_of_words() .build(); assert_eq!(regexp, "^(\\d\\d\\d|\\w(?:\\w)?)$"); ``` @@ -206,10 +215,10 @@ assert_eq!(regexp, "^(\\d\\d\\d|\\w(?:\\w)?)$"); #### 5.2.3 Convert repeated substrings ```rust -use grex::{Feature, RegExpBuilder}; +use grex::RegExpBuilder; let regexp = RegExpBuilder::from(&["aa", "bcbc", "defdefdef"]) - .with_conversion_of(&[Feature::Repetition]) + .with_conversion_of_repetitions() .build(); assert_eq!(regexp, "^(?:a{2}|(?:bc){2}|(?:def){3})$"); ``` @@ -221,10 +230,10 @@ In the following example, the test case `aa` is not converted to `a{2}` because `a` has a length of 1, but the minimum substring length has been set to 2. ```rust -use grex::{Feature, RegExpBuilder}; +use grex::RegExpBuilder; let regexp = RegExpBuilder::from(&["aa", "bcbc", "defdefdef"]) - .with_conversion_of(&[Feature::Repetition]) + .with_conversion_of_repetitions() .with_minimum_substring_length(2) .build(); assert_eq!(regexp, "^(?:aa|(?:bc){2}|(?:def){3})$"); @@ -234,10 +243,10 @@ Setting a minimum number of 2 repetitions in the next example, only the test cas converted because it is the only one that is repeated twice. 
```rust -use grex::{Feature, RegExpBuilder}; +use grex::RegExpBuilder; let regexp = RegExpBuilder::from(&["aa", "bcbc", "defdefdef"]) - .with_conversion_of(&[Feature::Repetition]) + .with_conversion_of_repetitions() .with_minimum_repetitions(2) .build(); assert_eq!(regexp, "^(?:bcbc|aa|(?:def){3})$"); @@ -274,10 +283,10 @@ The regular expressions that *grex* generates are case-sensitive by default. Case-insensitive matching can be enabled like so: ```rust -use grex::{Feature, RegExpBuilder}; +use grex::RegExpBuilder; let regexp = RegExpBuilder::from(&["big", "BIGGER"]) - .with_conversion_of(&[Feature::CaseInsensitivity]) + .with_case_insensitive_matching() .build(); assert_eq!(regexp, "(?i)^big(?:ger)?$"); ``` @@ -288,10 +297,11 @@ Non-capturing groups are used by default. Extending the previous example, you can switch to capturing groups instead. ```rust -use grex::{Feature, RegExpBuilder}; +use grex::RegExpBuilder; let regexp = RegExpBuilder::from(&["big", "BIGGER"]) - .with_conversion_of(&[Feature::CaseInsensitivity, Feature::CapturingGroup]) + .with_case_insensitive_matching() + .with_capturing_groups() .build(); assert_eq!(regexp, "(?i)^big(ger)?$"); ``` @@ -325,7 +335,23 @@ assert_eq!(regexp, indoc!( )); ``` -#### 5.2.8 Syntax highlighting +#### 5.2.8 Disable anchors + +By default, the anchors `^` and `$` are put around every generated regular expression in order +to ensure that it matches only the test cases given as input. Often enough, however, it is +desired to use the generated pattern as part of a larger one. For this purpose, the anchors +can be disabled, either separately or both of them. + +```rust +use grex::RegExpBuilder; + +let regexp = RegExpBuilder::from(&["a", "aa", "aaa"]) + .without_anchors() + .build(); +assert_eq!(regexp, "a(?:aa?)?"); +``` + +#### 5.2.9 Syntax highlighting ⚠ The method `with_syntax_highlighting()` may only be used if the resulting regular expression is meant to be printed to the console. 
It is mainly meant to be used for the command-line tool output. @@ -463,22 +489,12 @@ cargo build ``` The source code is accompanied by an extensive test suite consisting of unit tests, integration -tests and property tests. For running the unit and integration tests, simply say: +tests and property tests. For running them, simply say: ``` cargo test ``` -Property tests are disabled by default with the `#[ignore]` annotation because they are -very long-running. They are used for automatically generating test cases for regular -expression conversion. If a test case is found that produces a wrong conversion, it is -shrinked to the shortest test case possible that still produces a wrong result. -This is a very useful tool for finding bugs. If you want to run these tests, say: - -``` -cargo test -- --ignored -``` - ## 7. How does it work? [Top ▲](#table-of-contents) 1. A [deterministic finite automaton](https://en.wikipedia.org/wiki/Deterministic_finite_automaton) (DFA) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index d3651e2..5f8a916 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,3 +1,15 @@ +## grex 1.3.0 (released on 14 Sep 2021) + +### Features +- anchors can now be disabled so that the generated expression can be used as part of a larger one (#30) +- the command-line tool can now be used within Unix pipelines (#45) + +### Changes +- Additional methods have been added to `RegExpBuilder` in order to replace the enum `Feature` and make the library API more consistent. (#47) + +### Bug Fixes +- Under rare circumstances, the conversion of repetitions did not work. This has been fixed. + ## grex 1.2.0 (released on 28 Mar 2021) ### Features diff --git a/src/lib.rs b/src/lib.rs index 7776b98..7266386 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -233,6 +233,22 @@ //! )); //! ``` //! +//! ### 4.8 Disable anchors +//! +//! By default, the anchors `^` and `$` are put around every generated regular expression in order +//! 
to ensure that it matches only the test cases given as input. Often enough, however, it is +//! desired to use the generated pattern as part of a larger one. For this purpose, the anchors +//! can be disabled, either separately or both of them. +//! +//! ``` +//! use grex::RegExpBuilder; +//! +//! let regexp = RegExpBuilder::from(&["a", "aa", "aaa"]) +//! .without_anchors() +//! .build(); +//! assert_eq!(regexp, "a(?:aa?)?"); +//! ``` +//! //! ### 5. How does it work? //! //! 1. A [deterministic finite automaton](https://en.wikipedia.org/wiki/Deterministic_finite_automaton) (DFA) diff --git a/src/main.rs b/src/main.rs index bb24848..a644dfa 100644 --- a/src/main.rs +++ b/src/main.rs @@ -181,8 +181,8 @@ struct Cli { #[structopt( name = "no-start-anchor", long, - help = "Removes caret anchor '^' from resulting regular expression", - long_help = "Removes caret anchor '^' from resulting regular expression.\n\n\ + help = "Removes the caret anchor '^' from the resulting regular expression", + long_help = "Removes the caret anchor '^' from the resulting regular expression.\n\n\ By default, the caret anchor is added to every generated regular\n\ expression which guarantees that the expression matches the test cases\n\ given as input only at the start of a string.\n\ @@ -195,8 +195,8 @@ struct Cli { #[structopt( name = "no-end-anchor", long, - help = "Removes dollar sign anchor '$' from resulting regular expression", - long_help = "Removes dollar sign anchor '$' from resulting regular expression.\n\n\ + help = "Removes the dollar sign anchor '$' from the resulting regular expression", + long_help = "Removes the dollar sign anchor '$' from the resulting regular expression.\n\n\ By default, the dollar sign anchor is added to every generated regular\n\ expression which guarantees that the expression matches the test cases\n\ given as input only at the end of a string.\n\ @@ -209,8 +209,8 @@ struct Cli { #[structopt( name = "no-anchors", long, - help = "Removes caret and dollar 
sign anchors from resulting regular expression", - long_help = "Removes caret and dollar sign anchors from resulting regular expression.\n\n\ + help = "Removes the caret and dollar sign anchors from the resulting regular expression", + long_help = "Removes the caret and dollar sign anchors from the resulting regular expression.\n\n\ By default, anchors are added to every generated regular expression\n\ which guarantee that the expression exactly matches only the test cases\n\ given as input and nothing else.\n\ From 3f2df020cb98a137bcf31a4dbb1c4ce2bed7edec Mon Sep 17 00:00:00 2001 From: "Peter M. Stahl" Date: Wed, 15 Sep 2021 12:45:29 +0200 Subject: [PATCH 16/18] Fix colorized regex generation without anchors (#30) --- src/regexp/regexp.rs | 16 +++++++++++++--- tests/lib_integration_tests.rs | 4 ++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/regexp/regexp.rs b/src/regexp/regexp.rs index bc6e7d5..e394d03 100644 --- a/src/regexp/regexp.rs +++ b/src/regexp/regexp.rs @@ -42,7 +42,7 @@ impl RegExp { if config.is_start_anchor_disabled && config.is_end_anchor_disabled - && !Self::is_each_test_case_matched(&mut ast, test_cases) + && !Self::is_each_test_case_matched(&mut ast, test_cases, config) { dfa = Dfa::from(&grapheme_clusters, false, config); ast = Expression::from(dfa, config); @@ -88,9 +88,19 @@ impl RegExp { clusters } - fn is_each_test_case_matched(expr: &mut Expression, test_cases: &[String]) -> bool { + fn is_each_test_case_matched( + expr: &mut Expression, + test_cases: &[String], + config: &RegExpConfig, + ) -> bool { + let regex = if config.is_output_colorized { + let color_replace_regex = Regex::new("\u{1b}\\[(?:\\d+;\\d+|0)m").unwrap(); + Regex::new(&*color_replace_regex.replace_all(&expr.to_string(), "")).unwrap() + } else { + Regex::new(&expr.to_string()).unwrap() + }; + for _ in 1..test_cases.len() { - let regex = Regex::new(&expr.to_string()).unwrap(); if test_cases .iter() .all(|test_case| regex.find_iter(test_case).count() == 
1) diff --git a/tests/lib_integration_tests.rs b/tests/lib_integration_tests.rs index cf73606..970343b 100644 --- a/tests/lib_integration_tests.rs +++ b/tests/lib_integration_tests.rs @@ -308,13 +308,13 @@ mod no_conversion { } #[rstest(test_cases, expected_output, - case(vec!["bab", "b", "cb", "bba"], "(?:(?:ba|c)b|b(?:ba)?)"), + case(vec!["bab", "b", "cb", "bba"], "(?:b(?:ba|ab)?|cb)"), case(vec!["a", "aba", "baaa", "aaab"], "(?:baaa|a(?:aab|ba)?)"), case(vec!["a", "abab", "bbb", "aaac"], "(?:a(?:bab|aac)?|bbb)"), case( // https://github.com/pemistahl/grex/issues/31 vec!["agbhd", "eibcd", "egbcd", "fbjbf", "agbh", "eibc", "egbc", "ebc", "fbc", "cd", "f", "c", "abcd", "ebcd", "fbcd"], - "(?:(?:e(?:[gi])?bc|(?:fb)?c)d?|a(?:gbhd?|bcd)|f(?:bjbf)?)") + "(?:a(?:gbhd?|bcd)|e(?:ibcd?|gbcd?|bcd?)|f(?:b(?:jbf|cd?))?|cd?)") )] fn succeeds_without_anchors(test_cases: Vec<&str>, expected_output: &str) { let regexp = RegExpBuilder::from(&test_cases).without_anchors().build(); From f98ff30a0d1bc5a7a92bc6b3d68b25f7b72ad29b Mon Sep 17 00:00:00 2001 From: "Peter M. 
Stahl" Date: Wed, 15 Sep 2021 12:52:04 +0200 Subject: [PATCH 17/18] Update dependencies --- Cargo.lock | 59 +++++++++++++----------------------------------- Cargo.toml | 8 +++---- RELEASE_NOTES.md | 4 ++-- 3 files changed, 22 insertions(+), 49 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d92e3d2..fa59c49 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -22,9 +22,9 @@ dependencies = [ [[package]] name = "assert_cmd" -version = "2.0.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54f002ce7d0c5e809ebb02be78fd503aeed4a511fd0fcaff6e6914cbdabbfa33" +checksum = "b800c4403e8105d959595e1f88119e78bc12bc874c4336973658b648a746ba93" dependencies = [ "bstr", "doc-comment", @@ -136,9 +136,9 @@ checksum = "398ea4fabe40b9b0d885340a2a991a44c8a645624075ad966d21f88688e2b69e" [[package]] name = "float-cmp" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1267f4ac4f343772758f7b1bdcbe767c218bbab93bb432acbf5162bbf85a6c4" +checksum = "98de4bbd547a563b716d8dfa9aad1cb19bfab00f4fa09a6a4ed21dbcf44ce9c4" dependencies = [ "num-traits", ] @@ -314,15 +314,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "pest" -version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" -dependencies = [ - "ucd-trie", -] - [[package]] name = "petgraph" version = "0.6.0" @@ -341,9 +332,9 @@ checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" [[package]] name = "predicates" -version = "2.0.0" +version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6e46ca79eb4e21e2ec14430340c71250ab69332abf85521c95d3a8bc336aa76" +checksum = "c143348f141cc87aab5b950021bac6145d0e5ae754b0591de23244cee42c9308" dependencies = [ "difflib", "float-cmp", @@ -544,9 +535,9 @@ dependencies = [ [[package]] name = "rstest" -version = "0.10.0" 
+version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "041bb0202c14f6a158bbbf086afb03d0c6e975c2dec7d4912f8061ed44f290af" +checksum = "2288c66aeafe3b2ed227c981f364f9968fa952ef0b30e84ada4486e7ee24d00a" dependencies = [ "cfg-if", "proc-macro2", @@ -557,9 +548,9 @@ dependencies = [ [[package]] name = "rustc_version" -version = "0.3.3" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dfe2087c51c460008730de8b57e6a320782fbfb312e1f4d520e6c6fae155ee" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ "semver", ] @@ -578,21 +569,9 @@ dependencies = [ [[package]] name = "semver" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" -dependencies = [ - "semver-parser", -] - -[[package]] -name = "semver-parser" -version = "0.10.2" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7" -dependencies = [ - "pest", -] +checksum = "568a8e6258aa33c13358f81fd834adb854c6f7c9468520910a9b1e8fac068012" [[package]] name = "strsim" @@ -602,9 +581,9 @@ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" [[package]] name = "structopt" -version = "0.3.22" +version = "0.3.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69b041cdcb67226aca307e6e7be44c8806423d83e018bd662360a93dabce4d71" +checksum = "bf9d950ef167e25e0bdb073cf1d68e9ad2795ac826f2f3f59647817cf23c0bfa" dependencies = [ "clap", "lazy_static", @@ -613,9 +592,9 @@ dependencies = [ [[package]] name = "structopt-derive" -version = "0.4.15" +version = "0.4.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7813934aecf5f51a54775e00068c237de98489463968231a51746bbbc03f9c10" +checksum = 
"134d838a2c9943ac3125cf6df165eda53493451b719f3255b2a26b85f772d0ba" dependencies = [ "heck", "proc-macro-error", @@ -664,12 +643,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7f741b240f1a48843f9b8e0444fb55fb2a4ff67293b50a9179dfd5ea67f8d41" -[[package]] -name = "ucd-trie" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" - [[package]] name = "unic-char-property" version = "0.9.0" diff --git a/Cargo.toml b/Cargo.toml index 2a7c60c..3e15d17 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,15 +35,15 @@ lazy_static = "1.4.0" ndarray = "0.15.3" petgraph = {version = "0.6.0", default-features = false, features = ["stable_graph"]} regex = "1.5.4" -structopt = "0.3.22" +structopt = "0.3.23" unic-char-range = "0.9.0" unic-ucd-category = "0.9.0" unicode-segmentation = "1.8.0" [dev-dependencies] -assert_cmd = "2.0.0" +assert_cmd = "2.0.1" indoc = "1.0.3" -predicates = "2.0.0" +predicates = "2.0.2" proptest = "1.0.0" -rstest = "0.10.0" +rstest = "0.11.0" tempfile = "3.2.0" diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 5f8a916..d90e820 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,4 +1,4 @@ -## grex 1.3.0 (released on 14 Sep 2021) +## grex 1.3.0 (released on 15 Sep 2021) ### Features - anchors can now be disabled so that the generated expression can be used as part of a larger one (#30) @@ -8,7 +8,7 @@ - Additional methods have been added to `RegExpBuilder` in order to replace the enum `Feature` and make the library API more consistent. (#47) ### Bug Fixes -- Under rare circumstances, the conversion of repetitions did not work. This has been fixed. +- Under rare circumstances, the conversion of repetitions did not work. This has been fixed. (#36) ## grex 1.2.0 (released on 28 Mar 2021) From 6145328a6ffc4a74bf4af52585790c33df0d4aed Mon Sep 17 00:00:00 2001 From: "Peter M. 
Stahl" Date: Wed, 15 Sep 2021 12:58:10 +0200 Subject: [PATCH 18/18] Fix release workflow --- .github/workflows/release.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index cdfe614..77fc671 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -62,7 +62,9 @@ jobs: if: ${{ matrix.os == 'windows-latest' }} run: | choco install zip - zip target/${{ matrix.target }}/release/grex-${{ steps.get_version.outputs.version }}-${{ matrix.target }}.zip target/${{ matrix.target }}/release/grex.exe + cd target/${{ matrix.target }}/release + zip grex-${{ steps.get_version.outputs.version }}-${{ matrix.target }}.zip grex.exe + cd ../../.. - name: Create tar.gz file on macOS and Linux if: ${{ matrix.os != 'windows-latest' }} @@ -79,9 +81,8 @@ jobs: file_glob: true file: target/${{ matrix.target }}/release/grex-${{ steps.get_version.outputs.version }}-${{ matrix.target }}.{zip,tar.gz} - # ENABLE AGAIN FOR NEXT RELEASE 1.3 - #- name: Upload release to crates.io - # uses: katyo/publish-crates@v1 - # if: ${{ matrix.os == 'ubuntu-latest' }} - # with: - # registry-token: ${{ secrets.CARGO_REGISTRY_TOKEN }} + - name: Upload release to crates.io + uses: katyo/publish-crates@v1 + if: ${{ matrix.os == 'ubuntu-latest' }} + with: + registry-token: ${{ secrets.CARGO_REGISTRY_TOKEN }}