diff --git a/Cargo.lock b/Cargo.lock index 0877aae..40118fd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -171,7 +171,7 @@ dependencies = [ [[package]] name = "grex" -version = "0.4.0" +version = "1.0.0" dependencies = [ "assert_cmd 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)", "colored 1.9.2 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/Cargo.toml b/Cargo.toml index cf620b2..e5a8555 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ [package] name = "grex" -version = "0.4.0" +version = "1.0.0" authors = ["Peter M. Stahl "] description = """ grex generates regular expressions from user-provided test cases. diff --git a/README.md b/README.md index 620d407..8ab2242 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,9 @@ [![Downloads](https://img.shields.io/crates/d/grex.svg)](https://crates.io/crates/grex) [![license](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0) -[![Linux Download](https://img.shields.io/badge/Linux%20Download-v0.3.2-blue?logo=Linux)](https://github.com/pemistahl/grex/releases/download/v0.3.2/grex-v0.3.2-x86_64-unknown-linux-musl.tar.gz) -[![MacOS Download](https://img.shields.io/badge/macOS%20Download-v0.3.2-blue?logo=Apple)](https://github.com/pemistahl/grex/releases/download/v0.3.2/grex-v0.3.2-x86_64-apple-darwin.tar.gz) -[![Windows Download](https://img.shields.io/badge/Windows%20Download-v0.3.2-blue?logo=Windows)](https://github.com/pemistahl/grex/releases/download/v0.3.2/grex-v0.3.2-x86_64-pc-windows-msvc.zip) +[![Linux Download](https://img.shields.io/badge/Linux%20Download-v1.0.0-blue?logo=Linux)](https://github.com/pemistahl/grex/releases/download/v1.0.0/grex-v1.0.0-x86_64-unknown-linux-musl.tar.gz) +[![MacOS Download](https://img.shields.io/badge/macOS%20Download-v1.0.0-blue?logo=Apple)](https://github.com/pemistahl/grex/releases/download/v1.0.0/grex-v1.0.0-x86_64-apple-darwin.tar.gz) +[![Windows 
Download](https://img.shields.io/badge/Windows%20Download-v1.0.0-blue?logo=Windows)](https://github.com/pemistahl/grex/releases/download/v1.0.0/grex-v1.0.0-x86_64-pc-windows-msvc.zip) ## Table of Contents 1. [What does this tool do?](#what-does-tool-do) @@ -23,15 +23,20 @@ 4.1 [The command-line tool](#how-to-use-cli) 4.2 [The library](#how-to-use-library) 4.3 [Examples](#examples) -5. [How does it work?](#how-does-it-work) -6. [Do you want to contribute?](#contribution) +5. [How to build?](#how-to-build) +6. [How does it work?](#how-does-it-work) +7. [Do you want to contribute?](#contribution) ## 1. What does this tool do? [Top ▲](#table-of-contents) *grex* is a library as well as a command-line utility that is meant to simplify the often complicated and tedious task of creating regular expressions. It does so by automatically -generating regular expressions from user-provided test cases. +generating regular expressions from user-provided test cases. The produced expressions +are Perl-compatible regular expressions (PCRE) which are also compatible with the +regular expression parser in the [*regex*](https://crates.io/crates/regex) crate. +Other regular expression parsers or respective libraries from other programming languages +have not been tested so far. This project has started as a Rust port of the JavaScript tool [*regexgen*](https://github.com/devongovett/regexgen) written by @@ -53,8 +58,9 @@ With the use of command-line flags (in the CLI tool) or preprocessing methods - alternation using `|` operator - optionality using `?` quantifier - escaping of non-ascii characters, with optional conversion of astral code points to surrogate pairs -- concatenation of all of the former +- fully Unicode-aware, correctly handles graphemes consisting of multiple Unicode symbols - reading input strings from the command-line or from a file +- optional syntax highlighting for nicer output in supported terminals ## 3. How to install? 
[Top ▲](#table-of-contents) @@ -90,61 +96,53 @@ In order to use *grex* as a library, simply add it as a dependency to your `Carg ```toml [dependencies] -grex = "0.3" +grex = "1.0.0" ``` ## 4. How to use? [Top ▲](#table-of-contents) -Every generated regular expression is surrounded by the anchors `^` and `$` so that it does not -accidentally match substrings. - ### 4.1 The command-line tool [Top ▲](#table-of-contents) ``` -$ grex --help -grex 0.3.2 -Peter M. Stahl +$ grex -h + +grex 1.0.0 +© 2019-2020 Peter M. Stahl +Licensed under the Apache License, Version 2.0 +Downloadable from https://crates.io/crates/grex +Source code at https://github.com/pemistahl/grex + grex generates regular expressions from user-provided test cases. USAGE: grex [FLAGS] ... --file FLAGS: - -r, --convert-repetitions - Detects repeated non-overlapping substrings and - converts them to {min,max} quantifier notation - - -e, --escape - Replaces all non-ASCII characters with unicode escape sequences - - --with-surrogates - Converts astral code points to surrogate pairs if --escape is set - - -h, --help - Prints help information - - -v, --version - Prints version information + -d, --digits Converts any Unicode decimal digit to \d + -D, --non-digits Converts any character which is not a Unicode decimal digit to \D + -s, --spaces Converts any Unicode whitespace character to \s + -S, --non-spaces Converts any character which is not a Unicode whitespace character to \S + -w, --words Converts any Unicode word character to \w + -W, --non-words Converts any character which is not a Unicode word character to \W + -r, --repetitions Detects repeated non-overlapping substrings and + converts them to {min,max} quantifier notation + -e, --escape Replaces all non-ASCII characters with unicode escape sequences + --with-surrogates Converts astral code points to surrogate pairs if --escape is set + -c, --colorize Provides syntax highlighting for the resulting regular expression + -h, --help Prints help 
information + -v, --version Prints version information OPTIONS: - -f, --file - Reads test cases separated by newline characters from a file + -f, --file Reads test cases separated by newline characters from a file ARGS: - ... - One or more test cases separated by blank space -``` - -Input strings can be read from the command line or from a file. Every file must be encoded as UTF-8 -and every input string must be on a separate line: - -``` -$ grex -f my-input-file.txt + ... One or more test cases separated by blank space + ``` ### 4.2 The library [Top ▲](#table-of-contents) -#### Default settings +#### 4.2.1 Default settings ```rust use grex::RegExpBuilder; @@ -153,7 +151,7 @@ let regexp = RegExpBuilder::from(&["a", "aa", "aaa"]).build(); assert_eq!(regexp, "^a(aa?)?$"); ``` -#### Convert repeated substrings +#### 4.2.2 Convert repeated substrings ```rust use grex::{Feature, RegExpBuilder}; @@ -164,20 +162,34 @@ let regexp = RegExpBuilder::from(&["a", "aa", "aaa"]) assert_eq!(regexp, "^a{1,3}$"); ``` -#### Escape non-ascii characters +#### 4.2.3 Convert to character classes + +```rust +use grex::{Feature, RegExpBuilder}; + +let regexp = RegExpBuilder::from(&["a", "aa", "123"]) + .with_conversion_of(&[Feature::Digit, Feature::Word]) + .build(); +assert_eq!(regexp, "^(\\d\\d\\d|\\w\\w|\\w)$"); +``` + +#### 4.2.4 Escape non-ascii characters ```rust use grex::RegExpBuilder; let regexp = RegExpBuilder::from(&["You smell like 💩."]) - .with_escaped_non_ascii_chars(false) + .with_escaping_of_non_ascii_chars(false) .build(); assert_eq!(regexp, "^You smell like \\u{1f4a9}\\.$"); ``` -#### Escape astral code points using surrogate pairs +#### 4.2.5 Escape astral code points using surrogate pairs -Old versions of JavaScript do not support unicode escape sequences for the astral code planes (range `U+010000` to `U+10FFFF`). In order to support these symbols in JavaScript regular expressions, the conversion to surrogate pairs is necessary. 
More information on that matter can be found [here](https://mathiasbynens.be/notes/javascript-unicode). +Old versions of JavaScript do not support unicode escape sequences for the astral code planes +(range `U+010000` to `U+10FFFF`). In order to support these symbols in JavaScript regular +expressions, the conversion to surrogate pairs is necessary. More information on that matter +can be found [here](https://mathiasbynens.be/notes/javascript-unicode). ```rust use grex::RegExpBuilder; @@ -188,7 +200,7 @@ let regexp = RegExpBuilder::from(&["You smell like 💩."]) assert_eq!(regexp, "^You smell like \\u{d83d}\\u{dca9}\\.$"); ``` -#### Combine multiple features +#### 4.2.6 Combine multiple features ```rust use grex::{Feature, RegExpBuilder}; @@ -200,44 +212,166 @@ let regexp = RegExpBuilder::from(&["You smell like 💩💩💩."]) assert_eq!(regexp, "^You smel{2} like \\u{1f4a9}{3}\\.$"); ``` +```rust +use grex::{Feature, RegExpBuilder}; + +let regexp = RegExpBuilder::from(&["a", "aa", "123"]) + .with_conversion_of(&[Feature::Repetition, Feature::Digit, Feature::Word]) + .build(); +assert_eq!(regexp, "^(\\w{1,2}|\\d{3})$"); +``` + +#### 4.2.7 Syntax highlighting + +⚠ The method `with_syntax_highlighting()` may only be used if the resulting regular expression is meant to +be printed to the console. The regex string representation returned from enabling +this setting cannot be fed into the [*regex*](https://crates.io/crates/regex) crate. 
+ +```rust +use grex::RegExpBuilder; + +let regexp = RegExpBuilder::from(&["a", "aa", "123"]) + .with_syntax_highlighting() + .build(); +``` + ### 4.3 Examples [Top ▲](#table-of-contents) -The following table showcases what *grex* can do (using the command-line flags here): - -| Input | Output | Note | -| ----- | ------ | ---- | -| `a b c` | `^[a-c]$` | | -| `a c d e f` | `^[ac-f]$` | | -| `1 3 4 5 6` | `^[13-6]$` | | -| `a b x de` | ^(de|[abx])$ | | -| `a b bc` | ^(bc?|a)$ | | -| `a aa aaa` | `^a(aa?)?$` | | -| `a ab abc` | `^a(bc?)?$` | | -| `3.5 4.5 4,5` | ^(3\\.5|4[,.]5)$ | | -| `[a-z]` | `^\[a\-z\]$` | Regex syntax characters are escaped. | -| `y̆ a z` | ^([az]|y̆)$ | Grapheme `y̆` consists of two unicode symbols:
`U+0079` (Latin Small Letter Y)
`U+0306` (Combining Breve).
This is why it is not part of
the character class. | -| `"I ♥ cake" "I ♥ cookies"` | ^I ♥ c(ookies|ake)$ | Input containing blank space must be
surrounded by quotation marks. | -| `"I \u{2665} cake"` | `^I ♥ cake$` | Unicode escape sequences are converted
back to the original unicode symbol. | -| `-r aaa` | `^a{3}$` | | -| `-r abababa` | `^(ab){3}a$` | | -| `-r aababab` | `^a(ab){3}$` | | -| `-r abababaa` | `^(ab){3}a{2}$` | | -| `-r a aa aaa` | `^a{1,3}$` | | -| `-r b ba baa baaa` | `^b(a{1,3})?$` | | -| `-r b ba baa baaaa` | ^b(a{1,2}|a{4})?$ | | -| `-r xy̆y̆z xy̆y̆y̆z` | `^x(y̆){2,3}z$` | The parentheses are needed because
`y̆` consists of two unicode symbols. | -| `-r xy̆y̆z xy̆y̆y̆y̆z` | ^x((y̆){2}|(y̆){4})z$ | | -| `-r zyxx yxx` | `^z?yx{2}$` | | -| `-r 4.5 44.5 44.55 4.55 ` | `^4{1,2}\.5{1,2}$` | | -| `-r "I ♥♥ cake"` | `^I ♥{2} cake$` | | -| `-r "I \u{2665}\u{2665} cake"` | `^I ♥{2} cake$` | | -| `-e "I ♥♥ you."` | `^I \u{2665}\u{2665} you\.$` | | -| `-e -r "I ♥♥ you."` | `^I \u{2665}{2} you\.$` | | -| `-e "You smell like 💩💩."` | `^You smell like \u{1f4a9}\u{1f4a9}\.$` | | -| `-e -r "You smell like 💩💩."` | `^You smel{2} like \u{1f4a9}{2}\.$` | | -| `-e -r --with-surrogates "You smell like 💩💩."` | `^You smel{2} like (\u{d83d}\u{dca9}){2}\.$` | For languages such as older
JavaScript versions not supporting
astral codepoints (`U+010000` to `U+10FFFF`),
conversion to surrogate pairs is possible.
More info about this issue can be found [here](https://mathiasbynens.be/notes/javascript-unicode). | - -## 5. How does it work? [Top ▲](#table-of-contents) +The following examples show the various supported regex syntax features: + +``` +$ grex a b c +^[a-c]$ + +$ grex a c d e f +^[ac-f]$ + +$ grex a b x de +^(de|[abx])$ + +$ grex a b bc +^(bc?|a)$ + +$ grex [a-z] +^\[a\-z\]$ + +$ grex -r b ba baa baaa +^b(a{1,3})?$ + +$ grex -r b ba baa baaaa +^b(a{1,2}|a{4})?$ + +$ grex y̆ a z +^(y̆|[az])$ +Note: +Grapheme y̆ consists of two Unicode symbols: +U+0079 (Latin Small Letter Y) +U+0306 (Combining Breve). + +$ grex "I ♥ cake" "I ♥ cookies" +^I ♥ c(ookies|ake)$ +Note: +Input containing blank space must be +surrounded by quotation marks. +``` + +The string `"I ♥♥♥ 36 and ٣ and 💩💩."` serves as input for the following examples using the command-line notation: + +``` +$ grex +^I ♥♥♥ 36 and ٣ and 💩💩\.$ + +$ grex -e +^I \u{2665}\u{2665}\u{2665} 36 and \u{663} and \u{1f4a9}\u{1f4a9}\.$ + +$ grex -e --with-surrogates +^I \u{2665}\u{2665}\u{2665} 36 and \u{663} and \u{d83d}\u{dca9}\u{d83d}\u{dca9}\.$ + +$ grex -d +^I ♥♥♥ \d\d and \d and 💩💩\.$ + +$ grex -s +^I\s♥♥♥\s36\sand\s٣\sand\s💩💩\.$ + +$ grex -w +^\w ♥♥♥ \w\w \w\w\w \w \w\w\w 💩💩\.$ + +$ grex -D +^\D\D\D\D\D\D36\D\D\D\D\D٣\D\D\D\D\D\D\D\D$ + +$ grex -S +^\S \S\S\S \S\S \S\S\S \S \S\S\S \S\S\S$ + +$ grex -dsw +^\w\s♥♥♥\s\d\d\s\w\w\w\s\d\s\w\w\w\s💩💩\.$ + +$ grex -dswW +^\w\s\W\W\W\s\d\d\s\w\w\w\s\d\s\w\w\w\s\W\W\W$ + +$ grex -r +^I ♥{3} 36 and ٣ and 💩{2}\.$ + +$ grex -er +^I \u{2665}{3} 36 and \u{663} and \u{1f4a9}{2}\.$ + +$ grex -er --with-surrogates +^I \u{2665}{3} 36 and \u{663} and (\u{d83d}\u{dca9}){2}\.$ + +$ grex -dr +^I ♥{3} \d(\d and ){2}💩{2}\.$ + +$ grex -rs +^I\s♥{3}\s36\sand\s٣\sand\s💩{2}\.$ + +$ grex -rw +^\w ♥{3} \w(\w \w{3} ){2}💩{2}\.$ + +$ grex -Dr +^\D{6}36\D{5}٣\D{8}$ + +$ grex -rS +^\S \S(\S{2} ){2}\S{3} \S \S{3} \S{3}$ + +$ grex -rW +^I\W{5}36\Wand\W٣\Wand\W{4}$ + +$ grex -drsw 
+^\w\s♥{3}\s\d(\d\s\w{3}\s){2}💩{2}\.$ + +$ grex -drswW +^\w\s\W{3}\s\d(\d\s\w{3}\s){2}\W{3}$ +``` + +## 5. How to build? [Top ▲](#table-of-contents) + +In order to build the source code yourself, you need the +[stable Rust toolchain](https://www.rust-lang.org/tools/install) installed on your machine +so that [*cargo*](https://doc.rust-lang.org/cargo/), the Rust package manager, is available. + +``` +git clone https://github.com/pemistahl/grex.git +cd grex +cargo build +``` + +The source code is accompanied by an extensive test suite consisting of unit tests, integration +tests and property tests. For running the unit and integration tests, simply say: + +``` +cargo test +``` + +Property tests are disabled by default with the `#[ignore]` annotation because they are +very long-running. They are used for automatically generating test cases for regular +expression conversion. If a test case is found that produces a wrong conversion, it is +shrunk to the shortest test case possible that still produces a wrong result. +This is a very useful tool for finding bugs. If you want to run these tests, say: + +``` +cargo test -- --ignored +``` + +## 6. How does it work? [Top ▲](#table-of-contents) 1. A [deterministic finite automaton](https://en.wikipedia.org/wiki/Deterministic_finite_automaton) (DFA) is created from the input strings. @@ -245,6 +379,6 @@ The following table showcases what *grex* can do (using the command-line flags h 3. The minimized DFA is expressed as a system of linear equations which are solved with [Brzozowski's algebraic method](http://cs.stackexchange.com/questions/2016/how-to-convert-finite-automata-to-regular-expressions#2392), resulting in the final regular expression. -## 6. Do you want to contribute? [Top ▲](#table-of-contents) +## 7. Do you want to contribute? [Top ▲](#table-of-contents) In case you want to contribute something to *grex* even though it's in a very early stage of development, then I encourage you to do so nevertheless.
Do you have ideas for cool features? Or have you found any bugs so far? Feel free to open an issue or send a pull request. It's very much appreciated. :-) \ No newline at end of file diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index a8bc366..f428dfb 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,3 +1,13 @@ +## grex 1.0.0 (released on 02 Feb 2020) + +### Features +- conversion to character classes `\d`, `\D`, `\s`, `\S`, `\w`, `\W` is now supported +- repetition detection now works with arbitrarily nested expressions. Input strings such as `aaabaaab` which were previously converted to `^(aaab){2}$` are now converted to `^(a{3}b){2}$`. +- optional syntax highlighting for the produced regular expressions can now be enabled using the `--colorize` command-line flag or with the library method `RegExpBuilder::with_syntax_highlighting()` + +### Test Coverage +- new unit tests, integration tests and property tests have been added + ## grex 0.3.2 (released on 12 Jan 2020) ### Test Coverage diff --git a/src/lib.rs b/src/lib.rs index 61b1968..b01b569 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,7 +18,11 @@ //! //! *grex* is a library as well as a command-line utility that is meant to simplify the often //! complicated and tedious task of creating regular expressions. It does so by automatically -//! generating regular expressions from user-provided test cases. +//! generating regular expressions from user-provided test cases. The produced expressions +//! are Perl-compatible regular expressions (PCRE) which are also compatible with the +//! regular expression parser in the [*regex*](https://crates.io/crates/regex) crate. +//! Other regular expression parsers or respective libraries from other programming languages +//! have not been tested so far. //! //! This project has started as a Rust port of the JavaScript tool //! [*regexgen*](https://github.com/devongovett/regexgen) written by @@ -41,8 +45,9 @@ //! - alternation using `|` operator //!
- optionality using `?` quantifier //! - escaping of non-ascii characters, with optional conversion of astral code points to surrogate pairs -//! - concatenation of all of the former +//! - fully Unicode-aware, correctly handles graphemes consisting of multiple Unicode symbols //! - reading input strings from the command-line or from a file +//! - optional syntax highlighting for nicer output in supported terminals //! //! ## 3. How to use? //! @@ -71,7 +76,18 @@ //! assert_eq!(regexp, "^a{1,3}$"); //! ``` //! -//! ### 3.3 Escape non-ascii characters +//! ### 3.3 Convert to character classes +//! +//! ``` +//! use grex::{Feature, RegExpBuilder}; +//! +//! let regexp = RegExpBuilder::from(&["a", "aa", "123"]) +//! .with_conversion_of(&[Feature::Digit, Feature::Word]) +//! .build(); +//! assert_eq!(regexp, "^(\\d\\d\\d|\\w\\w|\\w)$"); +//! ``` +//! +//! ### 3.4 Escape non-ascii characters //! //! ``` //! use grex::RegExpBuilder; @@ -82,9 +98,13 @@ //! assert_eq!(regexp, "^You smell like \\u{1f4a9}\\.$"); //! ``` //! -//! ### 3.4 Escape astral code points using surrogate pairs +//! ### 3.5 Escape astral code points using surrogate pairs //! -//! Old versions of JavaScript do not support unicode escape sequences for the astral code planes (range `U+010000` to `U+10FFFF`). In order to support these symbols in JavaScript regular expressions, the conversion to surrogate pairs is necessary. More information on that matter can be found [here](https://mathiasbynens.be/notes/javascript-unicode). +//! Old versions of JavaScript do not support unicode escape sequences for +//! the astral code planes (range `U+010000` to `U+10FFFF`). In order to +//! support these symbols in JavaScript regular expressions, the conversion +//! to surrogate pairs is necessary. More information on that matter can be +//! found [here](https://mathiasbynens.be/notes/javascript-unicode). //! //! ``` //! use grex::RegExpBuilder; @@ -106,6 +126,15 @@ //! .build(); //! 
assert_eq!(regexp, "^You smel{2} like \\u{1f4a9}{3}\\.$"); //! ``` +//! +//! ```rust +//! use grex::{Feature, RegExpBuilder}; +//! +//! let regexp = RegExpBuilder::from(&["a", "aa", "123"]) +//! .with_conversion_of(&[Feature::Repetition, Feature::Digit, Feature::Word]) +//! .build(); +//! assert_eq!(regexp, "^(\\w{1,2}|\\d{3})$"); +//! ``` #[macro_use] mod macros; diff --git a/src/main.rs b/src/main.rs index a739bc6..7d65cd1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -21,7 +21,14 @@ use std::path::PathBuf; use structopt::StructOpt; #[derive(StructOpt)] -#[structopt(author, about, version_short = "v")] +#[structopt( + author = "© 2019-2020 Peter M. Stahl ", + about = "Licensed under the Apache License, Version 2.0\n\ + Downloadable from https://crates.io/crates/grex\n\ + Source code at https://github.com/pemistahl/grex\n\n\ + grex generates regular expressions from user-provided test cases.", + version_short = "v" +)] struct CLI { #[structopt( value_name = "INPUT", diff --git a/src/regexp.rs b/src/regexp.rs index b796195..de497ab 100644 --- a/src/regexp.rs +++ b/src/regexp.rs @@ -72,6 +72,11 @@ impl RegExpBuilder { self } + /// Tells `RegExpBuilder` to provide syntax highlighting for the resulting regular expression. + /// + /// ⚠ This method may only be used if the resulting regular expression is meant to + /// be printed to the console. The regex string representation returned from enabling + /// this setting cannot be fed into the [*regex*](https://crates.io/crates/regex) crate. pub fn with_syntax_highlighting(&mut self) -> &mut Self { self.is_output_colorized = true; self