Skip to content

Commit

Permalink
Address review concerns on wording
Browse files Browse the repository at this point in the history
  • Loading branch information
Bingran Hu committed Jul 23, 2024
1 parent 30b2b5f commit 35d34c5
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 22 deletions.
14 changes: 7 additions & 7 deletions components/core/src/clp/regex_utils/constants.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ namespace clp::regex_utils {
constexpr size_t cCharBitarraySize = 128;

/**
* Create an ASCII character lookup table at compile time.
* Creates an ASCII character lookup table at compile time.
*
* @param char_str A string that contains the characters to look up.
* @return The lookup table as bit array.
Expand All @@ -18,7 +18,7 @@ constexpr size_t cCharBitarraySize = 128;
) -> std::array<bool, cCharBitarraySize> {
std::array<bool, cCharBitarraySize> bit_array{};
bit_array.fill(false);
for (char const ch : char_str) {
for (auto const ch : char_str) {
bit_array.at(ch) = true;
}
return bit_array;
Expand All @@ -38,11 +38,11 @@ constexpr char cEscapeChar{'\\'};
constexpr char cCharsetNegate{'^'};

// Character bitmaps
// This is a more complete set of meta characters than necessary, as the user might not be fully
// knowledgeable on which meta characters to escape, and may introduce unnecessary escape sequences.
constexpr auto cRegexEscapeSeqMetaChars = create_char_bit_array("*+?|^$.{}[]()<>-_/=!\\");
// This is the set of meta characters that need to be escaped in the wildcard syntax.
constexpr auto cWildcardMetaChars = create_char_bit_array("?*\\");
// The set of characters that can be preceded with an escape backslash. When escaped, these
// characters are used as literals.
constexpr auto cRegexEscapeSeqMetaCharsLUT = create_char_bit_array("*+?|^$.{}[]()<>-_/=!\\");
// The set of metacharacters that need to be escaped in the wildcard syntax.
constexpr auto cWildcardMetaCharsLUT = create_char_bit_array("?*\\");
} // namespace clp::regex_utils

#endif // CLP_REGEX_UTILS_CONSTANTS_HPP
11 changes: 6 additions & 5 deletions components/core/src/clp/regex_utils/regex_translation_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,11 @@ using StateTransitionFuncSig
[[nodiscard]] StateTransitionFuncSig dot_state_transition;

/**
* Appends regex metacharacters literally to the wildcard string.
* Appends an escaped regex metacharacter as a literal character to the wildcard string by
* discarding its preceding backslash.
*
* These metacharacters are escaped by backslashes, so they have their special meanings suppressed.
* For metacharacters shared by the regex and the wildcard syntax, keep the escape backslashes.
* The preceding backslash must be kept for characters that also have special meanings in the
* wildcard syntax, e.g. `abc.\*xyz` should be translated into `abc?\*xyz` instead of `abc?*xyz`.
*/
[[nodiscard]] StateTransitionFuncSig escaped_state_transition;

Expand Down Expand Up @@ -178,10 +179,10 @@ auto escaped_state_transition(
[[maybe_unused]] RegexToWildcardTranslatorConfig const& config
) -> error_code {
auto const ch{*it};
if (!cRegexEscapeSeqMetaChars.at(ch)) {
if (false == cRegexEscapeSeqMetaCharsLUT.at(ch)) {
return ErrorCode::IllegalEscapeSequence;
}
if (cWildcardMetaChars.at(ch)) {
if (cWildcardMetaCharsLUT.at(ch)) {
wildcard_str = wildcard_str + cEscapeChar + ch;
} else {
wildcard_str += ch;
Expand Down
17 changes: 11 additions & 6 deletions components/core/tests/test-regex_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,42 @@ using clp::regex_utils::ErrorCode;
using clp::regex_utils::regex_to_wildcard;
using clp::regex_utils::RegexToWildcardTranslatorConfig;

TEST_CASE("regex_to_wildcard", "[regex_utils][regex_to_wildcard]") {
// Test empty string
TEST_CASE("regex_to_wildcard_simple_translations", "[regex_utils][re2wc][simple_translations]") {
REQUIRE(regex_to_wildcard("").value().empty());

// Test simple wildcard translations
REQUIRE((regex_to_wildcard("xyz").value() == "xyz"));
REQUIRE((regex_to_wildcard(". xyz .* zyx .").value() == "? xyz * zyx ?"));
REQUIRE((regex_to_wildcard(". xyz .+ zyx .*").value() == "? xyz ?* zyx *"));
}

// Test unescaped meta characters
TEST_CASE("regex_to_wildcard_unescaped_metachar", "[regex_utils][re2wc][unescaped_metachar]") {
REQUIRE((regex_to_wildcard(".? xyz .* zyx .").error() == ErrorCode::UnsupportedQuestionMark));
REQUIRE((regex_to_wildcard(". xyz .** zyx .").error() == ErrorCode::UntranslatableStar));
REQUIRE((regex_to_wildcard(". xyz .*+ zyx .").error() == ErrorCode::UntranslatablePlus));
REQUIRE((regex_to_wildcard(". xyz |.* zyx .").error() == ErrorCode::UnsupportedPipe));
REQUIRE((regex_to_wildcard(". xyz ^.* zyx .").error() == ErrorCode::IllegalCaret));
REQUIRE((regex_to_wildcard(". xyz $.* zyx .").error() == ErrorCode::IllegalDollarSign));
}


// Test escaped meta characters
TEST_CASE("regex_to_wildcard_escaped_metachar", "[regex_utils][re2wc][escaped_metachar]") {
// Escape backslash is superfluous for the following set of characters
REQUIRE((regex_to_wildcard("<>-_/=!").value() == "<>-_/=!"));
REQUIRE((regex_to_wildcard("\\<\\>\\-\\_\\/\\=\\!").value() == "<>-_/=!"));
// Test the full escape sequences set
REQUIRE(
(regex_to_wildcard("\\*\\+\\?\\|\\^\\$\\.\\{\\}\\[\\]\\(\\)\\<\\>\\-\\_\\/\\=\\!\\\\")
.value()
== "\\*+\\?|^$.{}[]()<>-_/=!\\\\")
);
// Test unsupported escape sequences
REQUIRE(
(regex_to_wildcard("abc\\Qdefghi\\Ejkl").error()
== clp::regex_utils::ErrorCode::IllegalEscapeSequence)
);
}

TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][regex_to_wildcard][anchor_config]") {
TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][re2wc][anchor_config]") {
// Test anchors and prefix/suffix wildcards
RegexToWildcardTranslatorConfig const config{false, true};
REQUIRE(((regex_to_wildcard("^", config).value() == "*")));
Expand Down
10 changes: 6 additions & 4 deletions docs/src/dev-guide/components-core/regex-utils.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,13 @@ For a detailed description on the options order and usage, see the
* Turn `.+` into `?*`
* E.g. `abc.*def.ghi.+` will get translated to `abc*def?ghi?*`
* Metacharacter escape sequences
* Append the escaped metacharacters literally to the wildcard output string.
* The list of characters that require escaping is `[\/^$.|?*+(){}`.
* The list of characters that are optional to escape is `],<>-_=!`. Escaping them will not have
any effect (the escape backslash is ignored). This list may be expanded in the future.
* An escaped regex metacharacter is treated as a literal and appended to the wildcard output.
* The list of characters that require escaping to have their special meanings suppressed is
`[\/^$.|?*+(){}`.
* Superfluous escape characters are ignored for the following characters: `],<>-_=!`.
* E.g. `a\[\+b\-\_c-_d` will get translated to `a[+b-_c-_d`
* Note: generally, any non-alphanumeric character can be escaped to use it as a literal. The
list this utils library supports is non-exhaustive and can be expanded when necessary.
* For metacharacters shared by both syntaxes, keep the escape backslashes.
* The list of characters that fall into this category is `*?\`. All wildcard metacharacters are
also regex metacharacters.
Expand Down

0 comments on commit 35d34c5

Please sign in to comment.