Skip to content

Commit

Permalink
regex-utils: Add support for handling escaped regex metacharacters. (y…
Browse files Browse the repository at this point in the history
  • Loading branch information
Bill-hbrhbr authored Jul 24, 2024
1 parent 2a6218e commit 9f85a88
Show file tree
Hide file tree
Showing 7 changed files with 125 additions and 24 deletions.
2 changes: 1 addition & 1 deletion components/core/.clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ IncludeCategories:
# Library headers. Update when adding new libraries.
# NOTE: clang-format retains leading white-space on a line in violation of the YAML spec.
- Regex: "<(absl|antlr4|archive|boost|bsoncxx|catch2|curl|date|fmt|json|log_surgeon|mariadb\
|mongocxx|msgpack|outcome|simdjson|spdlog|sqlite3|string_utils|yaml-cpp|zstd)"
|mongocxx|msgpack|outcome|regex_utils|simdjson|spdlog|sqlite3|string_utils|yaml-cpp|zstd)"
Priority: 3
# C system headers
- Regex: "^<.+\\.h>"
Expand Down
4 changes: 4 additions & 0 deletions components/core/src/clp/regex_utils/ErrorCode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ auto ErrorCodeCategory::message(int ev) const -> string {
case ErrorCode::IllegalDollarSign:
return "Failed to translate due to end anchor `$` in the middle of the string.";

case ErrorCode::IllegalEscapeSequence:
return "Currently only supports escape sequences that are used to suppress special "
"meanings of regex metacharacters. Alphanumeric characters are disallowed.";

case ErrorCode::UnmatchedParenthesis:
return "Unmatched opening `(` or closing `)`.";

Expand Down
1 change: 1 addition & 0 deletions components/core/src/clp/regex_utils/ErrorCode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ enum class ErrorCode : uint8_t {
UnsupportedPipe,
IllegalCaret,
IllegalDollarSign,
IllegalEscapeSequence,
UnmatchedParenthesis,
};

Expand Down
30 changes: 30 additions & 0 deletions components/core/src/clp/regex_utils/constants.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,29 @@
#ifndef CLP_REGEX_UTILS_CONSTANTS_HPP
#define CLP_REGEX_UTILS_CONSTANTS_HPP

#include <array>
#include <cstddef>
#include <string_view>

namespace clp::regex_utils {
constexpr size_t cCharBitarraySize = 128;

/**
 * Creates an ASCII character lookup table at compile time.
 *
 * The returned table is indexed by character code: entry `c` is `true` iff the character with code
 * `c` appears in `char_str`.
 *
 * @param char_str A string that contains the characters to look up. Every character's code must be
 * less than `cCharBitarraySize`.
 * @return The lookup table as bit array.
 * @throw std::out_of_range if `char_str` contains a character whose code is not less than
 * `cCharBitarraySize`. When the call is evaluated as a constant expression, this surfaces as a
 * compilation error instead.
 */
[[nodiscard]] constexpr auto create_char_bit_array(std::string_view char_str
) -> std::array<bool, cCharBitarraySize> {
    // Value-initialization already sets every entry to `false`, so no explicit `fill` is needed
    // (`std::array::fill` is also not `constexpr` before C++20).
    std::array<bool, cCharBitarraySize> bit_array{};
    for (auto const ch : char_str) {
        // Convert through `unsigned char` so that the index is the character's byte value
        // regardless of whether plain `char` is signed on the target platform. `at` keeps the
        // bounds check for bytes that don't fit in the table.
        bit_array.at(static_cast<unsigned char>(ch)) = true;
    }
    return bit_array;
}

// Wildcard meta characters
constexpr char cZeroOrMoreCharsWildcard{'*'};
constexpr char cSingleCharWildcard{'?'};
Expand All @@ -14,6 +36,14 @@ constexpr char cRegexStartAnchor{'^'};
constexpr char cRegexEndAnchor{'$'};
constexpr char cEscapeChar{'\\'};
constexpr char cCharsetNegate{'^'};

// Character bitmaps
// The set of regex metacharacters that can be preceded with an escape backslash to be treated as a
// literal.
constexpr auto cRegexEscapeSeqMetaCharsLut = create_char_bit_array("*+?|^$.{}[]()<>-_/=!\\");
// The set of wildcard metacharacters that must remain escaped in the translated string to be
// treated as a literal.
constexpr auto cWildcardMetaCharsLut = create_char_bit_array("?*\\");
} // namespace clp::regex_utils

#endif // CLP_REGEX_UTILS_CONSTANTS_HPP
67 changes: 51 additions & 16 deletions components/core/src/clp/regex_utils/regex_translation_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,19 @@ class TranslatorState {
*
* This list may be expanded as the translator supports translating more regex patterns.
* <ul>
* <li>NORMAL: The initial state, where characters have no special meanings and are treated
* <li>Normal: The initial state, where characters have no special meanings and are treated
* literally.</li>
* <li>DOT: Encountered a period `.`. Expecting wildcard expression.</li>
* <li>END: Encountered a dollar sign `$`, meaning the regex string has reached the end
* <li>Dot: Encountered a period `.`. Expecting wildcard expression.</li>
* <li>Escaped: Encountered a backslash `\`. Expecting an escape sequence.</li>
* <li>End: Encountered a dollar sign `$`, meaning the regex string has reached the end
* anchor.</li>
* </ul>
*/
enum class RegexPatternState : uint8_t {
NORMAL = 0,
DOT,
END,
Normal = 0,
Dot,
Escaped,
End,
};

// Constructor
Expand All @@ -51,7 +53,7 @@ class TranslatorState {

private:
// Members
RegexPatternState m_state{RegexPatternState::NORMAL};
RegexPatternState m_state{RegexPatternState::Normal};
};

/**
Expand Down Expand Up @@ -92,13 +94,22 @@ using StateTransitionFuncSig
*/
[[nodiscard]] StateTransitionFuncSig dot_state_transition;

/**
* Appends an escaped regex metacharacter as a literal character to the wildcard string by
* discarding its preceding backslash.
*
* The preceding backslash must be kept for characters that also have special meanings in the
* wildcard syntax, e.g. `abc.\*xyz` should be translated into `abc?\*xyz` instead of `abc?*xyz`.
*/
[[nodiscard]] StateTransitionFuncSig escaped_state_transition;

/**
* Disallows the appearances of other characters after encountering an end anchor in the string.
*/
[[nodiscard]] StateTransitionFuncSig end_state_transition;

/**
* States other than the NORMAL state may require special handling after the whole regex string has
* States other than the Normal state may require special handling after the whole regex string has
* been scanned and processed.
*/
[[nodiscard]] StateTransitionFuncSig final_state_cleanup;
Expand All @@ -112,10 +123,13 @@ auto normal_state_transition(
auto const ch{*it};
switch (ch) {
case '.':
state.set_next_state(TranslatorState::RegexPatternState::DOT);
state.set_next_state(TranslatorState::RegexPatternState::Dot);
break;
case cEscapeChar:
state.set_next_state(TranslatorState::RegexPatternState::Escaped);
break;
case cRegexEndAnchor:
state.set_next_state(TranslatorState::RegexPatternState::END);
state.set_next_state(TranslatorState::RegexPatternState::End);
break;
case cRegexZeroOrMore:
return ErrorCode::UntranslatableStar;
Expand Down Expand Up @@ -155,7 +169,25 @@ auto dot_state_transition(
--it;
break;
}
state.set_next_state(TranslatorState::RegexPatternState::NORMAL);
state.set_next_state(TranslatorState::RegexPatternState::Normal);
return ErrorCode::Success;
}

/**
 * Handles the character that immediately follows an escape backslash.
 *
 * - If the character isn't in `cRegexEscapeSeqMetaCharsLut` (this includes every byte that lies
 *   outside the lookup table, e.g. non-ASCII bytes), the escape sequence is rejected.
 * - If the character is in `cWildcardMetaCharsLut`, the backslash is re-emitted before it so that it
 *   stays a literal in the wildcard string.
 * - Otherwise the character is appended on its own, dropping the backslash.
 *
 * On success, the state is set back to `Normal`. This function doesn't advance `it`.
 *
 * @param state The translator state to update.
 * @param it Iterator pointing at the escaped character.
 * @param wildcard_str The wildcard string being built; appended to on success.
 * @param config Unused.
 * @return ErrorCode::Success on success.
 * @return ErrorCode::IllegalEscapeSequence if the escaped character is unsupported.
 */
auto escaped_state_transition(
        TranslatorState& state,
        string_view::const_iterator& it,
        string& wildcard_str,
        [[maybe_unused]] RegexToWildcardTranslatorConfig const& config
) -> error_code {
    auto const ch{*it};
    // Index by the byte value rather than the (possibly negative) `char` value, and check the bound
    // up front: `std::array::at` would otherwise throw `std::out_of_range` for bytes that don't fit
    // in the table instead of letting us report an illegal escape sequence.
    auto const lut_idx{static_cast<std::size_t>(static_cast<unsigned char>(ch))};
    if (lut_idx >= cRegexEscapeSeqMetaCharsLut.size()
        || false == cRegexEscapeSeqMetaCharsLut[lut_idx])
    {
        return ErrorCode::IllegalEscapeSequence;
    }
    if (cWildcardMetaCharsLut.at(lut_idx)) {
        // The character is also a wildcard metacharacter, so it must remain escaped.
        wildcard_str += cEscapeChar;
    }
    wildcard_str += ch;
    state.set_next_state(TranslatorState::RegexPatternState::Normal);
    return ErrorCode::Success;
}

Expand All @@ -178,7 +210,7 @@ auto final_state_cleanup(
RegexToWildcardTranslatorConfig const& config
) -> error_code {
switch (state.get_state()) {
case TranslatorState::RegexPatternState::DOT:
case TranslatorState::RegexPatternState::Dot:
// The last character is a single `.`, without the possibility of becoming a
// multichar wildcard
wildcard_str += cSingleCharWildcard;
Expand All @@ -187,7 +219,7 @@ auto final_state_cleanup(
break;
}

if (TranslatorState::RegexPatternState::END != state.get_state()
if (TranslatorState::RegexPatternState::End != state.get_state()
&& config.add_prefix_suffix_wildcards())
{
wildcard_str += cZeroOrMoreCharsWildcard;
Expand Down Expand Up @@ -220,13 +252,16 @@ auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig co
error_code ec{};
while (it != regex_str.cend()) {
switch (state.get_state()) {
case TranslatorState::RegexPatternState::NORMAL:
case TranslatorState::RegexPatternState::Normal:
ec = normal_state_transition(state, it, wildcard_str, config);
break;
case TranslatorState::RegexPatternState::DOT:
case TranslatorState::RegexPatternState::Dot:
ec = dot_state_transition(state, it, wildcard_str, config);
break;
case TranslatorState::RegexPatternState::END:
case TranslatorState::RegexPatternState::Escaped:
ec = escaped_state_transition(state, it, wildcard_str, config);
break;
case TranslatorState::RegexPatternState::End:
ec = end_state_transition(state, it, wildcard_str, config);
break;
default:
Expand Down
30 changes: 23 additions & 7 deletions components/core/tests/test-regex_utils.cpp
Original file line number Diff line number Diff line change
@@ -1,31 +1,47 @@
#include <Catch2/single_include/catch2/catch.hpp>
#include <regex_utils/ErrorCode.hpp>
#include <regex_utils/regex_translation_utils.hpp>
#include <regex_utils/RegexToWildcardTranslatorConfig.hpp>

#include <Catch2/single_include/catch2/catch.hpp>

using clp::regex_utils::ErrorCode;
using clp::regex_utils::regex_to_wildcard;
using clp::regex_utils::RegexToWildcardTranslatorConfig;

TEST_CASE("regex_to_wildcard", "[regex_utils][regex_to_wildcard]") {
// Test empty string
// Verifies translations that involve only literals and the `.`, `.*` and `.+` patterns.
TEST_CASE("regex_to_wildcard_simple_translations", "[regex_utils][re2wc][simple_translations]") {
// An empty regex translates to an empty wildcard string
REQUIRE(regex_to_wildcard("").value().empty());

// Literals are copied verbatim; `.` becomes `?`, `.*` becomes `*`, and `.+` becomes `?*`
REQUIRE((regex_to_wildcard("xyz").value() == "xyz"));
REQUIRE((regex_to_wildcard(". xyz .* zyx .").value() == "? xyz * zyx ?"));
REQUIRE((regex_to_wildcard(". xyz .+ zyx .*").value() == "? xyz ?* zyx *"));
}

// Test unescaped meta characters
// Verifies that each unescaped regex metacharacter below makes the translation fail with its
// dedicated error code.
TEST_CASE("regex_to_wildcard_unescaped_metachar", "[regex_utils][re2wc][unescaped_metachar]") {
// `?` directly after `.`
REQUIRE((regex_to_wildcard(".? xyz .* zyx .").error() == ErrorCode::UnsupportedQuestionMark));
// `*` and `+` directly after a complete `.*`
REQUIRE((regex_to_wildcard(". xyz .** zyx .").error() == ErrorCode::UntranslatableStar));
REQUIRE((regex_to_wildcard(". xyz .*+ zyx .").error() == ErrorCode::UntranslatablePlus));
// `|` anywhere in the string
REQUIRE((regex_to_wildcard(". xyz |.* zyx .").error() == ErrorCode::UnsupportedPipe));
// Anchors in the middle of the string
REQUIRE((regex_to_wildcard(". xyz ^.* zyx .").error() == ErrorCode::IllegalCaret));
REQUIRE((regex_to_wildcard(". xyz $.* zyx .").error() == ErrorCode::IllegalDollarSign));
}

// Verifies the handling of backslash escape sequences.
TEST_CASE("regex_to_wildcard_escaped_metachar", "[regex_utils][re2wc][escaped_metachar]") {
// The escape backslash is superfluous for the following set of characters: the output is the same
// with or without it.
REQUIRE((regex_to_wildcard("<>-_/=!").value() == "<>-_/=!"));
REQUIRE((regex_to_wildcard("\\<\\>\\-\\_\\/\\=\\!").value() == "<>-_/=!"));
// Test the full set of supported escape sequences. Only `*`, `?` and `\` keep their backslash in
// the output; for every other character the backslash is dropped.
REQUIRE(
(regex_to_wildcard("\\*\\+\\?\\|\\^\\$\\.\\{\\}\\[\\]\\(\\)\\<\\>\\-\\_\\/\\=\\!\\\\")
.value()
== "\\*+\\?|^$.{}[]()<>-_/=!\\\\")
);
// Test unsupported escape sequences: a backslash followed by a letter (`\Q`, `\E`) is rejected
REQUIRE(
(regex_to_wildcard("abc\\Qdefghi\\Ejkl").error()
== clp::regex_utils::ErrorCode::IllegalEscapeSequence)
);
}

TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][regex_to_wildcard][anchor_config]") {
TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][re2wc][anchor_config]") {
// Test anchors and prefix/suffix wildcards
RegexToWildcardTranslatorConfig const config{false, true};
REQUIRE(((regex_to_wildcard("^", config).value() == "*")));
Expand Down
15 changes: 15 additions & 0 deletions docs/src/dev-guide/components-core/regex-utils.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,21 @@ For a detailed description on the options order and usage, see the
* Turn `.*` into `*`
* Turn `.+` into `?*`
* E.g. `abc.*def.ghi.+` will get translated to `abc*def?ghi?*`
* Metacharacter escape sequences
* An escaped regex metacharacter is treated as a literal and appended to the wildcard output.
* The list of characters that require escaping to have their special meanings suppressed is
`[\/^$.|?*+(){}`.
* Superfluous escape characters are ignored for the following characters: `]<>-_=!`.
* E.g. `a\[\+b\-\_c-_d` will get translated to `a[+b-_c-_d`
* Note: in general, any non-alphanumeric character can be escaped to use it as a literal. The
list of characters that this utility library supports is non-exhaustive and can be expanded when
necessary.
* For metacharacters shared by both syntaxes, keep the escape backslashes.
* The list of characters that fall into this category is `*?\`. All wildcard metacharacters are
also regex metacharacters.
* E.g. `a\*b\?c\\d` will get translated to `a\*b\?c\\d` (no change)
* Escape sequences with alphanumeric characters are disallowed.
* E.g. Special utility escape sequences `\Q`, `\E`, `\A` etc. and back references `\1` `\2` etc.
cannot be translated.
### Custom configuration
Expand Down

0 comments on commit 9f85a88

Please sign in to comment.