Skip to content

Commit

Permalink
Implement translator logic to reduce regex character sets into wildca…
Browse files Browse the repository at this point in the history
…rds.
  • Loading branch information
Bill-hbrhbr committed Jul 25, 2024
1 parent 9f85a88 commit e74f043
Show file tree
Hide file tree
Showing 6 changed files with 207 additions and 10 deletions.
6 changes: 6 additions & 0 deletions components/core/src/clp/regex_utils/ErrorCode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,12 @@ auto ErrorCodeCategory::message(int ev) const -> string {
case ErrorCode::UnmatchedParenthesis:
return "Unmatched opening `(` or closing `)`.";

case ErrorCode::IncompleteCharsetStructure:
return "Unmatched closing `]` at the end of the string.";

case ErrorCode::UnsupportedCharsetPattern:
return "Currently only supports character set that contains a single character.";

default:
return "(unrecognized error)";
}
Expand Down
2 changes: 2 additions & 0 deletions components/core/src/clp/regex_utils/ErrorCode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ enum class ErrorCode : uint8_t {
IllegalDollarSign,
IllegalEscapeSequence,
UnmatchedParenthesis,
IncompleteCharsetStructure,
UnsupportedCharsetPattern,
};

/**
Expand Down
3 changes: 3 additions & 0 deletions components/core/src/clp/regex_utils/constants.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ constexpr auto cRegexEscapeSeqMetaCharsLut = create_char_bit_array("*+?|^$.{}[](
// The set of wildcard metacharacters that must remain escaped in the translated string to be
// treated as a literal.
constexpr auto cWildcardMetaCharsLut = create_char_bit_array("?*\\");
// The set of metacharacters that may be preceded by an escape backslash inside a regex character
// set (`[...]`) so that they are treated as literals: `^`, `-`, `]`, and `\`.
constexpr auto cRegexCharsetEscapeSeqMetaCharsLut = create_char_bit_array("^-]\\");
} // namespace clp::regex_utils

#endif // CLP_REGEX_UTILS_CONSTANTS_HPP
152 changes: 145 additions & 7 deletions components/core/src/clp/regex_utils/regex_translation_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@
#include <system_error>

#include <outcome/single-header/outcome.hpp>
#include <string_utils/string_utils.hpp>

#include "regex_utils/constants.hpp"
#include "regex_utils/ErrorCode.hpp"
#include "regex_utils/RegexToWildcardTranslatorConfig.hpp"

namespace clp::regex_utils {
using clp::string_utils::is_alphabet;
using std::error_code;
using std::string;
using std::string_view;
Expand All @@ -31,6 +33,8 @@ class TranslatorState {
* literally.</li>
* <li>Dot: Encountered a period `.`. Expecting wildcard expression.</li>
* <li>Escaped: Encountered a backslash `\`. Expecting an escape sequence.</li>
* <li>Charset: Encountered an opening square bracket `[`. Expecting a character set.</li>
* <li>CharsetEscaped: Encountered an escape backslash in the character set.</li>
* <li>End: Encountered a dollar sign `$`, meaning the regex string has reached the end
* anchor.</li>
* </ul>
Expand All @@ -39,21 +43,28 @@ class TranslatorState {
Normal = 0,
Dot,
Escaped,
Charset,
CharsetEscaped,
End,
};

// Constructor
TranslatorState() = default;
TranslatorState(string_view::const_iterator it) : m_it{it} {};

// Getters
// Returns the stored pattern state (`m_state`).
[[nodiscard]] auto get_state() const -> RegexPatternState {
    return m_state;
}

// Returns the stored iterator (`m_it`).
[[nodiscard]] auto get_marked_iterator() const -> string_view::const_iterator {
    return m_it;
}

// Setters
// Stores `state` as the pattern state (`m_state`).
auto set_next_state(RegexPatternState const& state) -> void {
    m_state = state;
}

void mark_iterator(string_view::const_iterator it) { m_it = it; }

private:
// Members
RegexPatternState m_state{RegexPatternState::Normal};
string_view::const_iterator m_it;
};

/**
Expand All @@ -65,7 +76,7 @@ class TranslatorState {
* @param[in, out] it The iterator that represents the current regex string scan position. May be
* updated to advance or backtrack the scan position.
* @param[out] wildcard_str The translated wildcard string. May or may not be updated.
* @param[in] config The translator config predefined by the user.
* @return clp::regex_utils::ErrorCode
*/
using StateTransitionFuncSig
Expand Down Expand Up @@ -103,6 +114,25 @@ using StateTransitionFuncSig
*/
[[nodiscard]] StateTransitionFuncSig escaped_state_transition;

/**
* Attempts to reduce a regex character set into a single character so that the regex string is
* still translatable to a wildcard string.
*
* In most cases, only a trivial character set containing a single character (or a single escaped
* metacharacter) is reducible. However, if the output wildcard query will be analyzed in
* case-insensitive mode, character set patterns such as [aA] [Bb] are also reducible.
*
* Returns one of two possible error codes on failure, with IncompleteCharsetStructure having a
* higher precedence over UnsupportedCharsetPattern.
*/
[[nodiscard]] StateTransitionFuncSig charset_state_transition;

/**
* A transient state used to defer handling of escape sequences in a charset pattern.
*
* Consumes the character that follows an escape backslash without inspecting it and switches back
* to the Charset state. This allows the charset state to accurately capture the appearance of the
* closing bracket `]`, since an escaped `\]` is never seen by the charset state as a terminator.
*/
[[nodiscard]] StateTransitionFuncSig charsetescaped_state_transition;

/**
* Disallows the appearances of other characters after encountering an end anchor in the string.
*/
Expand All @@ -114,6 +144,36 @@ using StateTransitionFuncSig
*/
[[nodiscard]] StateTransitionFuncSig final_state_cleanup;

// Other helpers
/**
* Appends a single character as a literal to the wildcard string.
*
* If the literal is a metacharacter in the wildcard syntax, prepend the literal with an escape
* backslash.
* @param ch The literal to be appended.
* @param wildcard_str The wildcard string to be updated.
*/
inline auto append_single_char_to_wildcard(char const ch, string& wildcard_str) -> void {
if (cWildcardMetaCharsLut.at(ch)) {
wildcard_str += cEscapeChar;
}
wildcard_str += ch;
}

/**
 * Checks whether two characters are the upper and lowercase forms of the same letter.
 *
 * Both characters must satisfy `is_alphabet`, and their codes must differ by exactly
 * `'a' - 'A'` in either direction.
 * @param ch0
 * @param ch1
 * @return True if the two characters form a matching upper/lowercase pair.
 */
inline auto matching_upper_lower_case_char_pair(char const ch0, char const ch1) -> bool {
    if (false == is_alphabet(ch0) || false == is_alphabet(ch1)) {
        return false;
    }
    constexpr int cCaseOffset{'a' - 'A'};
    int const distance{ch0 - ch1};
    return cCaseOffset == distance || -cCaseOffset == distance;
}

auto normal_state_transition(
TranslatorState& state,
string_view::const_iterator& it,
Expand All @@ -128,6 +188,10 @@ auto normal_state_transition(
case cEscapeChar:
state.set_next_state(TranslatorState::RegexPatternState::Escaped);
break;
case '[':
state.mark_iterator(it + 1); // Mark the first character of character set
state.set_next_state(TranslatorState::RegexPatternState::Charset);
break;
case cRegexEndAnchor:
state.set_next_state(TranslatorState::RegexPatternState::End);
break;
Expand Down Expand Up @@ -183,14 +247,74 @@ auto escaped_state_transition(
if (false == cRegexEscapeSeqMetaCharsLut.at(ch)) {
return ErrorCode::IllegalEscapeSequence;
}
if (cWildcardMetaCharsLut.at(ch)) {
wildcard_str += cEscapeChar;
append_single_char_to_wildcard(ch, wildcard_str);
state.set_next_state(TranslatorState::RegexPatternState::Normal);
return ErrorCode::Success;
}

// Scans a regex character set and, once the closing bracket is reached, reduces the set to a
// single literal that is appended to the wildcard string.
//
// Accepted patterns:
// - a single character other than the negation character or the escape character;
// - an escape backslash followed by one of the charset metacharacters in
//   `cRegexCharsetEscapeSeqMetaCharsLut`;
// - when `config.case_insensitive_wildcard()` is true, an upper/lowercase pair of the same
//   letter, which is reduced to the lowercase letter.
//
// @param[in, out] state Supplies the marked iterator (first character of the set) and receives
// the next state.
// @param[in, out] it The current scan position. Not modified by this function.
// @param[out] wildcard_str Receives the reduced literal; untouched until the set is accepted.
// @param[in] config The translator config.
// @return ErrorCode::Success while scanning or after a successful reduction,
// ErrorCode::UnsupportedCharsetPattern if the set cannot be reduced.
auto charset_state_transition(
        TranslatorState& state,
        string_view::const_iterator& it,
        string& wildcard_str,
        RegexToWildcardTranslatorConfig const& config
) -> error_code {
    auto const ch{*it};
    if (']' != ch) {
        // Only process the charset once a closing bracket is reached. An escape backslash hands
        // the next character to the CharsetEscaped state so that `\]` does not end the set.
        if (cEscapeChar == ch) {
            state.set_next_state(TranslatorState::RegexPatternState::CharsetEscaped);
        }
        return ErrorCode::Success;
    }

    // `it` is at the closing bracket; the marked iterator is at the first character after `[`.
    auto const charset_start{state.get_marked_iterator()};
    auto const charset_len{it - charset_start};
    if (0 == charset_len || charset_len > 2) {
        // Does not support empty charset or pattern that is longer than two characters.
        return ErrorCode::UnsupportedCharsetPattern;
    }

    // Passed the length check. Now check for accepted charset patterns. Nothing is written to
    // `wildcard_str` before this point; in particular the closing bracket itself is never emitted.
    auto const ch0{*charset_start};
    char parsed_char{};
    if (1 == charset_len) {
        if (cCharsetNegate == ch0 || cEscapeChar == ch0) {
            return ErrorCode::UnsupportedCharsetPattern;
        }
        parsed_char = ch0;
    } else {  // 2 == charset_len
        auto const ch1{*(charset_start + 1)};
        if (cEscapeChar == ch0 && cRegexCharsetEscapeSeqMetaCharsLut.at(ch1)) {
            // 2-char escape sequence
            parsed_char = ch1;
        } else if (config.case_insensitive_wildcard()
                   && matching_upper_lower_case_char_pair(ch0, ch1))
        {
            // case-insensitive patterns like [aA] [Bb] etc.
            parsed_char = ch0 > ch1 ? ch0 : ch1;  // choose the lower case character
        } else {
            return ErrorCode::UnsupportedCharsetPattern;
        }
    }

    append_single_char_to_wildcard(parsed_char, wildcard_str);
    state.set_next_state(TranslatorState::RegexPatternState::Normal);
    return ErrorCode::Success;
}

// Consumes the character that follows an escape backslash inside a character set without
// inspecting it; the escape sequence is interpreted later when the whole set is analyzed.
auto charsetescaped_state_transition(
        TranslatorState& state,
        [[maybe_unused]] string_view::const_iterator& it,
        [[maybe_unused]] string& wildcard_str,
        [[maybe_unused]] RegexToWildcardTranslatorConfig const& config
) -> error_code {
    constexpr auto cResumeState{TranslatorState::RegexPatternState::Charset};
    state.set_next_state(cResumeState);
    return ErrorCode::Success;
}

auto end_state_transition(
[[maybe_unused]] TranslatorState& state,
string_view::const_iterator& it,
Expand All @@ -215,6 +339,10 @@ auto final_state_cleanup(
// multichar wildcard
wildcard_str += cSingleCharWildcard;
break;
case TranslatorState::RegexPatternState::Charset:
case TranslatorState::RegexPatternState::CharsetEscaped:
return ErrorCode::IncompleteCharsetStructure;
break;
default:
break;
}
Expand All @@ -226,10 +354,14 @@ auto final_state_cleanup(
}
return ErrorCode::Success;
}

} // namespace

// Translates `regex_str` using a translator config with both options disabled, by forwarding to
// the overload that takes an explicit config.
auto regex_to_wildcard(string_view regex_str) -> OUTCOME_V2_NAMESPACE::std_result<string> {
    return regex_to_wildcard(
            regex_str,
            {/*case_insensitive_wildcard=*/false, /*add_prefix_suffix_wildcards=*/false}
    );
}

auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig const& config)
Expand All @@ -238,9 +370,9 @@ auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig co
return string{};
}

TranslatorState state;
string_view::const_iterator it{regex_str.cbegin()};
string wildcard_str;
TranslatorState state{it};

// If there is no starting anchor character, append multichar wildcard prefix
if (cRegexStartAnchor == *it) {
Expand All @@ -261,6 +393,12 @@ auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig co
case TranslatorState::RegexPatternState::Escaped:
ec = escaped_state_transition(state, it, wildcard_str, config);
break;
case TranslatorState::RegexPatternState::Charset:
ec = charset_state_transition(state, it, wildcard_str, config);
break;
case TranslatorState::RegexPatternState::CharsetEscaped:
ec = charsetescaped_state_transition(state, it, wildcard_str, config);
break;
case TranslatorState::RegexPatternState::End:
ec = end_state_transition(state, it, wildcard_str, config);
break;
Expand Down
43 changes: 42 additions & 1 deletion components/core/tests/test-regex_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,50 @@ TEST_CASE("regex_to_wildcard_escaped_metachar", "[regex_utils][re2wc][escaped_me
);
}

TEST_CASE("regex_to_wildcard_charset", "[regex_utils][re2wc][charset]") {
// Single-character sets and single escaped metacharacters reduce to one literal. A literal
// backslash stays escaped in the wildcard output.
REQUIRE((regex_to_wildcard("x[y]z").value() == "xyz"));
REQUIRE((regex_to_wildcard("x[\\^]z").value() == "x^z"));
REQUIRE((regex_to_wildcard("x[\\]]z").value() == "x]z"));
REQUIRE((regex_to_wildcard("x[-]z").value() == "x-z"));
REQUIRE((regex_to_wildcard("x[\\-]z").value() == "x-z"));
REQUIRE((regex_to_wildcard("x[\\\\]z").value() == "x\\\\z"));
REQUIRE((regex_to_wildcard("[a][b][\\^][-][\\-][\\]][\\\\][c][d]").value() == "ab^--]\\\\cd"));

// Empty sets, ranges, negations, and multi-character sets are rejected. `[xX][yY]` is rejected
// here because the single-argument overload passes `false` for the case-insensitive option.
REQUIRE((regex_to_wildcard("x[]y").error() == ErrorCode::UnsupportedCharsetPattern));
REQUIRE((regex_to_wildcard("x[a-z]y").error() == ErrorCode::UnsupportedCharsetPattern));
REQUIRE((regex_to_wildcard("x[^^]y").error() == ErrorCode::UnsupportedCharsetPattern));
REQUIRE((regex_to_wildcard("x[^0-9]y").error() == ErrorCode::UnsupportedCharsetPattern));
REQUIRE((regex_to_wildcard("[xX][yY]").error() == ErrorCode::UnsupportedCharsetPattern));
REQUIRE((regex_to_wildcard("ch:[a-zA-Z0-9]").error() == ErrorCode::UnsupportedCharsetPattern));

// A character set that is never closed is reported as incomplete, including when the string
// ends on or right after an escape backslash.
REQUIRE((regex_to_wildcard("[\\").error() == ErrorCode::IncompleteCharsetStructure));
REQUIRE((regex_to_wildcard("[\\\\").error() == ErrorCode::IncompleteCharsetStructure));
REQUIRE((regex_to_wildcard("[xX").error() == ErrorCode::IncompleteCharsetStructure));
REQUIRE((regex_to_wildcard("ch:[a-zA-Z0-9").error() == ErrorCode::IncompleteCharsetStructure));
}

TEST_CASE("regex_to_wildcard_case_insensitive_config", "[regex_utils][re2wc][case_insensitive]") {
    RegexToWildcardTranslatorConfig const config{
            /*case_insensitive_wildcard=*/true,
            /*add_prefix_suffix_wildcards=*/false
    };

    // Upper/lowercase pairs reduce to the lowercase letter, and can be mixed with the
    // single-character and escaped-metacharacter sets.
    REQUIRE((regex_to_wildcard("[xX][yY]", config).value() == "xy"));
    REQUIRE((regex_to_wildcard("[Yy][Xx]", config).value() == "yx"));
    REQUIRE((regex_to_wildcard("[aA][Bb][Cc]", config).value() == "abc"));
    REQUIRE((regex_to_wildcard("[aA][Bb][\\^][-][\\]][Cc][dD]", config).value() == "ab^-]cd"));

    // Error cases are run with the case-insensitive config as well, so that they exercise the
    // configuration under test rather than repeating the default-config assertions.
    REQUIRE((regex_to_wildcard("[xX", config).error() == ErrorCode::IncompleteCharsetStructure));
    REQUIRE(
            (regex_to_wildcard("[aA][Bb][^[-[\\[Cc[dD", config).error()
             == ErrorCode::IncompleteCharsetStructure)
    );
    REQUIRE(
            (regex_to_wildcard("ch:[a-zA-Z0-9]", config).error()
             == ErrorCode::UnsupportedCharsetPattern)
    );
    REQUIRE(
            (regex_to_wildcard("[aA][Bb][^[-[\\[Cc[dD]", config).error()
             == ErrorCode::UnsupportedCharsetPattern)
    );
}

TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][re2wc][anchor_config]") {
// Test anchors and prefix/suffix wildcards
RegexToWildcardTranslatorConfig const config{false, true};
RegexToWildcardTranslatorConfig const config{false, /*add_prefix_suffix_wildcards=*/true};
REQUIRE(((regex_to_wildcard("^", config).value() == "*")));
REQUIRE((regex_to_wildcard("$", config).value() == "*"));
REQUIRE((regex_to_wildcard("^xyz$", config).value() == "xyz"));
Expand Down
11 changes: 9 additions & 2 deletions docs/src/dev-guide/components-core/regex-utils.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,21 @@ For a detailed description on the options order and usage, see the
* Escape sequences with alphanumeric characters are disallowed.
* E.g. Special utility escape sequences `\Q`, `\E`, `\A` etc. and back references `\1` `\2` etc.
cannot be translated.
* Character set
* Reduces a character set into a single character if possible.
* Supports a trivial character set containing a single character or a single escaped metacharacter.
* E.g. `[a]` into `a`, `[\^]` into `^`
* If the `case_insensitive_wildcard` config is turned on, the translator can also reduce the
following patterns into a single lowercase character:
* E.g. `[aA]` into `a`, `[Bb]` into `b`, `[xX][Yy][zZ]` into `xyz`
### Custom configuration
The `RegexToWildcardTranslatorConfig` class objects are currently immutable once instantiated. The
constructor takes the following arguments in order:
* `case_insensitive_wildcard`: to be added later along with the character set translation
implementation.
* `case_insensitive_wildcard`: see the *Character set* bullet point in the
  [Functionalities](#functionalities) section.
* `add_prefix_suffix_wildcards`: in the absence of regex anchors, add prefix or suffix wildcards so
the query becomes a substring query.
Expand Down

0 comments on commit e74f043

Please sign in to comment.