diff --git a/CMakeLists.txt b/CMakeLists.txt index 117cde51..ac30e1c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,7 +97,9 @@ set(SOURCE_FILES src/log_surgeon/finite_automata/PrefixTree.hpp src/log_surgeon/finite_automata/RegexAST.hpp src/log_surgeon/finite_automata/RegexDFA.hpp - src/log_surgeon/finite_automata/RegexDFA.tpp + src/log_surgeon/finite_automata/RegexDFAState.hpp + src/log_surgeon/finite_automata/RegexDFAStatePair.hpp + src/log_surgeon/finite_automata/RegexDFAStateType.hpp src/log_surgeon/finite_automata/RegexNFA.hpp src/log_surgeon/finite_automata/RegexNFAState.hpp src/log_surgeon/finite_automata/RegexNFAStateType.hpp diff --git a/examples/intersect-test.cpp b/examples/intersect-test.cpp index a5d0e433..19d696b7 100644 --- a/examples/intersect-test.cpp +++ b/examples/intersect-test.cpp @@ -42,7 +42,7 @@ auto get_intersect_for_query( } RegexNFA nfa(std::move(rules)); auto dfa2 = ByteLexer::nfa_to_dfa(nfa); - auto schema_types = dfa1->get_intersect(dfa2); + auto schema_types = dfa1->get_intersect(dfa2.get()); std::cout << search_string << ":"; for (auto const& schema_type : schema_types) { std::cout << m_id_symbol[schema_type] << ","; diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index ddb12cfa..726ff68f 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/src/log_surgeon/finite_automata/RegexDFA.hpp b/src/log_surgeon/finite_automata/RegexDFA.hpp index b549d7c1..3e8ad149 100644 --- a/src/log_surgeon/finite_automata/RegexDFA.hpp +++ b/src/log_surgeon/finite_automata/RegexDFA.hpp @@ -1,126 +1,22 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_HPP #define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_HPP -#include #include #include #include -#include #include -#include -#include -#include +#include namespace log_surgeon::finite_automata { -enum class RegexDFAStateType { - Byte, - UTF8 -}; - -template -class RegexDFAState 
{ -public: - using Tree = UnicodeIntervalTree*>; - - auto add_matching_variable_id(uint32_t const variable_id) -> void { - m_matching_variable_ids.push_back(variable_id); - } - - [[nodiscard]] auto get_matching_variable_ids() const -> std::vector const& { - return m_matching_variable_ids; - } - - [[nodiscard]] auto is_accepting() const -> bool { return !m_matching_variable_ids.empty(); } - - auto add_byte_transition(uint8_t const& byte, RegexDFAState* dest_state) -> void { - m_bytes_transition[byte] = dest_state; - } - - /** - * Returns the next state the DFA transitions to on input character (byte or - * utf8) - * @param character - * @return RegexDFAState* - */ - [[nodiscard]] auto next(uint32_t character) const -> RegexDFAState*; - -private: - std::vector m_matching_variable_ids; - RegexDFAState* m_bytes_transition[cSizeOfByte]; - // NOTE: We don't need m_tree_transitions for the `stateType == - // RegexDFAStateType::Byte` case, so we use an empty class (`std::tuple<>`) - // in that case. - std::conditional_t> m_tree_transitions; -}; - -/** - * Class for a pair of DFA states, where each state in the pair belongs to a different DFA. - * This class is used to facilitate the construction of an intersection DFA from two separate DFAs. - * Each instance represents a state in the intersection DFA and follows these rules: - * - * - A pair is considered accepting if both states are accepting in their respective DFAs. - * - A pair is considered reachable if both its states are reachable in their respective DFAs - * from this pair's states. - * - * NOTE: Only the first state in the pair contains the variable types matched by the pair. 
- */ -template -class RegexDFAStatePair { -public: - RegexDFAStatePair(DFAState const* state1, DFAState const* state2) - : m_state1(state1), - m_state2(state2) {}; - - /** - * Used for ordering in a set by considering the states' addresses - * @param rhs - * @return Whether m_state1 in lhs has a lower address than in rhs, or if they're equal, - * whether m_state2 in lhs has a lower address than in rhs - */ - auto operator<(RegexDFAStatePair const& rhs) const -> bool { - if (m_state1 == rhs.m_state1) { - return m_state2 < rhs.m_state2; - } - return m_state1 < rhs.m_state1; - } - - /** - * Generates all pairs reachable from the current pair via any string and store any reachable - * pair not previously visited in unvisited_pairs - * @param visited_pairs Previously visited pairs - * @param unvisited_pairs Set to add unvisited reachable pairs - */ - auto get_reachable_pairs( - std::set>& visited_pairs, - std::set>& unvisited_pairs - ) const -> void; - - [[nodiscard]] auto is_accepting() const -> bool { - return m_state1->is_accepting() && m_state2->is_accepting(); - } - - [[nodiscard]] auto get_matching_variable_ids() const -> std::vector const& { - return m_state1->get_matching_variable_ids(); - } - -private: - DFAState const* m_state1; - DFAState const* m_state2; -}; - -using RegexDFAByteState = RegexDFAState; -using RegexDFAUTF8State = RegexDFAState; - // TODO: rename `RegexDFA` to `DFA` template class RegexDFA { public: /** - * Creates a new DFA state based on a set of NFA states and adds it to - * m_states - * @param nfa_state_set - * @return DFAStateType* + * Creates a new DFA state based on a set of NFA states and adds it to `m_states`. + * @param nfa_state_set The set of NFA states represented by this DFA state. + * @return A pointer to the new DFA state. 
*/ template auto new_state(std::set const& nfa_state_set) -> DFAStateType*; @@ -128,22 +24,52 @@ class RegexDFA { auto get_root() const -> DFAStateType const* { return m_states.at(0).get(); } /** - * Compares this dfa with dfa_in to determine the set of schema types in - * this dfa that are reachable by any type in dfa_in. A type is considered - * reachable if there is at least one string for which: (1) this dfa returns - * a set of types containing the type, and (2) dfa_in returns any non-empty - * set of types. - * @param dfa_in - * @return The set of schema types reachable by dfa_in + * Compares this dfa with `dfa_in` to determine the set of schema types in this dfa that are + * reachable by any type in `dfa_in`. A type is considered reachable if there is at least one + * string for which: (1) this dfa returns a set of types containing the type, and (2) `dfa_in` + * returns any non-empty set of types. + * @param dfa_in The dfa with which to take the intersect. + * @return The set of schema types reachable by `dfa_in`. 
*/ - [[nodiscard]] auto get_intersect(std::unique_ptr const& dfa_in - ) const -> std::set; + [[nodiscard]] auto get_intersect(RegexDFA const* dfa_in) const -> std::set; private: std::vector> m_states; }; -} // namespace log_surgeon::finite_automata -#include "RegexDFA.tpp" +template +template +auto RegexDFA::new_state(std::set const& nfa_state_set +) -> DFAStateType* { + m_states.emplace_back(std::make_unique()); + auto* dfa_state = m_states.back().get(); + for (auto const* nfa_state : nfa_state_set) { + if (nfa_state->is_accepting()) { + dfa_state->add_matching_variable_id(nfa_state->get_matching_variable_id()); + } + } + return dfa_state; +} + +template +auto RegexDFA::get_intersect(RegexDFA const* dfa_in) const -> std::set { + std::set schema_types; + std::set> unvisited_pairs; + std::set> visited_pairs; + unvisited_pairs.emplace(this->get_root(), dfa_in->get_root()); + // TODO: Handle UTF-8 (multi-byte transitions) as well + while (false == unvisited_pairs.empty()) { + auto current_pair_it = unvisited_pairs.begin(); + if (current_pair_it->is_accepting()) { + auto const& matching_variable_ids = current_pair_it->get_matching_variable_ids(); + schema_types.insert(matching_variable_ids.cbegin(), matching_variable_ids.cend()); + } + visited_pairs.insert(*current_pair_it); + current_pair_it->get_reachable_pairs(visited_pairs, unvisited_pairs); + unvisited_pairs.erase(current_pair_it); + } + return schema_types; +} +} // namespace log_surgeon::finite_automata #endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_HPP diff --git a/src/log_surgeon/finite_automata/RegexDFA.tpp b/src/log_surgeon/finite_automata/RegexDFA.tpp deleted file mode 100644 index 458a5565..00000000 --- a/src/log_surgeon/finite_automata/RegexDFA.tpp +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_TPP -#define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_TPP - -namespace log_surgeon::finite_automata { -template -auto RegexDFAState::next(uint32_t character) const -> RegexDFAState* { - 
if constexpr (RegexDFAStateType::Byte == stateType) { - return m_bytes_transition[character]; - } else { - if (character < cSizeOfByte) { - return m_bytes_transition[character]; - } - std::unique_ptr> result - = m_tree_transitions.find(Interval(character, character)); - assert(result->size() <= 1); - if (!result->empty()) { - return result->front().m_value; - } - return nullptr; - } -} - -template -auto RegexDFAStatePair::get_reachable_pairs( - std::set>& visited_pairs, - std::set>& unvisited_pairs -) const -> void { - // TODO: Handle UTF-8 (multi-byte transitions) as well - for (uint32_t i = 0; i < cSizeOfByte; i++) { - auto next_state1 = m_state1->next(i); - auto next_state2 = m_state2->next(i); - if (next_state1 != nullptr && next_state2 != nullptr) { - RegexDFAStatePair reachable_pair{next_state1, next_state2}; - if (visited_pairs.count(reachable_pair) == 0) { - unvisited_pairs.insert(reachable_pair); - } - } - } -} - -template -template -auto RegexDFA::new_state(std::set const& nfa_state_set -) -> DFAStateType* { - m_states.emplace_back(std::make_unique()); - auto* dfa_state = m_states.back().get(); - for (auto const* nfa_state : nfa_state_set) { - if (nfa_state->is_accepting()) { - dfa_state->add_matching_variable_id(nfa_state->get_matching_variable_id()); - } - } - return dfa_state; -} - -template -auto RegexDFA::get_intersect(std::unique_ptr const& dfa_in -) const -> std::set { - std::set schema_types; - std::set> unvisited_pairs; - std::set> visited_pairs; - unvisited_pairs.emplace(this->get_root(), dfa_in->get_root()); - // TODO: Handle UTF-8 (multi-byte transitions) as well - while (false == unvisited_pairs.empty()) { - auto current_pair_it = unvisited_pairs.begin(); - if (current_pair_it->is_accepting()) { - auto const& matching_variable_ids = current_pair_it->get_matching_variable_ids(); - schema_types.insert(matching_variable_ids.cbegin(), matching_variable_ids.cend()); - } - visited_pairs.insert(*current_pair_it); - 
current_pair_it->get_reachable_pairs(visited_pairs, unvisited_pairs); - unvisited_pairs.erase(current_pair_it); - } - return schema_types; -} -} // namespace log_surgeon::finite_automata - -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_TPP diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp new file mode 100644 index 00000000..3c0ef4ca --- /dev/null +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -0,0 +1,80 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE +#define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace log_surgeon::finite_automata { +template +class RegexDFAState; + +using RegexDFAByteState = RegexDFAState; +using RegexDFAUTF8State = RegexDFAState; + +template +class RegexDFAState { +public: + using Tree = UnicodeIntervalTree*>; + + RegexDFAState() { + std::fill(std::begin(m_bytes_transition), std::end(m_bytes_transition), nullptr); + } + + auto add_matching_variable_id(uint32_t const variable_id) -> void { + m_matching_variable_ids.push_back(variable_id); + } + + [[nodiscard]] auto get_matching_variable_ids() const -> std::vector const& { + return m_matching_variable_ids; + } + + [[nodiscard]] auto is_accepting() const -> bool { + return false == m_matching_variable_ids.empty(); + } + + auto add_byte_transition(uint8_t const& byte, RegexDFAState* dest_state) -> void { + m_bytes_transition[byte] = dest_state; + } + + /** + * @param character The character (byte or utf8) to transition on. + * @return A pointer to the DFA state reached after transitioning on `character`. 
+ */ + [[nodiscard]] auto next(uint32_t character) const -> RegexDFAState*; + +private: + std::vector m_matching_variable_ids; + RegexDFAState* m_bytes_transition[cSizeOfByte]; + // NOTE: We don't need m_tree_transitions for the `stateType == RegexDFAStateType::Byte` case, + // so we use an empty class (`std::tuple<>`) in that case. + std::conditional_t> m_tree_transitions; +}; + +template +auto RegexDFAState::next(uint32_t character) const -> RegexDFAState* { + if constexpr (RegexDFAStateType::Byte == stateType) { + return m_bytes_transition[character]; + } else { + if (character < cSizeOfByte) { + return m_bytes_transition[character]; + } + std::unique_ptr> result + = m_tree_transitions.find(Interval(character, character)); + assert(result->size() <= 1); + if (false == result->empty()) { + return result->front().m_value; + } + return nullptr; + } +} +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE diff --git a/src/log_surgeon/finite_automata/RegexDFAStatePair.hpp b/src/log_surgeon/finite_automata/RegexDFAStatePair.hpp new file mode 100644 index 00000000..208a3e81 --- /dev/null +++ b/src/log_surgeon/finite_automata/RegexDFAStatePair.hpp @@ -0,0 +1,85 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_PAIR +#define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_PAIR + +#include +#include +#include + +#include + +namespace log_surgeon::finite_automata { +/** + * Class for a pair of DFA states, where each state in the pair belongs to a different DFA. + * This class is used to facilitate the construction of an intersection DFA from two separate DFAs. + * Each instance represents a state in the intersection DFA and follows these rules: + * + * - A pair is considered accepting if both states are accepting in their respective DFAs. + * - A pair is considered reachable if both its states are reachable in their respective DFAs + * from this pair's states. 
+ * + * NOTE: Only the first state in the pair contains the variable types matched by the pair. + */ +template +class RegexDFAStatePair { +public: + RegexDFAStatePair(DFAState const* state1, DFAState const* state2) + : m_state1(state1), + m_state2(state2) {}; + + /** + * Used for ordering in a set by considering the states' addresses. + * @param rhs The pair to compare against. + * @return Whether m_state1 in lhs has a lower address than in rhs, or if they're equal, + * whether m_state2 in lhs has a lower address than in rhs. + */ + auto operator<(RegexDFAStatePair const& rhs) const -> bool { + if (m_state1 == rhs.m_state1) { + return m_state2 < rhs.m_state2; + } + return m_state1 < rhs.m_state1; + } + + /** + * Finds every pair reachable from the current pair via one byte transition and stores each + * pair not already in `visited_pairs` into `unvisited_pairs`. + * @param visited_pairs Previously visited pairs; only read to filter out known pairs. + * @param unvisited_pairs Set that receives the newly found reachable pairs. + */ + auto get_reachable_pairs( + std::set>& visited_pairs, + std::set>& unvisited_pairs + ) const -> void; + + [[nodiscard]] auto is_accepting() const -> bool { + return m_state1->is_accepting() && m_state2->is_accepting(); + } + + [[nodiscard]] auto get_matching_variable_ids() const -> std::vector const& { + return m_state1->get_matching_variable_ids(); + } + +private: + DFAState const* m_state1; + DFAState const* m_state2; +}; + +template +auto RegexDFAStatePair::get_reachable_pairs( + std::set>& visited_pairs, + std::set>& unvisited_pairs +) const -> void { + // TODO: Handle UTF-8 (multi-byte transitions) as well + for (uint32_t i = 0; i < cSizeOfByte; i++) { + auto next_state1 = m_state1->next(i); + auto next_state2 = m_state2->next(i); + if (next_state1 != nullptr && next_state2 != nullptr) { + RegexDFAStatePair reachable_pair{next_state1, next_state2}; + if (visited_pairs.count(reachable_pair) == 0) { + unvisited_pairs.insert(reachable_pair); + } + } + } +} +} // namespace log_surgeon::finite_automata + +#endif //
LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_PAIR diff --git a/src/log_surgeon/finite_automata/RegexDFAStateType.hpp b/src/log_surgeon/finite_automata/RegexDFAStateType.hpp new file mode 100644 index 00000000..ae4e52d4 --- /dev/null +++ b/src/log_surgeon/finite_automata/RegexDFAStateType.hpp @@ -0,0 +1,13 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_TYPE +#define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_TYPE + +#include + +namespace log_surgeon::finite_automata { +enum class RegexDFAStateType : uint8_t { + Byte, + UTF8 +}; +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_TYPE