diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index c7dab9db..8a8aeb33 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -405,11 +405,17 @@ auto Lexer::epsilon_closure(NFAStateType const* stat } // TODO: currently treat tagged transitions as epsilon transitions - for (auto const& positive_tagged_transition : - current_state->get_positive_tagged_transitions()) + for (auto const& positive_tagged_start_transition : + current_state->get_positive_tagged_start_transitions()) { - stack.push(positive_tagged_transition.get_dest_state()); + stack.push(positive_tagged_start_transition.get_dest_state()); } + auto const& optional_positive_tagged_end_transition + = current_state->get_positive_tagged_end_transition(); + if (optional_positive_tagged_end_transition.has_value()) { + stack.push(optional_positive_tagged_end_transition.value().get_dest_state()); + } + auto const& optional_negative_tagged_transition = current_state->get_negative_tagged_transition(); if (optional_negative_tagged_transition.has_value()) { diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index c0c6b04f..beeb588e 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -693,11 +693,11 @@ class RegexASTCapture : public RegexAST { /** * Adds the needed `RegexNFA::states` to the passed in nfa to handle a - * `RegexASTCapture` before transitioning to an accepting `end_state`. + * `RegexASTCapture` before transitioning to a `dest_state`. * @param nfa - * @param end_state + * @param dest_state */ - auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; + auto add_to_nfa(RegexNFA* nfa, NFAStateType* dest_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; @@ -892,11 +892,51 @@ template } template -void RegexASTCapture::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) - const { - auto* state_with_positive_tagged_transition - = nfa->new_state_with_positive_tagged_transition(m_tag.get(), end_state); - m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, state_with_positive_tagged_transition); +auto RegexASTCapture::add_to_nfa( + RegexNFA* nfa, + NFAStateType* dest_state +) const -> void { + // TODO: move this into a documentation file in the future, and reference it here. + // The NFA constructed for a capture group follows the structure below, with tagged transitions + // explicitly labeled for clarity: + // +---------------------+ + // | `m_root` | + // +---------------------+ + // | `m_tag` start + // | (positive tagged start transition) + // v + // +---------------------+ + // |`capture_start_state`| + // +---------------------+ + // | + // | (epsilon transition) + // v + // +---------------------+ + // | `m_group_regex_ast` | + // | (nested NFA) | + // +---------------------+ + // | `m_negative_tags` + // | (negative tagged transition) + // v + // +---------------------+ + // | `capture_end_state` | + // +---------------------+ + // | `m_tag` end + // | (positive tagged end transition) + // v + // +---------------------+ + // | `dest_state` | + // +---------------------+ + auto [capture_start_state, capture_end_state] + = nfa->new_start_and_end_states_with_positive_tagged_transitions( + m_tag.get(), + dest_state + ); + + auto* initial_root = nfa->get_root(); + nfa->set_root(capture_start_state); + m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, capture_end_state); + nfa->set_root(initial_root); } template diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index 7919a0c6..ba9791b1 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -35,13 +35,13 @@ class RegexNFA { [[nodiscard]] auto new_state() -> NFAStateType*; /** - * Creates a unique_ptr for an NFA state with a positive tagged transition and adds it to + * Creates a unique_ptr for an NFA state with a positive tagged end transition and adds it to * `m_states`. * @param tag * @param dest_state - * @return NFAStateType* + * @return A new state with a positive tagged end transition to `dest_state`. */ - [[nodiscard]] auto new_state_with_positive_tagged_transition( + [[nodiscard]] auto new_state_with_positive_tagged_end_transition( Tag const* tag, NFAStateType const* dest_state ) -> NFAStateType*; @@ -58,6 +58,19 @@ class RegexNFA { NFAStateType const* dest_state ) -> NFAStateType*; + /** + * Creates the start and end states for a capture group. + * @param tag The tag associated with the capture group. + * @param dest_state + * @return A pair of states: + * - A new state with a positive tagged start transition from `m_root`. + * - A new state with a positive tagged end transition to `dest_state`. + */ + [[nodiscard]] auto new_start_and_end_states_with_positive_tagged_transitions( + Tag const* tag, + NFAStateType const* dest_state + ) -> std::pair; + /** * @return A vector representing the traversal order of the NFA states using breadth-first * search (BFS). @@ -101,7 +114,7 @@ auto RegexNFA::new_state() -> NFAStateType* { } template -auto RegexNFA::new_state_with_positive_tagged_transition( +auto RegexNFA::new_state_with_positive_tagged_end_transition( Tag const* tag, NFAStateType const* dest_state ) -> NFAStateType* { @@ -118,6 +131,18 @@ auto RegexNFA::new_state_with_negative_tagged_transition( return m_states.back().get(); } +template +auto RegexNFA::new_start_and_end_states_with_positive_tagged_transitions( + Tag const* tag, + NFAStateType const* dest_state +) -> std::pair { + auto* start_state = new_state(); + m_root->add_positive_tagged_start_transition(tag, start_state); + + auto* end_state = new_state_with_positive_tagged_end_transition(tag, dest_state); + return {start_state, end_state}; +} + template auto RegexNFA::get_bfs_traversal_order() const -> std::vector { std::queue state_queue; @@ -147,11 +172,19 @@ auto RegexNFA::get_bfs_traversal_order() const -> std::vectorget_epsilon_transitions()) { add_to_queue_and_visited(dest_state); } - for (auto const& positive_tagged_transition : - current_state->get_positive_tagged_transitions()) + for (auto const& positive_tagged_start_transition : + current_state->get_positive_tagged_start_transitions()) { - add_to_queue_and_visited(positive_tagged_transition.get_dest_state()); + add_to_queue_and_visited(positive_tagged_start_transition.get_dest_state()); + } + + auto const& optional_positive_tagged_end_transition + = current_state->get_positive_tagged_end_transition(); + if (optional_positive_tagged_end_transition.has_value()) { + add_to_queue_and_visited(optional_positive_tagged_end_transition.value().get_dest_state( + )); } + auto const& optional_negative_tagged_transition = current_state->get_negative_tagged_transition(); if (optional_negative_tagged_transition.has_value()) { diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/RegexNFAState.hpp index dd21557b..8fce8cf7 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexNFAState.hpp @@ -32,7 +32,7 @@ class RegexNFAState { RegexNFAState() = default; RegexNFAState(Tag const* tag, RegexNFAState const* dest_state) - : m_positive_tagged_transitions{{tag, dest_state}} {} + : m_positive_tagged_end_transition{PositiveTaggedTransition{tag, dest_state}} {} RegexNFAState(std::vector tags, RegexNFAState const* dest_state) : m_negative_tagged_transition{NegativeTaggedTransition{std::move(tags), dest_state}} {} @@ -49,9 +49,19 @@ class RegexNFAState { return m_matching_variable_id; } - [[nodiscard]] auto get_positive_tagged_transitions( + auto + add_positive_tagged_start_transition(Tag const* tag, RegexNFAState const* dest_state) -> void { + m_positive_tagged_start_transitions.emplace_back(tag, dest_state); + } + + [[nodiscard]] auto get_positive_tagged_start_transitions( ) const -> std::vector> const& { - return m_positive_tagged_transitions; + return m_positive_tagged_start_transitions; + } + + [[nodiscard]] auto get_positive_tagged_end_transition( + ) const -> std::optional> const& { + return m_positive_tagged_end_transition; } [[nodiscard]] auto get_negative_tagged_transition( @@ -100,7 +110,8 @@ class RegexNFAState { private: bool m_accepting{false}; uint32_t m_matching_variable_id{0}; - std::vector> m_positive_tagged_transitions; + std::vector> m_positive_tagged_start_transitions; + std::optional> m_positive_tagged_end_transition; std::optional> m_negative_tagged_transition; std::vector m_epsilon_transitions; std::array, cSizeOfByte> m_bytes_transitions; @@ -176,14 +187,27 @@ auto RegexNFAState::serialize( epsilon_transitions.emplace_back(std::to_string(state_ids.at(dest_state))); } - std::vector positive_tagged_transitions; - for (auto const& positive_tagged_transition : m_positive_tagged_transitions) { - auto const optional_serialized_positive_transition - = positive_tagged_transition.serialize(state_ids); - if (false == optional_serialized_positive_transition.has_value()) { + std::vector serialized_positive_tagged_start_transitions; + for (auto const& positive_tagged_start_transition : m_positive_tagged_start_transitions) { + auto const optional_serialized_positive_start_transition + = positive_tagged_start_transition.serialize(state_ids); + if (false == optional_serialized_positive_start_transition.has_value()) { + return std::nullopt; + } + serialized_positive_tagged_start_transitions.emplace_back( + optional_serialized_positive_start_transition.value() + ); + } + + std::string serialized_positive_tagged_end_transition; + if (m_positive_tagged_end_transition.has_value()) { + auto const optional_serialized_positive_end_transition + = m_positive_tagged_end_transition.value().serialize(state_ids); + if (false == optional_serialized_positive_end_transition.has_value()) { return std::nullopt; } - positive_tagged_transitions.emplace_back(optional_serialized_positive_transition.value()); + serialized_positive_tagged_end_transition + = optional_serialized_positive_end_transition.value(); } std::string negative_tagged_transition_string; @@ -200,13 +224,15 @@ auto RegexNFAState::serialize( = m_accepting ? fmt::format("accepting_tag={},", m_matching_variable_id) : ""; return fmt::format( - "{}:{}byte_transitions={{{}}},epsilon_transitions={{{}}},positive_tagged_transitions={{" - "{}}},negative_tagged_transition={{{}}}", + "{}:{}byte_transitions={{{}}},epsilon_transitions={{{}}},positive_tagged_start_" + "transitions={{{}}},positive_tagged_end_transitions={{{}}},negative_tagged_transition={" + "{{}}}", state_ids.at(this), accepting_tag_string, fmt::join(byte_transitions, ","), fmt::join(epsilon_transitions, ","), - fmt::join(positive_tagged_transitions, ","), + fmt::join(serialized_positive_tagged_start_transitions, ","), + serialized_positive_tagged_end_transition, negative_tagged_transition_string ); } diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index c7a599b2..6a92f4bb 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -49,58 +49,91 @@ TEST_CASE("Test NFA", "[NFA]") { // Compare against expected output string expected_serialized_nfa = "0:byte_transitions={A-->1,Z-->2}," "epsilon_transitions={}," - "positive_tagged_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; - expected_serialized_nfa += "1:byte_transitions={a-->3,b-->3,c-->4,d-->4}," + expected_serialized_nfa += "1:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={}," + "positive_tagged_start_transitions={3[letter]}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; expected_serialized_nfa += "2:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={}," - "negative_tagged_transition={5[letter1,letter2,letter,containerID]}\n"; + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," + "negative_tagged_transition={4[letter1,letter2,letter,containerID]}\n"; expected_serialized_nfa += "3:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={6[letter1]}," + "positive_tagged_start_transitions={5[letter1],6[letter2]}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; - expected_serialized_nfa += "4:byte_transitions={}," + expected_serialized_nfa += "4:accepting_tag=0,byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={7[letter2]}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; - expected_serialized_nfa += "5:accepting_tag=0,byte_transitions={}," + expected_serialized_nfa += "5:byte_transitions={a-->7,b-->7}," "epsilon_transitions={}," - "positive_tagged_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; - expected_serialized_nfa += "6:byte_transitions={}," + expected_serialized_nfa += "6:byte_transitions={c-->8,d-->8}," "epsilon_transitions={}," - "positive_tagged_transitions={}," - "negative_tagged_transition={8[letter2]}\n"; + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," + "negative_tagged_transition={}\n"; expected_serialized_nfa += "7:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={}," - "negative_tagged_transition={8[letter1]}\n"; + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={9[letter1]}," + "negative_tagged_transition={}\n"; expected_serialized_nfa += "8:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={9[letter]}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={10[letter2]}," + "negative_tagged_transition={}\n"; + expected_serialized_nfa += "9:byte_transitions={}," + "epsilon_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," + "negative_tagged_transition={11[letter2]}\n"; + expected_serialized_nfa += "10:byte_transitions={}," + "epsilon_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," + "negative_tagged_transition={11[letter1]}\n"; + expected_serialized_nfa += "11:byte_transitions={}," + "epsilon_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={12[letter]}," + "negative_tagged_transition={}\n"; + expected_serialized_nfa += "12:byte_transitions={B-->13}," + "epsilon_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; - expected_serialized_nfa += "9:byte_transitions={B-->10}," + expected_serialized_nfa += "13:byte_transitions={}," "epsilon_transitions={}," - "positive_tagged_transitions={}," + "positive_tagged_start_transitions={14[containerID]}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; - expected_serialized_nfa += "10:byte_transitions={0-->11,1-->11,2-->11,3-->11,4-->11,5-->11,6-->" - "11,7-->11,8-->11,9-->11}," + expected_serialized_nfa += "14:byte_transitions={0-->15,1-->15,2-->15,3-->15,4-->15,5-->15,6-->" + "15,7-->15,8-->15,9-->15}," "epsilon_transitions={}," - "positive_tagged_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; - expected_serialized_nfa += "11:byte_transitions={0-->11,1-->11,2-->11,3-->11,4-->11,5-->11,6-->" - "11,7-->11,8-->11,9-->11}," + expected_serialized_nfa += "15:byte_transitions={0-->15,1-->15,2-->15,3-->15,4-->15,5-->15,6-->" + "15,7-->15,8-->15,9-->15}," "epsilon_transitions={}," - "positive_tagged_transitions={12[containerID]}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={16[containerID]}," "negative_tagged_transition={}\n"; - expected_serialized_nfa += "12:byte_transitions={C-->5}," + expected_serialized_nfa += "16:byte_transitions={C-->4}," "epsilon_transitions={}," - "positive_tagged_transitions={}," + "positive_tagged_start_transitions={}," + "positive_tagged_end_transitions={}," "negative_tagged_transition={}\n"; // Compare expected and actual line-by-line