Skip to content

Commit

Permalink
feat: Split NFA positive tags into start and end transitions to encapsulate a capture group. (#50)
Browse files Browse the repository at this point in the history

Co-authored-by: Lin Zhihao <[email protected]>
  • Loading branch information
SharafMohamed and LinZhihao-723 authored Nov 21, 2024
1 parent 5bb1916 commit 3f13224
Show file tree
Hide file tree
Showing 5 changed files with 195 additions and 57 deletions.
12 changes: 9 additions & 3 deletions src/log_surgeon/Lexer.tpp
Original file line number Diff line number Diff line change
Expand Up @@ -405,11 +405,17 @@ auto Lexer<NFAStateType, DFAStateType>::epsilon_closure(NFAStateType const* stat
}

// TODO: currently treat tagged transitions as epsilon transitions
for (auto const& positive_tagged_transition :
current_state->get_positive_tagged_transitions())
for (auto const& positive_tagged_start_transition :
current_state->get_positive_tagged_start_transitions())
{
stack.push(positive_tagged_transition.get_dest_state());
stack.push(positive_tagged_start_transition.get_dest_state());
}
auto const& optional_positive_tagged_end_transition
= current_state->get_positive_tagged_end_transition();
if (optional_positive_tagged_end_transition.has_value()) {
stack.push(optional_positive_tagged_end_transition.value().get_dest_state());
}

auto const& optional_negative_tagged_transition
= current_state->get_negative_tagged_transition();
if (optional_negative_tagged_transition.has_value()) {
Expand Down
56 changes: 48 additions & 8 deletions src/log_surgeon/finite_automata/RegexAST.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -693,11 +693,11 @@ class RegexASTCapture : public RegexAST<NFAStateType> {

/**
* Adds the needed `RegexNFA::states` to the passed in nfa to handle a
* `RegexASTCapture` before transitioning to an accepting `end_state`.
* `RegexASTCapture` before transitioning to a `dest_state`.
* @param nfa
* @param end_state
* @param dest_state
*/
auto add_to_nfa(RegexNFA<NFAStateType>* nfa, NFAStateType* end_state) const -> void override;
auto add_to_nfa(RegexNFA<NFAStateType>* nfa, NFAStateType* dest_state) const -> void override;

[[nodiscard]] auto serialize() const -> std::u32string override;

Expand Down Expand Up @@ -892,11 +892,51 @@ template <typename NFAStateType>
}

template <typename NFAStateType>
void RegexASTCapture<NFAStateType>::add_to_nfa(RegexNFA<NFAStateType>* nfa, NFAStateType* end_state)
const {
auto* state_with_positive_tagged_transition
= nfa->new_state_with_positive_tagged_transition(m_tag.get(), end_state);
m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, state_with_positive_tagged_transition);
auto RegexASTCapture<NFAStateType>::add_to_nfa(
RegexNFA<NFAStateType>* nfa,
NFAStateType* dest_state
) const -> void {
// TODO: move this into a documentation file in the future, and reference it here.
// The NFA constructed for a capture group follows the structure below, with tagged transitions
// explicitly labeled for clarity:
// +---------------------+
// | `m_root` |
// +---------------------+
// | `m_tag` start
// | (positive tagged start transition)
// v
// +---------------------+
// |`capture_start_state`|
// +---------------------+
// |
// | (epsilon transition)
// v
// +---------------------+
// | `m_group_regex_ast` |
// | (nested NFA) |
// +---------------------+
// | `m_negative_tags`
// | (negative tagged transition)
// v
// +---------------------+
// | `capture_end_state` |
// +---------------------+
// | `m_tag` end
// | (positive tagged end transition)
// v
// +---------------------+
// | `dest_state` |
// +---------------------+
auto [capture_start_state, capture_end_state]
= nfa->new_start_and_end_states_with_positive_tagged_transitions(
m_tag.get(),
dest_state
);

auto* initial_root = nfa->get_root();
nfa->set_root(capture_start_state);
m_group_regex_ast->add_to_nfa_with_negative_tags(nfa, capture_end_state);
nfa->set_root(initial_root);
}

template <typename NFAStateType>
Expand Down
47 changes: 40 additions & 7 deletions src/log_surgeon/finite_automata/RegexNFA.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,13 @@ class RegexNFA {
[[nodiscard]] auto new_state() -> NFAStateType*;

/**
* Creates a unique_ptr for an NFA state with a positive tagged transition and adds it to
* Creates a unique_ptr for an NFA state with a positive tagged end transition and adds it to
* `m_states`.
* @param tag
* @param dest_state
* @return NFAStateType*
* @return A new state with a positive tagged end transition to `dest_state`.
*/
[[nodiscard]] auto new_state_with_positive_tagged_transition(
[[nodiscard]] auto new_state_with_positive_tagged_end_transition(
Tag const* tag,
NFAStateType const* dest_state
) -> NFAStateType*;
Expand All @@ -58,6 +58,19 @@ class RegexNFA {
NFAStateType const* dest_state
) -> NFAStateType*;

/**
* Creates the start and end states for a capture group.
* @param tag The tag associated with the capture group.
* @param dest_state The state that the positive tagged end transition leads to.
* @return A pair of states:
* - A new state with a positive tagged start transition from `m_root`.
* - A new state with a positive tagged end transition to `dest_state`.
*/
[[nodiscard]] auto new_start_and_end_states_with_positive_tagged_transitions(
Tag const* tag,
NFAStateType const* dest_state
) -> std::pair<NFAStateType*, NFAStateType*>;

/**
* @return A vector representing the traversal order of the NFA states using breadth-first
* search (BFS).
Expand Down Expand Up @@ -101,7 +114,7 @@ auto RegexNFA<NFAStateType>::new_state() -> NFAStateType* {
}

template <typename NFAStateType>
auto RegexNFA<NFAStateType>::new_state_with_positive_tagged_transition(
auto RegexNFA<NFAStateType>::new_state_with_positive_tagged_end_transition(
Tag const* tag,
NFAStateType const* dest_state
) -> NFAStateType* {
Expand All @@ -118,6 +131,18 @@ auto RegexNFA<NFAStateType>::new_state_with_negative_tagged_transition(
return m_states.back().get();
}

template <typename NFAStateType>
auto RegexNFA<NFAStateType>::new_start_and_end_states_with_positive_tagged_transitions(
        Tag const* tag,
        NFAStateType const* dest_state
) -> std::pair<NFAStateType*, NFAStateType*> {
    // Entry side of the capture group: a fresh state reached from the current root through a
    // positive tagged start transition labeled with `tag`.
    // NOTE(review): dereferences `m_root` unconditionally; assumes the root is already set --
    // confirm against callers.
    NFAStateType* capture_start_state = new_state();
    m_root->add_positive_tagged_start_transition(tag, capture_start_state);

    // Exit side of the capture group: a fresh state whose positive tagged end transition (also
    // labeled with `tag`) leads to `dest_state`. Created after the start state, as before.
    NFAStateType* capture_end_state
            = new_state_with_positive_tagged_end_transition(tag, dest_state);

    return std::pair<NFAStateType*, NFAStateType*>{capture_start_state, capture_end_state};
}

template <typename NFAStateType>
auto RegexNFA<NFAStateType>::get_bfs_traversal_order() const -> std::vector<NFAStateType const*> {
std::queue<NFAStateType const*> state_queue;
Expand Down Expand Up @@ -147,11 +172,19 @@ auto RegexNFA<NFAStateType>::get_bfs_traversal_order() const -> std::vector<NFAS
for (auto const* dest_state : current_state->get_epsilon_transitions()) {
add_to_queue_and_visited(dest_state);
}
for (auto const& positive_tagged_transition :
current_state->get_positive_tagged_transitions())
for (auto const& positive_tagged_start_transition :
current_state->get_positive_tagged_start_transitions())
{
add_to_queue_and_visited(positive_tagged_transition.get_dest_state());
add_to_queue_and_visited(positive_tagged_start_transition.get_dest_state());
}

auto const& optional_positive_tagged_end_transition
= current_state->get_positive_tagged_end_transition();
if (optional_positive_tagged_end_transition.has_value()) {
add_to_queue_and_visited(optional_positive_tagged_end_transition.value().get_dest_state(
));
}

auto const& optional_negative_tagged_transition
= current_state->get_negative_tagged_transition();
if (optional_negative_tagged_transition.has_value()) {
Expand Down
52 changes: 39 additions & 13 deletions src/log_surgeon/finite_automata/RegexNFAState.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class RegexNFAState {
RegexNFAState() = default;

RegexNFAState(Tag const* tag, RegexNFAState const* dest_state)
: m_positive_tagged_transitions{{tag, dest_state}} {}
: m_positive_tagged_end_transition{PositiveTaggedTransition{tag, dest_state}} {}

RegexNFAState(std::vector<Tag const*> tags, RegexNFAState const* dest_state)
: m_negative_tagged_transition{NegativeTaggedTransition{std::move(tags), dest_state}} {}
Expand All @@ -49,9 +49,19 @@ class RegexNFAState {
return m_matching_variable_id;
}

[[nodiscard]] auto get_positive_tagged_transitions(
/**
* Appends a positive tagged start transition to this state's list of positive tagged start
* transitions.
* @param tag The tag passed to the new transition.
* @param dest_state The destination state passed to the new transition.
*/
auto
add_positive_tagged_start_transition(Tag const* tag, RegexNFAState const* dest_state) -> void {
// Constructs the transition in place from (tag, dest_state) at the back of the vector.
m_positive_tagged_start_transitions.emplace_back(tag, dest_state);
}

[[nodiscard]] auto get_positive_tagged_start_transitions(
) const -> std::vector<PositiveTaggedTransition<RegexNFAState>> const& {
return m_positive_tagged_transitions;
return m_positive_tagged_start_transitions;
}

/**
* @return A const reference to this state's optional positive tagged end transition; the optional
* holds no value if the state has no such transition.
*/
[[nodiscard]] auto get_positive_tagged_end_transition(
) const -> std::optional<PositiveTaggedTransition<RegexNFAState>> const& {
return m_positive_tagged_end_transition;
}

[[nodiscard]] auto get_negative_tagged_transition(
Expand Down Expand Up @@ -100,7 +110,8 @@ class RegexNFAState {
private:
bool m_accepting{false};
uint32_t m_matching_variable_id{0};
std::vector<PositiveTaggedTransition<RegexNFAState>> m_positive_tagged_transitions;
std::vector<PositiveTaggedTransition<RegexNFAState>> m_positive_tagged_start_transitions;
std::optional<PositiveTaggedTransition<RegexNFAState>> m_positive_tagged_end_transition;
std::optional<NegativeTaggedTransition<RegexNFAState>> m_negative_tagged_transition;
std::vector<RegexNFAState*> m_epsilon_transitions;
std::array<std::vector<RegexNFAState*>, cSizeOfByte> m_bytes_transitions;
Expand Down Expand Up @@ -176,14 +187,27 @@ auto RegexNFAState<state_type>::serialize(
epsilon_transitions.emplace_back(std::to_string(state_ids.at(dest_state)));
}

std::vector<std::string> positive_tagged_transitions;
for (auto const& positive_tagged_transition : m_positive_tagged_transitions) {
auto const optional_serialized_positive_transition
= positive_tagged_transition.serialize(state_ids);
if (false == optional_serialized_positive_transition.has_value()) {
std::vector<std::string> serialized_positive_tagged_start_transitions;
for (auto const& positive_tagged_start_transition : m_positive_tagged_start_transitions) {
auto const optional_serialized_positive_start_transition
= positive_tagged_start_transition.serialize(state_ids);
if (false == optional_serialized_positive_start_transition.has_value()) {
return std::nullopt;
}
serialized_positive_tagged_start_transitions.emplace_back(
optional_serialized_positive_start_transition.value()
);
}

std::string serialized_positive_tagged_end_transition;
if (m_positive_tagged_end_transition.has_value()) {
auto const optional_serialized_positive_end_transition
= m_positive_tagged_end_transition.value().serialize(state_ids);
if (false == optional_serialized_positive_end_transition.has_value()) {
return std::nullopt;
}
positive_tagged_transitions.emplace_back(optional_serialized_positive_transition.value());
serialized_positive_tagged_end_transition
= optional_serialized_positive_end_transition.value();
}

std::string negative_tagged_transition_string;
Expand All @@ -200,13 +224,15 @@ auto RegexNFAState<state_type>::serialize(
= m_accepting ? fmt::format("accepting_tag={},", m_matching_variable_id) : "";

return fmt::format(
"{}:{}byte_transitions={{{}}},epsilon_transitions={{{}}},positive_tagged_transitions={{"
"{}}},negative_tagged_transition={{{}}}",
"{}:{}byte_transitions={{{}}},epsilon_transitions={{{}}},positive_tagged_start_"
"transitions={{{}}},positive_tagged_end_transitions={{{}}},negative_tagged_transition={"
"{{}}}",
state_ids.at(this),
accepting_tag_string,
fmt::join(byte_transitions, ","),
fmt::join(epsilon_transitions, ","),
fmt::join(positive_tagged_transitions, ","),
fmt::join(serialized_positive_tagged_start_transitions, ","),
serialized_positive_tagged_end_transition,
negative_tagged_transition_string
);
}
Expand Down
Loading

0 comments on commit 3f13224

Please sign in to comment.