-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add `PrefixTree` and `RegisterHandler` to support TDFA simulation. (#56)
Co-authored-by: Lin Zhihao <[email protected]>
- Loading branch information
1 parent
3f13224
commit 99b5b08
Showing
7 changed files
with
388 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
#include "PrefixTree.hpp" | ||
|
||
#include <stdexcept> | ||
#include <vector> | ||
|
||
namespace log_surgeon::finite_automata { | ||
// Collects the positions stored on the path from `node_id` towards the root. The first element is
// the position of `node_id` itself; the root's own position is never included, so querying the
// root yields an empty vector.
// Throws `std::out_of_range` if `node_id` does not refer to an existing node.
auto PrefixTree::get_reversed_positions(id_t const node_id) const -> std::vector<position_t> {
    if (node_id >= m_nodes.size()) {
        throw std::out_of_range("Prefix tree index out of range.");
    }

    std::vector<position_t> positions;
    // Follow parent links until the root (the only node without a parent) is reached.
    for (Node const* node = &m_nodes[node_id]; false == node->is_root();
         node = &m_nodes[node->get_parent_id_unsafe()])
    {
        positions.push_back(node->get_position());
    }
    return positions;
}
} // namespace log_surgeon::finite_automata |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
#ifndef LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP | ||
#define LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP | ||
|
||
#include <cstddef> | ||
#include <cstdint> | ||
#include <optional> | ||
#include <stdexcept> | ||
#include <vector> | ||
|
||
namespace log_surgeon::finite_automata { | ||
/**
 * Represents a prefix tree to store register data during TDFA simulation. Each node in the tree
 * stores a single position in the lexed string. Each path from the root to a node corresponds to
 * a sequence of positions for an individual tag:
 * - Positive position node: Indicates the tag was matched at the position.
 * - Negative position node: Indicates the tag was unmatched. If a negative node is the entire path,
 *   it indicates the tag was never matched. If the negative node is along a path containing
 *   positive nodes, it functions as a placeholder. This can be useful for nested capture groups, to
 *   maintain a one-to-one mapping between the contained capture group and the enclosing capture
 *   group.
 *
 * Nodes live in a single vector and are identified by their index in it. The root is created by
 * the constructor at index `cRootId`; it has no parent and its position is ignored when reading
 * paths.
 */
class PrefixTree {
public:
    using id_t = uint32_t;
    using position_t = int32_t;

    static constexpr id_t cRootId{0};

    /**
     * Constructs a tree containing only the root node (no parent, position -1).
     */
    PrefixTree() : m_nodes{{std::nullopt, -1}} {}

    /**
     * @param parent_node_id Index of the inserted node's parent in the prefix tree.
     * @param position The position in the lexed string.
     * @return The index of the newly inserted node in the tree.
     * @throw std::out_of_range if the parent's index is out of range.
     */
    [[maybe_unused]] auto insert(id_t const parent_node_id, position_t const position) -> id_t {
        if (m_nodes.size() <= parent_node_id) {
            throw std::out_of_range("Predecessor index out of range.");
        }

        // The new node is appended, so its index equals the node count before insertion. The
        // narrowing from `size_t` to `id_t` is made explicit rather than left implicit.
        auto const new_node_id{static_cast<id_t>(m_nodes.size())};
        m_nodes.emplace_back(parent_node_id, position);
        return new_node_id;
    }

    /**
     * Overwrites the position stored in an existing node.
     * @param node_id The index of the node to update.
     * @param position The new position to store in the node.
     * @throw std::out_of_range if the index is out of range.
     */
    auto set(id_t const node_id, position_t const position) -> void {
        m_nodes.at(node_id).set_position(position);
    }

    /**
     * @return The number of nodes in the tree, including the root.
     */
    [[nodiscard]] auto size() const -> size_t { return m_nodes.size(); }

    /**
     * @param node_id The index of the node.
     * @return A vector containing positions in order from the given index up to but not including
     * the root node.
     * @throw std::out_of_range if the index is out of range.
     */
    [[nodiscard]] auto get_reversed_positions(id_t node_id) const -> std::vector<position_t>;

private:
    /**
     * A single tree node: the index of its parent (`std::nullopt` for the root only) and the
     * position it stores.
     */
    class Node {
    public:
        Node(std::optional<id_t> const parent_id, position_t const position)
                : m_parent_id{parent_id},
                  m_position{position} {}

        /**
         * @return Whether this node is the root, i.e., whether it has no parent.
         */
        [[nodiscard]] auto is_root() const -> bool { return false == m_parent_id.has_value(); }

        /**
         * Gets the parent ID without checking if it's `std::nullopt`.
         * NOTE: This method should only be used if the caller has checked the node is not the root.
         * @return The ID of the parent node in the prefix tree.
         */
        [[nodiscard]] auto get_parent_id_unsafe() const -> id_t {
            // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
            return m_parent_id.value();
        }

        auto set_position(position_t const position) -> void { m_position = position; }

        [[nodiscard]] auto get_position() const -> position_t { return m_position; }

    private:
        std::optional<id_t> m_parent_id;
        position_t m_position;
    };

    // All nodes of the tree; a node's ID is its index in this vector.
    std::vector<Node> m_nodes;
};
} // namespace log_surgeon::finite_automata | ||
|
||
#endif // LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP | ||
#define LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP | ||
|
||
#include <cstddef> | ||
#include <vector> | ||
|
||
#include <log_surgeon/finite_automata/PrefixTree.hpp> | ||
|
||
namespace log_surgeon::finite_automata { | ||
/** | ||
* The register handler maintains a prefix tree that is sufficient to represent all registers. | ||
* The register handler also contains a vector of registers, and performs the set, copy, and append | ||
* operations for these registers. | ||
* | ||
* NOTE: For efficiency, registers are not initialized when lexing a new string; instead, it is the | ||
* DFA's responsibility to set the register values when needed. | ||
*/ | ||
class RegisterHandler { | ||
public: | ||
auto add_register( | ||
PrefixTree::id_t const prefix_tree_parent_node_id, | ||
PrefixTree::position_t const position | ||
) -> void { | ||
auto const prefix_tree_node_id{m_prefix_tree.insert(prefix_tree_parent_node_id, position)}; | ||
m_registers.emplace_back(prefix_tree_node_id); | ||
} | ||
|
||
auto set_register(size_t const reg_id, PrefixTree::position_t const position) -> void { | ||
m_prefix_tree.set(m_registers.at(reg_id), position); | ||
} | ||
|
||
auto copy_register(size_t const dest_reg_id, size_t const source_reg_id) -> void { | ||
m_registers.at(dest_reg_id) = m_registers.at(source_reg_id); | ||
} | ||
|
||
auto append_position(size_t const reg_id, PrefixTree::position_t const position) -> void { | ||
auto const node_id{m_registers.at(reg_id)}; | ||
m_registers.at(reg_id) = m_prefix_tree.insert(node_id, position); | ||
} | ||
|
||
[[nodiscard]] auto get_reversed_positions(size_t const reg_id | ||
) const -> std::vector<PrefixTree::position_t> { | ||
return m_prefix_tree.get_reversed_positions(m_registers.at(reg_id)); | ||
} | ||
|
||
private: | ||
PrefixTree m_prefix_tree; | ||
std::vector<PrefixTree::id_t> m_registers; | ||
}; | ||
} // namespace log_surgeon::finite_automata | ||
|
||
#endif // LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
#include <cstddef>
#include <limits>
#include <stdexcept>
#include <vector>

#include <catch2/catch_test_macros.hpp>

#include <log_surgeon/finite_automata/PrefixTree.hpp>
|
||
using log_surgeon::finite_automata::PrefixTree; | ||
using id_t = PrefixTree::id_t; | ||
using position_t = PrefixTree::position_t; | ||
|
||
// Exercises `PrefixTree`: construction, insertion, reading reversed paths, updating positions, and
// the `std::out_of_range` errors raised for invalid node indices.
TEST_CASE("`PrefixTree` operations", "[PrefixTree]") {
    constexpr auto cRootId{PrefixTree::cRootId};
    constexpr position_t cInitialPos1{4};
    constexpr position_t cSetPos1{10};

    SECTION("Newly constructed tree works correctly") {
        PrefixTree const tree;

        // A newly constructed tree should return no positions as the root node is ignored
        REQUIRE(tree.get_reversed_positions(cRootId).empty());
    }

    SECTION("Inserting nodes into the prefix tree works correctly") {
        constexpr position_t cInitialPos2{7};
        constexpr position_t cInitialPos3{9};
        constexpr position_t cMaxPos{std::numeric_limits<position_t>::max()};
        constexpr position_t cNegativePos1{-1};
        constexpr position_t cNegativePos2{-100};
        // Tree sizes are node counts, so they use the type returned by `PrefixTree::size()` rather
        // than the signed position type (avoids a signed/unsigned comparison).
        constexpr size_t cTreeSize1{4};
        constexpr size_t cTreeSize2{8};

        PrefixTree tree;

        // Test basic insertions
        auto const node_id_1{tree.insert(cRootId, cInitialPos1)};
        auto const node_id_2{tree.insert(node_id_1, cInitialPos2)};
        auto const node_id_3{tree.insert(node_id_2, cInitialPos3)};
        REQUIRE(std::vector<position_t>{cInitialPos1} == tree.get_reversed_positions(node_id_1));
        REQUIRE(std::vector<position_t>{cInitialPos2, cInitialPos1}
                == tree.get_reversed_positions(node_id_2));
        REQUIRE(std::vector<position_t>{cInitialPos3, cInitialPos2, cInitialPos1}
                == tree.get_reversed_positions(node_id_3));
        REQUIRE(cTreeSize1 == tree.size());

        // Test insertion with large position values
        auto const node_id_4{tree.insert(cRootId, cMaxPos)};
        REQUIRE(cMaxPos == tree.get_reversed_positions(node_id_4)[0]);

        // Test insertion with negative position values
        auto const node_id_5{tree.insert(cRootId, cNegativePos1)};
        auto const node_id_6{tree.insert(node_id_5, cInitialPos1)};
        auto const node_id_7{tree.insert(node_id_6, cNegativePos2)};
        REQUIRE(std::vector<position_t>{cNegativePos1} == tree.get_reversed_positions(node_id_5));
        REQUIRE(std::vector<position_t>{cInitialPos1, cNegativePos1}
                == tree.get_reversed_positions(node_id_6));
        REQUIRE(std::vector<position_t>{cNegativePos2, cInitialPos1, cNegativePos1}
                == tree.get_reversed_positions(node_id_7));
        REQUIRE(cTreeSize2 == tree.size());
    }

    SECTION("Invalid index access throws correctly") {
        PrefixTree tree;
        REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range);

        tree.insert(cRootId, cInitialPos1);
        REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range);

        REQUIRE_THROWS_AS(
                tree.get_reversed_positions(std::numeric_limits<id_t>::max()),
                std::out_of_range
        );

        // Inserting below a parent that doesn't exist must be rejected as well
        REQUIRE_THROWS_AS(tree.insert(tree.size(), cInitialPos1), std::out_of_range);
    }

    SECTION("Set position for a valid index works correctly") {
        constexpr position_t cSetPos2{12};
        constexpr position_t cSetPos3{15};
        constexpr position_t cSetPos4{20};

        PrefixTree tree;
        // Test that you can set the root node for sanity, although this value is not used
        tree.set(cRootId, cSetPos1);

        // Test updates to different nodes
        auto const node_id_1{tree.insert(cRootId, cInitialPos1)};
        auto const node_id_2{tree.insert(node_id_1, cInitialPos1)};
        tree.set(node_id_1, cSetPos1);
        tree.set(node_id_2, cSetPos2);
        REQUIRE(std::vector<position_t>{cSetPos1} == tree.get_reversed_positions(node_id_1));
        REQUIRE(std::vector<position_t>{cSetPos2, cSetPos1}
                == tree.get_reversed_positions(node_id_2));

        // Test multiple updates to the same node
        tree.set(node_id_2, cSetPos3);
        tree.set(node_id_2, cSetPos4);
        REQUIRE(std::vector<position_t>{cSetPos4, cSetPos1}
                == tree.get_reversed_positions(node_id_2));

        // Test that updates don't affect unrelated paths
        auto const node_id_3{tree.insert(cRootId, cSetPos2)};
        tree.set(node_id_3, cSetPos3);
        REQUIRE(std::vector<position_t>{cSetPos1} == tree.get_reversed_positions(node_id_1));
        REQUIRE(std::vector<position_t>{cSetPos4, cSetPos1}
                == tree.get_reversed_positions(node_id_2));
    }

    SECTION("Set position for an invalid index throws correctly") {
        constexpr id_t cInvalidNodeId{100};

        PrefixTree tree;

        // Test setting position before any insertions
        REQUIRE_THROWS_AS(tree.set(cInvalidNodeId, cSetPos1), std::out_of_range);

        // Test setting position just beyond valid range
        auto const node_id_1{tree.insert(cRootId, cInitialPos1)};
        REQUIRE_THROWS_AS(tree.set(node_id_1 + 1, cSetPos1), std::out_of_range);
    }
}
Oops, something went wrong.