From 99b5b08608a53a199bcba3f57aedd64b7ba0bd0c Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Fri, 6 Dec 2024 15:49:58 -0500 Subject: [PATCH] feat: Add `PrefixTree` and `RegisterHandler` to support TDFA simulation. (#56) Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- CMakeLists.txt | 3 + .../finite_automata/PrefixTree.cpp | 20 +++ .../finite_automata/PrefixTree.hpp | 91 +++++++++++++ .../finite_automata/RegisterHandler.hpp | 52 ++++++++ tests/CMakeLists.txt | 5 +- tests/test-prefix-tree.cpp | 120 ++++++++++++++++++ tests/test-register-handler.cpp | 98 ++++++++++++++ 7 files changed, 388 insertions(+), 1 deletion(-) create mode 100644 src/log_surgeon/finite_automata/PrefixTree.cpp create mode 100644 src/log_surgeon/finite_automata/PrefixTree.hpp create mode 100644 src/log_surgeon/finite_automata/RegisterHandler.hpp create mode 100644 tests/test-prefix-tree.cpp create mode 100644 tests/test-register-handler.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index e76ecb8c..117cde51 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,12 +93,15 @@ set(SOURCE_FILES src/log_surgeon/SchemaParser.hpp src/log_surgeon/Token.cpp src/log_surgeon/Token.hpp + src/log_surgeon/finite_automata/PrefixTree.cpp + src/log_surgeon/finite_automata/PrefixTree.hpp src/log_surgeon/finite_automata/RegexAST.hpp src/log_surgeon/finite_automata/RegexDFA.hpp src/log_surgeon/finite_automata/RegexDFA.tpp src/log_surgeon/finite_automata/RegexNFA.hpp src/log_surgeon/finite_automata/RegexNFAState.hpp src/log_surgeon/finite_automata/RegexNFAStateType.hpp + src/log_surgeon/finite_automata/RegisterHandler.hpp src/log_surgeon/finite_automata/Tag.hpp src/log_surgeon/finite_automata/TaggedTransition.hpp src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp diff --git a/src/log_surgeon/finite_automata/PrefixTree.cpp b/src/log_surgeon/finite_automata/PrefixTree.cpp new file mode 100644 index 00000000..4a652346 --- /dev/null +++ 
b/src/log_surgeon/finite_automata/PrefixTree.cpp @@ -0,0 +1,20 @@ +#include "PrefixTree.hpp" + +#include +#include + +namespace log_surgeon::finite_automata { +/** + * Collects the positions stored along the path from the node at `node_id` towards the root. The + * first element is the position of the node at `node_id`, followed by the position of each + * ancestor in turn; the position held by the root is never included. + * @throw std::out_of_range if `node_id` is not smaller than the number of nodes in the tree. + */ +auto PrefixTree::get_reversed_positions(id_t const node_id) const -> std::vector { + if (m_nodes.size() <= node_id) { + throw std::out_of_range("Prefix tree index out of range."); + } + + std::vector reversed_positions; + auto current_node{m_nodes[node_id]}; + while (false == current_node.is_root()) { + reversed_positions.push_back(current_node.get_position()); + current_node = m_nodes[current_node.get_parent_id_unsafe()]; + } + return reversed_positions; +} +} // namespace log_surgeon::finite_automata diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp new file mode 100644 index 00000000..e2de78aa --- /dev/null +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -0,0 +1,91 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP +#define LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP + +#include +#include +#include +#include +#include + +namespace log_surgeon::finite_automata { +/** + * Represents a prefix tree to store register data during TDFA simulation. Each node in the tree + * stores a single position in the lexed string. Each path from the root to an index corresponds to + * a sequence of positions for an individual tag: + * - Positive position node: Indicates the tag was matched at the position. + * - Negative position node: Indicates the tag was unmatched. If a negative node is the entire path, + * it indicates the tag was never matched. If the negative node is along a path containing positive + * nodes, it functions as a placeholder. This can be useful for nested capture groups, to maintain + * a one-to-one mapping between the contained capture group and the enclosing capture group. 
+ */ +class PrefixTree { +public: + using id_t = uint32_t; + using position_t = int32_t; + + /** + * Index of the root node, which the constructor creates as the first node of the tree. + */ + static constexpr id_t cRootId{0}; + + /** + * Constructs a tree containing only the root node, which has no parent and holds position -1. + */ + PrefixTree() : m_nodes{{std::nullopt, -1}} {} + + /** + * Appends a new node to the tree as a child of an existing node. + * @param parent_node_id Index of the inserted node's parent in the prefix tree. + * @param position The position in the lexed string. + * @return The index of the newly inserted node in the tree. + * @throw std::out_of_range if the parent's index is out of range. + */ + [[maybe_unused]] auto insert(id_t const parent_node_id, position_t const position) -> id_t { + if (m_nodes.size() <= parent_node_id) { + throw std::out_of_range("Predecessor index out of range."); + } + + m_nodes.emplace_back(parent_node_id, position); + return m_nodes.size() - 1; + } + + /** + * Overwrites the position stored in an existing node. + * @param node_id The index of the node to update. + * @param position The new position to store in the node. + * @throw std::out_of_range if the index is out of range. + */ + auto set(id_t const node_id, position_t const position) -> void { + m_nodes.at(node_id).set_position(position); + } + + /** + * @return The number of nodes in the tree, including the root node. + */ + [[nodiscard]] auto size() const -> size_t { return m_nodes.size(); } + + /** + * @param node_id The index of the node. + * @return A vector containing positions in order from the given index up to but not including + * the root node. + * @throw std::out_of_range if the index is out of range. + */ + [[nodiscard]] auto get_reversed_positions(id_t node_id) const -> std::vector; + +private: + /** + * A single node of the tree: the ID of its parent (`std::nullopt` for the root) and a position. + */ + class Node { + public: + Node(std::optional const parent_id, position_t const position) + : m_parent_id{parent_id}, + m_position{position} {} + + /** + * @return Whether the node has no parent ID, i.e., whether it is the root. + */ + [[nodiscard]] auto is_root() const -> bool { return false == m_parent_id.has_value(); } + + /** + * Gets the parent ID without checking if it's `std::nullopt`. + * NOTE: This method should only be used if the caller has checked the node is not the root. + * The ID is read through `std::optional::value`, which throws `std::bad_optional_access` when + * called on the root. + * @return The ID of the parent node in the prefix tree. 
+ */ + [[nodiscard]] auto get_parent_id_unsafe() const -> id_t { + // NOLINTNEXTLINE(bugprone-unchecked-optional-access) + return m_parent_id.value(); + } + + auto set_position(position_t const position) -> void { m_position = position; } + + [[nodiscard]] auto get_position() const -> position_t { return m_position; } + + private: + /** + * The ID of the parent node, or `std::nullopt` if this node is the root. + */ + std::optional m_parent_id; + position_t m_position; + }; + + /** + * Storage for every node of the tree; the ID of a node is its index in this container. + */ + std::vector m_nodes; +}; +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp new file mode 100644 index 00000000..d61240e3 --- /dev/null +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -0,0 +1,52 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP +#define LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP + +#include +#include + +#include + +namespace log_surgeon::finite_automata { +/** + * The register handler maintains a prefix tree that is sufficient to represent all registers. + * The register handler also contains a vector of registers, and performs the set, copy, and append + * operations for these registers. + * + * NOTE: For efficiency, registers are not initialized when lexing a new string; instead, it is the + * DFA's responsibility to set the register values when needed. 
+ */ +class RegisterHandler { +public: + /** + * Inserts a new node into the prefix tree and appends a register that refers to that node. + * @param prefix_tree_parent_node_id Index of the parent of the new node in the prefix tree. + * @param position The position stored in the new node. + * @throw std::out_of_range if the parent index is out of range (propagated from + * `PrefixTree::insert`). + */ + auto add_register( + PrefixTree::id_t const prefix_tree_parent_node_id, + PrefixTree::position_t const position + ) -> void { + auto const prefix_tree_node_id{m_prefix_tree.insert(prefix_tree_parent_node_id, position)}; + m_registers.emplace_back(prefix_tree_node_id); + } + + /** + * Overwrites the position stored in the prefix tree node that the register refers to. + * @param reg_id Index of the register. + * @param position The new position to store in that node. + * @throw std::out_of_range if `reg_id` is out of range. + */ + auto set_register(size_t const reg_id, PrefixTree::position_t const position) -> void { + m_prefix_tree.set(m_registers.at(reg_id), position); + } + + /** + * Makes the destination register refer to the same prefix tree node as the source register. No + * node is inserted or modified. + * @param dest_reg_id Index of the register to overwrite. + * @param source_reg_id Index of the register to copy from. + * @throw std::out_of_range if either register index is out of range. + */ + auto copy_register(size_t const dest_reg_id, size_t const source_reg_id) -> void { + m_registers.at(dest_reg_id) = m_registers.at(source_reg_id); + } + + /** + * Inserts a new node holding `position` as a child of the node the register refers to, then + * makes the register refer to the new node. + * @param reg_id Index of the register. + * @param position The position stored in the new node. + * @throw std::out_of_range if `reg_id` is out of range. + */ + auto append_position(size_t const reg_id, PrefixTree::position_t const position) -> void { + auto const node_id{m_registers.at(reg_id)}; + m_registers.at(reg_id) = m_prefix_tree.insert(node_id, position); + } + + /** + * @param reg_id Index of the register. + * @return The result of `PrefixTree::get_reversed_positions` for the node the register refers + * to. + * @throw std::out_of_range if `reg_id` is out of range. + */ + [[nodiscard]] auto get_reversed_positions(size_t const reg_id + ) const -> std::vector { + return m_prefix_tree.get_reversed_positions(m_registers.at(reg_id)); + } + +private: + PrefixTree m_prefix_tree; + /** + * For each register, the index of the prefix tree node it currently refers to. + */ + std::vector m_registers; +}; +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d150252f..ec974e6b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -2,10 +2,13 @@ set( SOURCES_LOG_SURGEON ../src/log_surgeon/FileReader.cpp ../src/log_surgeon/FileReader.hpp + ../src/log_surgeon/finite_automata/PrefixTree.cpp + ../src/log_surgeon/finite_automata/PrefixTree.hpp ../src/log_surgeon/finite_automata/RegexAST.hpp ../src/log_surgeon/finite_automata/RegexNFA.hpp ../src/log_surgeon/finite_automata/RegexNFAState.hpp ../src/log_surgeon/finite_automata/RegexNFAStateType.hpp + ../src/log_surgeon/finite_automata/RegisterHandler.hpp ../src/log_surgeon/finite_automata/Tag.hpp ../src/log_surgeon/finite_automata/TaggedTransition.hpp ../src/log_surgeon/LALR1Parser.cpp @@ -21,7 +24,7 @@ set( ../src/log_surgeon/Token.hpp ) 
-set(SOURCES_TESTS test-lexer.cpp test-NFA.cpp test-tag.cpp) +set(SOURCES_TESTS test-lexer.cpp test-NFA.cpp test-prefix-tree.cpp test-register-handler.cpp test-tag.cpp) add_executable(unit-test ${SOURCES_LOG_SURGEON} ${SOURCES_TESTS}) target_link_libraries(unit-test PRIVATE Catch2::Catch2WithMain log_surgeon::log_surgeon) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp new file mode 100644 index 00000000..66d8f8a0 --- /dev/null +++ b/tests/test-prefix-tree.cpp @@ -0,0 +1,120 @@ +#include +#include +#include + +#include + +#include + +using log_surgeon::finite_automata::PrefixTree; +using id_t = PrefixTree::id_t; +using position_t = PrefixTree::position_t; + +/** + * Covers construction, insertion, position updates, and out-of-range handling of `PrefixTree`. + */ +TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { + constexpr auto cRootId{PrefixTree::cRootId}; + constexpr position_t cInitialPos1{4}; + constexpr position_t cSetPos1{10}; + + SECTION("Newly constructed tree works correctly") { + PrefixTree const tree; + + // A newly constructed tree should return no positions as the root node is ignored + REQUIRE(tree.get_reversed_positions(cRootId).empty()); + } + + SECTION("Inserting nodes into the prefix tree works correctly") { + constexpr position_t cInitialPos2{7}; + constexpr position_t cInitialPos3{9}; + constexpr position_t cMaxPos{std::numeric_limits::max()}; + constexpr position_t cNegativePos1{-1}; + constexpr position_t cNegativePos2{-100}; + // Expected node counts (root included) after the first three and after all seven insertions. + // NOTE(review): these are compared with `tree.size()` rather than with positions, so + // `position_t` may not be the intended type for them; confirm. + constexpr position_t cTreeSize1{4}; + constexpr position_t cTreeSize2{8}; + + PrefixTree tree; + + // Test basic insertions + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + auto const node_id_2{tree.insert(node_id_1, cInitialPos2)}; + auto const node_id_3{tree.insert(node_id_2, cInitialPos3)}; + REQUIRE(std::vector{cInitialPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cInitialPos2, cInitialPos1} + == tree.get_reversed_positions(node_id_2)); + REQUIRE(std::vector{cInitialPos3, cInitialPos2, cInitialPos1} + == tree.get_reversed_positions(node_id_3)); + 
REQUIRE(cTreeSize1 == tree.size()); + + // Test insertion with large position values + auto const node_id_4{tree.insert(cRootId, cMaxPos)}; + REQUIRE(cMaxPos == tree.get_reversed_positions(node_id_4)[0]); + + // Test insertion with negative position values + auto const node_id_5{tree.insert(cRootId, cNegativePos1)}; + auto const node_id_6{tree.insert(node_id_5, cInitialPos1)}; + auto const node_id_7{tree.insert(node_id_6, cNegativePos2)}; + REQUIRE(std::vector{cNegativePos1} == tree.get_reversed_positions(node_id_5)); + REQUIRE(std::vector{cInitialPos1, cNegativePos1} + == tree.get_reversed_positions(node_id_6)); + REQUIRE(std::vector{cNegativePos2, cInitialPos1, cNegativePos1} + == tree.get_reversed_positions(node_id_7)); + REQUIRE(cTreeSize2 == tree.size()); + } + + SECTION("Invalid index access throws correctly") { + PrefixTree tree; + REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range); + + tree.insert(cRootId, cInitialPos1); + REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range); + + REQUIRE_THROWS_AS( + tree.get_reversed_positions(std::numeric_limits::max()), + std::out_of_range + ); + } + + SECTION("Set position for a valid index works correctly") { + constexpr position_t cSetPos2{12}; + constexpr position_t cSetPos3{15}; + constexpr position_t cSetPos4{20}; + + PrefixTree tree; + // Test that you can set the root node for sanity, although this value is not used + tree.set(cRootId, cSetPos1); + + // Test updates to different nodes + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + auto const node_id_2{tree.insert(node_id_1, cInitialPos1)}; + tree.set(node_id_1, cSetPos1); + tree.set(node_id_2, cSetPos2); + REQUIRE(std::vector{cSetPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cSetPos2, cSetPos1} + == tree.get_reversed_positions(node_id_2)); + + // Test multiple updates to the same node + tree.set(node_id_2, cSetPos3); + tree.set(node_id_2, cSetPos4); + 
REQUIRE(std::vector{cSetPos4, cSetPos1} + == tree.get_reversed_positions(node_id_2)); + + // Test that updates don't affect unrelated paths + auto const node_id_3{tree.insert(cRootId, cSetPos2)}; + tree.set(node_id_3, cSetPos3); + REQUIRE(std::vector{cSetPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cSetPos4, cSetPos1} + == tree.get_reversed_positions(node_id_2)); + } + + SECTION("Set position for an invalid index throws correctly") { + constexpr id_t cInvalidNodeId{100}; + + PrefixTree tree; + + // Test setting position before any insertions + REQUIRE_THROWS_AS(tree.set(cInvalidNodeId, cSetPos1), std::out_of_range); + + // Test setting position just beyond valid range + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + REQUIRE_THROWS_AS(tree.set(node_id_1 + 1, cSetPos1), std::out_of_range); + } +} diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp new file mode 100644 index 00000000..e8102e22 --- /dev/null +++ b/tests/test-register-handler.cpp @@ -0,0 +1,98 @@ +#include +#include +#include + +#include + +#include +#include + +using log_surgeon::finite_automata::RegisterHandler; +using position_t = log_surgeon::finite_automata::PrefixTree::position_t; + +namespace { +/** + * @param num_registers The number of registers managed by the handler. + * @return The newly initialized register handler. 
+ */ +[[nodiscard]] auto handler_init(size_t num_registers) -> RegisterHandler; + +/** + * Adds `num_registers` registers, all with position 0. Register `i` is added with parent node + * index `i`, so the node of register 0 hangs off the root and the node of every later register is + * a child of the node of the register before it, forming one chain. + */ +auto handler_init(size_t const num_registers) -> RegisterHandler { + constexpr position_t cDefaultPos{0}; + + RegisterHandler handler; + for (size_t i{0}; i < num_registers; ++i) { + handler.add_register(i, cDefaultPos); + } + return handler; +} +} // namespace + +TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { + constexpr position_t cInitialPos1{5}; + constexpr size_t cNumRegisters{5}; + constexpr size_t cRegId1{0}; + constexpr size_t cRegId2{1}; + + SECTION("Initial state is empty") { + RegisterHandler empty_handler{handler_init(0)}; + REQUIRE_THROWS_AS(empty_handler.get_reversed_positions(cRegId1), std::out_of_range); + } + + RegisterHandler handler{handler_init(cNumRegisters)}; + + SECTION("Set register position correctly") { + handler.set_register(cRegId1, cInitialPos1); + REQUIRE(std::vector{cInitialPos1} == handler.get_reversed_positions(cRegId1)); + } + + SECTION("Register relationships are maintained") { + constexpr position_t cInitialPos2{10}; + constexpr position_t cInitialPos3{15}; + constexpr size_t cRegId3{2}; + + handler.set_register(cRegId1, cInitialPos1); + handler.set_register(cRegId2, cInitialPos2); + handler.set_register(cRegId3, cInitialPos3); + + // NOTE(review): `positions` is not read afterwards; the assertion below calls + // `get_reversed_positions` a second time. + auto positions{handler.get_reversed_positions(cRegId3)}; + REQUIRE(std::vector{cInitialPos3, cInitialPos2, cInitialPos1} + == handler.get_reversed_positions(cRegId3)); + } + + SECTION("Copy register index correctly") { + handler.set_register(cRegId1, cInitialPos1); + handler.copy_register(cRegId2, cRegId1); + REQUIRE(std::vector{cInitialPos1} == handler.get_reversed_positions(cRegId2)); + } + + SECTION("`append_position` appends position correctly") { + constexpr position_t cAppendPos{10}; + + handler.set_register(cRegId1, cInitialPos1); + handler.append_position(cRegId1, cAppendPos); + REQUIRE(std::vector{cAppendPos, cInitialPos1} + == handler.get_reversed_positions(cRegId1)); + } + + SECTION("Throws out of 
range correctly") { + constexpr size_t cInvalidRegId{10}; + + REQUIRE_THROWS_AS(handler.set_register(cInvalidRegId, cInitialPos1), std::out_of_range); + REQUIRE_THROWS_AS(handler.copy_register(cInvalidRegId, cRegId2), std::out_of_range); + REQUIRE_THROWS_AS(handler.copy_register(cRegId1, cInvalidRegId), std::out_of_range); + REQUIRE_THROWS_AS(handler.append_position(cInvalidRegId, cInitialPos1), std::out_of_range); + REQUIRE_THROWS_AS(handler.get_reversed_positions(cInvalidRegId), std::out_of_range); + } + + SECTION("Handles negative position values correctly") { + constexpr position_t cNegativePos1{-1}; + constexpr position_t cNegativePos2{-100}; + + handler.set_register(cRegId1, cNegativePos1); + handler.append_position(cRegId1, cInitialPos1); + handler.append_position(cRegId1, cNegativePos2); + REQUIRE(std::vector{cNegativePos2, cInitialPos1, cNegativePos1} + == handler.get_reversed_positions(cRegId1)); + } +}