From 1671e39943cedd001427076df92adafe8a91d1f8 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Dec 2024 18:15:16 -0500 Subject: [PATCH 01/16] Move constants into scope for test-prefix-tree.cpp. --- tests/test-prefix-tree.cpp | 63 ++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp index 27c7988..66d8f8a 100644 --- a/tests/test-prefix-tree.cpp +++ b/tests/test-prefix-tree.cpp @@ -10,22 +10,11 @@ using log_surgeon::finite_automata::PrefixTree; using id_t = PrefixTree::id_t; using position_t = PrefixTree::position_t; -constexpr auto cRootId{PrefixTree::cRootId}; -constexpr id_t cInvalidNodeId{100}; -constexpr position_t cInsertPos1{4}; -constexpr position_t cInsertPos2{7}; -constexpr position_t cInsertPos3{9}; -constexpr position_t cMaxPos{std::numeric_limits::max()}; -constexpr position_t cNegativePos1{-1}; -constexpr position_t cNegativePos2{-100}; -constexpr position_t cSetPos1{10}; -constexpr position_t cSetPos2{12}; -constexpr position_t cSetPos3{15}; -constexpr position_t cSetPos4{20}; -constexpr position_t cTreeSize1{4}; -constexpr position_t cTreeSize2{8}; - TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { + constexpr auto cRootId{PrefixTree::cRootId}; + constexpr position_t cInitialPos1{4}; + constexpr position_t cSetPos1{10}; + SECTION("Newly constructed tree works correctly") { PrefixTree const tree; @@ -34,16 +23,24 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { } SECTION("Inserting nodes into the prefix tree works correctly") { + constexpr position_t cInitialPos2{7}; + constexpr position_t cInitialPos3{9}; + constexpr position_t cMaxPos{std::numeric_limits::max()}; + constexpr position_t cNegativePos1{-1}; + constexpr position_t cNegativePos2{-100}; + constexpr position_t cTreeSize1{4}; + constexpr position_t cTreeSize2{8}; + PrefixTree tree; // Test basic insertions - auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; - auto const node_id_2{tree.insert(node_id_1, cInsertPos2)}; - auto const node_id_3{tree.insert(node_id_2, cInsertPos3)}; - REQUIRE(std::vector{cInsertPos1} == tree.get_reversed_positions(node_id_1)); - REQUIRE(std::vector{cInsertPos2, cInsertPos1} + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + auto const node_id_2{tree.insert(node_id_1, cInitialPos2)}; + auto const node_id_3{tree.insert(node_id_2, cInitialPos3)}; + REQUIRE(std::vector{cInitialPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cInitialPos2, cInitialPos1} == tree.get_reversed_positions(node_id_2)); - REQUIRE(std::vector{cInsertPos3, cInsertPos2, cInsertPos1} + REQUIRE(std::vector{cInitialPos3, cInitialPos2, cInitialPos1} == tree.get_reversed_positions(node_id_3)); REQUIRE(cTreeSize1 == tree.size()); @@ -53,12 +50,12 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { // Test insertion with negative position values auto const node_id_5{tree.insert(cRootId, cNegativePos1)}; - auto const node_id_6{tree.insert(node_id_5, cInsertPos1)}; + auto const node_id_6{tree.insert(node_id_5, cInitialPos1)}; auto const node_id_7{tree.insert(node_id_6, cNegativePos2)}; REQUIRE(std::vector{cNegativePos1} == tree.get_reversed_positions(node_id_5)); - REQUIRE(std::vector{cInsertPos1, cNegativePos1} + REQUIRE(std::vector{cInitialPos1, cNegativePos1} == tree.get_reversed_positions(node_id_6)); - REQUIRE(std::vector{cNegativePos2, cInsertPos1, cNegativePos1} + REQUIRE(std::vector{cNegativePos2, cInitialPos1, cNegativePos1} == tree.get_reversed_positions(node_id_7)); REQUIRE(cTreeSize2 == tree.size()); } @@ -67,7 +64,7 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { PrefixTree tree; REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range); - tree.insert(cRootId, cInsertPos1); + tree.insert(cRootId, cInitialPos1); REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range); REQUIRE_THROWS_AS( @@ -77,13 +74,17 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { } SECTION("Set position for a valid index works correctly") { + constexpr position_t cSetPos2{12}; + constexpr position_t cSetPos3{15}; + constexpr position_t cSetPos4{20}; + PrefixTree tree; // Test that you can set the root node for sanity, although this value is not used tree.set(cRootId, cSetPos1); // Test updates to different nodes - auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; - auto const node_id_2{tree.insert(node_id_1, cInsertPos1)}; + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + auto const node_id_2{tree.insert(node_id_1, cInitialPos1)}; tree.set(node_id_1, cSetPos1); tree.set(node_id_2, cSetPos2); REQUIRE(std::vector{cSetPos1} == tree.get_reversed_positions(node_id_1)); @@ -105,13 +106,15 @@ TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { } SECTION("Set position for an invalid index throws correctly") { + constexpr id_t cInvalidNodeId{100}; + PrefixTree tree; // Test setting position before any insertions - REQUIRE_THROWS_AS(tree.set(cInvalidNodeId, cSetPos4), std::out_of_range); + REQUIRE_THROWS_AS(tree.set(cInvalidNodeId, cSetPos1), std::out_of_range); // Test setting position just beyond valid range - auto const node_id_1{tree.insert(cRootId, cInsertPos1)}; - REQUIRE_THROWS_AS(tree.set(node_id_1 + 1, cSetPos4), std::out_of_range); + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + REQUIRE_THROWS_AS(tree.set(node_id_1 + 1, cSetPos1), std::out_of_range); } } From 748dfc5566f6ab72df78bccbee99b1c9b503e2db Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Dec 2024 18:21:05 -0500 Subject: [PATCH 02/16] Rename to handler_init and return handler. --- tests/test-register-handler.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp index 2371fc9..e8102e2 100644 --- a/tests/test-register-handler.cpp +++ b/tests/test-register-handler.cpp @@ -12,17 +12,19 @@ using position_t = log_surgeon::finite_automata::PrefixTree::position_t; namespace { /** - * @param handler The register handler that will contain the new registers. - * @param num_registers The number of registers to initialize. + * @param num_registers The number of registers managed by the handler. + * @return The newly initialized register handler. */ -auto registers_init(RegisterHandler& handler, size_t num_registers) -> void; +[[nodiscard]] auto handler_init(size_t num_registers) -> RegisterHandler; -auto registers_init(RegisterHandler& handler, size_t const num_registers) -> void { +auto handler_init(size_t const num_registers) -> RegisterHandler { constexpr position_t cDefaultPos{0}; + RegisterHandler handler; for (size_t i{0}; i < num_registers; ++i) { handler.add_register(i, cDefaultPos); } + return handler; } } // namespace @@ -32,13 +34,12 @@ TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { constexpr size_t cRegId1{0}; constexpr size_t cRegId2{1}; - RegisterHandler handler; - SECTION("Initial state is empty") { - REQUIRE_THROWS_AS(handler.get_reversed_positions(cRegId1), std::out_of_range); + RegisterHandler empty_handler{handler_init(0)}; + REQUIRE_THROWS_AS(empty_handler.get_reversed_positions(cRegId1), std::out_of_range); } - registers_init(handler, cNumRegisters); + RegisterHandler handler{handler_init(cNumRegisters)}; SECTION("Set register position correctly") { handler.set_register(cRegId1, cInitialPos1); From 8abf35a15dfef1d26adfdfbc4895a20cb535a323 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Dec 2024 18:22:39 -0500 Subject: [PATCH 03/16] Add docstring for get_parent_id_unsafe(). --- src/log_surgeon/finite_automata/PrefixTree.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp index ab88d80..e2de78a 100644 --- a/src/log_surgeon/finite_automata/PrefixTree.hpp +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -65,6 +65,11 @@ class PrefixTree { [[nodiscard]] auto is_root() const -> bool { return false == m_parent_id.has_value(); } + /** + * Gets the parent ID without checking if it's `std::nullopt`. + * NOTE: This method should only be used if the caller has checked the node is not the root. + * @return The ID of the parent node in the prefix tree. + */ [[nodiscard]] auto get_parent_id_unsafe() const -> id_t { // NOLINTNEXTLINE(bugprone-unchecked-optional-access) return m_parent_id.value(); From 99b5b08608a53a199bcba3f57aedd64b7ba0bd0c Mon Sep 17 00:00:00 2001 From: Sharaf Mohamed Date: Fri, 6 Dec 2024 15:49:58 -0500 Subject: [PATCH 04/16] feat: Add `PrefixTree` and `RegisterHandler` to support TDFA simulation. (#56) Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- CMakeLists.txt | 3 + .../finite_automata/PrefixTree.cpp | 20 +++ .../finite_automata/PrefixTree.hpp | 91 +++++++++++++ .../finite_automata/RegisterHandler.hpp | 52 ++++++++ tests/CMakeLists.txt | 5 +- tests/test-prefix-tree.cpp | 120 ++++++++++++++++++ tests/test-register-handler.cpp | 98 ++++++++++++++ 7 files changed, 388 insertions(+), 1 deletion(-) create mode 100644 src/log_surgeon/finite_automata/PrefixTree.cpp create mode 100644 src/log_surgeon/finite_automata/PrefixTree.hpp create mode 100644 src/log_surgeon/finite_automata/RegisterHandler.hpp create mode 100644 tests/test-prefix-tree.cpp create mode 100644 tests/test-register-handler.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index e76ecb8..117cde5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,12 +93,15 @@ set(SOURCE_FILES src/log_surgeon/SchemaParser.hpp src/log_surgeon/Token.cpp src/log_surgeon/Token.hpp + src/log_surgeon/finite_automata/PrefixTree.cpp + src/log_surgeon/finite_automata/PrefixTree.hpp src/log_surgeon/finite_automata/RegexAST.hpp src/log_surgeon/finite_automata/RegexDFA.hpp src/log_surgeon/finite_automata/RegexDFA.tpp src/log_surgeon/finite_automata/RegexNFA.hpp src/log_surgeon/finite_automata/RegexNFAState.hpp src/log_surgeon/finite_automata/RegexNFAStateType.hpp + src/log_surgeon/finite_automata/RegisterHandler.hpp src/log_surgeon/finite_automata/Tag.hpp src/log_surgeon/finite_automata/TaggedTransition.hpp src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp diff --git a/src/log_surgeon/finite_automata/PrefixTree.cpp b/src/log_surgeon/finite_automata/PrefixTree.cpp new file mode 100644 index 0000000..4a65234 --- /dev/null +++ b/src/log_surgeon/finite_automata/PrefixTree.cpp @@ -0,0 +1,20 @@ +#include "PrefixTree.hpp" + +#include +#include + +namespace log_surgeon::finite_automata { +auto PrefixTree::get_reversed_positions(id_t const node_id) const -> std::vector { + if (m_nodes.size() <= node_id) { + throw std::out_of_range("Prefix tree index out of range."); + } + + std::vector reversed_positions; + auto current_node{m_nodes[node_id]}; + while (false == current_node.is_root()) { + reversed_positions.push_back(current_node.get_position()); + current_node = m_nodes[current_node.get_parent_id_unsafe()]; + } + return reversed_positions; +} +} // namespace log_surgeon::finite_automata diff --git a/src/log_surgeon/finite_automata/PrefixTree.hpp b/src/log_surgeon/finite_automata/PrefixTree.hpp new file mode 100644 index 0000000..e2de78a --- /dev/null +++ b/src/log_surgeon/finite_automata/PrefixTree.hpp @@ -0,0 +1,91 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP +#define LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP + +#include +#include +#include +#include +#include + +namespace log_surgeon::finite_automata { +/** + * Represents a prefix tree to store register data during TDFA simulation. Each node in the tree + * stores a single position in the lexed string. Each path from the root to an index corresponds to + * a sequence of positions for an individual tag: + * - Positive position node: Indicates the tag was matched at the position. + * - Negative position node: Indicates the tag was unmatched. If a negative node is the entire path, + * it indicates the tag was never matched. If the negative tag is along a path containing positive + * nodes, it functions as a placeholder. This can be useful for nested capture groups, to maintain + * a one-to-one mapping between the contained capture group and the enclosing capture group. + */ +class PrefixTree { +public: + using id_t = uint32_t; + using position_t = int32_t; + + static constexpr id_t cRootId{0}; + + PrefixTree() : m_nodes{{std::nullopt, -1}} {} + + /** + * @param parent_node_id Index of the inserted node's parent in the prefix tree. + * @param position The position in the lexed string. + * @return The index of the newly inserted node in the tree. + * @throw std::out_of_range if the parent's index is out of range. + */ + [[maybe_unused]] auto insert(id_t const parent_node_id, position_t const position) -> id_t { + if (m_nodes.size() <= parent_node_id) { + throw std::out_of_range("Predecessor index out of range."); + } + + m_nodes.emplace_back(parent_node_id, position); + return m_nodes.size() - 1; + } + + auto set(id_t const node_id, position_t const position) -> void { + m_nodes.at(node_id).set_position(position); + } + + [[nodiscard]] auto size() const -> size_t { return m_nodes.size(); } + + /** + * @param node_id The index of the node. + * @return A vector containing positions in order from the given index up to but not including + * the root node. + * @throw std::out_of_range if the index is out of range. + */ + [[nodiscard]] auto get_reversed_positions(id_t node_id) const -> std::vector; + +private: + class Node { + public: + Node(std::optional const parent_id, position_t const position) + : m_parent_id{parent_id}, + m_position{position} {} + + [[nodiscard]] auto is_root() const -> bool { return false == m_parent_id.has_value(); } + + /** + * Gets the parent ID without checking if it's `std::nullopt`. + * NOTE: This method should only be used if the caller has checked the node is not the root. + * @return The ID of the parent node in the prefix tree. + */ + [[nodiscard]] auto get_parent_id_unsafe() const -> id_t { + // NOLINTNEXTLINE(bugprone-unchecked-optional-access) + return m_parent_id.value(); + } + + auto set_position(position_t const position) -> void { m_position = position; } + + [[nodiscard]] auto get_position() const -> position_t { return m_position; } + + private: + std::optional m_parent_id; + position_t m_position; + }; + + std::vector m_nodes; +}; +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_PREFIX_TREE_HPP diff --git a/src/log_surgeon/finite_automata/RegisterHandler.hpp b/src/log_surgeon/finite_automata/RegisterHandler.hpp new file mode 100644 index 0000000..d61240e --- /dev/null +++ b/src/log_surgeon/finite_automata/RegisterHandler.hpp @@ -0,0 +1,52 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP +#define LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP + +#include +#include + +#include + +namespace log_surgeon::finite_automata { +/** + * The register handler maintains a prefix tree that is sufficient to represent all registers. + * The register handler also contains a vector of registers, and performs the set, copy, and append + * operations for these registers. + * + * NOTE: For efficiency, registers are not initialized when lexing a new string; instead, it is the + * DFA's responsibility to set the register values when needed. + */ +class RegisterHandler { +public: + auto add_register( + PrefixTree::id_t const prefix_tree_parent_node_id, + PrefixTree::position_t const position + ) -> void { + auto const prefix_tree_node_id{m_prefix_tree.insert(prefix_tree_parent_node_id, position)}; + m_registers.emplace_back(prefix_tree_node_id); + } + + auto set_register(size_t const reg_id, PrefixTree::position_t const position) -> void { + m_prefix_tree.set(m_registers.at(reg_id), position); + } + + auto copy_register(size_t const dest_reg_id, size_t const source_reg_id) -> void { + m_registers.at(dest_reg_id) = m_registers.at(source_reg_id); + } + + auto append_position(size_t const reg_id, PrefixTree::position_t const position) -> void { + auto const node_id{m_registers.at(reg_id)}; + m_registers.at(reg_id) = m_prefix_tree.insert(node_id, position); + } + + [[nodiscard]] auto get_reversed_positions(size_t const reg_id + ) const -> std::vector { + return m_prefix_tree.get_reversed_positions(m_registers.at(reg_id)); + } + +private: + PrefixTree m_prefix_tree; + std::vector m_registers; +}; +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_REGISTER_HANDLER_HPP diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d150252..ec974e6 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -2,10 +2,13 @@ set( SOURCES_LOG_SURGEON ../src/log_surgeon/FileReader.cpp ../src/log_surgeon/FileReader.hpp + ../src/log_surgeon/finite_automata/PrefixTree.cpp + ../src/log_surgeon/finite_automata/PrefixTree.hpp ../src/log_surgeon/finite_automata/RegexAST.hpp ../src/log_surgeon/finite_automata/RegexNFA.hpp ../src/log_surgeon/finite_automata/RegexNFAState.hpp ../src/log_surgeon/finite_automata/RegexNFAStateType.hpp + ../src/log_surgeon/finite_automata/RegisterHandler.hpp ../src/log_surgeon/finite_automata/Tag.hpp ../src/log_surgeon/finite_automata/TaggedTransition.hpp ../src/log_surgeon/LALR1Parser.cpp @@ -21,7 +24,7 @@ set( ../src/log_surgeon/Token.hpp ) -set(SOURCES_TESTS test-lexer.cpp test-NFA.cpp test-tag.cpp) +set(SOURCES_TESTS test-lexer.cpp test-NFA.cpp test-prefix-tree.cpp test-register-handler.cpp test-tag.cpp) add_executable(unit-test ${SOURCES_LOG_SURGEON} ${SOURCES_TESTS}) target_link_libraries(unit-test PRIVATE Catch2::Catch2WithMain log_surgeon::log_surgeon) diff --git a/tests/test-prefix-tree.cpp b/tests/test-prefix-tree.cpp new file mode 100644 index 0000000..66d8f8a --- /dev/null +++ b/tests/test-prefix-tree.cpp @@ -0,0 +1,120 @@ +#include +#include +#include + +#include + +#include + +using log_surgeon::finite_automata::PrefixTree; +using id_t = PrefixTree::id_t; +using position_t = PrefixTree::position_t; + +TEST_CASE("`PrefixTree` operations", "[PrefixTree]") { + constexpr auto cRootId{PrefixTree::cRootId}; + constexpr position_t cInitialPos1{4}; + constexpr position_t cSetPos1{10}; + + SECTION("Newly constructed tree works correctly") { + PrefixTree const tree; + + // A newly constructed tree should return no positions as the root node is ignored + REQUIRE(tree.get_reversed_positions(cRootId).empty()); + } + + SECTION("Inserting nodes into the prefix tree works correctly") { + constexpr position_t cInitialPos2{7}; + constexpr position_t cInitialPos3{9}; + constexpr position_t cMaxPos{std::numeric_limits::max()}; + constexpr position_t cNegativePos1{-1}; + constexpr position_t cNegativePos2{-100}; + constexpr position_t cTreeSize1{4}; + constexpr position_t cTreeSize2{8}; + + PrefixTree tree; + + // Test basic insertions + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + auto const node_id_2{tree.insert(node_id_1, cInitialPos2)}; + auto const node_id_3{tree.insert(node_id_2, cInitialPos3)}; + REQUIRE(std::vector{cInitialPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cInitialPos2, cInitialPos1} + == tree.get_reversed_positions(node_id_2)); + REQUIRE(std::vector{cInitialPos3, cInitialPos2, cInitialPos1} + == tree.get_reversed_positions(node_id_3)); + REQUIRE(cTreeSize1 == tree.size()); + + // Test insertion with large position values + auto const node_id_4{tree.insert(cRootId, cMaxPos)}; + REQUIRE(cMaxPos == tree.get_reversed_positions(node_id_4)[0]); + + // Test insertion with negative position values + auto const node_id_5{tree.insert(cRootId, cNegativePos1)}; + auto const node_id_6{tree.insert(node_id_5, cInitialPos1)}; + auto const node_id_7{tree.insert(node_id_6, cNegativePos2)}; + REQUIRE(std::vector{cNegativePos1} == tree.get_reversed_positions(node_id_5)); + REQUIRE(std::vector{cInitialPos1, cNegativePos1} + == tree.get_reversed_positions(node_id_6)); + REQUIRE(std::vector{cNegativePos2, cInitialPos1, cNegativePos1} + == tree.get_reversed_positions(node_id_7)); + REQUIRE(cTreeSize2 == tree.size()); + } + + SECTION("Invalid index access throws correctly") { + PrefixTree tree; + REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range); + + tree.insert(cRootId, cInitialPos1); + REQUIRE_THROWS_AS(tree.get_reversed_positions(tree.size()), std::out_of_range); + + REQUIRE_THROWS_AS( + tree.get_reversed_positions(std::numeric_limits::max()), + std::out_of_range + ); + } + + SECTION("Set position for a valid index works correctly") { + constexpr position_t cSetPos2{12}; + constexpr position_t cSetPos3{15}; + constexpr position_t cSetPos4{20}; + + PrefixTree tree; + // Test that you can set the root node for sanity, although this value is not used + tree.set(cRootId, cSetPos1); + + // Test updates to different nodes + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + auto const node_id_2{tree.insert(node_id_1, cInitialPos1)}; + tree.set(node_id_1, cSetPos1); + tree.set(node_id_2, cSetPos2); + REQUIRE(std::vector{cSetPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cSetPos2, cSetPos1} + == tree.get_reversed_positions(node_id_2)); + + // Test multiple updates to the same node + tree.set(node_id_2, cSetPos3); + tree.set(node_id_2, cSetPos4); + REQUIRE(std::vector{cSetPos4, cSetPos1} + == tree.get_reversed_positions(node_id_2)); + + // Test that updates don't affect unrelated paths + auto const node_id_3{tree.insert(cRootId, cSetPos2)}; + tree.set(node_id_3, cSetPos3); + REQUIRE(std::vector{cSetPos1} == tree.get_reversed_positions(node_id_1)); + REQUIRE(std::vector{cSetPos4, cSetPos1} + == tree.get_reversed_positions(node_id_2)); + } + + SECTION("Set position for an invalid index throws correctly") { + constexpr id_t cInvalidNodeId{100}; + + PrefixTree tree; + + // Test setting position before any insertions + REQUIRE_THROWS_AS(tree.set(cInvalidNodeId, cSetPos1), std::out_of_range); + + // Test setting position just beyond valid range + auto const node_id_1{tree.insert(cRootId, cInitialPos1)}; + REQUIRE_THROWS_AS(tree.set(node_id_1 + 1, cSetPos1), std::out_of_range); + } +} diff --git a/tests/test-register-handler.cpp b/tests/test-register-handler.cpp new file mode 100644 index 0000000..e8102e2 --- /dev/null +++ b/tests/test-register-handler.cpp @@ -0,0 +1,98 @@ +#include +#include +#include + +#include + +#include +#include + +using log_surgeon::finite_automata::RegisterHandler; +using position_t = log_surgeon::finite_automata::PrefixTree::position_t; + +namespace { +/** + * @param num_registers The number of registers managed by the handler. + * @return The newly initialized register handler. + */ +[[nodiscard]] auto handler_init(size_t num_registers) -> RegisterHandler; + +auto handler_init(size_t const num_registers) -> RegisterHandler { + constexpr position_t cDefaultPos{0}; + + RegisterHandler handler; + for (size_t i{0}; i < num_registers; ++i) { + handler.add_register(i, cDefaultPos); + } + return handler; +} +} // namespace + +TEST_CASE("`RegisterHandler` tests", "[RegisterHandler]") { + constexpr position_t cInitialPos1{5}; + constexpr size_t cNumRegisters{5}; + constexpr size_t cRegId1{0}; + constexpr size_t cRegId2{1}; + + SECTION("Initial state is empty") { + RegisterHandler empty_handler{handler_init(0)}; + REQUIRE_THROWS_AS(empty_handler.get_reversed_positions(cRegId1), std::out_of_range); + } + + RegisterHandler handler{handler_init(cNumRegisters)}; + + SECTION("Set register position correctly") { + handler.set_register(cRegId1, cInitialPos1); + REQUIRE(std::vector{cInitialPos1} == handler.get_reversed_positions(cRegId1)); + } + + SECTION("Register relationships are maintained") { + constexpr position_t cInitialPos2{10}; + constexpr position_t cInitialPos3{15}; + constexpr size_t cRegId3{2}; + + handler.set_register(cRegId1, cInitialPos1); + handler.set_register(cRegId2, cInitialPos2); + handler.set_register(cRegId3, cInitialPos3); + + auto positions{handler.get_reversed_positions(cRegId3)}; + REQUIRE(std::vector{cInitialPos3, cInitialPos2, cInitialPos1} + == handler.get_reversed_positions(cRegId3)); + } + + SECTION("Copy register index correctly") { + handler.set_register(cRegId1, cInitialPos1); + handler.copy_register(cRegId2, cRegId1); + REQUIRE(std::vector{cInitialPos1} == handler.get_reversed_positions(cRegId2)); + } + + SECTION("`append_position` appends position correctly") { + constexpr position_t cAppendPos{10}; + + handler.set_register(cRegId1, cInitialPos1); + handler.append_position(cRegId1, cAppendPos); + REQUIRE(std::vector{cAppendPos, cInitialPos1} + == handler.get_reversed_positions(cRegId1)); + } + + SECTION("Throws out of range correctly") { + constexpr size_t cInvalidRegId{10}; + + REQUIRE_THROWS_AS(handler.set_register(cInvalidRegId, cInitialPos1), std::out_of_range); + REQUIRE_THROWS_AS(handler.copy_register(cInvalidRegId, cRegId2), std::out_of_range); + REQUIRE_THROWS_AS(handler.copy_register(cRegId1, cInvalidRegId), std::out_of_range); + REQUIRE_THROWS_AS(handler.append_position(cInvalidRegId, cInitialPos1), std::out_of_range); + REQUIRE_THROWS_AS(handler.get_reversed_positions(cInvalidRegId), std::out_of_range); + } + + SECTION("Handles negative position values correctly") { + constexpr position_t cNegativePos1{-1}; + constexpr position_t cNegativePos2{-100}; + + handler.set_register(cRegId1, cNegativePos1); + handler.append_position(cRegId1, cInitialPos1); + handler.append_position(cRegId1, cNegativePos2); + REQUIRE(std::vector{cNegativePos2, cInitialPos1, cNegativePos1} + == handler.get_reversed_positions(cRegId1)); + } +} From a12a3607d6d4040f50acaa03c31b4a200e6f1a29 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 6 Dec 2024 18:47:56 -0500 Subject: [PATCH 05/16] Fix comment length. --- src/log_surgeon/finite_automata/RegexDFAState.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp index a916b26..7277b6e 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -47,9 +47,8 @@ class RegexDFAState { private: std::vector m_matching_variable_ids; RegexDFAState* m_bytes_transition[cSizeOfByte]; - // NOTE: We don't need m_tree_transitions for the `stateType == - // RegexDFAStateType::Byte` case, so we use an empty class (`std::tuple<>`) - // in that case. + // NOTE: We don't need m_tree_transitions for the `stateType == RegexDFAStateType::Byte` case, + // so we use an empty class (`std::tuple<>`) in that case. std::conditional_t> m_tree_transitions; }; From 244d122ee630b2b347c6e95fbb2d16d8ef230eeb Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 6 Dec 2024 18:52:46 -0500 Subject: [PATCH 06/16] Initialize byte transitions. --- src/log_surgeon/finite_automata/RegexDFAState.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp index 7277b6e..90d83e5 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -22,6 +22,10 @@ class RegexDFAState { public: using Tree = UnicodeIntervalTree*>; + RegexDFAState() { + std::fill(std::begin(m_bytes_transition), std::end(m_bytes_transition), nullptr); + } + auto add_matching_variable_id(uint32_t const variable_id) -> void { m_matching_variable_ids.push_back(variable_id); } From 176391b490f51d940aed2bb216446ed7d5be0958 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 6 Dec 2024 19:17:54 -0500 Subject: [PATCH 07/16] Use const* in place of unique_ptr reference; Update docstrings. --- src/log_surgeon/finite_automata/RegexDFA.hpp | 26 +++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexDFA.hpp b/src/log_surgeon/finite_automata/RegexDFA.hpp index 63e0a80..3e8ad14 100644 --- a/src/log_surgeon/finite_automata/RegexDFA.hpp +++ b/src/log_surgeon/finite_automata/RegexDFA.hpp @@ -14,10 +14,9 @@ template class RegexDFA { public: /** - * Creates a new DFA state based on a set of NFA states and adds it to - * m_states - * @param nfa_state_set - * @return DFAStateType* + * Creates a new DFA state based on a set of NFA states and adds it to `m_states`. + * @param nfa_state_set The set of NFA states represented by this DFA state. + * @return A pointer to the new DFA state. */ template auto new_state(std::set const& nfa_state_set) -> DFAStateType*; @@ -25,16 +24,14 @@ class RegexDFA { auto get_root() const -> DFAStateType const* { return m_states.at(0).get(); } /** - * Compares this dfa with dfa_in to determine the set of schema types in - * this dfa that are reachable by any type in dfa_in. A type is considered - * reachable if there is at least one string for which: (1) this dfa returns - * a set of types containing the type, and (2) dfa_in returns any non-empty - * set of types. - * @param dfa_in - * @return The set of schema types reachable by dfa_in + * Compares this dfa with `dfa_in` to determine the set of schema types in this dfa that are + * reachable by any type in `dfa_in`. A type is considered reachable if there is at least one + * string for which: (1) this dfa returns a set of types containing the type, and (2) `dfa_in` + * returns any non-empty set of types. + * @param dfa_in The dfa with which to take the intersect. + * @return The set of schema types reachable by `dfa_in`. */ - [[nodiscard]] auto get_intersect(std::unique_ptr const& dfa_in - ) const -> std::set; + [[nodiscard]] auto get_intersect(RegexDFA const* dfa_in) const -> std::set; private: std::vector> m_states; @@ -55,8 +52,7 @@ auto RegexDFA::new_state(std::set const& nfa_state_ } template -auto RegexDFA::get_intersect(std::unique_ptr const& dfa_in -) const -> std::set { +auto RegexDFA::get_intersect(RegexDFA const* dfa_in) const -> std::set { std::set schema_types; std::set> unvisited_pairs; std::set> visited_pairs; From 012f61f1c78eb7afcf888c7e7018c7d2503ca5e4 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 6 Dec 2024 19:20:17 -0500 Subject: [PATCH 08/16] Update intersect test to compile. --- examples/intersect-test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/intersect-test.cpp b/examples/intersect-test.cpp index a5d0e43..19d696b 100644 --- a/examples/intersect-test.cpp +++ b/examples/intersect-test.cpp @@ -42,7 +42,7 @@ auto get_intersect_for_query( } RegexNFA nfa(std::move(rules)); auto dfa2 = ByteLexer::nfa_to_dfa(nfa); - auto schema_types = dfa1->get_intersect(dfa2); + auto schema_types = dfa1->get_intersect(dfa2.get()); std::cout << search_string << ":"; for (auto const& schema_type : schema_types) { std::cout << m_id_symbol[schema_type] << ","; From 96a6363b2a4fcd27171e88a50e6a21b8cfd8f11e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 6 Dec 2024 19:23:39 -0500 Subject: [PATCH 09/16] Update next() docstring. --- src/log_surgeon/finite_automata/RegexDFAState.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp index 90d83e5..8c3e179 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -41,10 +41,8 @@ class RegexDFAState { } /** - * Returns the next state the DFA transitions to on input character (byte or - * utf8) - * @param character - * @return RegexDFAState* + * @param character The character (byte or utf8) to transition on. + * @return A pointer to the DFA state reached after transitioning on `character`. */ [[nodiscard]] auto next(uint32_t character) const -> RegexDFAState*; From a4a93b4d47fe17582325cc9e139f3060afd57722 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 02:24:09 -0500 Subject: [PATCH 10/16] Rename to state_type. --- src/log_surgeon/finite_automata/RegexDFAState.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp index fb19214..98aedca 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -17,7 +17,7 @@ class RegexDFAState; using RegexDFAByteState = RegexDFAState; using RegexDFAUTF8State = RegexDFAState; -template +template class RegexDFAState { public: using Tree = UnicodeIntervalTree; @@ -49,14 +49,14 @@ class RegexDFAState { private: std::vector m_matching_variable_ids; RegexDFAState* m_bytes_transition[cSizeOfByte]; - // NOTE: We don't need m_tree_transitions for the `stateType == RegexDFAStateType::Byte` case, + // NOTE: We don't need m_tree_transitions for the `state_type == RegexDFAStateType::Byte` case, // so we use an empty class (`std::tuple<>`) in that case. - std::conditional_t> m_tree_transitions; + std::conditional_t> m_tree_transitions; }; -template -auto RegexDFAState::next(uint32_t character) const -> RegexDFAState* { - if constexpr (RegexDFAStateType::Byte == stateType) { +template +auto RegexDFAState::next(uint32_t character) const -> RegexDFAState* { + if constexpr (RegexDFAStateType::Byte == state_type) { return m_bytes_transition[character]; } else { if (character < cSizeOfByte) { From 421c3de5c132a73635bbf0b29fe1fefba4e1b07f Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 02:29:35 -0500 Subject: [PATCH 11/16] Update headers. --- src/log_surgeon/finite_automata/RegexDFAState.hpp | 1 + src/log_surgeon/finite_automata/RegexDFAStatePair.hpp | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp index 8c3e179..efa8e74 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -7,6 +7,7 @@ #include #include +#include #include #include diff --git a/src/log_surgeon/finite_automata/RegexDFAStatePair.hpp b/src/log_surgeon/finite_automata/RegexDFAStatePair.hpp index 9672900..208a3e8 100644 --- a/src/log_surgeon/finite_automata/RegexDFAStatePair.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAStatePair.hpp @@ -1,10 +1,11 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_PAIR #define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_PAIR +#include #include #include -#include +#include namespace log_surgeon::finite_automata { /** From 1b945a11058df5408997cde53288c0550ede86cb Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 02:33:05 -0500 Subject: [PATCH 12/16] Update Lexer headers. --- src/log_surgeon/Lexer.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index ddb12cf..726ff68 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include From 78c41256d17a9d201cfef112158b63a92243ddad Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 02:37:37 -0500 Subject: [PATCH 13/16] Add header for conditional_t. --- src/log_surgeon/finite_automata/RegexDFAState.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp index efa8e74..5e99f2e 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -1,6 +1,7 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE #define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE +#include #include #include #include From 33623fa51bcd4dd8791d301c83a2ac787e80a05e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 02:39:58 -0500 Subject: [PATCH 14/16] Linter. --- src/log_surgeon/finite_automata/RegexDFAState.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp index 5e99f2e..92f5b23 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -1,11 +1,11 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE #define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE -#include #include #include #include #include +#include #include #include From 5bbeafce0c95afe89460b2655c84f5feb06e0f3b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 02:47:03 -0500 Subject: [PATCH 15/16] Linter. --- src/log_surgeon/finite_automata/RegexDFAState.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp index be2b0ae..e7a166d 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -53,7 +53,8 @@ class RegexDFAState { RegexDFAState* m_bytes_transition[cSizeOfByte]; // NOTE: We don't need m_tree_transitions for the `state_type == RegexDFAStateType::Byte` case, // so we use an empty class (`std::tuple<>`) in that case. - std::conditional_t> m_tree_transitions; + std::conditional_t> + m_tree_transitions; }; template From 0decaf50de260891bddf2aee9c6a2b126d12b04a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 8 Dec 2024 02:57:17 -0500 Subject: [PATCH 16/16] Change ! to false ==. --- src/log_surgeon/finite_automata/RegexDFAState.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/RegexDFAState.hpp index 92f5b23..3c0ef4c 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/RegexDFAState.hpp @@ -36,7 +36,9 @@ class RegexDFAState { return m_matching_variable_ids; } - [[nodiscard]] auto is_accepting() const -> bool { return !m_matching_variable_ids.empty(); } + [[nodiscard]] auto is_accepting() const -> bool { + return false == m_matching_variable_ids.empty(); + } auto add_byte_transition(uint8_t const& byte, RegexDFAState* dest_state) -> void { m_bytes_transition[byte] = dest_state; @@ -67,7 +69,7 @@ auto RegexDFAState::next(uint32_t character) const -> RegexDFAState> result = m_tree_transitions.find(Interval(character, character)); assert(result->size() <= 1); - if (!result->empty()) { + if (false == result->empty()) { return result->front().m_value; } return nullptr;