diff --git a/CMakeLists.txt b/CMakeLists.txt index ac30e1c9..ceb932dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,9 +68,9 @@ set(SOURCE_FILES src/log_surgeon/Constants.hpp src/log_surgeon/FileReader.cpp src/log_surgeon/FileReader.hpp - src/log_surgeon/LALR1Parser.cpp - src/log_surgeon/LALR1Parser.hpp - src/log_surgeon/LALR1Parser.tpp + src/log_surgeon/Lalr1Parser.cpp + src/log_surgeon/Lalr1Parser.hpp + src/log_surgeon/Lalr1Parser.tpp src/log_surgeon/Lexer.hpp src/log_surgeon/Lexer.tpp src/log_surgeon/LexicalRule.hpp @@ -96,13 +96,13 @@ set(SOURCE_FILES src/log_surgeon/finite_automata/PrefixTree.cpp src/log_surgeon/finite_automata/PrefixTree.hpp src/log_surgeon/finite_automata/RegexAST.hpp - src/log_surgeon/finite_automata/RegexDFA.hpp - src/log_surgeon/finite_automata/RegexDFAState.hpp - src/log_surgeon/finite_automata/RegexDFAStatePair.hpp - src/log_surgeon/finite_automata/RegexDFAStateType.hpp - src/log_surgeon/finite_automata/RegexNFA.hpp - src/log_surgeon/finite_automata/RegexNFAState.hpp - src/log_surgeon/finite_automata/RegexNFAStateType.hpp + src/log_surgeon/finite_automata/Dfa.hpp + src/log_surgeon/finite_automata/DfaState.hpp + src/log_surgeon/finite_automata/DfaStatePair.hpp + src/log_surgeon/finite_automata/DfaStateType.hpp + src/log_surgeon/finite_automata/Nfa.hpp + src/log_surgeon/finite_automata/NfaState.hpp + src/log_surgeon/finite_automata/NfaStateType.hpp src/log_surgeon/finite_automata/RegisterHandler.hpp src/log_surgeon/finite_automata/Tag.hpp src/log_surgeon/finite_automata/TaggedTransition.hpp diff --git a/examples/intersect-test.cpp b/examples/intersect-test.cpp index 19d696b7..4c3bff89 100644 --- a/examples/intersect-test.cpp +++ b/examples/intersect-test.cpp @@ -5,10 +5,10 @@ #include #include -using log_surgeon::finite_automata::RegexDFA; -using log_surgeon::finite_automata::RegexDFAByteState; -using log_surgeon::finite_automata::RegexNFA; -using log_surgeon::finite_automata::RegexNFAByteState; +using log_surgeon::finite_automata::ByteDfaState; +using log_surgeon::finite_automata::ByteNfaState; +using log_surgeon::finite_automata::Dfa; +using log_surgeon::finite_automata::Nfa; using log_surgeon::lexers::ByteLexer; using log_surgeon::LexicalRule; using log_surgeon::ParserAST; @@ -17,11 +17,11 @@ using std::string; using std::unique_ptr; using std::vector; -using ByteLexicalRule = log_surgeon::LexicalRule; +using ByteLexicalRule = log_surgeon::LexicalRule; auto get_intersect_for_query( std::map& m_id_symbol, - std::unique_ptr>& dfa1, + std::unique_ptr>& dfa1, std::string const& search_string ) -> void { std::string processed_search_string; @@ -40,7 +40,7 @@ auto get_intersect_for_query( auto* schema_var_ast = dynamic_cast(parser_ast.get()); rules.emplace_back(0, std::move(schema_var_ast->m_regex_ptr)); } - RegexNFA nfa(std::move(rules)); + Nfa nfa(std::move(rules)); auto dfa2 = ByteLexer::nfa_to_dfa(nfa); auto schema_types = dfa1->get_intersect(dfa2.get()); std::cout << search_string << ":"; @@ -78,7 +78,7 @@ auto main() -> int { rules.emplace_back(m_id_symbol.size(), std::move(var_ast->m_regex_ptr)); m_id_symbol[m_id_symbol.size()] = var_ast->m_name; } - RegexNFA nfa(std::move(rules)); + Nfa nfa(std::move(rules)); auto dfa = ByteLexer::nfa_to_dfa(nfa); get_intersect_for_query(m_id_symbol, dfa, "*1*"); get_intersect_for_query(m_id_symbol, dfa, "*a*"); diff --git a/src/log_surgeon/Buffer.hpp b/src/log_surgeon/Buffer.hpp index 2a35195a..de51c360 100644 --- a/src/log_surgeon/Buffer.hpp +++ b/src/log_surgeon/Buffer.hpp @@ -72,8 +72,8 @@ class Buffer { // 
Currently needed for compression [[nodiscard]] auto get_mutable_active_buffer() -> Item* { return m_active_storage; } - void - copy(Item const* storage_to_copy_first, Item const* storage_to_copy_last, uint32_t offset) { + auto copy(Item const* storage_to_copy_first, Item const* storage_to_copy_last, uint32_t offset) + -> void { std::copy(storage_to_copy_first, storage_to_copy_last, m_active_storage + offset); } diff --git a/src/log_surgeon/BufferParser.hpp b/src/log_surgeon/BufferParser.hpp index 75eb41a4..4a1a8e78 100644 --- a/src/log_surgeon/BufferParser.hpp +++ b/src/log_surgeon/BufferParser.hpp @@ -20,7 +20,7 @@ class BufferParser { /** * Constructs the parser using the given schema file. * @param schema_file_path - * @throw std::runtime_error from LALR1Parser, RegexAST, or Lexer + * @throw std::runtime_error from Lalr1Parser, RegexAST, or Lexer * describing the failure parsing the schema file or processing the schema * AST. */ @@ -29,7 +29,7 @@ class BufferParser { /** * Constructs the parser using the given schema AST. * @param schema_ast - * @throw std::runtime_error from LALR1Parser, RegexAST, or Lexer + * @throw std::runtime_error from Lalr1Parser, RegexAST, or Lexer * describing the failure processing the schema AST. */ explicit BufferParser(std::unique_ptr schema_ast); diff --git a/src/log_surgeon/LALR1Parser.cpp b/src/log_surgeon/Lalr1Parser.cpp similarity index 66% rename from src/log_surgeon/LALR1Parser.cpp rename to src/log_surgeon/Lalr1Parser.cpp index 5268091d..a7f54a63 100644 --- a/src/log_surgeon/LALR1Parser.cpp +++ b/src/log_surgeon/Lalr1Parser.cpp @@ -1,4 +1,4 @@ -#include "LALR1Parser.hpp" +#include "Lalr1Parser.hpp" namespace log_surgeon { MatchedSymbol NonTerminal::m_all_children[cSizeOfAllChildren]; @@ -8,9 +8,9 @@ ParserAST::~ParserAST() = default; uint32_t NonTerminal::m_next_children_start = 0; NonTerminal::NonTerminal(Production* p) - : m_children_start(NonTerminal::m_next_children_start), + : m_children_start(m_next_children_start), m_production(p), m_ast(nullptr) { - NonTerminal::m_next_children_start += p->m_body.size(); + m_next_children_start += p->m_body.size(); } } // namespace log_surgeon diff --git a/src/log_surgeon/LALR1Parser.hpp b/src/log_surgeon/Lalr1Parser.hpp similarity index 94% rename from src/log_surgeon/LALR1Parser.hpp rename to src/log_surgeon/Lalr1Parser.hpp index c31e78e0..15801689 100644 --- a/src/log_surgeon/LALR1Parser.hpp +++ b/src/log_surgeon/Lalr1Parser.hpp @@ -120,7 +120,7 @@ struct Production { * has nothing on its LHS (i.e., HEAD -> {}) * @return bool */ - [[nodiscard]] auto is_epsilon() const -> bool { return this->m_body.empty(); } + [[nodiscard]] auto is_epsilon() const -> bool { return m_body.empty(); } uint32_t m_index; uint32_t m_head; @@ -158,16 +158,14 @@ struct Item { * @return bool */ [[nodiscard]] auto has_dot_at_end() const -> bool { - return this->m_dot == this->m_production->m_body.size(); + return m_dot == m_production->m_body.size(); } /** * Returns the next unmatched symbol in the production based on the dot. 
* @return uint32_t */ - [[nodiscard]] auto next_symbol() const -> uint32_t { - return this->m_production->m_body.at(this->m_dot); - } + [[nodiscard]] auto next_symbol() const -> uint32_t { return m_production->m_body.at(m_dot); } Production* m_production; uint32_t m_dot; @@ -202,10 +200,10 @@ struct ItemSet { std::vector m_actions; }; -template -class LALR1Parser : public Parser { +template +class Lalr1Parser : public Parser { public: - LALR1Parser(); + Lalr1Parser(); /** * Add a lexical rule to m_lexer @@ -214,7 +212,7 @@ class LALR1Parser : public Parser { */ auto add_rule( std::string const& name, - std::unique_ptr> rule + std::unique_ptr> rule ) -> void override; /** @@ -224,7 +222,7 @@ class LALR1Parser : public Parser { */ auto add_token_group( std::string const& name, - std::unique_ptr> rule_group + std::unique_ptr> rule_group ) -> void; /** @@ -276,7 +274,7 @@ class LALR1Parser : public Parser { */ auto report_error() -> std::string; - /* Lexer m_lexer; */ + /* Lexer m_lexer; */ std::stack m_parse_stack_matches; std::stack m_parse_stack_states; ItemSet* m_root_item_set_ptr{nullptr}; @@ -396,6 +394,8 @@ class LALR1Parser : public Parser { auto symbol_is_token(uint32_t s) -> bool { return m_terminals.find(s) != m_terminals.end(); } + using Parser::m_lexer; + std::set m_terminals; std::set m_nullable; std::map, std::unique_ptr> m_lr0_item_sets; @@ -407,6 +407,6 @@ class LALR1Parser : public Parser { }; } // namespace log_surgeon -#include "LALR1Parser.tpp" +#include "Lalr1Parser.tpp" #endif // LOG_SURGEON_LALR1_PARSER_HPP diff --git a/src/log_surgeon/LALR1Parser.tpp b/src/log_surgeon/Lalr1Parser.tpp similarity index 67% rename from src/log_surgeon/LALR1Parser.tpp rename to src/log_surgeon/Lalr1Parser.tpp index 2edfa29a..f7021a25 100644 --- a/src/log_surgeon/LALR1Parser.tpp +++ b/src/log_surgeon/Lalr1Parser.tpp @@ -53,8 +53,8 @@ namespace { } } // namespace -template -LALR1Parser::LALR1Parser() { +template +Lalr1Parser::Lalr1Parser() { m_terminals.insert((uint32_t)SymbolId::TokenEnd); m_terminals.insert((uint32_t)SymbolId::TokenUncaughtString); m_terminals.insert((uint32_t)SymbolId::TokenInt); @@ -65,43 +65,42 @@ LALR1Parser::LALR1Parser() { m_terminals.insert((uint32_t)SymbolId::TokenNewline); } -template -void LALR1Parser::add_rule( +template +auto Lalr1Parser::add_rule( std::string const& name, - std::unique_ptr> rule -) { - Parser::add_rule(name, std::move(rule)); - m_terminals.insert(this->m_lexer.m_symbol_id[name]); + std::unique_ptr> rule +) -> void { + Parser::add_rule(name, std::move(rule)); + m_terminals.insert(m_lexer.m_symbol_id[name]); } -template -void LALR1Parser::add_token_group( +template +auto Lalr1Parser::add_token_group( std::string const& name, - std::unique_ptr> rule_group -) { + std::unique_ptr> rule_group +) -> void { add_rule(name, std::move(rule_group)); } -template -void LALR1Parser::add_token_chain( +template +auto Lalr1Parser::add_token_chain( std::string const& name, std::string const& chain -) { +) -> void { assert(chain.size() > 1); - std::unique_ptr> first_char_rule - = std::make_unique>(chain[0]); - std::unique_ptr> second_char_rule - = std::make_unique>(chain[1]); - std::unique_ptr> rule_chain - = std::make_unique>( - std::move(first_char_rule), - std::move(second_char_rule) - ); + auto first_char_rule + = std::make_unique>(chain[0]); + auto second_char_rule + = std::make_unique>(chain[1]); + auto rule_chain = std::make_unique>( + std::move(first_char_rule), + std::move(second_char_rule) + ); for (uint32_t i = 2; i < chain.size(); i++) { - char 
next_char = chain[i]; - std::unique_ptr> next_char_rule - = std::make_unique>(next_char); - rule_chain = std::make_unique>( + auto next_char = chain[i]; + auto next_char_rule + = std::make_unique>(next_char); + rule_chain = std::make_unique>( std::move(rule_chain), std::move(next_char_rule) ); @@ -109,15 +108,15 @@ void LALR1Parser::add_token_chain( add_rule(name, std::move(rule_chain)); } -template -auto LALR1Parser::add_production( +template +auto Lalr1Parser::add_production( std::string const& head, std::vector const& body, SemanticRule semantic_rule ) -> uint32_t { - if (this->m_lexer.m_symbol_id.find(head) == this->m_lexer.m_symbol_id.end()) { - this->m_lexer.m_symbol_id[head] = this->m_lexer.m_symbol_id.size(); - this->m_lexer.m_id_symbol[this->m_lexer.m_symbol_id[head]] = head; + if (m_lexer.m_symbol_id.find(head) == m_lexer.m_symbol_id.end()) { + m_lexer.m_symbol_id[head] = m_lexer.m_symbol_id.size(); + m_lexer.m_id_symbol[m_lexer.m_symbol_id[head]] = head; } uint32_t n = m_productions.size(); auto it = m_productions_map.find(head); @@ -131,13 +130,13 @@ auto LALR1Parser::add_production( } std::unique_ptr p(new Production); p->m_index = n; - p->m_head = this->m_lexer.m_symbol_id[head]; + p->m_head = m_lexer.m_symbol_id[head]; for (std::string const& symbol_string : body) { - if (this->m_lexer.m_symbol_id.find(symbol_string) == this->m_lexer.m_symbol_id.end()) { - this->m_lexer.m_symbol_id[symbol_string] = this->m_lexer.m_symbol_id.size(); - this->m_lexer.m_id_symbol[this->m_lexer.m_symbol_id[symbol_string]] = symbol_string; + if (m_lexer.m_symbol_id.find(symbol_string) == m_lexer.m_symbol_id.end()) { + m_lexer.m_symbol_id[symbol_string] = m_lexer.m_symbol_id.size(); + m_lexer.m_id_symbol[m_lexer.m_symbol_id[symbol_string]] = symbol_string; } - p->m_body.push_back(this->m_lexer.m_symbol_id[symbol_string]); + p->m_body.push_back(m_lexer.m_symbol_id[symbol_string]); } p->m_semantic_rule = std::move(semantic_rule); m_non_terminals.insert(std::pair>(p->m_head, {})); @@ -150,9 +149,9 @@ auto LALR1Parser::add_production( return n; } -template -void LALR1Parser::generate() { - this->m_lexer.generate(); +template +auto Lalr1Parser::generate() -> void { + m_lexer.generate(); assert(!m_productions.empty()); generate_lr0_kernels(); generate_first_sets(); @@ -160,29 +159,29 @@ void LALR1Parser::generate() { generate_lalr1_parsing_table(); } -template -void LALR1Parser::generate_lr0_kernels() { - Production* root_production_ptr = m_productions[m_root_production_id].get(); +template +auto Lalr1Parser::generate_lr0_kernels() -> void { + auto* root_production_ptr = m_productions[m_root_production_id].get(); Item root_item(root_production_ptr, 0, cNullSymbol); - std::unique_ptr item_set0 = std::make_unique(); + auto item_set0 = std::make_unique(); item_set0->m_kernel.insert(root_item); std::deque unused_item_sets; item_set0->m_index = m_lr0_item_sets.size(); unused_item_sets.push_back(item_set0.get()); m_lr0_item_sets[item_set0->m_kernel] = std::move(item_set0); while (!unused_item_sets.empty()) { - ItemSet* item_set_ptr = unused_item_sets.back(); + auto* item_set_ptr = unused_item_sets.back(); unused_item_sets.pop_back(); generate_lr0_closure(item_set_ptr); - for (uint32_t const& next_symbol : m_terminals) { - ItemSet* new_item_set_ptr = go_to(item_set_ptr, next_symbol); + for (auto const& next_symbol : m_terminals) { + auto* new_item_set_ptr = go_to(item_set_ptr, next_symbol); if (new_item_set_ptr != nullptr) { unused_item_sets.push_back(new_item_set_ptr); } } - for (std::map>::value_type const& 
kv : m_non_terminals) { - uint32_t next_symbol = kv.first; - ItemSet* new_item_set_ptr = go_to(item_set_ptr, next_symbol); + for (auto const& kv : m_non_terminals) { + auto next_symbol = kv.first; + auto* new_item_set_ptr = go_to(item_set_ptr, next_symbol); if (new_item_set_ptr != nullptr) { unused_item_sets.push_back(new_item_set_ptr); } @@ -190,8 +189,8 @@ void LALR1Parser::generate_lr0_kernels() { } } -template -auto LALR1Parser::lr_closure_helper( +template +auto Lalr1Parser::lr_closure_helper( ItemSet* item_set_ptr, Item const* item, uint32_t* next_symbol @@ -204,20 +203,21 @@ auto LALR1Parser::lr_closure_helper( return true; } *next_symbol = item->next_symbol(); - if (this->symbol_is_token(*next_symbol)) { + if (symbol_is_token(*next_symbol)) { return true; } return false; } -template -void LALR1Parser::generate_lr0_closure(ItemSet* item_set_ptr) { +template +auto Lalr1Parser::generate_lr0_closure(ItemSet* item_set_ptr +) -> void { std::deque q( item_set_ptr->m_kernel.begin(), item_set_ptr->m_kernel.end() ); // {{S'->(dot)S, ""}} while (!q.empty()) { - Item item = q.back(); // {S'->(dot)S, ""} + auto item = q.back(); // {S'->(dot)S, ""} q.pop_back(); uint32_t next_symbol = 0; if (lr_closure_helper(item_set_ptr, &item, &next_symbol)) { @@ -233,14 +233,14 @@ void LALR1Parser::generate_lr0_closure(ItemSet* item } } -template -auto LALR1Parser::go_to( +template +auto Lalr1Parser::go_to( ItemSet* from_item_set, uint32_t const& next_symbol ) -> ItemSet* { - std::unique_ptr next_item_set_ptr = std::make_unique(); + auto next_item_set_ptr = std::make_unique(); assert(from_item_set != nullptr); - for (Item const& item : from_item_set->m_closure) { + for (auto const& item : from_item_set->m_closure) { if (item.has_dot_at_end()) { continue; } @@ -253,7 +253,7 @@ auto LALR1Parser::go_to( return nullptr; } if (m_lr0_item_sets.find(next_item_set_ptr->m_kernel) != m_lr0_item_sets.end()) { - ItemSet* existing_item_set_ptr = m_lr0_item_sets[next_item_set_ptr->m_kernel].get(); + auto* existing_item_set_ptr = m_lr0_item_sets[next_item_set_ptr->m_kernel].get(); m_go_to_table[from_item_set->m_index][next_symbol] = existing_item_set_ptr->m_index; from_item_set->m_next[next_symbol] = existing_item_set_ptr; } else { @@ -266,24 +266,24 @@ auto LALR1Parser::go_to( return nullptr; } -template -void LALR1Parser::generate_first_sets() { +template +auto Lalr1Parser::generate_first_sets() -> void { for (uint32_t const& s : m_terminals) { m_firsts.insert(std::pair>(s, {s})); } - bool changed = true; + auto changed = true; while (changed) { changed = false; - for (std::unique_ptr const& p : m_productions) { - std::set& f = m_firsts[p->m_head]; + for (auto const& p : m_productions) { + auto& f = m_firsts[p->m_head]; if (p->is_epsilon()) { changed = changed || m_nullable.insert(p->m_head).second; continue; } - size_t old = f.size(); + auto old = f.size(); size_t i = 0; - for (uint32_t const& s : p->m_body) { - std::set& f2 = m_firsts[s]; + for (auto const& s : p->m_body) { + auto& f2 = m_firsts[s]; f.insert(f2.begin(), f2.end()); if (m_nullable.find(s) == m_nullable.end()) { break; @@ -298,15 +298,14 @@ void LALR1Parser::generate_first_sets() { } } -template -void LALR1Parser::generate_lr1_item_sets() { - for (std::map, std::unique_ptr>::value_type const& kv : m_lr0_item_sets) - { - for (Item const& l0_item : kv.second->m_kernel) { +template +auto Lalr1Parser::generate_lr1_item_sets() -> void { + for (auto const& kv : m_lr0_item_sets) { + for (auto const& l0_item : kv.second->m_kernel) { ItemSet temp_item_set; 
temp_item_set.m_kernel.insert(l0_item); generate_lr1_closure(&temp_item_set); - for (Item const& l1_item : temp_item_set.m_closure) { + for (auto const& l1_item : temp_item_set.m_closure) { if (l1_item.m_lookahead != cNullSymbol) { m_spontaneous_map[l1_item.m_production].insert(l1_item.m_lookahead); } else { @@ -319,9 +318,8 @@ void LALR1Parser::generate_lr1_item_sets() { } } std::map> lookaheads; - for (std::map, std::unique_ptr>::value_type const& kv : m_lr0_item_sets) - { - for (Item const& l0_item : kv.second->m_kernel) { + for (auto const& kv : m_lr0_item_sets) { + for (auto const& l0_item : kv.second->m_kernel) { lookaheads[l0_item].insert( m_spontaneous_map[l0_item.m_production].begin(), m_spontaneous_map[l0_item.m_production].end() @@ -331,27 +329,26 @@ void LALR1Parser::generate_lr1_item_sets() { } } } - bool changed = true; + auto changed = true; while (changed) { changed = false; - for (std::map>::value_type& kv : m_propagate_map) { - Item item_from = kv.first; - for (Item const& item_to : kv.second) { - size_t size_before = lookaheads[item_to].size(); + for (auto& kv : m_propagate_map) { + auto item_from = kv.first; + for (auto const& item_to : kv.second) { + auto size_before = lookaheads[item_to].size(); lookaheads[item_to].insert( lookaheads[item_from].begin(), lookaheads[item_from].end() ); - size_t size_after = lookaheads[item_to].size(); + auto size_after = lookaheads[item_to].size(); changed = changed || size_after > size_before; } } } - for (std::map, std::unique_ptr>::value_type const& kv : m_lr0_item_sets) - { - std::unique_ptr lr1_item_set_ptr = std::make_unique(); - for (Item const& l0_item : kv.second->m_kernel) { - for (int const& lookahead : lookaheads[l0_item]) { + for (auto const& kv : m_lr0_item_sets) { + auto lr1_item_set_ptr = std::make_unique(); + for (auto const& l0_item : kv.second->m_kernel) { + for (auto const& lookahead : lookaheads[l0_item]) { Item lr1_item(l0_item.m_production, l0_item.m_dot, lookahead); lr1_item_set_ptr->m_kernel.insert(lr1_item); } @@ -366,13 +363,9 @@ void LALR1Parser::generate_lr1_item_sets() { m_lr1_item_sets[lr1_item_set_ptr->m_kernel] = std::move(lr1_item_set_ptr); } // this seems like the wrong way to do this still: - for (std::map, std::unique_ptr>::value_type const& kv1 : - m_lr1_item_sets) - { + for (auto const& kv1 : m_lr1_item_sets) { for (auto const& next_index : m_go_to_table[kv1.second->m_index]) { - for (std::map, std::unique_ptr>::value_type const& kv2 : - m_lr1_item_sets) - { + for (auto const& kv2 : m_lr1_item_sets) { if (next_index.second == kv2.second->m_index) { kv1.second->m_next[next_index.first] = kv2.second.get(); break; @@ -382,21 +375,22 @@ void LALR1Parser::generate_lr1_item_sets() { } } -template -void LALR1Parser::generate_lr1_closure(ItemSet* item_set_ptr) { +template +auto Lalr1Parser::generate_lr1_closure(ItemSet* item_set_ptr +) -> void { std::deque queue(item_set_ptr->m_kernel.begin(), item_set_ptr->m_kernel.end()); while (!queue.empty()) { - Item item = queue.back(); + auto item = queue.back(); queue.pop_back(); uint32_t next_symbol = 0; if (lr_closure_helper(item_set_ptr, &item, &next_symbol)) { continue; } std::vector lookaheads; - size_t pos = item.m_dot + 1; + auto pos = item.m_dot + 1; while (pos < item.m_production->m_body.size()) { - uint32_t symbol = item.m_production->m_body.at(pos); - std::set symbol_firsts = m_firsts.find(symbol)->second; + auto symbol = item.m_production->m_body.at(pos); + auto symbol_firsts = m_firsts.find(symbol)->second; lookaheads.insert( lookaheads.end(), 
std::make_move_iterator(symbol_firsts.begin()), @@ -410,33 +404,32 @@ void LALR1Parser::generate_lr1_closure(ItemSet* item if (pos == item.m_production->m_body.size()) { lookaheads.push_back(item.m_lookahead); } - for (Production* const p : m_non_terminals.at(next_symbol)) { - for (uint32_t const& l : lookaheads) { + for (auto* const p : m_non_terminals.at(next_symbol)) { + for (auto const& l : lookaheads) { queue.emplace_back(p, 0, l); } } } } -template -void LALR1Parser::generate_lalr1_parsing_table() { +template +auto Lalr1Parser::generate_lalr1_parsing_table() -> void { generate_lalr1_goto(); generate_lalr1_action(); } -template -void LALR1Parser::generate_lalr1_goto() { +template +auto Lalr1Parser::generate_lalr1_goto() -> void { // done already at end of generate_lr1_item_sets()? } // Dragon book page 253 -template -void LALR1Parser::generate_lalr1_action() { - for (std::map, std::unique_ptr>::value_type const& kv : m_lr1_item_sets) - { - ItemSet* item_set_ptr = kv.second.get(); - item_set_ptr->m_actions.resize(this->m_lexer.m_symbol_id.size(), false); - for (Item const& item : item_set_ptr->m_closure) { +template +auto Lalr1Parser::generate_lalr1_action() -> void { + for (auto const& kv : m_lr1_item_sets) { + auto* item_set_ptr = kv.second.get(); + item_set_ptr->m_actions.resize(m_lexer.m_symbol_id.size(), false); + for (auto const& item : item_set_ptr->m_closure) { if (!item.has_dot_at_end()) { if (m_terminals.find(item.next_symbol()) == m_terminals.end() && m_non_terminals.find(item.next_symbol()) == m_non_terminals.end()) @@ -444,7 +437,7 @@ void LALR1Parser::generate_lalr1_action() { continue; } assert(item_set_ptr->m_next.find(item.next_symbol()) != item_set_ptr->m_next.end()); - Action& action = item_set_ptr->m_actions[item.next_symbol()]; + auto& action = item_set_ptr->m_actions[item.next_symbol()]; if (!std::holds_alternative(action)) { if (std::holds_alternative(action) && std::get(action) == item_set_ptr->m_next[item.next_symbol()]) @@ -453,7 +446,7 @@ void LALR1Parser::generate_lalr1_action() { } std::string conflict_msg{}; conflict_msg += "For symbol "; - conflict_msg += this->m_lexer.m_id_symbol[item.next_symbol()]; + conflict_msg += m_lexer.m_id_symbol[item.next_symbol()]; conflict_msg += ", adding shift to "; conflict_msg += std::to_string(item_set_ptr->m_next[item.next_symbol()]->m_index); @@ -464,11 +457,10 @@ void LALR1Parser::generate_lalr1_action() { conflict_msg += "\n"; } else { conflict_msg += "shift-reduce conflict with reduction "; - conflict_msg - += this->m_lexer.m_id_symbol[std::get(action)->m_head]; + conflict_msg += m_lexer.m_id_symbol[std::get(action)->m_head]; conflict_msg += "-> {"; for (uint32_t symbol : std::get(action)->m_body) { - conflict_msg += this->m_lexer.m_id_symbol[symbol] + ","; + conflict_msg += m_lexer.m_id_symbol[symbol] + ","; } conflict_msg += "}\n"; } @@ -482,16 +474,16 @@ void LALR1Parser::generate_lalr1_action() { Action action = true; item_set_ptr->m_actions[(uint32_t)SymbolId::TokenEnd] = action; } else { - Action& action = item_set_ptr->m_actions[item.m_lookahead]; + auto& action = item_set_ptr->m_actions[item.m_lookahead]; if (!std::holds_alternative(action)) { std::string conflict_msg{}; conflict_msg += "For symbol "; - conflict_msg += this->m_lexer.m_id_symbol[item.m_lookahead]; + conflict_msg += m_lexer.m_id_symbol[item.m_lookahead]; conflict_msg += ", adding reduction "; - conflict_msg += this->m_lexer.m_id_symbol[item.m_production->m_head]; + conflict_msg += m_lexer.m_id_symbol[item.m_production->m_head]; conflict_msg += 
"-> {"; for (uint32_t symbol : item.m_production->m_body) { - conflict_msg += this->m_lexer.m_id_symbol[symbol] + ","; + conflict_msg += m_lexer.m_id_symbol[symbol] + ","; } conflict_msg += "} causes "; if (std::holds_alternative(action)) { @@ -501,11 +493,10 @@ void LALR1Parser::generate_lalr1_action() { } else { conflict_msg += "reduce-reduce conflict with reduction "; conflict_msg - += this->m_lexer - .m_id_symbol[std::get(action)->m_head]; + += m_lexer.m_id_symbol[std::get(action)->m_head]; conflict_msg += "-> {"; for (uint32_t symbol : std::get(action)->m_body) { - conflict_msg += this->m_lexer.m_id_symbol[symbol] + ","; + conflict_msg += m_lexer.m_id_symbol[symbol] + ","; } conflict_msg += "}\n"; } @@ -518,14 +509,14 @@ void LALR1Parser::generate_lalr1_action() { } } -template -auto LALR1Parser::get_input_after_last_newline( +template +auto Lalr1Parser::get_input_after_last_newline( std::stack& parse_stack_matches ) -> std::string { std::string error_message_reversed; - bool done = false; + auto done = false; while (!parse_stack_matches.empty() && !done) { - MatchedSymbol top_symbol = std::move(parse_stack_matches.top()); + auto top_symbol = std::move(parse_stack_matches.top()); parse_stack_matches.pop(); std::visit( Overloaded{ @@ -536,7 +527,7 @@ auto LALR1Parser::get_input_after_last_newline( // input is being read backwards, so reverse // each token so that when the entire input is // reversed each token is displayed correctly - std::string token_string = token.to_string(); + auto token_string = token.to_string(); std::reverse(token_string.begin(), token_string.end()); error_message_reversed += token_string; } @@ -557,15 +548,15 @@ auto LALR1Parser::get_input_after_last_newline( return error_message_reversed; } -template -auto LALR1Parser::get_input_until_next_newline(Token* error_token +template +auto Lalr1Parser::get_input_until_next_newline(Token* error_token ) -> std::string { std::string rest_of_line; - bool next_is_end_token = (error_token->m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenEnd); - bool next_has_newline = (error_token->to_string().find('\n') != std::string::npos) + auto next_is_end_token = (error_token->m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenEnd); + auto next_has_newline = (error_token->to_string().find('\n') != std::string::npos) || (error_token->to_string().find('\r') != std::string::npos); while (!next_has_newline && !next_is_end_token) { - Token token = get_next_symbol(); + auto token = get_next_symbol(); next_has_newline = (token.to_string().find('\n') != std::string::npos) || (token.to_string().find('\r') != std::string::npos); if (!next_has_newline) { @@ -577,19 +568,19 @@ auto LALR1Parser::get_input_until_next_newline(Token return rest_of_line; } -template -auto LALR1Parser::report_error() -> std::string { +template +auto Lalr1Parser::report_error() -> std::string { assert(m_next_token == std::nullopt); assert(!m_parse_stack_matches.empty()); - MatchedSymbol top_symbol = std::move(m_parse_stack_matches.top()); + auto top_symbol = std::move(m_parse_stack_matches.top()); m_parse_stack_matches.pop(); - uint32_t line_num = get_line_num(top_symbol); - Token token = std::get(top_symbol); - std::string consumed_input = get_input_after_last_newline(m_parse_stack_matches); + auto line_num = get_line_num(top_symbol); + auto token = std::get(top_symbol); + auto consumed_input = get_input_after_last_newline(m_parse_stack_matches); std::string error_type{}; std::string error_indicator; - Token error_token = token; - std::string rest_of_line = 
get_input_until_next_newline(&error_token); + auto error_token = token; + auto rest_of_line = get_input_until_next_newline(&error_token); for (uint32_t i = 0; i < consumed_input.size() + 10; i++) { error_indicator += " "; } @@ -600,17 +591,17 @@ auto LALR1Parser::report_error() -> std::string { } else { error_type = "expected "; for (uint32_t i = 0; i < m_parse_stack_states.top()->m_actions.size(); i++) { - Action action = m_parse_stack_states.top()->m_actions[i]; + auto action = m_parse_stack_states.top()->m_actions[i]; if (action.index() != 0) { error_type += "'"; if (auto* regex_ast_literal - = dynamic_cast*>( - this->m_lexer.get_rule(i) + = dynamic_cast*>( + m_lexer.get_rule(i) )) { error_type += unescape(char(regex_ast_literal->get_character())); } else { - error_type += this->m_lexer.m_id_symbol[i]; + error_type += m_lexer.m_id_symbol[i]; } error_type += "',"; } @@ -618,9 +609,9 @@ auto LALR1Parser::report_error() -> std::string { error_type.pop_back(); error_type += " before '" + unescape(token.to_string()[0]) + "' token"; } - std::string error_string = "Schema:" + std::to_string(line_num + 1) + ":" - + std::to_string(consumed_input.size() + 1) - + ": error: " + error_type + "\n"; + auto error_string = "Schema:" + std::to_string(line_num + 1) + ":" + + std::to_string(consumed_input.size() + 1) + ": error: " + error_type + + "\n"; for (int i = 0; i < 10; i++) { error_string += " "; } @@ -628,14 +619,14 @@ auto LALR1Parser::report_error() -> std::string { return error_string; } -template -auto LALR1Parser::parse(Reader& reader) -> NonTerminal { +template +auto Lalr1Parser::parse(Reader& reader) -> NonTerminal { reset(); m_parse_stack_states.push(m_root_item_set_ptr); - bool accept = false; + auto accept = false; while (true) { m_input_buffer.read_if_safe(reader); - Token next_terminal = get_next_symbol(); + auto next_terminal = get_next_symbol(); if (parse_advance(next_terminal, &accept)) { break; } @@ -644,14 +635,14 @@ auto LALR1Parser::parse(Reader& reader) -> NonTermin throw std::runtime_error(report_error()); } assert(!m_parse_stack_matches.empty()); - MatchedSymbol m = std::move(m_parse_stack_matches.top()); + auto m = std::move(m_parse_stack_matches.top()); m_parse_stack_matches.pop(); assert(m_parse_stack_matches.empty()); return std::move(std::get(m)); } -template -void LALR1Parser::reset() { +template +auto Lalr1Parser::reset() -> void { m_next_token = std::nullopt; while (!m_parse_stack_states.empty()) { m_parse_stack_states.pop(); @@ -660,27 +651,25 @@ void LALR1Parser::reset() { m_parse_stack_matches.pop(); } m_input_buffer.reset(); - this->m_lexer.reset(); + m_lexer.reset(); } -template -auto LALR1Parser::get_next_symbol() -> Token { +template +auto Lalr1Parser::get_next_symbol() -> Token { if (m_next_token == std::nullopt) { Token token; - if (ErrorCode error = this->m_lexer.scan(m_input_buffer, token); - ErrorCode::Success != error) - { + if (auto error = m_lexer.scan(m_input_buffer, token); ErrorCode::Success != error) { throw std::runtime_error("Error scanning in lexer."); } return token; } - Token s = m_next_token.value(); + auto s = m_next_token.value(); m_next_token = std::nullopt; return s; } -template -auto LALR1Parser::parse_advance(Token& next_token, bool* accept) +template +auto Lalr1Parser::parse_advance(Token& next_token, bool* accept) -> bool { for (auto const type : *next_token.m_type_ids_ptr) { if (parse_symbol(type, next_token, accept)) { @@ -693,14 +682,14 @@ auto LALR1Parser::parse_advance(Token& next_token, b return true; } -template -auto 
LALR1Parser::parse_symbol( +template +auto Lalr1Parser::parse_symbol( uint32_t const& type_id, Token& next_token, bool* accept ) -> bool { - ItemSet* curr = m_parse_stack_states.top(); - Action& it = curr->m_actions[type_id]; + auto* curr = m_parse_stack_states.top(); + auto& it = curr->m_actions[type_id]; bool ret = false; std::visit( Overloaded{ @@ -722,7 +711,7 @@ auto LALR1Parser::parse_symbol( [&ret, &next_token, this](Production* reduce) { m_next_token = next_token; NonTerminal matched_non_terminal(reduce); - size_t n = reduce->m_body.size(); + auto n = reduce->m_body.size(); for (size_t i = 0; i < n; i++) { m_parse_stack_states.pop(); assert((matched_non_terminal.m_children_start + n - i - 1) @@ -743,9 +732,8 @@ auto LALR1Parser::parse_symbol( matched_non_terminal.m_ast = reduce->m_semantic_rule(&matched_non_terminal); } - ItemSet* curr = m_parse_stack_states.top(); - Action const& it - = curr->m_actions[matched_non_terminal.m_production->m_head]; + auto* curr = m_parse_stack_states.top(); + auto const& it = curr->m_actions[matched_non_terminal.m_production->m_head]; m_parse_stack_states.push(std::get(it)); m_parse_stack_matches.emplace(std::move(matched_non_terminal)); ret = true; diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index 726ff68f..a392502a 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -11,16 +11,16 @@ #include #include +#include +#include +#include #include -#include -#include -#include #include #include #include namespace log_surgeon { -template +template class Lexer { public: static inline std::vector const cTokenEndTypes = {(uint32_t)SymbolId::TokenEnd}; @@ -29,11 +29,11 @@ class Lexer { /** * Generate a DFA from an NFA - * @param finite_automata::RegexNFA nfa - * @return std::unique_ptr> + * @param finite_automata::Nfa nfa + * @return std::unique_ptr> */ - static auto nfa_to_dfa(finite_automata::RegexNFA& nfa - ) -> std::unique_ptr>; + static auto nfa_to_dfa(finite_automata::Nfa& nfa + ) -> std::unique_ptr>; /** * Add a delimiters line from the schema to the lexer @@ -46,15 +46,17 @@ class Lexer { * @param id * @param regex */ - auto add_rule(uint32_t const& id, std::unique_ptr> rule) - -> void; + auto add_rule( + uint32_t const& id, + std::unique_ptr> rule + ) -> void; /** * Return regex pattern for a rule name * @param variable_id * @return finite_automata::RegexAST* */ - auto get_rule(uint32_t variable_id) -> finite_automata::RegexAST*; + auto get_rule(uint32_t variable_id) -> finite_automata::RegexAST*; /** * Generate DFA for lexer @@ -124,7 +126,7 @@ class Lexer { } [[nodiscard]] auto get_dfa( - ) const -> std::unique_ptr> const& { + ) const -> std::unique_ptr> const& { return m_dfa; } @@ -136,7 +138,7 @@ class Lexer { * Return epsilon_closure over m_epsilon_transitions * @return */ - static auto epsilon_closure(NFAStateType const* state_ptr) -> std::set; + static auto epsilon_closure(TypedNfaState const* state_ptr) -> std::set; /** * Get next character from the input buffer @@ -154,17 +156,17 @@ class Lexer { std::set m_type_ids_set; std::array m_is_delimiter{false}; std::array m_is_first_char{false}; - std::vector> m_rules; + std::vector> m_rules; uint32_t m_line{0}; bool m_has_delimiters{false}; - std::unique_ptr> m_dfa; + std::unique_ptr> m_dfa; bool m_asked_for_more_data{false}; - DFAStateType const* m_prev_state{nullptr}; + TypedDfaState const* m_prev_state{nullptr}; }; namespace lexers { -using ByteLexer = Lexer; -using UTF8Lexer = Lexer; +using ByteLexer = Lexer; +using Utf8Lexer = Lexer; } // namespace 
lexers } // namespace log_surgeon diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 8a8aeb33..c6bfdba8 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -17,8 +17,8 @@ * 4 byte: 0x10000 - 0x1FFFFF : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ namespace log_surgeon { -template -void Lexer::flip_states(uint32_t old_storage_size) { +template +auto Lexer::flip_states(uint32_t old_storage_size) -> void { if (m_match_pos >= old_storage_size / 2) { m_match_pos -= old_storage_size / 2; } else { @@ -38,10 +38,10 @@ void Lexer::flip_states(uint32_t old_storage_size) { } } -template -auto Lexer::scan(ParserInputBuffer& input_buffer, Token& token) +template +auto Lexer::scan(ParserInputBuffer& input_buffer, Token& token) -> ErrorCode { - DFAStateType const* state = m_dfa->get_root(); + auto const* state = m_dfa->get_root(); if (m_asked_for_more_data) { state = m_prev_state; m_asked_for_more_data = false; @@ -65,9 +65,10 @@ auto Lexer::scan(ParserInputBuffer& input_buffer, To m_type_ids = nullptr; } while (true) { - uint32_t prev_byte_buf_pos = input_buffer.storage().pos(); - unsigned char next_char{utf8::cCharErr}; - if (ErrorCode err = input_buffer.get_next_character(next_char); ErrorCode::Success != err) { + auto prev_byte_buf_pos = input_buffer.storage().pos(); + auto next_char{utf8::cCharErr}; + if (auto const err = input_buffer.get_next_character(next_char); ErrorCode::Success != err) + { m_asked_for_more_data = true; m_prev_state = state; return err; @@ -80,7 +81,7 @@ auto Lexer::scan(ParserInputBuffer& input_buffer, To m_match_pos = prev_byte_buf_pos; m_match_line = m_line; } - DFAStateType* next = state->next(next_char); + auto* next = state->next(next_char); if (next_char == '\n') { m_line++; if (m_has_delimiters && !m_match) { @@ -166,13 +167,13 @@ auto Lexer::scan(ParserInputBuffer& input_buffer, To } // TODO: this is duplicating almost all the code of scan() -template -auto Lexer::scan_with_wildcard( +template +auto Lexer::scan_with_wildcard( ParserInputBuffer& input_buffer, char wildcard, Token& token ) -> ErrorCode { - DFAStateType const* state = m_dfa->get_root(); + auto const* state = m_dfa->get_root(); if (m_asked_for_more_data) { state = m_prev_state; m_asked_for_more_data = false; @@ -196,7 +197,7 @@ auto Lexer::scan_with_wildcard( m_type_ids = nullptr; } while (true) { - uint32_t prev_byte_buf_pos = input_buffer.storage().pos(); + auto prev_byte_buf_pos = input_buffer.storage().pos(); unsigned char next_char{utf8::cCharErr}; if (ErrorCode err = input_buffer.get_next_character(next_char); ErrorCode::Success != err) { m_asked_for_more_data = true; @@ -211,7 +212,7 @@ auto Lexer::scan_with_wildcard( m_match_pos = prev_byte_buf_pos; m_match_line = m_line; } - DFAStateType const* next = state->next(next_char); + TypedDfaState const* next = state->next(next_char); if (next_char == '\n') { m_line++; if (m_has_delimiters && !m_match) { @@ -239,7 +240,7 @@ auto Lexer::scan_with_wildcard( // BFS (keep track of m_type_ids) if (wildcard == '?') { for (uint32_t byte = 0; byte < cSizeOfByte; byte++) { - DFAStateType* next_state = state->next(byte); + auto* next_state = state->next(byte); if (next_state->is_accepting() == false) { token = Token{m_last_match_pos, @@ -252,11 +253,11 @@ auto Lexer::scan_with_wildcard( } } } else if (wildcard == '*') { - std::stack unvisited_states; - std::set visited_states; + std::stack unvisited_states; + std::set visited_states; unvisited_states.push(state); while (!unvisited_states.empty()) { - DFAStateType const* 
current_state = unvisited_states.top(); + TypedDfaState const* current_state = unvisited_states.top(); if (current_state == nullptr || current_state->is_accepting() == false) { token = Token{m_last_match_pos, @@ -273,7 +274,7 @@ auto Lexer::scan_with_wildcard( if (m_is_delimiter[byte]) { continue; } - DFAStateType const* next_state = current_state->next(byte); + TypedDfaState const* next_state = current_state->next(byte); if (visited_states.find(next_state) == visited_states.end()) { unvisited_states.push(next_state); } @@ -299,11 +300,11 @@ auto Lexer::scan_with_wildcard( } } -template -auto Lexer::increase_buffer_capacity(ParserInputBuffer& input_buffer +template +auto Lexer::increase_buffer_capacity(ParserInputBuffer& input_buffer ) -> void { uint32_t old_storage_size{0}; - bool flipped_static_buffer{false}; + auto flipped_static_buffer{false}; input_buffer.increase_capacity(old_storage_size, flipped_static_buffer); if (old_storage_size < input_buffer.storage().size()) { if (flipped_static_buffer) { @@ -316,8 +317,8 @@ auto Lexer::increase_buffer_capacity(ParserInputBuff } } -template -void Lexer::reset() { +template +void Lexer::reset() { m_last_match_pos = 0; m_match = false; m_line = 0; @@ -330,8 +331,8 @@ void Lexer::reset() { m_prev_state = nullptr; } -template -void Lexer::prepend_start_of_file_char(ParserInputBuffer& input_buffer +template +void Lexer::prepend_start_of_file_char(ParserInputBuffer& input_buffer ) { m_prev_state = m_dfa->get_root()->next(utf8::cCharStartOfFile); m_asked_for_more_data = true; @@ -341,30 +342,30 @@ void Lexer::prepend_start_of_file_char(ParserInputBu m_type_ids = nullptr; } -template -void Lexer::add_delimiters(std::vector const& delimiters) { +template +void Lexer::add_delimiters(std::vector const& delimiters) { assert(!delimiters.empty()); m_has_delimiters = true; - for (bool& i : m_is_delimiter) { + for (auto& i : m_is_delimiter) { i = false; } - for (uint32_t delimiter : delimiters) { + for (auto delimiter : delimiters) { m_is_delimiter[delimiter] = true; } m_is_delimiter[utf8::cCharStartOfFile] = true; } -template -void Lexer::add_rule( +template +void Lexer::add_rule( uint32_t const& id, - std::unique_ptr> rule + std::unique_ptr> rule ) { m_rules.emplace_back(id, std::move(rule)); } -template -auto Lexer::get_rule(uint32_t const variable_id -) -> finite_automata::RegexAST* { +template +auto Lexer::get_rule(uint32_t const variable_id +) -> finite_automata::RegexAST* { for (auto const& rule : m_rules) { if (rule.get_variable_id() == variable_id) { return rule.get_regex(); @@ -373,12 +374,12 @@ auto Lexer::get_rule(uint32_t const variable_id return nullptr; } -template -void Lexer::generate() { - finite_automata::RegexNFA nfa{std::move(m_rules)}; +template +void Lexer::generate() { + finite_automata::Nfa nfa{std::move(m_rules)}; // TODO: DFA ignores tags. 
E.g., treats "capture:user=(?\d+)" as "capture:user=\d+" m_dfa = nfa_to_dfa(nfa); - DFAStateType const* state = m_dfa->get_root(); + auto const* state = m_dfa->get_root(); for (uint32_t i = 0; i < cSizeOfByte; i++) { if (state->next(i) != nullptr) { m_is_first_char[i] = true; @@ -388,11 +389,11 @@ void Lexer::generate() { } } -template -auto Lexer::epsilon_closure(NFAStateType const* state_ptr -) -> std::set { - std::set closure_set; - std::stack stack; +template +auto Lexer::epsilon_closure(TypedNfaState const* state_ptr +) -> std::set { + std::set closure_set; + std::stack stack; stack.push(state_ptr); while (!stack.empty()) { auto const* current_state = stack.top(); @@ -425,49 +426,47 @@ auto Lexer::epsilon_closure(NFAStateType const* stat return closure_set; } -template -auto Lexer::nfa_to_dfa(finite_automata::RegexNFA& nfa -) -> std::unique_ptr> { - typedef std::set StateSet; - std::unique_ptr> dfa - = std::make_unique>(); - std::map dfa_states; +template +auto Lexer::nfa_to_dfa(finite_automata::Nfa& nfa +) -> std::unique_ptr> { + typedef std::set StateSet; + auto dfa = std::make_unique>(); + std::map dfa_states; std::stack unmarked_sets; auto create_dfa_state - = [&dfa, &dfa_states, &unmarked_sets](StateSet const& set) -> DFAStateType* { - DFAStateType* state = dfa->new_state(set); + = [&dfa, &dfa_states, &unmarked_sets](StateSet const& set) -> TypedDfaState* { + auto* state = dfa->new_state(set); dfa_states[set] = state; unmarked_sets.push(set); return state; }; - StateSet start_set = epsilon_closure(nfa.get_root()); + auto start_set = epsilon_closure(nfa.get_root()); create_dfa_state(start_set); while (!unmarked_sets.empty()) { - StateSet set = unmarked_sets.top(); + auto set = unmarked_sets.top(); unmarked_sets.pop(); - DFAStateType* dfa_state = dfa_states.at(set); + auto* dfa_state = dfa_states.at(set); std::map ascii_transitions_map; // map transitions_map; - for (NFAStateType const* s0 : set) { + for (TypedNfaState const* s0 : set) { for (uint32_t i = 0; i < cSizeOfByte; i++) { - for (NFAStateType* const s1 : s0->get_byte_transitions(i)) { + for (TypedNfaState* const s1 : s0->get_byte_transitions(i)) { StateSet closure = epsilon_closure(s1); ascii_transitions_map[i].insert(closure.begin(), closure.end()); } } // TODO: add this for the utf8 case /* - for (const typename NFAStateType::Tree::Data& data : s0->get_tree_transitions().all()) { - for (NFAStateType* const s1 : data.m_value) { - StateSet closure = epsilon_closure(s1); + for (const typename TypedNfaState::Tree::Data& data : s0->get_tree_transitions().all()) + { for (TypedNfaState* const s1 : data.m_value) { StateSet closure = epsilon_closure(s1); transitions_map[data.m_interval].insert(closure.begin(), closure.end()); } } */ } auto next_dfa_state - = [&dfa_states, &create_dfa_state](StateSet const& set) -> DFAStateType* { - DFAStateType* state{nullptr}; + = [&dfa_states, &create_dfa_state](StateSet const& set) -> TypedDfaState* { + TypedDfaState* state{nullptr}; auto it = dfa_states.find(set); if (it == dfa_states.end()) { state = create_dfa_state(set); @@ -477,15 +476,15 @@ auto Lexer::nfa_to_dfa(finite_automata::RegexNFA::value_type const& kv : ascii_transitions_map) { - DFAStateType* dest_state = next_dfa_state(kv.second); + auto* dest_state = next_dfa_state(kv.second); dfa_state->add_byte_transition(kv.first, dest_state); } // TODO: add this for the utf8 case /* - for (const typename map::value_type& kv : + for (const typename map::value_type& kv : transitions_map) { - DFAStateType* dest_state = 
next_dfa_state(kv.second); + TypedDfaState* dest_state = next_dfa_state(kv.second); dfa_state->add_tree_transition(kv.first, dest_state); } */ diff --git a/src/log_surgeon/LexicalRule.hpp b/src/log_surgeon/LexicalRule.hpp index 9aa93286..6ab7e861 100644 --- a/src/log_surgeon/LexicalRule.hpp +++ b/src/log_surgeon/LexicalRule.hpp @@ -6,13 +6,13 @@ #include namespace log_surgeon { -template +template class LexicalRule { public: // Constructor LexicalRule( uint32_t const variable_id, - std::unique_ptr> regex + std::unique_ptr> regex ) : m_variable_id(variable_id), m_regex(std::move(regex)) {} @@ -21,22 +21,22 @@ class LexicalRule { * Adds AST representing the lexical rule to the NFA * @param nfa */ - auto add_to_nfa(finite_automata::RegexNFA* nfa) const -> void; + auto add_to_nfa(finite_automata::Nfa* nfa) const -> void; [[nodiscard]] auto get_variable_id() const -> uint32_t { return m_variable_id; } - [[nodiscard]] auto get_regex() const -> finite_automata::RegexAST* { + [[nodiscard]] auto get_regex() const -> finite_automata::RegexAST* { // TODO: make the returned pointer constant return m_regex.get(); } private: uint32_t m_variable_id; - std::unique_ptr> m_regex; + std::unique_ptr> m_regex; }; -template -void LexicalRule::add_to_nfa(finite_automata::RegexNFA* nfa) const { +template +void LexicalRule::add_to_nfa(finite_automata::Nfa* nfa) const { auto* end_state = nfa->new_state(); end_state->set_accepting(true); end_state->set_matching_variable_id(m_variable_id); diff --git a/src/log_surgeon/LogEvent.cpp b/src/log_surgeon/LogEvent.cpp index 709e62ce..8a3deead 100644 --- a/src/log_surgeon/LogEvent.cpp +++ b/src/log_surgeon/LogEvent.cpp @@ -42,7 +42,7 @@ auto LogEventView::reset() -> void { start = 1; } for (uint32_t i = start; i < m_log_output_buffer->pos(); i++) { - Token& token = m_log_output_buffer->get_mutable_token(i); + auto& token = m_log_output_buffer->get_mutable_token(i); raw_log += token.to_string_view(); } return raw_log; @@ -51,7 +51,7 @@ auto LogEventView::reset() -> void { auto LogEventView::get_logtype() const -> std::string { std::string logtype; for (uint32_t i = 1; i < m_log_output_buffer->pos(); i++) { - Token& token = m_log_output_buffer->get_mutable_token(i); + auto& token = m_log_output_buffer->get_mutable_token(i); if (token.m_type_ids_ptr->at(0) == (uint32_t)SymbolId::TokenUncaughtString) { logtype += token.to_string_view(); } else { diff --git a/src/log_surgeon/LogParser.cpp b/src/log_surgeon/LogParser.cpp index b98e7121..ab7cde5d 100644 --- a/src/log_surgeon/LogParser.cpp +++ b/src/log_surgeon/LogParser.cpp @@ -17,6 +17,8 @@ using std::unique_ptr; using std::vector; namespace log_surgeon { +using finite_automata::ByteDfaState; +using finite_automata::ByteNfaState; using finite_automata::RegexAST; using finite_automata::RegexASTCat; using finite_automata::RegexASTGroup; @@ -24,8 +26,6 @@ using finite_automata::RegexASTInteger; using finite_automata::RegexASTLiteral; using finite_automata::RegexASTMultiplication; using finite_automata::RegexASTOr; -using finite_automata::RegexDFAByteState; -using finite_automata::RegexNFAByteState; LogParser::LogParser(string const& schema_file_path) : LogParser::LogParser(SchemaParser::try_schema_file(schema_file_path)) {} @@ -43,7 +43,7 @@ auto LogParser::add_delimiters(unique_ptr const& delimiters) -> void } } -void LogParser::add_rules(std::unique_ptr schema_ast) { +auto LogParser::add_rules(std::unique_ptr schema_ast) -> void { for (auto const& delimiters : schema_ast->m_delimiters) { add_delimiters(delimiters); } @@ -62,26 
+62,24 @@ void LogParser::add_rules(std::unique_ptr schema_ast) { for (unique_ptr const& parser_ast : schema_ast->m_schema_vars) { auto* rule = dynamic_cast(parser_ast.get()); if (rule->m_name == "timestamp") { - unique_ptr> first_timestamp_regex_ast( - rule->m_regex_ptr->clone() + unique_ptr> first_timestamp_regex_ast(rule->m_regex_ptr->clone() ); - unique_ptr> r1 - = make_unique>(utf8::cCharStartOfFile); + unique_ptr> r1 + = make_unique>(utf8::cCharStartOfFile); add_rule( "firstTimestamp", - make_unique>( + make_unique>( std::move(r1), std::move(first_timestamp_regex_ast) ) ); - unique_ptr> newline_timestamp_regex_ast( - rule->m_regex_ptr->clone() - ); - unique_ptr> r2 - = make_unique>('\n'); + unique_ptr> newline_timestamp_regex_ast(rule->m_regex_ptr->clone( + )); + unique_ptr> r2 + = make_unique>('\n'); add_rule( "newLineTimestamp", - make_unique>( + make_unique>( std::move(r2), std::move(newline_timestamp_regex_ast) ) @@ -142,11 +140,9 @@ void LogParser::add_rules(std::unique_ptr schema_ast) { } // For log-specific lexing: modify variable regex to contain a delimiter at the start. - unique_ptr> delimiter_group - = make_unique>( - RegexASTGroup(delimiters) - ); - rule->m_regex_ptr = make_unique>( + unique_ptr> delimiter_group + = make_unique>(RegexASTGroup(delimiters)); + rule->m_regex_ptr = make_unique>( std::move(delimiter_group), std::move(rule->m_regex_ptr) ); @@ -197,7 +193,7 @@ auto LogParser::parse(LogParser::ParsingAction& parsing_action) -> ErrorCode { // make a message with just the '\n' character next_token.m_end_pos = next_token.m_start_pos + 1; next_token.m_type_ids_ptr - = &Lexer::cTokenUncaughtStringTypes; + = &Lexer::cTokenUncaughtStringTypes; output_buffer->set_token(1, next_token); output_buffer->set_pos(2); m_input_buffer.set_consumed_pos(next_token.m_start_pos); @@ -263,7 +259,7 @@ auto LogParser::parse(LogParser::ParsingAction& parsing_action) -> ErrorCode { Token curr_token = output_buffer->get_curr_token(); curr_token.m_end_pos = curr_token.m_start_pos + 1; curr_token.m_type_ids_ptr - = &Lexer::cTokenUncaughtStringTypes; + = &Lexer::cTokenUncaughtStringTypes; output_buffer->set_curr_token(curr_token); if (0 == m_start_of_log_message.m_start_pos) { m_input_buffer.set_consumed_pos(m_input_buffer.storage().size() - 1); diff --git a/src/log_surgeon/LogParser.hpp b/src/log_surgeon/LogParser.hpp index 91339144..7605fe5f 100644 --- a/src/log_surgeon/LogParser.hpp +++ b/src/log_surgeon/LogParser.hpp @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include #include @@ -15,8 +15,7 @@ namespace log_surgeon { // TODO: Compare c-array vs. vectors (its underlying array) for buffers -class LogParser - : public Parser { +class LogParser : public Parser { public: enum class ParsingAction { None, @@ -27,7 +26,7 @@ class LogParser /** * Constructs the parser using the given schema file. * @param schema_file_path - * @throw std::runtime_error from LALR1Parser, RegexAST, or Lexer + * @throw std::runtime_error from Lalr1Parser, RegexAST, or Lexer * describing the failure parsing the schema file or processing the schema * AST. */ @@ -36,7 +35,7 @@ class LogParser /** * Constructs the parser using the given schema AST. * @param schema_ast - * @throw std::runtime_error from LALR1Parser, RegexAST, or Lexer + * @throw std::runtime_error from Lalr1Parser, RegexAST, or Lexer * describing the failure processing the schema AST. 
*/ explicit LogParser(std::unique_ptr schema_ast); diff --git a/src/log_surgeon/Parser.hpp b/src/log_surgeon/Parser.hpp index 994501b8..37d5734e 100644 --- a/src/log_surgeon/Parser.hpp +++ b/src/log_surgeon/Parser.hpp @@ -5,19 +5,19 @@ namespace log_surgeon { -template +template class Parser { public: Parser(); virtual auto add_rule( std::string const& name, - std::unique_ptr> rule + std::unique_ptr> rule ) -> void; auto add_token(std::string const& name, char rule_char) -> void; - Lexer m_lexer; + Lexer m_lexer; }; } // namespace log_surgeon diff --git a/src/log_surgeon/Parser.tpp b/src/log_surgeon/Parser.tpp index be307af3..6a4da164 100644 --- a/src/log_surgeon/Parser.tpp +++ b/src/log_surgeon/Parser.tpp @@ -7,8 +7,8 @@ namespace log_surgeon { -template -Parser::Parser() { +template +Parser::Parser() { // TODO move clp-reserved symbols out of the parser m_lexer.m_symbol_id[cTokenEnd] = (uint32_t)SymbolId::TokenEnd; m_lexer.m_symbol_id[cTokenUncaughtString] = (uint32_t)SymbolId::TokenUncaughtString; @@ -29,11 +29,11 @@ Parser::Parser() { m_lexer.m_id_symbol[(uint32_t)SymbolId::TokenNewline] = cTokenNewline; } -template -void Parser::add_rule( +template +auto Parser::add_rule( std::string const& name, - std::unique_ptr> rule -) { + std::unique_ptr> rule +) -> void { if (m_lexer.m_symbol_id.find(name) == m_lexer.m_symbol_id.end()) { m_lexer.m_symbol_id[name] = m_lexer.m_symbol_id.size(); m_lexer.m_id_symbol[m_lexer.m_symbol_id[name]] = name; @@ -41,9 +41,10 @@ void Parser::add_rule( m_lexer.add_rule(m_lexer.m_symbol_id[name], std::move(rule)); } -template -void Parser::add_token(std::string const& name, char rule_char) { - add_rule(name, std::make_unique>(rule_char)); +template +auto Parser::add_token(std::string const& name, char rule_char) + -> void { + add_rule(name, std::make_unique>(rule_char)); } } // namespace log_surgeon diff --git a/src/log_surgeon/ParserInputBuffer.cpp b/src/log_surgeon/ParserInputBuffer.cpp index 15fde349..8fdb70b2 100644 --- a/src/log_surgeon/ParserInputBuffer.cpp +++ b/src/log_surgeon/ParserInputBuffer.cpp @@ -9,7 +9,7 @@ using std::string; using std::to_string; namespace log_surgeon { -void ParserInputBuffer::reset() { +auto ParserInputBuffer::reset() -> void { m_log_fully_consumed = false; m_finished_reading_input = false; m_pos_last_read_char = 0; @@ -107,12 +107,12 @@ auto ParserInputBuffer::get_next_character(unsigned char& next_char) -> ErrorCod // the user to wrap their input buffer. It tricks the LogParser and // ParserInputBuffer into thinking it never reaches the wrap, while still // respecting the actual size of the buffer the user passed in. -void ParserInputBuffer::set_storage( +auto ParserInputBuffer::set_storage( char* storage, uint32_t size, uint32_t pos, bool finished_reading_input -) { +) -> void { reset(); m_storage.set_active_buffer(storage, size * 2, pos); m_finished_reading_input = finished_reading_input; diff --git a/src/log_surgeon/ReaderParser.hpp b/src/log_surgeon/ReaderParser.hpp index 9465efbd..805cd7b4 100644 --- a/src/log_surgeon/ReaderParser.hpp +++ b/src/log_surgeon/ReaderParser.hpp @@ -19,7 +19,7 @@ class ReaderParser { /** * Constructs the parser using the the given schema file. * @param schema_file_path - * @throw std::runtime_error from LALR1Parser, RegexAST, or Lexer + * @throw std::runtime_error from Lalr1Parser, RegexAST, or Lexer * describing the failure parsing the schema file or processing the schema * AST. */ @@ -28,7 +28,7 @@ class ReaderParser { /** * Constructs the parser using the given schema AST. 
* @param schema_ast - * @throw std::runtime_error from LALR1Parser, RegexAST, or Lexer + * @throw std::runtime_error from Lalr1Parser, RegexAST, or Lexer * describing the failure processing the schema AST. */ explicit ReaderParser(std::unique_ptr schema_ast); diff --git a/src/log_surgeon/Schema.cpp b/src/log_surgeon/Schema.cpp index e2008b18..dd4b0869 100644 --- a/src/log_surgeon/Schema.cpp +++ b/src/log_surgeon/Schema.cpp @@ -11,7 +11,7 @@ Schema::Schema(std::string const& schema_file_path) : m_schema_ast{SchemaParser::try_schema_file(schema_file_path)} {} auto Schema::add_variable(std::string_view const var_schema, int const priority) const -> void { - std::unique_ptr const schema_ast = SchemaParser::try_schema_string(var_schema); + auto const schema_ast = SchemaParser::try_schema_string(var_schema); m_schema_ast->add_schema_var(std::move(schema_ast->m_schema_vars[0]), priority); } } // namespace log_surgeon diff --git a/src/log_surgeon/SchemaParser.cpp b/src/log_surgeon/SchemaParser.cpp index 56760262..d36271ca 100644 --- a/src/log_surgeon/SchemaParser.cpp +++ b/src/log_surgeon/SchemaParser.cpp @@ -11,30 +11,30 @@ #include #include #include -#include +#include #include #include using ParserValueRegex = log_surgeon::ParserValue>>; + log_surgeon::finite_automata::RegexAST>>; using RegexASTByte - = log_surgeon::finite_automata::RegexAST; -using RegexASTGroupByte = log_surgeon::finite_automata::RegexASTGroup< - log_surgeon::finite_automata::RegexNFAByteState>; -using RegexASTIntegerByte = log_surgeon::finite_automata::RegexASTInteger< - log_surgeon::finite_automata::RegexNFAByteState>; -using RegexASTLiteralByte = log_surgeon::finite_automata::RegexASTLiteral< - log_surgeon::finite_automata::RegexNFAByteState>; + = log_surgeon::finite_automata::RegexAST; +using RegexASTGroupByte + = log_surgeon::finite_automata::RegexASTGroup; +using RegexASTIntegerByte + = log_surgeon::finite_automata::RegexASTInteger; +using RegexASTLiteralByte + = log_surgeon::finite_automata::RegexASTLiteral; using RegexASTMultiplicationByte = log_surgeon::finite_automata::RegexASTMultiplication< - log_surgeon::finite_automata::RegexNFAByteState>; + log_surgeon::finite_automata::ByteNfaState>; using RegexASTOrByte - = log_surgeon::finite_automata::RegexASTOr; -using RegexASTCatByte = log_surgeon::finite_automata::RegexASTCat< - log_surgeon::finite_automata::RegexNFAByteState>; -using RegexASTCaptureByte = log_surgeon::finite_automata::RegexASTCapture< - log_surgeon::finite_automata::RegexNFAByteState>; -using RegexASTEmptyByte = log_surgeon::finite_automata::RegexASTEmpty< - log_surgeon::finite_automata::RegexNFAByteState>; + = log_surgeon::finite_automata::RegexASTOr; +using RegexASTCatByte + = log_surgeon::finite_automata::RegexASTCat; +using RegexASTCaptureByte + = log_surgeon::finite_automata::RegexASTCapture; +using RegexASTEmptyByte + = log_surgeon::finite_automata::RegexASTEmpty; using std::make_unique; using std::string; @@ -416,7 +416,7 @@ static auto new_delimiter_string_rule(NonTerminal* m) -> unique_ptr { return make_unique(character); } -void SchemaParser::add_lexical_rules() { +auto SchemaParser::add_lexical_rules() -> void { if (m_special_regex_characters.empty()) { m_special_regex_characters.emplace('(', "Lparen"); m_special_regex_characters.emplace(')', "Rparen"); @@ -481,7 +481,7 @@ void SchemaParser::add_lexical_rules() { add_token_group("CommentCharacters", std::move(comment_characters)); } -void SchemaParser::add_productions() { +auto SchemaParser::add_productions() -> void { // 
add_production("Schema", {}, new_schema_rule); add_production("Schema", {"Comment"}, new_schema_rule); add_production("Schema", {"SchemaVar"}, new_schema_rule_with_var); diff --git a/src/log_surgeon/SchemaParser.hpp b/src/log_surgeon/SchemaParser.hpp index c5081287..36db6119 100644 --- a/src/log_surgeon/SchemaParser.hpp +++ b/src/log_surgeon/SchemaParser.hpp @@ -5,7 +5,7 @@ #include #include -#include +#include namespace log_surgeon { // ASTs used in SchemaParser AST @@ -46,8 +46,7 @@ class SchemaVarAST : public ParserAST { // Constructor SchemaVarAST( std::string name, - std::unique_ptr> - regex_ptr, + std::unique_ptr> regex_ptr, uint32_t line_num ) : m_line_num(line_num), @@ -56,7 +55,7 @@ class SchemaVarAST : public ParserAST { uint32_t m_line_num; std::string m_name; - std::unique_ptr> m_regex_ptr; + std::unique_ptr> m_regex_ptr; }; class DelimiterStringAST : public ParserAST { @@ -69,9 +68,8 @@ class DelimiterStringAST : public ParserAST { std::vector m_delimiters; }; -class SchemaParser : public LALR1Parser< - finite_automata::RegexNFAByteState, - finite_automata::RegexDFAByteState> { +class SchemaParser + : public Lalr1Parser { public: /** * File wrapper around generate_schema_ast() diff --git a/src/log_surgeon/finite_automata/RegexDFA.hpp b/src/log_surgeon/finite_automata/Dfa.hpp similarity index 60% rename from src/log_surgeon/finite_automata/RegexDFA.hpp rename to src/log_surgeon/finite_automata/Dfa.hpp index 3e8ad149..5d425dd2 100644 --- a/src/log_surgeon/finite_automata/RegexDFA.hpp +++ b/src/log_surgeon/finite_automata/Dfa.hpp @@ -1,27 +1,26 @@ -#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_HPP -#define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_HPP +#ifndef LOG_SURGEON_FINITE_AUTOMATA_DFA_HPP +#define LOG_SURGEON_FINITE_AUTOMATA_DFA_HPP #include #include #include #include -#include +#include namespace log_surgeon::finite_automata { -// TODO: rename `RegexDFA` to `DFA` -template -class RegexDFA { +template +class Dfa { public: /** * Creates a new DFA state based on a set of NFA states and adds it to `m_states`. * @param nfa_state_set The set of NFA states represented by this DFA state. * @return A pointer to the new DFA state. */ - template - auto new_state(std::set const& nfa_state_set) -> DFAStateType*; + template + auto new_state(std::set const& nfa_state_set) -> TypedDfaState*; - auto get_root() const -> DFAStateType const* { return m_states.at(0).get(); } + auto get_root() const -> TypedDfaState const* { return m_states.at(0).get(); } /** * Compares this dfa with `dfa_in` to determine the set of schema types in this dfa that are @@ -31,17 +30,17 @@ class RegexDFA { * @param dfa_in The dfa with which to take the intersect. * @return The set of schema types reachable by `dfa_in`. 
*/ - [[nodiscard]] auto get_intersect(RegexDFA const* dfa_in) const -> std::set; + [[nodiscard]] auto get_intersect(Dfa const* dfa_in) const -> std::set; private: - std::vector> m_states; + std::vector> m_states; }; -template -template -auto RegexDFA::new_state(std::set const& nfa_state_set -) -> DFAStateType* { - m_states.emplace_back(std::make_unique()); +template +template +auto Dfa::new_state(std::set const& nfa_state_set +) -> TypedDfaState* { + m_states.emplace_back(std::make_unique()); auto* dfa_state = m_states.back().get(); for (auto const* nfa_state : nfa_state_set) { if (nfa_state->is_accepting()) { @@ -51,12 +50,12 @@ auto RegexDFA::new_state(std::set const& nfa_state_ return dfa_state; } -template -auto RegexDFA::get_intersect(RegexDFA const* dfa_in) const -> std::set { +template +auto Dfa::get_intersect(Dfa const* dfa_in) const -> std::set { std::set schema_types; - std::set> unvisited_pairs; - std::set> visited_pairs; - unvisited_pairs.emplace(this->get_root(), dfa_in->get_root()); + std::set> unvisited_pairs; + std::set> visited_pairs; + unvisited_pairs.emplace(get_root(), dfa_in->get_root()); // TODO: Handle UTF-8 (multi-byte transitions) as well while (false == unvisited_pairs.empty()) { auto current_pair_it = unvisited_pairs.begin(); @@ -72,4 +71,4 @@ auto RegexDFA::get_intersect(RegexDFA const* dfa_in) const -> std: } } // namespace log_surgeon::finite_automata -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_HPP +#endif // LOG_SURGEON_FINITE_AUTOMATA_DFA_HPP diff --git a/src/log_surgeon/finite_automata/RegexDFAState.hpp b/src/log_surgeon/finite_automata/DfaState.hpp similarity index 51% rename from src/log_surgeon/finite_automata/RegexDFAState.hpp rename to src/log_surgeon/finite_automata/DfaState.hpp index 3c0ef4ca..f25b25ac 100644 --- a/src/log_surgeon/finite_automata/RegexDFAState.hpp +++ b/src/log_surgeon/finite_automata/DfaState.hpp @@ -1,5 +1,5 @@ -#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE -#define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE +#ifndef LOG_SURGEON_FINITE_AUTOMATA_DFA_STATE +#define LOG_SURGEON_FINITE_AUTOMATA_DFA_STATE #include #include @@ -9,24 +9,22 @@ #include #include -#include +#include #include namespace log_surgeon::finite_automata { -template -class RegexDFAState; +template +class DfaState; -using RegexDFAByteState = RegexDFAState; -using RegexDFAUTF8State = RegexDFAState; +using ByteDfaState = DfaState; +using Utf8DfaState = DfaState; -template -class RegexDFAState { +template +class DfaState { public: - using Tree = UnicodeIntervalTree*>; + using Tree = UnicodeIntervalTree; - RegexDFAState() { - std::fill(std::begin(m_bytes_transition), std::end(m_bytes_transition), nullptr); - } + DfaState() { std::fill(std::begin(m_bytes_transition), std::end(m_bytes_transition), nullptr); } auto add_matching_variable_id(uint32_t const variable_id) -> void { m_matching_variable_ids.push_back(variable_id); @@ -40,7 +38,7 @@ class RegexDFAState { return false == m_matching_variable_ids.empty(); } - auto add_byte_transition(uint8_t const& byte, RegexDFAState* dest_state) -> void { + auto add_byte_transition(uint8_t const& byte, DfaState* dest_state) -> void { m_bytes_transition[byte] = dest_state; } @@ -48,19 +46,19 @@ class RegexDFAState { * @param character The character (byte or utf8) to transition on. * @return A pointer to the DFA state reached after transitioning on `character`. 
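The `get_intersect` implementation above is what the renamed `Dfa` exposes for pruning schema variable types against a wildcard query. A minimal usage sketch, assuming the renamed headers keep the `log_surgeon/finite_automata/` include prefix and that both DFAs have already been built elsewhere; the helper name is hypothetical and not part of this change set:

    #include <cstdint>
    #include <set>

    #include <log_surgeon/finite_automata/Dfa.hpp>
    #include <log_surgeon/finite_automata/DfaState.hpp>

    using log_surgeon::finite_automata::ByteDfaState;
    using log_surgeon::finite_automata::Dfa;

    // Returns the ids of the schema variable types in `schema_dfa` whose language
    // overlaps with `query_dfa`.
    auto overlapping_variable_ids(
            Dfa<ByteDfaState> const& schema_dfa,
            Dfa<ByteDfaState> const& query_dfa
    ) -> std::set<uint32_t> {
        return schema_dfa.get_intersect(&query_dfa);
    }
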
*/ - [[nodiscard]] auto next(uint32_t character) const -> RegexDFAState*; + [[nodiscard]] auto next(uint32_t character) const -> DfaState*; private: std::vector m_matching_variable_ids; - RegexDFAState* m_bytes_transition[cSizeOfByte]; - // NOTE: We don't need m_tree_transitions for the `stateType == RegexDFAStateType::Byte` case, - // so we use an empty class (`std::tuple<>`) in that case. - std::conditional_t> m_tree_transitions; + DfaState* m_bytes_transition[cSizeOfByte]; + // NOTE: We don't need m_tree_transitions for the `state_type == DfaStateType::Byte` case, so we + // use an empty class (`std::tuple<>`) in that case. + std::conditional_t> m_tree_transitions; }; -template -auto RegexDFAState::next(uint32_t character) const -> RegexDFAState* { - if constexpr (RegexDFAStateType::Byte == stateType) { +template +auto DfaState::next(uint32_t character) const -> DfaState* { + if constexpr (DfaStateType::Byte == state_type) { return m_bytes_transition[character]; } else { if (character < cSizeOfByte) { @@ -77,4 +75,4 @@ auto RegexDFAState::next(uint32_t character) const -> RegexDFAState #include @@ -19,20 +19,20 @@ namespace log_surgeon::finite_automata { * * NOTE: Only the first state in the pair contains the variable types matched by the pair. */ -template -class RegexDFAStatePair { +template +class DfaStatePair { public: - RegexDFAStatePair(DFAState const* state1, DFAState const* state2) + DfaStatePair(TypedDfaState const* state1, TypedDfaState const* state2) : m_state1(state1), m_state2(state2) {}; /** - * Used for ordering in a set by considering the states' addresses + * Used for ordering in a set by considering the states' addresses. * @param rhs - * @return Whether m_state1 in lhs has a lower address than in rhs, or if they're equal, - * whether m_state2 in lhs has a lower address than in rhs + * @return Whether `m_state1` in lhs has a lower address than in rhs, or if they're equal, + * whether `m_state2` in lhs has a lower address than in rhs. */ - auto operator<(RegexDFAStatePair const& rhs) const -> bool { + auto operator<(DfaStatePair const& rhs) const -> bool { if (m_state1 == rhs.m_state1) { return m_state2 < rhs.m_state2; } @@ -41,13 +41,13 @@ class RegexDFAStatePair { /** * Generates all pairs reachable from the current pair via any string and store any reachable - * pair not previously visited in unvisited_pairs - * @param visited_pairs Previously visited pairs - * @param unvisited_pairs Set to add unvisited reachable pairs + * pair not previously visited in `unvisited_pairs`. + * @param visited_pairs Previously visited pairs. + * @param unvisited_pairs Set to add unvisited reachable pairs. 
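A short walkthrough of the `DfaState::next` lookup shown above, since it is what `get_reachable_pairs` below drives byte-by-byte; the UTF-8 branch past the byte range is elided by the diff context:

    // DfaState<DfaStateType::Byte>::next(c)
    //     returns m_bytes_transition[c] directly; its callers in this change set
    //     only pass byte values (c < cSizeOfByte).
    //
    // DfaState<DfaStateType::Utf8>::next(c)
    //     uses the same byte array when c < cSizeOfByte (256); larger code points
    //     fall through to m_tree_transitions, whose lookup is elided in the hunk
    //     above.
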
*/ auto get_reachable_pairs( - std::set>& visited_pairs, - std::set>& unvisited_pairs + std::set& visited_pairs, + std::set& unvisited_pairs ) const -> void; [[nodiscard]] auto is_accepting() const -> bool { @@ -59,21 +59,21 @@ class RegexDFAStatePair { } private: - DFAState const* m_state1; - DFAState const* m_state2; + TypedDfaState const* m_state1; + TypedDfaState const* m_state2; }; -template -auto RegexDFAStatePair::get_reachable_pairs( - std::set>& visited_pairs, - std::set>& unvisited_pairs +template +auto DfaStatePair::get_reachable_pairs( + std::set& visited_pairs, + std::set& unvisited_pairs ) const -> void { // TODO: Handle UTF-8 (multi-byte transitions) as well for (uint32_t i = 0; i < cSizeOfByte; i++) { auto next_state1 = m_state1->next(i); auto next_state2 = m_state2->next(i); if (next_state1 != nullptr && next_state2 != nullptr) { - RegexDFAStatePair reachable_pair{next_state1, next_state2}; + DfaStatePair reachable_pair{next_state1, next_state2}; if (visited_pairs.count(reachable_pair) == 0) { unvisited_pairs.insert(reachable_pair); } @@ -82,4 +82,4 @@ auto RegexDFAStatePair::get_reachable_pairs( } } // namespace log_surgeon::finite_automata -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_PAIR +#endif // LOG_SURGEON_FINITE_AUTOMATA_DFA_STATE_PAIR diff --git a/src/log_surgeon/finite_automata/DfaStateType.hpp b/src/log_surgeon/finite_automata/DfaStateType.hpp new file mode 100644 index 00000000..017134c8 --- /dev/null +++ b/src/log_surgeon/finite_automata/DfaStateType.hpp @@ -0,0 +1,13 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_DFA_STATE_TYPE +#define LOG_SURGEON_FINITE_AUTOMATA_DFA_STATE_TYPE + +#include + +namespace log_surgeon::finite_automata { +enum class DfaStateType : uint8_t { + Byte, + Utf8 +}; +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_DFA_STATE_TYPE diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/Nfa.hpp similarity index 65% rename from src/log_surgeon/finite_automata/RegexNFA.hpp rename to src/log_surgeon/finite_automata/Nfa.hpp index ba9791b1..8eaaaadd 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/Nfa.hpp @@ -1,13 +1,10 @@ -#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_HPP -#define LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_HPP +#ifndef LOG_SURGEON_FINITE_AUTOMATA_NFA_HPP +#define LOG_SURGEON_FINITE_AUTOMATA_NFA_HPP #include #include -#include #include -#include #include -#include #include #include #include @@ -16,23 +13,22 @@ #include #include -#include +#include #include namespace log_surgeon::finite_automata { -// TODO: rename `RegexNFA` to `NFA` -template -class RegexNFA { +template +class Nfa { public: - using StateVec = std::vector; + using StateVec = std::vector; - explicit RegexNFA(std::vector> rules); + explicit Nfa(std::vector> rules); /** * Creates a unique_ptr for an NFA state with no tagged transitions and adds it to `m_states`. - * @return NFAStateType* + * @return TypedNfaState* */ - [[nodiscard]] auto new_state() -> NFAStateType*; + [[nodiscard]] auto new_state() -> TypedNfaState*; /** * Creates a unique_ptr for an NFA state with a positive tagged end transition and adds it to @@ -43,20 +39,20 @@ class RegexNFA { */ [[nodiscard]] auto new_state_with_positive_tagged_end_transition( Tag const* tag, - NFAStateType const* dest_state - ) -> NFAStateType*; + TypedNfaState const* dest_state + ) -> TypedNfaState*; /** * Creates a unique_ptr for an NFA state with a negative tagged transition and adds it to * `m_states`. 
* @param tags * @param dest_state - * @return NFAStateType* + * @return TypedNfaState* */ [[nodiscard]] auto new_state_with_negative_tagged_transition( std::vector tags, - NFAStateType const* dest_state - ) -> NFAStateType*; + TypedNfaState const* dest_state + ) -> TypedNfaState*; /** * Creates the start and end states for a capture group. @@ -68,38 +64,38 @@ class RegexNFA { */ [[nodiscard]] auto new_start_and_end_states_with_positive_tagged_transitions( Tag const* tag, - NFAStateType const* dest_state - ) -> std::pair; + TypedNfaState const* dest_state + ) -> std::pair; /** * @return A vector representing the traversal order of the NFA states using breadth-first * search (BFS). */ - [[nodiscard]] auto get_bfs_traversal_order() const -> std::vector; + [[nodiscard]] auto get_bfs_traversal_order() const -> std::vector; /** * @return A string representation of the NFA. */ [[nodiscard]] auto serialize() const -> std::string; - auto add_root_interval(Interval interval, NFAStateType* dest_state) -> void { + auto add_root_interval(Interval interval, TypedNfaState* dest_state) -> void { m_root->add_interval(interval, dest_state); } - auto set_root(NFAStateType* root) -> void { m_root = root; } + auto set_root(TypedNfaState* root) -> void { m_root = root; } - auto get_root() -> NFAStateType* { return m_root; } + auto get_root() -> TypedNfaState* { return m_root; } private: - std::vector> m_states; - NFAStateType* m_root; + std::vector> m_states; + TypedNfaState* m_root; // Store the rules locally as they contain information needed by the NFA. E.g., transitions in // the NFA point to tags in the rule ASTs. - std::vector> m_rules; + std::vector> m_rules; }; -template -RegexNFA::RegexNFA(std::vector> rules) +template +Nfa::Nfa(std::vector> rules) : m_root{new_state()}, m_rules{std::move(rules)} { for (auto const& rule : m_rules) { @@ -107,35 +103,35 @@ RegexNFA::RegexNFA(std::vector> rules) } } -template -auto RegexNFA::new_state() -> NFAStateType* { - m_states.emplace_back(std::make_unique()); +template +auto Nfa::new_state() -> TypedNfaState* { + m_states.emplace_back(std::make_unique()); return m_states.back().get(); } -template -auto RegexNFA::new_state_with_positive_tagged_end_transition( +template +auto Nfa::new_state_with_positive_tagged_end_transition( Tag const* tag, - NFAStateType const* dest_state -) -> NFAStateType* { - m_states.emplace_back(std::make_unique(tag, dest_state)); + TypedNfaState const* dest_state +) -> TypedNfaState* { + m_states.emplace_back(std::make_unique(tag, dest_state)); return m_states.back().get(); } -template -auto RegexNFA::new_state_with_negative_tagged_transition( +template +auto Nfa::new_state_with_negative_tagged_transition( std::vector tags, - NFAStateType const* dest_state -) -> NFAStateType* { - m_states.emplace_back(std::make_unique(std::move(tags), dest_state)); + TypedNfaState const* dest_state +) -> TypedNfaState* { + m_states.emplace_back(std::make_unique(std::move(tags), dest_state)); return m_states.back().get(); } -template -auto RegexNFA::new_start_and_end_states_with_positive_tagged_transitions( +template +auto Nfa::new_start_and_end_states_with_positive_tagged_transitions( Tag const* tag, - NFAStateType const* dest_state -) -> std::pair { + TypedNfaState const* dest_state +) -> std::pair { auto* start_state = new_state(); m_root->add_positive_tagged_start_transition(tag, start_state); @@ -143,16 +139,16 @@ auto RegexNFA::new_start_and_end_states_with_positive_tagged_trans return {start_state, end_state}; } -template -auto 
RegexNFA::get_bfs_traversal_order() const -> std::vector { - std::queue state_queue; - std::unordered_set visited_states; - std::vector visited_order; +template +auto Nfa::get_bfs_traversal_order() const -> std::vector { + std::queue state_queue; + std::unordered_set visited_states; + std::vector visited_order; visited_states.reserve(m_states.size()); visited_order.reserve(m_states.size()); auto add_to_queue_and_visited - = [&state_queue, &visited_states](NFAStateType const* dest_state) { + = [&state_queue, &visited_states](TypedNfaState const* dest_state) { if (visited_states.insert(dest_state).second) { state_queue.push(dest_state); } @@ -194,11 +190,11 @@ auto RegexNFA::get_bfs_traversal_order() const -> std::vector -auto RegexNFA::serialize() const -> std::string { +template +auto Nfa::serialize() const -> std::string { auto const traversal_order = get_bfs_traversal_order(); - std::unordered_map state_ids; + std::unordered_map state_ids; for (auto const* state : traversal_order) { state_ids.emplace(state, state_ids.size()); } @@ -214,4 +210,4 @@ auto RegexNFA::serialize() const -> std::string { } } // namespace log_surgeon::finite_automata -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_HPP +#endif // LOG_SURGEON_FINITE_AUTOMATA_NFA_HPP diff --git a/src/log_surgeon/finite_automata/RegexNFAState.hpp b/src/log_surgeon/finite_automata/NfaState.hpp similarity index 72% rename from src/log_surgeon/finite_automata/RegexNFAState.hpp rename to src/log_surgeon/finite_automata/NfaState.hpp index 8fce8cf7..339f38f0 100644 --- a/src/log_surgeon/finite_automata/RegexNFAState.hpp +++ b/src/log_surgeon/finite_automata/NfaState.hpp @@ -1,11 +1,10 @@ -#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE -#define LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE +#ifndef LOG_SURGEON_FINITE_AUTOMATA_NFA_STATE +#define LOG_SURGEON_FINITE_AUTOMATA_NFA_STATE #include #include #include #include -#include #include #include #include @@ -13,28 +12,28 @@ #include -#include +#include #include #include namespace log_surgeon::finite_automata { -template -class RegexNFAState; +template +class NfaState; -using RegexNFAByteState = RegexNFAState; -using RegexNFAUTF8State = RegexNFAState; +using ByteNfaState = NfaState; +using Utf8NfaState = NfaState; -template -class RegexNFAState { +template +class NfaState { public: - using Tree = UnicodeIntervalTree; + using Tree = UnicodeIntervalTree; - RegexNFAState() = default; + NfaState() = default; - RegexNFAState(Tag const* tag, RegexNFAState const* dest_state) + NfaState(Tag const* tag, NfaState const* dest_state) : m_positive_tagged_end_transition{PositiveTaggedTransition{tag, dest_state}} {} - RegexNFAState(std::vector tags, RegexNFAState const* dest_state) + NfaState(std::vector tags, NfaState const* dest_state) : m_negative_tagged_transition{NegativeTaggedTransition{std::move(tags), dest_state}} {} auto set_accepting(bool accepting) -> void { m_accepting = accepting; } @@ -49,52 +48,50 @@ class RegexNFAState { return m_matching_variable_id; } - auto - add_positive_tagged_start_transition(Tag const* tag, RegexNFAState const* dest_state) -> void { + auto add_positive_tagged_start_transition(Tag const* tag, NfaState const* dest_state) -> void { m_positive_tagged_start_transitions.emplace_back(tag, dest_state); } [[nodiscard]] auto get_positive_tagged_start_transitions( - ) const -> std::vector> const& { + ) const -> std::vector> const& { return m_positive_tagged_start_transitions; } [[nodiscard]] auto get_positive_tagged_end_transition( - ) const -> std::optional> const& { + ) 
const -> std::optional> const& { return m_positive_tagged_end_transition; } [[nodiscard]] auto get_negative_tagged_transition( - ) const -> std::optional> const& { + ) const -> std::optional> const& { return m_negative_tagged_transition; } - auto add_epsilon_transition(RegexNFAState* epsilon_transition) -> void { + auto add_epsilon_transition(NfaState* epsilon_transition) -> void { m_epsilon_transitions.push_back(epsilon_transition); } - [[nodiscard]] auto get_epsilon_transitions() const -> std::vector const& { + [[nodiscard]] auto get_epsilon_transitions() const -> std::vector const& { return m_epsilon_transitions; } - auto add_byte_transition(uint8_t byte, RegexNFAState* dest_state) -> void { + auto add_byte_transition(uint8_t byte, NfaState* dest_state) -> void { m_bytes_transitions[byte].push_back(dest_state); } - [[nodiscard]] auto get_byte_transitions(uint8_t byte - ) const -> std::vector const& { + [[nodiscard]] auto get_byte_transitions(uint8_t byte) const -> std::vector const& { return m_bytes_transitions[byte]; } auto get_tree_transitions() -> Tree const& { return m_tree_transitions; } /** - Add dest_state to m_bytes_transitions if all values in interval are a byte, otherwise add - dest_state to m_tree_transitions + Add `dest_state` to `m_bytes_transitions` if all values in interval are a byte, otherwise add + `dest_state` to `m_tree_transitions`. * @param interval * @param dest_state */ - auto add_interval(Interval interval, RegexNFAState* dest_state) -> void; + auto add_interval(Interval interval, NfaState* dest_state) -> void; /** * @param state_ids A map of states to their unique identifiers. @@ -104,26 +101,25 @@ class RegexNFAState { * @return Forwards `NegativeTaggedTransition::serialize`'s return value (std::nullopt) on * failure. */ - [[nodiscard]] auto serialize(std::unordered_map const& state_ids + [[nodiscard]] auto serialize(std::unordered_map const& state_ids ) const -> std::optional; private: bool m_accepting{false}; uint32_t m_matching_variable_id{0}; - std::vector> m_positive_tagged_start_transitions; - std::optional> m_positive_tagged_end_transition; - std::optional> m_negative_tagged_transition; - std::vector m_epsilon_transitions; - std::array, cSizeOfByte> m_bytes_transitions; + std::vector> m_positive_tagged_start_transitions; + std::optional> m_positive_tagged_end_transition; + std::optional> m_negative_tagged_transition; + std::vector m_epsilon_transitions; + std::array, cSizeOfByte> m_bytes_transitions; // NOTE: We don't need m_tree_transitions for the `stateType == - // RegexDFAStateType::Byte` case, so we use an empty class (`std::tuple<>`) + // NfaStateType::Byte` case, so we use an empty class (`std::tuple<>`) // in that case. 
- std::conditional_t> - m_tree_transitions; + std::conditional_t> m_tree_transitions; }; -template -auto RegexNFAState::add_interval(Interval interval, RegexNFAState* dest_state) -> void { +template +auto NfaState::add_interval(Interval interval, NfaState* dest_state) -> void { if (interval.first < cSizeOfByte) { uint32_t const bound = std::min(interval.second, cSizeOfByte - 1); for (uint32_t i = interval.first; i <= bound; i++) { @@ -131,7 +127,7 @@ auto RegexNFAState::add_interval(Interval interval, RegexNFAState* d } interval.first = bound + 1; } - if constexpr (RegexNFAStateType::UTF8 == state_type) { + if constexpr (NfaStateType::Utf8 == state_type) { if (interval.second < cSizeOfByte) { return; } @@ -141,7 +137,7 @@ auto RegexNFAState::add_interval(Interval interval, RegexNFAState* d uint32_t overlap_low = std::max(data.m_interval.first, interval.first); uint32_t overlap_high = std::min(data.m_interval.second, interval.second); - std::vector tree_states = data.m_value; + std::vector tree_states = data.m_value; tree_states.push_back(dest_state); m_tree_transitions.insert(Interval(overlap_low, overlap_high), tree_states); if (data.m_interval.first < interval.first) { @@ -169,9 +165,8 @@ auto RegexNFAState::add_interval(Interval interval, RegexNFAState* d } } -template -auto RegexNFAState::serialize( - std::unordered_map const& state_ids +template +auto NfaState::serialize(std::unordered_map const& state_ids ) const -> std::optional { std::vector byte_transitions; for (uint32_t idx{0}; idx < cSizeOfByte; ++idx) { @@ -238,4 +233,4 @@ auto RegexNFAState::serialize( } } // namespace log_surgeon::finite_automata -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE +#endif // LOG_SURGEON_FINITE_AUTOMATA_NFA_STATE diff --git a/src/log_surgeon/finite_automata/NfaStateType.hpp b/src/log_surgeon/finite_automata/NfaStateType.hpp new file mode 100644 index 00000000..1cc56de1 --- /dev/null +++ b/src/log_surgeon/finite_automata/NfaStateType.hpp @@ -0,0 +1,13 @@ +#ifndef LOG_SURGEON_FINITE_AUTOMATA_NFA_STATE_TYPE +#define LOG_SURGEON_FINITE_AUTOMATA_NFA_STATE_TYPE + +#include + +namespace log_surgeon::finite_automata { +enum class NfaStateType : uint8_t { + Byte, + Utf8 +}; +} // namespace log_surgeon::finite_automata + +#endif // LOG_SURGEON_FINITE_AUTOMATA_NFA_STATE_TYPE diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index beeb588e..bb55f62d 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -24,8 +24,8 @@ #include namespace log_surgeon::finite_automata { -template -class RegexNFA; +template +class Nfa; // TODO: rename `RegexAST` to `RegexASTNode` /** @@ -40,9 +40,9 @@ class RegexNFA; * ASTs built using this class are assumed to be constructed in a bottom-up manner, where all * descendant nodes are created first. * - * @tparam NFAStateType Whether this AST is used for byte lexing or UTF-8 lexing. + * @tparam TypedNfaState Whether this AST is used for byte lexing or UTF-8 lexing. 
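To make the byte/tree split in `NfaState::add_interval` above concrete, a small worked trace in comment form (the values are illustrative; the tree-splitting arithmetic itself is shown in the hunk):

    // Given a Utf8NfaState `state`, a destination `dest`, and the call
    //     state->add_interval(Interval(0x41, 0x100), dest);
    //
    // - 0x41..0xFF lie below cSizeOfByte (256), so each of those byte buckets in
    //   m_bytes_transitions gains `dest`.
    // - The remainder 0x100..0x100 is at least 256, so it is inserted into the
    //   UnicodeIntervalTree (m_tree_transitions), splitting any stored intervals
    //   it overlaps.
    // For a ByteNfaState the `if constexpr (NfaStateType::Utf8 == state_type)`
    // branch is compiled out, so anything above 0xFF is simply dropped.
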
*/ -template +template class RegexAST { public: RegexAST() = default; @@ -70,12 +70,12 @@ class RegexAST { virtual auto remove_delimiters_from_wildcard(std::vector& delimiters) -> void = 0; /** - * Add the needed RegexNFA::states to the passed in nfa to handle the + * Add the needed Nfa::states to the passed in nfa to handle the * current node before transitioning to an accepting end_state * @param nfa * @param end_state */ - virtual auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void = 0; + virtual auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void = 0; /** * Serializes the AST with this node as the root. @@ -108,8 +108,8 @@ class RegexAST { * @param nfa * @param end_state */ - auto add_to_nfa_with_negative_tags(RegexNFA* nfa, NFAStateType* end_state) const - -> void { + auto + add_to_nfa_with_negative_tags(Nfa* nfa, TypedNfaState* end_state) const -> void { // Handle negative tags as: // root --(regex)--> state_with_negative_tagged_transition --(negative tags)--> end_state if (false == m_negative_tags.empty()) { @@ -155,10 +155,10 @@ class RegexAST { * repetition with a minimum repetition of 0. Namely, we treat `R{0,N}` as `R{1,N} | ∅`. Then, the * NFA handles the 0 repetition case using the logic in `RegexASTOR` (i.e., adding a negative * transition for every capture group matched in `R{1,N}`). - * @tparam NFAStateType Whether this AST is used for byte lexing or UTF-8 lexing. + * @tparam TypedNfaState Whether this AST is used for byte lexing or UTF-8 lexing. */ -template -class RegexASTEmpty : public RegexAST { +template +class RegexASTEmpty : public RegexAST { public: RegexASTEmpty() = default; @@ -178,8 +178,8 @@ class RegexASTEmpty : public RegexAST { } auto add_to_nfa( - [[maybe_unused]] RegexNFA* nfa, - [[maybe_unused]] NFAStateType* end_state + [[maybe_unused]] Nfa* nfa, + [[maybe_unused]] TypedNfaState* end_state ) const -> void override { // Do nothing as adding an empty node to the NFA is a null operation. 
} @@ -187,8 +187,8 @@ class RegexASTEmpty : public RegexAST { [[nodiscard]] auto serialize() const -> std::u32string override; }; -template -class RegexASTLiteral : public RegexAST { +template +class RegexASTLiteral : public RegexAST { public: explicit RegexASTLiteral(uint32_t character); @@ -221,12 +221,12 @@ class RegexASTLiteral : public RegexAST { } /** - * Add the needed RegexNFA::states to the passed in nfa to handle a + * Add the needed Nfa::states to the passed in nfa to handle a * RegexASTLiteral before transitioning to an accepting end_state * @param nfa * @param end_state */ - auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; @@ -236,8 +236,8 @@ class RegexASTLiteral : public RegexAST { uint32_t m_character; }; -template -class RegexASTInteger : public RegexAST { +template +class RegexASTInteger : public RegexAST { public: explicit RegexASTInteger(uint32_t digit); @@ -274,12 +274,12 @@ class RegexASTInteger : public RegexAST { } /** - * Add the needed RegexNFA::states to the passed in nfa to handle a + * Add the needed Nfa::states to the passed in nfa to handle a * RegexASTInteger before transitioning to an accepting end_state * @param nfa * @param end_state */ - auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; @@ -291,24 +291,24 @@ class RegexASTInteger : public RegexAST { std::vector m_digits; }; -template -class RegexASTGroup : public RegexAST { +template +class RegexASTGroup : public RegexAST { public: using Range = std::pair; RegexASTGroup() = default; - explicit RegexASTGroup(RegexASTLiteral const* right); + explicit RegexASTGroup(RegexASTLiteral const* right); explicit RegexASTGroup(RegexASTGroup const* right); - RegexASTGroup(RegexASTGroup const* left, RegexASTLiteral const* right); + RegexASTGroup(RegexASTGroup const* left, RegexASTLiteral const* right); RegexASTGroup(RegexASTGroup const* left, RegexASTGroup const* right); RegexASTGroup( - RegexASTLiteral const* left, - RegexASTLiteral const* right + RegexASTLiteral const* left, + RegexASTLiteral const* right ); RegexASTGroup(uint32_t min, uint32_t max); @@ -382,12 +382,12 @@ class RegexASTGroup : public RegexAST { } /** - * Add the needed RegexNFA::states to the passed in nfa to handle a + * Add the needed Nfa::states to the passed in nfa to handle a * RegexASTGroup before transitioning to an accepting end_state * @param nfa * @param end_state */ - auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; @@ -424,20 +424,20 @@ class RegexASTGroup : public RegexAST { std::vector m_ranges; }; -template -class RegexASTOr : public RegexAST { +template +class RegexASTOr : public RegexAST { public: ~RegexASTOr() override = default; RegexASTOr( - std::unique_ptr> left, - std::unique_ptr> right + std::unique_ptr> left, + std::unique_ptr> right ); RegexASTOr(RegexASTOr const& rhs) - : RegexAST(rhs), - m_left(std::unique_ptr>(rhs.m_left->clone())), - m_right(std::unique_ptr>(rhs.m_right->clone())) {} + : RegexAST(rhs), + m_left(std::unique_ptr>(rhs.m_left->clone())), + m_right(std::unique_ptr>(rhs.m_right->clone())) {} /** * 
Used for cloning a unique_pointer of type RegexASTOr @@ -469,38 +469,38 @@ class RegexASTOr : public RegexAST { } /** - * Add the needed RegexNFA::states to the passed in nfa to handle a + * Add the needed Nfa::states to the passed in nfa to handle a * RegexASTOr before transitioning to an accepting end_state * @param nfa * @param end_state */ - auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; - [[nodiscard]] auto get_left() const -> RegexAST const* { return m_left.get(); } + [[nodiscard]] auto get_left() const -> RegexAST const* { return m_left.get(); } - [[nodiscard]] auto get_right() const -> RegexAST const* { return m_right.get(); } + [[nodiscard]] auto get_right() const -> RegexAST const* { return m_right.get(); } private: - std::unique_ptr> m_left; - std::unique_ptr> m_right; + std::unique_ptr> m_left; + std::unique_ptr> m_right; }; -template -class RegexASTCat : public RegexAST { +template +class RegexASTCat : public RegexAST { public: ~RegexASTCat() override = default; RegexASTCat( - std::unique_ptr> left, - std::unique_ptr> right + std::unique_ptr> left, + std::unique_ptr> right ); RegexASTCat(RegexASTCat const& rhs) - : RegexAST(rhs), - m_left(std::unique_ptr>(rhs.m_left->clone())), - m_right(std::unique_ptr>(rhs.m_right->clone())) {} + : RegexAST(rhs), + m_left(std::unique_ptr>(rhs.m_left->clone())), + m_right(std::unique_ptr>(rhs.m_right->clone())) {} /** * Used for cloning a unique_pointer of type RegexASTCat @@ -532,38 +532,38 @@ class RegexASTCat : public RegexAST { } /** - * Add the needed RegexNFA::states to the passed in nfa to handle a + * Add the needed Nfa::states to the passed in nfa to handle a * RegexASTCat before transitioning to an accepting end_state * @param nfa * @param end_state */ - auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; - [[nodiscard]] auto get_left() const -> RegexAST const* { return m_left.get(); } + [[nodiscard]] auto get_left() const -> RegexAST const* { return m_left.get(); } - [[nodiscard]] auto get_right() const -> RegexAST const* { return m_right.get(); } + [[nodiscard]] auto get_right() const -> RegexAST const* { return m_right.get(); } private: - std::unique_ptr> m_left; - std::unique_ptr> m_right; + std::unique_ptr> m_left; + std::unique_ptr> m_right; }; -template -class RegexASTMultiplication : public RegexAST { +template +class RegexASTMultiplication : public RegexAST { public: ~RegexASTMultiplication() override = default; RegexASTMultiplication( - std::unique_ptr> operand, + std::unique_ptr> operand, uint32_t min, uint32_t max ); RegexASTMultiplication(RegexASTMultiplication const& rhs) - : RegexAST(rhs), - m_operand(std::unique_ptr>(rhs.m_operand->clone())), + : RegexAST(rhs), + m_operand(std::unique_ptr>(rhs.m_operand->clone())), m_min(rhs.m_min), m_max(rhs.m_max) {} @@ -596,18 +596,18 @@ class RegexASTMultiplication : public RegexAST { } /** - * Add the needed RegexNFA::states to the passed in nfa to handle a + * Add the needed Nfa::states to the passed in nfa to handle a * RegexASTMultiplication before transitioning to an accepting end_state * @param nfa * @param end_state */ - auto add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) const -> void override; + auto add_to_nfa(Nfa* nfa, 
TypedNfaState* end_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; - [[nodiscard]] auto is_infinite() const -> bool { return this->m_max == 0; } + [[nodiscard]] auto is_infinite() const -> bool { return m_max == 0; } - [[nodiscard]] auto get_operand() const -> std::unique_ptr> const& { + [[nodiscard]] auto get_operand() const -> std::unique_ptr> const& { return m_operand; } @@ -616,7 +616,7 @@ class RegexASTMultiplication : public RegexAST { [[nodiscard]] auto get_max() const -> uint32_t { return m_max; } private: - std::unique_ptr> m_operand; + std::unique_ptr> m_operand; uint32_t m_min; uint32_t m_max; }; @@ -626,10 +626,10 @@ class RegexASTMultiplication : public RegexAST { * NOTE: * - `m_tag` is always expected to be non-null. * - `m_group_regex_ast` is always expected to be non-null. - * @tparam NFAStateType Specifies the type of transition (bytes or UTF-8 characters). + * @tparam TypedNfaState Specifies the type of transition (bytes or UTF-8 characters). */ -template -class RegexASTCapture : public RegexAST { +template +class RegexASTCapture : public RegexAST { public: ~RegexASTCapture() override = default; @@ -639,7 +639,7 @@ class RegexASTCapture : public RegexAST { * @throw std::invalid_argument if `group_regex_ast` or `tag` are `nullptr`. */ RegexASTCapture( - std::unique_ptr> group_regex_ast, + std::unique_ptr> group_regex_ast, std::unique_ptr tag ) : m_group_regex_ast{( @@ -649,19 +649,19 @@ class RegexASTCapture : public RegexAST { )}, m_tag{nullptr == tag ? throw std::invalid_argument("Tag cannot be null") : std::move(tag)} { - RegexAST::set_subtree_positive_tags( + RegexAST::set_subtree_positive_tags( m_group_regex_ast->get_subtree_positive_tags() ); - RegexAST::add_subtree_positive_tags({m_tag.get()}); + RegexAST::add_subtree_positive_tags({m_tag.get()}); } RegexASTCapture(RegexASTCapture const& rhs) - : RegexAST{rhs}, + : RegexAST{rhs}, m_group_regex_ast{ - std::unique_ptr>(rhs.m_group_regex_ast->clone()) + std::unique_ptr>(rhs.m_group_regex_ast->clone()) }, m_tag{std::make_unique(*rhs.m_tag)} { - RegexAST::set_subtree_positive_tags(rhs.get_subtree_positive_tags()); + RegexAST::set_subtree_positive_tags(rhs.get_subtree_positive_tags()); } /** @@ -692,183 +692,183 @@ class RegexASTCapture : public RegexAST { } /** - * Adds the needed `RegexNFA::states` to the passed in nfa to handle a + * Adds the needed `Nfa::states` to the passed in nfa to handle a * `RegexASTCapture` before transitioning to a `dest_state`. 
* @param nfa * @param dest_state */ - auto add_to_nfa(RegexNFA* nfa, NFAStateType* dest_state) const -> void override; + auto add_to_nfa(Nfa* nfa, TypedNfaState* dest_state) const -> void override; [[nodiscard]] auto serialize() const -> std::u32string override; [[nodiscard]] auto get_group_name() const -> std::string_view { return m_tag->get_name(); } [[nodiscard]] auto get_group_regex_ast( - ) const -> std::unique_ptr> const& { + ) const -> std::unique_ptr> const& { return m_group_regex_ast; } private: - std::unique_ptr> m_group_regex_ast; + std::unique_ptr> m_group_regex_ast; std::unique_ptr m_tag; }; -template -[[nodiscard]] auto RegexASTEmpty::serialize() const -> std::u32string { - return fmt::format(U"{}", RegexAST::serialize_negative_tags()); +template +[[nodiscard]] auto RegexASTEmpty::serialize() const -> std::u32string { + return fmt::format(U"{}", RegexAST::serialize_negative_tags()); } -template -RegexASTLiteral::RegexASTLiteral(uint32_t character) : m_character(character) {} +template +RegexASTLiteral::RegexASTLiteral(uint32_t character) : m_character(character) {} -template -void RegexASTLiteral::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) +template +void RegexASTLiteral::add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const { nfa->add_root_interval(Interval(m_character, m_character), end_state); } -template -[[nodiscard]] auto RegexASTLiteral::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTLiteral::serialize() const -> std::u32string { return fmt::format( U"{}{}", static_cast(m_character), - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } -template -RegexASTInteger::RegexASTInteger(uint32_t digit) { +template +RegexASTInteger::RegexASTInteger(uint32_t digit) { digit = digit - '0'; m_digits.push_back(digit); } -template -RegexASTInteger::RegexASTInteger(RegexASTInteger* left, uint32_t digit) +template +RegexASTInteger::RegexASTInteger(RegexASTInteger* left, uint32_t digit) : m_digits(std::move(left->m_digits)) { digit = digit - '0'; m_digits.push_back(digit); } -template -void RegexASTInteger::add_to_nfa( - [[maybe_unused]] RegexNFA* nfa, - [[maybe_unused]] NFAStateType* end_state +template +void RegexASTInteger::add_to_nfa( + [[maybe_unused]] Nfa* nfa, + [[maybe_unused]] TypedNfaState* end_state ) const { throw std::runtime_error("Unsupported"); } -template -[[nodiscard]] auto RegexASTInteger::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTInteger::serialize() const -> std::u32string { auto const digits_string = fmt::format("{}", fmt::join(m_digits, "")); return fmt::format( U"{}{}", std::u32string(digits_string.begin(), digits_string.end()), - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } -template -RegexASTOr::RegexASTOr( - std::unique_ptr> left, - std::unique_ptr> right +template +RegexASTOr::RegexASTOr( + std::unique_ptr> left, + std::unique_ptr> right ) : m_left(std::move(left)), m_right(std::move(right)) { m_left->set_negative_tags(m_right->get_subtree_positive_tags()); m_right->set_negative_tags(m_left->get_subtree_positive_tags()); - RegexAST::set_subtree_positive_tags(m_left->get_subtree_positive_tags()); - RegexAST::add_subtree_positive_tags(m_right->get_subtree_positive_tags()); + RegexAST::set_subtree_positive_tags(m_left->get_subtree_positive_tags()); + RegexAST::add_subtree_positive_tags(m_right->get_subtree_positive_tags()); } -template -void RegexASTOr::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) +template +void 
RegexASTOr::add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const { m_left->add_to_nfa_with_negative_tags(nfa, end_state); m_right->add_to_nfa_with_negative_tags(nfa, end_state); } -template -[[nodiscard]] auto RegexASTOr::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTOr::serialize() const -> std::u32string { return fmt::format( U"({})|({}){}", nullptr != m_left ? m_left->serialize() : U"null", nullptr != m_right ? m_right->serialize() : U"null", - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } -template -RegexASTCat::RegexASTCat( - std::unique_ptr> left, - std::unique_ptr> right +template +RegexASTCat::RegexASTCat( + std::unique_ptr> left, + std::unique_ptr> right ) : m_left(std::move(left)), m_right(std::move(right)) { - RegexAST::set_subtree_positive_tags(m_left->get_subtree_positive_tags()); - RegexAST::add_subtree_positive_tags(m_right->get_subtree_positive_tags()); + RegexAST::set_subtree_positive_tags(m_left->get_subtree_positive_tags()); + RegexAST::add_subtree_positive_tags(m_right->get_subtree_positive_tags()); } -template -void RegexASTCat::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) +template +void RegexASTCat::add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const { - NFAStateType* saved_root = nfa->get_root(); - NFAStateType* intermediate_state = nfa->new_state(); + TypedNfaState* saved_root = nfa->get_root(); + TypedNfaState* intermediate_state = nfa->new_state(); m_left->add_to_nfa_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); m_right->add_to_nfa_with_negative_tags(nfa, end_state); nfa->set_root(saved_root); } -template -[[nodiscard]] auto RegexASTCat::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTCat::serialize() const -> std::u32string { return fmt::format( U"{}{}{}", nullptr != m_left ? m_left->serialize() : U"null", nullptr != m_right ? 
m_right->serialize() : U"null", - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } -template -RegexASTMultiplication::RegexASTMultiplication( - std::unique_ptr> operand, +template +RegexASTMultiplication::RegexASTMultiplication( + std::unique_ptr> operand, uint32_t const min, uint32_t const max ) : m_operand(std::move(operand)), m_min(min), m_max(max) { - RegexAST::set_subtree_positive_tags(m_operand->get_subtree_positive_tags()); + RegexAST::set_subtree_positive_tags(m_operand->get_subtree_positive_tags()); } -template -void RegexASTMultiplication::add_to_nfa( - RegexNFA* nfa, - NFAStateType* end_state +template +void RegexASTMultiplication::add_to_nfa( + Nfa* nfa, + TypedNfaState* end_state ) const { - NFAStateType* saved_root = nfa->get_root(); - if (this->m_min == 0) { + TypedNfaState* saved_root = nfa->get_root(); + if (m_min == 0) { nfa->get_root()->add_epsilon_transition(end_state); } else { - for (uint32_t i = 1; i < this->m_min; i++) { - NFAStateType* intermediate_state = nfa->new_state(); + for (uint32_t i = 1; i < m_min; i++) { + TypedNfaState* intermediate_state = nfa->new_state(); m_operand->add_to_nfa_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); } m_operand->add_to_nfa_with_negative_tags(nfa, end_state); } - if (this->is_infinite()) { + if (is_infinite()) { nfa->set_root(end_state); m_operand->add_to_nfa_with_negative_tags(nfa, end_state); - } else if (this->m_max > this->m_min) { - if (this->m_min != 0) { - NFAStateType* intermediate_state = nfa->new_state(); + } else if (m_max > m_min) { + if (m_min != 0) { + TypedNfaState* intermediate_state = nfa->new_state(); m_operand->add_to_nfa_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); } - for (uint32_t i = this->m_min + 1; i < this->m_max; ++i) { + for (uint32_t i = m_min + 1; i < m_max; ++i) { m_operand->add_to_nfa_with_negative_tags(nfa, end_state); - NFAStateType* intermediate_state = nfa->new_state(); + TypedNfaState* intermediate_state = nfa->new_state(); m_operand->add_to_nfa_with_negative_tags(nfa, intermediate_state); nfa->set_root(intermediate_state); } @@ -877,8 +877,8 @@ void RegexASTMultiplication::add_to_nfa( nfa->set_root(saved_root); } -template -[[nodiscard]] auto RegexASTMultiplication::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTMultiplication::serialize() const -> std::u32string { auto const min_string = std::to_string(m_min); auto const max_string = std::to_string(m_max); @@ -887,15 +887,13 @@ template nullptr != m_operand ? m_operand->serialize() : U"null", std::u32string(min_string.begin(), min_string.end()), is_infinite() ? U"inf" : std::u32string(max_string.begin(), max_string.end()), - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } -template -auto RegexASTCapture::add_to_nfa( - RegexNFA* nfa, - NFAStateType* dest_state -) const -> void { +template +auto RegexASTCapture::add_to_nfa(Nfa* nfa, TypedNfaState* dest_state) + const -> void { // TODO: move this into a documentation file in the future, and reference it here. 
// The NFA constructed for a capture group follows the structure below, with tagged transitions // explicitly labeled for clarity: @@ -939,21 +937,21 @@ auto RegexASTCapture::add_to_nfa( nfa->set_root(initial_root); } -template -[[nodiscard]] auto RegexASTCapture::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTCapture::serialize() const -> std::u32string { auto const tag_name_u32 = std::u32string(m_tag->get_name().cbegin(), m_tag->get_name().cend()); return fmt::format( U"({})<{}>{}", m_group_regex_ast->serialize(), tag_name_u32, - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } -template -RegexASTGroup::RegexASTGroup( +template +RegexASTGroup::RegexASTGroup( RegexASTGroup const* left, - RegexASTLiteral const* right + RegexASTLiteral const* right ) { if (right == nullptr) { throw std::runtime_error("RegexASTGroup1: right == nullptr: A bracket expression in the " @@ -965,16 +963,16 @@ RegexASTGroup::RegexASTGroup( m_ranges.emplace_back(right->get_character(), right->get_character()); } -template -RegexASTGroup::RegexASTGroup(RegexASTGroup const* left, RegexASTGroup const* right) +template +RegexASTGroup::RegexASTGroup(RegexASTGroup const* left, RegexASTGroup const* right) : m_negate(left->m_negate), m_ranges(left->m_ranges) { assert(right->m_ranges.size() == 1); // Only add LiteralRange m_ranges.push_back(right->m_ranges[0]); } -template -RegexASTGroup::RegexASTGroup(RegexASTLiteral const* right) { +template +RegexASTGroup::RegexASTGroup(RegexASTLiteral const* right) { if (right == nullptr) { throw std::runtime_error("RegexASTGroup2: right == nullptr: A bracket expression in the " "schema contains illegal characters, remember to escape special " @@ -984,16 +982,16 @@ RegexASTGroup::RegexASTGroup(RegexASTLiteral const* m_ranges.emplace_back(right->get_character(), right->get_character()); } -template -RegexASTGroup::RegexASTGroup(RegexASTGroup const* right) : m_negate(false) { +template +RegexASTGroup::RegexASTGroup(RegexASTGroup const* right) : m_negate(false) { assert(right->m_ranges.size() == 1); // Only add LiteralRange m_ranges.push_back(right->m_ranges[0]); } -template -RegexASTGroup::RegexASTGroup( - RegexASTLiteral const* left, - RegexASTLiteral const* right +template +RegexASTGroup::RegexASTGroup( + RegexASTLiteral const* left, + RegexASTLiteral const* right ) { if (left == nullptr || right == nullptr) { throw std::runtime_error( @@ -1007,22 +1005,22 @@ RegexASTGroup::RegexASTGroup( m_ranges.emplace_back(left->get_character(), right->get_character()); } -template -RegexASTGroup::RegexASTGroup(std::vector const& literals) +template +RegexASTGroup::RegexASTGroup(std::vector const& literals) : m_negate(false) { for (uint32_t literal : literals) { m_ranges.emplace_back(literal, literal); } } -template -RegexASTGroup::RegexASTGroup(uint32_t min, uint32_t max) : m_negate(false) { +template +RegexASTGroup::RegexASTGroup(uint32_t min, uint32_t max) : m_negate(false) { m_ranges.emplace_back(min, max); } // ranges must be sorted -template -auto RegexASTGroup::merge(std::vector const& ranges) -> std::vector { +template +auto RegexASTGroup::merge(std::vector const& ranges) -> std::vector { std::vector merged_ranges; if (ranges.empty()) { return merged_ranges; @@ -1042,8 +1040,8 @@ auto RegexASTGroup::merge(std::vector const& ranges) -> std } // ranges must be sorted and non-overlapping -template -auto RegexASTGroup::complement(std::vector const& ranges +template +auto RegexASTGroup::complement(std::vector const& ranges ) -> std::vector 
{ std::vector complemented; uint32_t low = 0; @@ -1059,15 +1057,15 @@ auto RegexASTGroup::complement(std::vector const& ranges return complemented; } -template -void RegexASTGroup::add_to_nfa(RegexNFA* nfa, NFAStateType* end_state) +template +void RegexASTGroup::add_to_nfa(Nfa* nfa, TypedNfaState* end_state) const { // TODO: there should be a better way to do this with a set and keep m_ranges sorted, but we // have to consider removing overlap + taking the compliment. auto merged_ranges = m_ranges; std::sort(merged_ranges.begin(), merged_ranges.end()); merged_ranges = merge(merged_ranges); - if (this->m_negate) { + if (m_negate) { merged_ranges = complement(merged_ranges); } for (auto const& [begin, end] : merged_ranges) { @@ -1075,8 +1073,8 @@ void RegexASTGroup::add_to_nfa(RegexNFA* nfa, NFASta } } -template -[[nodiscard]] auto RegexASTGroup::serialize() const -> std::u32string { +template +[[nodiscard]] auto RegexASTGroup::serialize() const -> std::u32string { std::u32string ranges_serialized; if (m_is_wildcard) { ranges_serialized += U"*"; @@ -1102,7 +1100,7 @@ template U"[{}{}]{}", m_negate ? U"^" : U"", ranges_serialized, - RegexAST::serialize_negative_tags() + RegexAST::serialize_negative_tags() ); } } // namespace log_surgeon::finite_automata diff --git a/src/log_surgeon/finite_automata/RegexDFAStateType.hpp b/src/log_surgeon/finite_automata/RegexDFAStateType.hpp deleted file mode 100644 index ae4e52d4..00000000 --- a/src/log_surgeon/finite_automata/RegexDFAStateType.hpp +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_TYPE -#define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_TYPE - -#include - -namespace log_surgeon::finite_automata { -enum class RegexDFAStateType : uint8_t { - Byte, - UTF8 -}; -} // namespace log_surgeon::finite_automata - -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_STATE_TYPE diff --git a/src/log_surgeon/finite_automata/RegexNFAStateType.hpp b/src/log_surgeon/finite_automata/RegexNFAStateType.hpp deleted file mode 100644 index 24ef2153..00000000 --- a/src/log_surgeon/finite_automata/RegexNFAStateType.hpp +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE_TYPE -#define LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE_TYPE - -#include - -namespace log_surgeon::finite_automata { -enum class RegexNFAStateType : uint8_t { - Byte, - UTF8 -}; -} // namespace log_surgeon::finite_automata - -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_STATE_TYPE diff --git a/src/log_surgeon/finite_automata/TaggedTransition.hpp b/src/log_surgeon/finite_automata/TaggedTransition.hpp index 86fe7a39..43315b2a 100644 --- a/src/log_surgeon/finite_automata/TaggedTransition.hpp +++ b/src/log_surgeon/finite_automata/TaggedTransition.hpp @@ -1,6 +1,7 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_TAGGED_TRANSITION #define LOG_SURGEON_FINITE_AUTOMATA_TAGGED_TRANSITION +#include #include #include #include @@ -12,13 +13,12 @@ #include namespace log_surgeon::finite_automata { - /** * Represents an NFA transition indicating that a capture group has been matched. * NOTE: `m_tag` is always expected to be non-null. - * @tparam NFAStateType Specifies the type of transition (bytes or UTF-8 characters). + * @tparam TypedNfaState Specifies the type of transition (bytes or UTF-8 characters). */ -template +template class PositiveTaggedTransition { public: /** @@ -26,18 +26,18 @@ class PositiveTaggedTransition { * @param dest_state * @throw std::invalid_argument if `tag` is `nullptr`. 
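A concrete illustration of the sort, merge, and (for negated groups) complement pipeline that `RegexASTGroup::add_to_nfa` above applies before adding interval transitions; character values are written as ASCII for readability, and the exact upper bound `complement` uses for the final gap is elided by the diff context:

    // Bracket expression with ranges a-f, c-h, and x-z:
    //     m_ranges = {('a','f'), ('c','h'), ('x','z')}
    // After sorting and merge(), the overlapping ranges collapse:
    //     {('a','h'), ('x','z')}
    // For a negated group ([^...]), complement() of the merged, non-overlapping
    // ranges yields the gaps around them:
    //     {(0, 'a'-1), ('h'+1, 'x'-1), ('z'+1, <max supported code point>)}
    // Each resulting range then becomes a root interval transition to `end_state`.
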
*/ - PositiveTaggedTransition(Tag const* tag, NFAStateType const* dest_state) + PositiveTaggedTransition(Tag const* tag, TypedNfaState const* dest_state) : m_tag{nullptr == tag ? throw std::invalid_argument("Tag cannot be null") : tag}, m_dest_state{dest_state} {} - [[nodiscard]] auto get_dest_state() const -> NFAStateType const* { return m_dest_state; } + [[nodiscard]] auto get_dest_state() const -> TypedNfaState const* { return m_dest_state; } /** * @param state_ids A map of states to their unique identifiers. * @return A string representation of the positive tagged transition on success. * @return std::nullopt if `m_dest_state` is not in `state_ids`. */ - [[nodiscard]] auto serialize(std::unordered_map const& state_ids + [[nodiscard]] auto serialize(std::unordered_map const& state_ids ) const -> std::optional { auto const state_id_it = state_ids.find(m_dest_state); if (state_id_it == state_ids.end()) { @@ -48,15 +48,15 @@ class PositiveTaggedTransition { private: Tag const* m_tag; - NFAStateType const* m_dest_state; + TypedNfaState const* m_dest_state; }; /** * Represents an NFA transition indicating that a capture group has been unmatched. * NOTE: All tags in `m_tags` are always expected to be non-null. - * @tparam NFAStateType Specifies the type of transition (bytes or UTF-8 characters). + * @tparam TypedNfaState Specifies the type of transition (bytes or UTF-8 characters). */ -template +template class NegativeTaggedTransition { public: /** @@ -64,7 +64,7 @@ class NegativeTaggedTransition { * @param dest_state * @throw std::invalid_argument if any elements in `tags` is `nullptr`. */ - NegativeTaggedTransition(std::vector tags, NFAStateType const* dest_state) + NegativeTaggedTransition(std::vector tags, TypedNfaState const* dest_state) : m_tags{[&tags] { if (std::ranges::any_of(tags, [](Tag const* tag) { return nullptr == tag; })) { throw std::invalid_argument("Tags cannot contain null elements"); @@ -73,14 +73,14 @@ class NegativeTaggedTransition { }()}, m_dest_state{dest_state} {} - [[nodiscard]] auto get_dest_state() const -> NFAStateType const* { return m_dest_state; } + [[nodiscard]] auto get_dest_state() const -> TypedNfaState const* { return m_dest_state; } /** * @param state_ids A map of states to their unique identifiers. * @return A string representation of the negative tagged transition on success. * @return std::nullopt if `m_dest_state` is not in `state_ids`. 
*/ - [[nodiscard]] auto serialize(std::unordered_map const& state_ids + [[nodiscard]] auto serialize(std::unordered_map const& state_ids ) const -> std::optional { auto const state_id_it = state_ids.find(m_dest_state); if (state_id_it == state_ids.end()) { @@ -94,7 +94,7 @@ class NegativeTaggedTransition { private: std::vector m_tags; - NFAStateType const* m_dest_state; + TypedNfaState const* m_dest_state; }; } // namespace log_surgeon::finite_automata diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ec974e6b..0551615b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -5,15 +5,15 @@ set( ../src/log_surgeon/finite_automata/PrefixTree.cpp ../src/log_surgeon/finite_automata/PrefixTree.hpp ../src/log_surgeon/finite_automata/RegexAST.hpp - ../src/log_surgeon/finite_automata/RegexNFA.hpp - ../src/log_surgeon/finite_automata/RegexNFAState.hpp - ../src/log_surgeon/finite_automata/RegexNFAStateType.hpp + ../src/log_surgeon/finite_automata/Nfa.hpp + ../src/log_surgeon/finite_automata/NfaState.hpp + ../src/log_surgeon/finite_automata/NfaStateType.hpp ../src/log_surgeon/finite_automata/RegisterHandler.hpp ../src/log_surgeon/finite_automata/Tag.hpp ../src/log_surgeon/finite_automata/TaggedTransition.hpp - ../src/log_surgeon/LALR1Parser.cpp - ../src/log_surgeon/LALR1Parser.hpp - ../src/log_surgeon/LALR1Parser.tpp + ../src/log_surgeon/Lalr1Parser.cpp + ../src/log_surgeon/Lalr1Parser.hpp + ../src/log_surgeon/Lalr1Parser.tpp ../src/log_surgeon/ParserInputBuffer.hpp ../src/log_surgeon/ParserInputBuffer.cpp ../src/log_surgeon/Schema.hpp diff --git a/tests/test-NFA.cpp b/tests/test-NFA.cpp index 6a92f4bb..160d421a 100644 --- a/tests/test-NFA.cpp +++ b/tests/test-NFA.cpp @@ -7,28 +7,28 @@ #include #include +#include #include -#include #include #include using log_surgeon::cSizeOfByte; -using log_surgeon::finite_automata::RegexNFAByteState; +using log_surgeon::finite_automata::ByteNfaState; using log_surgeon::Schema; using log_surgeon::SchemaVarAST; using std::string; using std::stringstream; using std::vector; -using ByteLexicalRule = log_surgeon::LexicalRule; -using ByteNFA = log_surgeon::finite_automata::RegexNFA; -using RegexASTCatByte = log_surgeon::finite_automata::RegexASTCat; -using RegexASTCaptureByte = log_surgeon::finite_automata::RegexASTCapture; -using RegexASTGroupByte = log_surgeon::finite_automata::RegexASTGroup; -using RegexASTLiteralByte = log_surgeon::finite_automata::RegexASTLiteral; +using ByteLexicalRule = log_surgeon::LexicalRule; +using ByteNFA = log_surgeon::finite_automata::Nfa; +using RegexASTCatByte = log_surgeon::finite_automata::RegexASTCat; +using RegexASTCaptureByte = log_surgeon::finite_automata::RegexASTCapture; +using RegexASTGroupByte = log_surgeon::finite_automata::RegexASTGroup; +using RegexASTLiteralByte = log_surgeon::finite_automata::RegexASTLiteral; using RegexASTMultiplicationByte - = log_surgeon::finite_automata::RegexASTMultiplication; -using RegexASTOrByte = log_surgeon::finite_automata::RegexASTOr; + = log_surgeon::finite_automata::RegexASTMultiplication; +using RegexASTOrByte = log_surgeon::finite_automata::RegexASTOr; TEST_CASE("Test NFA", "[NFA]") { Schema schema; diff --git a/tests/test-lexer.cpp b/tests/test-lexer.cpp index dd305a76..48b2185c 100644 --- a/tests/test-lexer.cpp +++ b/tests/test-lexer.cpp @@ -6,8 +6,8 @@ #include +#include #include -#include #include #include @@ -18,18 +18,18 @@ using std::u32string; using std::vector; using std::wstring_convert; -using RegexASTCatByte = log_surgeon::finite_automata::RegexASTCat< - 
log_surgeon::finite_automata::RegexNFAByteState>; -using RegexASTCaptureByte = log_surgeon::finite_automata::RegexASTCapture< - log_surgeon::finite_automata::RegexNFAByteState>; -using RegexASTGroupByte = log_surgeon::finite_automata::RegexASTGroup< - log_surgeon::finite_automata::RegexNFAByteState>; -using RegexASTLiteralByte = log_surgeon::finite_automata::RegexASTLiteral< - log_surgeon::finite_automata::RegexNFAByteState>; +using RegexASTCatByte + = log_surgeon::finite_automata::RegexASTCat; +using RegexASTCaptureByte + = log_surgeon::finite_automata::RegexASTCapture; +using RegexASTGroupByte + = log_surgeon::finite_automata::RegexASTGroup; +using RegexASTLiteralByte + = log_surgeon::finite_automata::RegexASTLiteral; using RegexASTMultiplicationByte = log_surgeon::finite_automata::RegexASTMultiplication< - log_surgeon::finite_automata::RegexNFAByteState>; + log_surgeon::finite_automata::ByteNfaState>; using RegexASTOrByte - = log_surgeon::finite_automata::RegexASTOr; + = log_surgeon::finite_automata::RegexASTOr; using log_surgeon::SchemaVarAST; namespace {