From 0d12da52f69a9b9d686ed8566e3f9ad7178dc1ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Chocholat=C3=BD?= Date: Thu, 14 Nov 2024 09:49:30 +0100 Subject: [PATCH 1/5] feat(re2): Add tests for ^ and $ regex symbols --- tests/re2parser.cc | 154 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 153 insertions(+), 1 deletion(-) diff --git a/tests/re2parser.cc b/tests/re2parser.cc index bf7dc3a80..f5bcfa20b 100644 --- a/tests/re2parser.cc +++ b/tests/re2parser.cc @@ -1,8 +1,10 @@ #include #include -#include "mata/nfa/nfa.hh" #include "mata/parser/re2parser.hh" +#include "mata/nfa/builder.hh" +#include "mata/nfa/nfa.hh" + using namespace mata::nfa; using Symbol = mata::Symbol; @@ -1315,3 +1317,153 @@ TEST_CASE("mata::Parser bug epsilon") CHECK(x.is_in_lang(Run{Word{'a', 'a', 'a', 'a'}, {}})); } } // }}} + +TEST_CASE("mata::parser Parsing regexes with ^ and $") { + Nfa nfa; + Nfa expected{}; + + SECTION("Handling of '\\'") { + mata::parser::create_nfa(&nfa, "a\\\\b"); + expected = mata::nfa::builder::parse_from_mata( + std::string{ R"( + @NFA-explicit + %Alphabet-auto + %Initial q0 + %Final q3 + q0 97 q1 + q1 92 q2 + q2 98 q3)" + }); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("a|b$, a simple OR example with end marker") { + mata::parser::create_nfa(&nfa, "a|b$"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(0, 'b', 1); + expected.final.insert(1); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("^a|b, a simple OR example with begin marker") { + mata::parser::create_nfa(&nfa, "^a|b"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(0, 'b', 1); + expected.final.insert(1); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("^a|b$, a simple OR example with begin and end marker") { + mata::parser::create_nfa(&nfa, "^a|b$"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(0, 'b', 1); + expected.final.insert(1); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("^(a|b)$, a simple OR example with begin and end marker around capture group") { + mata::parser::create_nfa(&nfa, "^(a|b)$"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(0, 'b', 1); + expected.final.insert(1); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("a$|b, a simple OR example with end marker on the left side") { + mata::parser::create_nfa(&nfa, "a$|b"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(0, 'b', 1); + expected.final.insert(1); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("^a$|^b$, a simple OR example with multiple begin and end markers") { + mata::parser::create_nfa(&nfa, "^a$|^b$"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(0, 'b', 1); + expected.final.insert(1); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("aed|(bab)$, a simple OR example with trailing end marker") { + mata::parser::create_nfa(&nfa, "aed|(bab)$"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(1, 'e', 2); + expected.delta.add(2, 'd', 3); + expected.delta.add(0, 'b', 4); + expected.delta.add(4, 'a', 5); + expected.delta.add(5, 'b', 3); + expected.final.insert(3); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("aed|bab$, a simple OR example with trailing end marker") { + mata::parser::create_nfa(&nfa, "aed|bab$"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(1, 'e', 2); + expected.delta.add(2, 'd', 3); + expected.delta.add(0, 'b', 4); + expected.delta.add(4, 'a', 5); + expected.delta.add(5, 'b', 3); + expected.final.insert(3); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("^systempath\\=https|ftp$ correct parentheses") { + mata::parser::create_nfa(&nfa, "^[sS][yY][sS][tT][eE][mM][pP][aA][tT][hH]\\\\=(([hH][tT]{2}[pP][sS]?)|([fF][tT][pP]))$"); + expected = mata::nfa::builder::parse_from_mata(std::string{ R"( + @NFA-explicit + %Alphabet-auto + %Initial q0 + %Final q16 q17 + q0 83 q1 + q0 115 q1 + q1 89 q2 + q1 121 q2 + q2 83 q3 + q2 115 q3 + q3 84 q4 + q3 116 q4 + q4 69 q5 + q4 101 q5 + q5 77 q6 + q5 109 q6 + q6 80 q7 + q6 112 q7 + q7 65 q8 + q7 97 q8 + q8 84 q9 + q8 116 q9 + q9 72 q10 + q9 104 q10 + q10 92 q11 + q11 61 q12 + q12 70 q18 + q12 72 q13 + q12 102 q18 + q12 104 q13 + q13 84 q14 + q13 116 q14 + q14 84 q15 + q14 116 q15 + q15 80 q16 + q15 112 q16 + q16 83 q17 + q16 115 q17 + q18 84 q19 + q18 116 q19 + q19 80 q17 + q19 112 q17 + )" + }); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } +} From 459c6cd524c8bd77adf9d1e233af5e2a22836725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Chocholat=C3=BD?= Date: Thu, 14 Nov 2024 10:01:14 +0100 Subject: [PATCH 2/5] fix(re2): Fix creating NFAs from regexes with ^ and $ regex symbols --- src/re2parser.cc | 50 +++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/src/re2parser.cc b/src/re2parser.cc index 03e9f2554..beeb1bea0 100644 --- a/src/re2parser.cc +++ b/src/re2parser.cc @@ -136,11 +136,14 @@ namespace { this->outgoingEdges = std::vector>> (prog_size); // We traverse all the states and create corresponding states and edges in Nfa - for (mata::nfa::State current_state = start_state; current_state < prog_size; current_state++) { - re2::Prog::Inst *inst = prog->inst(static_cast(current_state)); + for (State current_state = start_state, re2_state = start_state; re2_state < prog_size; ++ + re2_state) { + /// Whether to increment the current state @c current_state when the @c re2_state increments. + bool increment_current_state{true}; + re2::Prog::Inst *inst = prog->inst(static_cast(re2_state)); // Every type of state can be final (due to epsilon transition), so we check it regardless of its type - if (this->state_cache.is_final_state[current_state]) { - this->make_state_final(current_state, explicit_nfa); + if (this->state_cache.is_final_state[re2_state]) { + this->make_state_final(re2_state, explicit_nfa); } switch (inst->opcode()) { default: @@ -164,33 +167,35 @@ namespace { empty_flag = static_cast(inst->empty()); // ^ - beginning of line if (empty_flag & re2::kEmptyBeginLine) { - // TODO Symbol? - symbols.push_back(300); + increment_current_state = false; } // $ - end of line if (empty_flag & re2::kEmptyEndLine) { - // TODO Symbol? - symbols.push_back(10); + // TODO How to handle? + // symbols.push_back(301); + increment_current_state = false; } // \A - beginning of text if (empty_flag & re2::kEmptyBeginText) { - // TODO Symbol? - symbols.push_back(301); + increment_current_state = false; } // \z - end of text if (empty_flag & re2::kEmptyEndText) { - // TODO Symbol? - symbols.push_back(302); + // TODO How to handle? + // symbols.push_back(302); + increment_current_state = false; } // \b - word boundary if (empty_flag & re2::kEmptyWordBoundary) { - // TODO Symbol? - symbols.push_back(303); + // TODO How to handle? + // symbols.push_back(303); + increment_current_state = false; } // \B - not \b if (empty_flag & re2::kEmptyNonWordBoundary) { - // TODO Symbol? - symbols.push_back(304); + // TODO How to handle? + // symbols.push_back(304); + increment_current_state = false; } break; // kInstByteRange represents states with a "byte range" on the outgoing transition(s) @@ -212,15 +217,17 @@ namespace { if (!use_epsilon) { // There is an epsilon transition to the currentState+1 we will need to copy transitions of // the currentState+1 to the currentState. - if (!this->state_cache.is_last[current_state]) { - for (auto state: this->state_cache.state_mapping[current_state + 1]) { - copyEdgesFromTo.emplace_back(state, current_state); + if (!this->state_cache.is_last[re2_state]) { + for (auto state: this->state_cache.state_mapping[re2_state + 1]) { + copyEdgesFromTo.emplace_back(state, re2_state); } } } symbols.clear(); break; } + + if (increment_current_state) { ++current_state; } } if (!use_epsilon) { // We will traverse the vector in reversed order. Like that, we will also handle chains of epsilon transitions @@ -419,7 +426,8 @@ namespace { if (inst->last()) { this->state_cache.is_last[state] = true; } - if (inst->opcode() == re2::kInstMatch) { + if (inst->opcode() == re2::kInstMatch || + (inst->opcode() == re2::kInstEmptyWidth && inst->empty() & re2::kEmptyEndText)) { this->state_cache.is_final_state[state] = true; } } @@ -505,6 +513,8 @@ void mata::parser::create_nfa(nfa::Nfa* nfa, const std::string& pattern, bool us RegexParser regexParser{}; auto parsed_regex = regexParser.parse_regex_string(pattern); auto program = parsed_regex->CompileToProg(regexParser.options.max_mem() * 2 / 3); + // FIXME: use_epsilon = false completely breaks the method convert_pro_to_nfa(). Needs fixing before allowing to + // pass the argument use_epsilon to convert_pro_to_nfa(). regexParser.convert_pro_to_nfa(nfa, program, true, epsilon_value); delete program; // Decrements reference count and deletes object if the count reaches 0 From 73f9e01d4acf5c3dcbb479251121375e72353f60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Chocholat=C3=BD?= Date: Fri, 15 Nov 2024 10:16:20 +0100 Subject: [PATCH 3/5] fix(python): Add missing Python dependencies for Python binding --- bindings/python/requirements.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bindings/python/requirements.txt b/bindings/python/requirements.txt index f7cbea49f..5c3b30c5f 100644 --- a/bindings/python/requirements.txt +++ b/bindings/python/requirements.txt @@ -7,3 +7,7 @@ ipython>=7.9.0 pandas>=1.3.5 networkx>=2.6.3 graphviz>=0.20.0 +setuptools +papermill +ipykernel +seaborn From 5a388f7a89f4aedf8d3eb14cddf59f3fa7e4a114 Mon Sep 17 00:00:00 2001 From: koniksedy Date: Sat, 16 Nov 2024 11:41:42 +0100 Subject: [PATCH 4/5] to_mata and to_dot with filename --- include/mata/nfa/nfa.hh | 41 +++++++++++++++++++++++++++++------------ src/nfa/nfa.cc | 19 ++++++++++++++++++- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/include/mata/nfa/nfa.hh b/include/mata/nfa/nfa.hh index 63e1dc426..7dfd47cde 100644 --- a/include/mata/nfa/nfa.hh +++ b/include/mata/nfa/nfa.hh @@ -173,7 +173,7 @@ public: BoolVector get_useful_states() const; /** - * @brief Structure for storing callback functions (event handlers) utilizing + * @brief Structure for storing callback functions (event handlers) utilizing * Tarjan's SCC discover algorithm. */ struct TarjanDiscoverCallback { @@ -189,7 +189,7 @@ public: /** * @brief Tarjan's SCC discover algorihm. - * + * * @param callback Callbacks class to instantiate callbacks for the Tarjan's algorithm. */ void tarjan_scc_discover(const TarjanDiscoverCallback& callback) const; @@ -218,9 +218,9 @@ public: /** * @brief Get some shortest accepting run from state @p q - * + * * Assumes that @p q is a state of this automaton and that there is some accepting run from @p q - * + * * @param distances_to_final Vector of the lengths of the shortest runs from states (can be computed using distances_to_final()) */ Run get_shortest_accepting_run_from_state(State q, const std::vector& distances_to_final) const; @@ -280,6 +280,12 @@ public: */ void print_to_dot(std::ostream &output) const; + /** + * @brief Prints the automaton to the file in DOT format + * @param filename Name of the file to print the automaton to + */ + void print_to_dot(const std::string& filename) const; + /** * @brief Prints the automaton in mata format * @@ -289,6 +295,7 @@ public: * TODO handle alphabet of the automaton, currently we print the exact value of the symbols */ std::string print_to_mata() const; + /** * @brief Prints the automaton to the output stream in mata format * @@ -298,12 +305,22 @@ public: */ void print_to_mata(std::ostream &output) const; + /** + * @brief Prints the automaton to the file in mata format + * + * If you need to parse the automaton again, use IntAlphabet in construct() + * + * TODO handle alphabet of the automaton, currently we print the exact value of the symbols + * @param filename Name of the file to print the automaton to + */ + void print_to_mata(const std::string& filename) const; + // TODO: Relict from VATA. What to do with inclusion/ universality/ this post function? Revise all of them. StateSet post(const StateSet& states, const Symbol& symbol) const; /** - * Check whether the language of NFA is empty. - * Currently calls is_lang_empty_scc if cex is null + * Check whether the language of NFA is empty. + * Currently calls is_lang_empty_scc if cex is null * @param[out] cex Counter-example path for a case the language is not empty. * @return True if the language is empty, false otherwise. */ @@ -311,7 +328,7 @@ public: /** * @brief Check if the language is empty using Tarjan's SCC discover algorithm. - * + * * @return Language empty <-> True */ bool is_lang_empty_scc() const; @@ -334,17 +351,17 @@ public: /** * @brief Is the automaton graph acyclic? Used for checking language finiteness. - * + * * @return true <-> Automaton graph is acyclic. */ bool is_acyclic() const; /** * @brief Is the automaton flat? - * - * Flat automaton is an NFA whose every SCC is a simple loop. Basically each state in an + * + * Flat automaton is an NFA whose every SCC is a simple loop. Basically each state in an * SCC has at most one successor within this SCC. - * + * * @return true <-> Automaton graph is flat. */ bool is_flat() const; @@ -374,7 +391,7 @@ public: /** * @brief Get the set of all words in the language of the automaton whose length is <= @p max_length - * + * * If you have an automaton with finite language (can be checked using @ref is_acyclic), * you can get all words by calling * get_words(aut.num_of_states()) diff --git a/src/nfa/nfa.cc b/src/nfa/nfa.cc index b544ed830..9b156f464 100644 --- a/src/nfa/nfa.cc +++ b/src/nfa/nfa.cc @@ -5,6 +5,7 @@ #include #include #include +#include // MATA headers #include "mata/utils/sparse-set.hh" @@ -404,7 +405,7 @@ bool Nfa::is_flat() const { mata::nfa::Nfa::TarjanDiscoverCallback callback {}; callback.scc_discover = [&](const std::vector& scc, const std::vector& tarjan_stack) -> bool { (void)tarjan_stack; - + for(const mata::nfa::State& st : scc) { bool one_input_visited = false; for (const mata::nfa::SymbolPost& sp : this->delta[st]) { @@ -459,6 +460,14 @@ void Nfa::print_to_dot(std::ostream &output) const { output << "}" << std::endl; } +void Nfa::print_to_dot(const std::string& filename) const { + std::ofstream output(filename); + if (!output) { + throw std::ios_base::failure("Failed to open file: " + filename); + } + print_to_dot(output); +} + std::string Nfa::print_to_mata() const { std::stringstream output; print_to_mata(output); @@ -492,6 +501,14 @@ void Nfa::print_to_mata(std::ostream &output) const { } } +void Nfa::print_to_mata(const std::string& filename) const { + std::ofstream output(filename); + if (!output) { + throw std::ios_base::failure("Failed to open file: " + filename); + } + print_to_mata(output); +} + Nfa Nfa::get_one_letter_aut(Symbol abstract_symbol) const { Nfa digraph{num_of_states(), StateSet(initial), StateSet(final) }; // Add directed transitions for digraph. From 0d3b26eb2438dd5431dfa3ca9c4741c5b7aafbc2 Mon Sep 17 00:00:00 2001 From: koniksedy Date: Sat, 16 Nov 2024 12:28:05 +0100 Subject: [PATCH 5/5] bug fix --- src/re2parser.cc | 2 +- tests/re2parser.cc | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/re2parser.cc b/src/re2parser.cc index 03e9f2554..bca480be1 100644 --- a/src/re2parser.cc +++ b/src/re2parser.cc @@ -202,7 +202,7 @@ namespace { symbols.push_back(symbol); // Foldcase causes RE2 to do a case-insensitive match, so transitions will be made for // both uppercase and lowercase symbols - if (inst->foldcase()) { + if (inst->foldcase() && symbol >= 'a' && symbol <= 'z') { symbols.push_back(symbol-ascii_shift_value); } } diff --git a/tests/re2parser.cc b/tests/re2parser.cc index bf7dc3a80..3aa91d14a 100644 --- a/tests/re2parser.cc +++ b/tests/re2parser.cc @@ -1273,6 +1273,24 @@ TEST_CASE("mata::Parser error") CHECK(!x.is_in_lang(Run{ Word{ 'a', 'a', 'a', 'a', 'a', 'a' }, {} })); } + SECTION("Regex from issue #456") { + Nfa x; + mata::parser::create_nfa(&x, "[\\x00-\\x5a\\x5c-\\x7F]"); + + Nfa y; + State initial_s = 0; + State final_s = 1; + y.initial.insert(initial_s); + y.final.insert(final_s); + for (Symbol c = 0; c <= 0x7F; c++) { + if (c == 0x5B) { + continue; + } + y.delta.add(initial_s, c, final_s); + } + CHECK(are_equivalent(x, y)); + } + SECTION("Another failing regex") { Nfa x; mata::parser::create_nfa(&x, "(cd(abcde)+)|(a(aaa)+|ccc+)");