diff --git a/bindings/python/requirements.txt b/bindings/python/requirements.txt index f7cbea49f..5c3b30c5f 100644 --- a/bindings/python/requirements.txt +++ b/bindings/python/requirements.txt @@ -7,3 +7,7 @@ ipython>=7.9.0 pandas>=1.3.5 networkx>=2.6.3 graphviz>=0.20.0 +setuptools +papermill +ipykernel +seaborn diff --git a/include/mata/nfa/nfa.hh b/include/mata/nfa/nfa.hh index f232f3ab9..d6f360e1d 100644 --- a/include/mata/nfa/nfa.hh +++ b/include/mata/nfa/nfa.hh @@ -288,6 +288,12 @@ public: */ void print_to_dot(std::ostream &output) const; + /** + * @brief Prints the automaton to the file in DOT format + * @param filename Name of the file to print the automaton to + */ + void print_to_dot(const std::string& filename) const; + /** * @brief Prints the automaton in mata format * @@ -297,6 +303,7 @@ public: * TODO handle alphabet of the automaton, currently we print the exact value of the symbols */ std::string print_to_mata() const; + /** * @brief Prints the automaton to the output stream in mata format * @@ -306,6 +313,16 @@ public: */ void print_to_mata(std::ostream &output) const; + /** + * @brief Prints the automaton to the file in mata format + * + * If you need to parse the automaton again, use IntAlphabet in construct() + * + * TODO handle alphabet of the automaton, currently we print the exact value of the symbols + * @param filename Name of the file to print the automaton to + */ + void print_to_mata(const std::string& filename) const; + // TODO: Relict from VATA. What to do with inclusion/ universality/ this post function? Revise all of them. 
StateSet post(const StateSet& states, const Symbol& symbol) const; diff --git a/src/nfa/nfa.cc b/src/nfa/nfa.cc index 1348d820f..332b5f953 100644 --- a/src/nfa/nfa.cc +++ b/src/nfa/nfa.cc @@ -5,6 +5,7 @@ #include #include #include +#include // MATA headers #include "mata/utils/sparse-set.hh" @@ -459,6 +460,14 @@ void Nfa::print_to_dot(std::ostream &output) const { output << "}" << std::endl; } +void Nfa::print_to_dot(const std::string& filename) const { + std::ofstream output(filename); + if (!output) { + throw std::ios_base::failure("Failed to open file: " + filename); + } + print_to_dot(output); +} + std::string Nfa::print_to_mata() const { std::stringstream output; print_to_mata(output); @@ -492,6 +501,14 @@ void Nfa::print_to_mata(std::ostream &output) const { } } +void Nfa::print_to_mata(const std::string& filename) const { + std::ofstream output(filename); + if (!output) { + throw std::ios_base::failure("Failed to open file: " + filename); + } + print_to_mata(output); +} + Nfa Nfa::get_one_letter_aut(Symbol abstract_symbol) const { Nfa digraph{num_of_states(), StateSet(initial), StateSet(final) }; // Add directed transitions for digraph. diff --git a/src/re2parser.cc b/src/re2parser.cc index 6a39cb3b2..fe7a246bf 100644 --- a/src/re2parser.cc +++ b/src/re2parser.cc @@ -137,11 +137,14 @@ namespace { this->outgoingEdges = std::vector>> (prog_size); // We traverse all the states and create corresponding states and edges in Nfa - for (mata::nfa::State current_state = start_state; current_state < prog_size; current_state++) { - re2::Prog::Inst *inst = prog->inst(static_cast(current_state)); + for (State current_state = start_state, re2_state = start_state; re2_state < prog_size; ++ + re2_state) { + /// Whether to increment the current state @c current_state when the @c re2_state increments. 
+ bool increment_current_state{true}; + re2::Prog::Inst *inst = prog->inst(static_cast(re2_state)); // Every type of state can be final (due to epsilon transition), so we check it regardless of its type - if (this->state_cache.is_final_state[current_state]) { - this->make_state_final(current_state, explicit_nfa); + if (this->state_cache.is_final_state[re2_state]) { + this->make_state_final(re2_state, explicit_nfa); } switch (inst->opcode()) { default: @@ -165,33 +168,35 @@ namespace { empty_flag = static_cast(inst->empty()); // ^ - beginning of line if (empty_flag & re2::kEmptyBeginLine) { - // TODO Symbol? - symbols.push_back(300); + increment_current_state = false; } // $ - end of line if (empty_flag & re2::kEmptyEndLine) { - // TODO Symbol? - symbols.push_back(10); + // TODO How to handle? + // symbols.push_back(301); + increment_current_state = false; } // \A - beginning of text if (empty_flag & re2::kEmptyBeginText) { - // TODO Symbol? - symbols.push_back(301); + increment_current_state = false; } // \z - end of text if (empty_flag & re2::kEmptyEndText) { - // TODO Symbol? - symbols.push_back(302); + // TODO How to handle? + // symbols.push_back(302); + increment_current_state = false; } // \b - word boundary if (empty_flag & re2::kEmptyWordBoundary) { - // TODO Symbol? - symbols.push_back(303); + // TODO How to handle? + // symbols.push_back(303); + increment_current_state = false; } // \B - not \b if (empty_flag & re2::kEmptyNonWordBoundary) { - // TODO Symbol? - symbols.push_back(304); + // TODO How to handle? 
+ // symbols.push_back(304); + increment_current_state = false; } break; // kInstByteRange represents states with a "byte range" on the outgoing transition(s) @@ -203,7 +208,7 @@ namespace { symbols.push_back(symbol); // Foldcase causes RE2 to do a case-insensitive match, so transitions will be made for // both uppercase and lowercase symbols - if (inst->foldcase()) { + if (inst->foldcase() && symbol >= 'a' && symbol <= 'z') { symbols.push_back(symbol-ascii_shift_value); } } @@ -213,15 +218,17 @@ namespace { if (!use_epsilon) { // There is an epsilon transition to the currentState+1 we will need to copy transitions of // the currentState+1 to the currentState. - if (!this->state_cache.is_last[current_state]) { - for (auto state: this->state_cache.state_mapping[current_state + 1]) { - copyEdgesFromTo.emplace_back(state, current_state); + if (!this->state_cache.is_last[re2_state]) { + for (auto state: this->state_cache.state_mapping[re2_state + 1]) { + copyEdgesFromTo.emplace_back(state, re2_state); } } } symbols.clear(); break; } + + if (increment_current_state) { ++current_state; } } if (!use_epsilon) { // We will traverse the vector in reversed order. Like that, we will also handle chains of epsilon transitions @@ -420,7 +427,8 @@ namespace { if (inst->last()) { this->state_cache.is_last[state] = true; } - if (inst->opcode() == re2::kInstMatch) { + if (inst->opcode() == re2::kInstMatch || + (inst->opcode() == re2::kInstEmptyWidth && inst->empty() & re2::kEmptyEndText)) { this->state_cache.is_final_state[state] = true; } } @@ -507,6 +515,8 @@ void mata::parser::create_nfa(nfa::Nfa* nfa, const std::string& pattern, bool us RegexParser regexParser{}; auto parsed_regex = regexParser.parse_regex_string(pattern, encoding); auto program = parsed_regex->CompileToProg(regexParser.options.max_mem() * 2 / 3); + // FIXME: use_epsilon = false completely breaks the method convert_pro_to_nfa(). 
It must be fixed before the + // argument use_epsilon can be passed through to convert_pro_to_nfa(). regexParser.convert_pro_to_nfa(nfa, program, true, epsilon_value); delete program; // Decrements reference count and deletes object if the count reaches 0 diff --git a/tests/re2parser.cc b/tests/re2parser.cc index 198946387..9c470cf6d 100644 --- a/tests/re2parser.cc +++ b/tests/re2parser.cc @@ -1,8 +1,10 @@ #include #include -#include "mata/nfa/nfa.hh" #include "mata/parser/re2parser.hh" +#include "mata/nfa/builder.hh" +#include "mata/nfa/nfa.hh" + using namespace mata::nfa; using Symbol = mata::Symbol; @@ -1273,6 +1275,24 @@ TEST_CASE("mata::Parser error") CHECK(!x.is_in_lang(Run{ Word{ 'a', 'a', 'a', 'a', 'a', 'a' }, {} })); } + SECTION("Regex from issue #456") { + Nfa x; + mata::parser::create_nfa(&x, "[\\x00-\\x5a\\x5c-\\x7F]"); + + Nfa y; + State initial_s = 0; + State final_s = 1; + y.initial.insert(initial_s); + y.final.insert(final_s); + for (Symbol c = 0; c <= 0x7F; c++) { + if (c == 0x5B) { + continue; + } + y.delta.add(initial_s, c, final_s); + } + CHECK(are_equivalent(x, y)); + } + SECTION("Another failing regex") { Nfa x; mata::parser::create_nfa(&x, "(cd(abcde)+)|(a(aaa)+|ccc+)"); @@ -1598,3 +1618,153 @@ TEST_CASE("mata::Parser UTF-8 encoding") } } // }}} + +TEST_CASE("mata::parser Parsing regexes with ^ and $") { + Nfa nfa; + Nfa expected{}; + + SECTION("Handling of '\\'") { + mata::parser::create_nfa(&nfa, "a\\\\b"); + expected = mata::nfa::builder::parse_from_mata( + std::string{ R"( + @NFA-explicit + %Alphabet-auto + %Initial q0 + %Final q3 + q0 97 q1 + q1 92 q2 + q2 98 q3)" + }); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("a|b$, a simple OR example with end marker") { + mata::parser::create_nfa(&nfa, "a|b$"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(0, 'b', 1); + expected.final.insert(1); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("^a|b, a simple OR example with 
begin marker") { + mata::parser::create_nfa(&nfa, "^a|b"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(0, 'b', 1); + expected.final.insert(1); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("^a|b$, a simple OR example with begin and end marker") { + mata::parser::create_nfa(&nfa, "^a|b$"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(0, 'b', 1); + expected.final.insert(1); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("^(a|b)$, a simple OR example with begin and end marker around capture group") { + mata::parser::create_nfa(&nfa, "^(a|b)$"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(0, 'b', 1); + expected.final.insert(1); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("a$|b, a simple OR example with end marker on the left side") { + mata::parser::create_nfa(&nfa, "a$|b"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(0, 'b', 1); + expected.final.insert(1); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("^a$|^b$, a simple OR example with multiple begin and end markers") { + mata::parser::create_nfa(&nfa, "^a$|^b$"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(0, 'b', 1); + expected.final.insert(1); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("aed|(bab)$, a simple OR example with trailing end marker") { + mata::parser::create_nfa(&nfa, "aed|(bab)$"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(1, 'e', 2); + expected.delta.add(2, 'd', 3); + expected.delta.add(0, 'b', 4); + expected.delta.add(4, 'a', 5); + expected.delta.add(5, 'b', 3); + expected.final.insert(3); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("aed|bab$, a simple OR example with trailing end marker") { + mata::parser::create_nfa(&nfa, 
"aed|bab$"); + expected.initial.insert(0); + expected.delta.add(0, 'a', 1); + expected.delta.add(1, 'e', 2); + expected.delta.add(2, 'd', 3); + expected.delta.add(0, 'b', 4); + expected.delta.add(4, 'a', 5); + expected.delta.add(5, 'b', 3); + expected.final.insert(3); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } + + SECTION("^systempath\\=https|ftp$ correct parentheses") { + mata::parser::create_nfa(&nfa, "^[sS][yY][sS][tT][eE][mM][pP][aA][tT][hH]\\\\=(([hH][tT]{2}[pP][sS]?)|([fF][tT][pP]))$"); + expected = mata::nfa::builder::parse_from_mata(std::string{ R"( + @NFA-explicit + %Alphabet-auto + %Initial q0 + %Final q16 q17 + q0 83 q1 + q0 115 q1 + q1 89 q2 + q1 121 q2 + q2 83 q3 + q2 115 q3 + q3 84 q4 + q3 116 q4 + q4 69 q5 + q4 101 q5 + q5 77 q6 + q5 109 q6 + q6 80 q7 + q6 112 q7 + q7 65 q8 + q7 97 q8 + q8 84 q9 + q8 116 q9 + q9 72 q10 + q9 104 q10 + q10 92 q11 + q11 61 q12 + q12 70 q18 + q12 72 q13 + q12 102 q18 + q12 104 q13 + q13 84 q14 + q13 116 q14 + q14 84 q15 + q14 116 q15 + q15 80 q16 + q15 112 q16 + q16 83 q17 + q16 115 q17 + q18 84 q19 + q18 116 q19 + q19 80 q17 + q19 112 q17 + )" + }); + CHECK(mata::nfa::are_equivalent(nfa, expected)); + } +}