From 0d12da52f69a9b9d686ed8566e3f9ad7178dc1ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?David=20Chocholat=C3=BD?= <chocholaty.david@protonmail.com>
Date: Thu, 14 Nov 2024 09:49:30 +0100
Subject: [PATCH 1/5] feat(re2): Add tests for ^ and $ regex symbols

---
 tests/re2parser.cc | 154 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 153 insertions(+), 1 deletion(-)

diff --git a/tests/re2parser.cc b/tests/re2parser.cc
index bf7dc3a80..f5bcfa20b 100644
--- a/tests/re2parser.cc
+++ b/tests/re2parser.cc
@@ -1,8 +1,10 @@
 #include <catch2/catch_test_macros.hpp>
 #include <catch2/matchers/catch_matchers_string.hpp>
 
-#include "mata/nfa/nfa.hh"
 #include "mata/parser/re2parser.hh"
+#include "mata/nfa/builder.hh"
+#include "mata/nfa/nfa.hh"
+
 using namespace mata::nfa;
 
 using Symbol = mata::Symbol;
@@ -1315,3 +1317,153 @@ TEST_CASE("mata::Parser bug epsilon")
         CHECK(x.is_in_lang(Run{Word{'a', 'a', 'a', 'a'}, {}}));
     }
 } // }}}
+
+TEST_CASE("mata::parser Parsing regexes with ^ and $") { // Compares NFAs parsed from regexes with hand-built ones.
+    Nfa nfa; // Filled by mata::parser::create_nfa() in each section.
+    Nfa expected{}; // Reference automaton the parsed NFA is checked against with are_equivalent().
+
+    SECTION("Handling of '\\'") {
+        mata::parser::create_nfa(&nfa, "a\\\\b"); // The regex text is a\\b: 'a', one escaped backslash, then 'b'.
+        expected = mata::nfa::builder::parse_from_mata(
+        std::string{ R"(
+            @NFA-explicit
+            %Alphabet-auto
+            %Initial q0
+            %Final q3
+            q0 97 q1
+            q1 92 q2
+            q2 98 q3)"
+        }); // Symbols above are character codes: 97 is 'a', 92 is the backslash, 98 is 'b'.
+        CHECK(mata::nfa::are_equivalent(nfa, expected));
+    }
+
+    SECTION("a|b$, a simple OR example with end marker") {
+        mata::parser::create_nfa(&nfa, "a|b$");
+        expected.initial.insert(0); // Reference accepts exactly "a" and "b"; it has no transition for the marker.
+        expected.delta.add(0, 'a', 1);
+        expected.delta.add(0, 'b', 1);
+        expected.final.insert(1);
+        CHECK(mata::nfa::are_equivalent(nfa, expected));
+    }
+
+    SECTION("^a|b, a simple OR example with begin marker") {
+        mata::parser::create_nfa(&nfa, "^a|b");
+        expected.initial.insert(0); // Same two-state reference as in the "a|b$" section.
+        expected.delta.add(0, 'a', 1);
+        expected.delta.add(0, 'b', 1);
+        expected.final.insert(1);
+        CHECK(mata::nfa::are_equivalent(nfa, expected));
+    }
+
+    SECTION("^a|b$, a simple OR example with begin and end marker") {
+        mata::parser::create_nfa(&nfa, "^a|b$");
+        expected.initial.insert(0); // Same two-state reference as in the "a|b$" section.
+        expected.delta.add(0, 'a', 1);
+        expected.delta.add(0, 'b', 1);
+        expected.final.insert(1);
+        CHECK(mata::nfa::are_equivalent(nfa, expected));
+    }
+
+    SECTION("^(a|b)$, a simple OR example with begin and end marker around capture group") {
+        mata::parser::create_nfa(&nfa, "^(a|b)$");
+        expected.initial.insert(0); // Same two-state reference as in the "a|b$" section.
+        expected.delta.add(0, 'a', 1);
+        expected.delta.add(0, 'b', 1);
+        expected.final.insert(1);
+        CHECK(mata::nfa::are_equivalent(nfa, expected));
+    }
+
+    SECTION("a$|b, a simple OR example with end marker on the left side") {
+        mata::parser::create_nfa(&nfa, "a$|b");
+        expected.initial.insert(0); // Same two-state reference as in the "a|b$" section.
+        expected.delta.add(0, 'a', 1);
+        expected.delta.add(0, 'b', 1);
+        expected.final.insert(1);
+        CHECK(mata::nfa::are_equivalent(nfa, expected));
+    }
+
+    SECTION("^a$|^b$, a simple OR example with multiple begin and end markers") {
+        mata::parser::create_nfa(&nfa, "^a$|^b$");
+        expected.initial.insert(0); // Same two-state reference as in the "a|b$" section.
+        expected.delta.add(0, 'a', 1);
+        expected.delta.add(0, 'b', 1);
+        expected.final.insert(1);
+        CHECK(mata::nfa::are_equivalent(nfa, expected));
+    }
+
+    SECTION("aed|(bab)$, a simple OR example with trailing end marker") {
+        mata::parser::create_nfa(&nfa, "aed|(bab)$");
+        expected.initial.insert(0); // Two branches, 0-a-1-e-2-d-3 and 0-b-4-a-5-b-3, share the final state 3.
+        expected.delta.add(0, 'a', 1);
+        expected.delta.add(1, 'e', 2);
+        expected.delta.add(2, 'd', 3);
+        expected.delta.add(0, 'b', 4);
+        expected.delta.add(4, 'a', 5);
+        expected.delta.add(5, 'b', 3);
+        expected.final.insert(3);
+        CHECK(mata::nfa::are_equivalent(nfa, expected));
+    }
+
+    SECTION("aed|bab$, a simple OR example with trailing end marker") {
+        mata::parser::create_nfa(&nfa, "aed|bab$");
+        expected.initial.insert(0); // Same reference as for "aed|(bab)$": only the capture group is dropped.
+        expected.delta.add(0, 'a', 1);
+        expected.delta.add(1, 'e', 2);
+        expected.delta.add(2, 'd', 3);
+        expected.delta.add(0, 'b', 4);
+        expected.delta.add(4, 'a', 5);
+        expected.delta.add(5, 'b', 3);
+        expected.final.insert(3);
+        CHECK(mata::nfa::are_equivalent(nfa, expected));
+    }
+
+    SECTION("^systempath\\=https|ftp$ correct parentheses") { // Mixed-case "systempath", '\', '=', then http(s) or ftp.
+        mata::parser::create_nfa(&nfa, "^[sS][yY][sS][tT][eE][mM][pP][aA][tT][hH]\\\\=(([hH][tT]{2}[pP][sS]?)|([fF][tT][pP]))$");
+        expected = mata::nfa::builder::parse_from_mata(std::string{ R"(
+            @NFA-explicit
+            %Alphabet-auto
+            %Initial q0
+            %Final q16 q17
+            q0 83 q1
+            q0 115 q1
+            q1 89 q2
+            q1 121 q2
+            q2 83 q3
+            q2 115 q3
+            q3 84 q4
+            q3 116 q4
+            q4 69 q5
+            q4 101 q5
+            q5 77 q6
+            q5 109 q6
+            q6 80 q7
+            q6 112 q7
+            q7 65 q8
+            q7 97 q8
+            q8 84 q9
+            q8 116 q9
+            q9 72 q10
+            q9 104 q10
+            q10 92 q11
+            q11 61 q12
+            q12 70 q18
+            q12 72 q13
+            q12 102 q18
+            q12 104 q13
+            q13 84 q14
+            q13 116 q14
+            q14 84 q15
+            q14 116 q15
+            q15 80 q16
+            q15 112 q16
+            q16 83 q17
+            q16 115 q17
+            q18 84 q19
+            q18 116 q19
+            q19 80 q17
+            q19 112 q17
+            )"
+        }); // q16 is final after "http" and q17 after "https" or "ftp"; each letter has an upper- and lowercase edge.
+        CHECK(mata::nfa::are_equivalent(nfa, expected));
+    }
+}

From 459c6cd524c8bd77adf9d1e233af5e2a22836725 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?David=20Chocholat=C3=BD?= <chocholaty.david@protonmail.com>
Date: Thu, 14 Nov 2024 10:01:14 +0100
Subject: [PATCH 2/5] fix(re2): Fix creating NFAs from regexes with ^ and $
 regex symbols

---
 src/re2parser.cc | 50 +++++++++++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 20 deletions(-)

diff --git a/src/re2parser.cc b/src/re2parser.cc
index 03e9f2554..beeb1bea0 100644
--- a/src/re2parser.cc
+++ b/src/re2parser.cc
@@ -136,11 +136,14 @@ namespace {
             this->outgoingEdges = std::vector<std::vector<std::pair<mata::Symbol, mata::nfa::State>>> (prog_size);
 
             // We traverse all the states and create corresponding states and edges in Nfa
-            for (mata::nfa::State current_state = start_state; current_state < prog_size; current_state++) {
-                re2::Prog::Inst *inst = prog->inst(static_cast<int>(current_state));
+            for (State current_state = start_state, re2_state = start_state; re2_state < prog_size; ++
+                 re2_state) {
+                /// Whether to increment the current state @c current_state when the @c re2_state increments.
+                bool increment_current_state{true};
+                re2::Prog::Inst *inst = prog->inst(static_cast<int>(re2_state));
                 // Every type of state can be final (due to epsilon transition), so we check it regardless of its type
-                if (this->state_cache.is_final_state[current_state]) {
-                    this->make_state_final(current_state, explicit_nfa);
+                if (this->state_cache.is_final_state[re2_state]) {
+                    this->make_state_final(re2_state, explicit_nfa);
                 }
                 switch (inst->opcode()) {
                     default:
@@ -164,33 +167,35 @@ namespace {
                         empty_flag = static_cast<int>(inst->empty());
                         // ^ - beginning of line
                         if (empty_flag & re2::kEmptyBeginLine) {
-                            // TODO Symbol?
-                            symbols.push_back(300);
+                            increment_current_state = false;
                         }
                         // $ - end of line
                         if (empty_flag & re2::kEmptyEndLine) {
-                            // TODO Symbol?
-                            symbols.push_back(10);
+                            // TODO How to handle?
+                            // symbols.push_back(301);
+                            increment_current_state = false;
                         }
                         // \A - beginning of text
                         if (empty_flag & re2::kEmptyBeginText) {
-                            // TODO Symbol?
-                            symbols.push_back(301);
+                            increment_current_state = false;
                         }
                         // \z - end of text
                         if (empty_flag & re2::kEmptyEndText) {
-                            // TODO Symbol?
-                            symbols.push_back(302);
+                            // TODO How to handle?
+                            // symbols.push_back(302);
+                            increment_current_state = false;
                         }
                         // \b - word boundary
                         if (empty_flag & re2::kEmptyWordBoundary) {
-                            // TODO Symbol?
-                            symbols.push_back(303);
+                            // TODO How to handle?
+                            // symbols.push_back(303);
+                            increment_current_state = false;
                         }
                         // \B - not \b
                         if (empty_flag & re2::kEmptyNonWordBoundary) {
-                            // TODO Symbol?
-                            symbols.push_back(304);
+                            // TODO How to handle?
+                            // symbols.push_back(304);
+                            increment_current_state = false;
                         }
                         break;
                     // kInstByteRange represents states with a "byte range" on the outgoing transition(s)
@@ -212,15 +217,17 @@ namespace {
                         if (!use_epsilon) {
                             // There is an epsilon transition to the currentState+1 we will need to copy transitions of
                             // the currentState+1 to the currentState.
-                            if (!this->state_cache.is_last[current_state]) {
-                                for (auto state: this->state_cache.state_mapping[current_state + 1]) {
-                                    copyEdgesFromTo.emplace_back(state, current_state);
+                            if (!this->state_cache.is_last[re2_state]) {
+                                for (auto state: this->state_cache.state_mapping[re2_state + 1]) {
+                                    copyEdgesFromTo.emplace_back(state, re2_state);
                                 }
                             }
                         }
                         symbols.clear();
                         break;
                 }
+
+                if (increment_current_state) { ++current_state; }
             }
             if (!use_epsilon) {
                 // We will traverse the vector in reversed order. Like that, we will also handle chains of epsilon transitions
@@ -419,7 +426,8 @@ namespace {
                 if (inst->last()) {
                     this->state_cache.is_last[state] = true;
                 }
-                if (inst->opcode() == re2::kInstMatch) {
+                if (inst->opcode() == re2::kInstMatch ||
+                    (inst->opcode() == re2::kInstEmptyWidth && inst->empty() & re2::kEmptyEndText)) {
                     this->state_cache.is_final_state[state] = true;
                 }
             }
@@ -505,6 +513,8 @@ void mata::parser::create_nfa(nfa::Nfa* nfa, const std::string& pattern, bool us
     RegexParser regexParser{};
     auto parsed_regex = regexParser.parse_regex_string(pattern);
     auto program = parsed_regex->CompileToProg(regexParser.options.max_mem() * 2 / 3);
+    // FIXME: use_epsilon = false completely breaks the method convert_pro_to_nfa(). This needs fixing before the
+    //  caller's use_epsilon argument can be passed to convert_pro_to_nfa() instead of the hard-coded true.
     regexParser.convert_pro_to_nfa(nfa, program, true, epsilon_value);
     delete program;
     // Decrements reference count and deletes object if the count reaches 0

From 73f9e01d4acf5c3dcbb479251121375e72353f60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?David=20Chocholat=C3=BD?= <chocholaty.david@protonmail.com>
Date: Fri, 15 Nov 2024 10:16:20 +0100
Subject: [PATCH 3/5] fix(python): Add missing Python dependencies for Python
 binding

---
 bindings/python/requirements.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/bindings/python/requirements.txt b/bindings/python/requirements.txt
index f7cbea49f..5c3b30c5f 100644
--- a/bindings/python/requirements.txt
+++ b/bindings/python/requirements.txt
@@ -7,3 +7,7 @@ ipython>=7.9.0
 pandas>=1.3.5
 networkx>=2.6.3
 graphviz>=0.20.0
+setuptools
+papermill
+ipykernel
+seaborn

From 5a388f7a89f4aedf8d3eb14cddf59f3fa7e4a114 Mon Sep 17 00:00:00 2001
From: koniksedy <xsedym02@vutbr.cz>
Date: Sat, 16 Nov 2024 11:41:42 +0100
Subject: [PATCH 4/5] feat(nfa): Add print_to_dot/print_to_mata to a file

---
 include/mata/nfa/nfa.hh | 41 +++++++++++++++++++++++++++++------------
 src/nfa/nfa.cc          | 19 ++++++++++++++++++-
 2 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/include/mata/nfa/nfa.hh b/include/mata/nfa/nfa.hh
index 63e1dc426..7dfd47cde 100644
--- a/include/mata/nfa/nfa.hh
+++ b/include/mata/nfa/nfa.hh
@@ -173,7 +173,7 @@ public:
     BoolVector get_useful_states() const;
 
     /**
-     * @brief Structure for storing callback functions (event handlers) utilizing 
+     * @brief Structure for storing callback functions (event handlers) utilizing
      * Tarjan's SCC discover algorithm.
      */
     struct TarjanDiscoverCallback {
@@ -189,7 +189,7 @@ public:
 
     /**
      * @brief Tarjan's SCC discover algorihm.
-     * 
+     *
      * @param callback Callbacks class to instantiate callbacks for the Tarjan's algorithm.
      */
     void tarjan_scc_discover(const TarjanDiscoverCallback& callback) const;
@@ -218,9 +218,9 @@ public:
 
     /**
      * @brief Get some shortest accepting run from state @p q
-     * 
+     *
      * Assumes that @p q is a state of this automaton and that there is some accepting run from @p q
-     * 
+     *
      * @param distances_to_final Vector of the lengths of the shortest runs from states (can be computed using distances_to_final())
      */
     Run get_shortest_accepting_run_from_state(State q, const std::vector<State>& distances_to_final) const;
@@ -280,6 +280,12 @@ public:
      */
     void print_to_dot(std::ostream &output) const;
 
+    /**
+     * @brief Prints the automaton to the file in DOT format (throws std::ios_base::failure if it cannot be opened)
+     * @param filename Name of the file to print the automaton to; an existing file is overwritten
+     */
+    void print_to_dot(const std::string& filename) const;
+
     /**
      * @brief Prints the automaton in mata format
      *
@@ -289,6 +295,7 @@ public:
      * TODO handle alphabet of the automaton, currently we print the exact value of the symbols
      */
     std::string print_to_mata() const;
+
     /**
      * @brief Prints the automaton to the output stream in mata format
      *
@@ -298,12 +305,22 @@ public:
      */
     void print_to_mata(std::ostream &output) const;
 
+    /**
+     * @brief Prints the automaton to the file in mata format
+     *
+     * If you need to parse the automaton again, use IntAlphabet in construct()
+     * Throws std::ios_base::failure if the file cannot be opened.
+     * TODO handle alphabet of the automaton, currently we print the exact value of the symbols
+     * @param filename Name of the file to print the automaton to; an existing file is overwritten
+     */
+    void print_to_mata(const std::string& filename) const;
+
     // TODO: Relict from VATA. What to do with inclusion/ universality/ this post function? Revise all of them.
     StateSet post(const StateSet& states, const Symbol& symbol) const;
 
     /**
-     * Check whether the language of NFA is empty. 
-     * Currently calls is_lang_empty_scc if cex is null 
+     * Check whether the language of NFA is empty.
+     * Currently calls is_lang_empty_scc if cex is null
      * @param[out] cex Counter-example path for a case the language is not empty.
      * @return True if the language is empty, false otherwise.
      */
@@ -311,7 +328,7 @@ public:
 
     /**
      * @brief Check if the language is empty using Tarjan's SCC discover algorithm.
-     * 
+     *
      * @return Language empty <-> True
      */
     bool is_lang_empty_scc() const;
@@ -334,17 +351,17 @@ public:
 
     /**
      * @brief Is the automaton graph acyclic? Used for checking language finiteness.
-     * 
+     *
      * @return true <-> Automaton graph is acyclic.
      */
     bool is_acyclic() const;
 
     /**
      * @brief Is the automaton flat?
-     * 
-     * Flat automaton is an NFA whose every SCC is a simple loop. Basically each state in an 
+     *
+     * Flat automaton is an NFA whose every SCC is a simple loop. Basically each state in an
      * SCC has at most one successor within this SCC.
-     * 
+     *
      * @return true <-> Automaton graph is flat.
      */
     bool is_flat() const;
@@ -374,7 +391,7 @@ public:
 
     /**
      * @brief Get the set of all words in the language of the automaton whose length is <= @p max_length
-     * 
+     *
      * If you have an automaton with finite language (can be checked using @ref is_acyclic),
      * you can get all words by calling
      *      get_words(aut.num_of_states())
diff --git a/src/nfa/nfa.cc b/src/nfa/nfa.cc
index b544ed830..9b156f464 100644
--- a/src/nfa/nfa.cc
+++ b/src/nfa/nfa.cc
@@ -5,6 +5,7 @@
 #include <list>
 #include <optional>
 #include <iterator>
+#include <fstream>
 
 // MATA headers
 #include "mata/utils/sparse-set.hh"
@@ -404,7 +405,7 @@ bool Nfa::is_flat() const {
     mata::nfa::Nfa::TarjanDiscoverCallback callback {};
     callback.scc_discover = [&](const std::vector<mata::nfa::State>& scc, const std::vector<mata::nfa::State>& tarjan_stack) -> bool {
         (void)tarjan_stack;
-        
+
         for(const mata::nfa::State& st : scc) {
             bool one_input_visited = false;
             for (const mata::nfa::SymbolPost& sp : this->delta[st]) {
@@ -459,6 +460,14 @@ void Nfa::print_to_dot(std::ostream &output) const {
     output << "}" << std::endl;
 }
 
+void Nfa::print_to_dot(const std::string& filename) const {
+    std::ofstream output(filename);
+    if (!output) {
+        throw std::ios_base::failure("Failed to open file: " + filename);
+    }
+    print_to_dot(output);
+}
+
 std::string Nfa::print_to_mata() const {
     std::stringstream output;
     print_to_mata(output);
@@ -492,6 +501,14 @@ void Nfa::print_to_mata(std::ostream &output) const {
     }
 }
 
+// Prints mata format to @p filename; throws std::ios_base::failure on open failure or when the final flush fails.
+void Nfa::print_to_mata(const std::string& filename) const {
+    std::ofstream output(filename);
+    if (!output) { throw std::ios_base::failure("Failed to open file: " + filename); }
+    print_to_mata(output);
+    if (!output.flush()) { throw std::ios_base::failure("Failed to write file: " + filename); } // else lost in dtor
+}
+
 Nfa Nfa::get_one_letter_aut(Symbol abstract_symbol) const {
     Nfa digraph{num_of_states(), StateSet(initial), StateSet(final) };
     // Add directed transitions for digraph.

From 0d3b26eb2438dd5431dfa3ca9c4741c5b7aafbc2 Mon Sep 17 00:00:00 2001
From: koniksedy <xsedym02@vutbr.cz>
Date: Sat, 16 Nov 2024 12:28:05 +0100
Subject: [PATCH 5/5] fix(re2): Add foldcase variant only for letters a-z (#456)

---
 src/re2parser.cc   |  2 +-
 tests/re2parser.cc | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/re2parser.cc b/src/re2parser.cc
index 03e9f2554..bca480be1 100644
--- a/src/re2parser.cc
+++ b/src/re2parser.cc
@@ -202,7 +202,7 @@ namespace {
                                 symbols.push_back(symbol);
                                 // Foldcase causes RE2 to do a case-insensitive match, so transitions will be made for
                                 // both uppercase and lowercase symbols
-                                if (inst->foldcase()) {
+                                if (inst->foldcase() && symbol >= 'a' && symbol <= 'z') {
                                     symbols.push_back(symbol-ascii_shift_value);
                                 }
                             }
diff --git a/tests/re2parser.cc b/tests/re2parser.cc
index bf7dc3a80..3aa91d14a 100644
--- a/tests/re2parser.cc
+++ b/tests/re2parser.cc
@@ -1273,6 +1273,24 @@ TEST_CASE("mata::Parser error")
         CHECK(!x.is_in_lang(Run{ Word{ 'a', 'a', 'a', 'a', 'a', 'a' }, {} }));
     }
 
+    SECTION("Regex from issue #456") {
+        // The character class covers 0x00-0x7F with a single gap at 0x5B ('[').
+        Nfa parsed;
+        mata::parser::create_nfa(&parsed, "[\\x00-\\x5a\\x5c-\\x7F]");
+
+        // Reference automaton: one transition from state 0 to state 1 for every symbol of the class.
+        const State source = 0;
+        const State target = 1;
+        Nfa reference;
+        reference.initial.insert(source);
+        reference.final.insert(target);
+        for (Symbol symbol = 0; symbol <= 0x7F; ++symbol) {
+            if (symbol != 0x5B) { reference.delta.add(source, symbol, target); }
+        }
+
+        CHECK(are_equivalent(parsed, reference));
+    }
+
     SECTION("Another failing regex") {
         Nfa x;
         mata::parser::create_nfa(&x, "(cd(abcde)+)|(a(aaa)+|ccc+)");