tested

VeriFIT · Nov 15, 2024 · 170186f · 170186f
1 parent de3ff23
commit 170186f
Show file tree

Hide file tree

Showing 4 changed files with 61 additions and 38 deletions.
diff --git a/src/alphabet.cc b/src/alphabet.cc
@@ -250,7 +250,7 @@ mata::Word mata::decode_word_utf8(const mata::Word& word) {
             assert(i + 2 < word.size());
             decoded_word.push_back(((symbol & 0x0F) << 12) | ((word[i+1] & 0x3F) << 6) | (word[i+2] & 0x3F));
             i += 2;
-        } else if ((symbol & 0xF8) == 0xF0) {
+        } else if ((symbol & 0xF8) == 0xF0 && symbol < 0xF5) {
             // U+010000 to U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
             assert(i + 3 < word.size());
             decoded_word.push_back(((symbol & 0x07) << 18) | ((word[i+1] & 0x3F) << 12) | ((word[i+2] & 0x3F) << 6) | (word[i+3] & 0x3F));

diff --git a/src/nfa/nfa.cc b/src/nfa/nfa.cc
@@ -654,6 +654,7 @@ Nfa Nfa::decode_utf8() const {
     BoolVector used(this->num_of_states(), false);
     std::stack<State> worklist;
 
+    // Pushes a set of states to the worklist and marks them as used.
     auto push_state_set = [&](const StateSet& set) {
         for (State state: set) {
             if (used[state]) {
@@ -664,51 +665,86 @@ Nfa Nfa::decode_utf8() const {
         }
     };
 
+    // Adds a symbol_post to the state_post.
+    // If the transition sequence is deterministic, we can use emplace_back
+    // because symbols are discovered in ascending order. However, in cases
+    // of nondeterministic sequences, we must use insert to ensure proper ordering.
+    // For example, consider the sequences 0xC8 0x80 and 0xC8 0x88.
+    // Based solely on the first byte (0xC8), we cannot determine which sequence
+    // will result in the higher number.
+    // The symbol_post is taken by value (callers pass temporaries) so that the
+    // std::move below is a real move; moving from a const reference would
+    // silently fall back to a copy.
+    auto add_to_state_post = [&](StatePost &state_post, SymbolPost symbol_post, const bool is_nondet) {
+        if (is_nondet) {
+            state_post.insert(std::move(symbol_post));
+        } else {
+            state_post.emplace_back(std::move(symbol_post));
+        }
+    };
+
     // UTF-8 Byte Patterns:
     // U+0000   to U+007F  : 0xxxxxxx
     // U+0080   to U+07FF  : 110xxxxx 10xxxxxx
     // U+0800   to U+FFFF  : 1110xxxx 10xxxxxx 10xxxxxx
     // U+010000 to U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    // NOTE: Due to the nature of RE2, the automaton language can contain unexpected (invalid)
+    //       UTF-8 sequences, such as the overlong 11000000 10000000 (decodes to U+0000). Because of that,
+    //       we need to check that the decoded symbol is within the range valid for its sequence length.
     push_state_set(StateSet{this->initial});
     while (!worklist.empty()) {
         State q1 = worklist.top();
+        StatePost &q1_state_post = result.delta.mutable_state_post(q1);
         worklist.pop();
-
         // 1st Byte
         for (const SymbolPost &sp1: this->delta[q1]) {
             const Symbol s1 = sp1.symbol;
             if ((s1 & 0x80) == 0x00) {
-                result.delta.add(q1, s1, sp1.targets);
+                q1_state_post.emplace_back(SymbolPost{s1, sp1.targets});;
                 push_state_set(sp1.targets);
                 continue;
             }
             // 2nd Byte
+            const bool is_nondet1 = sp1.targets.size() > 1;
             for (const State q2: sp1.targets) {
                 for (const SymbolPost &sp2: this->delta[q2]) {
                     const Symbol s2 = sp2.symbol;
                     if ((s1 & 0xE0) == 0xC0) {
                         assert((s2 & 0xC0) == 0x80);
-                        result.delta.add(q1, ((s1 & 0x1F) << 6) | (s2 & 0x3F), sp2.targets);
+                        const Symbol symbol = ((s1 & 0x1F) << 6) | (s2 & 0x3F);
+                        if (symbol < 0x80) {
+                            continue;   // Invalid UTF-8 sequence
+                        }
+                        assert(symbol <= 0x7FF);
+                        add_to_state_post(q1_state_post, SymbolPost{symbol, sp2.targets}, is_nondet1);
                         push_state_set(sp2.targets);
                         continue;
                     }
                     // 3rd Byte
+                    const bool is_nondet2 = is_nondet1 || sp2.targets.size() > 1;
                     for (const State q3: sp2.targets) {
                         for (const SymbolPost &sp3: this->delta[q3]) {
                             const Symbol s3 = sp3.symbol;
                             if ((s1 & 0xF0) == 0xE0) {
                                 assert((s3 & 0xC0) == 0x80);
-                                result.delta.add(q1, ((s1 & 0x0F) << 12) | ((s2 & 0x3F) << 6) | (s3 & 0x3F), sp3.targets);
+                                const Symbol symbol = ((s1 & 0x0F) << 12) | ((s2 & 0x3F) << 6) | (s3 & 0x3F);
+                                if (symbol < 0x800) {
+                                    continue;   // Invalid UTF-8 sequence
+                                }
+                                assert(symbol <= 0xFFFF);
+                                add_to_state_post(q1_state_post, SymbolPost{symbol, sp3.targets}, is_nondet2);
                                 push_state_set(sp3.targets);
                                 continue;
                             }
                             // 4th Byte
+                            const bool is_nondet3 = is_nondet2 || sp3.targets.size() > 1;
                             for (const State q4: sp3.targets) {
                                 for (const SymbolPost &sp4: this->delta[q4]) {
                                     const Symbol s4 = sp4.symbol;
                                     assert((s1 & 0xF8) == 0xF0);
                                     assert((s4 & 0xC0) == 0x80);
-                                    result.delta.add(q1, ((s1 & 0x07) << 18) | ((s2 & 0x3F) << 12) | ((s3 & 0x3F) << 6) | (s4 & 0x3F), sp4.targets);
+                                    const Symbol symbol = ((s1 & 0x07) << 18) | ((s2 & 0x3F) << 12) | ((s3 & 0x3F) << 6) | (s4 & 0x3F);
+                                    if (symbol < 0x10000 || symbol > 0x10FFFF) {
+                                        continue;   // Invalid UTF-8 sequence
+                                    }
+                                    add_to_state_post(q1_state_post, SymbolPost{symbol, sp4.targets}, is_nondet3);
                                     push_state_set(sp4.targets);
                                 }
                             }

diff --git a/src/re2parser.cc b/src/re2parser.cc
@@ -46,6 +46,7 @@ namespace {
         /**
          * Creates parsed regex (ie. Regexp*) from string regex_string
          * @param regex_string Regex to be parsed as a string
+         * @param encoding Encoding of the regex, default is Latin1
          * @return Parsed regex as RE2 Regexp*
          */
         re2::Regexp* parse_regex_string(const std::string& regex_string, const Encoding encoding = Encoding::Latin1) const {
@@ -490,11 +491,12 @@ namespace {
 }
 
  /**
- * The main method, it creates NFA from regex
+ * The main method, it creates NFA from regex.
  * @param nfa output parameter; the NFA created for pattern is written to it
  * @param pattern regex as string
  * @param use_epsilon whether to create NFA with epsilon transitions or not
  * @param epsilon_value value, that will represent epsilon on transitions
  * @param use_reduce if set to true the result is trimmed and reduced using simulation reduction
+ * @param encoding encoding of the regex, default is Latin1
  * @return nothing; the NFA corresponding to pattern is stored in nfa
  */
 void mata::parser::create_nfa(nfa::Nfa* nfa, const std::string& pattern, bool use_epsilon, mata::Symbol epsilon_value, bool use_reduce, const Encoding encoding) {

diff --git a/tests/re2parser.cc b/tests/re2parser.cc
@@ -1563,22 +1563,6 @@ TEST_CASE("mata::Parser UTF-8 encoding")
         CHECK(are_equivalent(aut, result));
     }
 
-    SECTION("Regex range [x00-x900]") {
-        Nfa aut;
-        mata::parser::create_nfa(&aut, "[\\x{00}-\\x{900}]", false, 306, true, Encoding::UTF8);
-        aut = aut.decode_utf8();
-
-        Nfa result;
-        State initial_s = 0;
-        State final_s = 1;
-        result.initial.insert(initial_s);
-        result.final.insert(final_s);
-        for(Symbol c = 0; c <= 0x900; c++) {
-            result.delta.add(initial_s, c, final_s);
-        }
-        CHECK(are_equivalent(aut, result));
-    }
-
     SECTION("Regex (\\x{60}*\\x{80})|(\\x{900}*\\x{600})") {
         Nfa aut;
         mata::parser::create_nfa(&aut, "(\\x{60}*\\x{80})|(\\x{900}*\\x{600})", false, 306, true, Encoding::UTF8);
@@ -1596,20 +1580,21 @@ TEST_CASE("mata::Parser UTF-8 encoding")
         CHECK(are_equivalent(aut, result));
     }
 
-    // SECTION("Regex .*") {
-    //     Nfa aut;
-    //     mata::parser::create_nfa(&aut, ".*", false, 306, true, Encoding::UTF8);
-    //     aut = aut.decode_utf8();
-
-    //     Nfa result;
-    //     State initial_s = 0;
-    //     State final_s = 1;
-    //     result.initial.insert(initial_s);
-    //     result.final.insert(final_s);
-    //     for(Symbol c = 0; c <= 0x10FFFF; c++) {
-    //         result.delta.add(initial_s, c, final_s);
-    //     }
-    //     CHECK(are_equivalent(aut, result));
-    // }
+    // A proper test, but takes about 2 seconds to run.
+    SECTION("Regex [\\x{00}-\\x{10FFFF}]") {
+        Nfa aut;
+        mata::parser::create_nfa(&aut, "[\\x{00}-\\x{10FFFF}]", false, 306, true, Encoding::UTF8);
+        aut = aut.decode_utf8();
+
+        // Random symbols
+        std::vector<Symbol> symbols = { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x7f, 0x80,
+        0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0, 0xff, 0x100, 0x110, 0x36f0, 0x57fc, 0x6177, 0x7498,
+        0x8f3f, 0x9fc8, 0x1101e, 0x14348, 0x14e34, 0x19581, 0x1c48e, 0x1f1cc, 0x1f91d, 0x222a6, 0x22e11,
+        0xe54f5, 0xe7934, 0xe93a4, 0xe998d, 0xebee8, 0xedb9e, 0xef98b, 0xf12af, 0xf51e2, 0xf557f, 0xf6b08,
+        0xfa7f0, 0xfacb2, 0xfd719, 0x106d12, 0x106d66, 0x109220, 0x10a608, 0x10c1f5, 0x10FFFF };
+        for(const Symbol c : symbols) {
+            CHECK(aut.is_in_lang(Run{Word{c}, {}}));
+        }
+    }
 
 } // }}}