diff --git a/src/alphabet.cc b/src/alphabet.cc index 5484942f7..6bdac841b 100644 --- a/src/alphabet.cc +++ b/src/alphabet.cc @@ -250,7 +250,7 @@ mata::Word mata::decode_word_utf8(const mata::Word& word) { assert(i + 2 < word.size()); decoded_word.push_back(((symbol & 0x0F) << 12) | ((word[i+1] & 0x3F) << 6) | (word[i+2] & 0x3F)); i += 2; - } else if ((symbol & 0xF8) == 0xF0) { + } else if ((symbol & 0xF8) == 0xF0 && symbol < 0xF5) { // U+010000 to U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx assert(i + 3 < word.size()); decoded_word.push_back(((symbol & 0x07) << 18) | ((word[i+1] & 0x3F) << 12) | ((word[i+2] & 0x3F) << 6) | (word[i+3] & 0x3F)); diff --git a/src/nfa/nfa.cc b/src/nfa/nfa.cc index a5535107e..1348d820f 100644 --- a/src/nfa/nfa.cc +++ b/src/nfa/nfa.cc @@ -654,6 +654,7 @@ Nfa Nfa::decode_utf8() const { BoolVector used(this->num_of_states(), false); std::stack worklist; + // Pushes a set of states to the worklist and marks them as used. auto push_state_set = [&](const StateSet& set) { for (State state: set) { if (used[state]) { @@ -664,51 +665,86 @@ Nfa Nfa::decode_utf8() const { } }; + // Adds a symbol_post to the state_post. + // If the transition sequence is deterministic, we can use emplace_back + // because symbols are discovered in ascending order. However, in cases + // of nondeterministic sequences, we must use insert to ensure proper ordering. + // For example, consider the sequences 0xC8 0x80 and 0xC8 0x88. + // Based solely on the first byte (0xC8), we cannot determine which sequence + // will result in the higher number. 
+ auto add_to_state_post = [&](StatePost &state_post, const SymbolPost &symbol_post, const bool is_nondet) { + if (is_nondet) { + state_post.insert(std::move(symbol_post)); + } else { + state_post.emplace_back(std::move(symbol_post)); + } + }; + // UTF-8 Byte Patterns: // U+0000 to U+007F : 0xxxxxxx // U+0080 to U+07FF : 110xxxxx 10xxxxxx // U+0800 to U+FFFF : 1110xxxx 10xxxxxx 10xxxxxx // U+010000 to U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + // NOTE: Due to the nature of RE2, the automaton language can contain unexpected (invalid) + // UTF-8 sequences, such as the overlong encoding 11000000 10000000 (which decodes to U+0000). Because of that, + // we need to check that the decoded symbol is within the range of code points valid for its sequence length. push_state_set(StateSet{this->initial}); while (!worklist.empty()) { State q1 = worklist.top(); + StatePost &q1_state_post = result.delta.mutable_state_post(q1); worklist.pop(); - // 1st Byte for (const SymbolPost &sp1: this->delta[q1]) { const Symbol s1 = sp1.symbol; if ((s1 & 0x80) == 0x00) { - result.delta.add(q1, s1, sp1.targets); + q1_state_post.emplace_back(SymbolPost{s1, sp1.targets});; push_state_set(sp1.targets); continue; } // 2nd Byte + const bool is_nondet1 = sp1.targets.size() > 1; for (const State q2: sp1.targets) { for (const SymbolPost &sp2: this->delta[q2]) { const Symbol s2 = sp2.symbol; if ((s1 & 0xE0) == 0xC0) { assert((s2 & 0xC0) == 0x80); - result.delta.add(q1, ((s1 & 0x1F) << 6) | (s2 & 0x3F), sp2.targets); + const Symbol symbol = ((s1 & 0x1F) << 6) | (s2 & 0x3F); + if (symbol < 0x80) { + continue; // Invalid UTF-8 sequence + } + assert(symbol <= 0x7FF); + add_to_state_post(q1_state_post, SymbolPost{symbol, sp2.targets}, is_nondet1); push_state_set(sp2.targets); continue; } // 3rd Byte + const bool is_nondet2 = is_nondet1 || sp2.targets.size() > 1; for (const State q3: sp2.targets) { for (const SymbolPost &sp3: this->delta[q3]) { const Symbol s3 = sp3.symbol; if ((s1 & 0xF0) == 0xE0) { assert((s3 & 0xC0) == 0x80); - result.delta.add(q1, ((s1 & 
0x0F) << 12) | ((s2 & 0x3F) << 6) | (s3 & 0x3F), sp3.targets); + const Symbol symbol = ((s1 & 0x0F) << 12) | ((s2 & 0x3F) << 6) | (s3 & 0x3F); + if (symbol < 0x800) { + continue; // Invalid UTF-8 sequence + } + assert(symbol <= 0xFFFF); + add_to_state_post(q1_state_post, SymbolPost{symbol, sp3.targets}, is_nondet2); push_state_set(sp3.targets); continue; } // 4th Byte + const bool is_nondet3 = is_nondet2 || sp3.targets.size() > 1; for (const State q4: sp3.targets) { for (const SymbolPost &sp4: this->delta[q4]) { const Symbol s4 = sp4.symbol; assert((s1 & 0xF8) == 0xF0); assert((s4 & 0xC0) == 0x80); - result.delta.add(q1, ((s1 & 0x07) << 18) | ((s2 & 0x3F) << 12) | ((s3 & 0x3F) << 6) | (s4 & 0x3F), sp4.targets); + const Symbol symbol = ((s1 & 0x07) << 18) | ((s2 & 0x3F) << 12) | ((s3 & 0x3F) << 6) | (s4 & 0x3F); + if (symbol < 0x10000 || symbol > 0x10FFFF) { + continue; // Invalid UTF-8 sequence + } + add_to_state_post(q1_state_post, SymbolPost{symbol, sp4.targets}, is_nondet3); push_state_set(sp4.targets); } } diff --git a/src/re2parser.cc b/src/re2parser.cc index 8b99abf32..6a39cb3b2 100644 --- a/src/re2parser.cc +++ b/src/re2parser.cc @@ -46,6 +46,7 @@ namespace { /** * Creates parsed regex (ie. Regexp*) from string regex_string * @param regex_string Regex to be parsed as a string + * @param encoding Encoding of the regex, default is Latin1 * @return Parsed regex as RE2 Regexp* */ re2::Regexp* parse_regex_string(const std::string& regex_string, const Encoding encoding = Encoding::Latin1) const { @@ -490,11 +491,12 @@ namespace { } /** - * The main method, it creates NFA from regex + * The main method, it creates NFA from regex. 
* @param pattern regex as string * @param use_epsilon whether to create NFA with epsilon transitions or not * @param epsilon_value value, that will represent epsilon on transitions * @param use_reduce if set to true the result is trimmed and reduced using simulation reduction + * @param encoding encoding of the regex, default is Latin1 * @return Nfa corresponding to pattern */ void mata::parser::create_nfa(nfa::Nfa* nfa, const std::string& pattern, bool use_epsilon, mata::Symbol epsilon_value, bool use_reduce, const Encoding encoding) { diff --git a/tests/re2parser.cc b/tests/re2parser.cc index 2452df766..198946387 100644 --- a/tests/re2parser.cc +++ b/tests/re2parser.cc @@ -1563,22 +1563,6 @@ TEST_CASE("mata::Parser UTF-8 encoding") CHECK(are_equivalent(aut, result)); } - SECTION("Regex range [x00-x900]") { - Nfa aut; - mata::parser::create_nfa(&aut, "[\\x{00}-\\x{900}]", false, 306, true, Encoding::UTF8); - aut = aut.decode_utf8(); - - Nfa result; - State initial_s = 0; - State final_s = 1; - result.initial.insert(initial_s); - result.final.insert(final_s); - for(Symbol c = 0; c <= 0x900; c++) { - result.delta.add(initial_s, c, final_s); - } - CHECK(are_equivalent(aut, result)); - } - SECTION("Regex (\\x{60}*\\x{80})|(\\x{900}*\\x{600})") { Nfa aut; mata::parser::create_nfa(&aut, "(\\x{60}*\\x{80})|(\\x{900}*\\x{600})", false, 306, true, Encoding::UTF8); @@ -1596,20 +1580,21 @@ TEST_CASE("mata::Parser UTF-8 encoding") CHECK(are_equivalent(aut, result)); } - // SECTION("Regex .*") { - // Nfa aut; - // mata::parser::create_nfa(&aut, ".*", false, 306, true, Encoding::UTF8); - // aut = aut.decode_utf8(); - - // Nfa result; - // State initial_s = 0; - // State final_s = 1; - // result.initial.insert(initial_s); - // result.final.insert(final_s); - // for(Symbol c = 0; c <= 0x10FFFF; c++) { - // result.delta.add(initial_s, c, final_s); - // } - // CHECK(are_equivalent(aut, result)); - // } + // A proper test, but takes about 2 seconds to run. 
+ SECTION("Regex [\\x{00}-\\x{10FFFF}]") { + Nfa aut; + mata::parser::create_nfa(&aut, "[\\x{00}-\\x{10FFFF}]", false, 306, true, Encoding::UTF8); + aut = aut.decode_utf8(); + + // Random symbols + std::vector symbols = { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x7f, 0x80, + 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0, 0xff, 0x100, 0x110, 0x36f0, 0x57fc, 0x6177, 0x7498, + 0x8f3f, 0x9fc8, 0x1101e, 0x14348, 0x14e34, 0x19581, 0x1c48e, 0x1f1cc, 0x1f91d, 0x222a6, 0x22e11, + 0xe54f5, 0xe7934, 0xe93a4, 0xe998d, 0xebee8, 0xedb9e, 0xef98b, 0xf12af, 0xf51e2, 0xf557f, 0xf6b08, + 0xfa7f0, 0xfacb2, 0xfd719, 0x106d12, 0x106d66, 0x109220, 0x10a608, 0x10c1f5, 0x10FFFF }; + for(const Symbol c : symbols) { + CHECK(aut.is_in_lang(Run{Word{c}, {}})); + } + } } // }}}