diff --git a/src/alphabet.cc b/src/alphabet.cc index 1dc8482a6..5484942f7 100644 --- a/src/alphabet.cc +++ b/src/alphabet.cc @@ -209,15 +209,19 @@ mata::Word mata::encode_word_utf8(const mata::Word& word) { mata::Word utf8_encoded_word; for (const Symbol symbol: word) { if (symbol < 0x80) { + // U+0000 to U+007F : 0xxxxxxx utf8_encoded_word.push_back(symbol); } else if (symbol < 0x800) { + // U+0080 to U+07FF : 110xxxxx 10xxxxxx utf8_encoded_word.push_back(0xC0 | (symbol >> 6)); utf8_encoded_word.push_back(0x80 | (symbol & 0x3F)); } else if (symbol < 0x10000) { + // U+0800 to U+FFFF : 1110xxxx 10xxxxxx 10xxxxxx utf8_encoded_word.push_back(0xE0 | (symbol >> 12)); utf8_encoded_word.push_back(0x80 | ((symbol >> 6) & 0x3F)); utf8_encoded_word.push_back(0x80 | (symbol & 0x3F)); } else if (symbol < 0x110000) { + // U+010000 to U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx utf8_encoded_word.push_back(0xF0 | (symbol >> 18)); utf8_encoded_word.push_back(0x80 | ((symbol >> 12) & 0x3F)); utf8_encoded_word.push_back(0x80 | ((symbol >> 6) & 0x3F)); @@ -234,16 +238,20 @@ mata::Word mata::decode_word_utf8(const mata::Word& word) { for (size_t i = 0; i < word.size(); i++) { Symbol symbol = word[i]; if ((symbol & 0x80) == 0) { + // U+0000 to U+007F : 0xxxxxxx decoded_word.push_back(symbol); } else if ((symbol & 0xE0) == 0xC0) { + // U+0080 to U+07FF : 110xxxxx 10xxxxxx assert(i + 1 < word.size()); decoded_word.push_back(((symbol & 0x1F) << 6) | (word[i+1] & 0x3F)); i += 1; } else if ((symbol & 0xF0) == 0xE0) { + // U+0800 to U+FFFF : 1110xxxx 10xxxxxx 10xxxxxx assert(i + 2 < word.size()); decoded_word.push_back(((symbol & 0x0F) << 12) | ((word[i+1] & 0x3F) << 6) | (word[i+2] & 0x3F)); i += 2; } else if ((symbol & 0xF8) == 0xF0) { + // U+010000 to U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx assert(i + 3 < word.size()); decoded_word.push_back(((symbol & 0x07) << 18) | ((word[i+1] & 0x3F) << 12) | ((word[i+2] & 0x3F) << 6) | (word[i+3] & 0x3F)); i += 3; diff --git a/src/nfa/nfa.cc 
b/src/nfa/nfa.cc index b186bd83f..a5535107e 100644 --- a/src/nfa/nfa.cc +++ b/src/nfa/nfa.cc @@ -650,109 +650,25 @@ Nfa& Nfa::unite_nondet_with(const mata::nfa::Nfa& aut) { } Nfa Nfa::decode_utf8() const { - // // Decodes UTF-8 like transitions starting from the given state. - // auto decode_utf8_trans = [&](const State state, const uint8_t first_byte) -> std::vector<SymbolPost> { - // // Determine the length of the UTF-8 prefix - // const size_t prefix_len = (first_byte >> 5 == 0b110) ? 3 : - // (first_byte >> 4 == 0b1110) ? 4 : - // (first_byte >> 3 == 0b11110) ? 5 : 0; - // assert(prefix_len > 0); - // uint8_t first_byte_data = first_byte & (0xff >> (prefix_len)); - // size_t max_depth = prefix_len - 2; - - // std::vector<SymbolPost> result; - // std::stack<std::tuple<State, Symbol, uint8_t>> worklist; - // worklist.push({state, first_byte_data, 0}); - // // Inner limited depth DFS - combines multiple transitions into a single UTF-8 symbol - // while (!worklist.empty()) { - // std::tuple<State, Symbol, uint8_t> elem = worklist.top(); - // worklist.pop(); - // State src = std::get<0>(elem); - // Symbol symbol = std::get<1>(elem); - // uint8_t depth = std::get<2>(elem); - // assert(depth < max_depth); - // depth++; - - // for (const SymbolPost &symbol_post : this->delta[src]) { - // const uint8_t symbol_prefix = static_cast<uint8_t>(symbol_post.symbol & 0xc0); - // assert(symbol_prefix == 0x80); - // const uint8_t symbol_data = static_cast<uint8_t>(symbol_post.symbol & 0x7f); - // symbol = (symbol << 6) | symbol_data; - - // if (depth == max_depth) { - // // This is the last byte of the UTF-8 symbol. - // result.push_back(SymbolPost{symbol, symbol_post.targets}); - // } else { - // // This is an intermediate byte of the UTF-8 symbol. Continue the DFS. 
- // for (State target : symbol_post.targets) { - // worklist.push({target, symbol, depth}); - // } - // } - // } - // } - - // return result; - // }; - - // const size_t num_of_states{ this->num_of_states() }; - // Nfa result{ num_of_states, StateSet{this->initial}, StateSet{this->final} }; - // mata::BoolVector used(num_of_states, false); - - // std::stack<State> worklist; - // for (State state: this->initial) { - // worklist.push(state); - // used[state] = true; - // } - - // // Outer DFS - traverses the automaton transitions - // while (!worklist.empty()) { - // State src = worklist.top(); - // worklist.pop(); - // StatePost &result_state_post = result.delta.mutable_state_post(src); - // for (const SymbolPost &symbol_post: this->delta[src]) { - // Symbol symbol = symbol_post.symbol; - // if (symbol & 0x80) { - // // It is an UTF-8 symbol - // const uint8_t first_byte = static_cast<uint8_t>(symbol); - // for (const State target: symbol_post.targets) { - // for (const SymbolPost &symbol_post_decoded: decode_utf8_trans(target, first_byte)) { - // // Insert decoded transitions - // result_state_post.insert(std::move(symbol_post_decoded)); - // // Add targets to the worklist - // for (State target_decoded: symbol_post_decoded.targets) { - // if (used[target_decoded]) { - // continue; - // } - // used[target_decoded] = true; - // worklist.push(target_decoded); - // } - // } - // } - // } else { - // // It is standard ASCII symbol <0;127> - // result_state_post.insert(SymbolPost{symbol, symbol_post.targets}); - // for (State target: symbol_post.targets) { - // if (used[target]) { - // continue; - // } - // used[target] = true; - // worklist.push(target); - // } - // } - // } - // } - Nfa result{ this->num_of_states(), StateSet{this->initial}, StateSet{this->final} }; BoolVector used(this->num_of_states(), false); std::stack<State> worklist; auto push_state_set = [&](const StateSet& set) { for (State state: set) { + if (used[state]) { + continue; + } worklist.push(state); used[state] = true; 
} }; + // UTF-8 Byte Patterns: + // U+0000 to U+007F : 0xxxxxxx + // U+0080 to U+07FF : 110xxxxx 10xxxxxx + // U+0800 to U+FFFF : 1110xxxx 10xxxxxx 10xxxxxx + // U+010000 to U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx push_state_set(StateSet{this->initial}); while (!worklist.empty()) { State q1 = worklist.top(); diff --git a/tests/nfa/nfa.cc b/tests/nfa/nfa.cc index bda745b3a..7586b0f0c 100644 --- a/tests/nfa/nfa.cc +++ b/tests/nfa/nfa.cc @@ -4318,7 +4318,7 @@ TEST_CASE("mata::nfa::Nfa::decode_utf8") { CHECK(are_equivalent(result, aut.decode_utf8())); } - SECTION("between 0x100000 and 0x10FFFF") { + SECTION("between 0x10000 and 0x10FFFF") { Nfa aut; aut.initial.insert(0); aut.delta.add(0, 0xf0, 1); @@ -4361,16 +4361,41 @@ TEST_CASE("mata::nfa::Nfa::decode_utf8") { Nfa result; result.initial.insert(0); - result.delta.add(0, 0x100000, 1); - result.delta.add(1, 0x200000, 2); - result.delta.add(2, 0x300000, 3); - result.delta.add(3, 0x400000, 4); - result.delta.add(4, 0x500000, 5); - result.delta.add(5, 0x600000, 6); - result.delta.add(6, 0x700000, 7); - result.delta.add(7, 0x800000, 8); + result.delta.add(0, 0x10000, 1); + result.delta.add(1, 0x20000, 2); + result.delta.add(2, 0x30000, 3); + result.delta.add(3, 0x40000, 4); + result.delta.add(4, 0x50000, 5); + result.delta.add(5, 0x60000, 6); + result.delta.add(6, 0x70000, 7); + result.delta.add(7, 0x80000, 8); result.delta.add(8, 0x10ffff, 9); result.final.insert(9); CHECK(are_equivalent(result, aut.decode_utf8())); } + + SECTION("mix") { + Nfa aut; + aut.initial.insert(0); + aut.delta.add(0, 0x01, 1); + aut.delta.add(1, 0xc2, 2); + aut.delta.add(2, 0x90, 3); + aut.delta.add(3, 0xe0, 4); + aut.delta.add(4, 0xa2, 5); + aut.delta.add(5, 0xac, 6); + aut.delta.add(6, 0xf0, 7); + aut.delta.add(7, 0x90, 8); + aut.delta.add(8, 0x83, 9); + aut.delta.add(9, 0x8c, 10); + aut.final.insert(10); + + Nfa result; + result.initial.insert(0); + result.delta.add(0, 0x01, 1); + result.delta.add(1, 0x90, 2); + result.delta.add(2, 
0x8ac, 3); + result.delta.add(3, 0x100cc, 4); + result.final.insert(4); + CHECK(are_equivalent(result, aut.decode_utf8())); + } } diff --git a/tests/re2parser.cc b/tests/re2parser.cc index 9e296c655..2452df766 100644 --- a/tests/re2parser.cc +++ b/tests/re2parser.cc @@ -1515,4 +1515,101 @@ TEST_CASE("mata::Parser UTF-8 encoding") CHECK(are_equivalent(x, y)); } + SECTION("Regex range [x70-x90]") { + Nfa aut; + mata::parser::create_nfa(&aut, "[\\x{70}-\\x{90}]", false, 306, true, Encoding::UTF8); + aut = aut.decode_utf8(); + + Nfa result; + State initial_s = 0; + State final_s = 1; + result.initial.insert(initial_s); + result.final.insert(final_s); + for(Symbol c = 0x70; c <= 0x90; c++) { + result.delta.add(initial_s, c, final_s); + } + CHECK(are_equivalent(aut, result)); + } + + SECTION("Regex range [x700-x900]") { + Nfa aut; + mata::parser::create_nfa(&aut, "[\\x{700}-\\x{900}]", false, 306, true, Encoding::UTF8); + aut = aut.decode_utf8(); + + Nfa result; + State initial_s = 0; + State final_s = 1; + result.initial.insert(initial_s); + result.final.insert(final_s); + for(Symbol c = 0x700; c <= 0x900; c++) { + result.delta.add(initial_s, c, final_s); + } + CHECK(are_equivalent(aut, result)); + } + + SECTION("Regex range [xFF90-x10090]") { + Nfa aut; + mata::parser::create_nfa(&aut, "[\\x{FF90}-\\x{10090}]", false, 306, true, Encoding::UTF8); + aut = aut.decode_utf8(); + + Nfa result; + State initial_s = 0; + State final_s = 1; + result.initial.insert(initial_s); + result.final.insert(final_s); + for(Symbol c = 0xFF90; c <= 0x10090; c++) { + result.delta.add(initial_s, c, final_s); + } + CHECK(are_equivalent(aut, result)); + } + + SECTION("Regex range [x00-x900]") { + Nfa aut; + mata::parser::create_nfa(&aut, "[\\x{00}-\\x{900}]", false, 306, true, Encoding::UTF8); + aut = aut.decode_utf8(); + + Nfa result; + State initial_s = 0; + State final_s = 1; + result.initial.insert(initial_s); + result.final.insert(final_s); + for(Symbol c = 0; c <= 0x900; c++) { + 
result.delta.add(initial_s, c, final_s); + } + CHECK(are_equivalent(aut, result)); + } + + SECTION("Regex (\\x{60}*\\x{80})|(\\x{900}*\\x{600})") { + Nfa aut; + mata::parser::create_nfa(&aut, "(\\x{60}*\\x{80})|(\\x{900}*\\x{600})", false, 306, true, Encoding::UTF8); + aut = aut.decode_utf8(); + + Nfa result; + result.delta.add(0, 0x60, 0); + result.delta.add(0, 0x80, 1); + result.delta.add(2, 0x900, 2); + result.delta.add(2, 0x600, 3); + result.initial.insert(0); + result.initial.insert(2); + result.final.insert(1); + result.final.insert(3); + CHECK(are_equivalent(aut, result)); + } + + // SECTION("Regex .*") { + // Nfa aut; + // mata::parser::create_nfa(&aut, ".*", false, 306, true, Encoding::UTF8); + // aut = aut.decode_utf8(); + + // Nfa result; + // State initial_s = 0; + // State final_s = 1; + // result.initial.insert(initial_s); + // result.final.insert(final_s); + // for(Symbol c = 0; c <= 0x10FFFF; c++) { + // result.delta.add(initial_s, c, final_s); + // } + // CHECK(are_equivalent(aut, result)); + // } + } // }}}