Skip to content

Commit

Permalink
tested
Browse files Browse the repository at this point in the history
  • Loading branch information
koniksedy committed Nov 15, 2024
1 parent de3ff23 commit 170186f
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 38 deletions.
2 changes: 1 addition & 1 deletion src/alphabet.cc
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ mata::Word mata::decode_word_utf8(const mata::Word& word) {
assert(i + 2 < word.size());
decoded_word.push_back(((symbol & 0x0F) << 12) | ((word[i+1] & 0x3F) << 6) | (word[i+2] & 0x3F));
i += 2;
} else if ((symbol & 0xF8) == 0xF0) {
} else if ((symbol & 0xF8) == 0xF0 && symbol < 0xF5) {
// U+010000 to U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
assert(i + 3 < word.size());
decoded_word.push_back(((symbol & 0x07) << 18) | ((word[i+1] & 0x3F) << 12) | ((word[i+2] & 0x3F) << 6) | (word[i+3] & 0x3F));
Expand Down
46 changes: 41 additions & 5 deletions src/nfa/nfa.cc
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,7 @@ Nfa Nfa::decode_utf8() const {
BoolVector used(this->num_of_states(), false);
std::stack<State> worklist;

// Pushes a set of states to the worklist and marks them as used.
auto push_state_set = [&](const StateSet& set) {
for (State state: set) {
if (used[state]) {
Expand All @@ -664,51 +665,86 @@ Nfa Nfa::decode_utf8() const {
}
};

// Adds a symbol_post to the state_post.
// If the transition sequence is deterministic, we can use emplace_back
// because symbols are discovered in ascending order. However, in cases
// of nondeterministic sequences, we must use insert to ensure proper ordering.
// For example, consider the sequences 0xC8 0x80 and 0xC8 0x88.
// Based solely on the first byte (0xC8), we cannot determine which sequence
// will result in the higher number.
auto add_to_state_post = [&](StatePost &state_post, const SymbolPost &symbol_post, const bool is_nondet) {
if (is_nondet) {
state_post.insert(std::move(symbol_post));
} else {
state_post.emplace_back(std::move(symbol_post));
}
};

// UTF-8 Byte Patterns:
// U+0000 to U+007F : 0xxxxxxx
// U+0080 to U+07FF : 110xxxxx 10xxxxxx
// U+0800 to U+FFFF : 1110xxxx 10xxxxxx 10xxxxxx
// U+010000 to U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// NOTE: Due to the nature of RE2, the automaton language can contain unexpected (invalid)
// UTF-8 sequences, such as 11000000 10000000 (an overlong encoding of U+0000). Because of that,
// we need to check if the decoded symbol is within the valid range of Unicode code points
// for the length of its byte sequence.
push_state_set(StateSet{this->initial});
while (!worklist.empty()) {
State q1 = worklist.top();
StatePost &q1_state_post = result.delta.mutable_state_post(q1);
worklist.pop();

// 1st Byte
for (const SymbolPost &sp1: this->delta[q1]) {
const Symbol s1 = sp1.symbol;
if ((s1 & 0x80) == 0x00) {
result.delta.add(q1, s1, sp1.targets);
q1_state_post.emplace_back(SymbolPost{s1, sp1.targets});;
push_state_set(sp1.targets);
continue;
}
// 2nd Byte
const bool is_nondet1 = sp1.targets.size() > 1;
for (const State q2: sp1.targets) {
for (const SymbolPost &sp2: this->delta[q2]) {
const Symbol s2 = sp2.symbol;
if ((s1 & 0xE0) == 0xC0) {
assert((s2 & 0xC0) == 0x80);
result.delta.add(q1, ((s1 & 0x1F) << 6) | (s2 & 0x3F), sp2.targets);
const Symbol symbol = ((s1 & 0x1F) << 6) | (s2 & 0x3F);
if (symbol < 0x80) {
continue; // Invalid UTF-8 sequence
}
assert(symbol <= 0x7FF);
add_to_state_post(q1_state_post, SymbolPost{symbol, sp2.targets}, is_nondet1);
push_state_set(sp2.targets);
continue;
}
// 3rd Byte
const bool is_nondet2 = is_nondet1 || sp2.targets.size() > 1;
for (const State q3: sp2.targets) {
for (const SymbolPost &sp3: this->delta[q3]) {
const Symbol s3 = sp3.symbol;
if ((s1 & 0xF0) == 0xE0) {
assert((s3 & 0xC0) == 0x80);
result.delta.add(q1, ((s1 & 0x0F) << 12) | ((s2 & 0x3F) << 6) | (s3 & 0x3F), sp3.targets);
const Symbol symbol = ((s1 & 0x0F) << 12) | ((s2 & 0x3F) << 6) | (s3 & 0x3F);
if (symbol < 0x800) {
continue; // Invalid UTF-8 sequence
}
assert(symbol <= 0xFFFF);
add_to_state_post(q1_state_post, SymbolPost{symbol, sp3.targets}, is_nondet2);
push_state_set(sp3.targets);
continue;
}
// 4th Byte
const bool is_nondet3 = is_nondet2 || sp3.targets.size() > 1;
for (const State q4: sp3.targets) {
for (const SymbolPost &sp4: this->delta[q4]) {
const Symbol s4 = sp4.symbol;
assert((s1 & 0xF8) == 0xF0);
assert((s4 & 0xC0) == 0x80);
result.delta.add(q1, ((s1 & 0x07) << 18) | ((s2 & 0x3F) << 12) | ((s3 & 0x3F) << 6) | (s4 & 0x3F), sp4.targets);
const Symbol symbol = ((s1 & 0x07) << 18) | ((s2 & 0x3F) << 12) | ((s3 & 0x3F) << 6) | (s4 & 0x3F);
if (symbol < 0x10000 || symbol > 0x10FFFF) {
continue; // Invalid UTF-8 sequence
}
add_to_state_post(q1_state_post, SymbolPost{symbol, sp4.targets}, is_nondet3);
push_state_set(sp4.targets);
}
}
Expand Down
4 changes: 3 additions & 1 deletion src/re2parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ namespace {
/**
* Creates parsed regex (ie. Regexp*) from string regex_string
* @param regex_string Regex to be parsed as a string
* @param encoding Encoding of the regex, default is Latin1
* @return Parsed regex as RE2 Regexp*
*/
re2::Regexp* parse_regex_string(const std::string& regex_string, const Encoding encoding = Encoding::Latin1) const {
Expand Down Expand Up @@ -490,11 +491,12 @@ namespace {
}

/**
* The main method, it creates NFA from regex
* The main method, it creates NFA from regex.
* @param pattern regex as string
* @param use_epsilon whether to create NFA with epsilon transitions or not
* @param epsilon_value value, that will represent epsilon on transitions
* @param use_reduce if set to true the result is trimmed and reduced using simulation reduction
* @param encoding encoding of the regex, default is Latin1
* @return Nfa corresponding to pattern
*/
void mata::parser::create_nfa(nfa::Nfa* nfa, const std::string& pattern, bool use_epsilon, mata::Symbol epsilon_value, bool use_reduce, const Encoding encoding) {
Expand Down
47 changes: 16 additions & 31 deletions tests/re2parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1563,22 +1563,6 @@ TEST_CASE("mata::Parser UTF-8 encoding")
CHECK(are_equivalent(aut, result));
}

SECTION("Regex range [x00-x900]") {
    // Expected automaton: one step from the initial to the final state
    // over every symbol in the range 0x00..0x900 (inclusive).
    const State source = 0;
    const State target = 1;
    Nfa result;
    result.initial.insert(source);
    result.final.insert(target);
    for (Symbol symbol = 0; symbol <= 0x900; ++symbol) {
        result.delta.add(source, symbol, target);
    }

    // Automaton under test: parse the range with UTF-8 encoding,
    // then decode the byte-level transitions back to code points.
    Nfa aut;
    mata::parser::create_nfa(&aut, "[\\x{00}-\\x{900}]", false, 306, true, Encoding::UTF8);
    aut = aut.decode_utf8();

    CHECK(are_equivalent(aut, result));
}

SECTION("Regex (\\x{60}*\\x{80})|(\\x{900}*\\x{600})") {
Nfa aut;
mata::parser::create_nfa(&aut, "(\\x{60}*\\x{80})|(\\x{900}*\\x{600})", false, 306, true, Encoding::UTF8);
Expand All @@ -1596,20 +1580,21 @@ TEST_CASE("mata::Parser UTF-8 encoding")
CHECK(are_equivalent(aut, result));
}

// SECTION("Regex .*") {
// Nfa aut;
// mata::parser::create_nfa(&aut, ".*", false, 306, true, Encoding::UTF8);
// aut = aut.decode_utf8();

// Nfa result;
// State initial_s = 0;
// State final_s = 1;
// result.initial.insert(initial_s);
// result.final.insert(final_s);
// for(Symbol c = 0; c <= 0x10FFFF; c++) {
// result.delta.add(initial_s, c, final_s);
// }
// CHECK(are_equivalent(aut, result));
// }
// A proper test, but takes about 2 seconds to run.
SECTION("Regex [\\x{00}-\\x{10FFFF}]") {
    // Parse the full Unicode range with UTF-8 encoding and decode it back to code points.
    Nfa aut;
    mata::parser::create_nfa(&aut, "[\\x{00}-\\x{10FFFF}]", false, 306, true, Encoding::UTF8);
    aut = aut.decode_utf8();

    // Sampled code points, grouped by the length of their UTF-8 encoding;
    // both ends of the range (0x00 and 0x10FFFF) are included.
    const std::vector<Symbol> sampled_code_points = {
        // 1-byte encodings
        0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x7f,
        // 2-byte encodings
        0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0, 0xff, 0x100, 0x110,
        // 3-byte encodings
        0x36f0, 0x57fc, 0x6177, 0x7498, 0x8f3f, 0x9fc8,
        // 4-byte encodings
        0x1101e, 0x14348, 0x14e34, 0x19581, 0x1c48e, 0x1f1cc, 0x1f91d, 0x222a6, 0x22e11,
        0xe54f5, 0xe7934, 0xe93a4, 0xe998d, 0xebee8, 0xedb9e, 0xef98b, 0xf12af, 0xf51e2,
        0xf557f, 0xf6b08, 0xfa7f0, 0xfacb2, 0xfd719, 0x106d12, 0x106d66, 0x109220,
        0x10a608, 0x10c1f5, 0x10FFFF
    };

    // Every sampled code point must be accepted as a one-symbol word.
    for (const Symbol code_point : sampled_code_points) {
        CHECK(aut.is_in_lang(Run{Word{code_point}, {}}));
    }
}

} // }}}

0 comments on commit 170186f

Please sign in to comment.