Skip to content

Commit

Permalink
conflicts resolved
Browse files Browse the repository at this point in the history
  • Loading branch information
koniksedy committed Nov 18, 2024
2 parents 170186f + e806b8d commit a45f104
Show file tree
Hide file tree
Showing 5 changed files with 240 additions and 22 deletions.
4 changes: 4 additions & 0 deletions bindings/python/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@ ipython>=7.9.0
pandas>=1.3.5
networkx>=2.6.3
graphviz>=0.20.0
setuptools
papermill
ipykernel
seaborn
17 changes: 17 additions & 0 deletions include/mata/nfa/nfa.hh
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,12 @@ public:
*/
void print_to_dot(std::ostream &output) const;

/**
* @brief Prints the automaton to the file in DOT format
*
* Opens @p filename as an output file stream and forwards to the std::ostream overload of print_to_dot().
*
* @param filename Name of the file to print the automaton to
* @throws std::ios_base::failure if the file cannot be opened
*/
void print_to_dot(const std::string& filename) const;

/**
* @brief Prints the automaton in mata format
*
Expand All @@ -297,6 +303,7 @@ public:
* TODO handle alphabet of the automaton, currently we print the exact value of the symbols
*/
std::string print_to_mata() const;

/**
* @brief Prints the automaton to the output stream in mata format
*
Expand All @@ -306,6 +313,16 @@ public:
*/
void print_to_mata(std::ostream &output) const;

/**
* @brief Prints the automaton to the file in mata format
*
* Opens @p filename as an output file stream and forwards to the std::ostream overload of print_to_mata().
*
* If you need to parse the automaton again, use IntAlphabet in construct()
*
* TODO handle alphabet of the automaton, currently we print the exact value of the symbols
* @param filename Name of the file to print the automaton to
* @throws std::ios_base::failure if the file cannot be opened
*/
void print_to_mata(const std::string& filename) const;

// TODO: Relict from VATA. What to do with inclusion/ universality/ this post function? Revise all of them.
StateSet post(const StateSet& states, const Symbol& symbol) const;

Expand Down
17 changes: 17 additions & 0 deletions src/nfa/nfa.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <list>
#include <optional>
#include <iterator>
#include <fstream>

// MATA headers
#include "mata/utils/sparse-set.hh"
Expand Down Expand Up @@ -459,6 +460,14 @@ void Nfa::print_to_dot(std::ostream &output) const {
output << "}" << std::endl;
}

/**
 * Prints the automaton in DOT format into the file @p filename.
 *
 * The file is opened through std::ofstream and the actual printing is delegated to the std::ostream overload
 * of print_to_dot().
 *
 * @param filename Name of the file to print the automaton to.
 * @throws std::ios_base::failure if the file cannot be opened or if writing to it fails.
 */
void Nfa::print_to_dot(const std::string& filename) const {
    std::ofstream output(filename);
    if (!output) {
        throw std::ios_base::failure("Failed to open file: " + filename);
    }
    print_to_dot(output);
    // Flush explicitly and re-check the stream: otherwise a failed write (e.g., a full disk) would go unnoticed,
    // because the std::ofstream destructor does not report errors.
    output.flush();
    if (!output) {
        throw std::ios_base::failure("Failed to write to file: " + filename);
    }
}

std::string Nfa::print_to_mata() const {
std::stringstream output;
print_to_mata(output);
Expand Down Expand Up @@ -492,6 +501,14 @@ void Nfa::print_to_mata(std::ostream &output) const {
}
}

/**
 * Prints the automaton in mata format into the file @p filename.
 *
 * The file is opened through std::ofstream and the actual printing is delegated to the std::ostream overload
 * of print_to_mata().
 *
 * @param filename Name of the file to print the automaton to.
 * @throws std::ios_base::failure if the file cannot be opened or if writing to it fails.
 */
void Nfa::print_to_mata(const std::string& filename) const {
    std::ofstream output(filename);
    if (!output) {
        throw std::ios_base::failure("Failed to open file: " + filename);
    }
    print_to_mata(output);
    // Flush explicitly and re-check the stream: otherwise a failed write (e.g., a full disk) would go unnoticed,
    // because the std::ofstream destructor does not report errors.
    output.flush();
    if (!output) {
        throw std::ios_base::failure("Failed to write to file: " + filename);
    }
}

Nfa Nfa::get_one_letter_aut(Symbol abstract_symbol) const {
Nfa digraph{num_of_states(), StateSet(initial), StateSet(final) };
// Add directed transitions for digraph.
Expand Down
52 changes: 31 additions & 21 deletions src/re2parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -137,11 +137,14 @@ namespace {
this->outgoingEdges = std::vector<std::vector<std::pair<mata::Symbol, mata::nfa::State>>> (prog_size);

// We traverse all the states and create corresponding states and edges in Nfa
for (mata::nfa::State current_state = start_state; current_state < prog_size; current_state++) {
re2::Prog::Inst *inst = prog->inst(static_cast<int>(current_state));
for (State current_state = start_state, re2_state = start_state; re2_state < prog_size; ++
re2_state) {
/// Whether to increment the current state @c current_state when the @c re2_state increments.
bool increment_current_state{true};
re2::Prog::Inst *inst = prog->inst(static_cast<int>(re2_state));
// Every type of state can be final (due to epsilon transition), so we check it regardless of its type
if (this->state_cache.is_final_state[current_state]) {
this->make_state_final(current_state, explicit_nfa);
if (this->state_cache.is_final_state[re2_state]) {
this->make_state_final(re2_state, explicit_nfa);
}
switch (inst->opcode()) {
default:
Expand All @@ -165,33 +168,35 @@ namespace {
empty_flag = static_cast<int>(inst->empty());
// ^ - beginning of line
if (empty_flag & re2::kEmptyBeginLine) {
// TODO Symbol?
symbols.push_back(300);
increment_current_state = false;
}
// $ - end of line
if (empty_flag & re2::kEmptyEndLine) {
// TODO Symbol?
symbols.push_back(10);
// TODO How to handle?
// symbols.push_back(301);
increment_current_state = false;
}
// \A - beginning of text
if (empty_flag & re2::kEmptyBeginText) {
// TODO Symbol?
symbols.push_back(301);
increment_current_state = false;
}
// \z - end of text
if (empty_flag & re2::kEmptyEndText) {
// TODO Symbol?
symbols.push_back(302);
// TODO How to handle?
// symbols.push_back(302);
increment_current_state = false;
}
// \b - word boundary
if (empty_flag & re2::kEmptyWordBoundary) {
// TODO Symbol?
symbols.push_back(303);
// TODO How to handle?
// symbols.push_back(303);
increment_current_state = false;
}
// \B - not \b
if (empty_flag & re2::kEmptyNonWordBoundary) {
// TODO Symbol?
symbols.push_back(304);
// TODO How to handle?
// symbols.push_back(304);
increment_current_state = false;
}
break;
// kInstByteRange represents states with a "byte range" on the outgoing transition(s)
Expand All @@ -203,7 +208,7 @@ namespace {
symbols.push_back(symbol);
// Foldcase causes RE2 to do a case-insensitive match, so transitions will be made for
// both uppercase and lowercase symbols
if (inst->foldcase()) {
if (inst->foldcase() && symbol >= 'a' && symbol <= 'z') {
symbols.push_back(symbol-ascii_shift_value);
}
}
Expand All @@ -213,15 +218,17 @@ namespace {
if (!use_epsilon) {
// There is an epsilon transition to re2_state + 1, so we will need to copy the transitions of
// re2_state + 1 to re2_state.
if (!this->state_cache.is_last[current_state]) {
for (auto state: this->state_cache.state_mapping[current_state + 1]) {
copyEdgesFromTo.emplace_back(state, current_state);
if (!this->state_cache.is_last[re2_state]) {
for (auto state: this->state_cache.state_mapping[re2_state + 1]) {
copyEdgesFromTo.emplace_back(state, re2_state);
}
}
}
symbols.clear();
break;
}

if (increment_current_state) { ++current_state; }
}
if (!use_epsilon) {
// We will traverse the vector in reversed order. Like that, we will also handle chains of epsilon transitions
Expand Down Expand Up @@ -420,7 +427,8 @@ namespace {
if (inst->last()) {
this->state_cache.is_last[state] = true;
}
if (inst->opcode() == re2::kInstMatch) {
if (inst->opcode() == re2::kInstMatch ||
(inst->opcode() == re2::kInstEmptyWidth && inst->empty() & re2::kEmptyEndText)) {
this->state_cache.is_final_state[state] = true;
}
}
Expand Down Expand Up @@ -507,6 +515,8 @@ void mata::parser::create_nfa(nfa::Nfa* nfa, const std::string& pattern, bool us
RegexParser regexParser{};
auto parsed_regex = regexParser.parse_regex_string(pattern, encoding);
auto program = parsed_regex->CompileToProg(regexParser.options.max_mem() * 2 / 3);
// FIXME: use_epsilon = false completely breaks the method convert_pro_to_nfa(). Needs fixing before allowing to
// pass the argument use_epsilon to convert_pro_to_nfa().
regexParser.convert_pro_to_nfa(nfa, program, true, epsilon_value);
delete program;
// Decrements reference count and deletes object if the count reaches 0
Expand Down
172 changes: 171 additions & 1 deletion tests/re2parser.cc
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#include <catch2/catch_test_macros.hpp>
#include <catch2/matchers/catch_matchers_string.hpp>

#include "mata/nfa/nfa.hh"
#include "mata/parser/re2parser.hh"
#include "mata/nfa/builder.hh"
#include "mata/nfa/nfa.hh"

using namespace mata::nfa;

using Symbol = mata::Symbol;
Expand Down Expand Up @@ -1273,6 +1275,24 @@ TEST_CASE("mata::Parser error")
CHECK(!x.is_in_lang(Run{ Word{ 'a', 'a', 'a', 'a', 'a', 'a' }, {} }));
}

SECTION("Regex from issue #456") {
    // Automaton built by the parser for a character class spanning 0x00-0x7F with 0x5B left out.
    Nfa parsed;
    mata::parser::create_nfa(&parsed, "[\\x00-\\x5a\\x5c-\\x7F]");

    // Reference automaton: a single step from the initial state to the final state
    // over every symbol in 0x00-0x7F except 0x5B.
    State source = 0;
    State target = 1;
    Nfa reference;
    reference.initial.insert(source);
    reference.final.insert(target);
    for (Symbol symbol = 0; symbol <= 0x7F; ++symbol) {
        if (symbol != 0x5B) {
            reference.delta.add(source, symbol, target);
        }
    }
    CHECK(are_equivalent(parsed, reference));
}

SECTION("Another failing regex") {
Nfa x;
mata::parser::create_nfa(&x, "(cd(abcde)+)|(a(aaa)+|ccc+)");
Expand Down Expand Up @@ -1598,3 +1618,153 @@ TEST_CASE("mata::Parser UTF-8 encoding")
}

} // }}}

// Regression tests for regexes containing the '^' and '$' anchors (plus one escaped-backslash case).
// Every section parses a regex into `nfa` with mata::parser::create_nfa() and checks that the result is
// language-equivalent to a hand-written automaton stored in `expected`.
TEST_CASE("mata::parser Parsing regexes with ^ and $") {
Nfa nfa;
Nfa expected{};

// The pattern is 'a', an escaped backslash, 'b'; the expected automaton reads the symbols 97, 92, 98 in a chain.
SECTION("Handling of '\\'") {
mata::parser::create_nfa(&nfa, "a\\\\b");
expected = mata::nfa::builder::parse_from_mata(
std::string{ R"(
@NFA-explicit
%Alphabet-auto
%Initial q0
%Final q3
q0 97 q1
q1 92 q2
q2 98 q3)"
});
CHECK(mata::nfa::are_equivalent(nfa, expected));
}

// In the following six sections, the expected automaton always accepts exactly the words "a" and "b":
// state 0 is initial, state 1 is final, and there is one transition over 'a' and one over 'b' between them.
SECTION("a|b$, a simple OR example with end marker") {
mata::parser::create_nfa(&nfa, "a|b$");
expected.initial.insert(0);
expected.delta.add(0, 'a', 1);
expected.delta.add(0, 'b', 1);
expected.final.insert(1);
CHECK(mata::nfa::are_equivalent(nfa, expected));
}

SECTION("^a|b, a simple OR example with begin marker") {
mata::parser::create_nfa(&nfa, "^a|b");
expected.initial.insert(0);
expected.delta.add(0, 'a', 1);
expected.delta.add(0, 'b', 1);
expected.final.insert(1);
CHECK(mata::nfa::are_equivalent(nfa, expected));
}

SECTION("^a|b$, a simple OR example with begin and end marker") {
mata::parser::create_nfa(&nfa, "^a|b$");
expected.initial.insert(0);
expected.delta.add(0, 'a', 1);
expected.delta.add(0, 'b', 1);
expected.final.insert(1);
CHECK(mata::nfa::are_equivalent(nfa, expected));
}

SECTION("^(a|b)$, a simple OR example with begin and end marker around capture group") {
mata::parser::create_nfa(&nfa, "^(a|b)$");
expected.initial.insert(0);
expected.delta.add(0, 'a', 1);
expected.delta.add(0, 'b', 1);
expected.final.insert(1);
CHECK(mata::nfa::are_equivalent(nfa, expected));
}

SECTION("a$|b, a simple OR example with end marker on the left side") {
mata::parser::create_nfa(&nfa, "a$|b");
expected.initial.insert(0);
expected.delta.add(0, 'a', 1);
expected.delta.add(0, 'b', 1);
expected.final.insert(1);
CHECK(mata::nfa::are_equivalent(nfa, expected));
}

SECTION("^a$|^b$, a simple OR example with multiple begin and end markers") {
mata::parser::create_nfa(&nfa, "^a$|^b$");
expected.initial.insert(0);
expected.delta.add(0, 'a', 1);
expected.delta.add(0, 'b', 1);
expected.final.insert(1);
CHECK(mata::nfa::are_equivalent(nfa, expected));
}

// In the following two sections, the expected automaton accepts exactly the words "aed" and "bab";
// both branches start in state 0 and end in the shared final state 3.
SECTION("aed|(bab)$, a simple OR example with trailing end marker") {
mata::parser::create_nfa(&nfa, "aed|(bab)$");
expected.initial.insert(0);
expected.delta.add(0, 'a', 1);
expected.delta.add(1, 'e', 2);
expected.delta.add(2, 'd', 3);
expected.delta.add(0, 'b', 4);
expected.delta.add(4, 'a', 5);
expected.delta.add(5, 'b', 3);
expected.final.insert(3);
CHECK(mata::nfa::are_equivalent(nfa, expected));
}

SECTION("aed|bab$, a simple OR example with trailing end marker") {
mata::parser::create_nfa(&nfa, "aed|bab$");
expected.initial.insert(0);
expected.delta.add(0, 'a', 1);
expected.delta.add(1, 'e', 2);
expected.delta.add(2, 'd', 3);
expected.delta.add(0, 'b', 4);
expected.delta.add(4, 'a', 5);
expected.delta.add(5, 'b', 3);
expected.final.insert(3);
CHECK(mata::nfa::are_equivalent(nfa, expected));
}

// Anchored pattern built from two-letter (upper/lower case) character classes: "systempath", a backslash, '=',
// and then either "http" with an optional 's' or "ftp". The expected automaton lists both the upper-case and
// the lower-case symbol value for every letter; q16 (after "http") and q17 (after "https" or "ftp") are final.
SECTION("^systempath\\=https|ftp$ correct parentheses") {
mata::parser::create_nfa(&nfa, "^[sS][yY][sS][tT][eE][mM][pP][aA][tT][hH]\\\\=(([hH][tT]{2}[pP][sS]?)|([fF][tT][pP]))$");
expected = mata::nfa::builder::parse_from_mata(std::string{ R"(
@NFA-explicit
%Alphabet-auto
%Initial q0
%Final q16 q17
q0 83 q1
q0 115 q1
q1 89 q2
q1 121 q2
q2 83 q3
q2 115 q3
q3 84 q4
q3 116 q4
q4 69 q5
q4 101 q5
q5 77 q6
q5 109 q6
q6 80 q7
q6 112 q7
q7 65 q8
q7 97 q8
q8 84 q9
q8 116 q9
q9 72 q10
q9 104 q10
q10 92 q11
q11 61 q12
q12 70 q18
q12 72 q13
q12 102 q18
q12 104 q13
q13 84 q14
q13 116 q14
q14 84 q15
q14 116 q15
q15 80 q16
q15 112 q16
q16 83 q17
q16 115 q17
q18 84 q19
q18 116 q19
q19 80 q17
q19 112 q17
)"
});
CHECK(mata::nfa::are_equivalent(nfa, expected));
}
}

0 comments on commit a45f104

Please sign in to comment.