From 2b90f93f8986817e70f1475c8cedcf875e0a067b Mon Sep 17 00:00:00 2001 From: Raekye Date: Tue, 23 Jul 2019 01:38:18 -0400 Subject: [PATCH] cleanup, fixed regex parser groups --- midori/src/main.cpp | 6 +-- midori/src/midori/lexer.cpp | 30 ++++++----- midori/src/midori/lexer.h | 4 -- midori/src/midori/parser.cpp | 8 +++ midori/src/midori/parser.h | 11 +--- midori/src/midori/regex_engine.cpp | 87 +++++++++++++----------------- midori/src/midori/utf8.h | 32 ++++++++++- midori/tests/CMakeLists.txt | 1 + midori/tests/parser.cpp | 20 +++++++ midori/tests/regex_engine.cpp | 34 ++++++++++++ 10 files changed, 150 insertions(+), 83 deletions(-) create mode 100644 midori/tests/regex_engine.cpp diff --git a/midori/src/main.cpp b/midori/src/main.cpp index ea8fb63..a61243b 100644 --- a/midori/src/main.cpp +++ b/midori/src/main.cpp @@ -9,8 +9,8 @@ int test_regex_engine() { RegexEngine re; - std::string pattern = "(abc){0,3}[a-zA-Z]|def.\\.[^a-zA-Z]?+-^\\n+[^\\t\\xff-\\u12345678]"; - //std::string pattern = "[a-]"; + //std::string pattern = "(abc){0,3}[a-zA-Z]|def.\\.[^a-zA-Z]?+-^\\n+[^\\t\\xff-\\u12345678]"; + std::string pattern = "[-a-b-cd---]"; std::unique_ptr r = re.parse(pattern); RegexASTPrinter printer; r->accept(&printer); @@ -124,10 +124,10 @@ int main() { ULong x = ~0; std::cout << "-1 is " << x << std::endl; test_interval_tree(); - test_regex_engine(); test_parser0(); test_parser2(); test_parser1(); + test_regex_engine(); //test_generator(); return 0; } diff --git a/midori/src/midori/lexer.cpp b/midori/src/midori/lexer.cpp index 22dd49a..038d0c6 100644 --- a/midori/src/midori/lexer.cpp +++ b/midori/src/midori/lexer.cpp @@ -14,8 +14,8 @@ FileInputStream::FileInputStream(std::istream* file) : file(file) { } Long FileInputStream::get() { - UInt ch = this->file->get(); - if (!this->file->good()) { + Long ch = utf8::codepoint_from_istream(this->file); + if (ch < 0) { if (this->file->eof()) { return Lexer::CHAR_EOF; } @@ -35,8 +35,8 @@ Long VectorInputStream::get() { return this->v.at(this->pos++); } -Lexer::Lexer() : current_state(nullptr), buffer_pos(0), location(0, 0) { - return; +Lexer::Lexer() { + this->reset(); } void Lexer::generate() { @@ -137,8 +137,8 @@ void Lexer::generate() { void Lexer::reset() { this->buffer.clear(); this->buffer_pos = 0; - this->location.line = 0; - this->location.column = 0; + this->location.line = 1; + this->location.column = 1; } void Lexer::add_rule(std::string rule, std::unique_ptr regex) { @@ -147,7 +147,7 @@ void Lexer::add_rule(std::string rule, std::unique_ptr regex) { } std::unique_ptr Lexer::scan(IInputStream* in) { - this->current_state = this->dfa->root(); + RegexDFAState* current_state = this->dfa->root(); bool matched = false; std::vector matched_tags; std::string matched_str = ""; @@ -156,9 +156,9 @@ std::unique_ptr Lexer::scan(IInputStream* in) { std::string found_buffer = ""; std::unique_ptr t; while (true) { - if (!this->current_state->terminals.empty()) { + if (!current_state->terminals.empty()) { matched = true; - matched_tags = this->current_state->terminals; + matched_tags = current_state->terminals; matched_str.append(found_buffer); matched_buffer_pos = this->buffer_pos; matched_location = this->location; @@ -167,7 +167,7 @@ std::unique_ptr Lexer::scan(IInputStream* in) { Long ch = this->read(in); if (ch == '\n') { this->location.line++; - this->location.column = 0; + this->location.column = 1; } else { this->location.column++; } @@ -185,16 +185,18 @@ std::unique_ptr Lexer::scan(IInputStream* in) { t.reset(new Token({ Lexer::TOKEN_BAD }, found_buffer, matched_location)); break; } - RegexDFAState* next = this->current_state->next(ch); + found_buffer.append(utf8::string_from_codepoint(ch)); + this->buffer_pos++; + RegexDFAState* next = current_state->next(ch); if (next == nullptr) { if (matched) { t.reset(new Token(matched_tags, matched_str, matched_location)); + } else { + t.reset(new Token({ Lexer::TOKEN_BAD }, found_buffer, matched_location)); } break; } - found_buffer.append(utf8::string_from_codepoint(ch)); - this->buffer_pos++; - this->current_state = next; + current_state = next; } this->buffer_pos = matched_buffer_pos; this->location = matched_location; diff --git a/midori/src/midori/lexer.h b/midori/src/midori/lexer.h index 2ef99b8..b80b078 100644 --- a/midori/src/midori/lexer.h +++ b/midori/src/midori/lexer.h @@ -14,9 +14,6 @@ struct LocationInfo { UInt line; UInt column; - LocationInfo(UInt line, UInt column) : line(line), column(column) { - return; - } }; struct Token { @@ -72,7 +69,6 @@ class Lexer { std::vector> rules_regex; std::unique_ptr dfa; - RegexDFAState* current_state; std::vector buffer; UInt buffer_pos; diff --git a/midori/src/midori/parser.cpp b/midori/src/midori/parser.cpp index 206f23e..81c5caf 100644 --- a/midori/src/midori/parser.cpp +++ b/midori/src/midori/parser.cpp @@ -133,6 +133,14 @@ bool Parser::parse_advance(std::unique_ptr t) { return false; } } + std::cout << "no rules, expected to see" << std::endl; + for (auto const& kv : curr->next) { + std::cout << kv.first << " -> " << kv.second->index << std::endl; + } + for (auto const& kv : curr->reductions) { + std::cout << kv.first << " <- "; + Parser::debug_production(kv.second); + } return true; } diff --git a/midori/src/midori/parser.h b/midori/src/midori/parser.h index a67e415..190b832 100644 --- a/midori/src/midori/parser.h +++ b/midori/src/midori/parser.h @@ -22,8 +22,6 @@ class ParserAST; typedef std::function(MatchedNonterminal*)> ProductionHandler; typedef std::pair Item; -typedef std::string Symbol; -typedef std::tuple ExtendedSymbol; class ParserAST { public: @@ -36,19 +34,12 @@ struct Production { ProductionHandler handler; }; -struct ExtendedProduction { - ExtendedSymbol target; - std::vector symbols; - Production* orig; -}; - struct ItemSet { + Int index; std::set head; - std::set additionals; std::set closure; std::map next; std::map reductions; - Int index; }; class Match { diff --git a/midori/src/midori/regex_engine.cpp b/midori/src/midori/regex_engine.cpp index 4b2bb02..047d00e 100644 --- a/midori/src/midori/regex_engine.cpp +++ b/midori/src/midori/regex_engine.cpp @@ -1,8 +1,9 @@ #include "regex_engine.h" -#include #include "helper.h" #include "utf8.h" +#include +#include ParserRegexAST::ParserRegexAST(std::unique_ptr r) : regex(std::move(r)) { return; @@ -194,41 +195,26 @@ std::unique_ptr RegexEngine::make() { MatchedNonterminal* n2 = m->nonterminal(1); ParserRegexAST* r1 = dynamic_cast(n1->value.get()); ParserRegexAST* r2 = dynamic_cast(n2->value.get()); + RegexASTGroup* g1 = dynamic_cast(r1->regex.get()); + RegexASTGroup* g2 = dynamic_cast(r2->regex.get()); + assert(g1->span->next == nullptr); std::unique_ptr car(new RegexASTGroup::RangeList); - if (RegexASTLiteral* l = dynamic_cast(r1->regex.get())) { - car->range.first = l->ch; - car->range.second = l->ch; - } else if (RegexASTGroup* g = dynamic_cast(r1->regex.get())) { - assert(g->span->next == nullptr); - car->range.first = g->span->range.first; - car->range.second = g->span->range.second; - } - std::unique_ptr cdr = nullptr; - if (RegexASTLiteral* l2 = dynamic_cast(r2->regex.get())) { - cdr.reset(new RegexASTGroup::RangeList); - cdr->range.first = l2->ch; - cdr->range.second = l2->ch; - cdr->next = nullptr; - } else if (RegexASTGroup* g2 = dynamic_cast(r2->regex.get())) { - cdr = std::move(g2->span); - } - car->next = std::move(cdr); + car->range.first = g1->span->range.first; + car->range.second = g1->span->range.second; + car->next = std::move(g2->span); return std::unique_ptr(new ParserRegexAST(std::unique_ptr(new RegexASTGroup(false, std::move(car))))); }); p->add_production("group_contents", { "group_element" }, [](MatchedNonterminal* m) -> std::unique_ptr { MatchedNonterminal* n = m->nonterminal(0); return std::move(n->value); }); + /* p->add_production("group_element", { "group_range" }, [](MatchedNonterminal* m) -> std::unique_ptr { MatchedNonterminal* n = m->nonterminal(0); return std::move(n->value); }); */ - p->add_production("group_element", { "group_literal" }, [](MatchedNonterminal* m) -> std::unique_ptr { - MatchedNonterminal* n = m->nonterminal(0); - return std::move(n->value); - }); p->add_production("group_element", { "group_element", "DASH", "group_literal" }, [](MatchedNonterminal* m) -> std::unique_ptr { MatchedNonterminal* n1 = m->nonterminal(0); MatchedNonterminal* n2 = m->nonterminal(2); @@ -236,39 +222,38 @@ std::unique_ptr RegexEngine::make() { ParserRegexAST* r2 = dynamic_cast(n2->value.get()); std::unique_ptr p1(std::move(r1->regex)); std::unique_ptr p2(std::move(r2->regex)); - RegexASTLiteral* l1 = dynamic_cast(p1.get()); - RegexASTLiteral* l2 = dynamic_cast(p2.get()); - return std::unique_ptr(new ParserRegexAST(std::unique_ptr(RegexASTGroup::make(false, { l1->ch, l2->ch })))); + RegexASTGroup* g1 = dynamic_cast(p1.get()); + RegexASTGroup* g2 = dynamic_cast(p2.get()); + assert(g1->span->next == nullptr); + assert(g2->span->next == nullptr); + assert(g2->span->range.first == g2->span->range.second); + UInt a = g1->span->range.first; + UInt b = g1->span->range.second; + UInt c = g2->span->range.first; + UInt lower = std::min({ a, b, c }); + UInt upper = std::max({ a, b, c }); + return std::unique_ptr(new ParserRegexAST(std::unique_ptr(RegexASTGroup::make(false, { lower, upper })))); }); - - p->add_production("group_literal", { "group_escaped_literal" }, [](MatchedNonterminal* m) -> std::unique_ptr { + p->add_production("group_element", { "group_literal" }, [](MatchedNonterminal* m) -> std::unique_ptr { MatchedNonterminal* n = m->nonterminal(0); return std::move(n->value); }); - /* - p->add_production("group_literal", { "DEC" }, [](MatchedNonterminal* m) -> std::unique_ptr { - MatchedTerminal* n = m->terminal(0); - return std::unique_ptr(new ParserRegexAST(std::unique_ptr(new RegexASTLiteral(n->token->lexeme.at(0))))); + + p->add_production("group_literal", { "group_literal_char" }, [](MatchedNonterminal* m) -> std::unique_ptr { + MatchedNonterminal* n = m->nonterminal(0); + ParserRegexAST* r = dynamic_cast(n->value.get()); + RegexASTLiteral* l = dynamic_cast(r->regex.get()); + return std::unique_ptr(new ParserRegexAST(std::unique_ptr(RegexASTGroup::make(false, { l->ch, l->ch })))); }); - p->add_production("group_literal", { "HEX" }, [](MatchedNonterminal* m) -> std::unique_ptr { - MatchedTerminal* n = m->terminal(0); - return std::unique_ptr(new ParserRegexAST(std::unique_ptr(new RegexASTLiteral(n->token->lexeme.at(0))))); + p->add_production("group_literal_char", { "group_escaped_literal" }, [](MatchedNonterminal* m) -> std::unique_ptr { + MatchedNonterminal* n = m->nonterminal(0); + return std::move(n->value); }); - RegexEngine::add_literal(p.get(), "group_literal", "N", 'n'); - RegexEngine::add_literal(p.get(), "group_literal", "T", 't'); - RegexEngine::add_literal(p.get(), "group_literal", "X", 'x'); - RegexEngine::add_literal(p.get(), "group_literal", "COMMA", ','); - RegexEngine::add_literal(p.get(), "group_literal", "DOT", '.'); - RegexEngine::add_literal(p.get(), "group_literal", "LPAREN", '('); - RegexEngine::add_literal(p.get(), "group_literal", "RPAREN", ')'); - RegexEngine::add_literal(p.get(), "group_literal", "LBRACE", '{'); - RegexEngine::add_literal(p.get(), "group_literal", "RBRACE", '}'); - RegexEngine::add_literal(p.get(), "group_literal", "STAR", '*'); - RegexEngine::add_literal(p.get(), "group_literal", "PLUS", '+'); - RegexEngine::add_literal(p.get(), "group_literal", "QUESTION", '?'); - RegexEngine::add_literal(p.get(), "group_literal", "OR", '|'); - */ - p->add_production("group_literal", { "GROUP_ANY" }, [](MatchedNonterminal* m) -> std::unique_ptr { + p->add_production("group_literal_char", { "DASH" }, [](MatchedNonterminal* m) -> std::unique_ptr { + (void) m; + return std::unique_ptr(new ParserRegexAST(std::unique_ptr(new RegexASTLiteral('-')))); + }); + p->add_production("group_literal_char", { "GROUP_ANY" }, [](MatchedNonterminal* m) -> std::unique_ptr { MatchedTerminal* n = m->terminal(0); Long ch = utf8::codepoint_from_string(n->token->lexeme, 0, nullptr); assert(ch >= 0); @@ -290,7 +275,7 @@ std::unique_ptr RegexEngine::make() { RegexEngine::add_literal(p.get(), "group_escape_special", "N", '\n'); RegexEngine::add_literal(p.get(), "group_escape_special", "T", '\t'); RegexEngine::add_literal(p.get(), "group_escape_special", "ESCAPE", '\\'); - RegexEngine::add_literal(p.get(), "group_escape_special", "DASH", '-'); + //RegexEngine::add_literal(p.get(), "group_escape_special", "DASH", '-'); RegexEngine::add_literal(p.get(), "group_escape_special", "LBRACKET", '['); RegexEngine::add_literal(p.get(), "group_escape_special", "RBRACKET", ']'); diff --git a/midori/src/midori/utf8.h b/midori/src/midori/utf8.h index 612dc28..dc28768 100644 --- a/midori/src/midori/utf8.h +++ b/midori/src/midori/utf8.h @@ -1,8 +1,9 @@ #ifndef MIDORI_UTF8_H_INCLUDED #define MIDORI_UTF8_H_INCLUDED -#include #include "global.h" +#include +#include class utf8 { public: @@ -66,6 +67,35 @@ class utf8 { } return ((a & 0x07) << 18) | ((b & 0x3f) << 12) | ((c & 0x3f) << 6) | (d & 0x3f); } + + static Long codepoint_from_istream(std::istream* is) { + Int a = is->get(); + if (!is->good()) { + return -1; + } + if ((a & 0x80) == 0) { + return a; + } + Int b = is->get(); + if (!is->good()) { + return -1; + } + if ((a & 0xe0) == 0xc0) { + return ((a & 0x1f) << 6) | (b & 0x3f); + } + Int c = is->get(); + if (!is->good()) { + return -1; + } + if ((a & 0xf0) == 0xe0) { + return ((a & 0x0f) << 12) | ((b & 0x3f) << 6) | (c & 0x3f); + } + Int d = is->get(); + if (!is->good()) { + return -1; + } + return ((a & 0x07) << 18) | ((b & 0x3f) << 12) | ((c & 0x3f) << 6) | (d & 0x3f); + } }; #endif /* MIDORI_UTF8_H_INCLUDED */ diff --git a/midori/tests/CMakeLists.txt b/midori/tests/CMakeLists.txt index 9d5b9fb..8aa162d 100644 --- a/midori/tests/CMakeLists.txt +++ b/midori/tests/CMakeLists.txt @@ -41,6 +41,7 @@ set(TESTS utf8 lexer parser + regex_engine ) include_directories(${CMAKE_SOURCE_DIR}/src) diff --git a/midori/tests/parser.cpp b/midori/tests/parser.cpp index 80fa82c..72fe803 100644 --- a/midori/tests/parser.cpp +++ b/midori/tests/parser.cpp @@ -43,6 +43,26 @@ TEST_F(ParserTest, Recursion) { p.parse(&fis); } +TEST_F(ParserTest, Epsilon) { + ProductionHandler fn = [](MatchedNonterminal* m) -> std::unique_ptr { + (void) m; + return nullptr; + }; + Parser p; + p.add_token("A", std::unique_ptr(new RegexASTLiteral('a'))); + p.add_production("n", { "m", "n" }, fn); + p.add_production("n", {}, fn); + p.add_production("m", { "A" }, fn); + p.generate("n"); + std::stringstream ss; + ss << "aaa"; + FileInputStream fis(&ss); + p.parse(&fis); + p.reset(); + ss << ""; + p.parse(&fis); +} + TEST_F(ParserTest, RegexGroup) { ProductionHandler fn = [](MatchedNonterminal* m) -> std::unique_ptr { (void) m; diff --git a/midori/tests/regex_engine.cpp b/midori/tests/regex_engine.cpp new file mode 100644 index 0000000..cae31a4 --- /dev/null +++ b/midori/tests/regex_engine.cpp @@ -0,0 +1,34 @@ +#include "gtest/gtest.h" +#include "midori/lexer.h" +#include +#include "midori/regex_engine.h" + +class RegexEngineTest : public ::testing::Test { +}; + +TEST_F(RegexEngineTest, Dash) { + RegexEngine re; + Lexer l; + l.add_rule("a", re.parse("[-b-a-cd---]")); + l.generate(); + std::stringstream ss; + for (char ch = '-'; ch <= 'd'; ch++) { + ss << ch; + } + FileInputStream fis(&ss); + for (char ch = '-'; ch <= 'd'; ch++) { + std::unique_ptr t = l.scan(&fis); + ASSERT_EQ(t->tags.at(0), "a"); + } + ss.clear(); + ss << (char) ('-' - 1); + std::unique_ptr t = l.scan(&fis); + ASSERT_EQ(t->tags.at(0), Lexer::TOKEN_BAD); + ASSERT_EQ(t->lexeme, std::string(1, ('-' - 1))); + l.reset(); + ss.clear(); + ss << (char) ('d' + 1); + t = l.scan(&fis); + ASSERT_EQ(t->tags.at(0), Lexer::TOKEN_BAD); + ASSERT_EQ(t->lexeme, std::string(1, ('d' + 1))); +}