From 92a3501e97e4652813934ffef324b06f17696e14 Mon Sep 17 00:00:00 2001 From: Raekye Date: Mon, 22 Jul 2019 12:52:41 -0400 Subject: [PATCH] refactored and cleaned up a lot of code --- midori/src/main.cpp | 133 +++--- midori/src/midori/finite_automata.cpp | 12 + midori/src/midori/finite_automata.h | 1 + midori/src/midori/lexer.cpp | 46 +- midori/src/midori/lexer.h | 22 +- midori/src/midori/parser.cpp | 582 ++++---------------------- midori/src/midori/parser.h | 26 +- midori/tests/CMakeLists.txt | 1 + midori/tests/parser.cpp | 68 +++ 9 files changed, 258 insertions(+), 633 deletions(-) create mode 100644 midori/tests/parser.cpp diff --git a/midori/src/main.cpp b/midori/src/main.cpp index 45a82e0..ea8fb63 100644 --- a/midori/src/main.cpp +++ b/midori/src/main.cpp @@ -17,46 +17,21 @@ int test_regex_engine() { return 0; } -int test_parser() { - Parser p; - /* - p.add_token("A", std::unique_ptr(new RegexASTLiteral('a'))); - p.add_production("n", { "n", "n" }, [](MatchedNonterminal* m) -> std::unique_ptr { +int test_parser0() { + ProductionHandler fn = [](MatchedNonterminal* m) -> std::unique_ptr { (void) m; return nullptr; - }); - p.add_production("n", { "A" }, [](MatchedNonterminal* m) -> std::unique_ptr { - (void) m; - return nullptr; - }); - */ + }; + Parser p; p.add_token("EQUALS", std::unique_ptr(new RegexASTLiteral('='))); p.add_token("X", std::unique_ptr(new RegexASTLiteral('x'))); p.add_token("STAR", std::unique_ptr(new RegexASTLiteral('*'))); - p.add_production("s", { "n" }, [](MatchedNonterminal* m) -> std::unique_ptr { - (void) m; - return nullptr; - }); - p.add_production("n", { "v", "EQUALS", "e" }, [](MatchedNonterminal* m) -> std::unique_ptr { - (void) m; - return nullptr; - }); - p.add_production("n", { "e" }, [](MatchedNonterminal* m) -> std::unique_ptr { - (void) m; - return nullptr; - }); - p.add_production("e", { "v" }, [](MatchedNonterminal* m) -> std::unique_ptr { - (void) m; - return nullptr; - }); - p.add_production("v", { "X" }, [](MatchedNonterminal* m) -> std::unique_ptr { - (void) m; - return nullptr; - }); - p.add_production("v", { "STAR", "e" }, [](MatchedNonterminal* m) -> std::unique_ptr { - (void) m; - return nullptr; - }); + p.add_production("s", { "n" }, fn); + p.add_production("n", { "v", "EQUALS", "e" }, fn); + p.add_production("n", { "e" }, fn); + p.add_production("e", { "v" }, fn); + p.add_production("v", { "X" }, fn); + p.add_production("v", { "STAR", "e" }, fn); p.generate("s"); std::stringstream ss; ss << "*x=x"; @@ -65,63 +40,49 @@ int test_parser() { return 0; } -int test_regex_parser() { - /* - std::unique_ptr p = RegexParserGenerator::make(); - - for (int i = 0; i < 2; i++) { - std::stringstream ss; - ss << "a(bc){3,4}\\[|def[ghi\\t0-9\\-]+\\000000x2dz(\\x2d)d"; - //ss << "[x-zabc-f]"; - std::unique_ptr m = p->parse(&ss); - MatchedNonterminal* n = dynamic_cast(m.get()); - ParserRegexAST* r = dynamic_cast(n->value.get()); - RegexASTPrinter printer; - r->regex->accept(&printer); - } - */ +int test_parser1() { + ProductionHandler fn = [](MatchedNonterminal* m) -> std::unique_ptr { + (void) m; + return nullptr; + }; + Parser p; + p.add_token("A", std::unique_ptr(new RegexASTLiteral('a'))); + p.add_production("n", { "n", "n" }, fn); + p.add_production("n", { "A" }, fn); + p.generate("n"); + std::stringstream ss; + ss << "aaa"; + FileInputStream fis(&ss); + p.parse(&fis); return 0; } -/* -int test_parser() { - std::string indents = ""; - ProductionHandler fn = [&indents, &fn](Match* m) -> void { - std::stack s; - s.push(m); - while (s.size() > 0) { - m = 
s.top(); - s.pop(); - if (MatchedTerminal* mt = dynamic_cast(m)) { - mdk::logf("%s- terminal %s, %s\n", indents.c_str(), mt->token->tag.c_str(), mt->token->lexeme.c_str()); - } else if (MatchedNonterminal* mnt = dynamic_cast(m)) { - mdk::logf("%s- nonterminal ", indents.c_str()); - Parser::debug_production(mnt->production); - indents += " "; - for (std::unique_ptr& x : mnt->terms) { - fn(x.get()); - } - indents = indents.substr(0, indents.length() - 2); - } - } +int test_parser2() { + ProductionHandler fn = [](MatchedNonterminal* m) -> std::unique_ptr { + (void) m; + return nullptr; }; - Parser parser; - parser.set_start("s"); - parser.add_token("STAR", "\\*"); - parser.add_token("X", "x"); - parser.add_token("EQUALS", "="); - parser.add_production("s", { "n" }, fn); - parser.add_production("n", { "v", "EQUALS", "e" }, fn); - parser.add_production("n", { "e" }, fn); - parser.add_production("e", { "v" }, fn); - parser.add_production("v", { "X" }, fn); - parser.add_production("v", { "STAR", "e" }, fn); + Parser p; + p.add_token("LB", std::unique_ptr(new RegexASTLiteral('['))); + p.add_token("RB", std::unique_ptr(new RegexASTLiteral(']'))); + p.add_token("DASH", std::unique_ptr(new RegexASTLiteral('-'))); + p.add_token("ANY", std::unique_ptr(RegexASTGroup::make(true, { '[', '[', ']', ']' }))); + p.add_production("class", { "LB", "class_contents", "RB" }, fn); + p.add_production("class_contents", { "class_element" }, fn); + p.add_production("class_contents", { "class_element", "class_contents" }, fn); + p.add_production("class_element", { "literal" }, fn); + p.add_production("class_element", { "class_element", "DASH", "literal" }, fn); + p.add_production("literal", { "DASH" }, fn); + p.add_production("literal", { "ANY" }, fn); + p.generate("class"); std::stringstream ss; - ss << "x=*x"; - parser.parse(&ss); + ss << "[-a-c-d-]"; + FileInputStream fis(&ss); + p.parse(&fis); return 0; } +/* int test_generator() { std::fstream f("src/parser.txt", std::fstream::in); std::unique_ptr p = Parser::from_file(&f); @@ -163,8 +124,10 @@ int main() { ULong x = ~0; std::cout << "-1 is " << x << std::endl; test_interval_tree(); - test_parser(); test_regex_engine(); + test_parser0(); + test_parser2(); + test_parser1(); //test_generator(); return 0; } diff --git a/midori/src/midori/finite_automata.cpp b/midori/src/midori/finite_automata.cpp index 8158cdd..97f4d33 100644 --- a/midori/src/midori/finite_automata.cpp +++ b/midori/src/midori/finite_automata.cpp @@ -12,6 +12,18 @@ RegexDFAState::RegexDFAState(UInt id) : id(id) { std::memset(this->_transitions, 0, RegexDFAState::OPTIMIZED_CHARS * sizeof(RegexDFAState*)); } +RegexDFAState* RegexDFAState::next(UInt ch) { + if (ch < RegexDFAState::OPTIMIZED_CHARS) { + return this->_transitions[ch]; + } + std::unique_ptr l = this->transitions.find(Tree::Interval(ch, ch)); + assert(l->size() <= 1); + if (l->size() > 0) { + return l->front().second; + } + return nullptr; +} + RegexNFAState::RegexNFAState(UInt id) : id(id), terminal(false) { return; } diff --git a/midori/src/midori/finite_automata.h b/midori/src/midori/finite_automata.h index 1b49105..eb7e6ae 100644 --- a/midori/src/midori/finite_automata.h +++ b/midori/src/midori/finite_automata.h @@ -20,6 +20,7 @@ class RegexDFAState { Tree transitions; RegexDFAState(UInt); + RegexDFAState* next(UInt); }; class RegexNFAState { diff --git a/midori/src/midori/lexer.cpp b/midori/src/midori/lexer.cpp index e1f2ba4..22dd49a 100644 --- a/midori/src/midori/lexer.cpp +++ b/midori/src/midori/lexer.cpp @@ -2,6 +2,9 @@ #include 
#include "utf8.h" +std::string const Lexer::TOKEN_END = "$END"; +std::string const Lexer::TOKEN_BAD = "$BAD"; + IInputStream::~IInputStream() { return; } @@ -13,7 +16,10 @@ FileInputStream::FileInputStream(std::istream* file) : file(file) { Long FileInputStream::get() { UInt ch = this->file->get(); if (!this->file->good()) { - return -1; + if (this->file->eof()) { + return Lexer::CHAR_EOF; + } + return Lexer::CHAR_BAD; } return ch; } @@ -24,12 +30,12 @@ VectorInputStream::VectorInputStream(std::vector v) : v(v), pos(0) { Long VectorInputStream::get() { if (this->pos >= this->v.size()) { - return -1; + return Lexer::CHAR_EOF; } return this->v.at(this->pos++); } -Lexer::Lexer() : current_state(nullptr), buffer_pos(0) { +Lexer::Lexer() : current_state(nullptr), buffer_pos(0), location(0, 0) { return; } @@ -131,6 +137,8 @@ void Lexer::generate() { void Lexer::reset() { this->buffer.clear(); this->buffer_pos = 0; + this->location.line = 0; + this->location.column = 0; } void Lexer::add_rule(std::string rule, std::unique_ptr regex) { @@ -144,6 +152,7 @@ std::unique_ptr Lexer::scan(IInputStream* in) { std::vector matched_tags; std::string matched_str = ""; UInt matched_buffer_pos = this->buffer_pos; + LocationInfo matched_location = this->location; std::string found_buffer = ""; std::unique_ptr t; while (true) { @@ -152,24 +161,34 @@ std::unique_ptr Lexer::scan(IInputStream* in) { matched_tags = this->current_state->terminals; matched_str.append(found_buffer); matched_buffer_pos = this->buffer_pos; + matched_location = this->location; found_buffer = ""; } Long ch = this->read(in); - RegexDFAState* next = nullptr; - if (ch >= 0) { - if (ch < RegexDFAState::OPTIMIZED_CHARS) { - next = this->current_state->_transitions[ch]; - } else { - std::unique_ptr l = this->current_state->transitions.find(RegexDFAState::Tree::Interval(ch, ch)); - assert(l->size() <= 1); - if (l->size() > 0) { - next = l->front().second; + if (ch == '\n') { + this->location.line++; + this->location.column = 0; + } else { + this->location.column++; + } + if (ch < 0) { + if (matched) { + t.reset(new Token(matched_tags, matched_str, matched_location)); + break; + } + if (ch == Lexer::CHAR_EOF) { + if (this->buffer_pos == matched_buffer_pos) { + t.reset(new Token({ Lexer::TOKEN_END }, "", this->location)); + break; } } + t.reset(new Token({ Lexer::TOKEN_BAD }, found_buffer, matched_location)); + break; } + RegexDFAState* next = this->current_state->next(ch); if (next == nullptr) { if (matched) { - t.reset(new Token(matched_tags, matched_str, LocationInfo(0, 0))); + t.reset(new Token(matched_tags, matched_str, matched_location)); } break; } @@ -178,6 +197,7 @@ std::unique_ptr Lexer::scan(IInputStream* in) { this->current_state = next; } this->buffer_pos = matched_buffer_pos; + this->location = matched_location; return t; } diff --git a/midori/src/midori/lexer.h b/midori/src/midori/lexer.h index e906817..2ef99b8 100644 --- a/midori/src/midori/lexer.h +++ b/midori/src/midori/lexer.h @@ -54,6 +54,19 @@ class VectorInputStream : public IInputStream { }; class Lexer { +public: + static std::string const TOKEN_END; + static std::string const TOKEN_BAD; + static const Long CHAR_EOF = -1; + static const Long CHAR_BAD = -2; + + Lexer(); + + void add_rule(std::string, std::unique_ptr); + std::unique_ptr scan(IInputStream*); + void generate(); + void reset(); + private: std::vector rules; std::vector> rules_regex; @@ -64,14 +77,9 @@ class Lexer { std::vector buffer; UInt buffer_pos; - Long read(IInputStream*); -public: - Lexer(); + LocationInfo 
location; - void add_rule(std::string, std::unique_ptr); - std::unique_ptr scan(IInputStream*); - void generate(); - void reset(); + Long read(IInputStream*); }; #endif /* MIDORI_LEXER_H_INCLUDED */ diff --git a/midori/src/midori/parser.cpp b/midori/src/midori/parser.cpp index 29a32df..206f23e 100644 --- a/midori/src/midori/parser.cpp +++ b/midori/src/midori/parser.cpp @@ -8,8 +8,7 @@ */ std::string const Parser::ROOT = "$root"; -std::string const Parser::END = "$"; -std::string const Parser::EPSILON = "0"; +//std::string const Parser::EPSILON = "0"; ParserAST::~ParserAST() { return; @@ -33,47 +32,19 @@ void Parser::add_token(std::string tag, std::unique_ptr regex) { void Parser::add_production(std::string target, std::vector symbols, ProductionHandler handler) { std::unique_ptr p(new Production); p->target = target; - p->symbols = symbols.empty() ? std::vector{ Parser::EPSILON } : symbols; + p->symbols = symbols; p->handler = handler; this->nonterminals[target].push_back(p.get()); this->productions.push_back(std::move(p)); } void Parser::generate(std::string start) { - //this->start = Parser::ROOT; - this->start = start; - /* - this->add_production(Parser::ROOT, { start, Parser::END }, [](MatchedNonterminal* m) -> std::unique_ptr { - + this->add_production(Parser::ROOT, { start, Lexer::TOKEN_END }, [](MatchedNonterminal* m) -> std::unique_ptr { + return std::move(m->value); }); - */ this->generate_first_sets(); this->generate_follow_sets(); this->generate_itemsets(); - /* - this->start = symbol; - std::set head; - for (Production* p : this->nonterminals.at(symbol)) { - head.insert(Item(p, 0)); - } - this->generate_itemset(head); - - std::cout << std::endl; - - std::cout << "=== Generating extended grammar" << std::endl; - this->generate_extended_grammar(); - std::cout << "=== Done extended grammar" << std::endl; - std::cout << std::endl; - - std::cout << "=== Generating extended sets" << std::endl; - this->generate_extended_first_sets(); - this->generate_extended_follow_sets(); - std::cout << "=== Done extended sets" << std::endl; - std::cout << std::endl; - - this->generate_reductions(); - */ - this->lexer.generate(); Parser::debug(this); @@ -84,131 +55,19 @@ std::unique_ptr Parser::parse(IInputStream* in) { this->reset(); std::cout << std::endl << "===== Parsing" << std::endl; this->parse_stack.push(0); - /* - Symbol last_reduction = ""; while (true) { - mdk::printf("[debug] parse at state %d\n", this->parse_stack.top()); - std::unique_ptr& current_state = this->states.at(this->parse_stack.top()); - bool accept = false; - for (const Item& item : current_state->head) { - mdk::printf("[debug] checking head "); - Parser::debug_item(item); - if (item.first->target == this->start && Parser::item_is_done(item)) { - accept = true; - break; - } - } - if (accept) { - std::cout << "Accepted." << std::endl; - //break; - } - - mdk::printf("[debug] last reduction is {%s}\n", last_reduction.c_str()); - Symbol lr2 = last_reduction; - last_reduction = ""; - if (lr2.length() > 0) { - std::map::iterator it = current_state->next.find(lr2); - if (it != current_state->next.end()) { - std::cout << "goto " << it->second->index << std::endl; - this->parse_stack.push(it->second->index); - continue; - } else { - // TODO: what? - std::cout << "No next state" << std::endl; - break; - } - } - std::unique_ptr t = this->next_token(in); - if (t == nullptr) { - std::cout << "Got null token, state " << current_state->index << std::endl; - // TODO: refactor? 
- std::map reduction_row = this->reductions.at(current_state->index); - if (reduction_row.find(Parser::END) != reduction_row.end()) { - std::cout << "Reducing via end" << std::endl; - t.reset(new Token({ Parser::END }, "", LocationInfo(0, 0))); - } else { - if (current_state->next.find(Parser::EPSILON) != current_state->next.end()) { - mdk::printf("[debug] Reducing via epsilon\n"); - t.reset(new Token({ Parser::EPSILON }, "", LocationInfo(0, 0))); - } else { - std::cout << "Breaking." << std::endl; - break; - } - } - } - std::cout << "Got token"; - for (std::string const& tag : t->tags) { - std::cout << " " << tag; - } - std::cout << ": " << t->lexeme << std::endl; - std::map reduction_row = this->reductions.at(current_state->index); - bool found = false; - for (std::string const& tag : t->tags) { - mdk::printf("[debug] getting shifts at %d, tag is %s\n", current_state->index, tag.c_str()); - std::map::iterator it = current_state->next.find(tag); - if (it != current_state->next.end()) { - std::cout << "shift " << it->second->index << std::endl; - this->parse_stack.push(it->second->index); - this->parse_stack_matches.push(std::unique_ptr(new MatchedTerminal(std::move(t)))); - found = true; - break; - } - mdk::printf("[debug] getting reductions at %d, tag is %s\n", current_state->index, tag.c_str()); - std::map::iterator it2 = reduction_row.find(tag); - if (it2 != reduction_row.end()) { - std::cout << "reducing via rule "; - Parser::debug_production(it2->second); - std::unique_ptr mnt(new MatchedNonterminal(it2->second)); - for (size_t i = 0; i < it2->second->symbols.size(); i++) { - this->parse_stack.pop(); - std::unique_ptr m = std::move(this->parse_stack_matches.top()); - this->parse_stack_matches.pop(); - //std::cout << "Removing entry "; - //Parser::debug_match(m.get(), 0); - mnt->terms[it2->second->symbols.size() - i - 1] = std::move(m); - } - mnt->value = it2->second->handler(mnt.get()); - this->parse_stack_matches.push(std::move(mnt)); - last_reduction = it2->second->target; - if (tag != Parser::EPSILON) { - mdk::printf("[debug] pushing token %s\n", tag.c_str()); - this->push_token(std::move(t)); - } - found = true; - break; - } - } - if (!found) { - // TODO: what is this? 
- if (current_state->next.find(Parser::EPSILON) != current_state->next.end()) { - mdk::printf("[debug] shift has epsilon\n"); - this->push_token(std::move(t)); - this->push_token(std::unique_ptr(new Token({ Parser::EPSILON }, "", LocationInfo(0, 0)))); - } else if (reduction_row.find(Parser::EPSILON) != reduction_row.end()) { - mdk::printf("[debug] reduction has epsilon\n"); - this->push_token(std::move(t)); - this->push_token(std::unique_ptr(new Token({ Parser::EPSILON }, "", LocationInfo(0, 0)))); - } else { - std::cout << "No reduction" << std::endl; - } - } - } - */ - while (true) { - std::unique_ptr t = this->next_token(in); - if (t == nullptr) { + if (t->tags.at(0) == Lexer::TOKEN_BAD) { + std::cout << "Bad token" << std::endl; break; } if (this->parse_advance(std::move(t))) { break; } } - if (this->parse_advance(std::unique_ptr(new Token({ Parser::END }, "", LocationInfo(0, 0))))) { - } mdk::printf("[debug] parse stack size %zd, stack matches %zd\n", this->parse_stack.size(), this->parse_stack_matches.size()); Parser::debug_match(this->parse_stack_matches.top().get(), 0); - assert(this->parse_stack.size() == 1); + assert(this->parse_stack.size() == 2); assert(this->parse_stack_matches.size() == 1); std::unique_ptr ret = std::move(this->parse_stack_matches.top()); this->parse_stack_matches.pop(); @@ -228,140 +87,59 @@ void Parser::reset() { #pragma mark - Parser - private bool Parser::parse_advance(std::unique_ptr t) { - while (true) { - mdk::printf("[debug] parse at state %d, size %zd\n", this->parse_stack.top(), this->parse_stack.size()); - std::cout << "Got token"; - for (std::string const& tag : t->tags) { - std::cout << " " << tag; - } - std::cout << ": " << t->lexeme << std::endl; - ItemSet* curr = this->current_state(); - size_t i = 0; - for (std::string const& tag : t->tags) { - std::cout << "Trying tag " << tag << std::endl; - std::map::iterator next_shift = curr->next.find(tag); - std::map::iterator next_reduction = curr->reductions.find(tag); - if (next_shift != curr->next.end()) { - if (next_reduction != curr->reductions.end()) { - std::cout << "Shift reduce conflict" << std::endl; - } - std::cout << "shifting to " << next_shift->second->index << std::endl; - this->parse_stack.push(next_shift->second->index); - this->parse_stack_matches.push(std::unique_ptr(new MatchedTerminal(std::move(t)))); - return false; - } + mdk::printf("[debug] parse at state %d, size %zd\n", this->parse_stack.top(), this->parse_stack.size()); + std::cout << "Got token"; + for (std::string const& tag : t->tags) { + std::cout << " " << tag; + } + std::cout << ": " << t->lexeme << std::endl; + ItemSet* curr = this->current_state(); + for (std::string const& tag : t->tags) { + std::cout << "Trying tag " << tag << std::endl; + std::map::iterator next_shift = curr->next.find(tag); + std::map::iterator next_reduction = curr->reductions.find(tag); + if (next_shift != curr->next.end()) { if (next_reduction != curr->reductions.end()) { - std::cout << "reducing via rule "; - Parser::debug_production(next_reduction->second); - std::unique_ptr mnt(new MatchedNonterminal(next_reduction->second)); - for (size_t i = 0; i < next_reduction->second->symbols.size(); i++) { - this->parse_stack.pop(); - std::unique_ptr m = std::move(this->parse_stack_matches.top()); - this->parse_stack_matches.pop(); - //std::cout << "Removing entry "; - //Parser::debug_match(m.get(), 0); - mnt->terms[next_reduction->second->symbols.size() - i - 1] = std::move(m); - } - mnt->value = next_reduction->second->handler(mnt.get()); - // 
TODO: why no cast? - this->parse_stack_matches.push(std::move(mnt)); - std::cout << "getting shift from state " << this->parse_stack.top() << " size " << this->parse_stack.size() << std::endl; - ItemSet* reduce_state = this->current_state(); - std::map::iterator reduction_shift = reduce_state->next.find(next_reduction->second->target); - if (reduction_shift == reduce_state->next.end()) { - return true; - } - std::cout << "reduction shifting to " << reduction_shift->second->index << std::endl; - this->parse_stack.push(reduction_shift->second->index); - break; + std::cout << "Shift reduce conflict" << std::endl; } - i++; - } - if (i == t->tags.size()) { - std::cout << "no rules" << std::endl; - break; - } - } - return true; -} - -ItemSet* Parser::generate_itemset(std::set head) { - std::cout << "=== Generating head:" << std::endl; - for (const Item& x : head) { - std::cout << "\t"; - Parser::debug_item(x); - } - std::cout << "=== done head "; - std::map, ItemSet*>::iterator it = this->itemsets.find(head); - if (it != this->itemsets.end()) { - std::cout << "(exists)" << std::endl; - return it->second; - } - std::cout << "(new)" << std::endl; - std::unique_ptr is(new ItemSet); - is->head = head; - is->index = this->states.size(); - ItemSet* ret = is.get(); - this->itemsets[head] = ret; - this->states.push_back(std::move(is)); - std::set encountered_terminals; - for (const Item& item : head) { - if (Parser::item_is_done(item)) { - continue; - } - std::string next_symbol = item.first->symbols.at(item.second); - this->expand_symbol_into_itemset(ret, next_symbol, &encountered_terminals); - } - - std::set combined(ret->head); - combined.insert(ret->additionals.begin(), ret->additionals.end()); - //assert(combined.size() == ret->head.size() + ret->additionals.size()); - for (const Item& item : combined) { - if (Parser::item_is_done(item)) { - continue; - } - std::string next_symbol = item.first->symbols.at(item.second); - if (ret->next.find(next_symbol) == ret->next.end()) { - std::set h2; - for (const Item& i2 : combined) { - if (Parser::item_is_done(i2)) { - continue; - } - std::string ns2 = i2.first->symbols.at(i2.second); - if (ns2 == next_symbol) { - h2.insert(Item(i2.first, i2.second + 1)); - } + if (tag == Lexer::TOKEN_END) { + return true; } - ret->next[next_symbol] = this->generate_itemset(h2); - } - } - return ret; -} - -void Parser::expand_symbol_into_itemset(ItemSet* is, std::string symbol, std::set* encountered_terminals) { - if (Parser::symbol_is_token(symbol) || Parser::symbol_is_epsilon(symbol)) { - std::cout << "=== expand symbol into itemset: token " << symbol << std::endl; - return; - } else { - // if new symbol was inserted - if (encountered_terminals->insert(symbol).second) { - std::cout << "=== expand symbol into itemset: new nonterminal " << symbol << std::endl; - for (Production* p : this->nonterminals.at(symbol)) { - std::cout << "\t"; - Parser::debug_production(p); - is->additionals.insert(Item(p, 0)); - this->expand_symbol_into_itemset(is, p->symbols.front(), encountered_terminals); + std::cout << "shifting to " << next_shift->second->index << std::endl; + this->parse_stack.push(next_shift->second->index); + this->parse_stack_matches.push(std::unique_ptr(new MatchedTerminal(std::move(t)))); + return false; + } + if (next_reduction != curr->reductions.end()) { + std::cout << "reducing via rule "; + Parser::debug_production(next_reduction->second); + std::unique_ptr mnt(new MatchedNonterminal(next_reduction->second)); + for (size_t i = 0; i < 
next_reduction->second->symbols.size(); i++) { + this->parse_stack.pop(); + std::unique_ptr m = std::move(this->parse_stack_matches.top()); + this->parse_stack_matches.pop(); + mnt->terms[next_reduction->second->symbols.size() - i - 1] = std::move(m); } - } else { - std::cout << "=== expand symbol into itemset: existing nonterminal " << symbol << std::endl; + mnt->value = next_reduction->second->handler(mnt.get()); + // TODO: why no cast? + this->parse_stack_matches.push(std::move(mnt)); + std::cout << "getting shift from state " << this->parse_stack.top() << " size " << this->parse_stack.size() << std::endl; + ItemSet* reduce_state = this->current_state(); + std::map::iterator reduction_shift = reduce_state->next.find(next_reduction->second->target); + assert(reduction_shift != reduce_state->next.end()); + std::cout << "reduction shifting to " << reduction_shift->second->index << std::endl; + this->parse_stack.push(reduction_shift->second->index); + this->push_token(std::move(t)); + return false; } } + return true; } void Parser::generate_itemsets() { assert(this->states.size() == 0); std::unique_ptr start(new ItemSet); - for (Production* p : this->nonterminals.at(this->start)) { + for (Production* p : this->nonterminals.at(Parser::ROOT)) { start->head.insert(Item(p, 0)); } std::list q; @@ -424,45 +202,11 @@ void Parser::generate_closure(std::set* kernel, std::set* closure) { } } -void Parser::generate_extended_grammar() { - Int num = 0; - auto fn = [&num](Parser* self, Item item, ItemSet* is) -> void { - if (item.second != 0) { - return; - } - std::cout << num << ". "; - Parser::debug_item(item); - Symbol target = item.first->target; - std::unique_ptr ep(new ExtendedProduction); - ep->target = ExtendedSymbol(target, is->index, (target == self->start) ? 
-1 : is->next.at(target)->index); - ep->orig = item.first; - Int i = 0; - ItemSet* curr = is; - for (std::string& s : item.first->symbols) { - ItemSet* next = curr->next.at(s); - ep->symbols.push_back(ExtendedSymbol(s, curr->index, next->index)); - curr = next; - i++; - } - Parser::debug_extended_production(ep.get()); - self->extended_nonterminals[ep->target].push_back(ep.get()); - self->extended_grammar.push_back(std::move(ep)); - num++; - }; - for (auto& kv : this->itemsets) { - for (auto& item : kv.first) { - fn(this, item, kv.second); - } - for (auto& item : kv.second->additionals) { - fn(this, item, kv.second); - } - } -} - /* * Dragon book page 221 */ void Parser::generate_first_sets() { + this->firsts[Lexer::TOKEN_END].insert(Lexer::TOKEN_END); for (std::string const& s : this->terminals) { this->firsts[s].insert(s); } @@ -474,14 +218,19 @@ void Parser::generate_first_sets() { Parser::debug_production(p.get()); std::set& f = this->firsts[p->target]; if (Parser::production_is_epsilon(p.get())) { - changed = changed || f.insert(Parser::EPSILON).second; - this->nullable.insert(p->target); + //changed = changed || f.insert(Parser::EPSILON).second; + changed = changed || this->nullable.insert(p->target).second; continue; } size_t old = f.size(); size_t i = 0; for (std::string const& s : p->symbols) { std::set& f2 = this->firsts[s]; + f.insert(f2.begin(), f2.end()); + if (this->nullable.find(s) == this->nullable.end()) { + break; + } + /* bool nullable = false; for (std::string const& s2 : f2) { if (Parser::symbol_is_epsilon(s2)) { @@ -493,29 +242,19 @@ void Parser::generate_first_sets() { if (!nullable) { break; } + */ i++; } if (i == p->symbols.size()) { - f.insert(Parser::EPSILON); - this->nullable.insert(p->target); + //f.insert(Parser::EPSILON); + changed = changed || this->nullable.insert(p->target).second; } changed = changed || (f.size() != old); } } - std::cout << "=== First sets" << std::endl; - for (auto const& kv : this->firsts) { - std::cout << "\t" << kv.first << ":"; - for (auto const& s : kv.second) { - std::cout << " " << s; - } - std::cout << std::endl; - } - std::cout << "=== End first sets" << std::endl; - std::cout << std::endl; } void Parser::generate_follow_sets() { - this->follows[this->start].insert(Parser::END); bool changed = true; while (changed) { changed = false; @@ -539,9 +278,9 @@ void Parser::generate_follow_sets() { for (size_t j = i; j < p->symbols.size(); j++) { std::string s = p->symbols.at(j); for (std::string const& s2 : this->firsts[s]) { - if (!Parser::symbol_is_epsilon(s)) { + //if (!Parser::symbol_is_epsilon(s)) { changed = changed || f3.insert(s2).second; - } + //} } if (this->nullable.find(s) == this->nullable.end()) { break; @@ -552,158 +291,6 @@ void Parser::generate_follow_sets() { } } } - std::cout << "=== Follow sets" << std::endl; - for (auto const& kv : this->follows) { - std::cout << "\t" << kv.first << ":"; - for (auto const& s : kv.second) { - std::cout << " " << s; - } - std::cout << std::endl; - } - std::cout << "=== End follow sets" << std::endl; -} - -void Parser::generate_extended_first_sets() { - for (const std::unique_ptr& ep : this->extended_grammar) { - for (ExtendedSymbol const& es : ep->symbols) { - Symbol s = std::get<0>(es); - if (Parser::symbol_is_token(s) || Parser::symbol_is_epsilon(s)) { - this->extended_firsts[es].insert(s); - } - } - } - bool changed = true; - while (changed) { - changed = false; - for (const std::unique_ptr& ep : this->extended_grammar) { - mdk::printf("[debug] generating first set for\n"); - 
Parser::debug_extended_production(ep.get()); - if (Parser::production_is_epsilon(ep->orig)) { - changed = changed || this->extended_firsts[ep->target].insert(Parser::EPSILON).second; - } else { - size_t old = this->extended_firsts[ep->target].size(); - // if `FIRST(ep->target)` contains `epsilon`, make sure we don't remove it later - std::set::iterator it = this->extended_firsts[ep->target].find(Parser::EPSILON); - for (const ExtendedSymbol& es : ep->symbols) { - /* - Symbol s = std::get<0>(es); - if (Parser::symbol_is_token(s)) { - changed = changed || this->extended_firsts[es].insert(s).second; - this->extended_firsts[ep->target].insert(s); - } else { - */ - // TODO: there may be a bug here regarding nullable productions - this->extended_firsts[ep->target].insert(this->extended_firsts[es].begin(), this->extended_firsts[es].end()); - //mdk::printf("[debug] inserting %s to %s is: ", s.c_str(), std::get<0>(ep->target).c_str()); - Parser::debug_set(this->extended_firsts[ep->target]); - if (this->extended_firsts[es].find(Parser::EPSILON) == this->extended_firsts[es].end()) { - if (it == this->extended_firsts[ep->target].end()) { - this->extended_firsts[ep->target].erase(Parser::EPSILON); - } - break; - } - //} - } - changed = changed || (this->extended_firsts[ep->target].size() != old); - } - } - } -} - -void Parser::generate_extended_follow_sets() { - bool changed = true; - while (changed) { - changed = false; - for (auto& kv : this->extended_nonterminals) { - std::cout << "\t- generating extended follow "; - Parser::debug_extended_symbol(kv.first); - std::cout << std::endl; - if (std::get<2>(kv.first) == -1) { - this->extended_follows[kv.first].insert(Parser::END); - } - for (ExtendedProduction* ep : kv.second) { - if (Parser::production_is_epsilon(ep->orig)) { - continue; - } - size_t len = ep->symbols.size(); - assert(len > 0); - ExtendedSymbol z = ep->symbols.at(len - 1); - if (!Parser::symbol_is_token(std::get<0>(z))) { - size_t old = this->extended_follows[z].size(); - this->extended_follows[z].insert(this->extended_follows[ep->target].begin(), this->extended_follows[ep->target].end()); - changed = changed || (this->extended_follows[z].size() != old); - } - for (size_t i = 0; i < len - 1; i++) { - ExtendedSymbol x = ep->symbols.at(i); - if (!Parser::symbol_is_token(std::get<0>(x))) { - ExtendedSymbol y = ep->symbols.at(i + 1); - size_t old = this->extended_follows[x].size(); - std::cout << "\t\t- adding to "; - Parser::debug_extended_symbol(x); - std::cout << " from "; - Parser::debug_extended_symbol(y); - std::cout << std::endl; - this->extended_follows[x].insert(this->extended_firsts[y].begin(), this->extended_firsts[y].end()); - changed = changed || (this->extended_follows[x].size() != old); - } - } - } - } - } -} - -void Parser::generate_reductions() { - this->reductions.resize(this->itemsets.size()); - std::map lookup; - for (size_t i = 0; i < this->extended_grammar.size(); i++) { - std::unique_ptr& p1 = this->extended_grammar.at(i); - Int final_set = std::get<2>(p1->symbols.back()); - - std::map::iterator it = lookup.find(final_set); - if (it == lookup.end()) { - lookup[final_set] = p1->orig; - } else { - assert(it->second == p1->orig); - } - - for (const Symbol& s : this->extended_follows.at(p1->target)) { - std::map::iterator it = this->reductions.at(final_set).find(s); - if (it == this->reductions.at(final_set).end()) { - this->reductions.at(final_set)[s] = p1->orig; - mdk::printf("[debug] creating reduction from %d to %s from %s\n", final_set, p1->orig->target.c_str(), 
s.c_str()); - } else { - assert(it->second == p1->orig); - } - } - - // God knows what this is and why it's commented out - /* - for (size_t j = i + 1; j < this->extended_grammar.size(); j++) { - std::unique_ptr& p2 = this->extended_grammar.at(j); - if (p1->orig == p2->orig && final_set == std::get<2>(p2->symbols.back())) { - for (const Symbol& s : this->extended_follows.at(p1->target)) { - std::map::iterator it = this->reductions.at(final_set).find(s); - if (it == this->reductions.at(final_set).end()) { - this->reductions.at(final_set)[s] = p1->orig; - } else { - assert(it->second == p1->orig); - } - } - } - } - */ - } - std::cout << "===== Reductions" << std::endl; - Int i = 0; - for (const std::map& col : this->reductions) { - std::cout << "=== ItemSet " << i << std::endl; - for (auto& kv : col) { - std::cout << "- " << kv.first << ": "; - Parser::debug_production(kv.second); - } - std::cout << std::endl; - i++; - } } void Parser::push_token(std::unique_ptr t) { @@ -721,16 +308,17 @@ std::unique_ptr Parser::next_token(IInputStream* in) { bool Parser::symbol_is_token(std::string str) { UInt ch = str[0]; - return ('A' <= ch) && (ch <= 'Z'); + return (('A' <= ch) && (ch <= 'Z')) || (str == Lexer::TOKEN_END); } +/* bool Parser::symbol_is_epsilon(std::string str) { return str == Parser::EPSILON; } +*/ bool Parser::production_is_epsilon(Production* p) { return p->symbols.size() == 0; - //return p->symbols.size() == 1 && Parser::symbol_is_epsilon(p->symbols.front()); } bool Parser::item_is_done(Item item) { @@ -761,22 +349,6 @@ void Parser::debug_item(Item item) { Parser::debug_production(item.first, (Int) item.second); } -void Parser::debug_extended_symbol(ExtendedSymbol es) { - //std::cout << std::get<1>(es) << "_" << std::get<0>(es) << "_" << std::get<2>(es); - std::cout << std::get<0>(es) << "(" << std::get<1>(es) << ", " << std::get<2>(es) << ")"; -} - -void Parser::debug_extended_production(ExtendedProduction* ep) { - std::cout << "\t"; - Parser::debug_extended_symbol(ep->target); - std::cout << " ::="; - for (auto& es : ep->symbols) { - std::cout << " "; - Parser::debug_extended_symbol(es); - } - std::cout << std::endl; -} - void Parser::debug_set(std::set s) { std::cout << "{"; for (auto& str : s) { @@ -810,6 +382,16 @@ void Parser::debug_match(Match* m, Int levels) { void Parser::debug(Parser* p) { std::cout << "===== Hmmmmm" << std::endl; + std::cout << "=== Firsts" << std::endl; + for (auto& kv : p->firsts) { + std::cout << "\t" << kv.first << ": "; + Parser::debug_set(kv.second); + } + std::cout << "=== Follows" << std::endl; + for (auto& kv : p->follows) { + std::cout << "\t" << kv.first << ": "; + Parser::debug_set(kv.second); + } for (auto& is : p->states) { std::cout << "=== Item Set " << is->index << std::endl; std::cout << "Head:" << std::endl; @@ -834,18 +416,4 @@ void Parser::debug(Parser* p) { std::cout << "=== done " << is->index << std::endl; std::cout << std::endl; } - std::cout << "=== Extended firsts" << std::endl; - for (auto& kv : p->extended_firsts) { - std::cout << "\t"; - Parser::debug_extended_symbol(kv.first); - std::cout << ": "; - Parser::debug_set(kv.second); - } - std::cout << "=== Extended follows" << std::endl; - for (auto& kv : p->extended_follows) { - std::cout << "\t"; - Parser::debug_extended_symbol(kv.first); - std::cout << ": "; - Parser::debug_set(kv.second); - } } diff --git a/midori/src/midori/parser.h b/midori/src/midori/parser.h index 743e499..a67e415 100644 --- a/midori/src/midori/parser.h +++ b/midori/src/midori/parser.h @@ -88,11 +88,10 @@ 
class Parser {
 private:
 	static std::string const ROOT;
-	static std::string const END;
-	static std::string const EPSILON;
+	//static std::string const EPSILON;
 	static bool symbol_is_token(std::string);
-	static bool symbol_is_epsilon(std::string);
+	//static bool symbol_is_epsilon(std::string);
 	static bool production_is_epsilon(Production*);
 	static bool item_is_done(Item);
@@ -102,7 +101,6 @@ class Parser {
 	std::set<std::string> terminals;
 	std::map<std::string, std::vector<Production*>> nonterminals;
 	std::vector<std::unique_ptr<Production>> productions;
-	std::string start;
 	std::vector<std::unique_ptr<ItemSet>> states;
 	std::map<std::set<Item>, ItemSet*> itemsets;
@@ -111,12 +109,6 @@ class Parser {
 	std::map<std::string, std::set<std::string>> firsts;
 	std::map<std::string, std::set<std::string>> follows;
-	std::vector<std::unique_ptr<ExtendedProduction>> extended_grammar;
-	std::map<ExtendedSymbol, std::vector<ExtendedProduction*>> extended_nonterminals;
-	std::map<ExtendedSymbol, std::set<std::string>> extended_firsts;
-	std::map<ExtendedSymbol, std::set<std::string>> extended_follows;
-
-	std::vector<std::map<std::string, Production*>> reductions;
 	std::stack parse_stack;
 	std::stack<std::unique_ptr<Match>> parse_stack_matches;
@@ -129,25 +121,17 @@ class Parser {
 	bool parse_advance(std::unique_ptr<Token>);
+	void generate_first_sets();
+	void generate_follow_sets();
+
 	ItemSet* generate_itemset(std::set<Item>);
 	void expand_symbol_into_itemset(ItemSet*, std::string, std::set<std::string>*);
 	void generate_itemsets();
 	void generate_closure(std::set<Item>*, std::set<Item>*);
-	void generate_first_sets();
-	void generate_follow_sets();
-
-	void generate_extended_grammar();
-	void generate_extended_first_sets();
-	void generate_extended_follow_sets();
-
-	void generate_reductions();
 	static void debug(Parser*);
 	static void debug_production(Production*, Int = -1);
 	static void debug_item(Item);
-	static void debug_extended_symbol(ExtendedSymbol);
-	static void debug_extended_production(ExtendedProduction*);
 	static void debug_set(std::set<std::string>);
 	static void debug_match(Match*, Int);
 };
diff --git a/midori/tests/CMakeLists.txt b/midori/tests/CMakeLists.txt
index 9f6cdd4..9d5b9fb 100644
--- a/midori/tests/CMakeLists.txt
+++ b/midori/tests/CMakeLists.txt
@@ -40,6 +40,7 @@ set(TESTS
 	finite_automata
 	utf8
 	lexer
+	parser
 )
 include_directories(${CMAKE_SOURCE_DIR}/src)
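Reviewer note: every ProductionHandler in the new tests below simply returns nullptr, so the value channel that parse_advance threads through each reduction (mnt->value = next_reduction->second->handler(mnt.get())) is never exercised. For reference, a handler that actually builds a value could look roughly like the sketch below. It is not part of the patch: NumberAST is a hypothetical ParserAST subclass (assuming ParserAST can be subclassed directly), the "NUM" token is made up, and the member accesses (terms, token, lexeme) only follow the names used elsewhere in this diff.

	// Sketch only. NumberAST is a hypothetical ParserAST subclass for illustration.
	class NumberAST : public ParserAST {
	public:
		long value;
		NumberAST(long v) : value(v) {}
	};

	// Handler for a hypothetical production such as "lit" ::= "NUM": unwrap the
	// matched terminal and turn its lexeme into an AST node; parse_advance then
	// stores the returned value on the MatchedNonterminal for enclosing handlers.
	ProductionHandler make_number = [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
		MatchedTerminal* t = dynamic_cast<MatchedTerminal*>(m->terms[0].get());
		return std::unique_ptr<ParserAST>(new NumberAST(std::stol(t->token->lexeme)));
	};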
diff --git a/midori/tests/parser.cpp b/midori/tests/parser.cpp
new file mode 100644
index 0000000..80fa82c
--- /dev/null
+++ b/midori/tests/parser.cpp
@@ -0,0 +1,68 @@
+#include "gtest/gtest.h"
+#include "midori/parser.h"
+#include 
+
+class ParserTest : public ::testing::Test {
+};
+
+TEST_F(ParserTest, Basic) {
+	ProductionHandler fn = [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
+		(void) m;
+		return nullptr;
+	};
+	Parser p;
+	p.add_token("EQUALS", std::unique_ptr<RegexAST>(new RegexASTLiteral('=')));
+	p.add_token("X", std::unique_ptr<RegexAST>(new RegexASTLiteral('x')));
+	p.add_token("STAR", std::unique_ptr<RegexAST>(new RegexASTLiteral('*')));
+	p.add_production("s", { "n" }, fn);
+	p.add_production("n", { "v", "EQUALS", "e" }, fn);
+	p.add_production("n", { "e" }, fn);
+	p.add_production("e", { "v" }, fn);
+	p.add_production("v", { "X" }, fn);
+	p.add_production("v", { "STAR", "e" }, fn);
+	p.generate("s");
+	std::stringstream ss;
+	ss << "*x=x";
+	FileInputStream fis(&ss);
+	p.parse(&fis);
+}
+
+TEST_F(ParserTest, Recursion) {
+	ProductionHandler fn = [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
+		(void) m;
+		return nullptr;
+	};
+	Parser p;
+	p.add_token("A", std::unique_ptr<RegexAST>(new RegexASTLiteral('a')));
+	p.add_production("n", { "n", "n" }, fn);
+	p.add_production("n", { "A" }, fn);
+	p.generate("n");
+	std::stringstream ss;
+	ss << "aaa";
+	FileInputStream fis(&ss);
+	p.parse(&fis);
+}
+
+TEST_F(ParserTest, RegexGroup) {
+	ProductionHandler fn = [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
+		(void) m;
+		return nullptr;
+	};
+	Parser p;
+	p.add_token("LB", std::unique_ptr<RegexAST>(new RegexASTLiteral('[')));
+	p.add_token("RB", std::unique_ptr<RegexAST>(new RegexASTLiteral(']')));
+	p.add_token("DASH", std::unique_ptr<RegexAST>(new RegexASTLiteral('-')));
+	p.add_token("ANY", std::unique_ptr<RegexAST>(RegexASTGroup::make(true, { '[', '[', ']', ']' })));
+	p.add_production("class", { "LB", "class_contents", "RB" }, fn);
+	p.add_production("class_contents", { "class_element" }, fn);
+	p.add_production("class_contents", { "class_element", "class_contents" }, fn);
+	p.add_production("class_element", { "literal" }, fn);
+	p.add_production("class_element", { "class_element", "DASH", "literal" }, fn);
+	p.add_production("literal", { "DASH" }, fn);
+	p.add_production("literal", { "ANY" }, fn);
+	p.generate("class");
+	std::stringstream ss;
+	ss << "[-a-c-d-]";
+	FileInputStream fis(&ss);
+	p.parse(&fis);
+}
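Reviewer note: the new $END/$BAD sentinels and the line/column tracking change what callers of Lexer::scan are expected to handle, but none of the new tests drive the lexer directly. A minimal driver sketch follows; it is not part of the patch and only uses API that appears in this diff (add_rule, generate, scan, FileInputStream, Lexer::TOKEN_END, Lexer::TOKEN_BAD, Token::tags, Token::lexeme). The regex_ast.h include path is a guess, since that header is not shown here.

	#include "midori/lexer.h"
	#include "midori/regex_ast.h" // assumed header for RegexAST/RegexASTLiteral; not shown in this diff
	#include <iostream>
	#include <sstream>

	int main() {
		Lexer lexer;
		// single rule: the tag "A" matches the literal character 'a'
		lexer.add_rule("A", std::unique_ptr<RegexAST>(new RegexASTLiteral('a')));
		lexer.generate();

		std::stringstream ss;
		ss << "aab";
		FileInputStream fis(&ss);
		while (true) {
			std::unique_ptr<Token> t = lexer.scan(&fis);
			if (t == nullptr) {
				// scan() returns null when no rule matches the current prefix
				std::cout << "no rule matched" << std::endl;
				break;
			}
			if (t->tags.at(0) == Lexer::TOKEN_END) {
				break; // clean end of input
			}
			if (t->tags.at(0) == Lexer::TOKEN_BAD) {
				std::cout << "bad input: '" << t->lexeme << "'" << std::endl;
				break;
			}
			std::cout << t->tags.at(0) << ": " << t->lexeme << std::endl;
		}
		return 0;
	}

With the single rule A = 'a' and input "aab", this prints "A: a" twice and then "no rule matched": the trailing 'b' has no DFA transition, so scan() returns null rather than a $BAD token (as far as this diff shows, that tag is produced for read errors and for an incomplete token left at end of input).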