Skip to content

Commit

Permalink
hmmmm
Browse files Browse the repository at this point in the history
  • Loading branch information
Raekye committed Aug 24, 2019
1 parent b0b7932 commit ee40573
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 32 deletions.
16 changes: 13 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Example build: `mkdir build && cd build && cmake .. && make && make test`.
- `sayaka`: successor to `madoka/`, with ideas on what to do differently. The ideas predate summer 2014; most of the work on it was done after summer 2014
- `siyu`: hand-written LL(1) regex parser, NFA state generator, DFA state generator, lexer-generator, and parser-generator
- `tk`: successor to `siyu/`, completed LALR(1) parser generator
- `midori`: successor to `tk/`, hmmmm
- `midori`: successor to `tk/`, fixed/much improved LALR(1) parser generator

### Siyu
- hand-written, recursive-descent basic regex parser (builds an AST)
Expand All @@ -25,7 +25,12 @@ Example build: `mkdir build && cd build && cmake .. && make && make test`.

### Midori
- lexer/finite automata now support ranges using interval trees
- rewrote parser generator/parsing algorithm several times ([SLR(1)][2], [LR(1)][3], [LALR(1)][4])
- rewrote parser generator/parsing algorithm several times:
- [SLR(1)][2]
- [LR(1)][3]
- [LALR(1)][4] based on the dragon book
- [LALR(1)][5] using DeRemer and Pennello's [lookahead algorithm][6], based on [PLY][7]
and as described in *The Theory and Practice of Compiler Writing* and *Parsing Theory, Volume 2*

## Regex grammar
- multiplication is repetition
Expand Down Expand Up @@ -197,8 +202,10 @@ dec_int
- https://web.cs.dal.ca/~sjackson/lalr1.html
- https://stackoverflow.com/questions/8242509/how-does-the-yacc-bison-lalr1-algorithm-treat-empty-rules
- https://stackoverflow.com/questions/57120176/grammar-matching-regex-character-classes-trailing-dash/
- Compilers: Principles, Techniques, and Tools (the Dragon book)
- Compilers: Principles, Techniques, and Tools (the dragon book)
- Parsing Theory Volume 2: LR(k) and LL(k) Parsing
- The Theory and Practice of Compiler Writing
- Efficient Computation of LALR(1) Look-Ahead Sets, DeRemer and Pennello (1982) ([link][6])
- Efficient Parsing for Natural Language: A Fast Algorithm for Practical Systems
- http://scottmcpeak.com/elkhound/elkhound.ps
- https://web.stanford.edu/class/archive/cs/cs143/cs143.1128/
Expand All @@ -207,3 +214,6 @@ dec_int
[2]: https://github.com/Raekye/hmmm/tree/1130d9626c838b36b54155926df05da25e4e828f/midori/src/midori/parser.cpp
[3]: https://github.com/Raekye/hmmm/tree/ca9659d56b1876f5a325463ebcdb04aec0e3cfbe/midori/src/midori/parser.cpp
[4]: https://github.com/Raekye/hmmm/tree/a4cb4c7e844ef49d675a9faac622d8d57c8da184/midori/src/midori/parser.cpp
[5]: https://github.com/Raekye/hmmm/tree/b0b7932e6c7ba5db770fd2ebe5ea3c5b6bfe0a79/midori/src/midori/parser.cpp
[6]: https://dl.acm.org/citation.cfm?id=357187
[7]: https://github.com/dabeaz/ply
6 changes: 5 additions & 1 deletion midori/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
cmake_minimum_required (VERSION 3.11)

set(CMAKE_CXX_STANDARD 11)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

set(CMAKE_CXX_STANDARD 11)
add_library(coverage INTERFACE)
target_compile_options(coverage INTERFACE -O0 -g --coverage)
target_link_libraries(coverage INTERFACE --coverage)

enable_testing()

Expand Down
1 change: 1 addition & 0 deletions midori/src/midori/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ set(SOURCES

add_library(midori SHARED ${SOURCES})
target_compile_options(midori PRIVATE -Wall -Wextra -Wpedantic -Werror -Wno-unknown-pragmas)
target_link_libraries(midori PUBLIC coverage)
54 changes: 31 additions & 23 deletions midori/src/midori/parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ Int Parser::add_production(std::string target, std::vector<std::string> symbols,
return this->add_production(target, symbols, handler, nullptr);
}
Int Parser::add_production(std::string target, std::vector<std::string> symbols, ProductionHandler handler, RewriteHandler rewrite) {
Int n = this->productions.size();
UInt n = this->productions.size();
std::unique_ptr<Production> p(new Production);
p->index = n;
p->target = target;
Expand Down Expand Up @@ -98,7 +98,7 @@ void Parser::generate(Type type, std::string start) {
*/
this->generate_reads_relations();
this->generate_read_sets();
this->generate_includes_lookback();
this->generate_includes_lookback_relations();
this->generate_follow_sets();
this->generate_lookaheads();
this->lr1_states = std::move(this->lr0_states);
Expand Down Expand Up @@ -253,7 +253,7 @@ void Parser::generate_lr1_closure(ItemSet* is) {
continue;
}
std::vector<std::string> l;
size_t j = (size_t) (i.dot + 1);
size_t j = i.dot + 1;
while (j < i.production->symbols.size()) {
std::string s2 = i.production->symbols.at(j);
std::map<std::string, std::set<std::string>>::iterator it = this->firsts.find(s2);
Expand Down Expand Up @@ -530,6 +530,9 @@ void Parser::generate_lalr_itemsets() {
}
}

// see The Theory and Practice of Compiler Writing, page 382
// see Tarjan's strongly connected components algorithm
// - https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm
template <typename T, typename U> void Parser::digraph(std::vector<T>* nodes, GraphRelation<T> r, GraphFunction<T, U> g, std::map<T, std::set<U>>* f) {
std::map<T, Int> weights;
std::stack<T> stack;
Expand All @@ -542,8 +545,8 @@ template <typename T, typename U> void Parser::digraph(std::vector<T>* nodes, Gr

template <typename T, typename U> void Parser::traverse(T x, std::stack<T>* stack, std::map<T, Int>* weights, std::vector<T>* nodes, GraphRelation<T> r, GraphFunction<T, U> g, std::map<T, std::set<U>>* f) {
stack->push(x);
size_t d = stack->size();
weights->operator[](x) = (Int) d;
Int d = (Int) stack->size();
weights->operator[](x) = d;
f->operator[](x) = g(x);
typename std::map<T, Int>::iterator it_x = weights->find(x);
std::set<U>& f_x = f->operator[](x);
Expand All @@ -561,7 +564,7 @@ template <typename T, typename U> void Parser::traverse(T x, std::stack<T>* stac
std::set<U>& f_y = f->operator[](y);
f_x.insert(f_y.begin(), f_y.end());
}
if (it_x->second == (Int) d) {
if (it_x->second == d) {
while (true) {
T z = stack->top();
stack->pop();
Expand All @@ -574,6 +577,9 @@ template <typename T, typename U> void Parser::traverse(T x, std::stack<T>* stac
}
}

// see Efficient Computation of LALR(1) Look-Ahead Sets, DeRemer and Pennello (1982)
// - https://dl.acm.org/citation.cfm?id=357187
// see The Theory and Practice of Compiler Writing, pages 375-383
void Parser::generate_reads_relations() {
ItemSet* root = this->lr0_states.front().get();
assert(root->kernel.size() == 1);
Expand All @@ -595,6 +601,7 @@ void Parser::generate_reads_relations() {
}
this->nonterminal_transitions.emplace_back(is.get(), s);
LalrTransition& lt = this->nonterminal_transitions.back();
// create the mapping even if the sets end up being empty
std::set<std::string>& y = this->directly_reads_relation[lt];
std::set<LalrTransition>& z = this->reads_relation[lt];
ItemSet* js = is->next.at(s);
Expand All @@ -615,18 +622,18 @@ void Parser::generate_reads_relations() {
}

void Parser::generate_read_sets() {
std::map<LalrTransition, std::set<std::string>>& drr = this->directly_reads_relation;
std::map<LalrTransition, std::set<LalrTransition>>& rr = this->reads_relation;
GraphRelation<LalrTransition> r = [ &rr ](LalrTransition lt) -> std::set<LalrTransition>& {
return rr.at(lt);
std::map<LalrTransition, std::set<std::string>>* drr = &(this->directly_reads_relation);
std::map<LalrTransition, std::set<LalrTransition>>* rr = &(this->reads_relation);
GraphRelation<LalrTransition> r = [ rr ](LalrTransition lt) -> std::set<LalrTransition>& {
return rr->at(lt);
};
GraphFunction<LalrTransition, std::string> g = [ &drr ](LalrTransition lt) -> std::set<std::string>& {
return drr.at(lt);
GraphFunction<LalrTransition, std::string> g = [ drr ](LalrTransition lt) -> std::set<std::string>& {
return drr->at(lt);
};
Parser::digraph<LalrTransition, std::string>(&(this->nonterminal_transitions), r, g, &(this->reads));
}

void Parser::generate_includes_lookback() {
void Parser::generate_includes_lookback_relations() {
for (LalrTransition const& lt : this->nonterminal_transitions) {
std::cout << "Looking at transition ";
Parser::debug_lalr_transition(lt);
Expand All @@ -649,14 +656,15 @@ void Parser::generate_includes_lookback() {
continue;
}
size_t l = k + 1;
while (l < i.production->symbols.size()) {
std::string s = i.production->symbols.at(l);
if (this->nullable.find(s) == this->nullable.end()) {
size_t n = i.production->symbols.size();
while (l < n) {
std::string s2 = i.production->symbols.at(l);
if (this->nullable.find(s2) == this->nullable.end()) {
break;
}
l++;
}
if (l == i.production->symbols.size()) {
if (l == n) {
includes.push_back(lt2);
}
}
Expand All @@ -676,13 +684,13 @@ void Parser::generate_includes_lookback() {
}

void Parser::generate_follow_sets() {
std::map<LalrTransition, std::set<LalrTransition>>& ir = this->includes_relation;
std::map<LalrTransition, std::set<std::string>>& rs = this->reads;
GraphRelation<LalrTransition> r = [ &ir ](LalrTransition lt) -> std::set<LalrTransition>& {
return ir[lt];
std::map<LalrTransition, std::set<LalrTransition>>* ir = &(this->includes_relation);
std::map<LalrTransition, std::set<std::string>>* rs = &(this->reads);
GraphRelation<LalrTransition> r = [ ir ](LalrTransition lt) -> std::set<LalrTransition>& {
return ir->operator[](lt);
};
GraphFunction<LalrTransition, std::string> g = [ &rs ](LalrTransition lt) -> std::set<std::string>& {
return rs.at(lt);
GraphFunction<LalrTransition, std::string> g = [ rs ](LalrTransition lt) -> std::set<std::string>& {
return rs->at(lt);
};
Parser::digraph<LalrTransition, std::string>(&(this->nonterminal_transitions), r, g, &(this->follows));
}
Expand Down
10 changes: 5 additions & 5 deletions midori/src/midori/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ struct Precedence {
};

struct Production {
Int index;
UInt index;
std::string target;
std::vector<std::string> symbols;
std::string precedence;
Expand All @@ -103,7 +103,7 @@ struct Production {

struct Item {
Production* production;
Int dot;
UInt dot;
// for LR0 items, `terminal` is unused
std::string terminal;

Expand All @@ -112,7 +112,7 @@ struct Item {
}

bool is_done() const {
return this->dot == (Int) this->production->symbols.size();
return this->dot == this->production->symbols.size();
}

std::string next_symbol() const {
Expand All @@ -134,7 +134,7 @@ struct Action {
};

struct ItemSet {
Int index;
UInt index;
bool accept;
std::set<Item> kernel;
std::set<Item> closure;
Expand Down Expand Up @@ -279,7 +279,7 @@ class Parser {
template <typename T, typename U> static void traverse(T, std::stack<T>*, std::map<T, Int>*, std::vector<T>*, GraphRelation<T>, GraphFunction<T, U>, std::map<T, std::set<U>>*);
void generate_reads_relations();
void generate_read_sets();
void generate_includes_lookback();
void generate_includes_lookback_relations();
void generate_follow_sets();
void generate_lookaheads();

Expand Down

0 comments on commit ee40573

Please sign in to comment.