Skip to content

Commit

Permalink
cleanup, fixed regex parser groups
Browse files Browse the repository at this point in the history
  • Loading branch information
Raekye committed Jul 23, 2019
1 parent 92a3501 commit 2b90f93
Show file tree
Hide file tree
Showing 10 changed files with 150 additions and 83 deletions.
6 changes: 3 additions & 3 deletions midori/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@

int test_regex_engine() {
RegexEngine re;
std::string pattern = "(abc){0,3}[a-zA-Z]|def.\\.[^a-zA-Z]?+-^\\n+[^\\t\\xff-\\u12345678]";
//std::string pattern = "[a-]";
//std::string pattern = "(abc){0,3}[a-zA-Z]|def.\\.[^a-zA-Z]?+-^\\n+[^\\t\\xff-\\u12345678]";
std::string pattern = "[-a-b-cd---]";
std::unique_ptr<RegexAST> r = re.parse(pattern);
RegexASTPrinter printer;
r->accept(&printer);
Expand Down Expand Up @@ -124,10 +124,10 @@ int main() {
ULong x = ~0;
std::cout << "-1 is " << x << std::endl;
test_interval_tree();
test_regex_engine();
test_parser0();
test_parser2();
test_parser1();
test_regex_engine();
//test_generator();
return 0;
}
30 changes: 16 additions & 14 deletions midori/src/midori/lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ FileInputStream::FileInputStream(std::istream* file) : file(file) {
}

Long FileInputStream::get() {
UInt ch = this->file->get();
if (!this->file->good()) {
Long ch = utf8::codepoint_from_istream(this->file);
if (ch < 0) {
if (this->file->eof()) {
return Lexer::CHAR_EOF;
}
Expand All @@ -35,8 +35,8 @@ Long VectorInputStream::get() {
return this->v.at(this->pos++);
}

Lexer::Lexer() : current_state(nullptr), buffer_pos(0), location(0, 0) {
return;
Lexer::Lexer() {
this->reset();
}

void Lexer::generate() {
Expand Down Expand Up @@ -137,8 +137,8 @@ void Lexer::generate() {
void Lexer::reset() {
this->buffer.clear();
this->buffer_pos = 0;
this->location.line = 0;
this->location.column = 0;
this->location.line = 1;
this->location.column = 1;
}

void Lexer::add_rule(std::string rule, std::unique_ptr<RegexAST> regex) {
Expand All @@ -147,7 +147,7 @@ void Lexer::add_rule(std::string rule, std::unique_ptr<RegexAST> regex) {
}

std::unique_ptr<Token> Lexer::scan(IInputStream* in) {
this->current_state = this->dfa->root();
RegexDFAState* current_state = this->dfa->root();
bool matched = false;
std::vector<std::string> matched_tags;
std::string matched_str = "";
Expand All @@ -156,9 +156,9 @@ std::unique_ptr<Token> Lexer::scan(IInputStream* in) {
std::string found_buffer = "";
std::unique_ptr<Token> t;
while (true) {
if (!this->current_state->terminals.empty()) {
if (!current_state->terminals.empty()) {
matched = true;
matched_tags = this->current_state->terminals;
matched_tags = current_state->terminals;
matched_str.append(found_buffer);
matched_buffer_pos = this->buffer_pos;
matched_location = this->location;
Expand All @@ -167,7 +167,7 @@ std::unique_ptr<Token> Lexer::scan(IInputStream* in) {
Long ch = this->read(in);
if (ch == '\n') {
this->location.line++;
this->location.column = 0;
this->location.column = 1;
} else {
this->location.column++;
}
Expand All @@ -185,16 +185,18 @@ std::unique_ptr<Token> Lexer::scan(IInputStream* in) {
t.reset(new Token({ Lexer::TOKEN_BAD }, found_buffer, matched_location));
break;
}
RegexDFAState* next = this->current_state->next(ch);
found_buffer.append(utf8::string_from_codepoint(ch));
this->buffer_pos++;
RegexDFAState* next = current_state->next(ch);
if (next == nullptr) {
if (matched) {
t.reset(new Token(matched_tags, matched_str, matched_location));
} else {
t.reset(new Token({ Lexer::TOKEN_BAD }, found_buffer, matched_location));
}
break;
}
found_buffer.append(utf8::string_from_codepoint(ch));
this->buffer_pos++;
this->current_state = next;
current_state = next;
}
this->buffer_pos = matched_buffer_pos;
this->location = matched_location;
Expand Down
4 changes: 0 additions & 4 deletions midori/src/midori/lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@
struct LocationInfo {
UInt line;
UInt column;
LocationInfo(UInt line, UInt column) : line(line), column(column) {
return;
}
};

struct Token {
Expand Down Expand Up @@ -72,7 +69,6 @@ class Lexer {
std::vector<std::unique_ptr<RegexAST>> rules_regex;

std::unique_ptr<RegexDFA> dfa;
RegexDFAState* current_state;

std::vector<UInt> buffer;
UInt buffer_pos;
Expand Down
8 changes: 8 additions & 0 deletions midori/src/midori/parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,14 @@ bool Parser::parse_advance(std::unique_ptr<Token> t) {
return false;
}
}
std::cout << "no rules, expected to see" << std::endl;
for (auto const& kv : curr->next) {
std::cout << kv.first << " -> " << kv.second->index << std::endl;
}
for (auto const& kv : curr->reductions) {
std::cout << kv.first << " <- ";
Parser::debug_production(kv.second);
}
return true;
}

Expand Down
11 changes: 1 addition & 10 deletions midori/src/midori/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ class ParserAST;

typedef std::function<std::unique_ptr<ParserAST>(MatchedNonterminal*)> ProductionHandler;
typedef std::pair<Production*, Int> Item;
typedef std::string Symbol;
typedef std::tuple<Symbol, Int, Int> ExtendedSymbol;

class ParserAST {
public:
Expand All @@ -36,19 +34,12 @@ struct Production {
ProductionHandler handler;
};

struct ExtendedProduction {
ExtendedSymbol target;
std::vector<ExtendedSymbol> symbols;
Production* orig;
};

struct ItemSet {
Int index;
std::set<Item> head;
std::set<Item> additionals;
std::set<Item> closure;
std::map<std::string, ItemSet*> next;
std::map<std::string, Production*> reductions;
Int index;
};

class Match {
Expand Down
87 changes: 36 additions & 51 deletions midori/src/midori/regex_engine.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#include "regex_engine.h"

#include <sstream>
#include "helper.h"
#include "utf8.h"
#include <sstream>
#include <algorithm>

ParserRegexAST::ParserRegexAST(std::unique_ptr<RegexAST> r) : regex(std::move(r)) {
return;
Expand Down Expand Up @@ -194,81 +195,65 @@ std::unique_ptr<Parser> RegexEngine::make() {
MatchedNonterminal* n2 = m->nonterminal(1);
ParserRegexAST* r1 = dynamic_cast<ParserRegexAST*>(n1->value.get());
ParserRegexAST* r2 = dynamic_cast<ParserRegexAST*>(n2->value.get());
RegexASTGroup* g1 = dynamic_cast<RegexASTGroup*>(r1->regex.get());
RegexASTGroup* g2 = dynamic_cast<RegexASTGroup*>(r2->regex.get());
assert(g1->span->next == nullptr);
std::unique_ptr<RegexASTGroup::RangeList> car(new RegexASTGroup::RangeList);
if (RegexASTLiteral* l = dynamic_cast<RegexASTLiteral*>(r1->regex.get())) {
car->range.first = l->ch;
car->range.second = l->ch;
} else if (RegexASTGroup* g = dynamic_cast<RegexASTGroup*>(r1->regex.get())) {
assert(g->span->next == nullptr);
car->range.first = g->span->range.first;
car->range.second = g->span->range.second;
}
std::unique_ptr<RegexASTGroup::RangeList> cdr = nullptr;
if (RegexASTLiteral* l2 = dynamic_cast<RegexASTLiteral*>(r2->regex.get())) {
cdr.reset(new RegexASTGroup::RangeList);
cdr->range.first = l2->ch;
cdr->range.second = l2->ch;
cdr->next = nullptr;
} else if (RegexASTGroup* g2 = dynamic_cast<RegexASTGroup*>(r2->regex.get())) {
cdr = std::move(g2->span);
}
car->next = std::move(cdr);
car->range.first = g1->span->range.first;
car->range.second = g1->span->range.second;
car->next = std::move(g2->span);
return std::unique_ptr<ParserAST>(new ParserRegexAST(std::unique_ptr<RegexAST>(new RegexASTGroup(false, std::move(car)))));
});
p->add_production("group_contents", { "group_element" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
MatchedNonterminal* n = m->nonterminal(0);
return std::move(n->value);
});

/*
p->add_production("group_element", { "group_range" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
MatchedNonterminal* n = m->nonterminal(0);
return std::move(n->value);
});
*/
p->add_production("group_element", { "group_literal" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
MatchedNonterminal* n = m->nonterminal(0);
return std::move(n->value);
});
p->add_production("group_element", { "group_element", "DASH", "group_literal" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
MatchedNonterminal* n1 = m->nonterminal(0);
MatchedNonterminal* n2 = m->nonterminal(2);
ParserRegexAST* r1 = dynamic_cast<ParserRegexAST*>(n1->value.get());
ParserRegexAST* r2 = dynamic_cast<ParserRegexAST*>(n2->value.get());
std::unique_ptr<RegexAST> p1(std::move(r1->regex));
std::unique_ptr<RegexAST> p2(std::move(r2->regex));
RegexASTLiteral* l1 = dynamic_cast<RegexASTLiteral*>(p1.get());
RegexASTLiteral* l2 = dynamic_cast<RegexASTLiteral*>(p2.get());
return std::unique_ptr<ParserAST>(new ParserRegexAST(std::unique_ptr<RegexAST>(RegexASTGroup::make(false, { l1->ch, l2->ch }))));
RegexASTGroup* g1 = dynamic_cast<RegexASTGroup*>(p1.get());
RegexASTGroup* g2 = dynamic_cast<RegexASTGroup*>(p2.get());
assert(g1->span->next == nullptr);
assert(g2->span->next == nullptr);
assert(g2->span->range.first == g2->span->range.second);
UInt a = g1->span->range.first;
UInt b = g1->span->range.second;
UInt c = g2->span->range.first;
UInt lower = std::min({ a, b, c });
UInt upper = std::max({ a, b, c });
return std::unique_ptr<ParserAST>(new ParserRegexAST(std::unique_ptr<RegexAST>(RegexASTGroup::make(false, { lower, upper }))));
});

p->add_production("group_literal", { "group_escaped_literal" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
p->add_production("group_element", { "group_literal" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
MatchedNonterminal* n = m->nonterminal(0);
return std::move(n->value);
});
/*
p->add_production("group_literal", { "DEC" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
MatchedTerminal* n = m->terminal(0);
return std::unique_ptr<ParserAST>(new ParserRegexAST(std::unique_ptr<RegexAST>(new RegexASTLiteral(n->token->lexeme.at(0)))));

p->add_production("group_literal", { "group_literal_char" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
MatchedNonterminal* n = m->nonterminal(0);
ParserRegexAST* r = dynamic_cast<ParserRegexAST*>(n->value.get());
RegexASTLiteral* l = dynamic_cast<RegexASTLiteral*>(r->regex.get());
return std::unique_ptr<ParserAST>(new ParserRegexAST(std::unique_ptr<RegexAST>(RegexASTGroup::make(false, { l->ch, l->ch }))));
});
p->add_production("group_literal", { "HEX" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
MatchedTerminal* n = m->terminal(0);
return std::unique_ptr<ParserAST>(new ParserRegexAST(std::unique_ptr<RegexAST>(new RegexASTLiteral(n->token->lexeme.at(0)))));
p->add_production("group_literal_char", { "group_escaped_literal" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
MatchedNonterminal* n = m->nonterminal(0);
return std::move(n->value);
});
RegexEngine::add_literal(p.get(), "group_literal", "N", 'n');
RegexEngine::add_literal(p.get(), "group_literal", "T", 't');
RegexEngine::add_literal(p.get(), "group_literal", "X", 'x');
RegexEngine::add_literal(p.get(), "group_literal", "COMMA", ',');
RegexEngine::add_literal(p.get(), "group_literal", "DOT", '.');
RegexEngine::add_literal(p.get(), "group_literal", "LPAREN", '(');
RegexEngine::add_literal(p.get(), "group_literal", "RPAREN", ')');
RegexEngine::add_literal(p.get(), "group_literal", "LBRACE", '{');
RegexEngine::add_literal(p.get(), "group_literal", "RBRACE", '}');
RegexEngine::add_literal(p.get(), "group_literal", "STAR", '*');
RegexEngine::add_literal(p.get(), "group_literal", "PLUS", '+');
RegexEngine::add_literal(p.get(), "group_literal", "QUESTION", '?');
RegexEngine::add_literal(p.get(), "group_literal", "OR", '|');
*/
p->add_production("group_literal", { "GROUP_ANY" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
p->add_production("group_literal_char", { "DASH" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
(void) m;
return std::unique_ptr<ParserAST>(new ParserRegexAST(std::unique_ptr<RegexAST>(new RegexASTLiteral('-'))));
});
p->add_production("group_literal_char", { "GROUP_ANY" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
MatchedTerminal* n = m->terminal(0);
Long ch = utf8::codepoint_from_string(n->token->lexeme, 0, nullptr);
assert(ch >= 0);
Expand All @@ -290,7 +275,7 @@ std::unique_ptr<Parser> RegexEngine::make() {
RegexEngine::add_literal(p.get(), "group_escape_special", "N", '\n');
RegexEngine::add_literal(p.get(), "group_escape_special", "T", '\t');
RegexEngine::add_literal(p.get(), "group_escape_special", "ESCAPE", '\\');
RegexEngine::add_literal(p.get(), "group_escape_special", "DASH", '-');
//RegexEngine::add_literal(p.get(), "group_escape_special", "DASH", '-');
RegexEngine::add_literal(p.get(), "group_escape_special", "LBRACKET", '[');
RegexEngine::add_literal(p.get(), "group_escape_special", "RBRACKET", ']');

Expand Down
32 changes: 31 additions & 1 deletion midori/src/midori/utf8.h
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#ifndef MIDORI_UTF8_H_INCLUDED
#define MIDORI_UTF8_H_INCLUDED

#include <string>
#include "global.h"
#include <string>
#include <istream>

class utf8 {
public:
Expand Down Expand Up @@ -66,6 +67,35 @@ class utf8 {
}
return ((a & 0x07) << 18) | ((b & 0x3f) << 12) | ((c & 0x3f) << 6) | (d & 0x3f);
}

/**
 * Reads one UTF-8 encoded code point from `is`.
 *
 * Consumes between one and four bytes. The first byte selects the sequence
 * length; every following byte must be a continuation byte (10xxxxxx).
 *
 * @param is stream to read from; must not be null.
 * @return the decoded code point (>= 0), or -1 when
 *         - the stream is not good after a read (this includes the input
 *           ending part-way through a multi-byte sequence),
 *         - the first byte is not a valid lead byte (a stray continuation
 *           byte 10xxxxxx, or 11111xxx), or
 *         - a trailing byte is not a continuation byte.
 *
 * Bytes consumed before a failure is detected are not put back. Overlong
 * encodings, surrogates and values above U+10FFFF are not rejected here.
 */
static Long codepoint_from_istream(std::istream* is) {
	Int lead = is->get();
	if (!is->good()) {
		return -1;
	}
	// Single byte (ASCII): the byte is the code point.
	if ((lead & 0x80) == 0) {
		return lead;
	}

	// Work out how many continuation bytes follow and which payload bits
	// the lead byte contributes.
	Int trailing = 0;
	Long codepoint = 0;
	if ((lead & 0xe0) == 0xc0) {
		trailing = 1;
		codepoint = lead & 0x1f;
	} else if ((lead & 0xf0) == 0xe0) {
		trailing = 2;
		codepoint = lead & 0x0f;
	} else if ((lead & 0xf8) == 0xf0) {
		trailing = 3;
		codepoint = lead & 0x07;
	} else {
		// Not a lead byte; decoding it as a 4-byte sequence would swallow
		// unrelated input and yield a meaningless value.
		return -1;
	}

	for (Int i = 0; i < trailing; i++) {
		Int next = is->get();
		if (!is->good()) {
			return -1;
		}
		if ((next & 0xc0) != 0x80) {
			return -1;
		}
		// Each continuation byte carries six payload bits.
		codepoint = (codepoint << 6) | (next & 0x3f);
	}
	return codepoint;
}
};

#endif /* MIDORI_UTF8_H_INCLUDED */
1 change: 1 addition & 0 deletions midori/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ set(TESTS
utf8
lexer
parser
regex_engine
)

include_directories(${CMAKE_SOURCE_DIR}/src)
Expand Down
20 changes: 20 additions & 0 deletions midori/tests/parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,26 @@ TEST_F(ParserTest, Recursion) {
p.parse(&fis);
}

// Exercises a grammar containing an empty production:
//   n -> m n | (empty)
//   m -> A
// The parser is run once on "aaa", reset, and then run a second time on the
// same input stream.
TEST_F(ParserTest, Epsilon) {
	// Handler shared by every production: ignores the match and builds no AST node.
	ProductionHandler noop = [](MatchedNonterminal*) -> std::unique_ptr<ParserAST> {
		return nullptr;
	};

	Parser parser;
	parser.add_token("A", std::unique_ptr<RegexAST>(new RegexASTLiteral('a')));
	parser.add_production("n", { "m", "n" }, noop);
	parser.add_production("n", {}, noop);
	parser.add_production("m", { "A" }, noop);
	parser.generate("n");

	std::stringstream input;
	input << "aaa";
	FileInputStream stream(&input);

	// First pass: three 'a' tokens.
	parser.parse(&stream);

	// Second pass: inserting an empty string adds no characters to the stream.
	// NOTE(review): presumably this is meant to cover parsing empty input - confirm.
	parser.reset();
	input << "";
	parser.parse(&stream);
}

TEST_F(ParserTest, RegexGroup) {
ProductionHandler fn = [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
(void) m;
Expand Down
Loading

0 comments on commit 2b90f93

Please sign in to comment.