cleanup, fixed regex parser groups

Raekye · Jul 23, 2019 · 2b90f93
1 parent 92a3501
commit 2b90f93

Showing 10 changed files with 150 additions and 83 deletions.
diff --git a/midori/src/main.cpp b/midori/src/main.cpp
@@ -9,8 +9,8 @@
 
 int test_regex_engine() {
 	RegexEngine re;
-	std::string pattern = "(abc){0,3}[a-zA-Z]|def.\\.[^a-zA-Z]?+-^\\n+[^\\t\\xff-\\u12345678]";
-	//std::string pattern = "[a-]";
+	//std::string pattern = "(abc){0,3}[a-zA-Z]|def.\\.[^a-zA-Z]?+-^\\n+[^\\t\\xff-\\u12345678]";
+	std::string pattern = "[-a-b-cd---]";
 	std::unique_ptr<RegexAST> r = re.parse(pattern);
 	RegexASTPrinter printer;
 	r->accept(&printer);
@@ -124,10 +124,10 @@ int main() {
 	ULong x = ~0;
 	std::cout << "-1 is " << x << std::endl;
 	test_interval_tree();
-	test_regex_engine();
 	test_parser0();
 	test_parser2();
 	test_parser1();
+	test_regex_engine();
 	//test_generator();
 	return 0;
 }
diff --git a/midori/src/midori/lexer.cpp b/midori/src/midori/lexer.cpp
@@ -14,8 +14,8 @@ FileInputStream::FileInputStream(std::istream* file) : file(file) {
 }
 
 Long FileInputStream::get() {
-	UInt ch = this->file->get();
-	if (!this->file->good()) {
+	Long ch = utf8::codepoint_from_istream(this->file);
+	if (ch < 0) {
 		if (this->file->eof()) {
 			return Lexer::CHAR_EOF;
 		}
@@ -35,8 +35,8 @@ Long VectorInputStream::get() {
 	return this->v.at(this->pos++);
 }
 
-Lexer::Lexer() : current_state(nullptr), buffer_pos(0), location(0, 0) {
-	return;
+Lexer::Lexer() {
+	this->reset();
 }
 
 void Lexer::generate() {
@@ -137,8 +137,8 @@ void Lexer::generate() {
 void Lexer::reset() {
 	this->buffer.clear();
 	this->buffer_pos = 0;
-	this->location.line = 0;
-	this->location.column = 0;
+	this->location.line = 1;
+	this->location.column = 1;
 }
 
 void Lexer::add_rule(std::string rule, std::unique_ptr<RegexAST> regex) {
@@ -147,7 +147,7 @@ void Lexer::add_rule(std::string rule, std::unique_ptr<RegexAST> regex) {
 }
 
 std::unique_ptr<Token> Lexer::scan(IInputStream* in) {
-	this->current_state = this->dfa->root();
+	RegexDFAState* current_state = this->dfa->root();
 	bool matched = false;
 	std::vector<std::string> matched_tags;
 	std::string matched_str = "";
@@ -156,9 +156,9 @@ std::unique_ptr<Token> Lexer::scan(IInputStream* in) {
 	std::string found_buffer = "";
 	std::unique_ptr<Token> t;
 	while (true) {
-		if (!this->current_state->terminals.empty()) {
+		if (!current_state->terminals.empty()) {
 			matched = true;
-			matched_tags = this->current_state->terminals;
+			matched_tags = current_state->terminals;
 			matched_str.append(found_buffer);
 			matched_buffer_pos = this->buffer_pos;
 			matched_location = this->location;
@@ -167,7 +167,7 @@ std::unique_ptr<Token> Lexer::scan(IInputStream* in) {
 		Long ch = this->read(in);
 		if (ch == '\n') {
 			this->location.line++;
-			this->location.column = 0;
+			this->location.column = 1;
 		} else {
 			this->location.column++;
 		}
@@ -185,16 +185,18 @@ std::unique_ptr<Token> Lexer::scan(IInputStream* in) {
 			t.reset(new Token({ Lexer::TOKEN_BAD }, found_buffer, matched_location));
 			break;
 		}
-		RegexDFAState* next = this->current_state->next(ch);
+		found_buffer.append(utf8::string_from_codepoint(ch));
+		this->buffer_pos++;
+		RegexDFAState* next = current_state->next(ch);
 		if (next == nullptr) {
 			if (matched) {
 				t.reset(new Token(matched_tags, matched_str, matched_location));
+			} else {
+				t.reset(new Token({ Lexer::TOKEN_BAD }, found_buffer, matched_location));
 			}
 			break;
 		}
-		found_buffer.append(utf8::string_from_codepoint(ch));
-		this->buffer_pos++;
-		this->current_state = next;
+		current_state = next;
 	}
 	this->buffer_pos = matched_buffer_pos;
 	this->location = matched_location;

diff --git a/midori/src/midori/lexer.h b/midori/src/midori/lexer.h
@@ -14,9 +14,6 @@
 struct LocationInfo {
 	UInt line;
 	UInt column;
-	LocationInfo(UInt line, UInt column) : line(line), column(column) {
-		return;
-	}
 };
 
 struct Token {
@@ -72,7 +69,6 @@ class Lexer {
 	std::vector<std::unique_ptr<RegexAST>> rules_regex;
 
 	std::unique_ptr<RegexDFA> dfa;
-	RegexDFAState* current_state;
 
 	std::vector<UInt> buffer;
 	UInt buffer_pos;

diff --git a/midori/src/midori/parser.cpp b/midori/src/midori/parser.cpp
@@ -133,6 +133,14 @@ bool Parser::parse_advance(std::unique_ptr<Token> t) {
 			return false;
 		}
 	}
+	std::cout << "no rules, expected to see" << std::endl;
+	for (auto const& kv : curr->next) {
+		std::cout << kv.first << " -> " << kv.second->index << std::endl;
+	}
+	for (auto const& kv : curr->reductions) {
+		std::cout << kv.first << " <- ";
+		Parser::debug_production(kv.second);
+	}
 	return true;
 }
 

diff --git a/midori/src/midori/parser.h b/midori/src/midori/parser.h
@@ -22,8 +22,6 @@ class ParserAST;
 
 typedef std::function<std::unique_ptr<ParserAST>(MatchedNonterminal*)> ProductionHandler;
 typedef std::pair<Production*, Int> Item;
-typedef std::string Symbol;
-typedef std::tuple<Symbol, Int, Int> ExtendedSymbol;
 
 class ParserAST {
 public:
@@ -36,19 +34,12 @@ struct Production {
 	ProductionHandler handler;
 };
 
-struct ExtendedProduction {
-	ExtendedSymbol target;
-	std::vector<ExtendedSymbol> symbols;
-	Production* orig;
-};
-
 struct ItemSet {
+	Int index;
 	std::set<Item> head;
-	std::set<Item> additionals;
 	std::set<Item> closure;
 	std::map<std::string, ItemSet*> next;
 	std::map<std::string, Production*> reductions;
-	Int index;
 };
 
 class Match {

diff --git a/midori/src/midori/regex_engine.cpp b/midori/src/midori/regex_engine.cpp
@@ -1,8 +1,9 @@
 #include "regex_engine.h"
 
-#include <sstream>
 #include "helper.h"
 #include "utf8.h"
+#include <sstream>
+#include <algorithm>
 
 ParserRegexAST::ParserRegexAST(std::unique_ptr<RegexAST> r) : regex(std::move(r)) {
 	return;
@@ -194,81 +195,65 @@ std::unique_ptr<Parser> RegexEngine::make() {
 		MatchedNonterminal* n2 = m->nonterminal(1);
 		ParserRegexAST* r1 = dynamic_cast<ParserRegexAST*>(n1->value.get());
 		ParserRegexAST* r2 = dynamic_cast<ParserRegexAST*>(n2->value.get());
+		RegexASTGroup* g1 = dynamic_cast<RegexASTGroup*>(r1->regex.get());
+		RegexASTGroup* g2 = dynamic_cast<RegexASTGroup*>(r2->regex.get());
+		assert(g1->span->next == nullptr);
 		std::unique_ptr<RegexASTGroup::RangeList> car(new RegexASTGroup::RangeList);
-		if (RegexASTLiteral* l = dynamic_cast<RegexASTLiteral*>(r1->regex.get())) {
-			car->range.first = l->ch;
-			car->range.second = l->ch;
-		} else if (RegexASTGroup* g = dynamic_cast<RegexASTGroup*>(r1->regex.get())) {
-			assert(g->span->next == nullptr);
-			car->range.first = g->span->range.first;
-			car->range.second = g->span->range.second;
-		}
-		std::unique_ptr<RegexASTGroup::RangeList> cdr = nullptr;
-		if (RegexASTLiteral* l2 = dynamic_cast<RegexASTLiteral*>(r2->regex.get())) {
-			cdr.reset(new RegexASTGroup::RangeList);
-			cdr->range.first = l2->ch;
-			cdr->range.second = l2->ch;
-			cdr->next = nullptr;
-		} else if (RegexASTGroup* g2 = dynamic_cast<RegexASTGroup*>(r2->regex.get())) {
-			cdr = std::move(g2->span);
-		}
-		car->next = std::move(cdr);
+		car->range.first = g1->span->range.first;
+		car->range.second = g1->span->range.second;
+		car->next = std::move(g2->span);
 		return std::unique_ptr<ParserAST>(new ParserRegexAST(std::unique_ptr<RegexAST>(new RegexASTGroup(false, std::move(car)))));
 	});
 	p->add_production("group_contents", { "group_element" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
 		MatchedNonterminal* n = m->nonterminal(0);
 		return std::move(n->value);
 	});
+
 	/*
 	p->add_production("group_element", { "group_range" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
 		MatchedNonterminal* n = m->nonterminal(0);
 		return std::move(n->value);
 	});
 	*/
-	p->add_production("group_element", { "group_literal" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
-		MatchedNonterminal* n = m->nonterminal(0);
-		return std::move(n->value);
-	});
 	p->add_production("group_element", { "group_element", "DASH", "group_literal" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
 		MatchedNonterminal* n1 = m->nonterminal(0);
 		MatchedNonterminal* n2 = m->nonterminal(2);
 		ParserRegexAST* r1 = dynamic_cast<ParserRegexAST*>(n1->value.get());
 		ParserRegexAST* r2 = dynamic_cast<ParserRegexAST*>(n2->value.get());
 		std::unique_ptr<RegexAST> p1(std::move(r1->regex));
 		std::unique_ptr<RegexAST> p2(std::move(r2->regex));
-		RegexASTLiteral* l1 = dynamic_cast<RegexASTLiteral*>(p1.get());
-		RegexASTLiteral* l2 = dynamic_cast<RegexASTLiteral*>(p2.get());
-		return std::unique_ptr<ParserAST>(new ParserRegexAST(std::unique_ptr<RegexAST>(RegexASTGroup::make(false, { l1->ch, l2->ch }))));
+		RegexASTGroup* g1 = dynamic_cast<RegexASTGroup*>(p1.get());
+		RegexASTGroup* g2 = dynamic_cast<RegexASTGroup*>(p2.get());
+		assert(g1->span->next == nullptr);
+		assert(g2->span->next == nullptr);
+		assert(g2->span->range.first == g2->span->range.second);
+		UInt a = g1->span->range.first;
+		UInt b = g1->span->range.second;
+		UInt c = g2->span->range.first;
+		UInt lower = std::min({ a, b, c });
+		UInt upper = std::max({ a, b, c });
+		return std::unique_ptr<ParserAST>(new ParserRegexAST(std::unique_ptr<RegexAST>(RegexASTGroup::make(false, { lower, upper }))));
 	});
-
-	p->add_production("group_literal", { "group_escaped_literal" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
+	p->add_production("group_element", { "group_literal" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
 		MatchedNonterminal* n = m->nonterminal(0);
 		return std::move(n->value);
 	});
-	/*
-	p->add_production("group_literal", { "DEC" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
-		MatchedTerminal* n = m->terminal(0);
-		return std::unique_ptr<ParserAST>(new ParserRegexAST(std::unique_ptr<RegexAST>(new RegexASTLiteral(n->token->lexeme.at(0)))));
+
+	p->add_production("group_literal", { "group_literal_char" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
+		MatchedNonterminal* n = m->nonterminal(0);
+		ParserRegexAST* r = dynamic_cast<ParserRegexAST*>(n->value.get());
+		RegexASTLiteral* l = dynamic_cast<RegexASTLiteral*>(r->regex.get());
+		return std::unique_ptr<ParserAST>(new ParserRegexAST(std::unique_ptr<RegexAST>(RegexASTGroup::make(false, { l->ch, l->ch }))));
 	});
-	p->add_production("group_literal", { "HEX" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
-		MatchedTerminal* n = m->terminal(0);
-		return std::unique_ptr<ParserAST>(new ParserRegexAST(std::unique_ptr<RegexAST>(new RegexASTLiteral(n->token->lexeme.at(0)))));
+	p->add_production("group_literal_char", { "group_escaped_literal" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
+		MatchedNonterminal* n = m->nonterminal(0);
+		return std::move(n->value);
 	});
-	RegexEngine::add_literal(p.get(), "group_literal", "N", 'n');
-	RegexEngine::add_literal(p.get(), "group_literal", "T", 't');
-	RegexEngine::add_literal(p.get(), "group_literal", "X", 'x');
-	RegexEngine::add_literal(p.get(), "group_literal", "COMMA", ',');
-	RegexEngine::add_literal(p.get(), "group_literal", "DOT", '.');
-	RegexEngine::add_literal(p.get(), "group_literal", "LPAREN", '(');
-	RegexEngine::add_literal(p.get(), "group_literal", "RPAREN", ')');
-	RegexEngine::add_literal(p.get(), "group_literal", "LBRACE", '{');
-	RegexEngine::add_literal(p.get(), "group_literal", "RBRACE", '}');
-	RegexEngine::add_literal(p.get(), "group_literal", "STAR", '*');
-	RegexEngine::add_literal(p.get(), "group_literal", "PLUS", '+');
-	RegexEngine::add_literal(p.get(), "group_literal", "QUESTION", '?');
-	RegexEngine::add_literal(p.get(), "group_literal", "OR", '|');
-	*/
-	p->add_production("group_literal", { "GROUP_ANY" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
+	p->add_production("group_literal_char", { "DASH" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
+		(void) m;
+		return std::unique_ptr<ParserAST>(new ParserRegexAST(std::unique_ptr<RegexAST>(new RegexASTLiteral('-'))));
+	});
+	p->add_production("group_literal_char", { "GROUP_ANY" }, [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
 		MatchedTerminal* n = m->terminal(0);
 		Long ch = utf8::codepoint_from_string(n->token->lexeme, 0, nullptr);
 		assert(ch >= 0);
@@ -290,7 +275,7 @@ std::unique_ptr<Parser> RegexEngine::make() {
 	RegexEngine::add_literal(p.get(), "group_escape_special", "N", '\n');
 	RegexEngine::add_literal(p.get(), "group_escape_special", "T", '\t');
 	RegexEngine::add_literal(p.get(), "group_escape_special", "ESCAPE", '\\');
-	RegexEngine::add_literal(p.get(), "group_escape_special", "DASH", '-');
+	//RegexEngine::add_literal(p.get(), "group_escape_special", "DASH", '-');
 	RegexEngine::add_literal(p.get(), "group_escape_special", "LBRACKET", '[');
 	RegexEngine::add_literal(p.get(), "group_escape_special", "RBRACKET", ']');
 

diff --git a/midori/src/midori/utf8.h b/midori/src/midori/utf8.h
@@ -1,8 +1,9 @@
 #ifndef MIDORI_UTF8_H_INCLUDED
 #define MIDORI_UTF8_H_INCLUDED
 
-#include <string>
 #include "global.h"
+#include <string>
+#include <istream>
 
 class utf8 {
 public:
@@ -66,6 +67,35 @@ class utf8 {
 		}
 		return ((a & 0x07) << 18) | ((b & 0x3f) << 12) | ((c & 0x3f) << 6) | (d & 0x3f);
 	}
+
+	static Long codepoint_from_istream(std::istream* is) {
+		Int a = is->get();
+		if (!is->good()) {
+			return -1;
+		}
+		if ((a & 0x80) == 0) {
+			return a;
+		}
+		Int b = is->get();
+		if (!is->good()) {
+			return -1;
+		}
+		if ((a & 0xe0) == 0xc0) {
+			return ((a & 0x1f) << 6) | (b & 0x3f);
+		}
+		Int c = is->get();
+		if (!is->good()) {
+			return -1;
+		}
+		if ((a & 0xf0) == 0xe0) {
+			return ((a & 0x0f) << 12) | ((b & 0x3f) << 6) | (c & 0x3f);
+		}
+		Int d = is->get();
+		if (!is->good()) {
+			return -1;
+		}
+		return ((a & 0x07) << 18) | ((b & 0x3f) << 12) | ((c & 0x3f) << 6) | (d & 0x3f);
+	}
 };
 
 #endif /* MIDORI_UTF8_H_INCLUDED */
diff --git a/midori/tests/CMakeLists.txt b/midori/tests/CMakeLists.txt
@@ -41,6 +41,7 @@ set(TESTS
 	utf8
 	lexer
 	parser
+	regex_engine
 	)
 
 include_directories(${CMAKE_SOURCE_DIR}/src)

diff --git a/midori/tests/parser.cpp b/midori/tests/parser.cpp
@@ -43,6 +43,26 @@ TEST_F(ParserTest, Recursion) {
 	p.parse(&fis);
 }
 
+TEST_F(ParserTest, Epsilon) {
+	ProductionHandler fn = [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
+		(void) m;
+		return nullptr;
+	};
+	Parser p;
+	p.add_token("A", std::unique_ptr<RegexAST>(new RegexASTLiteral('a')));
+	p.add_production("n", { "m", "n" }, fn);
+	p.add_production("n", {}, fn);
+	p.add_production("m", { "A" }, fn);
+	p.generate("n");
+	std::stringstream ss;
+	ss << "aaa";
+	FileInputStream fis(&ss);
+	p.parse(&fis);
+	p.reset();
+	ss << "";
+	p.parse(&fis);
+}
+
 TEST_F(ParserTest, RegexGroup) {
 	ProductionHandler fn = [](MatchedNonterminal* m) -> std::unique_ptr<ParserAST> {
 		(void) m;
Expanded view of midori/tests/CMakeLists.txt (repeats the hunk shown above):
@@ -41,6 +41,7 @@ set(TESTS
    	utf8
    	lexer
    	parser
+    	regex_engine
    	)
    include_directories(${CMAKE_SOURCE_DIR}/src)