From f87b8d82cccfb1ad0c08ff37efdf98c5d0ee1dde Mon Sep 17 00:00:00 2001
From: Raekye <adrian@creatifcubed.com>
Date: Tue, 24 Feb 2015 15:20:29 -0500
Subject: [PATCH] this was a while ago I think I was specifying a formal
 grammar for the regex and writing a corresponding parser

---
 README.md             | 141 ++++++++++-
 primed/src/dfa.cpp    |   1 +
 primed/src/dfa.h      |  12 +
 primed/src/driver.cpp |   4 +-
 primed/src/lexer.cpp  |   5 +-
 primed/src/regex.cpp  | 563 +++++++++++++++++++++++++++---------------
 primed/src/regex.h    |  73 ++++--
 7 files changed, 578 insertions(+), 221 deletions(-)
 create mode 100644 primed/src/dfa.cpp
 create mode 100644 primed/src/dfa.h

diff --git a/README.md b/README.md
index 3c3a655..f1faeee 100644
--- a/README.md
+++ b/README.md
@@ -17,8 +17,145 @@ Each of these directories has a `Makefile` that puts stuff in a `bin/` folder.
 - `primed/` (in progress): lexer-generator and parser-generator
 
 ### Primed
-- contains a hand written, recursive descent basic regex parser (builds AST)
+- hand written, recursive descent basic regex parser (builds AST)
 - regex used to define tokens, lexer-generator generates states and next-states for a lexer (a big FSM)
+- goal is to generate DFA
+
+#### Regex grammar
+- multiplication is repetition
+- addition is concatenation
+- precedence follows the usual programming-language order: multiplication binds tightest, then addition, then logical or
+
+```
+TOKEN_STAR: *
+TOKEN_PLUS: +
+TOKEN_QUESTION_MARK: ?
+TOKEN_OR: |
+TOKEN_ESCAPE: \
+
+TOKEN_LPAREN: (
+TOKEN_RPAREN: )
+TOKEN_LBRACE: {
+TOKEN_RBRACE: }
+TOKEN_LBRACKET: [
+TOKEN_RBRACKET: ]
+
+TOKEN_SPECIAL: any of the tokens above
+TOKEN_PLAIN: everything not TOKEN_SPECIAL, code point in [32 (space), 127 (DEL)), i.e. up to and including 126 (tilde)
+TOKEN_GROUP_SPECIAL: TOKEN_LBRACKET | TOKEN_RBRACKET | TOKEN_DASH | TOKEN_ESCAPE
+TOKEN_GROUP_PLAIN: everything not TOKEN_GROUP_SPECIAL, code point in [32, 127)
+
+TOKEN_DASH: -
+TOKEN_COMMA: ,
+
+TOKEN_X: x
+TOKEN_U: u
+TOKEN_T: t
+TOKEN_N: n
+TOKEN_R: r
+
+TOKEN_HEX_DIGIT: [0-9a-f]
+TOKEN_DEC_DIGIT: [0-9]
+
+top_level
+	: lr_or
+	;
+
+lr_or
+	: not_lr_or TOKEN_OR lr_or
+	| not_lr_or
+	;
+
+not_lr_or
+	: lr_add
+	;
+
+lr_add
+	: not_lr_add lr_add
+	| not_lr_add
+	;
+
+not_lr_add
+	: lr_mul
+	;
+
+lr_mul
+	: not_lr_mul TOKEN_STAR
+	| not_lr_mul TOKEN_PLUS
+	| not_lr_mul TOKEN_QUESTION_MARK
+	| not_lr_mul mul_range
+	| not_lr_mul
+	;
+
+not_lr_mul
+	: not_lr
+	;
+
+not_lr
+	: parentheses
+	| literal
+	| group
+	;
+
+mul_range
+	: TOKEN_LBRACE dec_int TOKEN_COMMA dec_int TOKEN_RBRACE
+	;
+
+parentheses
+	: TOKEN_LPAREN top_level TOKEN_RPAREN
+	;
+
+literal
+	: absolute_literal
+	| TOKEN_ESCAPE TOKEN_SPECIAL
+	| TOKEN_PLAIN
+	;
+
+group
+	: TOKEN_LBRACKET group_contents TOKEN_RBRACKET
+	;
+
+group_contents
+	: group_element group_contents
+	| group_element
+	;
+
+group_element
+	: group_range
+	| group_literal
+	;
+
+group_literal
+	: absolute_literal
+	| TOKEN_ESCAPE TOKEN_GROUP_SPECIAL
+	| TOKEN_GROUP_PLAIN
+	;
+
+group_range
+	: group_literal TOKEN_DASH group_literal
+	;
+
+absolute_literal
+	: TOKEN_ESCAPE TOKEN_X hex_byte
+	| TOKEN_ESCAPE TOKEN_U hex_int
+	| TOKEN_ESCAPE TOKEN_T
+	| TOKEN_ESCAPE TOKEN_N
+	| TOKEN_ESCAPE TOKEN_R
+	;
+
+hex_byte
+	: TOKEN_HEX_DIGIT TOKEN_HEX_DIGIT
+	;
+
+hex_int
+	: hex_byte hex_byte hex_byte hex_byte
+	;
+
+dec_int
+	: TOKEN_DEC_DIGIT
+	| TOKEN_DEC_DIGIT dec_int
+	;
+```
 
 ## Dependencies
 - gcc-c++
@@ -52,5 +189,7 @@ Each of these directories has a `Makefile` that puts stuff in a `bin/` folder.
 - http://stackoverflow.com/questions/3551733/llvm-automatic-c-linking
 - http://stackoverflow.com/questions/4425797/linking-llvm-jit-code-to-external-c-functions
 - http://stackoverflow.com/questions/14307906/c-llvm-class-functionality
+- http://swtch.com/~rsc/regexp/regexp1.html
+- http://stackoverflow.com/questions/2245962/is-there-an-alternative-for-flex-bison-that-is-usable-on-8-bit-embedded-systems
 
 [1]: https://github.com/Raekye/bdel_and_dfr_compiler
diff --git a/primed/src/dfa.cpp b/primed/src/dfa.cpp
new file mode 100644
index 0000000..a33db3f
--- /dev/null
+++ b/primed/src/dfa.cpp
@@ -0,0 +1 @@
+#include "dfa.h"
diff --git a/primed/src/dfa.h b/primed/src/dfa.h
new file mode 100644
index 0000000..64c7f59
--- /dev/null
+++ b/primed/src/dfa.h
@@ -0,0 +1,12 @@
+#ifndef PRIMED_DFA_H_INCLUDED
+#define PRIMED_DFA_H_INCLUDED
+
+#include <map>
+
+template <typename T> class DFAState { // one node of the automaton; T is the input-symbol type
+public:
+	std::map<T, DFAState<T>*>* link = nullptr; // symbol -> next state; stays null until a caller allocates the map
+	bool terminal = false; // initialised so a default-constructed state never carries an indeterminate flag
+};
+
+#endif /* PRIMED_DFA_H_INCLUDED */
diff --git a/primed/src/driver.cpp b/primed/src/driver.cpp
index 2cf91ac..e616928 100644
--- a/primed/src/driver.cpp
+++ b/primed/src/driver.cpp
@@ -5,9 +5,9 @@
 
 int main() {
 	Lexer l;
-	l.add_rule(Rule("rule1", "abc[def]*", "tag1"));
+	l.add_rule(Rule("rule1", "a(xyzb)*c|def", "tag1"));
 	std::stringstream ss;
-	ss << "abcfefefee";
+	ss << "a";
 	Token* t = l.scan(&ss);
 	l.print_states();
 	if (t) {
diff --git a/primed/src/lexer.cpp b/primed/src/lexer.cpp
index 1b01860..c39f78c 100644
--- a/primed/src/lexer.cpp
+++ b/primed/src/lexer.cpp
@@ -15,8 +15,7 @@ void Lexer::generate() {
 	this->generation_parent_states_stack.push(root);
 	std::cout << "=== Rules" << std::endl;
 	for (int32_t i = 0; i < this->rules.size(); i++) {
-		RegexASTChain* regex = this->regex_parser.parse(this->rules[i].pattern);
-		this->generation_regex_chain_end = regex->sequence->back();
+		RegexAST* regex = this->regex_parser.parse(this->rules[i].pattern);
 		this->generation_terminal_tag = this->rules[i].tag;
 		RegexASTPrinter a;
 		a.indents = 1;
@@ -131,7 +130,7 @@ int32_t Lexer::generation_new_state() {
 void Lexer::print_states() {
 	std::cout << "=== States" << std::endl;
 	for (int32_t i = 0; i < this->states.size(); i++) {
-		std::cout << "State " << i << ": " << this->states[i]->tag;
+		std::cout << "State " << i << (this->states[i]->is_terminal() ? "(end)" : "") << ": " << this->states[i]->tag;
 		for (std::map<int32_t, std::vector<int32_t>>::iterator it = this->states[i]->next_states.begin(); it != this->states[i]->next_states.end(); it++) {
 			std::cout << ", " << (char) it->first << " ->";
 			for (int32_t j = 0; j < it->second.size(); j++) {
diff --git a/primed/src/regex.cpp b/primed/src/regex.cpp
index 5f39d74..9e8f03c 100644
--- a/primed/src/regex.cpp
+++ b/primed/src/regex.cpp
@@ -2,6 +2,8 @@
 #include <cctype>
 #include <iostream>
 
+#pragma mark - RegexAST
+
 RegexASTChain::RegexASTChain(std::vector<RegexAST*>* sequence) {
 	this->sequence = sequence;
 }
@@ -21,7 +23,7 @@ RegexASTMultiplication::RegexASTMultiplication(RegexAST* node, int32_t min, int3
 	this->max = max;
 }
 
-RegexASTRange::RegexASTRange(int32_t lower, int32_t upper) {
+RegexASTRange::RegexASTRange(uint32_t lower, uint32_t upper) {
 	this->lower = lower;
 	this->upper = upper;
 }
@@ -74,265 +76,292 @@ void RegexASTRange::accept(IRegexASTVisitor* visitor) {
 	visitor->visit(this);
 }
 
-#pragma mark - Regex parser
+#pragma mark - RegexParser
 RegexParser::RegexParser() {
 	return;
 }
 
-RegexASTChain* RegexParser::parse(std::string str) {
+int32_t RegexParser::buffer_pos() {
+	return this->pos.top();
+}
+
+void RegexParser::buffer_advance(int32_t delta) {
+	this->pos.push(this->buffer_pos() + delta);
+}
+
+uint32_t RegexParser::buffer_char(int32_t delta) {
+	if (this->buffer_pos() + delta >= this->buffer.size()) {
+		return 0;
+	}
+	return this->buffer[this->buffer_pos() + delta];
+}
+
+void RegexParser::buffer_push(int32_t loc) {
+	this->pos.push(loc);
+}
+
+int32_t RegexParser::buffer_pop(int32_t times) {
+	int32_t popped = this->buffer_pos();
+	for (int32_t i = 0; i < times; i++) {
+		this->pos.pop();
+	}
+	return popped;
+}
+
+#pragma mark - RegexParser - parsing
+RegexAST* RegexParser::parse(std::string str) {
 	this->buffer = str;
 	this->pos = std::stack<int32_t>();
 	this->pos.push(0);
-	RegexASTChain* regex = this->parse_chain();
+	RegexAST* regex = this->parse_toplevel();
+	if (!regex) {
+		return NULL;
+	}
 	if (this->buffer_pos() != str.length()) {
 		delete regex;
 		return NULL;
 	}
-	return regex;
-}
-
-RegexASTChain* RegexParser::parse_chain() {
-	std::vector<RegexAST*>* sequence = new std::vector<RegexAST*>();
-	RegexAST* first = this->parse_toplevel();
-	if (first) {
-		sequence->push_back(first);
-		while (RegexAST* node = this->parse_toplevel()) {
-			sequence->push_back(node);
-		}
-	} else {
-		delete sequence;
-		return NULL;
+	this->pos.pop();
+	if (this->pos.size() != 1) {
+		delete regex;
+		throw std::runtime_error("RegexParser did not finish with pos stack 1");
 	}
-	return new RegexASTChain(sequence);
+	return regex;
 }
 
 RegexAST* RegexParser::parse_toplevel() {
-	RegexAST* node = NULL;
-	if ((node = this->parse_multiplication())) {
-		return node;
-	} else if ((node = this->parse_or())) {
-		return node;
-	} else if ((node = this->parse_toplevel_nonrecursive())) {
-		return node;
-	}
-	return NULL;
+	return this->parse_lr_or();
 }
 
-RegexAST* RegexParser::parse_toplevel_nonrecursive() {
-	RegexAST* node = NULL;
-	if ((node = this->parse_parenthesis())) {
-		return node;
-	} else if ((node = RegexParser::parse_literal())) {
-		return node;
-	} else if ((node = RegexParser::parse_group())) {
-		return node;
-	}
-	return NULL;
-}
-
-RegexAST* RegexParser::parse_parenthesis() {
-	if (this->buffer_char() != '(') {
+RegexAST* RegexParser::parse_lr_or() {
+	RegexAST* l = this->parse_not_lr_or();
+	if (!l) {
 		return NULL;
 	}
-	this->buffer_advance(1);
-	RegexAST* node = this->parse_chain();
-	if (!node) {
-		this->buffer_pop(1);
-		return NULL;
+	if (this->buffer_char() != RegexParser::TOKEN_OR) {
+		return l;
 	}
-	if (this->buffer_char() != ')') {
-		delete node;
+	this->buffer_advance(1);
+	RegexAST* r = this->parse_lr_or();
+	if (!r) {
+		delete l;
 		this->buffer_pop(2);
 		return NULL;
 	}
-	this->buffer_push(this->buffer_pop(2) + 1);
-	return node;
+	this->buffer_push(this->buffer_pop(3));
+	return new RegexASTOr(l, r);
 }
 
-RegexAST* RegexParser::parse_literal() {
-	int32_t ch = this->buffer_char();
-	if (ch == 0) {
+RegexAST* RegexParser::parse_not_lr_or() {
+	return this->parse_lr_add();
+}
+
+RegexAST* RegexParser::parse_lr_add() {
+	RegexAST* car = this->parse_not_lr_add();
+	if (!car) {
 		return NULL;
 	}
-	if (ch == '\\') {
-		ch = this->buffer_char(1);
-		if (RegexParser::is_special_char(ch)) {
-			this->buffer_advance(2);
-			return new RegexASTLiteral(ch);
+	std::vector<RegexAST*>* chain = new std::vector<RegexAST*>();
+	chain->push_back(car);
+	while (true) {
+		RegexAST* next = this->parse_not_lr_add();
+		if (!next) {
+			break;
 		}
+		chain->push_back(next);
+		this->buffer_push(this->buffer_pop(2));
 	}
-	if (RegexParser::is_special_char(ch)) {
-		return NULL;
+	return new RegexASTChain(chain);
+}
+
+RegexAST* RegexParser::parse_not_lr_add() {
+	return this->parse_lr_mul();
+}
+
+RegexAST* RegexParser::parse_lr_mul() {
+	RegexAST* l = this->parse_not_lr_mul();
+	uint32_t ch = this->buffer_char();
+	if (ch == RegexParser::TOKEN_STAR) {
+		this->buffer_advance(1);
+		this->buffer_push(this->buffer_pop(2));
+		return new RegexASTMultiplication(l, 0, 0);
+	} else if (ch == RegexParser::TOKEN_PLUS) {
+		this->buffer_advance(1);
+		this->buffer_push(this->buffer_pop(2));
+		return new RegexASTMultiplication(l, 1, 0);
+	} else if (ch == RegexParser::TOKEN_QUESTION_MARK) {
+		this->buffer_advance(1);
+		this->buffer_push(this->buffer_pop(2));
+		return new RegexASTMultiplication(l, 0, 1);
 	}
-	this->buffer_advance(1);
-	return new RegexASTLiteral(ch);
+	std::tuple<int32_t, int32_t>* range = this->parse_mul_range();
+	if (!range) {
+		return l;
+	}
+	this->buffer_push(this->buffer_pop(2));
+	RegexAST* r = new RegexASTMultiplication(l, std::get<0>(*range), std::get<1>(*range));
+	delete range;
+	return r;
 }
 
-RegexAST* RegexParser::parse_or() {
-	RegexAST* left = this->parse_toplevel_nonrecursive();
-	if (!left) {
+std::tuple<int32_t, int32_t>* RegexParser::parse_mul_range() {
+	if (this->buffer_char() != '{') {
 		return NULL;
 	}
-	if (this->buffer_char() != '|') {
-		delete left;
+	this->buffer_advance(1);
+	uint32_t* lower = this->parse_dec_int();
+	if (!lower) {
 		this->buffer_pop(1);
 		return NULL;
 	}
-	this->buffer_advance(1);
-	RegexAST* right = this->parse_toplevel();
-	if (!right) {
-		delete left;
+	if (this->buffer_char() != ',') {
 		this->buffer_pop(2);
 		return NULL;
 	}
-	this->buffer_push(this->buffer_pop(3));
-	return new RegexASTOr(left, right);
-}
-
-RegexAST* RegexParser::parse_multiplication() {
-	RegexAST* node = this->parse_toplevel_nonrecursive();
-	if (!node) {
+	this->buffer_advance(1);
+	uint32_t* upper = this->parse_dec_int();
+	if (!upper) {
+		delete lower;
+		this->buffer_pop(3);
 		return NULL;
 	}
-	if (this->buffer_char() == '*') {
-		this->buffer_push(this->buffer_pop(1) + 1);
-		return new RegexASTMultiplication(node, 0, 0);
-	} else if (this->buffer_char() == '?') {
-		this->buffer_push(this->buffer_pop(1) + 1);
-		return new RegexASTMultiplication(node, 0, 1);
-	} else if (this->buffer_char() == '+') {
-		this->buffer_push(this->buffer_pop(1) + 1);
-		return new RegexASTMultiplication(node, 1, 0);
-	} else {
-		if (std::tuple<int32_t, int32_t>* range = this->parse_multiplication_range()) {
-			this->buffer_push(this->buffer_pop(2));
-			RegexAST* node_prime = new RegexASTMultiplication(node, std::get<0>(*range), std::get<1>(*range));
-			delete range;
-			return node_prime;
-		}
+	if (this->buffer_char() != '}') {
+		delete lower;
+		delete upper;
+		this->buffer_pop(4);
+		return NULL;
+	}
+	this->buffer_push(this->buffer_pop(4) + 1);
+	std::tuple<int32_t, int32_t>* range = new std::tuple<int32_t, int32_t>(*lower, *upper);
+	delete lower;
+	delete upper;
+	return range;
+}
+
+RegexAST* RegexParser::parse_not_lr_mul() {
+	return this->parse_not_lr();
+}
+
+RegexAST* RegexParser::parse_not_lr() {
+	if (RegexAST* r = this->parse_parentheses()) {
+		return r;
+	} else if (RegexAST* r = this->parse_literal()) {
+		return r;
+	} else if (RegexAST* r = this->parse_group()) {
+		return r;
 	}
-	delete node;
-	this->buffer_pop(1);
 	return NULL;
 }
 
-RegexAST* RegexParser::parse_group() {
-	if (this->buffer_char() != '[') {
+RegexAST* RegexParser::parse_parentheses() {
+	if (this->buffer_char() != '(') {
 		return NULL;
 	}
 	this->buffer_advance(1);
-	RegexAST* first = this->parse_group_element();
-	if (!first) {
+	RegexAST* node = this->parse_toplevel();
+	if (!node) {
 		this->buffer_pop(1);
 		return NULL;
 	}
-	RegexAST* second = this->parse_group_element();
-	if (!second) {
-		if (this->buffer_char() != ']') {
-			delete first;
-			this->buffer_pop(2);
-			return NULL;
-		}
-		this->buffer_push(this->buffer_pop(2) + 1);
-		return first;
-	}
-	int32_t delta = 3;
-	RegexASTOr* aggregation = new RegexASTOr(first, second);
-	RegexASTOr* tail = aggregation;
-	while (RegexAST* node = this->parse_group_element()) {
-		RegexASTOr* tail_prime = new RegexASTOr(tail->right, node);
-		tail->right = tail_prime;
-		tail = tail_prime;
-		delta++;
+	if (this->buffer_char() != ')') {
+		delete node;
+		this->buffer_pop(2);
+		return NULL;
+	}
+	this->buffer_push(this->buffer_pop(2) + 1);
+	return node;
+}
+
+RegexAST* RegexParser::parse_literal() {
+	uint32_t* x = this->parse_absolute_literal();
+	if (x) {
+		RegexAST* r = new RegexASTLiteral(*x);
+		delete x;
+		return r;
 	}
-	if (this->buffer_char() != ']') {
-		delete aggregation;
-		this->buffer_pop(delta);
+	int32_t ch = this->buffer_char();
+	if (ch == RegexParser::TOKEN_ESCAPE) {
+		ch = this->buffer_char(1);
+		if (RegexParser::is_special_char(ch)) {
+			this->buffer_advance(2);
+			return new RegexASTLiteral(ch);
+		}
 		return NULL;
 	}
-	this->buffer_push(this->buffer_pop(delta) + 1);
-	return aggregation;
+	if (!RegexParser::is_special_char(ch) && 32 <= ch && ch < 127) {
+		this->buffer_advance(1);
+		return new RegexASTLiteral(ch);
+	}
+	return NULL;
 }
 
-std::tuple<int32_t, int32_t>* RegexParser::parse_multiplication_range() {
-	if (this->buffer_char() != '{') {
+RegexAST* RegexParser::parse_group() {
+	if (this->buffer_char() != RegexParser::TOKEN_LBRACKET) {
 		return NULL;
 	}
 	this->buffer_advance(1);
-	int32_t lower = this->parse_number();
-	if (lower < 0) {
+	RegexAST* contents = this->parse_group_contents();
+	if (!contents) {
 		this->buffer_pop(1);
 		return NULL;
 	}
-	if (this->buffer_char() != ',') {
+	if (this->buffer_char() != RegexParser::TOKEN_RBRACKET) {
+		delete contents;
 		this->buffer_pop(2);
 		return NULL;
 	}
 	this->buffer_advance(1);
-	int32_t upper = this->parse_number();
-	if (upper < 0) {
-		this->buffer_pop(3);
-		return NULL;
-	}
-	if (this->buffer_char() != '}') {
-		this->buffer_pop(4);
-		return NULL;
-	}
-	this->buffer_push(this->buffer_pop(4) + 1);
-	return new std::tuple<int32_t, int32_t>(lower, upper);
+	this->buffer_push(this->buffer_pop(3));
+	return contents;
 }
 
-int32_t RegexParser::parse_number() {
-	int32_t ch = this->buffer_char();
-	if (!std::isdigit(ch)) {
-		return -1;
+RegexAST* RegexParser::parse_group_contents() {
+	RegexAST* car = this->parse_group_element();
+	if (!car) {
+		return NULL;
 	}
-	int32_t delta = 1;
-	int32_t x = ch - '0';
-	while (true) {
-		int32_t ch = this->buffer_char(delta);
-		if (!std::isdigit(ch)) {
-			break;
-		}
-		x = x * 10 + (ch - '0');
-		delta++;
+	RegexAST* cdr = this->parse_group_contents();
+	if (!cdr) {
+		return car;
 	}
-	this->buffer_advance(delta);
-	return x;
+	this->buffer_push(this->buffer_pop(2));
+	return new RegexASTOr(car, cdr);
 }
 
 RegexAST* RegexParser::parse_group_element() {
 	if (RegexAST* node = this->parse_group_range()) {
 		return node;
-	} else if (int32_t* x = this->parse_group_literal()) {
-		RegexAST* node = new RegexASTLiteral(*x);
-		delete x;
-		return node;
+	} else if (uint32_t* x= this->parse_group_literal()) {
+		 RegexAST* r =new RegexASTLiteral(*x);
+		 delete x;
+		 return r;
 	}
 	return NULL;
 }
 
-int32_t* RegexParser::parse_group_literal() {
-	int32_t ch = this->buffer_char();
-	if (ch == 0) {
-		return NULL;
+uint32_t* RegexParser::parse_group_literal() {
+	uint32_t* l = this->parse_absolute_literal();
+	if (l) {
+		return l;
 	}
-	if (ch == '\\') {
+	int32_t ch = this->buffer_char();
+	if (ch == RegexParser::TOKEN_ESCAPE) {
 		ch = this->buffer_char(1);
-		if (ch == '-' || ch == ']') {
+		if (RegexParser::is_group_special_char(ch)) {
 			this->buffer_advance(2);
-			return new int32_t(ch);
+			return new uint32_t(ch);
 		}
-	}
-	if (ch == '-' || ch == ']') {
 		return NULL;
 	}
-	this->buffer_advance(1);
-	return new int32_t(ch);
+	if (!RegexParser::is_group_special_char(ch) && 32 <= ch && ch < 127) {
+		this->buffer_advance(1);
+		return new uint32_t(ch);
+	}
+	return NULL;
 }
 
 RegexAST* RegexParser::parse_group_range() {
-	int32_t* lower = this->parse_group_literal();
+	uint32_t* lower = this->parse_group_literal();
 	if (!lower) {
 		return NULL;
 	}
@@ -342,7 +371,7 @@ RegexAST* RegexParser::parse_group_range() {
 		return NULL;
 	}
 	this->buffer_advance(1);
-	int32_t* upper = this->parse_group_literal();
+	uint32_t* upper = this->parse_group_literal();
 	if (!upper) {
 		delete lower;
 		this->buffer_pop(2);
@@ -355,43 +384,179 @@ RegexAST* RegexParser::parse_group_range() {
 	return node;
 }
 
-int32_t RegexParser::buffer_pos() {
-	return this->pos.top();
+uint32_t* RegexParser::parse_absolute_literal() { // parses \xHH, \uHHHHHHHH, \t, \n, \r; returns a heap-allocated code point or NULL
+	if (this->buffer_char() != RegexParser::TOKEN_ESCAPE) {
+		return NULL;
+	}
+	this->buffer_advance(1);
+	uint32_t ch = this->buffer_char();
+	if (ch == RegexParser::TOKEN_X) {
+		this->buffer_advance(1);
+		uint32_t* x = this->parse_hex_byte();
+		if (!x) {
+			this->buffer_pop(2); // drop the escape and 'x' frames; parse_hex_byte pushed nothing on failure
+			return NULL;
+		}
+		this->buffer_push(this->buffer_pop(3)); // merge escape + 'x' + hex-byte frames so the net effect is one frame
+		return x;
+	} else if (ch == RegexParser::TOKEN_U) {
+		this->buffer_advance(1);
+		uint32_t* x = this->parse_hex_int();
+		if (!x) {
+			this->buffer_pop(2); // drop the escape and 'u' frames; parse_hex_int cleaned up its own frame
+			return NULL;
+		}
+		this->buffer_push(this->buffer_pop(3)); // merge escape + 'u' + hex-int frames so the net effect is one frame
+		return x;
+	} else if (ch == RegexParser::TOKEN_T) {
+		this->buffer_advance(1);
+		this->buffer_push(this->buffer_pop(2));
+		return new uint32_t('\t');
+	} else if (ch == RegexParser::TOKEN_N) {
+		this->buffer_advance(1);
+		this->buffer_push(this->buffer_pop(2));
+		return new uint32_t('\n');
+	} else if (ch == RegexParser::TOKEN_R) {
+		this->buffer_advance(1);
+		this->buffer_push(this->buffer_pop(2));
+		return new uint32_t('\r');
+	}
+	this->buffer_pop(1); // not an absolute literal: undo the escape frame so callers can retry other escapes
+	return NULL;
 }
 
-void RegexParser::buffer_advance(int32_t delta) {
-	this->pos.push(this->pos.top() + delta);
+uint32_t* RegexParser::parse_hex_byte() { // reads two lowercase hex digits; returns a heap-allocated value or NULL (no frame pushed)
+	int32_t upper = this->buffer_char();
+	if (!RegexParser::is_hex_digit(upper)) {
+		return NULL;
+	}
+	int32_t lower = this->buffer_char(1);
+	if (!RegexParser::is_hex_digit(lower)) {
+		return NULL;
+	}
+	uint32_t x = 0;
+	if (RegexParser::is_dec_digit(upper)) {
+		x = upper - '0';
+	} else {
+		x = upper - 'a' + 10;
+	}
+	x <<= 4;
+	if (RegexParser::is_dec_digit(lower)) {
+		x |= lower - '0'; // low nibble must come from the second digit, not the first
+	} else {
+		x |= lower - 'a' + 10;
+	}
+	this->buffer_advance(2);
+	return new uint32_t(x);
 }
 
-int32_t RegexParser::buffer_char(int32_t delta) {
-	if (this->pos.top() + delta >= this->buffer.size()) {
-		return 0;
+uint32_t* RegexParser::parse_hex_int() {
+	uint32_t x = 0;
+	this->buffer_advance(0);
+	for (int32_t i = 0; i < 4; i++) {
+		uint32_t* b = this->parse_hex_byte();
+		if (!b) {
+			this->buffer_pop(1);
+			return NULL;
+		}
+		x = (x << 8) + *b;
+		delete b;
+		this->buffer_push(this->buffer_pop(2));
 	}
-	return this->buffer[this->pos.top() + delta];
+	return new uint32_t(x);
 }
 
-void RegexParser::buffer_push(int32_t loc) {
-	this->pos.push(loc);
+uint32_t* RegexParser::parse_dec_int() {
+	uint32_t x = 0;
+	int32_t delta = 0;
+	while (true) {
+		int32_t ch = this->buffer_char(delta);
+		if (!RegexParser::is_dec_digit(ch)) {
+			break;
+		}
+		x = x * 10 + (ch - '0');
+		delta++;
+	}
+	this->buffer_advance(delta);
+	return new uint32_t(x);
 }
 
-int32_t RegexParser::buffer_pop(int32_t times) {
-	int32_t popped = this->pos.top();
-	for (int32_t i = 0; i < times; i++) {
-		this->pos.pop();
+bool RegexParser::is_special_char(uint32_t ch) {
+	return ch == TOKEN_ESCAPE
+		|| ch == TOKEN_LPAREN
+		|| ch == TOKEN_RPAREN
+		|| ch == TOKEN_LBRACE
+		|| ch == TOKEN_RBRACE
+		|| ch == TOKEN_LBRACKET
+		|| ch == TOKEN_RBRACKET
+		|| ch == TOKEN_OR
+		|| ch == TOKEN_STAR
+		|| ch == TOKEN_PLUS
+		|| ch == TOKEN_QUESTION_MARK;
+}
+
+bool RegexParser::is_group_special_char(uint32_t ch) {
+	return ch == TOKEN_LBRACKET
+		|| ch == TOKEN_DASH
+		|| ch == TOKEN_RBRACKET
+		|| ch == TOKEN_ESCAPE;
+}
+
+bool RegexParser::is_hex_digit(uint32_t ch) {
+	return ('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'f');
+}
+
+bool RegexParser::is_dec_digit(uint32_t ch) {
+	return ('0' <= ch && ch <= '9');
+}
+
+#pragma mark - RegexDFAGenerator
+
+void RegexDFAGenerator::visit(RegexASTChain* node) {
+	DFAState<uint32_t>* saved_root = this->root;
+	node->sequence->front()->accept(this);
+	this->root = this->ret;
+	for (int32_t i = 1; i < node->sequence->size(); i++) {
+		node->sequence->operator[](i)->accept(this);
+		this->root = this->ret;
 	}
-	return popped;
+	this->root = saved_root;
+	// ret keeps last value
 }
 
-bool RegexParser::is_special_char(int32_t ch) {
-	return ch == '\\'
-		|| ch == '['
-		|| ch == ']'
-		|| ch == '('
-		|| ch == ')'
-		|| ch == '{'
-		|| ch == '}'
-		|| ch == '|'
-		|| ch == '*'
-		|| ch == '+'
-		|| ch == '?';
+void RegexDFAGenerator::visit(RegexASTLiteral* node) {
+	std::map<uint32_t, DFAState<uint32_t>*>::iterator it = this->root->link->find(node->ch);
+	if (it == this->root->link->end()) {
+		this->ret = new DFAState<uint32_t>();
+	} else {
+		this->ret = it->second;
+	}
+}
+
+void RegexDFAGenerator::visit(RegexASTOr* node) {
+	node->left->accept(this);
+	DFAState<uint32_t>* saved_left_ret = this->ret;
+	node->right->accept(this);
+	for (std::map<uint32_t, DFAState<uint32_t>*>::iterator it = saved_left_ret->link->begin(); it != saved_left_ret->link->end(); it++) {
+		std::map<uint32_t, DFAState<uint32_t>*>::iterator it2 = this->ret->link->find(it->first);
+		if (it2 == this->ret->link->end()) {
+			this->ret->link->operator[](it->first) = it->second;
+		} else {
+			if (it2->second != it->second) {
+				throw std::runtime_error("State badness");
+			}
+		}
+	}
+	delete saved_left_ret->link;
+	saved_left_ret->link = this->ret->link;
+	// root unchanged
+	// ret keeps last generated value
+}
+
+void RegexDFAGenerator::visit(RegexASTMultiplication* node) {
+	return;
+}
+
+void RegexDFAGenerator::visit(RegexASTRange* node) {
+	return;
 }
diff --git a/primed/src/regex.h b/primed/src/regex.h
index db8f9bf..dbce2d5 100644
--- a/primed/src/regex.h
+++ b/primed/src/regex.h
@@ -6,34 +6,65 @@
 #include <stack>
 #include <tuple>
 #include <iostream>
+#include "dfa.h"
 
 class RegexAST;
-class RegexASTChain;
 class IRegexASTVisitor;
 
 class RegexParser {
 private:
-	RegexASTChain* parse_chain();
 	RegexAST* parse_toplevel();
-	RegexAST* parse_toplevel_nonrecursive();
-	RegexAST* parse_parenthesis();
+	RegexAST* parse_lr_or();
+	RegexAST* parse_not_lr_or();
+	RegexAST* parse_lr_add();
+	RegexAST* parse_not_lr_add();
+	RegexAST* parse_lr_mul();
+	RegexAST* parse_not_lr_mul();
+	RegexAST* parse_not_lr();
+	RegexAST* parse_parentheses();
 	RegexAST* parse_literal();
-	RegexAST* parse_or();
-	RegexAST* parse_multiplication();
 	RegexAST* parse_group();
-	std::tuple<int32_t, int32_t>* parse_multiplication_range();
-	int32_t parse_number();
+	uint32_t* parse_absolute_literal();
+
+	std::tuple<int32_t, int32_t>* parse_mul_range();
+
+	RegexAST* parse_group_contents();
 	RegexAST* parse_group_element();
-	int32_t* parse_group_literal();
 	RegexAST* parse_group_range();
+	uint32_t* parse_group_literal();
+
+	uint32_t* parse_hex_byte();
+	uint32_t* parse_hex_int();
+	uint32_t* parse_dec_int();
 
 	int32_t buffer_pos();
 	void buffer_advance(int32_t);
-	int32_t buffer_char(int32_t = 0);
+	uint32_t buffer_char(int32_t = 0);
 	void buffer_push(int32_t);
 	int32_t buffer_pop(int32_t);
 
-	static bool is_special_char(int32_t ch);
+	static bool is_special_char(uint32_t);
+	static bool is_group_special_char(uint32_t);
+	static bool is_hex_digit(uint32_t);
+	static bool is_dec_digit(uint32_t);
+
+	static const int32_t TOKEN_STAR = '*';
+	static const int32_t TOKEN_PLUS = '+';
+	static const int32_t TOKEN_QUESTION_MARK = '?';
+	static const int32_t TOKEN_OR = '|';
+	static const int32_t TOKEN_DASH = '-';
+	static const int32_t TOKEN_ESCAPE = '\\';
+	static const int32_t TOKEN_LPAREN = '(';
+	static const int32_t TOKEN_RPAREN = ')';
+	static const int32_t TOKEN_LBRACE = '{';
+	static const int32_t TOKEN_RBRACE = '}';
+	static const int32_t TOKEN_LBRACKET = '[';
+	static const int32_t TOKEN_RBRACKET = ']';
+	static const int32_t TOKEN_X = 'x';
+	static const int32_t TOKEN_U = 'u';
+	static const int32_t TOKEN_T = 't';
+	static const int32_t TOKEN_N = 'n';
+	static const int32_t TOKEN_R = 'r';
 
 public:
 	std::string buffer;
@@ -41,7 +72,7 @@ class RegexParser {
 
 	RegexParser();
 
-	RegexASTChain* parse(std::string);
+	RegexAST* parse(std::string);
 };
 
 class RegexAST {
@@ -61,7 +92,7 @@ class RegexASTChain : public RegexAST {
 
 class RegexASTLiteral : public RegexAST {
 public:
-	int32_t ch;
+	uint32_t ch;
 
 	RegexASTLiteral(int32_t);
 	virtual ~RegexASTLiteral();
@@ -93,10 +124,10 @@ class RegexASTMultiplication : public RegexAST {
 
 class RegexASTRange : public RegexAST {
 public:
-	int32_t lower;
-	int32_t upper;
+	uint32_t lower;
+	uint32_t upper;
 
-	RegexASTRange(int32_t, int32_t);
+	RegexASTRange(uint32_t, uint32_t);
 	virtual ~RegexASTRange();
 	virtual void accept(IRegexASTVisitor*) override;
 };
@@ -110,6 +141,16 @@ class IRegexASTVisitor {
 	virtual void visit(RegexASTRange*) = 0;
 };
 
+class RegexDFAGenerator : public IRegexASTVisitor {
+	DFAState<uint32_t>* root;
+	DFAState<uint32_t>* ret;
+	void visit(RegexASTChain*) override;
+	void visit(RegexASTLiteral*) override;
+	void visit(RegexASTOr*) override;
+	void visit(RegexASTMultiplication*) override;
+	void visit(RegexASTRange*) override;
+};
+
 class RegexASTPrinter : public IRegexASTVisitor {
 public:
 	int32_t indents = 0;