this was a while ago

I think I was specifying a formal grammar for the regex and writing a corresponding parser
Raekye · Feb 24, 2015 · f87b8d8 · f87b8d8
1 parent be2d863
commit f87b8d8
Show file tree

Hide file tree

Showing 7 changed files with 578 additions and 221 deletions.
diff --git a/README.md b/README.md
@@ -17,8 +17,145 @@ Each of these directories has a `Makefile` that puts stuff in a `bin/` folder.
 - `primed/` (in progress): lexer-generator and parser-generator
 
 ### Primed
-- contains a hand written, recursive descent basic regex parser (builds AST)
+- hand written, recursive descent basic regex parser (builds AST)
 - regex used to define tokens, lexer-generator generates states and next-states for a lexer (a big FSM)
+- goal is to generate DFA
+
+#### Regex grammar
+- multiplication is repetition
+- addition is concatenation
+- precedence, from highest to lowest, is multiplication (repetition), addition (concatenation), then logical or (alternation), the same relative order these operators usually have in a programming language
+
+```
+TOKEN_STAR: *
+TOKEN_PLUS: +
+TOKEN_QUESTION_MARK: ?
+TOKEN_OR: |
+TOKEN_ESCAPE: \
+
+TOKEN_LPAREN: (
+TOKEN_RPAREN: )
+TOKEN_LBRACE: {
+TOKEN_RBRACE: }
+TOKEN_LBRACKET: [
+TOKEN_RBRACKET: ]
+
+TOKEN_SPECIAL: any of the tokens above
+TOKEN_PLAIN: everything not TOKEN_SPECIAL, code point in [32 (space), 127 (DEL)), i.e. up to and including 126 (tilde)
+TOKEN_GROUP_SPECIAL: TOKEN_LBRACKET | TOKEN_RBRACKET | TOKEN_DASH | TOKEN_ESCAPE
+TOKEN_GROUP_PLAIN: everything not TOKEN_GROUP_SPECIAL, code point in [32, 127)
+
+TOKEN_DASH: -
+TOKEN_COMMA: ,
+
+TOKEN_X: x
+TOKEN_U: u
+TOKEN_T: t
+TOKEN_N: n
+TOKEN_R: r
+
+TOKEN_HEX_DIGIT: [0-9a-f]
+TOKEN_DEC_DIGIT: [0-9]
+
+top_level
+	: lr_or
+	;
+
+lr_or
+	: not_lr_or TOKEN_OR lr_or
+	| not_lr_or
+	;
+
+not_lr_or
+	: lr_add
+	;
+
+lr_add
+	: not_lr_add lr_add
+	| not_lr_add
+	;
+
+not_lr_add
+	: lr_mul
+	;
+
+lr_mul
+	: not_lr_mul TOKEN_STAR
+	| not_lr_mul TOKEN_PLUS
+	| not_lr_mul TOKEN_QUESTION_MARK
+	| not_lr_mul mul_range
+	| not_lr_mul
+	;
+
+not_lr_mul
+	: not_lr
+	;
+
+not_lr
+	: parentheses
+	| literal
+	| group
+	;
+
+mul_range
+	: TOKEN_LBRACE dec_int TOKEN_COMMA dec_int TOKEN_RBRACE
+	;
+
+parentheses
+	: TOKEN_LPAREN top_level TOKEN_RPAREN
+	;
+
+literal
+	: absolute_literal
+	| TOKEN_ESCAPE TOKEN_SPECIAL
+	| TOKEN_PLAIN
+	;
+
+group
+	: TOKEN_LBRACKET group_contents TOKEN_RBRACKET
+	;
+
+group_contents
+	: group_element group_contents
+	| group_element
+	;
+
+group_element
+	: group_range
+	| group_literal
+	;
+
+group_literal
+	: absolute_literal
+	| TOKEN_ESCAPE TOKEN_GROUP_SPECIAL
+	| TOKEN_GROUP_PLAIN
+	;
+
+group_range
+	: group_literal TOKEN_DASH group_literal
+	;
+
+absolute_literal
+	: TOKEN_ESCAPE TOKEN_X hex_byte
+	| TOKEN_ESCAPE TOKEN_U hex_int
+	| TOKEN_ESCAPE TOKEN_T
+	| TOKEN_ESCAPE TOKEN_N
+	| TOKEN_ESCAPE TOKEN_R
+	;
+
+hex_byte
+	: TOKEN_HEX_DIGIT TOKEN_HEX_DIGIT
+	;
+
+hex_int
+	: hex_byte hex_byte hex_byte hex_byte
+	;
+
+dec_int
+	: TOKEN_DEC_DIGIT
+	| TOKEN_DEC_DIGIT dec_int
+	;
+```
 
 ## Dependencies
 - gcc-c++
@@ -52,5 +189,7 @@ Each of these directories has a `Makefile` that puts stuff in a `bin/` folder.
 - http://stackoverflow.com/questions/3551733/llvm-automatic-c-linking
 - http://stackoverflow.com/questions/4425797/linking-llvm-jit-code-to-external-c-functions
 - http://stackoverflow.com/questions/14307906/c-llvm-class-functionality
+- http://swtch.com/~rsc/regexp/regexp1.html
+- http://stackoverflow.com/questions/2245962/is-there-an-alternative-for-flex-bison-that-is-usable-on-8-bit-embedded-systems
 
 [1]: https://github.com/Raekye/bdel_and_dfr_compiler
diff --git a/primed/src/dfa.cpp b/primed/src/dfa.cpp
@@ -0,0 +1 @@
+#include "dfa.h"
diff --git a/primed/src/dfa.h b/primed/src/dfa.h
@@ -0,0 +1,12 @@
+#ifndef PRIMED_DFA_H_INCLUDED
+#define PRIMED_DFA_H_INCLUDED
+
+#include <map>
+
+template <typename T> class DFAState { // One DFA state; T is the key type of the `link` map below.
+public:
+	std::map<T, DFAState<T>*>* link; // Pointer to a map from a T value to a DFAState<T>*; presumably the outgoing transitions — TODO confirm. Not initialized by this class.
+	bool terminal; // Not initialized by this class; presumably marks an accepting state — TODO confirm against users.
+};
+
+#endif /* PRIMED_DFA_H_INCLUDED */
diff --git a/primed/src/driver.cpp b/primed/src/driver.cpp
@@ -5,9 +5,9 @@
 
 int main() {
 	Lexer l;
-	l.add_rule(Rule("rule1", "abc[def]*", "tag1"));
+	l.add_rule(Rule("rule1", "a(xyzb)*c|def", "tag1"));
 	std::stringstream ss;
-	ss << "abcfefefee";
+	ss << "a";
 	Token* t = l.scan(&ss);
 	l.print_states();
 	if (t) {

diff --git a/primed/src/lexer.cpp b/primed/src/lexer.cpp
@@ -15,8 +15,7 @@ void Lexer::generate() {
 	this->generation_parent_states_stack.push(root);
 	std::cout << "=== Rules" << std::endl;
 	for (int32_t i = 0; i < this->rules.size(); i++) {
-		RegexASTChain* regex = this->regex_parser.parse(this->rules[i].pattern);
-		this->generation_regex_chain_end = regex->sequence->back();
+		RegexAST* regex = this->regex_parser.parse(this->rules[i].pattern);
 		this->generation_terminal_tag = this->rules[i].tag;
 		RegexASTPrinter a;
 		a.indents = 1;
@@ -131,7 +130,7 @@ int32_t Lexer::generation_new_state() {
 void Lexer::print_states() {
 	std::cout << "=== States" << std::endl;
 	for (int32_t i = 0; i < this->states.size(); i++) {
-		std::cout << "State " << i << ": " << this->states[i]->tag;
+		std::cout << "State " << i << (this->states[i]->is_terminal() ? "(end)" : "") << ": " << this->states[i]->tag;
 		for (std::map<int32_t, std::vector<int32_t>>::iterator it = this->states[i]->next_states.begin(); it != this->states[i]->next_states.end(); it++) {
 			std::cout << ", " << (char) it->first << " ->";
 			for (int32_t j = 0; j < it->second.size(); j++) {