From f87b8d82cccfb1ad0c08ff37efdf98c5d0ee1dde Mon Sep 17 00:00:00 2001 From: Raekye Date: Tue, 24 Feb 2015 15:20:29 -0500 Subject: [PATCH] this was a while ago I think I was specifying a formal grammar for the regex and writing a corresponding parser --- README.md | 141 ++++++++++- primed/src/dfa.cpp | 1 + primed/src/dfa.h | 12 + primed/src/driver.cpp | 4 +- primed/src/lexer.cpp | 5 +- primed/src/regex.cpp | 563 +++++++++++++++++++++++++++--------------- primed/src/regex.h | 73 ++++-- 7 files changed, 578 insertions(+), 221 deletions(-) create mode 100644 primed/src/dfa.cpp create mode 100644 primed/src/dfa.h diff --git a/README.md b/README.md index 3c3a655..f1faeee 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,145 @@ Each of these directories has a `Makefile` that puts stuff in a `bin/` folder. - `primed/` (in progress): lexer-generator and parser-generator ### Primed -- contains a hand written, recursive descent basic regex parser (builds AST) +- hand written, recursive descent basic regex parser (builds AST) - regex used to define tokens, lexer-generator generates states and next-states for a lexer (a big FSM) +- goal is to generate DFA + +#### Regex grammar +- multiplication is repetition +- addition is concatenation +- multiplication, addition, and logical or have the same precedence as they usually would in a programming language (in that order) + +``` +TOKEN_STAR: * +TOKEN_PLUS: + +TOKEN_QUESTION_MARK: ? 
+TOKEN_OR: | +TOKEN_ESCAPE: \ + +TOKEN_LPAREN: ( +TOKEN_RPAREN: ) +TOKEN_LBRACE: { +TOKEN_RBRACE: } +TOKEN_LBRACKET: [ +TOKEN_RBRACKET: ] + +TOKEN_SPECIAL: any of the tokens above +TOKEN_PLAIN: everything not TOKEN_SPECIAL, code point in [32 (space), 127 (DEL)), i.e. up to and including 126 (tilde) +TOKEN_GROUP_SPECIAL: TOKEN_LBRACKET | TOKEN_RBRACKET | TOKEN_DASH | TOKEN_ESCAPE +TOKEN_GROUP_PLAIN: everything not TOKEN_GROUP_SPECIAL, code point in [32, 127) + +TOKEN_DASH: - +TOKEN_COMMA: , + +TOKEN_X: x +TOKEN_U: u +TOKEN_T: t +TOKEN_N: n +TOKEN_R: r + +TOKEN_HEX_DIGIT: [0-9a-f] +TOKEN_DEC_DIGIT: [0-9] + +top_level + : lr_or + ; + +lr_or + : not_lr_or TOKEN_OR lr_or + | not_lr_or + ; + +not_lr_or + : lr_add + ; + +lr_add + : not_lr_add lr_add + | not_lr_add + ; + +not_lr_add + : lr_mul + ; + +lr_mul + : not_lr_mul TOKEN_STAR + | not_lr_mul TOKEN_PLUS + | not_lr_mul TOKEN_QUESTION_MARK + | not_lr_mul mul_range + | not_lr_mul + ; + +not_lr_mul + : not_lr + ; + +not_lr + : parentheses + | literal + | group + ; + +mul_range + : TOKEN_LBRACE dec_int TOKEN_COMMA dec_int TOKEN_RBRACE + ; + +parentheses + : TOKEN_LPAREN top_level TOKEN_RPAREN + ; + +literal + : absolute_literal + | TOKEN_ESCAPE TOKEN_SPECIAL + | TOKEN_PLAIN + ; + +group + : TOKEN_LBRACKET group_contents TOKEN_RBRACKET + ; + +group_contents + : group_element group_contents + | group_element + ; + +group_element + : group_range + | group_literal + ; + +group_literal + : absolute_literal + | TOKEN_ESCAPE TOKEN_GROUP_SPECIAL + | TOKEN_GROUP_PLAIN + ; + +group_range + : group_literal TOKEN_DASH group_literal + ; + +absolute_literal + : TOKEN_ESCAPE TOKEN_X hex_byte + | TOKEN_ESCAPE TOKEN_U hex_int + | TOKEN_ESCAPE TOKEN_T + | TOKEN_ESCAPE TOKEN_N + | TOKEN_ESCAPE TOKEN_R + ; + +hex_byte + : TOKEN_HEX_DIGIT TOKEN_HEX_DIGIT + ; + +hex_int + : hex_byte hex_byte hex_byte hex_byte + ; + +dec_int + : TOKEN_DEC_DIGIT + | TOKEN_DEC_DIGIT dec_int + ; +``` ## Dependencies - gcc-c++ @@ -52,5 +189,7 @@ Each of these directories has a `Makefile` that puts 
stuff in a `bin/` folder. - http://stackoverflow.com/questions/3551733/llvm-automatic-c-linking - http://stackoverflow.com/questions/4425797/linking-llvm-jit-code-to-external-c-functions - http://stackoverflow.com/questions/14307906/c-llvm-class-functionality +- http://swtch.com/~rsc/regexp/regexp1.html +- http://stackoverflow.com/questions/2245962/is-there-an-alternative-for-flex-bison-that-is-usable-on-8-bit-embedded-systems [1]: https://github.com/Raekye/bdel_and_dfr_compiler diff --git a/primed/src/dfa.cpp b/primed/src/dfa.cpp new file mode 100644 index 0000000..a33db3f --- /dev/null +++ b/primed/src/dfa.cpp @@ -0,0 +1 @@ +#include "dfa.h" diff --git a/primed/src/dfa.h b/primed/src/dfa.h new file mode 100644 index 0000000..64c7f59 --- /dev/null +++ b/primed/src/dfa.h @@ -0,0 +1,12 @@ +#ifndef PRIMED_DFA_H_INCLUDED +#define PRIMED_DFA_H_INCLUDED + +#include + +template class DFAState { +public: + std::map*>* link; + bool terminal; +}; + +#endif /* PRIMED_DFA_H_INCLUDED */ diff --git a/primed/src/driver.cpp b/primed/src/driver.cpp index 2cf91ac..e616928 100644 --- a/primed/src/driver.cpp +++ b/primed/src/driver.cpp @@ -5,9 +5,9 @@ int main() { Lexer l; - l.add_rule(Rule("rule1", "abc[def]*", "tag1")); + l.add_rule(Rule("rule1", "a(xyzb)*c|def", "tag1")); std::stringstream ss; - ss << "abcfefefee"; + ss << "a"; Token* t = l.scan(&ss); l.print_states(); if (t) { diff --git a/primed/src/lexer.cpp b/primed/src/lexer.cpp index 1b01860..c39f78c 100644 --- a/primed/src/lexer.cpp +++ b/primed/src/lexer.cpp @@ -15,8 +15,7 @@ void Lexer::generate() { this->generation_parent_states_stack.push(root); std::cout << "=== Rules" << std::endl; for (int32_t i = 0; i < this->rules.size(); i++) { - RegexASTChain* regex = this->regex_parser.parse(this->rules[i].pattern); - this->generation_regex_chain_end = regex->sequence->back(); + RegexAST* regex = this->regex_parser.parse(this->rules[i].pattern); this->generation_terminal_tag = this->rules[i].tag; RegexASTPrinter a; a.indents = 
1; @@ -131,7 +130,7 @@ int32_t Lexer::generation_new_state() { void Lexer::print_states() { std::cout << "=== States" << std::endl; for (int32_t i = 0; i < this->states.size(); i++) { - std::cout << "State " << i << ": " << this->states[i]->tag; + std::cout << "State " << i << (this->states[i]->is_terminal() ? "(end)" : "") << ": " << this->states[i]->tag; for (std::map>::iterator it = this->states[i]->next_states.begin(); it != this->states[i]->next_states.end(); it++) { std::cout << ", " << (char) it->first << " ->"; for (int32_t j = 0; j < it->second.size(); j++) { diff --git a/primed/src/regex.cpp b/primed/src/regex.cpp index 5f39d74..9e8f03c 100644 --- a/primed/src/regex.cpp +++ b/primed/src/regex.cpp @@ -2,6 +2,8 @@ #include #include +#pragma mark - RegexAST + RegexASTChain::RegexASTChain(std::vector* sequence) { this->sequence = sequence; } @@ -21,7 +23,7 @@ RegexASTMultiplication::RegexASTMultiplication(RegexAST* node, int32_t min, int3 this->max = max; } -RegexASTRange::RegexASTRange(int32_t lower, int32_t upper) { +RegexASTRange::RegexASTRange(uint32_t lower, uint32_t upper) { this->lower = lower; this->upper = upper; } @@ -74,265 +76,292 @@ void RegexASTRange::accept(IRegexASTVisitor* visitor) { visitor->visit(this); } -#pragma mark - Regex parser +#pragma mark - RegexParser RegexParser::RegexParser() { return; } -RegexASTChain* RegexParser::parse(std::string str) { +int32_t RegexParser::buffer_pos() { + return this->pos.top(); +} + +void RegexParser::buffer_advance(int32_t delta) { + this->pos.push(this->buffer_pos() + delta); +} + +uint32_t RegexParser::buffer_char(int32_t delta) { + if (this->buffer_pos() + delta >= this->buffer.size()) { + return 0; + } + return this->buffer[this->buffer_pos() + delta]; +} + +void RegexParser::buffer_push(int32_t loc) { + this->pos.push(loc); +} + +int32_t RegexParser::buffer_pop(int32_t times) { + int32_t popped = this->buffer_pos(); + for (int32_t i = 0; i < times; i++) { + this->pos.pop(); + } + return popped; +} 
+ +#pragma mark - RegexParser - parsing +RegexAST* RegexParser::parse(std::string str) { this->buffer = str; this->pos = std::stack(); this->pos.push(0); - RegexASTChain* regex = this->parse_chain(); + RegexAST* regex = this->parse_toplevel(); + if (!regex) { + return NULL; + } if (this->buffer_pos() != str.length()) { delete regex; return NULL; } - return regex; -} - -RegexASTChain* RegexParser::parse_chain() { - std::vector* sequence = new std::vector(); - RegexAST* first = this->parse_toplevel(); - if (first) { - sequence->push_back(first); - while (RegexAST* node = this->parse_toplevel()) { - sequence->push_back(node); - } - } else { - delete sequence; - return NULL; + this->pos.pop(); + if (this->pos.size() != 1) { + delete regex; + throw std::runtime_error("RegexParser did not finish with pos stack 1"); } - return new RegexASTChain(sequence); + return regex; } RegexAST* RegexParser::parse_toplevel() { - RegexAST* node = NULL; - if ((node = this->parse_multiplication())) { - return node; - } else if ((node = this->parse_or())) { - return node; - } else if ((node = this->parse_toplevel_nonrecursive())) { - return node; - } - return NULL; + return this->parse_lr_or(); } -RegexAST* RegexParser::parse_toplevel_nonrecursive() { - RegexAST* node = NULL; - if ((node = this->parse_parenthesis())) { - return node; - } else if ((node = RegexParser::parse_literal())) { - return node; - } else if ((node = RegexParser::parse_group())) { - return node; - } - return NULL; -} - -RegexAST* RegexParser::parse_parenthesis() { - if (this->buffer_char() != '(') { +RegexAST* RegexParser::parse_lr_or() { + RegexAST* l = this->parse_not_lr_or(); + if (!l) { return NULL; } - this->buffer_advance(1); - RegexAST* node = this->parse_chain(); - if (!node) { - this->buffer_pop(1); - return NULL; + if (this->buffer_char() != RegexParser::TOKEN_OR) { + return l; } - if (this->buffer_char() != ')') { - delete node; + this->buffer_advance(1); + RegexAST* r = this->parse_lr_or(); + if (!r) { + 
delete l; this->buffer_pop(2); return NULL; } - this->buffer_push(this->buffer_pop(2) + 1); - return node; + this->buffer_push(this->buffer_pop(3)); + return new RegexASTOr(l, r); } -RegexAST* RegexParser::parse_literal() { - int32_t ch = this->buffer_char(); - if (ch == 0) { +RegexAST* RegexParser::parse_not_lr_or() { + return this->parse_lr_add(); +} + +RegexAST* RegexParser::parse_lr_add() { + RegexAST* car = this->parse_not_lr_add(); + if (!car) { return NULL; } - if (ch == '\\') { - ch = this->buffer_char(1); - if (RegexParser::is_special_char(ch)) { - this->buffer_advance(2); - return new RegexASTLiteral(ch); + std::vector* chain = new std::vector(); + chain->push_back(car); + while (true) { + RegexAST* next = this->parse_not_lr_add(); + if (!next) { + break; } + chain->push_back(next); + this->buffer_push(this->buffer_pop(2)); } - if (RegexParser::is_special_char(ch)) { - return NULL; + return new RegexASTChain(chain); +} + +RegexAST* RegexParser::parse_not_lr_add() { + return this->parse_lr_mul(); +} + +RegexAST* RegexParser::parse_lr_mul() { + RegexAST* l = this->parse_not_lr_mul(); + uint32_t ch = this->buffer_char(); + if (ch == RegexParser::TOKEN_STAR) { + this->buffer_advance(1); + this->buffer_push(this->buffer_pop(2)); + return new RegexASTMultiplication(l, 0, 0); + } else if (ch == RegexParser::TOKEN_PLUS) { + this->buffer_advance(1); + this->buffer_push(this->buffer_pop(2)); + return new RegexASTMultiplication(l, 1, 0); + } else if (ch == RegexParser::TOKEN_QUESTION_MARK) { + this->buffer_advance(1); + this->buffer_push(this->buffer_pop(2)); + return new RegexASTMultiplication(l, 0, 1); } - this->buffer_advance(1); - return new RegexASTLiteral(ch); + std::tuple* range = this->parse_mul_range(); + if (!range) { + return l; + } + this->buffer_push(this->buffer_pop(2)); + RegexAST* r = new RegexASTMultiplication(l, std::get<0>(*range), std::get<1>(*range)); + delete range; + return r; } -RegexAST* RegexParser::parse_or() { - RegexAST* left = 
this->parse_toplevel_nonrecursive(); - if (!left) { +std::tuple* RegexParser::parse_mul_range() { + if (this->buffer_char() != '{') { return NULL; } - if (this->buffer_char() != '|') { - delete left; + this->buffer_advance(1); + uint32_t* lower = this->parse_dec_int(); + if (!lower) { this->buffer_pop(1); return NULL; } - this->buffer_advance(1); - RegexAST* right = this->parse_toplevel(); - if (!right) { - delete left; + if (this->buffer_char() != ',') { this->buffer_pop(2); return NULL; } - this->buffer_push(this->buffer_pop(3)); - return new RegexASTOr(left, right); -} - -RegexAST* RegexParser::parse_multiplication() { - RegexAST* node = this->parse_toplevel_nonrecursive(); - if (!node) { + this->buffer_advance(1); + uint32_t* upper = this->parse_dec_int(); + if (!upper) { + delete lower; + this->buffer_pop(3); return NULL; } - if (this->buffer_char() == '*') { - this->buffer_push(this->buffer_pop(1) + 1); - return new RegexASTMultiplication(node, 0, 0); - } else if (this->buffer_char() == '?') { - this->buffer_push(this->buffer_pop(1) + 1); - return new RegexASTMultiplication(node, 0, 1); - } else if (this->buffer_char() == '+') { - this->buffer_push(this->buffer_pop(1) + 1); - return new RegexASTMultiplication(node, 1, 0); - } else { - if (std::tuple* range = this->parse_multiplication_range()) { - this->buffer_push(this->buffer_pop(2)); - RegexAST* node_prime = new RegexASTMultiplication(node, std::get<0>(*range), std::get<1>(*range)); - delete range; - return node_prime; - } + if (this->buffer_char() != '}') { + delete lower; + delete upper; + this->buffer_pop(4); + return NULL; + } + this->buffer_push(this->buffer_pop(4) + 1); + std::tuple* range = new std::tuple(*lower, *upper); + delete lower; + delete upper; + return range; +} + +RegexAST* RegexParser::parse_not_lr_mul() { + return this->parse_not_lr(); +} + +RegexAST* RegexParser::parse_not_lr() { + if (RegexAST* r = this->parse_parentheses()) { + return r; + } else if (RegexAST* r = 
this->parse_literal()) { + return r; + } else if (RegexAST* r = this->parse_group()) { + return r; } - delete node; - this->buffer_pop(1); return NULL; } -RegexAST* RegexParser::parse_group() { - if (this->buffer_char() != '[') { +RegexAST* RegexParser::parse_parentheses() { + if (this->buffer_char() != '(') { return NULL; } this->buffer_advance(1); - RegexAST* first = this->parse_group_element(); - if (!first) { + RegexAST* node = this->parse_toplevel(); + if (!node) { this->buffer_pop(1); return NULL; } - RegexAST* second = this->parse_group_element(); - if (!second) { - if (this->buffer_char() != ']') { - delete first; - this->buffer_pop(2); - return NULL; - } - this->buffer_push(this->buffer_pop(2) + 1); - return first; - } - int32_t delta = 3; - RegexASTOr* aggregation = new RegexASTOr(first, second); - RegexASTOr* tail = aggregation; - while (RegexAST* node = this->parse_group_element()) { - RegexASTOr* tail_prime = new RegexASTOr(tail->right, node); - tail->right = tail_prime; - tail = tail_prime; - delta++; + if (this->buffer_char() != ')') { + delete node; + this->buffer_pop(2); + return NULL; + } + this->buffer_push(this->buffer_pop(2) + 1); + return node; +} + +RegexAST* RegexParser::parse_literal() { + uint32_t* x = this->parse_absolute_literal(); + if (x) { + RegexAST* r = new RegexASTLiteral(*x); + delete x; + return r; } - if (this->buffer_char() != ']') { - delete aggregation; - this->buffer_pop(delta); + int32_t ch = this->buffer_char(); + if (ch == RegexParser::TOKEN_ESCAPE) { + ch = this->buffer_char(1); + if (RegexParser::is_special_char(ch)) { + this->buffer_advance(2); + return new RegexASTLiteral(ch); + } return NULL; } - this->buffer_push(this->buffer_pop(delta) + 1); - return aggregation; + if (!RegexParser::is_special_char(ch) && 32 <= ch && ch < 127) { + this->buffer_advance(1); + return new RegexASTLiteral(ch); + } + return NULL; } -std::tuple* RegexParser::parse_multiplication_range() { - if (this->buffer_char() != '{') { +RegexAST* 
RegexParser::parse_group() { + if (this->buffer_char() != RegexParser::TOKEN_LBRACKET) { return NULL; } this->buffer_advance(1); - int32_t lower = this->parse_number(); - if (lower < 0) { + RegexAST* contents = this->parse_group_contents(); + if (!contents) { this->buffer_pop(1); return NULL; } - if (this->buffer_char() != ',') { + if (this->buffer_char() != RegexParser::TOKEN_RBRACKET) { + delete contents; this->buffer_pop(2); return NULL; } this->buffer_advance(1); - int32_t upper = this->parse_number(); - if (upper < 0) { - this->buffer_pop(3); - return NULL; - } - if (this->buffer_char() != '}') { - this->buffer_pop(4); - return NULL; - } - this->buffer_push(this->buffer_pop(4) + 1); - return new std::tuple(lower, upper); + this->buffer_push(this->buffer_pop(3)); + return contents; } -int32_t RegexParser::parse_number() { - int32_t ch = this->buffer_char(); - if (!std::isdigit(ch)) { - return -1; +RegexAST* RegexParser::parse_group_contents() { + RegexAST* car = this->parse_group_element(); + if (!car) { + return NULL; } - int32_t delta = 1; - int32_t x = ch - '0'; - while (true) { - int32_t ch = this->buffer_char(delta); - if (!std::isdigit(ch)) { - break; - } - x = x * 10 + (ch - '0'); - delta++; + RegexAST* cdr = this->parse_group_contents(); + if (!cdr) { + return car; } - this->buffer_advance(delta); - return x; + this->buffer_push(this->buffer_pop(2)); + return new RegexASTOr(car, cdr); } RegexAST* RegexParser::parse_group_element() { if (RegexAST* node = this->parse_group_range()) { return node; - } else if (int32_t* x = this->parse_group_literal()) { - RegexAST* node = new RegexASTLiteral(*x); - delete x; - return node; + } else if (uint32_t* x= this->parse_group_literal()) { + RegexAST* r =new RegexASTLiteral(*x); + delete x; + return r; } return NULL; } -int32_t* RegexParser::parse_group_literal() { - int32_t ch = this->buffer_char(); - if (ch == 0) { - return NULL; +uint32_t* RegexParser::parse_group_literal() { + uint32_t* l = 
this->parse_absolute_literal(); + if (l) { + return l; } - if (ch == '\\') { + int32_t ch = this->buffer_char(); + if (ch == RegexParser::TOKEN_ESCAPE) { ch = this->buffer_char(1); - if (ch == '-' || ch == ']') { + if (RegexParser::is_group_special_char(ch)) { this->buffer_advance(2); - return new int32_t(ch); + return new uint32_t(ch); } - } - if (ch == '-' || ch == ']') { return NULL; } - this->buffer_advance(1); - return new int32_t(ch); + if (!RegexParser::is_group_special_char(ch) && 32 <= ch && ch < 127) { + this->buffer_advance(1); + return new uint32_t(ch); + } + return NULL; } RegexAST* RegexParser::parse_group_range() { - int32_t* lower = this->parse_group_literal(); + uint32_t* lower = this->parse_group_literal(); if (!lower) { return NULL; } @@ -342,7 +371,7 @@ RegexAST* RegexParser::parse_group_range() { return NULL; } this->buffer_advance(1); - int32_t* upper = this->parse_group_literal(); + uint32_t* upper = this->parse_group_literal(); if (!upper) { delete lower; this->buffer_pop(2); @@ -355,43 +384,179 @@ RegexAST* RegexParser::parse_group_range() { return node; } -int32_t RegexParser::buffer_pos() { - return this->pos.top(); +uint32_t* RegexParser::parse_absolute_literal() { + if (this->buffer_char() != RegexParser::TOKEN_ESCAPE) { + return NULL; + } + this->buffer_advance(1); + uint32_t ch = this->buffer_char(); + if (ch == RegexParser::TOKEN_X) { + this->buffer_advance(1); + uint32_t* x = this->parse_hex_byte(); + if (!x) { + this->buffer_pop(2); + return NULL; + } + this->buffer_push(this->buffer_pop(2)); + return x; + } else if (ch == RegexParser::TOKEN_U) { + this->buffer_advance(1); + uint32_t* x = this->parse_hex_int(); + if (!x) { + this->buffer_pop(2); + return NULL; + } + this->buffer_push(this->buffer_pop(2)); + return x; + } else if (ch == RegexParser::TOKEN_T) { + this->buffer_advance(1); + this->buffer_push(this->buffer_pop(2)); + return new uint32_t('\t'); + } else if (ch == RegexParser::TOKEN_N) { + this->buffer_advance(1); + 
this->buffer_push(this->buffer_pop(2)); + return new uint32_t('\n'); + } else if (ch == RegexParser::TOKEN_R) { + this->buffer_advance(1); + this->buffer_push(this->buffer_pop(2)); + return new uint32_t('\r'); + } + this->buffer_pop(1); + return NULL; } -void RegexParser::buffer_advance(int32_t delta) { - this->pos.push(this->pos.top() + delta); +uint32_t* RegexParser::parse_hex_byte() { + int32_t upper = this->buffer_char(); + if (!RegexParser::is_hex_digit(upper)) { + return NULL; + } + int32_t lower = this->buffer_char(1); + if (!RegexParser::is_hex_digit(lower)) { + return NULL; + } + uint32_t x = 0; + if (RegexParser::is_dec_digit(upper)) { + x = upper - '0'; + } else { + x = upper - 'a' + 10; + } + x <<= 4; + if (RegexParser::is_dec_digit(lower)) { + x |= lower - '0'; + } else { + x |= lower - 'a' + 10; + } + this->buffer_advance(2); + return new uint32_t(x); } -int32_t RegexParser::buffer_char(int32_t delta) { - if (this->pos.top() + delta >= this->buffer.size()) { - return 0; +uint32_t* RegexParser::parse_hex_int() { + uint32_t x = 0; + this->buffer_advance(0); + for (int32_t i = 0; i < 4; i++) { + uint32_t* b = this->parse_hex_byte(); + if (!b) { + this->buffer_pop(1); + return NULL; + } + x = (x << 8) + *b; + delete b; + this->buffer_push(this->buffer_pop(2)); } - return this->buffer[this->pos.top() + delta]; + return new uint32_t(x); } -void RegexParser::buffer_push(int32_t loc) { - this->pos.push(loc); +uint32_t* RegexParser::parse_dec_int() { + uint32_t x = 0; + int32_t delta = 0; + while (true) { + int32_t ch = this->buffer_char(delta); + if (!RegexParser::is_dec_digit(ch)) { + break; + } + x = x * 10 + (ch - '0'); + delta++; + } + this->buffer_advance(delta); + return new uint32_t(x); } -int32_t RegexParser::buffer_pop(int32_t times) { - int32_t popped = this->pos.top(); - for (int32_t i = 0; i < times; i++) { - this->pos.pop(); +bool RegexParser::is_special_char(uint32_t ch) { + return ch == TOKEN_ESCAPE + || ch == TOKEN_LPAREN + || ch == 
TOKEN_RPAREN + || ch == TOKEN_LBRACE + || ch == TOKEN_RBRACE + || ch == TOKEN_LBRACKET + || ch == TOKEN_RBRACKET + || ch == TOKEN_OR + || ch == TOKEN_STAR + || ch == TOKEN_PLUS + || ch == TOKEN_QUESTION_MARK; +} + +bool RegexParser::is_group_special_char(uint32_t ch) { + return ch == TOKEN_LBRACKET + || ch == TOKEN_DASH + || ch == TOKEN_RBRACKET + || ch == TOKEN_ESCAPE; +} + +bool RegexParser::is_hex_digit(uint32_t ch) { + return ('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'f'); +} + +bool RegexParser::is_dec_digit(uint32_t ch) { + return ('0' <= ch && ch <= '9'); +} + +#pragma mark - RegexDFAGenerator + +void RegexDFAGenerator::visit(RegexASTChain* node) { + DFAState* saved_root = this->root; + node->sequence->front()->accept(this); + this->root = this->ret; + for (int32_t i = 1; i < node->sequence->size(); i++) { + node->sequence->operator[](i)->accept(this); + this->root = this->ret; } - return popped; + this->root = saved_root; + // ret keeps last value } -bool RegexParser::is_special_char(int32_t ch) { - return ch == '\\' - || ch == '[' - || ch == ']' - || ch == '(' - || ch == ')' - || ch == '{' - || ch == '}' - || ch == '|' - || ch == '*' - || ch == '+' - || ch == '?'; +void RegexDFAGenerator::visit(RegexASTLiteral* node) { + std::map*>::iterator it = this->root->link->find(node->ch); + if (it == this->root->link->end()) { + this->ret = new DFAState(); + } else { + this->ret = it->second; + } +} + +void RegexDFAGenerator::visit(RegexASTOr* node) { + node->left->accept(this); + DFAState* saved_left_ret = this->ret; + node->right->accept(this); + for (std::map*>::iterator it = saved_left_ret->link->begin(); it != saved_left_ret->link->end(); it++) { + std::map*>::iterator it2 = this->ret->link->find(it->first); + if (it2 == this->ret->link->end()) { + this->ret->link->operator[](it->first) = it->second; + } else { + if (it2->second != it->second) { + throw std::runtime_error("State badness"); + } + } + } + delete saved_left_ret->link; + saved_left_ret->link 
= this->ret->link; + // root unchanged + // ret keeps last generated value +} + +void RegexDFAGenerator::visit(RegexASTMultiplication* node) { + return; +} + +void RegexDFAGenerator::visit(RegexASTRange* node) { + return; } diff --git a/primed/src/regex.h b/primed/src/regex.h index db8f9bf..dbce2d5 100644 --- a/primed/src/regex.h +++ b/primed/src/regex.h @@ -6,34 +6,65 @@ #include #include #include +#include "dfa.h" class RegexAST; -class RegexASTChain; class IRegexASTVisitor; class RegexParser { private: - RegexASTChain* parse_chain(); RegexAST* parse_toplevel(); - RegexAST* parse_toplevel_nonrecursive(); - RegexAST* parse_parenthesis(); + RegexAST* parse_lr_or(); + RegexAST* parse_not_lr_or(); + RegexAST* parse_lr_add(); + RegexAST* parse_not_lr_add(); + RegexAST* parse_lr_mul(); + RegexAST* parse_not_lr_mul(); + RegexAST* parse_not_lr(); + RegexAST* parse_parentheses(); RegexAST* parse_literal(); - RegexAST* parse_or(); - RegexAST* parse_multiplication(); RegexAST* parse_group(); - std::tuple* parse_multiplication_range(); - int32_t parse_number(); + uint32_t* parse_absolute_literal(); + + std::tuple* parse_mul_range(); + + RegexAST* parse_group_contents(); RegexAST* parse_group_element(); - int32_t* parse_group_literal(); RegexAST* parse_group_range(); + uint32_t* parse_group_literal(); + + uint32_t* parse_hex_byte(); + uint32_t* parse_hex_int(); + uint32_t* parse_dec_int(); int32_t buffer_pos(); void buffer_advance(int32_t); - int32_t buffer_char(int32_t = 0); + uint32_t buffer_char(int32_t = 0); void buffer_push(int32_t); int32_t buffer_pop(int32_t); - static bool is_special_char(int32_t ch); + static bool is_special_char(uint32_t); + static bool is_group_special_char(uint32_t); + static bool is_hex_digit(uint32_t); + static bool is_dec_digit(uint32_t); + + static const int32_t TOKEN_STAR = '*'; + static const int32_t TOKEN_PLUS = '+'; + static const int32_t TOKEN_QUESTION_MARK = '?'; + static const int32_t TOKEN_OR = '|'; + static const int32_t TOKEN_DASH = 
'-'; + static const int32_t TOKEN_ESCAPE = '\\'; + static const int32_t TOKEN_LPAREN = '('; + static const int32_t TOKEN_RPAREN = ')'; + static const int32_t TOKEN_LBRACE = '{'; + static const int32_t TOKEN_RBRACE = '}'; + static const int32_t TOKEN_LBRACKET = '['; + static const int32_t TOKEN_RBRACKET = ']'; + static const int32_t TOKEN_X = 'x'; + static const int32_t TOKEN_U = 'u'; + static const int32_t TOKEN_T = 't'; + static const int32_t TOKEN_N = 'n'; + static const int32_t TOKEN_R = 'r'; public: std::string buffer; @@ -41,7 +72,7 @@ class RegexParser { RegexParser(); - RegexASTChain* parse(std::string); + RegexAST* parse(std::string); }; class RegexAST { @@ -61,7 +92,7 @@ class RegexASTChain : public RegexAST { class RegexASTLiteral : public RegexAST { public: - int32_t ch; + uint32_t ch; RegexASTLiteral(int32_t); virtual ~RegexASTLiteral(); @@ -93,10 +124,10 @@ class RegexASTMultiplication : public RegexAST { class RegexASTRange : public RegexAST { public: - int32_t lower; - int32_t upper; + uint32_t lower; + uint32_t upper; - RegexASTRange(int32_t, int32_t); + RegexASTRange(uint32_t, uint32_t); virtual ~RegexASTRange(); virtual void accept(IRegexASTVisitor*) override; }; @@ -110,6 +141,16 @@ class IRegexASTVisitor { virtual void visit(RegexASTRange*) = 0; }; +class RegexDFAGenerator : public IRegexASTVisitor { + DFAState* root; + DFAState* ret; + void visit(RegexASTChain*) override; + void visit(RegexASTLiteral*) override; + void visit(RegexASTOr*) override; + void visit(RegexASTMultiplication*) override; + void visit(RegexASTRange*) override; +}; + class RegexASTPrinter : public IRegexASTVisitor { public: int32_t indents = 0;