From 928a696db5d47ce950ba9c56f366e0fe81da5639 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 19 Oct 2023 13:34:02 -0400 Subject: [PATCH] Add error code for unsupported PCRE cases (RE_EUNSUPPPCRE), reject one. There are extra error codes in #440 for regexes that aren't UNSATISFIABLE per se, but depend on particular corner cases in PCRE that probably aren't worth supporting in an automata-based implementation. Add a test case for one, tests/pcre/in48.re: ^a|$[^x]b* This is a tricky one to handle properly; according to PCRE it should match either "a" OR "\n", but nothing else. The newline match is because $ is a non-input-consuming check that evaluation is either at the end of input, or at a newline immediately before the end. In this case `$[^x]b*` matches exactly one newline; it's equivalent to "$\n". This probably isn't worth supporting, but we can detect cases where a potential newline match appears after a $ and reject them as unsupported PCRE behavior. --- include/re/re.h | 2 + src/libre/ast.h | 1 + src/libre/ast_analysis.c | 103 +++++++++++++++++++++++++++++++++++++-- src/libre/ast_analysis.h | 4 +- src/libre/print/tree.c | 1 + src/libre/re.c | 10 +++- src/libre/strerror.c | 2 + tests/pcre/in48.re | 1 + tests/pcre/out48.err | 1 + 9 files changed, 118 insertions(+), 7 deletions(-) create mode 100644 tests/pcre/in48.re create mode 100644 tests/pcre/out48.err diff --git a/include/re/re.h b/include/re/re.h index deab6caed..f876f635a 100644 --- a/include/re/re.h +++ b/include/re/re.h @@ -46,6 +46,8 @@ enum re_errno { RE_EERRNO = 1 | RE_MISC, RE_EBADDIALECT = 2 | RE_MISC, RE_EBADGROUP = 3 | RE_MISC, + RE_EUNSUPCAPTUR = 4 | RE_MISC, + RE_EUNSUPPPCRE = 5 | RE_MISC, RE_ENEGRANGE = 0 | RE_MARK | RE_GROUP, RE_ENEGCOUNT = 1 | RE_MARK | RE_GROUP, diff --git a/src/libre/ast.h b/src/libre/ast.h index 233744847..24031887e 100644 --- a/src/libre/ast.h +++ b/src/libre/ast.h @@ -98,6 +98,7 @@ enum ast_flags { AST_FLAG_ANCHORED_START = 1 << 6, AST_FLAG_ANCHORED_END = 1 << 
7, AST_FLAG_END_NL = 1 << 8, + AST_FLAG_MATCHES_1NEWLINE= 1 << 9, AST_FLAG_NONE = 0x00 }; diff --git a/src/libre/ast_analysis.c b/src/libre/ast_analysis.c index 3298f62c2..e3e85526a 100644 --- a/src/libre/ast_analysis.c +++ b/src/libre/ast_analysis.c @@ -552,6 +552,46 @@ set_flags_subtree(struct ast_expr *n, enum ast_flags flags) } } +static int +can_consume_single_newline(struct ast_expr *n) +{ + if (!can_consume_input(n)) { return 0; } + + if (n->flags & AST_FLAG_MATCHES_1NEWLINE) { return 1; } + + switch (n->type) { + case AST_EXPR_LITERAL: + return n->u.literal.c == '\n'; + + case AST_EXPR_CODEPOINT: + return n->u.codepoint.u == (uint32_t)'\n'; + + case AST_EXPR_RANGE: + if ((n->u.range.from.type == AST_ENDPOINT_LITERAL) && + (n->u.range.to.type == AST_ENDPOINT_LITERAL)) { + return n->u.range.from.u.literal.c <= '\n' + && n->u.range.to.u.literal.c >= '\n'; + } else if ((n->u.range.from.type == AST_ENDPOINT_CODEPOINT) && + (n->u.range.to.type == AST_ENDPOINT_CODEPOINT)) { + return n->u.range.from.u.codepoint.u <= '\n' + && n->u.range.to.u.codepoint.u >= '\n'; + } else if (n->u.range.from.type == AST_ENDPOINT_NAMED) { + /* TODO: unreachable? */ + break; + } + break; + + case AST_EXPR_SUBTRACT: + return can_consume_single_newline(n->u.subtract.a) + && !can_consume_single_newline(n->u.subtract.b); + + default: + break; + } + + return 0; +} + struct anchoring_env { enum re_flags re_flags; @@ -562,6 +602,7 @@ struct anchoring_env { /* Corresponding flag for end anchors while sweeping backward. 
*/ int followed_by_consuming; + int followed_by_consuming_newline; int before_start_anchor; }; @@ -644,6 +685,9 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) case AST_EXPR_LITERAL: case AST_EXPR_CODEPOINT: case AST_EXPR_RANGE: + if (can_consume_single_newline(n)) { + set_flags(n, AST_FLAG_MATCHES_1NEWLINE); + } break; /* handled outside switch/case */ case AST_EXPR_CONCAT: { @@ -810,6 +854,13 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) } } + for (i = 0; i < n->u.concat.count; i++) { + struct ast_expr *child = n->u.concat.n[i]; + if (can_consume_single_newline(child)) { + set_flags(n, AST_FLAG_MATCHES_1NEWLINE); + } + } + break; } @@ -846,6 +897,9 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) } else if (res == AST_ANALYSIS_OK) { all_set_past_always_consuming &= child_env.past_always_consuming; any_sat = 1; + } else if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE + || res == AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE) { + continue; } else { return res; } @@ -858,6 +912,10 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) all_end_anchored = 0; } } + + if (child->flags & AST_FLAG_MATCHES_1NEWLINE) { + set_flags(n, AST_FLAG_MATCHES_1NEWLINE); + } } if (!env->past_always_consuming && all_set_past_always_consuming) { @@ -925,6 +983,10 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) return res; } + if (can_consume_single_newline(n->u.repeat.e)) { + set_flags(n, AST_FLAG_MATCHES_1NEWLINE); + } + if (n->u.repeat.e->flags & AST_FLAG_ANCHORED_END && n->u.repeat.min > 0) { /* FIXME: if repeating something that is always * anchored at the end, repeat.max could be @@ -964,6 +1026,11 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) } while(0) PROPAGATE_CHILD_FLAGS("GROUP", n, n->u.group.e); + + if (n->u.group.e->flags & AST_FLAG_MATCHES_1NEWLINE) { + set_flags(n, AST_FLAG_MATCHES_1NEWLINE); + } + break; case AST_EXPR_SUBTRACT: 
@@ -991,6 +1058,10 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n) } return res; } + if (can_consume_single_newline(n->u.repeat.e)) { + set_flags(n, AST_FLAG_MATCHES_1NEWLINE); + } + break; default: @@ -1048,11 +1119,18 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) assert(n->flags & AST_FLAG_ANCHORED_END); if (env->followed_by_consuming) { - LOG(3 - LOG_ANCHORING, - "%s: END anchor & followed_by_consuming, setting UNSATISFIABLE\n", - __func__); - set_flags(n, AST_FLAG_UNSATISFIABLE); - return AST_ANALYSIS_UNSATISFIABLE; + if (env->followed_by_consuming_newline) { + LOG(3 - LOG_ANCHORING, + "%s: END anchor & followed_by_consuming, returning UNSUPPORTED_PCRE\n", + __func__); + return AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE; + } else { + LOG(3 - LOG_ANCHORING, + "%s: END anchor & followed_by_consuming, setting UNSATISFIABLE\n", + __func__); + set_flags(n, AST_FLAG_UNSATISFIABLE); + return AST_ANALYSIS_UNSATISFIABLE; + } } break; @@ -1113,6 +1191,8 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) set_flags(n, AST_FLAG_UNSATISFIABLE); } } else if (res != AST_ANALYSIS_OK) { + LOG(3 - LOG_ANCHORING, + "%s: CONCAT: got res of %d, bubbling up\n", __func__, res); return res; } @@ -1128,6 +1208,15 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) env->followed_by_consuming = 1; } + if (!env->followed_by_consuming_newline && + (child_env.followed_by_consuming_newline + || child->flags & AST_FLAG_MATCHES_1NEWLINE)) { + LOG(3 - LOG_ANCHORING, + "%s: setting followed_by_consuming_newline due to child %p's analysis\n", + __func__, (void *)child); + env->followed_by_consuming_newline = 1; + } + if (!env->before_start_anchor && child_env.before_start_anchor && !is_nullable(child)) { LOG(3 - LOG_ANCHORING, @@ -1169,6 +1258,10 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n) all_set_followed_by_consuming &= 
child_env.followed_by_consuming; all_set_before_start_anchor &= child_env.before_start_anchor; any_sat = 1; + } else if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE + || res == AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE) { + LOG(3 - LOG_ANCHORING, "%s: got res of UNSUPPORTED_*, bubbling up\n", __func__); + return res; } else { return res; } diff --git a/src/libre/ast_analysis.h b/src/libre/ast_analysis.h index f9ec8ebbb..5390cce57 100644 --- a/src/libre/ast_analysis.h +++ b/src/libre/ast_analysis.h @@ -30,7 +30,9 @@ enum ast_analysis_res { AST_ANALYSIS_UNSATISFIABLE, AST_ANALYSIS_ERROR_NULL = -1, - AST_ANALYSIS_ERROR_MEMORY = -2 + AST_ANALYSIS_ERROR_MEMORY = -2, + AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE = -3, + AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE = -4 }; enum ast_analysis_res diff --git a/src/libre/print/tree.c b/src/libre/print/tree.c index 5d2f78691..ccf50cb23 100644 --- a/src/libre/print/tree.c +++ b/src/libre/print/tree.c @@ -62,6 +62,7 @@ fprintf_flags(FILE *f, enum ast_flags flags) PR_FLAG(END_NL, "N"); PR_FLAG(CAN_CONSUME, "c"); PR_FLAG(ALWAYS_CONSUMES, "C"); + PR_FLAG(MATCHES_1NEWLINE, "n"); #undef PR_FLAG diff --git a/src/libre/re.c b/src/libre/re.c index 6c423dc36..5e4604aa3 100644 --- a/src/libre/re.c +++ b/src/libre/re.c @@ -125,7 +125,15 @@ re_parse(enum re_dialect dialect, int (*getc)(void *opaque), void *opaque, if (res < 0) { ast_free(ast); - if (err != NULL) { err->e = RE_EERRNO; } + if (err != NULL) { + if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE) { + err->e = RE_EUNSUPPPCRE; + } else if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE) { + err->e = RE_EUNSUPCAPTUR; + } else if (err->e == RE_ESUCCESS) { + err->e = RE_EERRNO; + } + } return NULL; } diff --git a/src/libre/strerror.c b/src/libre/strerror.c index 009d61df2..d66e750a4 100644 --- a/src/libre/strerror.c +++ b/src/libre/strerror.c @@ -20,6 +20,8 @@ re_strerror(enum re_errno e) case RE_EERRNO: return strerror(errno); case RE_EBADDIALECT: return "Bad dialect"; case RE_EBADGROUP: return "Bad 
group"; + case RE_EUNSUPCAPTUR: return "Cannot support captures in this case"; + case RE_EUNSUPPPCRE: return "Unsupported PCRE edge case"; case RE_ENEGRANGE: return "Negative group range"; case RE_ENEGCOUNT: return "Negative count range"; diff --git a/tests/pcre/in48.re b/tests/pcre/in48.re new file mode 100644 index 000000000..15490c598 --- /dev/null +++ b/tests/pcre/in48.re @@ -0,0 +1 @@ +^a|$[^x]b* \ No newline at end of file diff --git a/tests/pcre/out48.err b/tests/pcre/out48.err new file mode 100644 index 000000000..b03d96d7e --- /dev/null +++ b/tests/pcre/out48.err @@ -0,0 +1 @@ +tests/pcre/in48.re: Unsupported PCRE edge case