diff --git a/src/libre/ast.h b/src/libre/ast.h
index 3ef0c1f5f..d217f3b9f 100644
--- a/src/libre/ast.h
+++ b/src/libre/ast.h
@@ -105,6 +105,7 @@ enum ast_flags {
 	AST_FLAG_ANCHORED_START = 1 << 6,
 	AST_FLAG_ANCHORED_END = 1 << 7,
 	AST_FLAG_END_NL = 1 << 8,
+	AST_FLAG_MATCHES_1NEWLINE= 1 << 9,
 
 	AST_FLAG_NONE = 0x00
 };
diff --git a/src/libre/ast_analysis.c b/src/libre/ast_analysis.c
index df9e8ce54..1964d9b54 100644
--- a/src/libre/ast_analysis.c
+++ b/src/libre/ast_analysis.c
@@ -814,6 +814,52 @@ set_flags_subtree(struct ast_expr *n, enum ast_flags flags)
 	}
 }
 
+/* Report whether n can match a single '\n' character.
+ *
+ * Returns 0 outright when can_consume_input(n) is false, and 1 when n
+ * already carries AST_FLAG_MATCHES_1NEWLINE. Otherwise only literals,
+ * codepoints, ranges, and subtractions (a, minus anything b matches)
+ * are inspected directly; every other node type yields 0. */
+static int
+can_consume_single_newline(struct ast_expr *n)
+{
+	if (!can_consume_input(n)) { return 0; }
+
+	if (n->flags & AST_FLAG_MATCHES_1NEWLINE) { return 1; }
+
+	switch (n->type) {
+	case AST_EXPR_LITERAL:
+		return n->u.literal.c == '\n';
+
+	case AST_EXPR_CODEPOINT:
+		return n->u.codepoint.u == (uint32_t)'\n';
+
+	case AST_EXPR_RANGE:
+		if ((n->u.range.from.type == AST_ENDPOINT_LITERAL) &&
+		    (n->u.range.to.type == AST_ENDPOINT_LITERAL)) {
+			return n->u.range.from.u.literal.c <= '\n'
+			    && n->u.range.to.u.literal.c >= '\n';
+		} else if ((n->u.range.from.type == AST_ENDPOINT_CODEPOINT) &&
+		    (n->u.range.to.type == AST_ENDPOINT_CODEPOINT)) {
+			return n->u.range.from.u.codepoint.u <= '\n'
+			    && n->u.range.to.u.codepoint.u >= '\n';
+		} else if (n->u.range.from.type == AST_ENDPOINT_NAMED) {
+			/* TODO: unreachable? */
+			break;
+		}
+		break;
+
+	case AST_EXPR_SUBTRACT:
+		return can_consume_single_newline(n->u.subtract.a)
+		    && !can_consume_single_newline(n->u.subtract.b);
+
+	default:
+		break;
+	}
+
+	return 0;
+}
+
 struct anchoring_env {
 	enum re_flags re_flags;
 
@@ -977,6 +1023,9 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n)
 		break;
 	case AST_EXPR_CODEPOINT:
 	case AST_EXPR_RANGE:
+		if (can_consume_single_newline(n)) {
+			set_flags(n, AST_FLAG_MATCHES_1NEWLINE);
+		}
 		break; /* handled outside switch/case */
 
 	case AST_EXPR_CONCAT: {
@@ -1145,6 +1194,14 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n)
 			}
 		}
 
+		for (i = 0; i < n->u.concat.count; i++) {
+			struct ast_expr *child = n->u.concat.n[i];
+			if (can_consume_single_newline(child)) {
+				set_flags(n, AST_FLAG_MATCHES_1NEWLINE);
+				break; /* one newline-matching child is enough */
+			}
+		}
+
 		break;
 	}
 
@@ -1183,7 +1240,6 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n)
 				any_sat = 1;
 			} else if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE
 			    || res == AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE) {
-				assert(child->flags & AST_FLAG_UNSATISFIABLE);
 				continue;
 			} else {
 				return res;
@@ -1197,6 +1253,10 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n)
 					all_end_anchored = 0;
 				}
 			}
+
+			if (child->flags & AST_FLAG_MATCHES_1NEWLINE) {
+				set_flags(n, AST_FLAG_MATCHES_1NEWLINE);
+			}
 		}
 
 		if (!env->past_always_consuming && all_set_past_always_consuming) {
@@ -1285,6 +1345,21 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n)
 				n->u.repeat.max = 1;
 			}
 		}
+
+		if (can_consume_single_newline(n->u.repeat.e)) {
+			set_flags(n, AST_FLAG_MATCHES_1NEWLINE);
+		}
+
+		if (n->u.repeat.e->flags & AST_FLAG_ANCHORED_END && n->u.repeat.min > 0) {
+			/* FIXME: if repeating something that is always
+			 * anchored at the end, repeat.max could be
+			 * capped at 1, but I have not yet found any
+			 * inputs where that change is necessary to
+			 * produce a correct result. */
+			LOG(3 - LOG_ANCHORING,
+			    "%s: REPEAT: repeating ANCHORED_END subtree >0 times -> ANCHORED_END\n", __func__);
+			set_flags(n, n->u.repeat.e->flags & END_ANCHOR_FLAG_MASK);
+		}
 		break;
 
 	case AST_EXPR_GROUP:
@@ -1302,13 +1377,18 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n)
 		}
 		if (res == AST_ANALYSIS_UNSATISFIABLE) {
 			LOG(3 - LOG_ANCHORING,
-			    "%s: GROUP: setting UNSATISFIABLE due to unsatisfiable childn",
+			    "%s: GROUP: setting UNSATISFIABLE due to unsatisfiable child\n",
 			    __func__);
 			set_flags(n, AST_FLAG_UNSATISFIABLE);
 		}
 		if (res != AST_ANALYSIS_OK) {
 			return res;
 		}
+
+		if (n->u.group.e->flags & AST_FLAG_MATCHES_1NEWLINE) {
+			set_flags(n, AST_FLAG_MATCHES_1NEWLINE);
+		}
+
 		break;
 
 	case AST_EXPR_SUBTRACT:
@@ -1345,6 +1425,11 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n)
 			}
 			return res;
 		}
+		/* ask about the SUBTRACT node itself so that operand b is honoured */
+		if (can_consume_single_newline(n)) {
+			set_flags(n, AST_FLAG_MATCHES_1NEWLINE);
+		}
+
 		break;
 
 	default:
@@ -1412,18 +1497,19 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n)
 			 * have reached it. */
 			set_flags(n, AST_FLAG_ANCHORED_END);
 
-			if (env->followed_by_consuming_newline) {
-				LOG(3 - LOG_ANCHORING,
-				    "%s: RANGE: rejecting possible newline match after $ as unsupported\n",
-				    __func__);
-				set_flags(n, AST_FLAG_UNSATISFIABLE);
-				return AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE;
-			} else if (env->followed_by_consuming) {
-				LOG(3 - LOG_ANCHORING,
-				    "%s: END anchor & followed_by_consuming, setting UNSATISFIABLE\n",
-				    __func__);
-				set_flags(n, AST_FLAG_UNSATISFIABLE);
-				return AST_ANALYSIS_UNSATISFIABLE;
+			if (env->followed_by_consuming) {
+				if (env->followed_by_consuming_newline) {
+					LOG(3 - LOG_ANCHORING,
+					    "%s: END anchor & followed_by_consuming, returning UNSUPPORTED_PCRE\n",
+					    __func__);
+					return AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE;
+				} else {
+					LOG(3 - LOG_ANCHORING,
+					    "%s: END anchor & followed_by_consuming, setting UNSATISFIABLE\n",
+					    __func__);
+					set_flags(n, AST_FLAG_UNSATISFIABLE);
+					return AST_ANALYSIS_UNSATISFIABLE;
+				}
 			}
 			break;
 
@@ -1484,7 +1570,8 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n)
 					set_flags(n, AST_FLAG_UNSATISFIABLE);
 				}
 			} else if (res != AST_ANALYSIS_OK) {
-				set_flags(n, AST_FLAG_UNSATISFIABLE);
+				LOG(3 - LOG_ANCHORING,
+				    "%s: CONCAT: got res of %d, bubbling up\n", __func__, res);
 				return res;
 			}
 
@@ -1500,6 +1587,15 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n)
 				env->followed_by_consuming = 1;
 			}
 
+			if (!env->followed_by_consuming_newline &&
+			    (child_env.followed_by_consuming_newline
+				|| child->flags & AST_FLAG_MATCHES_1NEWLINE)) {
+				LOG(3 - LOG_ANCHORING,
+				    "%s: setting followed_by_consuming_newline due to child %p's analysis\n",
+				    __func__, (void *)child);
+				env->followed_by_consuming_newline = 1;
+			}
+
 			if (!env->before_start_anchor && child_env.before_start_anchor
 			    && !is_nullable(child)) {
 				LOG(3 - LOG_ANCHORING,
@@ -1554,8 +1650,8 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n)
 				any_sat = 1;
 			} else if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE
 			    || res == AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE) {
-				assert(child->flags & AST_FLAG_UNSATISFIABLE);
-				continue;
+				LOG(3 - LOG_ANCHORING, "%s: got res of UNSUPPORTED_*, bubbling up\n", __func__);
+				return res;
 			} else {
 				return res;
 			}
diff --git a/src/libre/print/tree.c b/src/libre/print/tree.c
index 58e1d6050..1f4b673fe 100644
--- a/src/libre/print/tree.c
+++ b/src/libre/print/tree.c
@@ -62,6 +62,7 @@ fprintf_flags(FILE *f, enum ast_flags flags)
 	PR_FLAG(END_NL, "N");
 	PR_FLAG(CAN_CONSUME, "c");
 	PR_FLAG(ALWAYS_CONSUMES, "C");
+	PR_FLAG(MATCHES_1NEWLINE, "n");
 
 #undef PR_FLAG
 
diff --git a/tests/capture/captest.c b/tests/capture/captest.c
index bf453e3ad..4a27333c1 100644
--- a/tests/capture/captest.c
+++ b/tests/capture/captest.c
@@ -63,8 +63,14 @@ captest_run_case(const struct captest_case_single *testcase,
 	if (testcase->match == SHOULD_REJECT_AS_UNSUPPORTED) {
 		if (fsm != NULL) {
 			fsm_free(fsm);
+			if (verbosity > 0) {
+				printf("FAIL (expected UNSUPPORTED)\n");
+			}
 			return CAPTEST_RUN_CASE_FAIL;
 		}
+		if (verbosity > 0) {
+			printf("pass\n");
+		}
 		return CAPTEST_RUN_CASE_PASS;
 	}
 
diff --git a/tests/capture/capture_test_case_list.c b/tests/capture/capture_test_case_list.c
index 6b72d1018..738e6785a 100644
--- a/tests/capture/capture_test_case_list.c
+++ b/tests/capture/capture_test_case_list.c
@@ -1370,11 +1370,11 @@ const struct captest_case_single single_cases[] = {
 
 	{
 		.regex = "a|_$[^b]",
-		.input = "a",
-		.count = 1,
-		.expected = {
-			{ .pos = {0, 1}, },
-		},
+		.match = SHOULD_REJECT_AS_UNSUPPORTED,
+	},
+	{
+		.regex = "^a|$[^x]b*",
+		.match = SHOULD_REJECT_AS_UNSUPPORTED,
 	},
 
 	{
diff --git a/tests/pcre/in48.re b/tests/pcre/in48.re
new file mode 100644
index 000000000..15490c598
--- /dev/null
+++ b/tests/pcre/in48.re
@@ -0,0 +1 @@
+^a|$[^x]b*
\ No newline at end of file
diff --git a/tests/pcre/out48.err b/tests/pcre/out48.err
new file mode 100644
index 000000000..b03d96d7e
--- /dev/null
+++ b/tests/pcre/out48.err
@@ -0,0 +1 @@
+tests/pcre/in48.re: Unsupported PCRE edge case