Skip to content

Commit

Permalink
Add error code for unsupported PCRE cases (RE_EUNSUPPPCRE), reject one.
Browse files Browse the repository at this point in the history
There are extra error codes in #440 for regexes that aren't UNSATISFIABLE
per se, but depend on particular corner cases in PCRE that probably
aren't worth supporting in an automata-based implementation.

Add a test case for one, tests/pcre/in48.re: ^a|$[^x]b*

This is a tricky one to handle properly; according to PCRE it should
match either "a<anything...>" OR "\n", but nothing else. The newline
match is because $ is a non-input-consuming check that evaluation is
either at the end of input, or at a newline immediately before the end.
In this case `$[^x]b*` matches exactly one newline; it's equivalent to
"$\n". This probably isn't worth supporting, but we can detect cases
where a potential newline match appears after a $ and reject them as
an unsupported PCRE behavior.
  • Loading branch information
silentbicycle committed Oct 19, 2023
1 parent 1ca3726 commit 928a696
Show file tree
Hide file tree
Showing 9 changed files with 118 additions and 7 deletions.
2 changes: 2 additions & 0 deletions include/re/re.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ enum re_errno {
RE_EERRNO = 1 | RE_MISC,
RE_EBADDIALECT = 2 | RE_MISC,
RE_EBADGROUP = 3 | RE_MISC,
RE_EUNSUPCAPTUR = 4 | RE_MISC,
RE_EUNSUPPPCRE = 5 | RE_MISC,

RE_ENEGRANGE = 0 | RE_MARK | RE_GROUP,
RE_ENEGCOUNT = 1 | RE_MARK | RE_GROUP,
Expand Down
1 change: 1 addition & 0 deletions src/libre/ast.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ enum ast_flags {
AST_FLAG_ANCHORED_START = 1 << 6,
AST_FLAG_ANCHORED_END = 1 << 7,
AST_FLAG_END_NL = 1 << 8,
AST_FLAG_MATCHES_1NEWLINE= 1 << 9,

AST_FLAG_NONE = 0x00
};
Expand Down
103 changes: 98 additions & 5 deletions src/libre/ast_analysis.c
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,46 @@ set_flags_subtree(struct ast_expr *n, enum ast_flags flags)
}
}

/*
 * Report whether node n is able to match a newline character.
 *
 * Returns 0 outright when can_consume_input(n) is false, and 1 when
 * the node already carries AST_FLAG_MATCHES_1NEWLINE. Otherwise the
 * answer depends on the node type:
 *
 *  - LITERAL / CODEPOINT: true iff the character is '\n'.
 *  - RANGE: true iff both endpoints are literals, or both are
 *    codepoints, and '\n' lies within [from, to]. Any other
 *    combination of endpoint types yields 0.
 *  - SUBTRACT: true iff operand a can match a newline and operand b
 *    cannot.
 *
 * Every other node type yields 0.
 */
static int
can_consume_single_newline(struct ast_expr *n)
{
	if (!can_consume_input(n)) {
		return 0;
	}

	/* The flag, once set on a node, short-circuits the type checks. */
	if ((n->flags & AST_FLAG_MATCHES_1NEWLINE) != 0) {
		return 1;
	}

	if (n->type == AST_EXPR_LITERAL) {
		return n->u.literal.c == '\n';
	}

	if (n->type == AST_EXPR_CODEPOINT) {
		return n->u.codepoint.u == (uint32_t)'\n';
	}

	if (n->type == AST_EXPR_SUBTRACT) {
		/* a \ b matches newline only if a does and b does not. */
		if (!can_consume_single_newline(n->u.subtract.a)) {
			return 0;
		}
		return !can_consume_single_newline(n->u.subtract.b);
	}

	if (n->type == AST_EXPR_RANGE) {
		if (n->u.range.from.type == AST_ENDPOINT_LITERAL
		    && n->u.range.to.type == AST_ENDPOINT_LITERAL) {
			return n->u.range.from.u.literal.c <= '\n'
			    && n->u.range.to.u.literal.c >= '\n';
		}

		if (n->u.range.from.type == AST_ENDPOINT_CODEPOINT
		    && n->u.range.to.type == AST_ENDPOINT_CODEPOINT) {
			return n->u.range.from.u.codepoint.u <= '\n'
			    && n->u.range.to.u.codepoint.u >= '\n';
		}

		/*
		 * Named endpoints (TODO: unreachable?) and mixed endpoint
		 * types are not inspected; treat them as not matching.
		 */
		return 0;
	}

	return 0;
}

struct anchoring_env {
enum re_flags re_flags;

Expand All @@ -562,6 +602,7 @@ struct anchoring_env {

/* Corresponding flag for end anchors while sweeping backward. */
int followed_by_consuming;
int followed_by_consuming_newline;

int before_start_anchor;
};
Expand Down Expand Up @@ -644,6 +685,9 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n)
case AST_EXPR_LITERAL:
case AST_EXPR_CODEPOINT:
case AST_EXPR_RANGE:
if (can_consume_single_newline(n)) {
set_flags(n, AST_FLAG_MATCHES_1NEWLINE);
}
break; /* handled outside switch/case */

case AST_EXPR_CONCAT: {
Expand Down Expand Up @@ -810,6 +854,13 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n)
}
}

for (i = 0; i < n->u.concat.count; i++) {
struct ast_expr *child = n->u.concat.n[i];
if (can_consume_single_newline(child)) {
set_flags(n, AST_FLAG_MATCHES_1NEWLINE);
}
}

break;
}

Expand Down Expand Up @@ -846,6 +897,9 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n)
} else if (res == AST_ANALYSIS_OK) {
all_set_past_always_consuming &= child_env.past_always_consuming;
any_sat = 1;
} else if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE
|| res == AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE) {
continue;
} else {
return res;
}
Expand All @@ -858,6 +912,10 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n)
all_end_anchored = 0;
}
}

if (child->flags & AST_FLAG_MATCHES_1NEWLINE) {
set_flags(n, AST_FLAG_MATCHES_1NEWLINE);
}
}

if (!env->past_always_consuming && all_set_past_always_consuming) {
Expand Down Expand Up @@ -925,6 +983,10 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n)
return res;
}

if (can_consume_single_newline(n->u.repeat.e)) {
set_flags(n, AST_FLAG_MATCHES_1NEWLINE);
}

if (n->u.repeat.e->flags & AST_FLAG_ANCHORED_END && n->u.repeat.min > 0) {
/* FIXME: if repeating something that is always
* anchored at the end, repeat.max could be
Expand Down Expand Up @@ -964,6 +1026,11 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n)
} while(0)

PROPAGATE_CHILD_FLAGS("GROUP", n, n->u.group.e);

if (n->u.group.e->flags & AST_FLAG_MATCHES_1NEWLINE) {
set_flags(n, AST_FLAG_MATCHES_1NEWLINE);
}

break;

case AST_EXPR_SUBTRACT:
Expand Down Expand Up @@ -991,6 +1058,10 @@ analysis_iter_anchoring(struct anchoring_env *env, struct ast_expr *n)
}
return res;
}
if (can_consume_single_newline(n->u.repeat.e)) {
set_flags(n, AST_FLAG_MATCHES_1NEWLINE);
}

break;

default:
Expand Down Expand Up @@ -1048,11 +1119,18 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n)
assert(n->flags & AST_FLAG_ANCHORED_END);

if (env->followed_by_consuming) {
LOG(3 - LOG_ANCHORING,
"%s: END anchor & followed_by_consuming, setting UNSATISFIABLE\n",
__func__);
set_flags(n, AST_FLAG_UNSATISFIABLE);
return AST_ANALYSIS_UNSATISFIABLE;
if (env->followed_by_consuming_newline) {
LOG(3 - LOG_ANCHORING,
"%s: END anchor & followed_by_consuming, returning UNSUPPORTED_PCRE\n",
__func__);
return AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE;
} else {
LOG(3 - LOG_ANCHORING,
"%s: END anchor & followed_by_consuming, setting UNSATISFIABLE\n",
__func__);
set_flags(n, AST_FLAG_UNSATISFIABLE);
return AST_ANALYSIS_UNSATISFIABLE;
}
}

break;
Expand Down Expand Up @@ -1113,6 +1191,8 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n)
set_flags(n, AST_FLAG_UNSATISFIABLE);
}
} else if (res != AST_ANALYSIS_OK) {
LOG(3 - LOG_ANCHORING,
"%s: CONCAT: got res of %d, bubbling up\n", __func__, res);
return res;
}

Expand All @@ -1128,6 +1208,15 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n)
env->followed_by_consuming = 1;
}

if (!env->followed_by_consuming_newline &&
(child_env.followed_by_consuming_newline
|| child->flags & AST_FLAG_MATCHES_1NEWLINE)) {
LOG(3 - LOG_ANCHORING,
"%s: setting followed_by_consuming_newline due to child %p's analysis\n",
__func__, (void *)child);
env->followed_by_consuming_newline = 1;
}

if (!env->before_start_anchor && child_env.before_start_anchor
&& !is_nullable(child)) {
LOG(3 - LOG_ANCHORING,
Expand Down Expand Up @@ -1169,6 +1258,10 @@ analysis_iter_reverse_anchoring(struct anchoring_env *env, struct ast_expr *n)
all_set_followed_by_consuming &= child_env.followed_by_consuming;
all_set_before_start_anchor &= child_env.before_start_anchor;
any_sat = 1;
} else if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE
|| res == AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE) {
LOG(3 - LOG_ANCHORING, "%s: got res of UNSUPPORTED_*, bubbling up\n", __func__);
return res;
} else {
return res;
}
Expand Down
4 changes: 3 additions & 1 deletion src/libre/ast_analysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@ enum ast_analysis_res {
AST_ANALYSIS_UNSATISFIABLE,

AST_ANALYSIS_ERROR_NULL = -1,
AST_ANALYSIS_ERROR_MEMORY = -2
AST_ANALYSIS_ERROR_MEMORY = -2,
AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE = -3,
AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE = -4
};

enum ast_analysis_res
Expand Down
1 change: 1 addition & 0 deletions src/libre/print/tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ fprintf_flags(FILE *f, enum ast_flags flags)
PR_FLAG(END_NL, "N");
PR_FLAG(CAN_CONSUME, "c");
PR_FLAG(ALWAYS_CONSUMES, "C");
PR_FLAG(MATCHES_1NEWLINE, "n");

#undef PR_FLAG

Expand Down
10 changes: 9 additions & 1 deletion src/libre/re.c
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,15 @@ re_parse(enum re_dialect dialect, int (*getc)(void *opaque), void *opaque,

if (res < 0) {
ast_free(ast);
if (err != NULL) { err->e = RE_EERRNO; }
if (err != NULL) {
if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_PCRE) {
err->e = RE_EUNSUPPPCRE;
} else if (res == AST_ANALYSIS_ERROR_UNSUPPORTED_CAPTURE) {
err->e = RE_EUNSUPCAPTUR;
} else if (err->e == RE_ESUCCESS) {
err->e = RE_EERRNO;
}
}
return NULL;
}

Expand Down
2 changes: 2 additions & 0 deletions src/libre/strerror.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ re_strerror(enum re_errno e)
case RE_EERRNO: return strerror(errno);
case RE_EBADDIALECT: return "Bad dialect";
case RE_EBADGROUP: return "Bad group";
case RE_EUNSUPCAPTUR: return "Cannot support captures in this case";
case RE_EUNSUPPPCRE: return "Unsupported PCRE edge case";

case RE_ENEGRANGE: return "Negative group range";
case RE_ENEGCOUNT: return "Negative count range";
Expand Down
1 change: 1 addition & 0 deletions tests/pcre/in48.re
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
^a|$[^x]b*
1 change: 1 addition & 0 deletions tests/pcre/out48.err
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tests/pcre/in48.re: Unsupported PCRE edge case

0 comments on commit 928a696

Please sign in to comment.