From 977f22394661b403a88eec9f23b1ce10249134fa Mon Sep 17 00:00:00 2001 From: Harald van Dijk Date: Thu, 10 Sep 2020 19:24:40 +0100 Subject: [PATCH] Fixes for rewrite_repeat rewrite_repeat, used for rewriting nested repeats contained a comment that said repeats can only be combined if the range of the result is not more than the sum of the ranges of the inputs, but the assertion actually tested the opposite: it tested that the range of the result is more than the sum of the ranges of the inputs. This assertion does not universally hold either way: a{2}{2} can be rewritten to a{4}, but the range as calculated of the latter (0) is equal to the sum of the ranges of the inputs (also 0). Separate from the assert, the rewrite was not performed in some cases where it is valid to do so. It is valid whenever the inner repeat has a lower bound of 0 or 1, whenever the inner repeat has no upper bound and the outer repeat has a positive lower bound, and whenever the outer range has a lower bound equal to the upper bound. The last case was missing. An example is a{2,3}{2}, which would previously be preserved in that form, but is now rewritten as a{4,6}. --- src/libre/ast_rewrite.c | 61 ++++++++++++++----------------------- tests/pcre-repeat/in55.re | 1 + tests/pcre-repeat/out55.fsm | 7 +++++ 3 files changed, 31 insertions(+), 38 deletions(-) create mode 100644 tests/pcre-repeat/in55.re create mode 100644 tests/pcre-repeat/out55.fsm diff --git a/src/libre/ast_rewrite.c b/src/libre/ast_rewrite.c index 329e3d30f..2b863a067 100644 --- a/src/libre/ast_rewrite.c +++ b/src/libre/ast_rewrite.c @@ -410,48 +410,33 @@ rewrite_repeat(struct ast_expr *n, enum re_flags flags) return 1; } - if (h == 0 || h == 1) { - dead = n->u.repeat.e; - - n->u.repeat.min = v; - n->u.repeat.max = w; - n->u.repeat.e = n->u.repeat.e->u.repeat.e; - - dead->type = AST_EXPR_EMPTY; - ast_expr_free(dead); - - return 1; - } - /* * a{h,i}{j,k} is equivalent to a{h*j,i*k} if it's possible to combine, - * and it's possible iff the range of the result is not more than - * the sum of the ranges of the two inputs. - */ - if (i != AST_COUNT_UNBOUNDED && k != AST_COUNT_UNBOUNDED) { - /* I don't know why this is true */ - assert(w - v > i - h + k - j); - } - - /* - * a{h,}{j,} - * a{h,i}{j,} - * a{h,}{j,k} + * i.e. if for all n in [h*j,i*k], a{n} would be included in a{h,i}{j,k}. + * a{h*j} is trivially included, and for any a{n} that is included where + * n is not a multiple of i, a{n+1} is also included: if n is not a + * multiple of i, then one of the a{h,i} instances matched fewer than i + * times and can accept another a. The only case to consider is where n + * is a multiple of i and the rewritten expression would include a{n+1}. + * + * If j == k, this cannot be a problem. The only case where all instances + * of a{h,i} accept i instances of a is where n==i*j==i*k. + * + * If i == AST_COUNT_UNBOUNDED and j != 0, this cannot be a problem. It + * is impossible for any a{h,i} to accept i instances of a, so the problem + * never arises here either. Note the restriction on j != 0 though: + * a{2,}{0,} cannot be rewritten as a{0,}. + * + * If a{n} is included by m instances of a{i}, where j<=mu.repeat.e; - - n->u.repeat.min = v; - n->u.repeat.max = w; - n->u.repeat.e = n->u.repeat.e->u.repeat.e; - - dead->type = AST_EXPR_EMPTY; - ast_expr_free(dead); - - return 1; - } - if (h > 1 && i == AST_COUNT_UNBOUNDED && j > 0) { + if (j == k + || (i == AST_COUNT_UNBOUNDED && j > 0) + || h * (j + 1) <= i * j + 1) { dead = n->u.repeat.e; n->u.repeat.min = v; diff --git a/tests/pcre-repeat/in55.re b/tests/pcre-repeat/in55.re new file mode 100644 index 000000000..32772dff2 --- /dev/null +++ b/tests/pcre-repeat/in55.re @@ -0,0 +1 @@ +^(?:a{2}){2}$ \ No newline at end of file diff --git a/tests/pcre-repeat/out55.fsm b/tests/pcre-repeat/out55.fsm new file mode 100644 index 000000000..f5f31a725 --- /dev/null +++ b/tests/pcre-repeat/out55.fsm @@ -0,0 +1,7 @@ +0 -> 1 'a'; +1 -> 2 'a'; +2 -> 3 'a'; +3 -> 4 'a'; + +start: 0; +end: 4;