diff --git a/fuzz/target.c b/fuzz/target.c
index e0c35c018..316c5ad57 100644
--- a/fuzz/target.c
+++ b/fuzz/target.c
@@ -422,6 +422,11 @@ fsm_eager_output_dump(FILE *f, const struct fsm *fsm);
 static int
 fuzz_eager_output(const uint8_t *data, size_t size)
 {
+	if (size > 0) {
+		const unsigned seed = data[0];
+		srand(seed);
+	}
+
 	struct feo_env env = {
 		.ok = true,
 		.pattern_count = 0,
@@ -446,9 +451,6 @@ fuzz_eager_output(const uint8_t *data, size_t size)
 
 	size_t max_pattern_length = 0;
 
-	const unsigned seed = size == 0 ? 0 : data[0];
-	srand(seed);
-
 	/* chop data into a series of patterns */
 	{
 		size_t prev = 0;
@@ -506,13 +508,15 @@ fuzz_eager_output(const uint8_t *data, size_t size)
 		}
 	}
 
-	struct re_anchoring_info anchorage[MAX_PATTERNS] = {0};
+	enum re_is_anchored_res anchorage[MAX_PATTERNS] = {0};
 
 	/* for each pattern, attempt to compile to a DFA */
 	for (size_t p_i = 0; p_i < env.pattern_count; p_i++) {
 		const char *p = env.patterns[p_i];
 
-		if (!re_is_anchored(RE_PCRE, fsm_sgetc, &p, 0, NULL, &anchorage[p_i])) {
+		enum re_is_anchored_res a = re_is_anchored(RE_PCRE, fsm_sgetc, &p, 0, NULL);
+		if (a == RE_IS_ANCHORED_ERROR) {
 			continue; /* unsupported regex */
 		}
+		anchorage[p_i] = a; /* keep the flags; they are read back when filling entries[] */
 
@@ -602,8 +606,8 @@ fuzz_eager_output(const uint8_t *data, size_t size)
 			}
 
 			entries[used].fsm = cp;
-			entries[used].anchored_start = anchorage[i].start;
-			entries[used].anchored_end = anchorage[i].end;
+			entries[used].anchored_start = (anchorage[i] & RE_IS_ANCHORED_START) != 0;
+			entries[used].anchored_end = (anchorage[i] & RE_IS_ANCHORED_END) != 0;
 			used++;
 		}
 
@@ -648,7 +652,7 @@ fuzz_eager_output(const uint8_t *data, size_t size)
 	 * Use the combined DFA to generate matches, check that the
 	 * match behavior agrees with the individual DFA copies. */
 	env.current_pattern = (size_t)-1;
-	if (!fsm_generate_matches(env.combined, max_pattern_length, seed, gen_combined_check_individual_cb, &env)) {
+	if (!fsm_generate_matches(env.combined, max_pattern_length, 1, gen_combined_check_individual_cb, &env)) {
 		goto cleanup;
 	}
 
@@ -658,7 +662,7 @@ fuzz_eager_output(const uint8_t *data, size_t size)
 	/* check behavior against the combined DFA. */
 	for (size_t i = 0; i < env.pattern_count; i++) {
 		env.current_pattern = i;
-		if (!fsm_generate_matches(env.combined, max_pattern_length, seed, gen_individual_check_combined_cb, &env)) {
+		if (!fsm_generate_matches(env.combined, max_pattern_length, 1, gen_individual_check_combined_cb, &env)) {
 			goto cleanup;
 		}
 	}
diff --git a/include/adt/common.h b/include/adt/common.h
index 8604edacd..a83604c97 100644
--- a/include/adt/common.h
+++ b/include/adt/common.h
@@ -36,7 +36,7 @@
 /* If non-zero, expand the timer macros defined below, otherwise
  * they compile away. */
 #ifndef TRACK_TIMES
-#define TRACK_TIMES 0
+#define TRACK_TIMES (0 && !BUILD_FOR_FUZZER)
 #endif
 
 #if EXPENSIVE_CHECKS && TRACK_TIMES
diff --git a/include/fsm/fsm.h b/include/fsm/fsm.h
index 9baf929d0..f78d91d71 100644
--- a/include/fsm/fsm.h
+++ b/include/fsm/fsm.h
@@ -432,6 +432,21 @@ fsm_remove_epsilons(struct fsm *fsm);
 int
 fsm_determinise(struct fsm *fsm);
 
+/* Determinise, with a passed in configuration
+ * and a distinct return value for reaching
+ * the state limit. */
+struct fsm_determinise_config {
+	size_t state_limit;	/* 0: no limit */
+};
+enum fsm_determinise_with_config_res {
+	FSM_DETERMINISE_WITH_CONFIG_OK,
+	FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED,
+	FSM_DETERMINISE_WITH_CONFIG_ERRNO,
+};
+enum fsm_determinise_with_config_res
+fsm_determinise_with_config(struct fsm *fsm,
+	const struct fsm_determinise_config *config);
+
 /*
  * Make a DFA complete, as per fsm_iscomplete.
  */
diff --git a/include/fsm/walk.h b/include/fsm/walk.h
index ea5a223e0..cb97e989a 100644
--- a/include/fsm/walk.h
+++ b/include/fsm/walk.h
@@ -90,9 +90,10 @@ fsm_walk_edges(const struct fsm *fsm, void *opaque,
  * functionally equivalent cases makes testing dramatically faster,
  * but exploring every edge could be added later.
  *
- * If seed is zero then it will generate the first label in the label
- * set, otherwise a label from the set will be chosen using rand()
- * (favoring printable characters).
+ * If randomized is zero then it will generate the first label in the
+ * label set, otherwise a label from the set will be chosen using rand()
+ * (favoring printable characters). The caller can use srand()
+ * beforehand to set a PRNG seed.
  *
  * Note: fsm is non-const because it calls fsm_trim on the FSM
  * internally. This records the shortest distance from each state to an
@@ -118,7 +119,7 @@ fsm_generate_matches_cb(const struct fsm *fsm,
 	const char *input, size_t input_length,
 	fsm_state_t end_state, void *opaque);
 int
-fsm_generate_matches(struct fsm *fsm, size_t max_length, unsigned seed,
+fsm_generate_matches(struct fsm *fsm, size_t max_length, int randomized,
 	fsm_generate_matches_cb *cb, void *opaque);
 
 /* Callback provided for the most basic use case for
diff --git a/include/re/re.h b/include/re/re.h
index a3e1f7e0c..841e4e946 100644
--- a/include/re/re.h
+++ b/include/re/re.h
@@ -136,20 +136,21 @@ re_comp(enum re_dialect dialect,
 	const struct fsm_alloc *alloc,
 	enum re_flags flags, struct re_err *err);
 
-struct re_anchoring_info {
-	int start;
-	int end;
-	/* FIXME: this could also check for AST_FLAG_NULLABLE, AST_FLAG_UNSATISFIABLE,
-	 * AST_FLAG_ALWAYS_CONSUMES, AST_FLAG_CAN_CONSUME */
-};
-
 /* Parse and analyze the regex enough to determine whether it is
- * anchored at the start and/or end. Returns 0 if the regex is not
- * supported, otherwise returns 1 and writes anchoring flags into *info. */
-int
+ * anchored at the start and/or end.
+ *
+ * As long as the result is checked for RE_IS_ANCHORED_ERROR first,
+ * the result can be used like a bitset. */
+enum re_is_anchored_res {
+	RE_IS_ANCHORED_NONE = 0x00,
+	RE_IS_ANCHORED_START = 0x01,
+	RE_IS_ANCHORED_END = 0x02,
+	RE_IS_ANCHORED_BOTH = 0x03,
+	RE_IS_ANCHORED_ERROR = 0xFFFF,
+};
+enum re_is_anchored_res
 re_is_anchored(enum re_dialect dialect, re_getchar_fun *f, void *opaque,
-	enum re_flags flags, struct re_err *err,
-	struct re_anchoring_info *info);
+	enum re_flags flags, struct re_err *err);
 
 /*
  * Return a human-readable string describing a given error code. The string
diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c
index 8978ce06c..9483218a1 100644
--- a/src/libfsm/determinise.c
+++ b/src/libfsm/determinise.c
@@ -20,16 +20,20 @@ dump_labels(FILE *f, const uint64_t labels[4])
 	}
 }
 
-int
-fsm_determinise(struct fsm *nfa)
+enum fsm_determinise_with_config_res
+fsm_determinise_with_config(struct fsm *nfa,
+	const struct fsm_determinise_config *config)
 {
-	int res = 0;
+	enum fsm_determinise_with_config_res res = FSM_DETERMINISE_WITH_CONFIG_ERRNO;
 	struct mappingstack *stack = NULL;
 	struct interned_state_set_pool *issp = NULL;
 	struct map map = { NULL, 0, 0, NULL };
 	struct mapping *curr = NULL;
 
 	size_t dfacount = 0;
+	const size_t state_limit = config == NULL
+		? 0
+		: config->state_limit;
 
 	struct analyze_closures_env ac_env = { 0 };
 
@@ -43,7 +47,7 @@ fsm_determinise(struct fsm *nfa)
 	 */
 	if (fsm_has(nfa, fsm_hasepsilons)) {
 		if (!fsm_remove_epsilons(nfa)) {
-			return 0;
+			return FSM_DETERMINISE_WITH_CONFIG_ERRNO;
 		}
 	}
 
@@ -55,7 +59,12 @@ fsm_determinise(struct fsm *nfa)
 
 	issp = interned_state_set_pool_alloc(nfa->alloc);
 	if (issp == NULL) {
-		return 0;
+		return FSM_DETERMINISE_WITH_CONFIG_ERRNO;
+	}
+
+	if (state_limit != 0 && fsm_countstates(nfa) > state_limit) {
+		res = FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED;
+		goto cleanup;
 	}
 
 	{
@@ -77,7 +86,7 @@ fsm_determinise(struct fsm *nfa)
 		 */
 
 		if (!fsm_getstart(nfa, &start)) {
-			res = 1;
+			res = FSM_DETERMINISE_WITH_CONFIG_OK;
 			goto cleanup;
 		}
 
@@ -153,6 +162,11 @@ fsm_determinise(struct fsm *nfa)
 					assert(m->dfastate < dfacount);
 				} else {
 					/* not found -- add a new one and push it to the stack for processing */
+
+					if (state_limit != 0 && dfacount > state_limit) {
+						res = FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED;
+						goto cleanup;
+					}
 					if (!map_add(&map, dfacount, iss, &m)) {
 						goto cleanup;
 					}
@@ -174,8 +188,6 @@ fsm_determinise(struct fsm *nfa)
 		}
 
 		ac_env.output_count = 0;
-
-		/* All elements in sclosures[] are interned, so they will be freed later. */
 	} while ((curr = stack_pop(stack)));
 
 	{
@@ -267,7 +279,7 @@ fsm_determinise(struct fsm *nfa)
 	assert(fsm_all(nfa, fsm_isdfa));
 #endif
 
-	res = 1;
+	res = FSM_DETERMINISE_WITH_CONFIG_OK;
 
 cleanup:
 	map_free(&map);
@@ -318,6 +330,22 @@ fsm_determinise(struct fsm *nfa)
 	return res;
 }
 
+int
+fsm_determinise(struct fsm *nfa)
+{
+	enum fsm_determinise_with_config_res res = fsm_determinise_with_config(nfa, NULL);
+	switch (res) {
+	case FSM_DETERMINISE_WITH_CONFIG_OK:
+		return 1;
+	case FSM_DETERMINISE_WITH_CONFIG_STATE_LIMIT_REACHED:
+		/* unreachable */
+		return 0;
+	case FSM_DETERMINISE_WITH_CONFIG_ERRNO:
+	default:
+		return 0;
+	}
+}
+
 /* Add DFA_state to the list for NFA_state. */
 static int
 add_reverse_mapping(const struct fsm_alloc *alloc,
diff --git a/src/libfsm/gen.c b/src/libfsm/gen.c
index 9f78e67db..8b8551489 100644
--- a/src/libfsm/gen.c
+++ b/src/libfsm/gen.c
@@ -107,7 +107,7 @@ struct gen_ctx {
 static bool
 gen_init_outer(struct fsm *fsm, size_t max_length,
 	fsm_generate_matches_cb *cb, void *opaque,
-	bool randomized, unsigned seed);
+	bool randomized);
 
 static bool
 gen_init(struct gen_ctx *ctx, struct fsm *fsm);
@@ -140,7 +140,7 @@ static bool
 grow_stack(struct gen_ctx *ctx);
 
 int
-fsm_generate_matches(struct fsm *fsm, size_t max_length, unsigned seed,
+fsm_generate_matches(struct fsm *fsm, size_t max_length, int randomized,
 	fsm_generate_matches_cb *cb, void *opaque)
 {
 	if (max_length == 0) {
@@ -154,7 +154,7 @@ fsm_generate_matches(struct fsm *fsm, size_t max_length, unsigned seed,
 
 	INIT_TIMERS();
 	TIME(&pre);
-	int res = gen_init_outer(fsm, max_length, cb, opaque, seed != 0, seed);
+	int res = gen_init_outer(fsm, max_length, cb, opaque, randomized != 0);
 	TIME(&post);
 	DIFF_MSEC("fsm_generate_matches", pre, post, NULL);
 
@@ -204,7 +204,7 @@ fsm_generate_cb_printf(const struct fsm *fsm,
 static bool
 gen_init_outer(struct fsm *fsm, size_t max_length,
 	fsm_generate_matches_cb *cb, void *opaque,
-	bool randomized, unsigned seed)
+	bool randomized)
 {
 	int res = false;
 	if (fsm == NULL || cb == NULL || max_length == 0) {
@@ -213,10 +213,6 @@ gen_init_outer(struct fsm *fsm, size_t max_length,
 
 	assert(fsm_all(fsm, fsm_isdfa)); /* DFA-only */
 
-	if (randomized) {
-		srand(seed);
-	}
-
 #if LOG_GEN > 1
 	fprintf(stderr, "%s: %u states\n", __func__, fsm_countstates(fsm));
 #endif
diff --git a/src/libfsm/libfsm.syms b/src/libfsm/libfsm.syms
index f109bbf3e..ab28b0a21 100644
--- a/src/libfsm/libfsm.syms
+++ b/src/libfsm/libfsm.syms
@@ -114,6 +114,7 @@ fsm_countstates
 fsm_trim
 fsm_reverse
 fsm_determinise
+fsm_determinise_with_config
 fsm_remove_epsilons
 fsm_complete
 fsm_minimise
diff --git a/src/libfsm/trim.c b/src/libfsm/trim.c
index 4f45607bd..c37965fd8 100644
--- a/src/libfsm/trim.c
+++ b/src/libfsm/trim.c
@@ -462,6 +462,10 @@ integrity_check(const char *descr, const struct fsm *fsm)
 	return;
 #endif
 
+#if !EXPENSIVE_CHECKS
+	return;
+#endif
+
 	if (LOG_TRIM > 1) {
 		fprintf(stderr, "integrity check: %s...\n", descr);
 	}
diff --git a/src/libre/re.c b/src/libre/re.c
index c19183dcc..013e2b58c 100644
--- a/src/libre/re.c
+++ b/src/libre/re.c
@@ -335,37 +335,40 @@ re_is_literal(enum re_dialect dialect, int (*getc)(void *opaque), void *opaque,
 	return -1;
 }
 
-/* FIXME: placeholder interface */
-int
+enum re_is_anchored_res
 re_is_anchored(enum re_dialect dialect, re_getchar_fun *getc, void *opaque,
-	enum re_flags flags, struct re_err *err,
-	struct re_anchoring_info *info)
+	enum re_flags flags, struct re_err *err)
 {
-	/* FIXME: copy/pasted from above, factor out common */
+	/* FIXME: copy/pasted from above, factor out common code later. */
 
 	struct ast *ast;
 	const struct dialect *m;
 	int unsatisfiable;
 
 	assert(getc != NULL);
-	assert(info != NULL);
 
 	m = re_dialect(dialect);
 	if (m == NULL) {
 		if (err != NULL) { err->e = RE_EBADDIALECT; }
-		return 0;
+		return RE_IS_ANCHORED_ERROR;
 	}
 
 	flags |= m->flags;
 
 	ast = re_parse(dialect, getc, opaque, flags, err, &unsatisfiable);
 	if (ast == NULL) {
-		return 0;
+		return RE_IS_ANCHORED_ERROR;
 	}
 
-	info->start = (ast->expr->flags & AST_FLAG_ANCHORED_START) != 0;
-	info->end = (ast->expr->flags & AST_FLAG_ANCHORED_END) != 0;
+	/* Copy anchoring flags, ending up with NONE, START, END, or BOTH. */
+	enum re_is_anchored_res res = RE_IS_ANCHORED_NONE;
+	if (ast->expr->flags & AST_FLAG_ANCHORED_START) {
+		res |= RE_IS_ANCHORED_START;
+	}
+	if (ast->expr->flags & AST_FLAG_ANCHORED_END) {
+		res |= RE_IS_ANCHORED_END;
+	}
 
 	ast_free(ast);
-	return 1;
+	return res;
 }