From b2b466c5140ad6dace697e4a208d4557a5e8608f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Chocholat=C3=BD?= Date: Fri, 12 Jul 2024 07:23:18 +0200 Subject: [PATCH] feat: Add function to compute language difference lazily --- include/mata/nfa/nfa.hh | 29 ++++++++ src/nfa/operations.cc | 109 ++++++++++++++++++++++++++- tests/nfa/nfa.cc | 159 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 295 insertions(+), 2 deletions(-) diff --git a/include/mata/nfa/nfa.hh b/include/mata/nfa/nfa.hh index cb3927546..699d139f2 100644 --- a/include/mata/nfa/nfa.hh +++ b/include/mata/nfa/nfa.hh @@ -493,6 +493,34 @@ Nfa union_nondet(const Nfa &lhs, const Nfa &rhs); Nfa union_product(const Nfa &lhs, const Nfa &rhs, Symbol first_epsilon = EPSILON, std::unordered_map,State> *prod_map = nullptr); +/** + * @brief Compute a language difference as @p nfa_included \ @p nfa_excluded. + * + * Computed as a lazy intersection of @p nfa_included and a complement of @p nfa_excluded. The NFAs are lazily + * determinized and the complement is constructed lazily as well, guided by @p nfa_included. + * + * @param[in] nfa_included NFA to include in the difference. + * @param[in] nfa_excluded NFA to exclude from the difference. + * @param[in] macrostate_discover Callback event handler for discovering a new macrostate for the first time. The + * parameters are the determinized NFA constructed so far, the current macrostate, and the set of the original states + * corresponding to the macrostate. Return @c true if the determinization should continue, and @c false if the + * determinization should stop and return only the determinized NFA constructed so far. + * The parameters are: + const Nfa& nfa_included, + const Nfa& nfa_excluded, + const StateSet& macrostate_included_state_set, + const StateSet& macrostate_excluded_state_set, + const State macrostate, + const Nfa& nfa_lang_difference. + * @todo: TODO: Add support for specifying first epsilon symbol and compute epsilon closure during determinization. + */ +Nfa lang_difference( + const Nfa &nfa_included, const Nfa &nfa_excluded, + std::optional< + std::function + > macrostate_discover = std::nullopt +); + /** * @brief Compute intersection of two NFAs. * @@ -581,6 +609,7 @@ Nfa minimize(const Nfa &aut, const ParameterMap& params = { { "algorithm", "brzo * corresponding to the macrostate. Return @c true if the determinization should continue, and @c false if the * determinization should stop and return only the determinized NFA constructed so far. * @return Determinized automaton. + * @todo: TODO: Add support for specifying first epsilon symbol and compute epsilon closure during determinization. */ Nfa determinize( const Nfa& aut, std::unordered_map *subset_map = nullptr, diff --git a/src/nfa/operations.cc b/src/nfa/operations.cc index 171926dbd..98d631e66 100644 --- a/src/nfa/operations.cc +++ b/src/nfa/operations.cc @@ -1224,7 +1224,6 @@ std::optional Nfa::get_word_from_complement(const Alphabet* alphabet std::vector::const_pointer> worklist{}; std::unordered_map subset_map{}; - const auto subset_map_end{ subset_map.end() }; Nfa nfa_complete{}; const State sink_state{ nfa_complete.add_state() }; @@ -1269,7 +1268,7 @@ std::optional Nfa::get_word_from_complement(const Alphabet* alphabet if (symbols_it == symbols_end || symbol_advanced_to <= *symbols_it) { // Continue with the determinization of the NFA. const auto target_macrostate_it = subset_map.find(orig_targets); - if (target_macrostate_it != subset_map_end) { + if (target_macrostate_it != subset_map.end()) { target_macrostate = target_macrostate_it->second; } else { target_macrostate = nfa_complete.add_state(); @@ -1298,3 +1297,109 @@ std::optional Nfa::get_word_from_complement(const Alphabet* alphabet } return nfa_complete.get_word(); } + +Nfa mata::nfa::lang_difference( + const Nfa& nfa_included, const Nfa& nfa_excluded, + std::optional< + std::function + > macrostate_discover +) { + std::unordered_set subset_set_included{}; + std::unordered_set subset_set_excluded{}; + using SubsetMacrostateMap = std::unordered_map::const_pointer, + std::unordered_set::const_pointer + >, State>; + SubsetMacrostateMap subset_macrostate_map{}; + std::vector worklist{}; + + // '{}' represents a sink state that is always final in the complement. + const auto subset_set_excluded_sink_ptr{ subset_set_excluded.insert(StateSet{}).first.operator->() }; + + Nfa nfa_lang_difference{}; + const State new_initial{ nfa_lang_difference.add_state() }; + nfa_lang_difference.initial.insert(new_initial); + if (nfa_included.final.intersects_with(nfa_included.initial) && + !nfa_excluded.final.intersects_with(nfa_excluded.initial)) { + nfa_lang_difference.final.insert(new_initial); + } + auto subset_set_included_ptr{ + subset_set_included.emplace(nfa_included.initial).first.operator->() }; + auto subset_set_excluded_ptr{ + subset_set_excluded.emplace(nfa_excluded.initial).first.operator->() }; + auto subset_macrostate_map_ptr{ + subset_macrostate_map.emplace( + std::make_pair(subset_set_included_ptr, subset_set_excluded_ptr), new_initial).first.operator->() }; + worklist.emplace_back(subset_macrostate_map_ptr); + if (macrostate_discover.has_value() + && !(*macrostate_discover)( + nfa_included, nfa_excluded, + *subset_set_included_ptr, *subset_set_excluded_ptr, + new_initial, nfa_lang_difference) + ) { return nfa_lang_difference; } + + using Iterator = mata::utils::OrdVector::const_iterator; + SynchronizedExistentialSymbolPostIterator synchronized_iterator_included{}; + SynchronizedExistentialSymbolPostIterator synchronized_iterator_excluded{}; + + while (!worklist.empty()) { + const auto curr_macrostate_ptr{ worklist.back() }; + worklist.pop_back(); + const auto curr_state_set_included_ptr{ curr_macrostate_ptr->first.first }; + const auto curr_state_set_excluded_ptr{ curr_macrostate_ptr->first.second }; + const StateSet& curr_state_set_included{ *curr_state_set_included_ptr }; + const StateSet& curr_state_set_excluded{ *curr_state_set_excluded_ptr }; + const State macrostate{ curr_macrostate_ptr->second }; + + synchronized_iterator_included.reset(); + synchronized_iterator_excluded.reset(); + for (const State orig_state: curr_state_set_included) { + mata::utils::push_back(synchronized_iterator_included, nfa_included.delta[orig_state]); + } + for (const State orig_state: curr_state_set_excluded) { + mata::utils::push_back(synchronized_iterator_excluded, nfa_excluded.delta[orig_state]); + } + bool sync_it_included_advanced{ synchronized_iterator_included.advance() }; + bool sync_it_excluded_advanced{ false }; + while (sync_it_included_advanced) { + const std::vector& orig_symbol_posts{ synchronized_iterator_included.get_current() }; + const Symbol symbol_advanced_to{ (*orig_symbol_posts.begin())->symbol }; + StateSet orig_targets_included{ synchronized_iterator_included.unify_targets() }; + sync_it_excluded_advanced = synchronized_iterator_excluded.synchronize_with(symbol_advanced_to); + StateSet orig_targets_excluded{ + sync_it_excluded_advanced ? synchronized_iterator_excluded.unify_targets() : StateSet{} + }; + const bool final_included_intersects_targets{ nfa_included.final.intersects_with(orig_targets_included) }; + const bool final_excluded_intersects_targets{ nfa_excluded.final.intersects_with(orig_targets_excluded) }; + subset_set_included_ptr = subset_set_included.insert(std::move(orig_targets_included)).first.operator->(); + subset_set_excluded_ptr = subset_set_excluded.insert(std::move(orig_targets_excluded)).first.operator->(); + const auto [target_macrostate_it, macrostate_inserted]{ subset_macrostate_map.emplace( + std::make_pair(subset_set_included_ptr, subset_set_excluded_ptr), nfa_lang_difference.num_of_states() + ) }; + subset_macrostate_map_ptr = target_macrostate_it.operator->(); + const State target_macrostate{ target_macrostate_it->second }; + nfa_lang_difference.delta.add(macrostate, symbol_advanced_to, target_macrostate); + if (macrostate_inserted) { + // 'sync_it_excluded_advanced' is true iff there is a transition in the excluded NFA over the symbol + // 'symbol_advanced_to'. If sync_it_excluded_advanced == false, the complement of the excluded NFA will + // have a transition over 'symbol_advanced_to' to a "sink state" which is a final state in the + // complement, and therefore must always be final in the language difference. + if (final_included_intersects_targets) { + if (subset_set_excluded_ptr == subset_set_excluded_sink_ptr + || (sync_it_excluded_advanced && !final_excluded_intersects_targets)) { + nfa_lang_difference.final.insert(target_macrostate); + } + } + if (macrostate_discover.has_value() + && !(*macrostate_discover)( + nfa_included, nfa_excluded, + *subset_set_included_ptr, *subset_set_excluded_ptr, + target_macrostate, nfa_lang_difference)) { return nfa_lang_difference; } + + worklist.emplace_back(subset_macrostate_map_ptr); + } + sync_it_included_advanced = synchronized_iterator_included.advance(); + } + } + return nfa_lang_difference; +} diff --git a/tests/nfa/nfa.cc b/tests/nfa/nfa.cc index 7122fa36e..03d7121b7 100644 --- a/tests/nfa/nfa.cc +++ b/tests/nfa/nfa.cc @@ -619,6 +619,165 @@ TEST_CASE("mata::nfa::Nfa::get_word_from_complement()") { } } +TEST_CASE("mata::nfa::lang_difference()") { + Nfa nfa_included{}; + Nfa nfa_excluded{}; + Nfa result{}; + Nfa expected{}; + + SECTION("empty automata") { + result = lang_difference(nfa_included, nfa_excluded); + CHECK(result.is_lang_empty()); + } + + SECTION("empty included") { + nfa_excluded.initial = { 0 }; + nfa_excluded.final = { 0 }; + result = lang_difference(nfa_included, nfa_excluded); + CHECK(result.is_lang_empty()); + } + + SECTION("empty excluded") { + nfa_included.initial = { 0 }; + nfa_included.final = { 0 }; + result = lang_difference(nfa_included, nfa_excluded); + expected = nfa_included; + CHECK(are_equivalent(result, expected)); + } + + SECTION("included { '', 'a' }, excluded { '' }") { + nfa_included.initial = { 0 }; + nfa_included.delta.add(0, 'a', 1); + nfa_included.final = { 0, 1 }; + + nfa_excluded.initial = { 0 }; + nfa_excluded.final = { 0 }; + + expected.initial = { 0 }; + expected.delta.add(0, 'a', 1); + expected.final = { 1 }; + + result = lang_difference(nfa_included, nfa_excluded); + CHECK(are_equivalent(result, expected)); + } + + SECTION("included { '', 'a' }, excluded { 'a' }") { + nfa_included.initial = { 0 }; + nfa_included.delta.add(0, 'a', 1); + nfa_included.final = { 0 }; + + nfa_excluded.initial = { 0 }; + nfa_excluded.delta.add(0, 'a', 1); + nfa_excluded.final = { 1 }; + + expected.initial = { 0 }; + expected.final = { 0 }; + + result = lang_difference(nfa_included, nfa_excluded); + CHECK(are_equivalent(result, expected)); + } + + SECTION("included { '', 'a' }, excluded { '', 'a' }") { + nfa_included.initial = { 0 }; + nfa_included.delta.add(0, 'a', 1); + nfa_included.final = { 0, 1 }; + + nfa_excluded.initial = { 0 }; + nfa_excluded.delta.add(0, 'a', 1); + nfa_excluded.final = { 0, 1 }; + + result = lang_difference(nfa_included, nfa_excluded); + CHECK(are_equivalent(result, expected)); + } + + SECTION("included { '', 'a', 'ab' }, excluded { '', 'ab' }") { + nfa_included.initial = { 0 }; + nfa_included.delta.add(0, 'a', 1); + nfa_included.delta.add(1, 'b', 2); + nfa_included.final = { 0, 1, 2 }; + + nfa_excluded.initial = { 0 }; + nfa_excluded.delta.add(0, 'a', 1); + nfa_excluded.delta.add(1, 'b', 2); + nfa_excluded.final = { 0, 2 }; + + expected.initial = { 0 }; + expected.delta.add(0, 'a', 1); + expected.final = { 1 }; + + result = lang_difference(nfa_included, nfa_excluded); + CHECK(are_equivalent(result, expected)); + } + + SECTION("included { '', 'a+', 'a+b' }, excluded { '', 'ab' }") { + nfa_included.initial = { 0 }; + nfa_included.delta.add(0, 'a', 1); + nfa_included.delta.add(1, 'a', 1); + nfa_included.delta.add(1, 'b', 2); + nfa_included.final = { 0, 1, 2 }; + + nfa_excluded.initial = { 0 }; + nfa_excluded.delta.add(0, 'a', 1); + nfa_excluded.delta.add(1, 'b', 2); + nfa_excluded.final = { 0, 2 }; + + expected.initial = { 0 }; + expected.delta.add(0, 'a', 1); + expected.delta.add(1, 'a', 1); + expected.delta.add(1, 'a', 2); + expected.delta.add(2, 'b', 3); + expected.final = { 1, 3 }; + + result = lang_difference(nfa_included, nfa_excluded); + CHECK(are_equivalent(result, expected)); + } + + SECTION("included { '', 'ab' }, excluded { '', 'a+', 'a+b' }") { + nfa_included.initial = { 0 }; + nfa_included.delta.add(0, 'a', 1); + nfa_included.delta.add(1, 'b', 2); + nfa_included.final = { 0, 2 }; + + nfa_excluded.initial = { 0 }; + nfa_excluded.delta.add(0, 'a', 1); + nfa_excluded.delta.add(1, 'a', 1); + nfa_excluded.delta.add(1, 'b', 2); + nfa_excluded.final = { 0, 1, 2 }; + + result = lang_difference(nfa_included, nfa_excluded); + CHECK(are_equivalent(result, expected)); + } + + SECTION("included { 'a', 'ab', '(abc)+a', '(abc)+ab' }, excluded { 'a', 'ab' }") { + nfa_included.initial = { 0 }; + nfa_included.delta.add(0, 'a', 1); + nfa_included.delta.add(0, 'a', 2); + nfa_included.delta.add(0, 'a', 3); + nfa_included.delta.add(3, 'b', 4); + nfa_included.delta.add(2, 'b', 5); + nfa_included.delta.add(1, 'b', 6); + nfa_included.delta.add(6, 'c', 0); + nfa_included.final = { 2, 6 }; + + nfa_excluded.initial = { 0 }; + nfa_excluded.delta.add(0, 'a', 1); + nfa_excluded.delta.add(1, 'b', 2); + nfa_excluded.final = { 0, 1, 2 }; + + expected.initial = { 0 }; + expected.delta.add(0, 'a', 1); + expected.delta.add(1, 'b', 2); + expected.delta.add(2, 'c', 3); + expected.delta.add(3, 'a', 4); + expected.delta.add(4, 'b', 5); + expected.delta.add(5, 'c', 3); + expected.final = { 4, 5 }; + + result = lang_difference(nfa_included, nfa_excluded); + CHECK(are_equivalent(result, expected)); + } +} + TEST_CASE("mata::nfa::minimize() for profiling", "[.profiling],[minimize]") { Nfa aut(4); Nfa result;