From 475782f3b0be31d58d05102f487db67a0d0b3c39 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Thu, 19 Sep 2024 13:58:18 +0800 Subject: [PATCH 1/5] crimson/osd/backfill_state: add pg infos to all backfill_state's logs Signed-off-by: Xuehan Xu --- src/crimson/osd/backfill_facades.h | 3 + src/crimson/osd/backfill_state.cc | 134 ++++++++++++++++------------- src/crimson/osd/backfill_state.h | 11 +++ src/test/crimson/test_backfill.cc | 3 + 4 files changed, 91 insertions(+), 60 deletions(-) diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h index 64544d4c8704d..ce649303d4f48 100644 --- a/src/crimson/osd/backfill_facades.h +++ b/src/crimson/osd/backfill_facades.h @@ -82,6 +82,9 @@ struct PGFacade final : BackfillState::PGFacade { } PGFacade(PG& pg) : pg(pg) {} + std::ostream &print(std::ostream &out) const override { + return out << pg; + } }; } // namespace crimson::osd diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc index 837fd2eb2af35..55ccb0ada13d3 100644 --- a/src/crimson/osd/backfill_state.cc +++ b/src/crimson/osd/backfill_state.cc @@ -8,11 +8,7 @@ #include "crimson/osd/backfill_state.h" #include "osd/osd_types_fmt.h" -namespace { - seastar::logger& logger() { - return crimson::get_logger(ceph_subsys_osd); - } -} +SET_SUBSYS(osd); namespace crimson::osd { @@ -27,22 +23,23 @@ BackfillState::BackfillState( progress_tracker( std::make_unique(backfill_machine)) { - logger().debug("{}:{}", __func__, __LINE__); + LOG_PREFIX(BackfillState::BackfillState); + DEBUGDPP("", *backfill_machine.pg); backfill_machine.initiate(); } template BackfillState::StateHelper::StateHelper() { - logger().debug("enter {}", - boost::typeindex::type_id().pretty_name()); + LOG_PREFIX(BackfillState::StateHelper); + DEBUGDPP("enter {}", pg(), boost::typeindex::type_id().pretty_name()); } template BackfillState::StateHelper::~StateHelper() { - logger().debug("exit {}", - boost::typeindex::type_id().pretty_name()); + LOG_PREFIX(BackfillState::StateHelper); + DEBUG("exit {}", boost::typeindex::type_id().pretty_name()); } BackfillState::~BackfillState() = default; @@ -63,13 +60,16 @@ BackfillState::BackfillMachine::~BackfillMachine() = default; BackfillState::Initial::Initial(my_context ctx) : my_base(ctx) { + LOG_PREFIX(BackfillState::Initial::Initial); backfill_state().last_backfill_started = peering_state().earliest_backfill(); - logger().debug("{}: bft={} from {}", - __func__, peering_state().get_backfill_targets(), - backfill_state().last_backfill_started); + DEBUGDPP("{}: bft={} from {}", + pg(), + __func__, + peering_state().get_backfill_targets(), + backfill_state().last_backfill_started); for (const auto& bt : peering_state().get_backfill_targets()) { - logger().debug("{}: target shard {} from {}", - __func__, bt, peering_state().get_peer_last_backfill(bt)); + DEBUGDPP("{}: target shard {} from {}", + pg(), __func__, bt, peering_state().get_peer_last_backfill(bt)); } ceph_assert(peering_state().get_backfill_targets().size()); ceph_assert(!backfill_state().last_backfill_started.is_max()); @@ -80,7 +80,8 @@ BackfillState::Initial::Initial(my_context ctx) boost::statechart::result BackfillState::Initial::react(const BackfillState::Triggered& evt) { - logger().debug("{}: backfill triggered", __func__); + LOG_PREFIX(BackfillState::Initial::react::Triggered); + DEBUGDPP("", pg()); ceph_assert(backfill_state().last_backfill_started == \ peering_state().earliest_backfill()); ceph_assert(peering_state().is_backfilling()); @@ -93,10 +94,10 @@ BackfillState::Initial::react(const BackfillState::Triggered& evt) if (Enqueuing::all_enqueued(peering_state(), backfill_state().backfill_info, backfill_state().peer_backfill_info)) { - logger().debug("{}: switching to Done state", __func__); + DEBUGDPP("switching to Done state", pg()); return transit(); } else { - logger().debug("{}: switching to Enqueuing state", __func__); + DEBUGDPP("switching to Enqueuing state", pg()); return transit(); } } @@ -104,15 +105,16 @@ BackfillState::Initial::react(const BackfillState::Triggered& evt) boost::statechart::result BackfillState::Cancelled::react(const BackfillState::Triggered& evt) { - logger().debug("{}: backfill re-triggered", __func__); + LOG_PREFIX(BackfillState::Cancelled::react::Triggered); + DEBUGDPP("backfill re-triggered", pg()); ceph_assert(peering_state().is_backfilling()); if (Enqueuing::all_enqueued(peering_state(), backfill_state().backfill_info, backfill_state().peer_backfill_info)) { - logger().debug("{}: switching to Done state", __func__); + DEBUGDPP("{}: switching to Done state", pg()); return transit(); } else { - logger().debug("{}: switching to Enqueuing state", __func__); + DEBUGDPP("{}: switching to Enqueuing state", pg()); return transit(); } } @@ -120,9 +122,10 @@ BackfillState::Cancelled::react(const BackfillState::Triggered& evt) // -- Enqueuing void BackfillState::Enqueuing::maybe_update_range() { + LOG_PREFIX(BackfillState::Enqueuing::maybe_update_range); if (auto& primary_bi = backfill_state().backfill_info; primary_bi.version >= pg().get_projected_last_update()) { - logger().info("{}: bi is current", __func__); + INFODPP("bi is current", pg()); ceph_assert(primary_bi.version == pg().get_projected_last_update()); } else if (primary_bi.version >= peering_state().get_log_tail()) { if (peering_state().get_pg_log().get_log().empty() && @@ -136,31 +139,31 @@ void BackfillState::Enqueuing::maybe_update_range() ceph_assert(primary_bi.version == eversion_t()); return; } - logger().debug("{}: bi is old, ({}) can be updated with log to {}", - __func__, - primary_bi.version, - pg().get_projected_last_update()); + DEBUGDPP("{}: bi is old, ({}) can be updated with log to {}", + pg(), + primary_bi.version, + pg().get_projected_last_update()); auto func = [&](const pg_log_entry_t& e) { - logger().debug("maybe_update_range(lambda): updating from version {}", - e.version); + DEBUGDPP("maybe_update_range(lambda): updating from version {}", + pg(), e.version); if (e.soid >= primary_bi.begin && e.soid < primary_bi.end) { if (e.is_update()) { - logger().debug("maybe_update_range(lambda): {} updated to ver {}", - e.soid, e.version); + DEBUGDPP("maybe_update_range(lambda): {} updated to ver {}", + pg(), e.soid, e.version); primary_bi.objects.erase(e.soid); primary_bi.objects.insert(std::make_pair(e.soid, e.version)); } else if (e.is_delete()) { - logger().debug("maybe_update_range(lambda): {} removed", - e.soid); + DEBUGDPP("maybe_update_range(lambda): {} removed", + pg(), e.soid); primary_bi.objects.erase(e.soid); } } }; - logger().debug("{}: scanning pg log first", __func__); + DEBUGDPP("{}: scanning pg log first", pg()); peering_state().scan_log_after(primary_bi.version, func); - logger().debug("{}: scanning projected log", __func__); + DEBUGDPP("{}: scanning projected log", pg()); pg().get_projected_log().scan_log_after(primary_bi.version, func); primary_bi.version = pg().get_projected_last_update(); } else { @@ -244,6 +247,7 @@ void BackfillState::Enqueuing::trim_backfilled_object_from_intervals( BackfillState::Enqueuing::result_t BackfillState::Enqueuing::remove_on_peers(const hobject_t& check) { + LOG_PREFIX(BackfillState::Enqueuing::remove_on_peers); // set `new_last_backfill_started` to `check` result_t result { {}, check }; for (const auto& bt : peering_state().get_backfill_targets()) { @@ -255,8 +259,8 @@ BackfillState::Enqueuing::remove_on_peers(const hobject_t& check) backfill_listener().enqueue_drop(bt, pbi.begin, version); } } - logger().debug("{}: BACKFILL removing {} from peers {}", - __func__, check, result.pbi_targets); + DEBUGDPP("BACKFILL removing {} from peers {}", + pg(), check, result.pbi_targets); ceph_assert(!result.pbi_targets.empty()); return result; } @@ -264,7 +268,8 @@ BackfillState::Enqueuing::remove_on_peers(const hobject_t& check) BackfillState::Enqueuing::result_t BackfillState::Enqueuing::update_on_peers(const hobject_t& check) { - logger().debug("{}: check={}", __func__, check); + LOG_PREFIX(BackfillState::Enqueuing::update_on_peers); + DEBUGDPP("check={}", pg(), check); const auto& primary_bi = backfill_state().backfill_info; result_t result { {}, primary_bi.begin }; std::map>> backfills; @@ -325,6 +330,7 @@ bool BackfillState::Enqueuing::Enqueuing::all_emptied( BackfillState::Enqueuing::Enqueuing(my_context ctx) : my_base(ctx) { + LOG_PREFIX(BackfillState::Enqueuing::Enqueuing); auto& primary_bi = backfill_state().backfill_info; // update our local interval to cope with recent changes @@ -334,8 +340,7 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) // that backfill will be spinning here over and over. For the sake // of performance and complexity we don't synchronize with entire PG. // similar can happen in classical OSD. - logger().warn("{}: bi is old, rescanning of local backfill_info", - __func__); + WARNDPP("bi is old, rescanning of local backfill_info", pg()); post_event(RequestPrimaryScanning{}); return; } else { @@ -347,7 +352,7 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) primary_bi)) { // need to grab one another chunk of the object namespace and restart // the queueing. - logger().debug("{}: reached end for current local chunk", __func__); + DEBUGDPP("reached end for current local chunk", pg()); post_event(RequestPrimaryScanning{}); return; } @@ -399,8 +404,8 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) backfill_state().last_backfill_started = hobject_t::get_max(); backfill_listener().update_peers_last_backfill(hobject_t::get_max()); } - logger().debug("{}: reached end for both local and all peers " - "but still has in-flight operations", __func__); + DEBUGDPP("reached end for both local and all peers " + "but still has in-flight operations", pg()); post_event(RequestWaiting{}); } @@ -416,7 +421,8 @@ BackfillState::PrimaryScanning::PrimaryScanning(my_context ctx) boost::statechart::result BackfillState::PrimaryScanning::react(PrimaryScanned evt) { - logger().debug("{}", __func__); + LOG_PREFIX(BackfillState::PrimaryScanning::react::PrimaryScanned); + DEBUGDPP("", pg()); backfill_state().backfill_info = std::move(evt.result); return transit(); } @@ -424,8 +430,9 @@ BackfillState::PrimaryScanning::react(PrimaryScanned evt) boost::statechart::result BackfillState::PrimaryScanning::react(ObjectPushed evt) { - logger().debug("PrimaryScanning::react() on ObjectPushed; evt.object={}", - evt.object); + LOG_PREFIX(BackfillState::PrimaryScanning::react::ObjectPushed); + DEBUGDPP("PrimaryScanning::react() on ObjectPushed; evt.object={}", + pg(), evt.object); backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true); return discard_event(); } @@ -443,11 +450,11 @@ bool BackfillState::ReplicasScanning::replica_needs_scan( BackfillState::ReplicasScanning::ReplicasScanning(my_context ctx) : my_base(ctx) { + LOG_PREFIX(BackfillState::ReplicasScanning::ReplicasScanning); for (const auto& bt : peering_state().get_backfill_targets()) { if (const auto& pbi = backfill_state().peer_backfill_info.at(bt); replica_needs_scan(pbi, backfill_state().backfill_info)) { - logger().debug("{}: scanning peer osd.{} from {}", - __func__, bt, pbi.end); + DEBUGDPP("scanning peer osd.{} from {}", pg(), bt, pbi.end); backfill_listener().request_replica_scan(bt, pbi.end, hobject_t{}); ceph_assert(waiting_on_backfill.find(bt) == \ @@ -469,8 +476,9 @@ BackfillState::ReplicasScanning::~ReplicasScanning() boost::statechart::result BackfillState::ReplicasScanning::react(ReplicaScanned evt) { - logger().debug("{}: got scan result from osd={}, result={}", - __func__, evt.from, evt.result); + LOG_PREFIX(BackfillState::ReplicasScanning::react::ReplicaScanned); + DEBUGDPP("got scan result from osd={}, result={}", + pg(), evt.from, evt.result); // TODO: maybe we'll be able to move waiting_on_backfill from // the machine to the state. ceph_assert(peering_state().is_backfill_target(evt.from)); @@ -484,7 +492,7 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt) } else { // we canceled backfill for a while due to a too full, and this // is an extra response from a non-too-full peer - logger().debug("{}: canceled backfill (too full?)", __func__); + DEBUGDPP("canceled backfill (too full?)", pg()); } return discard_event(); } @@ -492,8 +500,8 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt) boost::statechart::result BackfillState::ReplicasScanning::react(CancelBackfill evt) { - logger().debug("{}: cancelled within ReplicasScanning", - __func__); + LOG_PREFIX(BackfillState::ReplicasScanning::react::CancelBackfill); + DEBUGDPP("cancelled within ReplicasScanning", pg()); waiting_on_backfill.clear(); return transit(); } @@ -501,8 +509,9 @@ BackfillState::ReplicasScanning::react(CancelBackfill evt) boost::statechart::result BackfillState::ReplicasScanning::react(ObjectPushed evt) { - logger().debug("ReplicasScanning::react() on ObjectPushed; evt.object={}", - evt.object); + LOG_PREFIX(BackfillState::ReplicasScanning::react::ObjectPushed); + DEBUGDPP("ReplicasScanning::react() on ObjectPushed; evt.object={}", + pg(), evt.object); backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true); return discard_event(); } @@ -517,17 +526,18 @@ BackfillState::Waiting::Waiting(my_context ctx) boost::statechart::result BackfillState::Waiting::react(ObjectPushed evt) { - logger().debug("Waiting::react() on ObjectPushed; evt.object={}", - evt.object); + LOG_PREFIX(BackfillState::Waiting::react::ObjectPushed); + DEBUGDPP("Waiting::react() on ObjectPushed; evt.object={}", pg(), evt.object); backfill_state().progress_tracker->complete_to(evt.object, evt.stat, false); - return transit();; + return transit(); } // -- Done BackfillState::Done::Done(my_context ctx) : my_base(ctx) { - logger().info("{}: backfill is done", __func__); + LOG_PREFIX(BackfillState::Done::Done); + INFODPP("backfill is done", pg()); backfill_listener().backfilled(); } @@ -577,8 +587,8 @@ void BackfillState::ProgressTracker::complete_to( const pg_stat_t& stats, bool may_push_to_max) { - logger().debug("{}: obj={}", - __func__, obj); + LOG_PREFIX(BackfillState::ProgressTracker::complete_to); + DEBUGDPP("obj={}", pg(), obj); if (auto completion_iter = registry.find(obj); completion_iter != std::end(registry)) { completion_iter->second = \ @@ -619,4 +629,8 @@ void BackfillState::enqueue_standalone_push( backfill_machine.backfill_listener.enqueue_push(obj, v, peers); } +std::ostream &operator<<(std::ostream &out, const BackfillState::PGFacade &pg) { + return pg.print(out); +} + } // namespace crimson::osd diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h index 072c91e079d7f..9e908af4c8a73 100644 --- a/src/crimson/osd/backfill_state.h +++ b/src/crimson/osd/backfill_state.h @@ -405,8 +405,10 @@ struct BackfillState::PGFacade { virtual const eversion_t& get_projected_last_update() const = 0; virtual const PGLog::IndexedLog& get_projected_log() const = 0; + virtual std::ostream &print(std::ostream &out) const = 0; virtual ~PGFacade() {} }; +std::ostream &operator<<(std::ostream &out, const BackfillState::PGFacade &pg); class BackfillState::ProgressTracker { // TODO: apply_stat, @@ -433,6 +435,9 @@ class BackfillState::ProgressTracker { BackfillListener& backfill_listener() { return backfill_machine.backfill_listener; } + PGFacade& pg() { + return *backfill_machine.pg; + } public: ProgressTracker(BackfillMachine& backfill_machine) @@ -447,3 +452,9 @@ class BackfillState::ProgressTracker { }; } // namespace crimson::osd + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter + : fmt::ostream_formatter {}; +#endif + diff --git a/src/test/crimson/test_backfill.cc b/src/test/crimson/test_backfill.cc index 7e058c80ed6e2..6d100984adfab 100644 --- a/src/test/crimson/test_backfill.cc +++ b/src/test/crimson/test_backfill.cc @@ -274,6 +274,9 @@ struct BackfillFixture::PGFacade : public crimson::osd::BackfillState::PGFacade return backfill_source.projected_log; } + std::ostream &print(std::ostream &out) const override { + return out << "FakePGFacade"; + } }; BackfillFixture::BackfillFixture( From 8f2ad5f560622f43dc8aaaba0fb5446d92c72196 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Wed, 4 Sep 2024 14:58:41 +0800 Subject: [PATCH 2/5] crimson/osd/backfill_state: Enqueuing doesn't last, so there should be no other events coming to it than those it generated itself Signed-off-by: Xuehan Xu --- src/crimson/osd/backfill_state.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h index 9e908af4c8a73..da07a377b9277 100644 --- a/src/crimson/osd/backfill_state.h +++ b/src/crimson/osd/backfill_state.h @@ -174,12 +174,9 @@ struct BackfillState { struct Enqueuing : sc::state, StateHelper { using reactions = boost::mpl::list< - sc::transition, sc::transition, sc::transition, sc::transition, - sc::transition, - sc::transition, sc::transition>; explicit Enqueuing(my_context); From 892cf5661f2f28efcaa3515d1c10b0aa2e6cc0fc Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Fri, 27 Sep 2024 09:51:20 +0800 Subject: [PATCH 3/5] crimson/osd/backfill_state: drop Cancelled We'll implement the backfill cancellation as a simple pause of the backfill procedure, which means we don't have to add a new state Signed-off-by: Xuehan Xu --- src/crimson/osd/backfill_state.cc | 26 -------------------------- src/crimson/osd/backfill_state.h | 26 -------------------------- 2 files changed, 52 deletions(-) diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc index 55ccb0ada13d3..62607c7fbebab 100644 --- a/src/crimson/osd/backfill_state.cc +++ b/src/crimson/osd/backfill_state.cc @@ -102,23 +102,6 @@ BackfillState::Initial::react(const BackfillState::Triggered& evt) } } -boost::statechart::result -BackfillState::Cancelled::react(const BackfillState::Triggered& evt) -{ - LOG_PREFIX(BackfillState::Cancelled::react::Triggered); - DEBUGDPP("backfill re-triggered", pg()); - ceph_assert(peering_state().is_backfilling()); - if (Enqueuing::all_enqueued(peering_state(), - backfill_state().backfill_info, - backfill_state().peer_backfill_info)) { - DEBUGDPP("{}: switching to Done state", pg()); - return transit(); - } else { - DEBUGDPP("{}: switching to Enqueuing state", pg()); - return transit(); - } -} - // -- Enqueuing void BackfillState::Enqueuing::maybe_update_range() { @@ -502,8 +485,6 @@ BackfillState::ReplicasScanning::react(CancelBackfill evt) { LOG_PREFIX(BackfillState::ReplicasScanning::react::CancelBackfill); DEBUGDPP("cancelled within ReplicasScanning", pg()); - waiting_on_backfill.clear(); - return transit(); } boost::statechart::result @@ -547,13 +528,6 @@ BackfillState::Crashed::Crashed() ceph_abort_msg("{}: this should not happen"); } -// -- Cancelled -BackfillState::Cancelled::Cancelled(my_context ctx) - : my_base(ctx) -{ - ceph_assert(peering_state().get_backfill_targets().size()); -} - // ProgressTracker is an intermediary between the BackfillListener and // BackfillMachine + its states. All requests to push or drop an object // are directed through it. The same happens with notifications about diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h index da07a377b9277..34400d930b2b9 100644 --- a/src/crimson/osd/backfill_state.h +++ b/src/crimson/osd/backfill_state.h @@ -136,34 +136,10 @@ struct BackfillState { explicit Crashed(); }; - struct Cancelled : sc::state, - StateHelper { - using reactions = boost::mpl::list< - sc::custom_reaction, - sc::custom_reaction, - sc::custom_reaction, - sc::custom_reaction, - sc::transition>; - explicit Cancelled(my_context); - // resume after triggering backfill by on_activate_complete(). - // transit to Enqueuing. - sc::result react(const Triggered&); - sc::result react(const PrimaryScanned&) { - return discard_event(); - } - sc::result react(const ReplicaScanned&) { - return discard_event(); - } - sc::result react(const ObjectPushed&) { - return discard_event(); - } - }; - struct Initial : sc::state, StateHelper { using reactions = boost::mpl::list< sc::custom_reaction, - sc::transition, sc::transition>; explicit Initial(my_context); // initialize after triggering backfill by on_activate_complete(). @@ -234,7 +210,6 @@ struct BackfillState { sc::custom_reaction, sc::custom_reaction, sc::transition, - sc::transition, sc::transition>; explicit PrimaryScanning(my_context); sc::result react(ObjectPushed); @@ -274,7 +249,6 @@ struct BackfillState { using reactions = boost::mpl::list< sc::custom_reaction, sc::transition, - sc::transition, sc::transition>; explicit Waiting(my_context); sc::result react(ObjectPushed); From a34b0ce2aa6820706167e8ea3119160e6e68f157 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Tue, 8 Oct 2024 12:26:41 +0800 Subject: [PATCH 4/5] crimson/osd/backfill_state: treat Cancelled as a pause of the ongoing backfilling Fixes: https://tracker.ceph.com/issues/67888 Signed-off-by: Xuehan Xu --- src/crimson/osd/backfill_state.cc | 87 ++++++++++++++++++++++++++++--- src/crimson/osd/backfill_state.h | 30 +++++++++++ 2 files changed, 110 insertions(+), 7 deletions(-) diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc index 62607c7fbebab..1392ee330ac20 100644 --- a/src/crimson/osd/backfill_state.cc +++ b/src/crimson/osd/backfill_state.cc @@ -407,7 +407,34 @@ BackfillState::PrimaryScanning::react(PrimaryScanned evt) LOG_PREFIX(BackfillState::PrimaryScanning::react::PrimaryScanned); DEBUGDPP("", pg()); backfill_state().backfill_info = std::move(evt.result); - return transit(); + if (!backfill_state().is_suspended()) { + return transit(); + } else { + DEBUGDPP("backfill suspended, not going Enqueuing", pg()); + backfill_state().go_enqueuing_on_resume(); + } + return discard_event(); +} + +boost::statechart::result +BackfillState::PrimaryScanning::react(CancelBackfill evt) +{ + LOG_PREFIX(BackfillState::PrimaryScanning::react::SuspendBackfill); + DEBUGDPP("suspended within PrimaryScanning", pg()); + backfill_state().on_suspended(); + return discard_event(); +} + +boost::statechart::result +BackfillState::PrimaryScanning::react(Triggered evt) +{ + LOG_PREFIX(BackfillState::PrimaryScanning::react::Triggered); + ceph_assert(backfill_state().is_suspended()); + if (backfill_state().on_resumed()) { + DEBUGDPP("Backfill resumed, going Enqueuing", pg()); + return transit(); + } + return discard_event(); } boost::statechart::result @@ -470,12 +497,17 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt) if (waiting_on_backfill.empty()) { ceph_assert(backfill_state().peer_backfill_info.size() == \ peering_state().get_backfill_targets().size()); - return transit(); + if (!backfill_state().is_suspended()) { + return transit(); + } else { + DEBUGDPP("backfill suspended, not going Enqueuing", pg()); + backfill_state().go_enqueuing_on_resume(); + } } } else { - // we canceled backfill for a while due to a too full, and this + // we suspended backfill for a while due to a too full, and this // is an extra response from a non-too-full peer - DEBUGDPP("canceled backfill (too full?)", pg()); + DEBUGDPP("suspended backfill (too full?)", pg()); } return discard_event(); } @@ -483,8 +515,22 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt) boost::statechart::result BackfillState::ReplicasScanning::react(CancelBackfill evt) { - LOG_PREFIX(BackfillState::ReplicasScanning::react::CancelBackfill); - DEBUGDPP("cancelled within ReplicasScanning", pg()); + LOG_PREFIX(BackfillState::ReplicasScanning::react::SuspendBackfill); + DEBUGDPP("suspended within ReplicasScanning", pg()); + backfill_state().on_suspended(); + return discard_event(); +} + +boost::statechart::result +BackfillState::ReplicasScanning::react(Triggered evt) +{ + LOG_PREFIX(BackfillState::ReplicasScanning::react::Triggered); + ceph_assert(backfill_state().is_suspended()); + if (backfill_state().on_resumed()) { + DEBUGDPP("Backfill resumed, going Enqueuing", pg()); + return transit(); + } + return discard_event(); } boost::statechart::result @@ -510,7 +556,34 @@ BackfillState::Waiting::react(ObjectPushed evt) LOG_PREFIX(BackfillState::Waiting::react::ObjectPushed); DEBUGDPP("Waiting::react() on ObjectPushed; evt.object={}", pg(), evt.object); backfill_state().progress_tracker->complete_to(evt.object, evt.stat, false); - return transit(); + if (!backfill_state().is_suspended()) { + return transit(); + } else { + DEBUGDPP("backfill suspended, not going Enqueuing", pg()); + backfill_state().go_enqueuing_on_resume(); + } + return discard_event(); +} + +boost::statechart::result +BackfillState::Waiting::react(CancelBackfill evt) +{ + LOG_PREFIX(BackfillState::Waiting::react::SuspendBackfill); + DEBUGDPP("suspended within Waiting", pg()); + backfill_state().on_suspended(); + return discard_event(); +} + +boost::statechart::result +BackfillState::Waiting::react(Triggered evt) +{ + LOG_PREFIX(BackfillState::Waiting::react::Triggered); + ceph_assert(backfill_state().is_suspended()); + if (backfill_state().on_resumed()) { + DEBUGDPP("Backfill resumed, going Enqueuing", pg()); + return transit(); + } + return discard_event(); } // -- Done diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h index 34400d930b2b9..463be4a7a2eb5 100644 --- a/src/crimson/osd/backfill_state.h +++ b/src/crimson/osd/backfill_state.h @@ -210,11 +210,15 @@ struct BackfillState { sc::custom_reaction, sc::custom_reaction, sc::transition, + sc::custom_reaction, + sc::custom_reaction, sc::transition>; explicit PrimaryScanning(my_context); sc::result react(ObjectPushed); // collect scanning result and transit to Enqueuing. sc::result react(PrimaryScanned); + sc::result react(CancelBackfill); + sc::result react(Triggered); }; struct ReplicasScanning : sc::state, @@ -223,6 +227,7 @@ struct BackfillState { sc::custom_reaction, sc::custom_reaction, sc::custom_reaction, + sc::custom_reaction, sc::transition, sc::transition>; explicit ReplicasScanning(my_context); @@ -231,6 +236,7 @@ struct BackfillState { sc::result react(ObjectPushed); sc::result react(ReplicaScanned); sc::result react(CancelBackfill); + sc::result react(Triggered); // indicate whether a particular peer should be scanned to retrieve // BackfillInterval for new range of hobject_t namespace. @@ -249,9 +255,13 @@ struct BackfillState { using reactions = boost::mpl::list< sc::custom_reaction, sc::transition, + sc::custom_reaction, + sc::custom_reaction, sc::transition>; explicit Waiting(my_context); sc::result react(ObjectPushed); + sc::result react(CancelBackfill); + sc::result react(Triggered); }; struct Done : sc::state, @@ -296,6 +306,26 @@ struct BackfillState { } } private: + struct backfill_suspend_state_t { + bool suspended = false; + bool should_go_enqueuing = false; + } backfill_suspend_state; + bool is_suspended() const { + return backfill_suspend_state.suspended; + } + void on_suspended() { + ceph_assert(!is_suspended()); + backfill_suspend_state = {true, false}; + } + bool on_resumed() { + auto go_enqueuing = backfill_suspend_state.should_go_enqueuing; + backfill_suspend_state = {false, false}; + return go_enqueuing; + } + void go_enqueuing_on_resume() { + ceph_assert(is_suspended()); + backfill_suspend_state.should_go_enqueuing = true; + } hobject_t last_backfill_started; BackfillInterval backfill_info; std::map peer_backfill_info; From 1e22c110183c6512b771e421723b4cab172074ed Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Wed, 11 Sep 2024 11:37:40 +0800 Subject: [PATCH 5/5] test/crimson/test_backfill: more detailed backfill cancellation tests Emulate all backfill cancellation scenarios Signed-off-by: Xuehan Xu --- src/test/crimson/test_backfill.cc | 139 ++++++++++++++++++++++++++++-- 1 file changed, 134 insertions(+), 5 deletions(-) diff --git a/src/test/crimson/test_backfill.cc b/src/test/crimson/test_backfill.cc index 6d100984adfab..e0fc5821d0896 100644 --- a/src/test/crimson/test_backfill.cc +++ b/src/test/crimson/test_backfill.cc @@ -119,6 +119,11 @@ class BackfillFixture : public crimson::osd::BackfillState::BackfillListener { events_to_dispatch.emplace_back(event.intrusive_from_this()); } + template + void schedule_event_immediate(const EventT& event) { + events_to_dispatch.emplace_front(event.intrusive_from_this()); + } + // BackfillListener { void request_replica_scan( const pg_shard_t& target, @@ -188,12 +193,11 @@ class BackfillFixture : public crimson::osd::BackfillState::BackfillListener { struct PGFacade; void cancel() { - events_to_dispatch.clear(); - schedule_event(crimson::osd::BackfillState::CancelBackfill{}); + schedule_event_immediate(crimson::osd::BackfillState::CancelBackfill{}); } void resume() { - schedule_event(crimson::osd::BackfillState::Triggered{}); + schedule_event_immediate(crimson::osd::BackfillState::Triggered{}); } }; @@ -455,7 +459,38 @@ TEST(backfill, two_empty_replicas) EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); } -TEST(backfill, cancel_resume) +TEST(backfill, cancel_resume_middle_of_primaryscan) +{ + const auto reference_store = FakeStore{ { + { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, + { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} }, + { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} }, + }}; + auto cluster_fixture = BackfillFixtureBuilder::add_source( + reference_store.objs + ).add_target( + { /* nothing 1 */ } + ).add_target( + { /* nothing 2 */ } + ).get_result(); + + EXPECT_CALL(cluster_fixture, backfilled); + cluster_fixture.cancel(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.resume(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.next_till_done(); + + EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); +} + +TEST(backfill, cancel_resume_middle_of_replicascan1) { const auto reference_store = FakeStore{ { { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, @@ -486,7 +521,7 @@ TEST(backfill, cancel_resume) EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); } -TEST(backfill, cancel_resume_middle_of_scan) +TEST(backfill, cancel_resume_middle_of_replicascan2) { const auto reference_store = FakeStore{ { { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, @@ -517,6 +552,100 @@ TEST(backfill, cancel_resume_middle_of_scan) EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); } +TEST(backfill, cancel_resume_middle_of_push1) +{ + const auto reference_store = FakeStore{ { + { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, + { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} }, + { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} }, + }}; + auto cluster_fixture = BackfillFixtureBuilder::add_source( + reference_store.objs + ).add_target( + { /* nothing 1 */ } + ).add_target( + { /* nothing 2 */ } + ).get_result(); + + EXPECT_CALL(cluster_fixture, backfilled); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.cancel(); + cluster_fixture.next_round2(); + cluster_fixture.resume(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.next_till_done(); + + EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); +} + +TEST(backfill, cancel_resume_middle_of_push2) +{ + const auto reference_store = FakeStore{ { + { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, + { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} }, + { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} }, + }}; + auto cluster_fixture = BackfillFixtureBuilder::add_source( + reference_store.objs + ).add_target( + { /* nothing 1 */ } + ).add_target( + { /* nothing 2 */ } + ).get_result(); + + EXPECT_CALL(cluster_fixture, backfilled); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.cancel(); + cluster_fixture.next_round2(); + cluster_fixture.resume(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.next_till_done(); + + EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); +} + +TEST(backfill, cancel_resume_middle_of_push3) +{ + const auto reference_store = FakeStore{ { + { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, + { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} }, + { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} }, + }}; + auto cluster_fixture = BackfillFixtureBuilder::add_source( + reference_store.objs + ).add_target( + { /* nothing 1 */ } + ).add_target( + { /* nothing 2 */ } + ).get_result(); + + EXPECT_CALL(cluster_fixture, backfilled); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.cancel(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.resume(); + cluster_fixture.next_round2(); + cluster_fixture.next_round2(); + cluster_fixture.next_till_done(); + + EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); +} + namespace StoreRandomizer { // FIXME: copied & pasted from test/test_snap_mapper.cc. We need to // find a way to avoid code duplication in test. A static library?