From daa619be721c2f216821b26648172dfedd21c257 Mon Sep 17 00:00:00 2001
From: Martin Sumner
Date: Thu, 28 Mar 2024 15:32:12 +0000
Subject: [PATCH] When triggering tree repairs - repair each vnode once (#26)

Each vnode keeps track of the ID of the last repair it triggered. If that
ID changes, the vnode assumes repair has been triggered again and will
prompt a repair (as long as the vnode is not filtered on the query). This
means that for each call to riak_kv_ttaaefs_manager:trigger_tree_repairs/0,
each vnode should repair the tree only once.

If any faulty segment is not repaired, then the next time this node
performs a full-sync the repair will be re-triggered, and each vnode
should again repair once (and once only).

Note that with N nodes in M clusters, when there is a faulty segment there
will be N x M fetch_clocks_nval queries for every local full-sync event
(and hence for every trigger of read repairs).

Should sync_state become true, repairs will be disabled. To force repairs
as an operator, call riak_kv_ttaaefs_manager:trigger_tree_repairs() from
remote_console. This will force a read repair only once for each vnode
(unless it is disabled by a local full-sync where sync_state=true).
---
 src/riak_kv_ttaaefs_manager.erl | 52 +++++++++++++++++++++++++++------
 src/riak_kv_vnode.erl           | 13 +++++----
 2 files changed, 51 insertions(+), 14 deletions(-)

diff --git a/src/riak_kv_ttaaefs_manager.erl b/src/riak_kv_ttaaefs_manager.erl
index c435274a8..696dd01e7 100644
--- a/src/riak_kv_ttaaefs_manager.erl
+++ b/src/riak_kv_ttaaefs_manager.erl
@@ -48,7 +48,11 @@
         clear_range/0,
         get_range/0,
         autocheck_suppress/0,
-        autocheck_suppress/1]).
+        autocheck_suppress/1,
+        maybe_repair_trees/2,
+        trigger_tree_repairs/0,
+        disable_tree_repairs/0
+    ]).
 
 -include_lib("kernel/include/logger.hrl").
 
@@ -134,8 +138,9 @@
         calendar:datetime()}.
 -type slot_info_fun() :: fun(() -> node_info()).
+-type repair_id() :: {pid(), non_neg_integer()}.
 
--export_type([work_item/0]).
+-export_type([work_item/0, repair_id/0]).
 
 %%%============================================================================
 %%% API
@@ -742,19 +747,48 @@ drop_next_autocheck() ->
 
 -spec trigger_tree_repairs() -> ok.
 trigger_tree_repairs() ->
+    Cnt = get_repair_count(),
     ?LOG_INFO(
-        "Setting node to repair trees as unsync'd all_check had no repairs"),
-    application:set_env(riak_kv, aae_fetchclocks_repair, true).
+        "Setting node to repair trees as unsync'd all_check had no "
+        "repairs - count of triggered repairs for this node is ~w",
+        [Cnt]
+    ),
+    application:set_env(
+        riak_kv, aae_fetchclocks_repair, {true, {self(), Cnt}}),
+    application:set_env(
+        riak_kv, aae_fetchclocks_repair_count, Cnt).
 
 -spec disable_tree_repairs() -> ok.
 disable_tree_repairs() ->
-    case application:get_env(riak_kv, aae_fetchclocks_repair_force, false) of
-        true ->
-            ok;
-        false ->
-            application:set_env(riak_kv, aae_fetchclocks_repair, false)
+    application:set_env(riak_kv, aae_fetchclocks_repair, false).
+
+-spec get_repair_count() -> pos_integer().
+get_repair_count() ->
+    case application:get_env(riak_kv, aae_fetchclocks_repair_count, 0) of
+        N when is_integer(N) ->
+            N + 1;
+        _ ->
+            1
     end.
+
+-spec maybe_repair_trees(
+    repair_id()|undefined, boolean()) -> {boolean(), repair_id()|undefined}.
+maybe_repair_trees(LastRepairID, true) ->
+    %% If only a subset of partitions are to be queried on the vnode, then
+    %% don't repair. Wait until all partitions are to be repaired so that
+    %% there is not a partial repair
+    {false, LastRepairID};
+maybe_repair_trees(LastRepairID, _Filtered) ->
+    case application:get_env(riak_kv, aae_fetchclocks_repair, false) of
+        {true, LastRepairID} ->
+            {false, LastRepairID};
+        {true, UpdRepairID} ->
+            {true, UpdRepairID};
+        _ ->
+            {false, LastRepairID}
+    end.
+
 
 %%%============================================================================
 %%% Internal functions
 %%%============================================================================
diff --git a/src/riak_kv_vnode.erl b/src/riak_kv_vnode.erl
index 491421475..bdc2d1531 100644
--- a/src/riak_kv_vnode.erl
+++ b/src/riak_kv_vnode.erl
@@ -165,7 +165,8 @@
         update_hook :: update_hook(),
         max_aae_queue_time :: non_neg_integer(),
         enable_nextgenreplsrc = false :: boolean(),
-        sizelimit_nextgenreplsrc = 0 :: non_neg_integer()
+        sizelimit_nextgenreplsrc = 0 :: non_neg_integer(),
+        tree_repair_id :: undefined|riak_kv_ttaaefs_manager:repair_id()
        }).
 
 -type index_op() :: add | remove.
@@ -1719,17 +1720,19 @@ handle_aaefold({merge_branch_nval, Nval, BranchIDs},
     {noreply, State};
 handle_aaefold({fetch_clocks_nval, Nval, SegmentIDs},
                 InitAcc, Nval,
-                IndexNs, _Filtered, ReturnFun, Cntrl, Sender,
+                IndexNs, Filtered, ReturnFun, Cntrl, Sender,
                 State) ->
-    case app_helper:get_env(riak_kv, aae_fetchclocks_repair, false) of
+    {ToRepair, NewRepairID} =
+        riak_kv_ttaaefs_manager:maybe_repair_trees(
+            State#state.tree_repair_id, Filtered),
+    case ToRepair of
         true ->
             aae_controller:aae_fetchclocks(Cntrl,
                                            IndexNs,
                                            SegmentIDs,
                                            ReturnFun,
                                            fun preflistfun/2),
-
-            {noreply, State};
+            {noreply, State#state{tree_repair_id = NewRepairID}};
         false ->
             %% Using fetch_clocks_range will mean that the AF3_QUEUE will be
             %% used for scheduling the work not the aae_runner. Also, the