Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

backing: Back-off from backing on approval checking lag #2908

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 56 additions & 8 deletions polkadot/node/core/provisioner/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ use polkadot_primitives::{
BackedCandidate, BlockNumber, CandidateHash, CandidateReceipt, CoreState, Hash, Id as ParaId,
OccupiedCoreAssumption, SignedAvailabilityBitfield, ValidatorIndex,
};
use std::collections::{BTreeMap, HashMap};
use std::{
cmp::max,
collections::{BTreeMap, HashMap},
};

mod disputes;
mod error;
Expand All @@ -52,6 +55,8 @@ mod metrics;
pub use self::metrics::*;
use error::{Error, FatalResult};

const TOLERATED_APPROVAL_CHECKING_LAG: i32 = 8;

#[cfg(test)]
mod tests;

Expand Down Expand Up @@ -124,10 +129,17 @@ impl<Context> ProvisionerSubsystem {
async fn run<Context>(mut ctx: Context, metrics: Metrics) -> FatalResult<()> {
let mut inherent_delays = InherentDelays::new();
let mut per_relay_parent = HashMap::new();
let mut approval_checking_lag: BlockNumber = 0;

loop {
let result =
run_iteration(&mut ctx, &mut per_relay_parent, &mut inherent_delays, &metrics).await;
let result = run_iteration(
&mut ctx,
&mut per_relay_parent,
&mut inherent_delays,
&metrics,
&mut approval_checking_lag,
)
.await;

match result {
Ok(()) => break,
Expand All @@ -144,6 +156,7 @@ async fn run_iteration<Context>(
per_relay_parent: &mut HashMap<Hash, PerRelayParent>,
inherent_delays: &mut InherentDelays,
metrics: &Metrics,
approval_checking_lag: &mut BlockNumber,
) -> Result<(), Error> {
loop {
futures::select! {
Expand All @@ -155,7 +168,7 @@ async fn run_iteration<Context>(
FromOrchestra::Signal(OverseerSignal::BlockFinalized(..)) => {},
FromOrchestra::Signal(OverseerSignal::Conclude) => return Ok(()),
FromOrchestra::Communication { msg } => {
handle_communication(ctx, per_relay_parent, msg, metrics).await?;
handle_communication(ctx, per_relay_parent, msg, metrics, approval_checking_lag).await?;
},
}
},
Expand All @@ -171,7 +184,7 @@ async fn run_iteration<Context>(

let return_senders = std::mem::take(&mut state.awaiting_inherent);
if !return_senders.is_empty() {
send_inherent_data_bg(ctx, &state, return_senders, metrics.clone()).await?;
send_inherent_data_bg(ctx, &state, return_senders, metrics.clone(), *approval_checking_lag).await?;
}
}
}
Expand Down Expand Up @@ -207,6 +220,7 @@ async fn handle_communication<Context>(
per_relay_parent: &mut HashMap<Hash, PerRelayParent>,
message: ProvisionerMessage,
metrics: &Metrics,
approval_checking_lag: &mut BlockNumber,
) -> Result<(), Error> {
match message {
ProvisionerMessage::RequestInherentData(relay_parent, return_sender) => {
Expand All @@ -215,8 +229,14 @@ async fn handle_communication<Context>(
if let Some(state) = per_relay_parent.get_mut(&relay_parent) {
if state.is_inherent_ready {
gum::trace!(target: LOG_TARGET, ?relay_parent, "Calling send_inherent_data.");
send_inherent_data_bg(ctx, &state, vec![return_sender], metrics.clone())
.await?;
send_inherent_data_bg(
ctx,
&state,
vec![return_sender],
metrics.clone(),
*approval_checking_lag,
)
.await?;
} else {
gum::trace!(
target: LOG_TARGET,
Expand All @@ -237,6 +257,9 @@ async fn handle_communication<Context>(
note_provisionable_data(state, &span, data);
}
},
ProvisionerMessage::ApprovalCheckingLagUpdate(lag) => {
*approval_checking_lag = lag;
},
}

Ok(())
Expand All @@ -248,6 +271,7 @@ async fn send_inherent_data_bg<Context>(
per_relay_parent: &PerRelayParent,
return_senders: Vec<oneshot::Sender<ProvisionerInherentData>>,
metrics: Metrics,
approval_checking_lag: BlockNumber,
) -> Result<(), Error> {
let leaf = per_relay_parent.leaf.clone();
let signed_bitfields = per_relay_parent.signed_bitfields.clone();
Expand Down Expand Up @@ -275,6 +299,7 @@ async fn send_inherent_data_bg<Context>(
return_senders,
&mut sender,
&metrics,
approval_checking_lag,
) // Make sure call is not taking forever:
.timeout(SEND_INHERENT_DATA_TIMEOUT)
.map(|v| match v {
Expand Down Expand Up @@ -386,6 +411,7 @@ async fn send_inherent_data(
return_senders: Vec<oneshot::Sender<ProvisionerInherentData>>,
from_job: &mut impl overseer::ProvisionerSenderTrait,
metrics: &Metrics,
approval_checking_lag: BlockNumber,
) -> Result<(), Error> {
gum::trace!(
target: LOG_TARGET,
Expand Down Expand Up @@ -436,6 +462,7 @@ async fn send_inherent_data(
prospective_parachains_mode,
leaf.hash,
from_job,
approval_checking_lag,
)
.await?;

Expand Down Expand Up @@ -708,12 +735,13 @@ async fn select_candidates(
prospective_parachains_mode: ProspectiveParachainsMode,
relay_parent: Hash,
sender: &mut impl overseer::ProvisionerSenderTrait,
approval_checking_lag: BlockNumber,
) -> Result<Vec<BackedCandidate>, Error> {
gum::trace!(target: LOG_TARGET,
leaf_hash=?relay_parent,
"before GetBackedCandidates");

let selected_candidates = match prospective_parachains_mode {
let mut selected_candidates = match prospective_parachains_mode {
ProspectiveParachainsMode::Enabled { .. } =>
request_backable_candidates(availability_cores, bitfields, relay_parent, sender).await?,
ProspectiveParachainsMode::Disabled =>
Expand All @@ -727,6 +755,26 @@ async fn select_candidates(
.await?,
};

// Eponentially back-off if we are past a tolerated approval checking lag.
// This way we avoid creating new work for approval subsystem and give it
// the opportunity to catch if it has falled behind.
let max_candidates_to_back = selected_candidates.len() /
usize::pow(
2,
max(0, approval_checking_lag as i32 - TOLERATED_APPROVAL_CHECKING_LAG) as u32,
);

if max_candidates_to_back != selected_candidates.len() {
gum::info!(
target: LOG_TARGET,
available_for_backing = ?selected_candidates.len(),
?max_candidates_to_back,
?approval_checking_lag,
"High approval checking lag throttle backing"
);
selected_candidates.truncate(max_candidates_to_back);
}

// now get the backed candidates corresponding to these candidate receipts
let (tx, rx) = oneshot::channel();
sender.send_unbounded_message(CandidateBackingMessage::GetBackedCandidates(
Expand Down
27 changes: 25 additions & 2 deletions polkadot/node/core/provisioner/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -399,18 +399,27 @@ mod select_candidates {
prospective_parachains_mode,
Default::default(),
&mut tx,
0,
)
.await
.unwrap();
},
)
}

#[test]
fn selects_correct_candidates() {
run_selects_correct_candidates(0);
run_selects_correct_candidates(2);
run_selects_correct_candidates(5);
run_selects_correct_candidates(6);
run_selects_correct_candidates(10);
}

// this tests that only the appropriate candidates get selected.
// To accomplish this, we supply a candidate list containing one candidate per possible core;
// the candidate selection algorithm must filter them to the appropriate set
#[test]
fn selects_correct_candidates() {
fn run_selects_correct_candidates(approval_checking_lag: BlockNumber) {
let mock_cores = mock_availability_cores();

let empty_hash = PersistedValidationData::<Hash, BlockNumber>::default().hash();
Expand Down Expand Up @@ -475,10 +484,21 @@ mod select_candidates {
prospective_parachains_mode,
Default::default(),
&mut tx,
approval_checking_lag,
)
.await
.unwrap();

assert_eq!(
expected_candidates.len() /
usize::pow(
2,
max(0, approval_checking_lag as i32 - TOLERATED_APPROVAL_CHECKING_LAG)
as u32,
),
result.len()
);

result.into_iter().for_each(|c| {
assert!(
expected_candidates.iter().any(|c2| c.candidate.corresponds_to(c2)),
Expand Down Expand Up @@ -553,6 +573,7 @@ mod select_candidates {
prospective_parachains_mode,
Default::default(),
&mut tx,
0,
)
.await
.unwrap();
Expand Down Expand Up @@ -620,6 +641,7 @@ mod select_candidates {
prospective_parachains_mode,
Default::default(),
&mut tx,
0,
)
.await
.unwrap();
Expand Down Expand Up @@ -686,6 +708,7 @@ mod select_candidates {
prospective_parachains_mode,
Default::default(),
&mut tx,
0,
)
.await
.unwrap();
Expand Down
7 changes: 7 additions & 0 deletions polkadot/node/service/src/relay_chain_selection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ use polkadot_node_subsystem::messages::{
ApprovalDistributionMessage, ApprovalVotingMessage, ChainSelectionMessage,
DisputeCoordinatorMessage, HighestApprovedAncestorBlock,
};
use polkadot_node_subsystem_types::messages::ProvisionerMessage;
use polkadot_node_subsystem_util::metrics::{self, prometheus};
use polkadot_overseer::{AllMessages, Handle};
use polkadot_primitives::{Block as PolkadotBlock, BlockNumber, Hash, Header as PolkadotHeader};
Expand Down Expand Up @@ -483,6 +484,12 @@ where
std::any::type_name::<Self>(),
)
.await;
overseer_handle
.send_msg(
ProvisionerMessage::ApprovalCheckingLagUpdate(lag),
std::any::type_name::<Self>(),
)
.await;
};

spawn_handle.spawn(
Expand Down
2 changes: 2 additions & 0 deletions polkadot/node/subsystem-types/src/messages.rs
Original file line number Diff line number Diff line change
Expand Up @@ -825,6 +825,8 @@ pub enum ProvisionerMessage {
RequestInherentData(Hash, oneshot::Sender<ProvisionerInherentData>),
/// This data should become part of a relay chain block
ProvisionableData(Hash, ProvisionableData),
/// Approval checking lag update measured in blocks.
ApprovalCheckingLagUpdate(BlockNumber),
}

/// Message to the Collation Generation subsystem.
Expand Down
Loading