Skip to content

Commit

Permalink
[core] Detect and diagnose split brain in checkpoint proposals (Myste…
Browse files Browse the repository at this point in the history
  • Loading branch information
williampsmith authored Dec 12, 2023
1 parent 36a50a3 commit 879d7b6
Show file tree
Hide file tree
Showing 12 changed files with 571 additions and 38 deletions.
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions crates/sui-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ bcs.workspace = true
bytes.workspace = true
chrono.workspace = true
dashmap.workspace = true
diffy = { version = "0.3", default-features = false }
either.workspace = true
enum_dispatch.workspace = true
eyre.workspace = true
Expand Down Expand Up @@ -95,6 +96,7 @@ pretty_assertions.workspace = true
serde-reflection.workspace = true
serde_yaml.workspace = true

test-cluster.workspace = true
move-symbol-pool.workspace = true

sui-test-transaction-builder.workspace = true
Expand Down
83 changes: 78 additions & 5 deletions crates/sui-core/src/authority.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ use sui_json_rpc_types::{
SuiObjectDataFilter, SuiTransactionBlockData, SuiTransactionBlockEffects,
SuiTransactionBlockEvents, TransactionFilter,
};
use sui_macros::{fail_point, fail_point_async};
use sui_macros::{fail_point, fail_point_async, fail_point_if};
use sui_protocol_config::{ProtocolConfig, SupportedProtocolVersions};
use sui_storage::indexes::{CoinInfo, ObjectIndexChanges};
use sui_storage::key_value_store::{TransactionKeyValueStore, TransactionKeyValueStoreTrait};
Expand All @@ -95,10 +95,10 @@ use sui_types::inner_temporary_store::{
use sui_types::message_envelope::Message;
use sui_types::messages_checkpoint::{
CertifiedCheckpointSummary, CheckpointCommitment, CheckpointContents, CheckpointContentsDigest,
CheckpointDigest, CheckpointSequenceNumber, CheckpointSummary, CheckpointTimestamp,
VerifiedCheckpoint,
CheckpointDigest, CheckpointRequest, CheckpointRequestV2, CheckpointResponse,
CheckpointResponseV2, CheckpointSequenceNumber, CheckpointSummary, CheckpointSummaryResponse,
CheckpointTimestamp, VerifiedCheckpoint,
};
use sui_types::messages_checkpoint::{CheckpointRequest, CheckpointResponse};
use sui_types::messages_consensus::AuthorityCapabilities;
use sui_types::messages_grpc::{
HandleTransactionResponse, LayoutGenerationOption, ObjectInfoRequest, ObjectInfoRequestKind,
Expand Down Expand Up @@ -1364,7 +1364,9 @@ impl AuthorityState {
let protocol_config = epoch_store.protocol_config();
let transaction_data = &certificate.data().intent_message().value;
let (kind, signer, gas) = transaction_data.execution_parts();
let (inner_temp_store, effects, execution_error_opt) =

#[allow(unused_mut)]
let (inner_temp_store, mut effects, execution_error_opt) =
epoch_store.executor().execute_transaction_to_effects(
&self.database,
protocol_config,
Expand All @@ -1387,6 +1389,11 @@ impl AuthorityState {
tx_digest,
);

fail_point_if!("cp_execution_nondeterminism", || {
#[cfg(msim)]
self.create_fail_state(certificate, epoch_store, &mut effects);
});

Ok((inner_temp_store, effects, execution_error_opt.err()))
}

Expand Down Expand Up @@ -1728,6 +1735,37 @@ impl AuthorityState {
.await
}

#[cfg(msim)]
fn create_fail_state(
&self,
certificate: &VerifiedExecutableTransaction,
epoch_store: &Arc<AuthorityPerEpochStore>,
effects: &mut TransactionEffects,
) {
use std::cell::RefCell;
thread_local! {
static FAIL_STATE: RefCell<(u64, HashSet<AuthorityName>)> = RefCell::new((0, HashSet::new()));
}
if !certificate.data().intent_message().value.is_system_tx() {
let committee = epoch_store.committee();
let cur_stake = (**committee).weight(&self.name);
if cur_stake > 0 {
FAIL_STATE.with_borrow_mut(|fail_state| {
//let (&mut failing_stake, &mut failing_validators) = fail_state;
if fail_state.0 < committee.validity_threshold() {
fail_state.0 += cur_stake;
fail_state.1.insert(self.name);
}

if fail_state.1.contains(&self.name) {
info!("cp_exec failing tx");
effects.gas_cost_summary_mut_for_testing().computation_cost += 1;
}
});
}
}
}

fn process_object_index(
&self,
effects: &TransactionEffects,
Expand Down Expand Up @@ -2098,6 +2136,41 @@ impl AuthorityState {
})
}

#[instrument(level = "trace", skip_all)]
pub fn handle_checkpoint_request_v2(
&self,
request: &CheckpointRequestV2,
) -> SuiResult<CheckpointResponseV2> {
let summary = if request.certified {
let summary = match request.sequence_number {
Some(seq) => self
.checkpoint_store
.get_checkpoint_by_sequence_number(seq)?,
None => self.checkpoint_store.get_latest_certified_checkpoint(),
}
.map(|v| v.into_inner());
summary.map(CheckpointSummaryResponse::Certified)
} else {
let summary = match request.sequence_number {
Some(seq) => self.checkpoint_store.get_locally_computed_checkpoint(seq)?,
None => self
.checkpoint_store
.get_latest_locally_computed_checkpoint(),
};
summary.map(CheckpointSummaryResponse::Pending)
};
let contents = match &summary {
Some(s) => self
.checkpoint_store
.get_checkpoint_contents(&s.content_digest())?,
None => None,
};
Ok(CheckpointResponseV2 {
checkpoint: summary,
contents,
})
}

fn check_protocol_version(
supported_protocol_versions: SupportedProtocolVersions,
current_version: ProtocolVersion,
Expand Down
21 changes: 20 additions & 1 deletion crates/sui-core/src/authority_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ use std::time::Duration;
use sui_network::{api::ValidatorClient, tonic};
use sui_types::base_types::AuthorityName;
use sui_types::committee::CommitteeWithNetworkMetadata;
use sui_types::messages_checkpoint::{CheckpointRequest, CheckpointResponse};
use sui_types::messages_checkpoint::{
CheckpointRequest, CheckpointRequestV2, CheckpointResponse, CheckpointResponseV2,
};
use sui_types::multiaddr::Multiaddr;
use sui_types::sui_system_state::SuiSystemState;
use sui_types::{error::SuiError, transaction::*};
Expand Down Expand Up @@ -59,6 +61,11 @@ pub trait AuthorityAPI {
request: CheckpointRequest,
) -> Result<CheckpointResponse, SuiError>;

async fn handle_checkpoint_v2(
&self,
request: CheckpointRequestV2,
) -> Result<CheckpointResponseV2, SuiError>;

// This API is exclusively used by the benchmark code.
// Hence it's OK to return a fixed system state type.
async fn handle_system_state_object(
Expand Down Expand Up @@ -189,6 +196,18 @@ impl AuthorityAPI for NetworkAuthorityClient {
.map_err(Into::into)
}

/// Handle Object information requests for this account.
async fn handle_checkpoint_v2(
&self,
request: CheckpointRequestV2,
) -> Result<CheckpointResponseV2, SuiError> {
self.client()
.checkpoint_v2(request)
.await
.map(tonic::Response::into_inner)
.map_err(Into::into)
}

async fn handle_system_state_object(
&self,
request: SystemStateRequest,
Expand Down
15 changes: 14 additions & 1 deletion crates/sui-core/src/authority_server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ use sui_types::{effects::TransactionEffectsAPI, message_envelope::Message};
use sui_types::{error::*, transaction::*};
use sui_types::{
fp_ensure,
messages_checkpoint::{CheckpointRequest, CheckpointResponse},
messages_checkpoint::{
CheckpointRequest, CheckpointRequestV2, CheckpointResponse, CheckpointResponseV2,
},
};
use tap::TapFallible;
use tokio::task::JoinHandle;
Expand Down Expand Up @@ -606,6 +608,17 @@ impl Validator for ValidatorService {
return Ok(tonic::Response::new(response));
}

async fn checkpoint_v2(
&self,
request: tonic::Request<CheckpointRequestV2>,
) -> Result<tonic::Response<CheckpointResponseV2>, tonic::Status> {
let request = request.into_inner();

let response = self.state.handle_checkpoint_request_v2(&request)?;

return Ok(tonic::Response::new(response));
}

async fn get_system_state_object(
&self,
_request: tonic::Request<SystemStateRequest>,
Expand Down
7 changes: 7 additions & 0 deletions crates/sui-core/src/checkpoints/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ pub struct CheckpointMetrics {
pub highest_accumulated_epoch: IntGauge,
pub checkpoint_creation_latency_ms: Histogram,
pub remote_checkpoint_forks: IntCounter,
pub split_brain_checkpoint_forks: IntCounter,
pub last_created_checkpoint_age_ms: Histogram,
pub last_certified_checkpoint_age_ms: Histogram,
}
Expand Down Expand Up @@ -105,6 +106,12 @@ impl CheckpointMetrics {
registry
)
.unwrap(),
split_brain_checkpoint_forks: register_int_counter_with_registry!(
"split_brain_checkpoint_forks",
"Number of checkpoints that have resulted in a split brain",
registry
)
.unwrap(),
};
Arc::new(this)
}
Expand Down
Loading

0 comments on commit 879d7b6

Please sign in to comment.