[reconfigurator] Add planner support for starting new Crucible pantries (#6836)

This is a much smaller change than the diff stat implies; most of the
changes are expectorate outputs because the example system we set up for
tests now includes Crucible pantry zones, which shifted a bunch of other
zone UUIDs.

Fully supporting Crucible pantry replacement depends on #3763, which I'm
continuing to work on. But the reconfigurator side of "start new
pantries" is about as trivial as things go and does not depend on #3763,
hence this PR.
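
Background: the expectorate outputs mentioned above are snapshot-test files. The expectorate crate compares rendered output against a checked-in file and can regenerate it on demand; a rough sketch of the pattern, with a hypothetical file path:

// Hedged sketch of an expectorate-style snapshot check; the path is
// hypothetical. Running tests with EXPECTORATE=overwrite rewrites the
// checked-in file instead of failing on a mismatch.
fn check_blueprint_output(rendered: &str) {
    expectorate::assert_contents("tests/output/example_blueprint.txt", rendered);
}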
jgallagher authored Oct 14, 2024
1 parent 6926ca9 commit 6fb91c6
Showing 20 changed files with 847 additions and 601 deletions.
6 changes: 6 additions & 0 deletions common/src/policy.rs
@@ -25,6 +25,12 @@ pub const OXIMETER_REDUNDANCY: usize = 1;
/// Reconfigurator (to know whether to add new crdb zones)
pub const COCKROACHDB_REDUNDANCY: usize = 5;

/// The amount of redundancy for Crucible Pantry services.
///
/// This is used by both RSS (to distribute the initial set of services) and the
/// Reconfigurator (to know whether to add new pantry zones)
pub const CRUCIBLE_PANTRY_REDUNDANCY: usize = 3;

/// The amount of redundancy for internal DNS servers.
///
/// Must be less than or equal to RESERVED_INTERNAL_DNS_REDUNDANCY.
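Both RSS (laying down the initial service set) and the Reconfigurator (deciding later whether to add zones) consume this constant as a target count. A toy, self-contained illustration of that flow; the struct and accessor here are stand-ins, not omicron's real types:

// Illustrative stand-ins: omicron's real PlanningInput is much richer.
const CRUCIBLE_PANTRY_REDUNDANCY: usize = 3;

struct PlanningInput {
    target_crucible_pantry_zone_count: usize,
}

impl PlanningInput {
    fn target_crucible_pantry_zone_count(&self) -> usize {
        self.target_crucible_pantry_zone_count
    }
}

fn main() {
    let input = PlanningInput {
        target_crucible_pantry_zone_count: CRUCIBLE_PANTRY_REDUNDANCY,
    };
    // The planner keeps adding pantry zones until the fleet meets this target.
    assert_eq!(input.target_crucible_pantry_zone_count(), 3);
}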
2 changes: 2 additions & 0 deletions nexus/reconfigurator/execution/src/dns.rs
@@ -363,6 +363,7 @@ mod test {
use omicron_common::api::external::IdentityMetadataCreateParams;
use omicron_common::policy::BOUNDARY_NTP_REDUNDANCY;
use omicron_common::policy::COCKROACHDB_REDUNDANCY;
use omicron_common::policy::CRUCIBLE_PANTRY_REDUNDANCY;
use omicron_common::policy::INTERNAL_DNS_REDUNDANCY;
use omicron_common::policy::NEXUS_REDUNDANCY;
use omicron_common::policy::OXIMETER_REDUNDANCY;
@@ -1375,6 +1376,7 @@
target_cockroachdb_zone_count: COCKROACHDB_REDUNDANCY,
target_cockroachdb_cluster_version:
CockroachDbClusterVersion::POLICY,
target_crucible_pantry_zone_count: CRUCIBLE_PANTRY_REDUNDANCY,
log,
}
.build()
45 changes: 45 additions & 0 deletions nexus/reconfigurator/planning/src/blueprint_builder/builder.rs
@@ -1193,6 +1193,51 @@ impl<'a> BlueprintBuilder<'a> {
Ok(EnsureMultiple::Changed { added: num_oximeter_to_add, removed: 0 })
}

pub fn sled_ensure_zone_multiple_crucible_pantry(
&mut self,
sled_id: SledUuid,
desired_zone_count: usize,
) -> Result<EnsureMultiple, Error> {
// How many zones do we need to add?
let pantry_count = self
.sled_num_running_zones_of_kind(sled_id, ZoneKind::CruciblePantry);
let num_pantry_to_add =
match desired_zone_count.checked_sub(pantry_count) {
Some(0) => return Ok(EnsureMultiple::NotNeeded),
Some(n) => n,
None => {
return Err(Error::Planner(anyhow!(
"removing a Crucible pantry zone not yet supported \
(sled {sled_id} has {pantry_count}; \
planner wants {desired_zone_count})"
)));
}
};

for _ in 0..num_pantry_to_add {
let pantry_id = self.rng.zone_rng.next();
let ip = self.sled_alloc_ip(sled_id)?;
let port = omicron_common::address::CRUCIBLE_PANTRY_PORT;
let address = SocketAddrV6::new(ip, port, 0, 0);
let zone_type = BlueprintZoneType::CruciblePantry(
blueprint_zone_type::CruciblePantry { address },
);
let filesystem_pool =
self.sled_select_zpool(sled_id, zone_type.kind())?;

let zone = BlueprintZoneConfig {
disposition: BlueprintZoneDisposition::InService,
id: pantry_id,
underlay_address: ip,
filesystem_pool: Some(filesystem_pool),
zone_type,
};
self.sled_add_zone(sled_id, zone)?;
}

Ok(EnsureMultiple::Changed { added: num_pantry_to_add, removed: 0 })
}

pub fn cockroachdb_preserve_downgrade(
&mut self,
version: CockroachDbPreserveDowngrade,
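The new method follows the builder's ensure-multiple pattern: compute the deficit with checked_sub, error out if the desired count is below the current count (removal is not yet supported), and otherwise add zones one at a time. A minimal, self-contained sketch of that control flow; the real method also allocates a zone ID, an underlay IP, and a zpool per zone:

#[derive(Debug, PartialEq)]
enum EnsureMultiple {
    NotNeeded,
    Changed { added: usize, removed: usize },
}

fn ensure_zone_count(
    current: usize,
    desired: usize,
) -> Result<EnsureMultiple, String> {
    match desired.checked_sub(current) {
        Some(0) => Ok(EnsureMultiple::NotNeeded),
        Some(n) => {
            // In the real builder, each of the `n` iterations allocates a
            // zone ID, an underlay IP, and a zpool, then records the zone.
            Ok(EnsureMultiple::Changed { added: n, removed: 0 })
        }
        None => Err(format!(
            "removal not supported (have {current}, want {desired})"
        )),
    }
}

fn main() {
    assert_eq!(
        ensure_zone_count(0, 3),
        Ok(EnsureMultiple::Changed { added: 3, removed: 0 })
    );
    assert_eq!(ensure_zone_count(3, 3), Ok(EnsureMultiple::NotNeeded));
    assert!(ensure_zone_count(4, 3).is_err());
}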
51 changes: 48 additions & 3 deletions nexus/reconfigurator/planning/src/example.rs
@@ -16,6 +16,7 @@ use nexus_types::deployment::OmicronZoneNic;
use nexus_types::deployment::PlanningInput;
use nexus_types::deployment::SledFilter;
use nexus_types::inventory::Collection;
use omicron_common::policy::CRUCIBLE_PANTRY_REDUNDANCY;
use omicron_common::policy::INTERNAL_DNS_REDUNDANCY;
use omicron_uuid_kinds::GenericUuid;
use omicron_uuid_kinds::SledKind;
@@ -62,6 +63,7 @@ pub struct ExampleSystemBuilder {
nexus_count: Option<ZoneCount>,
internal_dns_count: ZoneCount,
external_dns_count: ZoneCount,
crucible_pantry_count: ZoneCount,
create_zones: bool,
create_disks_in_blueprint: bool,
}
@@ -85,6 +87,7 @@ impl ExampleSystemBuilder {
nexus_count: None,
internal_dns_count: ZoneCount(INTERNAL_DNS_REDUNDANCY),
external_dns_count: ZoneCount(Self::DEFAULT_EXTERNAL_DNS_COUNT),
crucible_pantry_count: ZoneCount(CRUCIBLE_PANTRY_REDUNDANCY),
create_zones: true,
create_disks_in_blueprint: true,
}
@@ -163,6 +166,17 @@ impl ExampleSystemBuilder {
Ok(self)
}

/// Set the number of Crucible pantry instances in the example system.
///
/// If [`Self::create_zones`] is set to `false`, this is ignored.
pub fn crucible_pantry_count(
mut self,
crucible_pantry_count: usize,
) -> Self {
self.crucible_pantry_count = ZoneCount(crucible_pantry_count);
self
}

/// Create zones in the example system.
///
/// The default is `true`.
@@ -200,6 +214,7 @@ impl ExampleSystemBuilder {
"nexus_count" => nexus_count.0,
"internal_dns_count" => self.internal_dns_count.0,
"external_dns_count" => self.external_dns_count.0,
"crucible_pantry_count" => self.crucible_pantry_count.0,
"create_zones" => self.create_zones,
"create_disks_in_blueprint" => self.create_disks_in_blueprint,
);
@@ -209,7 +224,8 @@ impl ExampleSystemBuilder {
// there's no external DNS count.)
system
.target_nexus_zone_count(nexus_count.0)
.target_internal_dns_zone_count(self.internal_dns_count.0);
.target_internal_dns_zone_count(self.internal_dns_count.0)
.target_crucible_pantry_zone_count(self.crucible_pantry_count.0);
let mut sled_rng =
TypedUuidRng::from_seed(&self.test_name, "ExampleSystem");
let sled_ids: Vec<_> =
@@ -301,6 +317,12 @@ impl ExampleSystemBuilder {
self.external_dns_count.on(i, self.nsleds),
)
.unwrap();
let _ = builder
.sled_ensure_zone_multiple_crucible_pantry(
sled_id,
self.crucible_pantry_count.on(i, self.nsleds),
)
.unwrap();
}
if self.create_disks_in_blueprint {
let _ =
@@ -426,6 +448,7 @@ mod tests {
ExampleSystemBuilder::new(&logctx.log, TEST_NAME)
.nsleds(5)
.nexus_count(6)
.crucible_pantry_count(5)
.internal_dns_count(2)
.unwrap()
.external_dns_count(10)
@@ -444,9 +467,10 @@
// Check that the system's target counts are set correctly.
assert_eq!(example.system.get_target_nexus_zone_count(), 6);
assert_eq!(example.system.get_target_internal_dns_zone_count(), 2);
assert_eq!(example.system.get_target_crucible_pantry_zone_count(), 5);

// Check that the right number of internal and external DNS zones are
// present in both the blueprint and in the collection.
// Check that the right number of zones are present in both the
// blueprint and in the collection.
let nexus_zones = blueprint_zones_of_kind(&blueprint, ZoneKind::Nexus);
assert_eq!(
nexus_zones.len(),
@@ -507,6 +531,27 @@ mod tests {
external_dns_zones,
);

let crucible_pantry_zones =
blueprint_zones_of_kind(&blueprint, ZoneKind::CruciblePantry);
assert_eq!(
crucible_pantry_zones.len(),
5,
"expected 5 Crucible pantry zones in blueprint, got {}: {:#?}",
crucible_pantry_zones.len(),
crucible_pantry_zones,
);
let crucible_pantry_zones = collection_zones_of_kind(
&example.collection,
ZoneKind::CruciblePantry,
);
assert_eq!(
crucible_pantry_zones.len(),
5,
"expected 5 Crucible pantry zones in collection, got {}: {:#?}",
crucible_pantry_zones.len(),
crucible_pantry_zones,
);

logctx.cleanup_successful();
}

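The self.crucible_pantry_count.on(i, self.nsleds) calls above spread a total zone count across the example system's sleds. ZoneCount::on itself is not shown in this diff; a plausible round-robin split (an assumption, not the actual implementation) would be:

// Hypothetical round-robin split: how many of `total` zones land on sled
// index `i` out of `nsleds`. Sleds with index below the remainder get one
// extra zone.
fn zones_on_sled(total: usize, i: usize, nsleds: usize) -> usize {
    total / nsleds + usize::from(i < total % nsleds)
}

fn main() {
    // 5 pantries across 3 sleds -> 2, 2, 1
    let split: Vec<_> = (0..3).map(|i| zones_on_sled(5, i, 3)).collect();
    assert_eq!(split, vec![2, 2, 1]);
    assert_eq!(split.iter().sum::<usize>(), 5);
}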
84 changes: 83 additions & 1 deletion nexus/reconfigurator/planning/src/planner.rs
@@ -361,6 +361,7 @@ impl<'a> Planner<'a> {
DiscretionaryOmicronZone::ClickhouseKeeper,
DiscretionaryOmicronZone::ClickhouseServer,
DiscretionaryOmicronZone::CockroachDb,
DiscretionaryOmicronZone::CruciblePantry,
DiscretionaryOmicronZone::InternalDns,
DiscretionaryOmicronZone::ExternalDns,
DiscretionaryOmicronZone::Nexus,
@@ -451,6 +452,9 @@ impl<'a> Planner<'a> {
DiscretionaryOmicronZone::CockroachDb => {
self.input.target_cockroachdb_zone_count()
}
DiscretionaryOmicronZone::CruciblePantry => {
self.input.target_crucible_pantry_zone_count()
}
DiscretionaryOmicronZone::InternalDns => {
self.input.target_internal_dns_zone_count()
}
@@ -562,6 +566,12 @@ impl<'a> Planner<'a> {
new_total_zone_count,
)?
}
DiscretionaryOmicronZone::CruciblePantry => {
self.blueprint.sled_ensure_zone_multiple_crucible_pantry(
sled_id,
new_total_zone_count,
)?
}
DiscretionaryOmicronZone::InternalDns => {
self.blueprint.sled_ensure_zone_multiple_internal_dns(
sled_id,
@@ -823,6 +833,7 @@ mod test {
use nexus_types::external_api::views::SledState;
use omicron_common::api::external::Generation;
use omicron_common::disk::DiskIdentity;
use omicron_common::policy::CRUCIBLE_PANTRY_REDUNDANCY;
use omicron_test_utils::dev::test_setup_log;
use omicron_uuid_kinds::PhysicalDiskUuid;
use omicron_uuid_kinds::SledUuid;
@@ -1965,8 +1976,9 @@
// * each of those 2 sleds should get exactly 3 new Nexuses
builder.policy_mut().target_nexus_zone_count = 9;

// Disable addition of internal DNS zones.
// Disable addition of zone types we're not checking for below.
builder.policy_mut().target_internal_dns_zone_count = 0;
builder.policy_mut().target_crucible_pantry_zone_count = 0;

let input = builder.build();
let mut blueprint2 = Planner::new_based_on(
@@ -2451,6 +2463,76 @@ mod test {
logctx.cleanup_successful();
}

#[test]
fn test_crucible_pantry() {
static TEST_NAME: &str = "test_crucible_pantry";
let logctx = test_setup_log(TEST_NAME);

// Use our example system as a starting point.
let (collection, input, blueprint1) = example(&logctx.log, TEST_NAME);

// We should start with CRUCIBLE_PANTRY_REDUNDANCY pantries spread out
// to at most 1 per sled. Find one of the sleds running one.
let pantry_sleds = blueprint1
.all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning)
.filter_map(|(sled_id, zone)| {
zone.zone_type.is_crucible_pantry().then_some(sled_id)
})
.collect::<Vec<_>>();
assert_eq!(
pantry_sleds.len(),
CRUCIBLE_PANTRY_REDUNDANCY,
"expected {CRUCIBLE_PANTRY_REDUNDANCY} pantries, but found {}",
pantry_sleds.len(),
);

// Expunge one of the pantry-hosting sleds and re-plan. The planner
// should immediately replace the zone with one on another
// (non-expunged) sled.
let expunged_sled_id = pantry_sleds[0];

let mut input_builder = input.into_builder();
input_builder
.sleds_mut()
.get_mut(&expunged_sled_id)
.expect("can't find sled")
.policy = SledPolicy::Expunged;
let input = input_builder.build();
let blueprint2 = Planner::new_based_on(
logctx.log.clone(),
&blueprint1,
&input,
"test_blueprint2",
&collection,
)
.expect("failed to create planner")
.with_rng_seed((TEST_NAME, "bp2"))
.plan()
.expect("failed to re-plan");

let diff = blueprint2.diff_since_blueprint(&blueprint1);
println!("1 -> 2 (expunged sled):\n{}", diff.display());
assert_eq!(
blueprint2
.all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning)
.filter(|(sled_id, zone)| *sled_id != expunged_sled_id
&& zone.zone_type.is_crucible_pantry())
.count(),
CRUCIBLE_PANTRY_REDUNDANCY,
"can't find replacement pantry zone"
);

// Test a no-op planning iteration.
assert_planning_makes_no_changes(
&logctx.log,
&blueprint2,
&input,
TEST_NAME,
);

logctx.cleanup_successful();
}

/// Check that the planner can replace a single-node ClickHouse zone.
/// This is completely distinct from (and much simpler than) the replicated
/// (multi-node) case.
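The planner change slots CruciblePantry into the ordered list of discretionary zone kinds, wires up its target count, and dispatches to the new builder method. A self-contained sketch of that top-up loop, with hypothetical counts (the real planner reads targets from the planning input and counts in-service zones per sled):

#[derive(Clone, Copy, Debug)]
enum Discretionary {
    CockroachDb,
    CruciblePantry,
    Nexus,
}

// Stand-in for `self.input.target_*_zone_count()`; numbers are illustrative.
fn target_count(kind: Discretionary) -> usize {
    match kind {
        Discretionary::CockroachDb => 5,
        Discretionary::CruciblePantry => 3,
        Discretionary::Nexus => 3,
    }
}

fn main() {
    // The planner walks each discretionary kind in a fixed order and tops
    // the fleet up to the target; pantry zones now participate.
    for kind in [
        Discretionary::CockroachDb,
        Discretionary::CruciblePantry,
        Discretionary::Nexus,
    ] {
        let current = 1; // stand-in for counting in-service zones of this kind
        let to_add = target_count(kind).saturating_sub(current);
        println!("{kind:?}: add {to_add}");
    }
}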
@@ -19,11 +19,11 @@ pub(crate) enum DiscretionaryOmicronZone {
ClickhouseKeeper,
ClickhouseServer,
CockroachDb,
CruciblePantry,
InternalDns,
ExternalDns,
Nexus,
Oximeter,
// TODO expand this enum as we start to place more services
}

impl DiscretionaryOmicronZone {
@@ -40,16 +40,15 @@ impl DiscretionaryOmicronZone {
Some(Self::ClickhouseServer)
}
BlueprintZoneType::CockroachDb(_) => Some(Self::CockroachDb),
BlueprintZoneType::CruciblePantry(_) => Some(Self::CruciblePantry),
BlueprintZoneType::InternalDns(_) => Some(Self::InternalDns),
BlueprintZoneType::ExternalDns(_) => Some(Self::ExternalDns),
BlueprintZoneType::Nexus(_) => Some(Self::Nexus),
BlueprintZoneType::Oximeter(_) => Some(Self::Oximeter),
// Zones that we should place but don't yet.
| BlueprintZoneType::CruciblePantry(_)
// Zones that get special handling for placement (all sleds get
// them, although internal NTP has some interactions with boundary
// NTP that are handled separately).
| BlueprintZoneType::Crucible(_)
BlueprintZoneType::Crucible(_)
| BlueprintZoneType::InternalNtp(_) => None,
}
}
@@ -67,6 +66,7 @@ impl From<DiscretionaryOmicronZone> for ZoneKind {
Self::ClickhouseServer
}
DiscretionaryOmicronZone::CockroachDb => Self::CockroachDb,
DiscretionaryOmicronZone::CruciblePantry => Self::CruciblePantry,
DiscretionaryOmicronZone::InternalDns => Self::InternalDns,
DiscretionaryOmicronZone::ExternalDns => Self::ExternalDns,
DiscretionaryOmicronZone::Nexus => Self::Nexus,
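With this change, from_zone_type maps Crucible pantry zones to a discretionary kind, while zone kinds placed on every sled still map to None. A stripped-down sketch of that classification shape, using toy enums rather than the real types:

#[derive(Debug)]
enum ZoneType {
    CruciblePantry,
    Crucible,
    InternalNtp,
}

#[derive(Debug)]
enum Discretionary {
    CruciblePantry,
}

// Mirrors the shape of `DiscretionaryOmicronZone::from_zone_type`:
// discretionary kinds map to Some(_); "every sled gets one" kinds map
// to None and are placed by other logic.
fn classify(z: &ZoneType) -> Option<Discretionary> {
    match z {
        ZoneType::CruciblePantry => Some(Discretionary::CruciblePantry),
        ZoneType::Crucible | ZoneType::InternalNtp => None,
    }
}

fn main() {
    assert!(classify(&ZoneType::CruciblePantry).is_some());
    assert!(classify(&ZoneType::Crucible).is_none());
}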