[nexus] Decommission disks in reconfigurator, clean their DB state (#6059)

This PR adds the following step to the reconfigurator's execution:
For all disks that are marked **expunged**, mark them
**decommissioned**. This notably happens after the `deploy_disks` step
of execution.
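
Concretely, the step boils down to a single datastore call. The sketch below is illustrative only: the driver function and the `deploy_disks` signature are hypothetical, while `physical_disk_decommission_all_expunged` is the datastore method added by this PR.

```rust
// Sketch of where the new step slots in (the driver and the `deploy_disks`
// signature are hypothetical; the datastore method is added in this PR).
async fn execute_disk_steps(
    opctx: &OpContext,
    datastore: &DataStore,
    blueprint: &Blueprint,
) -> anyhow::Result<()> {
    // Tell each sled which physical disks it should be managing.
    deploy_disks(opctx, blueprint).await?;

    // Every disk the blueprint marks expunged can now be flipped to
    // "decommissioned" in the database.
    datastore.physical_disk_decommission_all_expunged(opctx).await?;
    Ok(())
}
```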

This PR also adds a background task that looks for all disks that are
**decommissioned** but still have **zpools**. It deletes these zpools (and
their **datasets**), as long as no regions or region snapshots are still
referencing the contained datasets.
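
A rough sketch of one activation of that task is below. The datastore helpers named here are hypothetical stand-ins for the queries involved, but the counters mirror the fields in the task's status (`found`, `not_ready_to_be_deleted`, `deleted`, `error_count`) shown in the `omdb` output further down.

```rust
// Sketch only: the datastore helpers named here are hypothetical
// illustrations of the queries involved, not the exact API.
#[derive(Default)]
struct CleanerStatus {
    found: usize,
    not_ready_to_be_deleted: usize,
    deleted: usize,
    error_count: usize,
}

async fn activate(opctx: &OpContext, datastore: &DataStore) -> CleanerStatus {
    let mut status = CleanerStatus::default();

    // Zpools whose parent physical disk is decommissioned but whose own DB
    // records (and datasets) still exist. (hypothetical helper)
    let zpools = match datastore.zpools_on_decommissioned_disks(opctx).await {
        Ok(zpools) => zpools,
        Err(_) => {
            status.error_count += 1;
            return status;
        }
    };
    status.found = zpools.len();

    for zpool in zpools {
        // Leave zpools alone while any region or region snapshot still
        // references one of their datasets; retry on a later activation.
        // (hypothetical helper)
        let in_use = datastore
            .zpool_datasets_referenced(opctx, zpool.id())
            .await
            .unwrap_or(true);
        if in_use {
            status.not_ready_to_be_deleted += 1;
            continue;
        }

        // Soft-delete the zpool and its datasets. (hypothetical helper)
        match datastore.zpool_and_datasets_delete(opctx, zpool.id()).await {
            Ok(()) => status.deleted += 1,
            Err(_) => status.error_count += 1,
        }
    }
    status
}
```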

Fixes #6051
smklein authored Jul 22, 2024
1 parent e346fd1 commit 38575ab
Showing 21 changed files with 1,013 additions and 51 deletions.
15 changes: 15 additions & 0 deletions dev-tools/omdb/tests/env.out
@@ -47,6 +47,11 @@ task: "crdb_node_id_collector"
Collects node IDs of running CockroachDB zones


task: "decommissioned_disk_cleaner"
deletes DB records for decommissioned disks, after regions and region
snapshots have been replaced


task: "dns_config_external"
watches external DNS data stored in CockroachDB

@@ -187,6 +192,11 @@ task: "crdb_node_id_collector"
Collects node IDs of running CockroachDB zones


task: "decommissioned_disk_cleaner"
deletes DB records for decommissioned disks, after regions and region
snapshots have been replaced


task: "dns_config_external"
watches external DNS data stored in CockroachDB

@@ -314,6 +324,11 @@ task: "crdb_node_id_collector"
Collects node IDs of running CockroachDB zones


task: "decommissioned_disk_cleaner"
deletes DB records for decommissioned disks, after regions and region
snapshots have been replaced


task: "dns_config_external"
watches external DNS data stored in CockroachDB

12 changes: 12 additions & 0 deletions dev-tools/omdb/tests/successes.out
@@ -248,6 +253,11 @@ task: "crdb_node_id_collector"
Collects node IDs of running CockroachDB zones


task: "decommissioned_disk_cleaner"
deletes DB records for decommissioned disks, after regions and region
snapshots have been replaced


task: "dns_config_external"
watches external DNS data stored in CockroachDB

@@ -453,6 +458,13 @@ task: "crdb_node_id_collector"
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
last completion reported error: no blueprint

task: "decommissioned_disk_cleaner"
configured period: every 1m
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
warning: unknown background task: "decommissioned_disk_cleaner" (don't know how to interpret details: Object {"deleted": Number(0), "error": Null, "error_count": Number(0), "found": Number(0), "not_ready_to_be_deleted": Number(0)})

task: "external_endpoints"
configured period: every 1m
currently executing: no
23 changes: 23 additions & 0 deletions nexus-config/src/nexus_config.rs
@@ -361,6 +361,8 @@ pub struct BackgroundTaskConfig {
pub inventory: InventoryConfig,
/// configuration for physical disk adoption tasks
pub physical_disk_adoption: PhysicalDiskAdoptionConfig,
/// configuration for decommissioned disk cleaner task
pub decommissioned_disk_cleaner: DecommissionedDiskCleanerConfig,
/// configuration for phantom disks task
pub phantom_disks: PhantomDiskConfig,
/// configuration for blueprint related tasks
@@ -444,6 +446,20 @@ pub struct PhysicalDiskAdoptionConfig {
pub disable: bool,
}

#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct DecommissionedDiskCleanerConfig {
/// period (in seconds) for periodic activations of this background task
#[serde_as(as = "DurationSeconds<u64>")]
pub period_secs: Duration,

/// A toggle to disable automated disk cleanup
///
/// Default: Off
#[serde(default)]
pub disable: bool,
}

#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct NatCleanupConfig {
@@ -822,6 +838,7 @@ mod test {
inventory.nkeep = 11
inventory.disable = false
physical_disk_adoption.period_secs = 30
decommissioned_disk_cleaner.period_secs = 30
phantom_disks.period_secs = 30
blueprints.period_secs_load = 10
blueprints.period_secs_execute = 60
@@ -947,6 +964,11 @@ mod test {
period_secs: Duration::from_secs(30),
disable: false,
},
decommissioned_disk_cleaner:
DecommissionedDiskCleanerConfig {
period_secs: Duration::from_secs(30),
disable: false,
},
phantom_disks: PhantomDiskConfig {
period_secs: Duration::from_secs(30),
},
@@ -1049,6 +1071,7 @@ mod test {
inventory.nkeep = 3
inventory.disable = false
physical_disk_adoption.period_secs = 30
decommissioned_disk_cleaner.period_secs = 30
phantom_disks.period_secs = 30
blueprints.period_secs_load = 10
blueprints.period_secs_execute = 60
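
The new `decommissioned_disk_cleaner` config section above only requires a period; `disable` falls back to `false` through `#[serde(default)]`. A small test-style sketch, assuming the `toml` crate, `std::time::Duration`, and the struct from the diff are in scope (in the same spirit as the existing config tests):

```rust
#[test]
fn decommissioned_disk_cleaner_defaults() {
    // Only `period_secs` is required in the TOML; `disable` defaults to
    // false, so the cleaner runs unless explicitly turned off.
    let cfg: DecommissionedDiskCleanerConfig =
        toml::from_str("period_secs = 60").unwrap();
    assert_eq!(cfg.period_secs, Duration::from_secs(60));
    assert!(!cfg.disable);
}
```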
74 changes: 34 additions & 40 deletions nexus/db-queries/src/db/datastore/physical_disk.rs
@@ -37,6 +37,7 @@ use omicron_common::api::external::LookupType;
use omicron_common::api::external::ResourceType;
use omicron_uuid_kinds::CollectionUuid;
use omicron_uuid_kinds::GenericUuid;
use omicron_uuid_kinds::PhysicalDiskUuid;
use uuid::Uuid;

impl DataStore {
@@ -278,23 +279,36 @@ impl DataStore {
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}

/// Decommissions all expunged disks.
pub async fn physical_disk_decommission_all_expunged(
&self,
opctx: &OpContext,
) -> Result<(), Error> {
opctx.authorize(authz::Action::Modify, &authz::FLEET).await?;
use db::schema::physical_disk::dsl;

let conn = &*self.pool_connection_authorized(&opctx).await?;
diesel::update(dsl::physical_disk)
.filter(dsl::time_deleted.is_null())
.physical_disk_filter(DiskFilter::ExpungedButActive)
.set(dsl::disk_state.eq(PhysicalDiskState::Decommissioned))
.execute_async(conn)
.await
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
Ok(())
}

/// Deletes a disk from the database.
pub async fn physical_disk_delete(
&self,
opctx: &OpContext,
vendor: String,
serial: String,
model: String,
sled_id: Uuid,
id: PhysicalDiskUuid,
) -> DeleteResult {
opctx.authorize(authz::Action::Read, &authz::FLEET).await?;
let now = Utc::now();
use db::schema::physical_disk::dsl;
diesel::update(dsl::physical_disk)
.filter(dsl::vendor.eq(vendor))
.filter(dsl::serial.eq(serial))
.filter(dsl::model.eq(model))
.filter(dsl::sled_id.eq(sled_id))
.filter(dsl::id.eq(id.into_untyped_uuid()))
.filter(dsl::time_deleted.is_null())
.set(dsl::time_deleted.eq(now))
.execute_async(&*self.pool_connection_authorized(opctx).await?)
@@ -451,8 +465,9 @@ mod test {
let sled = create_test_sled(&datastore).await;

// Insert a disk
let disk_id = PhysicalDiskUuid::new_v4();
let disk = PhysicalDisk::new(
Uuid::new_v4(),
disk_id.into_untyped_uuid(),
String::from("Oxide"),
String::from("123"),
String::from("FakeDisk"),
@@ -472,13 +487,7 @@

// Delete the inserted disk
datastore
.physical_disk_delete(
&opctx,
disk.vendor.clone(),
disk.serial.clone(),
disk.model.clone(),
disk.sled_id,
)
.physical_disk_delete(&opctx, disk_id)
.await
.expect("Failed to delete disk");
let disks = datastore
@@ -489,13 +498,7 @@

// Deleting again should not throw an error
datastore
.physical_disk_delete(
&opctx,
disk.vendor,
disk.serial,
disk.model,
disk.sled_id,
)
.physical_disk_delete(&opctx, disk_id)
.await
.expect("Failed to delete disk");

@@ -520,8 +523,9 @@
let sled_b = create_test_sled(&datastore).await;

// Insert a disk
let disk_id = PhysicalDiskUuid::new_v4();
let disk = PhysicalDisk::new(
Uuid::new_v4(),
disk_id.into_untyped_uuid(),
String::from("Oxide"),
String::from("123"),
String::from("FakeDisk"),
@@ -546,13 +550,7 @@

// Delete the inserted disk
datastore
.physical_disk_delete(
&opctx,
disk.vendor,
disk.serial,
disk.model,
disk.sled_id,
)
.physical_disk_delete(&opctx, disk_id)
.await
.expect("Failed to delete disk");
let disks = datastore
@@ -567,8 +565,9 @@
assert!(disks.is_empty());

// Attach the disk to the second sled
let disk_id = PhysicalDiskUuid::new_v4();
let disk = PhysicalDisk::new(
Uuid::new_v4(),
disk_id.into_untyped_uuid(),
String::from("Oxide"),
String::from("123"),
String::from("FakeDisk"),
@@ -613,8 +612,9 @@ mod test {
let sled_b = create_test_sled(&datastore).await;

// Insert a disk
let disk_id = PhysicalDiskUuid::new_v4();
let disk = PhysicalDisk::new(
Uuid::new_v4(),
disk_id.into_untyped_uuid(),
String::from("Oxide"),
String::from("123"),
String::from("FakeDisk"),
@@ -639,13 +639,7 @@

// Remove the disk from the first sled
datastore
.physical_disk_delete(
&opctx,
disk.vendor.clone(),
disk.serial.clone(),
disk.model.clone(),
disk.sled_id,
)
.physical_disk_delete(&opctx, disk_id)
.await
.expect("Failed to delete disk");
