Skip to content

Commit caa1d50

Browse files
authored
blueprint_planner background task (#8287)
Fixes #8244. Adds a new `blueprint_planner` background task that periodically invokes the planner. If the resulting blueprint is different from the current target, it is saved and becomes the new target. Also fixes #8221 by adding a mock boundary-NTP zone to the `nexus-test-utils` context.
1 parent 5465ccf commit caa1d50

File tree

17 files changed

+607
-44
lines changed

17 files changed

+607
-44
lines changed

dev-tools/omdb/src/bin/omdb/nexus.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ use nexus_types::deployment::ClickhousePolicy;
4646
use nexus_types::deployment::OximeterReadMode;
4747
use nexus_types::deployment::OximeterReadPolicy;
4848
use nexus_types::internal_api::background::AbandonedVmmReaperStatus;
49+
use nexus_types::internal_api::background::BlueprintPlannerStatus;
4950
use nexus_types::internal_api::background::BlueprintRendezvousStatus;
5051
use nexus_types::internal_api::background::InstanceReincarnationStatus;
5152
use nexus_types::internal_api::background::InstanceUpdaterStatus;
@@ -1061,6 +1062,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
10611062
"abandoned_vmm_reaper" => {
10621063
print_task_abandoned_vmm_reaper(details);
10631064
}
1065+
"blueprint_planner" => {
1066+
print_task_blueprint_planner(details);
1067+
}
10641068
"blueprint_executor" => {
10651069
print_task_blueprint_executor(details);
10661070
}
@@ -1209,6 +1213,44 @@ fn print_task_abandoned_vmm_reaper(details: &serde_json::Value) {
12091213
};
12101214
}
12111215

1216+
fn print_task_blueprint_planner(details: &serde_json::Value) {
1217+
let status =
1218+
match serde_json::from_value::<BlueprintPlannerStatus>(details.clone())
1219+
{
1220+
Ok(status) => status,
1221+
Err(error) => {
1222+
eprintln!(
1223+
"warning: failed to interpret task details: {:?}: {:?}",
1224+
error, details
1225+
);
1226+
return;
1227+
}
1228+
};
1229+
match status {
1230+
BlueprintPlannerStatus::Disabled => {
1231+
println!(" blueprint planning explicitly disabled by config!");
1232+
}
1233+
BlueprintPlannerStatus::Error(error) => {
1234+
println!(" task did not complete successfully: {error}");
1235+
}
1236+
BlueprintPlannerStatus::Unchanged { parent_blueprint_id } => {
1237+
println!(" plan unchanged from parent {parent_blueprint_id}");
1238+
}
1239+
BlueprintPlannerStatus::Planned { parent_blueprint_id, error } => {
1240+
println!(
1241+
" planned new blueprint from parent {parent_blueprint_id}, \
1242+
but could not make it the target: {error}"
1243+
);
1244+
}
1245+
BlueprintPlannerStatus::Targeted { blueprint_id, .. } => {
1246+
println!(
1247+
" planned new blueprint {blueprint_id}, \
1248+
and made it the current target"
1249+
);
1250+
}
1251+
}
1252+
}
1253+
12121254
fn print_task_blueprint_executor(details: &serde_json::Value) {
12131255
let mut value = details.clone();
12141256
// Extract and remove the event report. (If we don't do this, the

dev-tools/omdb/tests/env.out

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@ task: "blueprint_loader"
4747
Loads the current target blueprint from the DB
4848

4949

50+
task: "blueprint_planner"
51+
Updates the target blueprint
52+
53+
5054
task: "blueprint_rendezvous"
5155
reconciles blueprints and inventory collection, updating Reconfigurator-
5256
owned rendezvous tables that other subsystems consume
@@ -243,6 +247,10 @@ task: "blueprint_loader"
243247
Loads the current target blueprint from the DB
244248

245249

250+
task: "blueprint_planner"
251+
Updates the target blueprint
252+
253+
246254
task: "blueprint_rendezvous"
247255
reconciles blueprints and inventory collection, updating Reconfigurator-
248256
owned rendezvous tables that other subsystems consume
@@ -426,6 +434,10 @@ task: "blueprint_loader"
426434
Loads the current target blueprint from the DB
427435

428436

437+
task: "blueprint_planner"
438+
Updates the target blueprint
439+
440+
429441
task: "blueprint_rendezvous"
430442
reconciles blueprints and inventory collection, updating Reconfigurator-
431443
owned rendezvous tables that other subsystems consume

dev-tools/omdb/tests/successes.out

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,10 @@ task: "blueprint_loader"
259259
Loads the current target blueprint from the DB
260260

261261

262+
task: "blueprint_planner"
263+
Updates the target blueprint
264+
265+
262266
task: "blueprint_rendezvous"
263267
reconciles blueprints and inventory collection, updating Reconfigurator-
264268
owned rendezvous tables that other subsystems consume
@@ -521,6 +525,13 @@ task: "bfd_manager"
521525
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
522526
last completion reported error: failed to resolve addresses for Dendrite services: no record found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN }
523527

528+
task: "blueprint_planner"
529+
configured period: every <REDACTED_DURATION>m
530+
currently executing: no
531+
last completed activation: <REDACTED ITERATIONS>, triggered by a dependent task completing
532+
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
533+
blueprint planning explicitly disabled by config!
534+
524535
task: "blueprint_rendezvous"
525536
configured period: every <REDACTED_DURATION>m
526537
currently executing: no
@@ -1040,6 +1051,13 @@ task: "bfd_manager"
10401051
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
10411052
last completion reported error: failed to resolve addresses for Dendrite services: no record found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN }
10421053

1054+
task: "blueprint_planner"
1055+
configured period: every <REDACTED_DURATION>m
1056+
currently executing: no
1057+
last completed activation: <REDACTED ITERATIONS>, triggered by a dependent task completing
1058+
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
1059+
blueprint planning explicitly disabled by config!
1060+
10431061
task: "blueprint_rendezvous"
10441062
configured period: every <REDACTED_DURATION>m
10451063
currently executing: no
@@ -1450,12 +1468,14 @@ parent: <none>
14501468
oxp_..........<REDACTED_UUID>.........../crypt/zone/oxz_external_dns_..........<REDACTED_UUID>........... ..........<REDACTED_UUID>........... in service none none off
14511469
oxp_..........<REDACTED_UUID>.........../crypt/zone/oxz_internal_dns_..........<REDACTED_UUID>........... ..........<REDACTED_UUID>........... in service none none off
14521470
oxp_..........<REDACTED_UUID>.........../crypt/zone/oxz_nexus_..........<REDACTED_UUID>........... ..........<REDACTED_UUID>........... in service none none off
1471+
oxp_..........<REDACTED_UUID>.........../crypt/zone/oxz_ntp_..........<REDACTED_UUID>........... ..........<REDACTED_UUID>........... in service none none off
14531472

14541473

14551474
omicron zones:
14561475
---------------------------------------------------------------------------------------------------------
14571476
zone type zone id image source disposition underlay IP
14581477
---------------------------------------------------------------------------------------------------------
1478+
boundary_ntp ..........<REDACTED_UUID>........... install dataset in service ::1
14591479
clickhouse ..........<REDACTED_UUID>........... install dataset in service ::1
14601480
cockroach_db ..........<REDACTED_UUID>........... install dataset in service ::1
14611481
crucible_pantry ..........<REDACTED_UUID>........... install dataset in service ::1
@@ -1552,12 +1572,14 @@ parent: <none>
15521572
oxp_..........<REDACTED_UUID>.........../crypt/zone/oxz_external_dns_..........<REDACTED_UUID>........... ..........<REDACTED_UUID>........... in service none none off
15531573
oxp_..........<REDACTED_UUID>.........../crypt/zone/oxz_internal_dns_..........<REDACTED_UUID>........... ..........<REDACTED_UUID>........... in service none none off
15541574
oxp_..........<REDACTED_UUID>.........../crypt/zone/oxz_nexus_..........<REDACTED_UUID>........... ..........<REDACTED_UUID>........... in service none none off
1575+
oxp_..........<REDACTED_UUID>.........../crypt/zone/oxz_ntp_..........<REDACTED_UUID>........... ..........<REDACTED_UUID>........... in service none none off
15551576

15561577

15571578
omicron zones:
15581579
---------------------------------------------------------------------------------------------------------
15591580
zone type zone id image source disposition underlay IP
15601581
---------------------------------------------------------------------------------------------------------
1582+
boundary_ntp ..........<REDACTED_UUID>........... install dataset in service ::1
15611583
clickhouse ..........<REDACTED_UUID>........... install dataset in service ::1
15621584
cockroach_db ..........<REDACTED_UUID>........... install dataset in service ::1
15631585
crucible_pantry ..........<REDACTED_UUID>........... install dataset in service ::1

docs/reconfigurator.adoc

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -124,18 +124,16 @@ The Planner
124124
+----------+ | +----------/
125125
| | |
126126
v v v
127-
128-
"planner"
129-
(eventually a background task)
130-
|
131-
v no
132-
is a new blueprint necessary? ------> done
127+
planner background task
133128
|
134-
| yes
135129
v
136130
generate a new blueprint
137131
|
138132
|
133+
v no
134+
is the new blueprint different from the current target? ------> done
135+
|
136+
| yes
139137
v
140138
commit blueprint to database
141139
|

nexus-config/src/nexus_config.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -592,19 +592,27 @@ pub struct PhantomDiskConfig {
592592
#[serde_as]
593593
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
594594
pub struct BlueprintTasksConfig {
595+
/// background planner chicken switch
596+
pub disable_planner: bool,
597+
595598
/// period (in seconds) for periodic activations of the background task that
596599
/// reads the latest target blueprint from the database
597600
#[serde_as(as = "DurationSeconds<u64>")]
598601
pub period_secs_load: Duration,
599602

603+
/// period (in seconds) for periodic activations of the background task that
604+
/// plans and updates the target blueprint
605+
#[serde_as(as = "DurationSeconds<u64>")]
606+
pub period_secs_plan: Duration,
607+
600608
/// period (in seconds) for periodic activations of the background task that
601609
/// executes the latest target blueprint
602610
#[serde_as(as = "DurationSeconds<u64>")]
603611
pub period_secs_execute: Duration,
604612

605613
/// period (in seconds) for periodic activations of the background task that
606614
/// reconciles the latest blueprint and latest inventory collection into
607-
/// Rencofigurator rendezvous tables
615+
/// Reconfigurator rendezvous tables
608616
#[serde_as(as = "DurationSeconds<u64>")]
609617
pub period_secs_rendezvous: Duration,
610618

@@ -1055,7 +1063,9 @@ mod test {
10551063
physical_disk_adoption.period_secs = 30
10561064
decommissioned_disk_cleaner.period_secs = 30
10571065
phantom_disks.period_secs = 30
1066+
blueprints.disable_planner = true
10581067
blueprints.period_secs_load = 10
1068+
blueprints.period_secs_plan = 60
10591069
blueprints.period_secs_execute = 60
10601070
blueprints.period_secs_rendezvous = 300
10611071
blueprints.period_secs_collect_crdb_node_ids = 180
@@ -1220,7 +1230,9 @@ mod test {
12201230
period_secs: Duration::from_secs(30),
12211231
},
12221232
blueprints: BlueprintTasksConfig {
1233+
disable_planner: true,
12231234
period_secs_load: Duration::from_secs(10),
1235+
period_secs_plan: Duration::from_secs(60),
12241236
period_secs_execute: Duration::from_secs(60),
12251237
period_secs_collect_crdb_node_ids:
12261238
Duration::from_secs(180),
@@ -1364,7 +1376,9 @@ mod test {
13641376
physical_disk_adoption.period_secs = 30
13651377
decommissioned_disk_cleaner.period_secs = 30
13661378
phantom_disks.period_secs = 30
1379+
blueprints.disable_planner = true
13671380
blueprints.period_secs_load = 10
1381+
blueprints.period_secs_plan = 60
13681382
blueprints.period_secs_execute = 60
13691383
blueprints.period_secs_rendezvous = 300
13701384
blueprints.period_secs_collect_crdb_node_ids = 180

nexus/background-task-interface/src/init.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ pub struct BackgroundTasks {
2222
pub task_decommissioned_disk_cleaner: Activator,
2323
pub task_phantom_disks: Activator,
2424
pub task_blueprint_loader: Activator,
25+
pub task_blueprint_planner: Activator,
2526
pub task_blueprint_executor: Activator,
2627
pub task_blueprint_rendezvous: Activator,
2728
pub task_crdb_node_id_collector: Activator,

nexus/examples/config-second.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,9 @@ phantom_disks.period_secs = 30
118118
physical_disk_adoption.period_secs = 30
119119
support_bundle_collector.period_secs = 30
120120
decommissioned_disk_cleaner.period_secs = 60
121+
blueprints.disable_planner = true
121122
blueprints.period_secs_load = 10
123+
blueprints.period_secs_plan = 60
122124
blueprints.period_secs_execute = 60
123125
blueprints.period_secs_rendezvous = 300
124126
blueprints.period_secs_collect_crdb_node_ids = 180

nexus/examples/config.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,9 @@ phantom_disks.period_secs = 30
104104
physical_disk_adoption.period_secs = 30
105105
support_bundle_collector.period_secs = 30
106106
decommissioned_disk_cleaner.period_secs = 60
107+
blueprints.disable_planner = true
107108
blueprints.period_secs_load = 10
109+
blueprints.period_secs_plan = 60
108110
blueprints.period_secs_execute = 60
109111
blueprints.period_secs_rendezvous = 300
110112
blueprints.period_secs_collect_crdb_node_ids = 180

0 commit comments

Comments
 (0)