Skip to content

Commit e25179a

Browse files
authored
Add mapping of cubby to sled ID to support bundles (#9187)
Currently, support bundles contain the sled UUID in file paths and the sled serial number in `sled.txt`. However, when accessing a sled via the tech port, we generally refer to it by its cubby. To make identification of sleds simpler, add a new `sled_info.json` file to the bundle with a JSON-encoded mapping of sled serial number to cubby and UUID.
1 parent 98de948 commit e25179a

File tree

3 files changed

+133
-24
lines changed

3 files changed

+133
-24
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

nexus/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ dropshot.workspace = true
3737
fatfs.workspace = true
3838
futures.workspace = true
3939
gateway-client.workspace = true
40+
gateway-types.workspace = true
4041
headers.workspace = true
4142
hex.workspace = true
4243
hickory-resolver.workspace = true

nexus/src/app/background/tasks/support_bundle_collector.rs

Lines changed: 131 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,11 @@ use futures::stream::FuturesUnordered;
2020
use gateway_client::Client as MgsClient;
2121
use gateway_client::types::SpIdentifier;
2222
use gateway_client::types::SpIgnition;
23+
use gateway_types::component::SpType;
2324
use internal_dns_resolver::Resolver;
2425
use internal_dns_types::names::ServiceName;
2526
use nexus_db_model::Ereport;
27+
use nexus_db_model::Sled;
2628
use nexus_db_model::SupportBundle;
2729
use nexus_db_model::SupportBundleState;
2830
use nexus_db_queries::authz;
@@ -47,9 +49,11 @@ use omicron_uuid_kinds::SledUuid;
4749
use omicron_uuid_kinds::SupportBundleUuid;
4850
use omicron_uuid_kinds::ZpoolUuid;
4951
use parallel_task_set::ParallelTaskSet;
52+
use serde::Serialize;
5053
use serde_json::json;
5154
use sha2::{Digest, Sha256};
5255
use slog_error_chain::InlineErrorChain;
56+
use std::collections::BTreeMap;
5357
use std::future::Future;
5458
use std::io::Write;
5559
use std::num::NonZeroU64;
@@ -61,6 +65,7 @@ use tokio::io::AsyncWriteExt;
6165
use tokio::io::SeekFrom;
6266
use tokio_util::task::AbortOnDropHandle;
6367
use tufaceous_artifact::ArtifactHash;
68+
use uuid::Uuid;
6469
use zip::ZipArchive;
6570
use zip::ZipWriter;
6671
use zip::write::FullFileOptions;
@@ -707,23 +712,44 @@ impl BundleCollection {
707712
None
708713
};
709714

710-
let sp_dumps_dir = dir.path().join("sp_task_dumps");
711-
tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context(|| {
712-
format!("failed to create SP task dump directory {sp_dumps_dir}")
713-
})?;
714-
if let Err(e) =
715-
save_all_sp_dumps(log, &self.resolver, &sp_dumps_dir).await
716-
{
717-
error!(log, "failed to capture SP task dumps"; "error" => InlineErrorChain::new(e.as_ref()));
718-
} else {
719-
report.listed_sps = true;
720-
};
721-
722-
if let Ok(all_sleds) = self
715+
let all_sleds = self
723716
.datastore
724717
.sled_list_all_batched(&self.opctx, SledFilter::InService)
718+
.await;
719+
720+
if let Ok(mgs_client) = self.create_mgs_client().await {
721+
if let Err(e) = write_sled_info(
722+
&self.log,
723+
&mgs_client,
724+
all_sleds.as_deref().ok(),
725+
dir.path(),
726+
)
725727
.await
726-
{
728+
{
729+
error!(log, "Failed to write sled_info.json"; "error" => InlineErrorChain::new(e.as_ref()));
730+
}
731+
732+
let sp_dumps_dir = dir.path().join("sp_task_dumps");
733+
tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context(
734+
|| {
735+
format!(
736+
"Failed to create SP task dump directory {sp_dumps_dir}"
737+
)
738+
},
739+
)?;
740+
741+
if let Err(e) =
742+
save_all_sp_dumps(log, &mgs_client, &sp_dumps_dir).await
743+
{
744+
error!(log, "Failed to capture SP task dumps"; "error" => InlineErrorChain::new(e.as_ref()));
745+
} else {
746+
report.listed_sps = true;
747+
};
748+
} else {
749+
warn!(log, "No MGS client, skipping SP task dump collection");
750+
}
751+
752+
if let Ok(all_sleds) = all_sleds {
727753
report.listed_in_service_sleds = true;
728754

729755
const MAX_CONCURRENT_SLED_REQUESTS: usize = 16;
@@ -1031,6 +1057,20 @@ impl BundleCollection {
10311057
);
10321058
Ok(())
10331059
}
1060+
1061+
async fn create_mgs_client(&self) -> anyhow::Result<MgsClient> {
1062+
self
1063+
.resolver
1064+
.lookup_socket_v6(ServiceName::ManagementGatewayService)
1065+
.await
1066+
.map(|sockaddr| {
1067+
let url = format!("http://{}", sockaddr);
1068+
gateway_client::Client::new(&url, self.log.clone())
1069+
}).map_err(|e| {
1070+
error!(self.log, "failed to resolve MGS address"; "error" => InlineErrorChain::new(&e));
1071+
e.into()
1072+
})
1073+
}
10341074
}
10351075

10361076
impl BackgroundTask for SupportBundleCollector {
@@ -1316,18 +1356,9 @@ where
13161356
/// Collect task dumps from all SPs via MGS and save them to a directory.
13171357
async fn save_all_sp_dumps(
13181358
log: &slog::Logger,
1319-
resolver: &Resolver,
1359+
mgs_client: &MgsClient,
13201360
sp_dumps_dir: &Utf8Path,
13211361
) -> anyhow::Result<()> {
1322-
let mgs_client = resolver
1323-
.lookup_socket_v6(ServiceName::ManagementGatewayService)
1324-
.await
1325-
.map(|sockaddr| {
1326-
let url = format!("http://{}", sockaddr);
1327-
gateway_client::Client::new(&url, log.clone())
1328-
})
1329-
.context("failed to resolve address of MGS")?;
1330-
13311362
let available_sps = get_available_sps(&mgs_client).await?;
13321363

13331364
let mut tasks = ParallelTaskSet::new();
@@ -1412,6 +1443,82 @@ async fn save_sp_dumps(
14121443
Ok(())
14131444
}
14141445

1446+
/// Write a file with a JSON mapping of sled serial numbers to cubby and UUIDs for easier
1447+
/// identification of sleds present in a bundle.
1448+
async fn write_sled_info(
1449+
log: &slog::Logger,
1450+
mgs_client: &MgsClient,
1451+
nexus_sleds: Option<&[Sled]>,
1452+
dir: &Utf8Path,
1453+
) -> anyhow::Result<()> {
1454+
#[derive(Serialize)]
1455+
struct SledInfo {
1456+
cubby: Option<u16>,
1457+
uuid: Option<Uuid>,
1458+
}
1459+
1460+
let available_sps = get_available_sps(&mgs_client)
1461+
.await
1462+
.context("failed to get available SPs")?;
1463+
1464+
// We can still get a useful mapping of cubby to serial using just the data from MGS.
1465+
let mut nexus_map: BTreeMap<_, _> = nexus_sleds
1466+
.unwrap_or_default()
1467+
.into_iter()
1468+
.map(|sled| (sled.serial_number(), sled))
1469+
.collect();
1470+
1471+
let mut sled_info = BTreeMap::new();
1472+
for sp in
1473+
available_sps.into_iter().filter(|sp| matches!(sp.type_, SpType::Sled))
1474+
{
1475+
let sp_state = match mgs_client.sp_get(&sp.type_, sp.slot).await {
1476+
Ok(s) => s.into_inner(),
1477+
Err(e) => {
1478+
error!(log,
1479+
"Failed to get SP state for sled_info.json";
1480+
"cubby" => sp.slot,
1481+
"component" => %sp.type_,
1482+
"error" => InlineErrorChain::new(&e)
1483+
);
1484+
continue;
1485+
}
1486+
};
1487+
1488+
if let Some(sled) = nexus_map.remove(sp_state.serial_number.as_str()) {
1489+
sled_info.insert(
1490+
sp_state.serial_number.to_string(),
1491+
SledInfo {
1492+
cubby: Some(sp.slot),
1493+
uuid: Some(*sled.identity.id.as_untyped_uuid()),
1494+
},
1495+
);
1496+
} else {
1497+
sled_info.insert(
1498+
sp_state.serial_number.to_string(),
1499+
SledInfo { cubby: Some(sp.slot), uuid: None },
1500+
);
1501+
}
1502+
}
1503+
1504+
// Sleds not returned by MGS.
1505+
for (serial, sled) in nexus_map {
1506+
sled_info.insert(
1507+
serial.to_string(),
1508+
SledInfo {
1509+
cubby: None,
1510+
uuid: Some(*sled.identity.id.as_untyped_uuid()),
1511+
},
1512+
);
1513+
}
1514+
1515+
let json = serde_json::to_string_pretty(&sled_info)
1516+
.context("failed to serialize sled info to JSON")?;
1517+
tokio::fs::write(dir.join("sled_info.json"), json).await?;
1518+
1519+
Ok(())
1520+
}
1521+
14151522
fn is_fs_safe_single_path_component(s: &str) -> bool {
14161523
// Might be path traversal...
14171524
if s == "." || s == ".." {

0 commit comments

Comments
 (0)