Skip to content

Commit 1c27e88

Browse files
authored
Fix race between sled-agent and zone-setup service (#6152)
- Fixes #6149 - Most zones run the `zone-network-setup` service once, at startup, with their underlay addresses already provided by the sled-agent. That's not true for the switch zone, which starts with only a localhost address, and then is provided an underlay address by the sled-agent only after the bootstrapping process has proceeded further. However, the zone-setup-service previously deleted its IP interface prior to setting the underlay address on it, apparently as a workaround for oxidecomputer/stlouis#435. That's fine for other zones, but that races with the sled-agent setting that underlay address later in the switch zone. It's possible for the zone-setup-service to delete the interface _after_ those addresses are set, which obviously prevents the rest of the control plane from deploying correctly. This fixes the issue by simply removing that call to `ipadm delete-if` in the zone-setup-service. The mentioned issue has been resolved, and the workaround is no longer needed. - Make the `zone-network-setup` service depend on the network milestone, instead of multi-user. This just moves it a bit earlier in the dependency graph, though it should not be strictly necessary. We might want to move the sled-agent's notion of "zone readiness" to depend on `multi-user` instead of `single-user` in the future, so this could help with that. - Extract out a few constants, some whitespace cleanup
1 parent 836d3a2 commit 1c27e88

File tree

6 files changed

+85
-49
lines changed

6 files changed

+85
-49
lines changed

illumos-utils/src/addrobj.rs

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,17 @@
55
//! API for operating on addrobj objects.
66
77
/// The name provided to all link-local IPv6 addresses.
8-
pub const IPV6_LINK_LOCAL_NAME: &str = "ll";
8+
pub const IPV6_LINK_LOCAL_ADDROBJ_NAME: &str = "ll";
9+
10+
/// The name provided to all static IPv6 underlay addresses.
11+
pub const IPV6_STATIC_ADDROBJ_NAME: &str = "omicron6";
12+
13+
/// The name provided to all static IPv4 addresses, usually for public OPTE
14+
/// interfaces.
15+
pub const IPV4_STATIC_ADDROBJ_NAME: &str = "omicron4";
16+
17+
/// The name provided to DHCP-configured addresses, of either family.
18+
pub const DHCP_ADDROBJ_NAME: &str = "omicron";
919

1020
/// Describes an "addrobj", which is the combination of an interface
1121
/// with an associated name.
@@ -59,7 +69,7 @@ impl AddrObject {
5969
/// Create a new addrobj on the same interface with the IPv6 link-local
6070
/// name.
6171
pub fn link_local_on_same_interface(&self) -> Result<Self, ParseError> {
62-
self.on_same_interface(IPV6_LINK_LOCAL_NAME)
72+
self.on_same_interface(IPV6_LINK_LOCAL_ADDROBJ_NAME)
6373
}
6474

6575
pub fn new(interface: &str, name: &str) -> Result<Self, ParseError> {
@@ -76,7 +86,7 @@ impl AddrObject {
7686

7787
/// A link-local IPv6 addrobj over the provided interface.
7888
pub fn link_local(interface: &str) -> Result<Self, ParseError> {
79-
Self::new(interface, IPV6_LINK_LOCAL_NAME)
89+
Self::new(interface, IPV6_LINK_LOCAL_ADDROBJ_NAME)
8090
}
8191

8292
pub fn interface(&self) -> &str {

illumos-utils/src/ipadm.rs

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,30 +4,37 @@
44

55
//! Utilities for managing IP interfaces.
66
7+
use crate::addrobj::{IPV6_LINK_LOCAL_ADDROBJ_NAME, IPV6_STATIC_ADDROBJ_NAME};
78
use crate::zone::IPADM;
89
use crate::{execute, ExecutionError, PFEXEC};
910
use std::net::Ipv6Addr;
1011

1112
/// Wraps commands for interacting with interfaces.
1213
pub struct Ipadm {}
1314

15+
/// Expected error message contents when showing an addrobj that doesn't exist.
16+
const ADDROBJ_NOT_FOUND_ERR: &str = "Address object not found";
17+
18+
/// Expected error message when an interface already exists.
19+
const INTERFACE_ALREADY_EXISTS: &str = "Interface already exists";
20+
1421
#[cfg_attr(any(test, feature = "testing"), mockall::automock)]
1522
impl Ipadm {
16-
// Remove current IP interface and create a new temporary one.
17-
pub fn set_temp_interface_for_datalink(
23+
/// Ensure that an IP interface exists on the provided datalink.
24+
pub fn ensure_ip_interface_exists(
1825
datalink: &str,
1926
) -> Result<(), ExecutionError> {
20-
let mut cmd = std::process::Command::new(PFEXEC);
21-
let cmd = cmd.args(&[IPADM, "delete-if", datalink]);
22-
// First we remove IP interface if it already exists. If it doesn't
23-
// exist and the command returns an error we continue anyway as
24-
// the next step is to create it.
25-
let _ = execute(cmd);
26-
2727
let mut cmd = std::process::Command::new(PFEXEC);
2828
let cmd = cmd.args(&[IPADM, "create-if", "-t", datalink]);
29-
execute(cmd)?;
30-
Ok(())
29+
match execute(cmd) {
30+
Ok(_) => Ok(()),
31+
Err(ExecutionError::CommandFailure(info))
32+
if info.stderr.contains(INTERFACE_ALREADY_EXISTS) =>
33+
{
34+
Ok(())
35+
}
36+
Err(e) => Err(e),
37+
}
3138
}
3239

3340
// Set MTU to 9000 on both IPv4 and IPv6
@@ -65,11 +72,13 @@ impl Ipadm {
6572
listen_addr: &Ipv6Addr,
6673
) -> Result<(), ExecutionError> {
6774
// Create auto-configured address on the IP interface if it doesn't already exist
68-
let addrobj = format!("{}/ll", datalink);
75+
let addrobj = format!("{}/{}", datalink, IPV6_LINK_LOCAL_ADDROBJ_NAME);
6976
let mut cmd = std::process::Command::new(PFEXEC);
7077
let cmd = cmd.args(&[IPADM, "show-addr", &addrobj]);
7178
match execute(cmd) {
72-
Err(_) => {
79+
Err(ExecutionError::CommandFailure(info))
80+
if info.stderr.contains(ADDROBJ_NOT_FOUND_ERR) =>
81+
{
7382
let mut cmd = std::process::Command::new(PFEXEC);
7483
let cmd = cmd.args(&[
7584
IPADM,
@@ -81,15 +90,18 @@ impl Ipadm {
8190
]);
8291
execute(cmd)?;
8392
}
93+
Err(other) => return Err(other),
8494
Ok(_) => (),
8595
};
8696

8797
// Create static address on the IP interface if it doesn't already exist
88-
let addrobj = format!("{}/omicron6", datalink);
98+
let addrobj = format!("{}/{}", datalink, IPV6_STATIC_ADDROBJ_NAME);
8999
let mut cmd = std::process::Command::new(PFEXEC);
90100
let cmd = cmd.args(&[IPADM, "show-addr", &addrobj]);
91101
match execute(cmd) {
92-
Err(_) => {
102+
Err(ExecutionError::CommandFailure(info))
103+
if info.stderr.contains(ADDROBJ_NOT_FOUND_ERR) =>
104+
{
93105
let mut cmd = std::process::Command::new(PFEXEC);
94106
let cmd = cmd.args(&[
95107
IPADM,
@@ -101,11 +113,11 @@ impl Ipadm {
101113
&listen_addr.to_string(),
102114
&addrobj,
103115
]);
104-
execute(cmd)?;
116+
execute(cmd).map(|_| ())
105117
}
106-
Ok(_) => (),
107-
};
108-
Ok(())
118+
Err(other) => Err(other),
119+
Ok(_) => Ok(()),
120+
}
109121
}
110122

111123
// Create gateway on the IP interface if it doesn't already exist

illumos-utils/src/running_zone.rs

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@
44

55
//! Utilities to manage running zones.
66
7-
use crate::addrobj::AddrObject;
7+
use crate::addrobj::{
8+
AddrObject, DHCP_ADDROBJ_NAME, IPV4_STATIC_ADDROBJ_NAME,
9+
IPV6_STATIC_ADDROBJ_NAME,
10+
};
811
use crate::dladm::Etherstub;
912
use crate::link::{Link, VnicAllocator};
1013
use crate::opte::{Port, PortTicket};
@@ -360,7 +363,11 @@ impl RunningZone {
360363
}
361364

362365
pub fn control_interface(&self) -> AddrObject {
363-
AddrObject::new(self.inner.get_control_vnic_name(), "omicron6").unwrap()
366+
AddrObject::new(
367+
self.inner.get_control_vnic_name(),
368+
IPV6_STATIC_ADDROBJ_NAME,
369+
)
370+
.unwrap()
364371
}
365372

366373
/// Runs a command within the Zone, return the output.
@@ -547,10 +554,10 @@ impl RunningZone {
547554
addrtype: AddressRequest,
548555
) -> Result<IpNetwork, EnsureAddressError> {
549556
let name = match addrtype {
550-
AddressRequest::Dhcp => "omicron",
557+
AddressRequest::Dhcp => DHCP_ADDROBJ_NAME,
551558
AddressRequest::Static(net) => match net.ip() {
552-
std::net::IpAddr::V4(_) => "omicron4",
553-
std::net::IpAddr::V6(_) => "omicron6",
559+
std::net::IpAddr::V4(_) => IPV4_STATIC_ADDROBJ_NAME,
560+
std::net::IpAddr::V6(_) => IPV6_STATIC_ADDROBJ_NAME,
554561
},
555562
};
556563
self.ensure_address_with_name(addrtype, name).await

sled-agent/src/services.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ use camino::{Utf8Path, Utf8PathBuf};
4343
use dpd_client::{types as DpdTypes, Client as DpdClient, Error as DpdError};
4444
use dropshot::HandlerTaskMode;
4545
use illumos_utils::addrobj::AddrObject;
46-
use illumos_utils::addrobj::IPV6_LINK_LOCAL_NAME;
46+
use illumos_utils::addrobj::IPV6_LINK_LOCAL_ADDROBJ_NAME;
4747
use illumos_utils::dladm::{
4848
Dladm, Etherstub, EtherstubVnic, GetSimnetError, PhysicalLink,
4949
};
@@ -2879,7 +2879,7 @@ impl ServiceManager {
28792879
// cabled together.
28802880
AddrObject::new(
28812881
&format!("tfportrear{}_0", i),
2882-
IPV6_LINK_LOCAL_NAME,
2882+
IPV6_LINK_LOCAL_ADDROBJ_NAME,
28832883
)
28842884
.unwrap()
28852885
})
@@ -2891,7 +2891,7 @@ impl ServiceManager {
28912891
.map(|i| {
28922892
AddrObject::new(
28932893
&i.to_string(),
2894-
IPV6_LINK_LOCAL_NAME,
2894+
IPV6_LINK_LOCAL_ADDROBJ_NAME,
28952895
)
28962896
.unwrap()
28972897
})
@@ -3648,7 +3648,7 @@ impl ServiceManager {
36483648
}
36493649
}
36503650
Err(e) => {
3651-
info!(self.inner.log, "chronyc command failed: {}", e);
3651+
error!(self.inner.log, "chronyc command failed: {}", e);
36523652
Err(Error::NtpZoneNotReady)
36533653
}
36543654
}

smf/zone-network-setup/manifest.xml

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,26 @@
66
<service name='oxide/zone-network-setup' type='service' version='1'>
77
<create_default_instance enabled='true' />
88

9-
<!-- Run after the operating system's svc:/network/physical service is done. -->
10-
<dependency name='physical' grouping='require_all' restart_on='none'
9+
<!-- Run after the zone's networking stack is up. -->
10+
<dependency name='network' grouping='require_all' restart_on='none'
1111
type='service'>
12-
<service_fmri value='svc:/network/physical:default' />
12+
<service_fmri value='svc:/milestone/network:default' />
1313
</dependency>
1414

15-
<dependency name='multi_user' grouping='require_all' restart_on='none'
16-
type='service'>
17-
<service_fmri value='svc:/milestone/multi-user:default' />
15+
<!-- The zone-setup binary is not ready to run until its initial properties
16+
have been set by the sled-agent, which happens after the
17+
`manifest-import` service is running.
18+
-->
19+
<dependency name='manifest-import' type='service' grouping='require_all' restart_on='none'>
20+
<service_fmri value='svc:/system/manifest-import:default' />
1821
</dependency>
1922

2023
<exec_method type='method' name='start'
2124
exec='/opt/oxide/zone-setup-cli/bin/zone-setup common-networking -d %{config/datalink} -s %{config/static_addr} -g %{config/gateway}'
2225
timeout_seconds='0' />
23-
26+
27+
<exec_method type='method' name='stop' exec=':true' timeout_seconds='0' />
28+
2429
<property_group name='startd' type='framework'>
2530
<propval name='duration' type='astring' value='transient' />
2631
</property_group>

zone-setup/src/bin/zone-setup.rs

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
77
use anyhow::anyhow;
88
use clap::{arg, command, value_parser, Arg, ArgMatches, Command};
9-
use illumos_utils::addrobj::{AddrObject, IPV6_LINK_LOCAL_NAME};
9+
use illumos_utils::addrobj::{AddrObject, IPV6_LINK_LOCAL_ADDROBJ_NAME};
1010
use illumos_utils::ipadm::Ipadm;
1111
use illumos_utils::route::{Gateway, Route};
1212
use illumos_utils::svcadm::Svcadm;
@@ -232,7 +232,7 @@ async fn do_run() -> Result<(), CmdError> {
232232
)
233233
.subcommand(
234234
Command::new(CHRONY_SETUP_CMD)
235-
.about("Sets up Chrony configuration for NTP zone")
235+
.about("Sets up Chrony configuration for NTP zone")
236236
.arg(
237237
arg!(-f --file <String> "Chrony configuration file")
238238
.default_value(CHRONY_CONFIG_FILE)
@@ -328,7 +328,7 @@ async fn switch_zone_setup(
328328
for link in &links {
329329
Zones::ensure_has_link_local_v6_address(
330330
None,
331-
&AddrObject::new(link, IPV6_LINK_LOCAL_NAME).unwrap(),
331+
&AddrObject::new(link, IPV6_LINK_LOCAL_ADDROBJ_NAME).unwrap(),
332332
)
333333
.map_err(|err| {
334334
CmdError::Failure(anyhow!(
@@ -635,7 +635,7 @@ maxslewrate 2708.333
635635
})?;
636636

637637
if old_file.clone().is_some_and(|f| f != new_config) {
638-
info!(&log, "Chrony configuration file has changed";
638+
info!(&log, "Chrony configuration file has changed";
639639
"old configuration file" => ?old_file, "new configuration file" => ?new_config,);
640640
}
641641

@@ -663,13 +663,15 @@ async fn common_nw_set_up(
663663
))
664664
})?;
665665

666-
// TODO: remove when https://github.com/oxidecomputer/stlouis/issues/435 is
667-
// addressed
668-
info!(&log, "Ensuring a temporary IP interface is created"; "data link" => ?datalink);
669-
Ipadm::set_temp_interface_for_datalink(&datalink)
666+
info!(
667+
&log,
668+
"Ensuring IP interface exists on datalink";
669+
"datalink" => datalink
670+
);
671+
Ipadm::ensure_ip_interface_exists(datalink)
670672
.map_err(|err| CmdError::Failure(anyhow!(err)))?;
671673

672-
info!(&log, "Setting MTU to 9000 for IPv6 and IPv4"; "data link" => ?datalink);
674+
info!(&log, "Setting MTU to 9000 for IPv6 and IPv4"; "datalink" => ?datalink);
673675
Ipadm::set_interface_mtu(&datalink)
674676
.map_err(|err| CmdError::Failure(anyhow!(err)))?;
675677

@@ -705,11 +707,11 @@ async fn common_nw_set_up(
705707
if gw.is_empty() {
706708
info!(&log, "Underlay is not available yet. Not ensuring there is a default route");
707709
} else {
708-
// We can safely retrieve the first address only as the CLI only accepts a single item.
710+
// We can safely retrieve the first address only as the CLI only accepts a single item.
709711
let gw = gw.first().unwrap();
710712

711713
// Ensuring default route with gateway must happen after peer agents have been initialized.
712-
// Omicron zones will be able ensure a default route with gateway immediately, but the
714+
// Omicron zones will be able ensure a default route with gateway immediately, but the
713715
// switch zone on the secondary scrimlet might need a few tries while it waits.
714716
retry_notify(
715717
retry_policy_local(),

0 commit comments

Comments
 (0)