Skip to content

Commit

Permalink
Fix race between sled-agent and zone-setup service (#6152)
Browse files Browse the repository at this point in the history
- Fixes #6149
- Most zones run the `zone-network-setup` once, at startup, with their
underlay addresses already provided by the sled-agent. That's not true
for the switch zone, which starts with only a localhost address, and
then is provided an underlay address by the sled-agent only after the
bootstrapping process has proceededed further. However, the
zone-setup-service previously deleted its IP interfaces prior to setting
the underlay address on it, apparently as a workaround for
oxidecomputer/stlouis#435. That's fine for
other zones, but that races with the sled-agent setting that underlay
address later in the switch zone. It's possible for the
zone-setup-service to delete the interface _after_ those addresses are
set, which obviously prevents the rest of the control plane from
deploying correctly. This fixes the issue by simply removing that call
to `ipadm delete-if` in the zone-setup-service. The mentioned issue has
been resolved, and the workaround is no longer needed.
- Move the `zone-network-setup` service depend on the network milestone,
instead of multi-user. This just moves it earlier a bit in the
dependency graph, though should not be strictly necessary. We might want
to move the sled-agent's notion of "zone readiness" to depend on
`multi-user` instead of `single-user` in the future, so this could help
with that.
- Extract out a few constants, some whitespace cleanup
  • Loading branch information
bnaecker authored Jul 25, 2024
1 parent 836d3a2 commit 1c27e88
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 49 deletions.
16 changes: 13 additions & 3 deletions illumos-utils/src/addrobj.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,17 @@
//! API for operating on addrobj objects.

/// The name provided to all link-local IPv6 addresses.
pub const IPV6_LINK_LOCAL_NAME: &str = "ll";
pub const IPV6_LINK_LOCAL_ADDROBJ_NAME: &str = "ll";

/// The name provided to all static IPv6 underlay addresses.
pub const IPV6_STATIC_ADDROBJ_NAME: &str = "omicron6";

/// The name provided to all static IPv4 addresses, usually for public OPTE
/// interfaces.
pub const IPV4_STATIC_ADDROBJ_NAME: &str = "omicron4";

/// The name provided to DHCP-configured addresses, of either family.
pub const DHCP_ADDROBJ_NAME: &str = "omicron";

/// Describes an "addrobj", which is the combination of an interface
/// with an associated name.
Expand Down Expand Up @@ -59,7 +69,7 @@ impl AddrObject {
/// Create a new addrobj on the same interface with the IPv6 link-local
/// name.
pub fn link_local_on_same_interface(&self) -> Result<Self, ParseError> {
self.on_same_interface(IPV6_LINK_LOCAL_NAME)
self.on_same_interface(IPV6_LINK_LOCAL_ADDROBJ_NAME)
}

pub fn new(interface: &str, name: &str) -> Result<Self, ParseError> {
Expand All @@ -76,7 +86,7 @@ impl AddrObject {

/// A link-local IPv6 addrobj over the provided interface.
pub fn link_local(interface: &str) -> Result<Self, ParseError> {
Self::new(interface, IPV6_LINK_LOCAL_NAME)
Self::new(interface, IPV6_LINK_LOCAL_ADDROBJ_NAME)
}

pub fn interface(&self) -> &str {
Expand Down
50 changes: 31 additions & 19 deletions illumos-utils/src/ipadm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,37 @@

//! Utilities for managing IP interfaces.

use crate::addrobj::{IPV6_LINK_LOCAL_ADDROBJ_NAME, IPV6_STATIC_ADDROBJ_NAME};
use crate::zone::IPADM;
use crate::{execute, ExecutionError, PFEXEC};
use std::net::Ipv6Addr;

/// Wraps commands for interacting with interfaces.
pub struct Ipadm {}

/// Expected error message contents when showing an addrobj that doesn't exist.
const ADDROBJ_NOT_FOUND_ERR: &str = "Address object not found";

/// Expected error message when an interface already exists.
const INTERFACE_ALREADY_EXISTS: &str = "Interface already exists";

#[cfg_attr(any(test, feature = "testing"), mockall::automock)]
impl Ipadm {
// Remove current IP interface and create a new temporary one.
pub fn set_temp_interface_for_datalink(
/// Ensure that an IP interface exists on the provided datalink.
pub fn ensure_ip_interface_exists(
datalink: &str,
) -> Result<(), ExecutionError> {
let mut cmd = std::process::Command::new(PFEXEC);
let cmd = cmd.args(&[IPADM, "delete-if", datalink]);
// First we remove IP interface if it already exists. If it doesn't
// exist and the command returns an error we continue anyway as
// the next step is to create it.
let _ = execute(cmd);

let mut cmd = std::process::Command::new(PFEXEC);
let cmd = cmd.args(&[IPADM, "create-if", "-t", datalink]);
execute(cmd)?;
Ok(())
match execute(cmd) {
Ok(_) => Ok(()),
Err(ExecutionError::CommandFailure(info))
if info.stderr.contains(INTERFACE_ALREADY_EXISTS) =>
{
Ok(())
}
Err(e) => Err(e),
}
}

// Set MTU to 9000 on both IPv4 and IPv6
Expand Down Expand Up @@ -65,11 +72,13 @@ impl Ipadm {
listen_addr: &Ipv6Addr,
) -> Result<(), ExecutionError> {
// Create auto-configured address on the IP interface if it doesn't already exist
let addrobj = format!("{}/ll", datalink);
let addrobj = format!("{}/{}", datalink, IPV6_LINK_LOCAL_ADDROBJ_NAME);
let mut cmd = std::process::Command::new(PFEXEC);
let cmd = cmd.args(&[IPADM, "show-addr", &addrobj]);
match execute(cmd) {
Err(_) => {
Err(ExecutionError::CommandFailure(info))
if info.stderr.contains(ADDROBJ_NOT_FOUND_ERR) =>
{
let mut cmd = std::process::Command::new(PFEXEC);
let cmd = cmd.args(&[
IPADM,
Expand All @@ -81,15 +90,18 @@ impl Ipadm {
]);
execute(cmd)?;
}
Err(other) => return Err(other),
Ok(_) => (),
};

// Create static address on the IP interface if it doesn't already exist
let addrobj = format!("{}/omicron6", datalink);
let addrobj = format!("{}/{}", datalink, IPV6_STATIC_ADDROBJ_NAME);
let mut cmd = std::process::Command::new(PFEXEC);
let cmd = cmd.args(&[IPADM, "show-addr", &addrobj]);
match execute(cmd) {
Err(_) => {
Err(ExecutionError::CommandFailure(info))
if info.stderr.contains(ADDROBJ_NOT_FOUND_ERR) =>
{
let mut cmd = std::process::Command::new(PFEXEC);
let cmd = cmd.args(&[
IPADM,
Expand All @@ -101,11 +113,11 @@ impl Ipadm {
&listen_addr.to_string(),
&addrobj,
]);
execute(cmd)?;
execute(cmd).map(|_| ())
}
Ok(_) => (),
};
Ok(())
Err(other) => Err(other),
Ok(_) => Ok(()),
}
}

// Create gateway on the IP interface if it doesn't already exist
Expand Down
17 changes: 12 additions & 5 deletions illumos-utils/src/running_zone.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@

//! Utilities to manage running zones.

use crate::addrobj::AddrObject;
use crate::addrobj::{
AddrObject, DHCP_ADDROBJ_NAME, IPV4_STATIC_ADDROBJ_NAME,
IPV6_STATIC_ADDROBJ_NAME,
};
use crate::dladm::Etherstub;
use crate::link::{Link, VnicAllocator};
use crate::opte::{Port, PortTicket};
Expand Down Expand Up @@ -360,7 +363,11 @@ impl RunningZone {
}

pub fn control_interface(&self) -> AddrObject {
AddrObject::new(self.inner.get_control_vnic_name(), "omicron6").unwrap()
AddrObject::new(
self.inner.get_control_vnic_name(),
IPV6_STATIC_ADDROBJ_NAME,
)
.unwrap()
}

/// Runs a command within the Zone, return the output.
Expand Down Expand Up @@ -547,10 +554,10 @@ impl RunningZone {
addrtype: AddressRequest,
) -> Result<IpNetwork, EnsureAddressError> {
let name = match addrtype {
AddressRequest::Dhcp => "omicron",
AddressRequest::Dhcp => DHCP_ADDROBJ_NAME,
AddressRequest::Static(net) => match net.ip() {
std::net::IpAddr::V4(_) => "omicron4",
std::net::IpAddr::V6(_) => "omicron6",
std::net::IpAddr::V4(_) => IPV4_STATIC_ADDROBJ_NAME,
std::net::IpAddr::V6(_) => IPV6_STATIC_ADDROBJ_NAME,
},
};
self.ensure_address_with_name(addrtype, name).await
Expand Down
8 changes: 4 additions & 4 deletions sled-agent/src/services.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ use camino::{Utf8Path, Utf8PathBuf};
use dpd_client::{types as DpdTypes, Client as DpdClient, Error as DpdError};
use dropshot::HandlerTaskMode;
use illumos_utils::addrobj::AddrObject;
use illumos_utils::addrobj::IPV6_LINK_LOCAL_NAME;
use illumos_utils::addrobj::IPV6_LINK_LOCAL_ADDROBJ_NAME;
use illumos_utils::dladm::{
Dladm, Etherstub, EtherstubVnic, GetSimnetError, PhysicalLink,
};
Expand Down Expand Up @@ -2879,7 +2879,7 @@ impl ServiceManager {
// cabled together.
AddrObject::new(
&format!("tfportrear{}_0", i),
IPV6_LINK_LOCAL_NAME,
IPV6_LINK_LOCAL_ADDROBJ_NAME,
)
.unwrap()
})
Expand All @@ -2891,7 +2891,7 @@ impl ServiceManager {
.map(|i| {
AddrObject::new(
&i.to_string(),
IPV6_LINK_LOCAL_NAME,
IPV6_LINK_LOCAL_ADDROBJ_NAME,
)
.unwrap()
})
Expand Down Expand Up @@ -3648,7 +3648,7 @@ impl ServiceManager {
}
}
Err(e) => {
info!(self.inner.log, "chronyc command failed: {}", e);
error!(self.inner.log, "chronyc command failed: {}", e);
Err(Error::NtpZoneNotReady)
}
}
Expand Down
19 changes: 12 additions & 7 deletions smf/zone-network-setup/manifest.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,26 @@
<service name='oxide/zone-network-setup' type='service' version='1'>
<create_default_instance enabled='true' />

<!-- Run after the operating system's svc:/network/physical service is done. -->
<dependency name='physical' grouping='require_all' restart_on='none'
<!-- Run after the zone's networking stack is up. -->
<dependency name='network' grouping='require_all' restart_on='none'
type='service'>
<service_fmri value='svc:/network/physical:default' />
<service_fmri value='svc:/milestone/network:default' />
</dependency>

<dependency name='multi_user' grouping='require_all' restart_on='none'
type='service'>
<service_fmri value='svc:/milestone/multi-user:default' />
<!-- The zone-setup binary is not ready to run until its initial properties
have been set by the sled-agent, which happens after the
`manifest-import` service is running.
-->
<dependency name='manifest-import' type='service' grouping='require_all' restart_on='none'>
<service_fmri value='svc:/system/manifest-import:default' />
</dependency>

<exec_method type='method' name='start'
exec='/opt/oxide/zone-setup-cli/bin/zone-setup common-networking -d %{config/datalink} -s %{config/static_addr} -g %{config/gateway}'
timeout_seconds='0' />


<exec_method type='method' name='stop' exec=':true' timeout_seconds='0' />

<property_group name='startd' type='framework'>
<propval name='duration' type='astring' value='transient' />
</property_group>
Expand Down
24 changes: 13 additions & 11 deletions zone-setup/src/bin/zone-setup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

use anyhow::anyhow;
use clap::{arg, command, value_parser, Arg, ArgMatches, Command};
use illumos_utils::addrobj::{AddrObject, IPV6_LINK_LOCAL_NAME};
use illumos_utils::addrobj::{AddrObject, IPV6_LINK_LOCAL_ADDROBJ_NAME};
use illumos_utils::ipadm::Ipadm;
use illumos_utils::route::{Gateway, Route};
use illumos_utils::svcadm::Svcadm;
Expand Down Expand Up @@ -232,7 +232,7 @@ async fn do_run() -> Result<(), CmdError> {
)
.subcommand(
Command::new(CHRONY_SETUP_CMD)
.about("Sets up Chrony configuration for NTP zone")
.about("Sets up Chrony configuration for NTP zone")
.arg(
arg!(-f --file <String> "Chrony configuration file")
.default_value(CHRONY_CONFIG_FILE)
Expand Down Expand Up @@ -328,7 +328,7 @@ async fn switch_zone_setup(
for link in &links {
Zones::ensure_has_link_local_v6_address(
None,
&AddrObject::new(link, IPV6_LINK_LOCAL_NAME).unwrap(),
&AddrObject::new(link, IPV6_LINK_LOCAL_ADDROBJ_NAME).unwrap(),
)
.map_err(|err| {
CmdError::Failure(anyhow!(
Expand Down Expand Up @@ -635,7 +635,7 @@ maxslewrate 2708.333
})?;

if old_file.clone().is_some_and(|f| f != new_config) {
info!(&log, "Chrony configuration file has changed";
info!(&log, "Chrony configuration file has changed";
"old configuration file" => ?old_file, "new configuration file" => ?new_config,);
}

Expand Down Expand Up @@ -663,13 +663,15 @@ async fn common_nw_set_up(
))
})?;

// TODO: remove when https://github.com/oxidecomputer/stlouis/issues/435 is
// addressed
info!(&log, "Ensuring a temporary IP interface is created"; "data link" => ?datalink);
Ipadm::set_temp_interface_for_datalink(&datalink)
info!(
&log,
"Ensuring IP interface exists on datalink";
"datalink" => datalink
);
Ipadm::ensure_ip_interface_exists(datalink)
.map_err(|err| CmdError::Failure(anyhow!(err)))?;

info!(&log, "Setting MTU to 9000 for IPv6 and IPv4"; "data link" => ?datalink);
info!(&log, "Setting MTU to 9000 for IPv6 and IPv4"; "datalink" => ?datalink);
Ipadm::set_interface_mtu(&datalink)
.map_err(|err| CmdError::Failure(anyhow!(err)))?;

Expand Down Expand Up @@ -705,11 +707,11 @@ async fn common_nw_set_up(
if gw.is_empty() {
info!(&log, "Underlay is not available yet. Not ensuring there is a default route");
} else {
// We can safely retrieve the first address only as the CLI only accepts a single item.
// We can safely retrieve the first address only as the CLI only accepts a single item.
let gw = gw.first().unwrap();

// Ensuring default route with gateway must happen after peer agents have been initialized.
// Omicron zones will be able ensure a default route with gateway immediately, but the
// Omicron zones will be able ensure a default route with gateway immediately, but the
// switch zone on the secondary scrimlet might need a few tries while it waits.
retry_notify(
retry_policy_local(),
Expand Down

0 comments on commit 1c27e88

Please sign in to comment.