Skip to content

Commit

Permalink
Fix race between sled-agent and zone-setup service
Browse files Browse the repository at this point in the history
- Fixes #6149
- Most zones run the `zone-setup-service` once, at startup, with their
  underlay addresses already provided by the sled-agent. That's not true
  for the switch zone, which starts with only a localhost address, and
  then is provided an underlay address by the sled-agent only after the
  bootstrapping process has proceededed further. However, the
  zone-setup-service previously deleted its IP interfaces prior to
  setting the underlay address on it, apparently as a workaround for
  oxidecomputer/stlouis#435. That's fine for
  other zones, but that races with the sled-agent setting that underlay
  address later in the switch zone. It's possible for the
  zone-setup-service to delete the interface _after_ those addresses are
  set, which obviously prevents the rest of the control plane from
  deploying correctly. This fixes the issue by simply removing that call
  to `ipadm delete-if` in the zone-setup-service. The mentioned issue
  has been resolved, and the workaround is no longer needed.
- Extract out a few constants, some whitespace cleanup
  • Loading branch information
bnaecker committed Jul 24, 2024
1 parent c1ecacf commit 0c9410b
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 49 deletions.
16 changes: 13 additions & 3 deletions illumos-utils/src/addrobj.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,17 @@
//! API for operating on addrobj objects.

/// The name provided to all link-local IPv6 addresses.
pub const IPV6_LINK_LOCAL_NAME: &str = "ll";
pub const IPV6_LINK_LOCAL_ADDROBJ_NAME: &str = "ll";

/// The name provided to all static IPv6 underlay addresses.
pub const IPV6_STATIC_ADDROBJ_NAME: &str = "omicron6";

/// The name provided to all static IPv4 addresses, usually for public OPTE
/// interfaces.
pub const IPV4_STATIC_ADDROBJ_NAME: &str = "omicron6";

/// The name provided to DHCP-configured addresses, of either family.
pub const DHCP_ADDROBJ_NAME: &str = "omicron";

/// Describes an "addrobj", which is the combination of an interface
/// with an associated name.
Expand Down Expand Up @@ -59,7 +69,7 @@ impl AddrObject {
/// Create a new addrobj on the same interface with the IPv6 link-local
/// name.
pub fn link_local_on_same_interface(&self) -> Result<Self, ParseError> {
self.on_same_interface(IPV6_LINK_LOCAL_NAME)
self.on_same_interface(IPV6_LINK_LOCAL_ADDROBJ_NAME)
}

pub fn new(interface: &str, name: &str) -> Result<Self, ParseError> {
Expand All @@ -76,7 +86,7 @@ impl AddrObject {

/// A link-local IPv6 addrobj over the provided interface.
pub fn link_local(interface: &str) -> Result<Self, ParseError> {
Self::new(interface, IPV6_LINK_LOCAL_NAME)
Self::new(interface, IPV6_LINK_LOCAL_ADDROBJ_NAME)
}

pub fn interface(&self) -> &str {
Expand Down
42 changes: 17 additions & 25 deletions illumos-utils/src/ipadm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,19 @@

//! Utilities for managing IP interfaces.

use crate::addrobj::{IPV6_LINK_LOCAL_ADDROBJ_NAME, IPV6_STATIC_ADDROBJ_NAME};
use crate::zone::IPADM;
use crate::{execute, ExecutionError, PFEXEC};
use std::net::Ipv6Addr;

/// Wraps commands for interacting with interfaces.
pub struct Ipadm {}

/// Expected error message contents when showing an addrobj that doesn't exist.
const ADDROBJ_NOT_FOUND_ERR: &str = "Address object not found";

#[cfg_attr(any(test, feature = "testing"), mockall::automock)]
impl Ipadm {
// Remove current IP interface and create a new temporary one.
pub fn set_temp_interface_for_datalink(
datalink: &str,
) -> Result<(), ExecutionError> {
let mut cmd = std::process::Command::new(PFEXEC);
let cmd = cmd.args(&[IPADM, "delete-if", datalink]);
// First we remove IP interface if it already exists. If it doesn't
// exist and the command returns an error we continue anyway as
// the next step is to create it.
let _ = execute(cmd);

let mut cmd = std::process::Command::new(PFEXEC);
let cmd = cmd.args(&[IPADM, "create-if", "-t", datalink]);
execute(cmd)?;
Ok(())
}

// Set MTU to 9000 on both IPv4 and IPv6
pub fn set_interface_mtu(datalink: &str) -> Result<(), ExecutionError> {
let mut cmd = std::process::Command::new(PFEXEC);
Expand Down Expand Up @@ -65,11 +52,13 @@ impl Ipadm {
listen_addr: &Ipv6Addr,
) -> Result<(), ExecutionError> {
// Create auto-configured address on the IP interface if it doesn't already exist
let addrobj = format!("{}/ll", datalink);
let addrobj = format!("{}/{}", datalink, IPV6_LINK_LOCAL_ADDROBJ_NAME);
let mut cmd = std::process::Command::new(PFEXEC);
let cmd = cmd.args(&[IPADM, "show-addr", &addrobj]);
match execute(cmd) {
Err(_) => {
Err(ExecutionError::CommandFailure(info))
if info.stderr.contains(ADDROBJ_NOT_FOUND_ERR) =>
{
let mut cmd = std::process::Command::new(PFEXEC);
let cmd = cmd.args(&[
IPADM,
Expand All @@ -81,15 +70,18 @@ impl Ipadm {
]);
execute(cmd)?;
}
Err(other) => return Err(other),
Ok(_) => (),
};

// Create static address on the IP interface if it doesn't already exist
let addrobj = format!("{}/omicron6", datalink);
let addrobj = format!("{}/{}", datalink, IPV6_STATIC_ADDROBJ_NAME);
let mut cmd = std::process::Command::new(PFEXEC);
let cmd = cmd.args(&[IPADM, "show-addr", &addrobj]);
match execute(cmd) {
Err(_) => {
Err(ExecutionError::CommandFailure(info))
if info.stderr.contains(ADDROBJ_NOT_FOUND_ERR) =>
{
let mut cmd = std::process::Command::new(PFEXEC);
let cmd = cmd.args(&[
IPADM,
Expand All @@ -101,11 +93,11 @@ impl Ipadm {
&listen_addr.to_string(),
&addrobj,
]);
execute(cmd)?;
execute(cmd).map(|_| ())
}
Ok(_) => (),
};
Ok(())
Err(other) => Err(other),
Ok(_) => Ok(()),
}
}

// Create gateway on the IP interface if it doesn't already exist
Expand Down
17 changes: 12 additions & 5 deletions illumos-utils/src/running_zone.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@

//! Utilities to manage running zones.

use crate::addrobj::AddrObject;
use crate::addrobj::{
AddrObject, DHCP_ADDROBJ_NAME, IPV4_STATIC_ADDROBJ_NAME,
IPV6_STATIC_ADDROBJ_NAME,
};
use crate::dladm::Etherstub;
use crate::link::{Link, VnicAllocator};
use crate::opte::{Port, PortTicket};
Expand Down Expand Up @@ -360,7 +363,11 @@ impl RunningZone {
}

pub fn control_interface(&self) -> AddrObject {
AddrObject::new(self.inner.get_control_vnic_name(), "omicron6").unwrap()
AddrObject::new(
self.inner.get_control_vnic_name(),
IPV6_STATIC_ADDROBJ_NAME,
)
.unwrap()
}

/// Runs a command within the Zone, return the output.
Expand Down Expand Up @@ -547,10 +554,10 @@ impl RunningZone {
addrtype: AddressRequest,
) -> Result<IpNetwork, EnsureAddressError> {
let name = match addrtype {
AddressRequest::Dhcp => "omicron",
AddressRequest::Dhcp => DHCP_ADDROBJ_NAME,
AddressRequest::Static(net) => match net.ip() {
std::net::IpAddr::V4(_) => "omicron4",
std::net::IpAddr::V6(_) => "omicron6",
std::net::IpAddr::V4(_) => IPV4_STATIC_ADDROBJ_NAME,
std::net::IpAddr::V6(_) => IPV6_STATIC_ADDROBJ_NAME,
},
};
self.ensure_address_with_name(addrtype, name).await
Expand Down
8 changes: 4 additions & 4 deletions sled-agent/src/services.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ use camino::{Utf8Path, Utf8PathBuf};
use dpd_client::{types as DpdTypes, Client as DpdClient, Error as DpdError};
use dropshot::HandlerTaskMode;
use illumos_utils::addrobj::AddrObject;
use illumos_utils::addrobj::IPV6_LINK_LOCAL_NAME;
use illumos_utils::addrobj::IPV6_LINK_LOCAL_ADDROBJ_NAME;
use illumos_utils::dladm::{
Dladm, Etherstub, EtherstubVnic, GetSimnetError, PhysicalLink,
};
Expand Down Expand Up @@ -2879,7 +2879,7 @@ impl ServiceManager {
// cabled together.
AddrObject::new(
&format!("tfportrear{}_0", i),
IPV6_LINK_LOCAL_NAME,
IPV6_LINK_LOCAL_ADDROBJ_NAME,
)
.unwrap()
})
Expand All @@ -2891,7 +2891,7 @@ impl ServiceManager {
.map(|i| {
AddrObject::new(
&i.to_string(),
IPV6_LINK_LOCAL_NAME,
IPV6_LINK_LOCAL_ADDROBJ_NAME,
)
.unwrap()
})
Expand Down Expand Up @@ -3648,7 +3648,7 @@ impl ServiceManager {
}
}
Err(e) => {
info!(self.inner.log, "chronyc command failed: {}", e);
error!(self.inner.log, "chronyc command failed: {}", e);
Err(Error::NtpZoneNotReady)
}
}
Expand Down
18 changes: 6 additions & 12 deletions zone-setup/src/bin/zone-setup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

use anyhow::anyhow;
use clap::{arg, command, value_parser, Arg, ArgMatches, Command};
use illumos_utils::addrobj::{AddrObject, IPV6_LINK_LOCAL_NAME};
use illumos_utils::addrobj::{AddrObject, IPV6_LINK_LOCAL_ADDROBJ_NAME};
use illumos_utils::ipadm::Ipadm;
use illumos_utils::route::{Gateway, Route};
use illumos_utils::svcadm::Svcadm;
Expand Down Expand Up @@ -232,7 +232,7 @@ async fn do_run() -> Result<(), CmdError> {
)
.subcommand(
Command::new(CHRONY_SETUP_CMD)
.about("Sets up Chrony configuration for NTP zone")
.about("Sets up Chrony configuration for NTP zone")
.arg(
arg!(-f --file <String> "Chrony configuration file")
.default_value(CHRONY_CONFIG_FILE)
Expand Down Expand Up @@ -328,7 +328,7 @@ async fn switch_zone_setup(
for link in &links {
Zones::ensure_has_link_local_v6_address(
None,
&AddrObject::new(link, IPV6_LINK_LOCAL_NAME).unwrap(),
&AddrObject::new(link, IPV6_LINK_LOCAL_ADDROBJ_NAME).unwrap(),
)
.map_err(|err| {
CmdError::Failure(anyhow!(
Expand Down Expand Up @@ -635,7 +635,7 @@ maxslewrate 2708.333
})?;

if old_file.clone().is_some_and(|f| f != new_config) {
info!(&log, "Chrony configuration file has changed";
info!(&log, "Chrony configuration file has changed";
"old configuration file" => ?old_file, "new configuration file" => ?new_config,);
}

Expand Down Expand Up @@ -663,12 +663,6 @@ async fn common_nw_set_up(
))
})?;

// TODO: remove when https://github.com/oxidecomputer/stlouis/issues/435 is
// addressed
info!(&log, "Ensuring a temporary IP interface is created"; "data link" => ?datalink);
Ipadm::set_temp_interface_for_datalink(&datalink)
.map_err(|err| CmdError::Failure(anyhow!(err)))?;

info!(&log, "Setting MTU to 9000 for IPv6 and IPv4"; "data link" => ?datalink);
Ipadm::set_interface_mtu(&datalink)
.map_err(|err| CmdError::Failure(anyhow!(err)))?;
Expand Down Expand Up @@ -705,11 +699,11 @@ async fn common_nw_set_up(
if gw.is_empty() {
info!(&log, "Underlay is not available yet. Not ensuring there is a default route");
} else {
// We can safely retrieve the first address only as the CLI only accepts a single item.
// We can safely retrieve the first address only as the CLI only accepts a single item.
let gw = gw.first().unwrap();

// Ensuring default route with gateway must happen after peer agents have been initialized.
// Omicron zones will be able ensure a default route with gateway immediately, but the
// Omicron zones will be able ensure a default route with gateway immediately, but the
// switch zone on the secondary scrimlet might need a few tries while it waits.
retry_notify(
retry_policy_local(),
Expand Down

0 comments on commit 0c9410b

Please sign in to comment.