Setup the VMM reservoir in a background task #5124

Merged on Mar 7, 2024 · 35 commits

Changes from 4 commits

Commits
1d0ed8c
Setup the VMM reservoir in a background task
andrewjstone Feb 22, 2024
bbbe079
comments
andrewjstone Feb 22, 2024
bdea2f6
review fixes
andrewjstone Feb 23, 2024
710b0ce
Wait for VMM reservoir allocation when ensuring propolis instances
andrewjstone Feb 26, 2024
233f2d1
Revert "Wait for VMM reservoir allocation when ensuring propolis inst…
andrewjstone Feb 27, 2024
bedf0c1
remove watch from vmm_reservoir
andrewjstone Feb 27, 2024
7bdf2e3
Add sled_agent_get to nexus internal API
andrewjstone Feb 27, 2024
7c4a2de
wip
andrewjstone Feb 28, 2024
58a84fa
wip
andrewjstone Feb 28, 2024
f6116b6
wip
andrewjstone Feb 28, 2024
cf185cf
wip
andrewjstone Feb 28, 2024
7dbcedc
wip
andrewjstone Feb 29, 2024
7429a8d
wip
andrewjstone Feb 29, 2024
c6a396c
wip
andrewjstone Feb 29, 2024
32cd05b
wip
andrewjstone Feb 29, 2024
e7ce5af
wip
andrewjstone Feb 29, 2024
00d248e
wip
andrewjstone Feb 29, 2024
adc29dc
Solid test for sled-agent
andrewjstone Feb 29, 2024
3418a60
Make sled upserts conditional on rcgen
andrewjstone Mar 1, 2024
0488749
extra rcgen check
andrewjstone Mar 1, 2024
8a7d297
plumb through reservoir size changes
andrewjstone Mar 4, 2024
f379afb
typo
andrewjstone Mar 4, 2024
9c130e2
Merge branch 'main' into ajs/vmm-reservoir-bg-thread
andrewjstone Mar 4, 2024
f49208e
Return an result for sled upserts, with error for decommissioned
andrewjstone Mar 5, 2024
5b1914b
fix warnings
andrewjstone Mar 5, 2024
e5795c9
clippy
andrewjstone Mar 5, 2024
8d1ae0c
Merge branch 'main' into ajs/vmm-reservoir-bg-thread
andrewjstone Mar 5, 2024
0f904f4
Give sled-agent its own generation number
andrewjstone Mar 7, 2024
4fd1afd
review fixes
andrewjstone Mar 7, 2024
edbf0a0
Merge branch 'main' into ajs/vmm-reservoir-bg-thread
andrewjstone Mar 7, 2024
5b01ef0
Handle sled decommissioning
andrewjstone Mar 7, 2024
332832e
Merge branch 'main' into ajs/vmm-reservoir-bg-thread
andrewjstone Mar 7, 2024
e9167a8
type fix for Baseboard
andrewjstone Mar 7, 2024
c611a81
COMMENTS
andrewjstone Mar 7, 2024
9027e5b
Merge branch 'main' into ajs/vmm-reservoir-bg-thread
andrewjstone Mar 7, 2024
1 change: 1 addition & 0 deletions Cargo.lock

Generated file; diff not rendered by default.

1 change: 1 addition & 0 deletions sled-agent/Cargo.toml
@@ -28,6 +28,7 @@ dpd-client.workspace = true
display-error-chain.workspace = true
dropshot.workspace = true
flate2.workspace = true
flume.workspace = true
futures.workspace = true
glob.workspace = true
hex.workspace = true
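The only dependency added here is flume. Presumably it backs the new background reservoir code, since flume channels have both blocking and async ends and make it easy for async code to hand slow work to a dedicated OS thread. The sketch below shows that general pattern only; the request enum, the oneshot reply, and the sizes are invented for illustration and are not taken from this PR.

```rust
// Sketch of the pattern flume enables: async code handing blocking work
// (such as a reservoir resize) to a dedicated OS thread. Everything named
// here is hypothetical and not part of the PR.
use std::thread;

enum ReservoirRequest {
    SetSize {
        mib: u64,
        reply: tokio::sync::oneshot::Sender<Result<(), String>>,
    },
}

#[tokio::main]
async fn main() {
    let (tx, rx) = flume::unbounded::<ReservoirRequest>();

    // Dedicated worker thread: the blocking operation stays off the tokio runtime.
    thread::spawn(move || {
        while let Ok(req) = rx.recv() {
            match req {
                ReservoirRequest::SetSize { mib, reply } => {
                    // A real worker would perform the slow reservoir resize here.
                    println!("pretending to grow the reservoir to {mib} MiB");
                    let _ = reply.send(Ok(()));
                }
            }
        }
    });

    // Async side: send a request and await the reply without blocking a task.
    let (reply_tx, reply_rx) = tokio::sync::oneshot::channel();
    tx.send_async(ReservoirRequest::SetSize { mib: 8192, reply: reply_tx })
        .await
        .expect("reservoir worker exited");
    reply_rx
        .await
        .expect("worker dropped the reply")
        .expect("resize failed");
}
```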
26 changes: 25 additions & 1 deletion sled-agent/src/instance.rs
@@ -33,6 +33,7 @@ use illumos_utils::svc::wait_for_service;
use illumos_utils::zone::Zones;
use illumos_utils::zone::PROPOLIS_ZONE_PREFIX;
use omicron_common::address::NEXUS_INTERNAL_PORT;
use omicron_common::api::external::ByteCount;
use omicron_common::api::internal::nexus::{
InstanceRuntimeState, SledInstanceState, VmmRuntimeState,
};
@@ -49,7 +50,7 @@ use slog::Logger;
use std::net::IpAddr;
use std::net::{SocketAddr, SocketAddrV6};
use std::sync::Arc;
use tokio::sync::{mpsc, oneshot};
use tokio::sync::{mpsc, oneshot, watch};
use uuid::Uuid;

// The depth of the request queue for the instance.
@@ -125,6 +126,9 @@ pub enum Error {

#[error("Instance dropped our request")]
RequestDropped(#[from] oneshot::error::RecvError),

#[error("VMM reservoir watch dropped")]
VmmReservoirWatchDropped(#[from] watch::error::RecvError),
}

// Issues read-only, idempotent HTTP requests at propolis until it responds with
@@ -354,6 +358,10 @@ struct InstanceRunner {

// Object representing membership in the "instance manager".
instance_ticket: InstanceTicket,

// Used to ensure the reservoir is allocated with enough capacity
// for starting an instance.
vmm_reservoir_watch: watch::Receiver<Option<ByteCount>>,
}

impl InstanceRunner {
@@ -948,6 +956,7 @@ impl Instance {
storage,
zone_bundler,
zone_builder_factory,
vmm_reservoir_watch,
} = services;

let mut dhcp_config = DhcpCfg {
@@ -1028,6 +1037,7 @@ impl Instance {
zone_builder_factory,
zone_bundler,
instance_ticket: ticket,
vmm_reservoir_watch,
};

let runner_handle =
@@ -1179,6 +1189,20 @@
&mut self,
migration_params: Option<InstanceMigrationTargetParams>,
) -> Result<(), Error> {
// Wait for enough allocated reservoir space for at least this VM.
@smklein (Collaborator) commented on Feb 26, 2024:

I'm a little concerned by this, since it feels kinda like a partial solution? Yes, this would be correct if we only provisioned a single VM on this sled, but that case seems unlikely to me.

Should we just wait for the entire reservoir to be "provisioned according to what Nexus expects", for now?

An example where this "wait for a single VM's worth of RAM" check would be bad:

  • Sled Agent boots, starts prepping reservoir (slowly). Assume total reservoir size of 1 TiB.
  • Nexus tries to provision 50 VMs to this sled. Each wants to use 10 GiB of RAM.
  • At the time Nexus sends the "instance ensure" requests to the sled agent, the reservoir is only 1% set-up. There's 10 GiB RAM total in there.
  • This check succeeds!

In this case: we over-provision the sled! The VMs might misbehave, since they're operating with a reservoir that doesn't have enough space.

If we're concerned about Nexus being able to successfully provision instances to sleds quickly -- this can still fail! From a user's perspective, this would look like "I tried to start a VM, Nexus thought it was ready, the sled said no, so my allocation failed". So it's kinda a roll of the dice between "does my request fail because Nexus was too eager" and "does my request succeed but violate system invariants because sled agent is too eager".

Alternatives?

There seem to be a few other options here that avoid this issue, but they have other tradeoffs.

Option 1: The Instance Manager refuses to provision VMs until the reservoir is "at least the size that Nexus expected it to be". This means that we'd never overprovision, but we would make it more likely for instance ensure requests to fail during boot. This could be mitigated in a few ways - blocking the instance provision request until the reservoir is sized, using HTTP error codes and a retry loop during Nexus instance provision saga, etc.

Option 2: The "Instance Manager" part of Sled Agent refuses to be online until it gets this sizing information from Nexus, and as a part of that handshake, Nexus updates how it perceives the Sled. The interaction could look like:

  • Sled Agent -> Nexus: I'm booting, what size should my reservoir be?
  • Nexus -> Sled Agent: It should be XXX % of your RAM. I'm going to flag you as booting (temporarily non-provisionable).
  • Sled Agent -> Nexus: My reservoir is ready! (Flag self as "no longer booting" for provisioning).

There are certainly other options in this space too. But I think that we should be very careful to avoid overprovisioning, even in boot-only edge cases, if that's a system property we claim to have.

@andrewjstone (Contributor, Author) replied:

> I'm a little concerned by this, since it feels kinda like a partial solution? Yes, this would be correct if we only provisioned a single VM on this sled, but that case seems unlikely to me.
>
> Should we just wait for the entire reservoir to be "provisioned according to what Nexus expects", for now?

I treated this check as a proxy for the entire reservoir being ready. The reservoir doesn't report being partially filled right now, and the size only gets set once at RSS time. However, I fully agree with what you said: this is only a partial solution, and Nexus can increase the reservoir size. The old size may still be big enough for each individual new instance, but not big enough for all the instances intended to run on the sled. Your option 1 would fix this immediate overprovisioning problem. This is also the reason I proposed splitting the current reservoir allocation into "what Nexus wants the next reservation to be" and "what the sled-agent reports it has actually allocated". If the real change in size hasn't reached Nexus yet, then Nexus won't provision any extra VMs. Nexus will always know how much total space it has and what it's capable of provisioning. This is similar to your option 2.

Unfortunately, the above solution that splits the desired and actual states of the reservoir doesn't work when a sled is rebooted: reservoir allocation may fail on the reboot, and Nexus will not know about it until it tries to place a VM and the request fails because the reservoir hasn't been allocated. I think this is also a problem with Option 2, as there is a TOCTOU where the sled-agent can say "hey, I'm ready", then crash, restart, and fail allocation. All in all, I think I prefer option 1.

A contributor replied:

FWIW, I don't think sled agent will end up overprovisioning memory for VMs if an instance start request arrives while the reservoir is smaller than Nexus expects: if I understand bhyve's behavior correctly, attempting to start a VM with insufficient memory in the reservoir will fail outright and won't use any non-reservoir memory. The instance has still failed to start, of course, and we have to decide how to deal with that, but I don't think the change as written will overcommit sled memory.

A collaborator replied:

Gotcha, okay, that's good to know! The problem still exists in the form of "will this instance provisioning request fail in a visible way for a client", but that's good that we wouldn't end up violating that constraint either way.

let mem_needed = self.properties.memory;
let val_ref = self
.vmm_reservoir_watch
.wait_for(|val| {
val.map_or(false, |size| {
size.to_whole_mebibytes() >= mem_needed
})
})
.await?;
// Drop the ref so we don't hold the lock on the watch and block
// the producer.
drop(val_ref);

if let Some(running_state) = self.running_state.as_ref() {
info!(
&self.log,
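The `wait_for` call in the hunk above is the heart of the approach the reviewers were debating: instance startup parks on a `tokio::sync::watch` channel until the published reservoir size covers the VM's memory. The standalone sketch below reproduces that pattern with made-up types and sizes (a plain MiB count instead of `ByteCount`). It also shows why the returned `Ref` is dropped promptly: it holds the watch's internal read lock, and keeping it alive would block the sender's next update.

```rust
use tokio::sync::watch;

#[tokio::main]
async fn main() {
    // Stand-in for the reservoir size published by a background task,
    // in MiB (the real channel carries an Option<ByteCount>).
    let (tx, mut rx) = watch::channel::<Option<u64>>(None);

    // Simulate the background allocation finishing some time later.
    tokio::spawn(async move {
        tokio::time::sleep(std::time::Duration::from_millis(50)).await;
        let _ = tx.send(Some(8192));
    });

    // Park until the published size covers what this hypothetical VM needs.
    let needed_mib = 4096;
    let seen = rx
        .wait_for(|size| size.map_or(false, |mib| mib >= needed_mib))
        .await
        .expect("reservoir publisher dropped");

    // `seen` is a watch::Ref holding the channel's read lock; drop it
    // promptly so the producer's next send() is not blocked.
    println!("reservoir ready: {:?} MiB", *seen);
    drop(seen);
}
```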
118 changes: 17 additions & 101 deletions sled-agent/src/instance_manager.rs
@@ -13,24 +13,25 @@ use crate::params::{
InstanceHardware, InstanceMigrationSourceParams, InstancePutStateResponse,
InstanceStateRequested, InstanceUnregisterResponse,
};
use crate::vmm_reservoir::VmmReservoirManagerHandle;
use crate::zone_bundle::BundleError;
use crate::zone_bundle::ZoneBundler;
use omicron_common::api::external::ByteCount;

use anyhow::anyhow;
use illumos_utils::dladm::Etherstub;
use illumos_utils::link::VnicAllocator;
use illumos_utils::opte::PortManager;
use illumos_utils::running_zone::ZoneBuilderFactory;
use illumos_utils::vmm_reservoir;
use omicron_common::api::external::ByteCount;
use omicron_common::api::internal::nexus::InstanceRuntimeState;
use omicron_common::api::internal::nexus::SledInstanceState;
use omicron_common::api::internal::nexus::VmmRuntimeState;
use sled_storage::manager::StorageHandle;
use slog::Logger;
use std::collections::BTreeMap;
use std::net::SocketAddr;
use std::sync::{Arc, Mutex};
use std::sync::Arc;
use tokio::sync::watch;
use tokio::sync::{mpsc, oneshot};
use uuid::Uuid;

@@ -48,12 +49,6 @@ pub enum Error {
#[error("OPTE port management error: {0}")]
Opte(#[from] illumos_utils::opte::Error),

#[error("Failed to create reservoir: {0}")]
Reservoir(#[from] vmm_reservoir::Error),

#[error("Invalid reservoir configuration: {0}")]
ReservoirConfig(String),

#[error("Cannot find data link: {0}")]
Underlay(#[from] sled_hardware::underlay::Error),

@@ -72,32 +67,21 @@
RequestDropped(#[from] oneshot::error::RecvError),
}

pub enum ReservoirMode {
None,
Size(u32),
Percentage(u8),
}

pub(crate) struct InstanceManagerServices {
pub nexus_client: NexusClientWithResolver,
pub vnic_allocator: VnicAllocator<Etherstub>,
pub port_manager: PortManager,
pub storage: StorageHandle,
pub zone_bundler: ZoneBundler,
pub zone_builder_factory: ZoneBuilderFactory,
pub vmm_reservoir_watch: watch::Receiver<Option<ByteCount>>,
}

// Describes the internals of the "InstanceManager", though most of the
// instance manager's state exists within the "InstanceManagerRunner" structure.
struct InstanceManagerInternal {
log: Logger,
tx: mpsc::Sender<InstanceManagerRequest>,
// NOTE: Arguably, this field could be "owned" by the InstanceManagerRunner.
// It was not moved there, and the reservoir functions were not converted to
// use the message-passing interface (see: "InstanceManagerRequest") because
// callers of "get/set reservoir size" are not async, and (in the case of
// getting the size) they also do not expect a "Result" type.
reservoir_size: Mutex<ByteCount>,
vmm_reservoir_manager: VmmReservoirManagerHandle,

#[allow(dead_code)]
runner_handle: tokio::task::JoinHandle<()>,
@@ -110,6 +94,7 @@ pub struct InstanceManager {

impl InstanceManager {
/// Initializes a new [`InstanceManager`] object.
#[allow(clippy::too_many_arguments)]
pub fn new(
log: Logger,
nexus_client: NexusClientWithResolver,
@@ -118,6 +103,7 @@
storage: StorageHandle,
zone_bundler: ZoneBundler,
zone_builder_factory: ZoneBuilderFactory,
vmm_reservoir_manager: VmmReservoirManagerHandle,
) -> Result<InstanceManager, Error> {
let (tx, rx) = mpsc::channel(QUEUE_SIZE);
let (terminate_tx, terminate_rx) = mpsc::unbounded_channel();
@@ -135,98 +121,21 @@
storage,
zone_bundler,
zone_builder_factory,
vmm_reservoir_watch: vmm_reservoir_manager.watcher(),
};

let runner_handle =
tokio::task::spawn(async move { runner.run().await });

Ok(Self {
inner: Arc::new(InstanceManagerInternal {
log,
tx,
// no reservoir size set on startup
reservoir_size: Mutex::new(ByteCount::from_kibibytes_u32(0)),
vmm_reservoir_manager,
runner_handle,
}),
})
}

/// Sets the VMM reservoir to the requested percentage of usable physical
/// RAM or to a size in MiB. Either mode will round down to the nearest
/// aligned size required by the control plane.
pub fn set_reservoir_size(
&self,
hardware: &sled_hardware::HardwareManager,
mode: ReservoirMode,
) -> Result<(), Error> {
let hardware_physical_ram_bytes = hardware.usable_physical_ram_bytes();
let req_bytes = match mode {
ReservoirMode::None => return Ok(()),
ReservoirMode::Size(mb) => {
let bytes = ByteCount::from_mebibytes_u32(mb).to_bytes();
if bytes > hardware_physical_ram_bytes {
return Err(Error::ReservoirConfig(format!(
"cannot specify a reservoir of {bytes} bytes when \
physical memory is {hardware_physical_ram_bytes} bytes",
)));
}
bytes
}
ReservoirMode::Percentage(percent) => {
if !matches!(percent, 1..=99) {
return Err(Error::ReservoirConfig(format!(
"VMM reservoir percentage of {} must be between 0 and \
100",
percent
)));
};
(hardware_physical_ram_bytes as f64 * (percent as f64 / 100.0))
.floor() as u64
}
};

let req_bytes_aligned = vmm_reservoir::align_reservoir_size(req_bytes);

if req_bytes_aligned == 0 {
warn!(
self.inner.log,
"Requested reservoir size of {} bytes < minimum aligned size \
of {} bytes",
req_bytes,
vmm_reservoir::RESERVOIR_SZ_ALIGN
);
return Ok(());
}

// The max ByteCount value is i64::MAX, which is ~8 million TiB.
// As this value is either a percentage of DRAM or a size in MiB
// represented as a u32, constructing this should always work.
let reservoir_size = ByteCount::try_from(req_bytes_aligned).unwrap();
if let ReservoirMode::Percentage(percent) = mode {
info!(
self.inner.log,
"{}% of {} physical ram = {} bytes)",
percent,
hardware_physical_ram_bytes,
req_bytes,
);
}
info!(
self.inner.log,
"Setting reservoir size to {reservoir_size} bytes"
);
vmm_reservoir::ReservoirControl::set(reservoir_size)?;

*self.inner.reservoir_size.lock().unwrap() = reservoir_size;

Ok(())
}

/// Returns the last-set size of the reservoir
pub fn reservoir_size(&self) -> ByteCount {
*self.inner.reservoir_size.lock().unwrap()
}

pub async fn ensure_registered(
&self,
instance_id: Uuid,
@@ -379,6 +288,11 @@
.map_err(|_| Error::FailedSendInstanceManagerClosed)?;
rx.await?
}

/// Returns the last-set size of the reservoir
pub fn reservoir_size(&self) -> ByteCount {
self.inner.vmm_reservoir_manager.reservoir_size()
}
}

// Most requests that can be sent to the "InstanceManagerRunner" task.
@@ -472,6 +386,7 @@ struct InstanceManagerRunner {
storage: StorageHandle,
zone_bundler: ZoneBundler,
zone_builder_factory: ZoneBuilderFactory,
vmm_reservoir_watch: watch::Receiver<Option<ByteCount>>,
}

impl InstanceManagerRunner {
@@ -630,6 +545,7 @@ impl InstanceManagerRunner {
storage: self.storage.clone(),
zone_bundler: self.zone_bundler.clone(),
zone_builder_factory: self.zone_builder_factory.clone(),
vmm_reservoir_watch: self.vmm_reservoir_watch.clone(),
};

let state = crate::instance::InstanceInitialState {
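The `vmm_reservoir` module that defines `VmmReservoirManagerHandle` is not among the files shown in this slice of the diff. Going only by the call sites above (`watcher()` feeding `InstanceManagerServices`, and `reservoir_size()` backing the getter), a minimal sketch of the handle might look like the following; the field names, the `spawn` constructor, and the use of plain MiB counts instead of `ByteCount` are assumptions for illustration, not the PR's implementation.

```rust
use std::thread;
use tokio::sync::watch;

/// Hypothetical sketch of the handle shape used by InstanceManager above.
/// The real type lives in sled-agent/src/vmm_reservoir.rs and publishes an
/// Option<ByteCount>; a plain MiB count is used here to stay self-contained.
#[derive(Clone)]
pub struct VmmReservoirManagerHandle {
    size_rx: watch::Receiver<Option<u64>>,
}

impl VmmReservoirManagerHandle {
    /// Spawn a background worker that performs the (slow) reservoir
    /// allocation and publishes the resulting size once it completes.
    pub fn spawn(target_mib: u64) -> Self {
        let (tx, size_rx) = watch::channel(None);
        thread::spawn(move || {
            // The real worker would call into bhyve's reservoir control
            // here; this stand-in just pretends the allocation succeeded.
            let _ = tx.send(Some(target_mib));
        });
        Self { size_rx }
    }

    /// Receiver that instance startup can `wait_for` on.
    pub fn watcher(&self) -> watch::Receiver<Option<u64>> {
        self.size_rx.clone()
    }

    /// Last-published reservoir size, or zero until allocation finishes.
    pub fn reservoir_size(&self) -> u64 {
        self.size_rx.borrow().unwrap_or(0)
    }
}
```

Pushing the slow allocation behind a handle like this is what lets sled-agent keep servicing requests while the reservoir is being set up, which is the point of moving the work into a background task.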
1 change: 1 addition & 0 deletions sled-agent/src/lib.rs
@@ -40,6 +40,7 @@ mod smf_helper;
mod storage_monitor;
mod swap_device;
mod updates;
mod vmm_reservoir;
mod zone_bundle;

#[cfg(test)]
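The lib.rs hunk simply registers the new module. To tie the pieces together, here is how the hypothetical handle sketched above could be exercised end to end; the real wiring lives in sled-agent's service setup and the real sizes come from sled hardware, neither of which appears in this change set.

```rust
#[tokio::main]
async fn main() {
    // Using the hypothetical VmmReservoirManagerHandle sketched earlier.
    // Pretend the target reservoir is 512 GiB, expressed in MiB.
    let manager = VmmReservoirManagerHandle::spawn(512 * 1024);

    // An instance-start path would gate on readiness like this.
    let mut watcher = manager.watcher();
    watcher
        .wait_for(|size| size.map_or(false, |mib| mib >= 8192))
        .await
        .expect("reservoir worker exited before publishing a size");

    // And the synchronous getter still works, now backed by the watch value.
    println!("last-set reservoir size: {} MiB", manager.reservoir_size());
}
```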