From 405293eea4d2bb0269b6f46d74fa22073920e5d8 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Tue, 30 Sep 2025 14:42:50 -0700 Subject: [PATCH] Secure AVIC support --- openhcl/hcl/src/ioctl.rs | 39 +- openhcl/hcl/src/ioctl/snp.rs | 60 ++- openhcl/hcl/src/ioctl/tdx.rs | 8 +- openhcl/hcl/src/vmsa.rs | 6 + openhcl/virt_mshv_vtl/src/cvm_cpuid/snp.rs | 12 +- .../virt_mshv_vtl/src/processor/snp/mod.rs | 437 ++++++++++++++++-- .../virt_mshv_vtl/src/processor/tdx/mod.rs | 8 +- vm/hv1/hvdef/src/lib.rs | 40 +- vm/loader/igvmfilegen/src/file_loader.rs | 6 + vm/loader/igvmfilegen/src/main.rs | 6 + .../igvmfilegen/src/vp_context_builder/snp.rs | 18 + vm/loader/igvmfilegen_config/src/lib.rs | 12 + vm/loader/manifests/openhcl-x64-cvm-dev.json | 3 +- .../manifests/openhcl-x64-cvm-release.json | 5 +- vm/x86/x86defs/src/cpuid.rs | 12 +- vm/x86/x86defs/src/lib.rs | 8 + vm/x86/x86defs/src/snp.rs | 250 +++++++++- vm/x86/x86defs/src/vmx.rs | 13 +- 18 files changed, 873 insertions(+), 70 deletions(-) diff --git a/openhcl/hcl/src/ioctl.rs b/openhcl/hcl/src/ioctl.rs index 748f4f9589..8e87a1db90 100644 --- a/openhcl/hcl/src/ioctl.rs +++ b/openhcl/hcl/src/ioctl.rs @@ -80,9 +80,10 @@ use std::sync::atomic::Ordering; use thiserror::Error; use user_driver::DmaClient; use user_driver::memory::MemoryBlock; +use x86defs::snp::SevAvicPage; use x86defs::snp::SevVmsa; use x86defs::tdx::TdCallResultCode; -use x86defs::vmx::ApicPage; +use x86defs::vmx::VmxApicPage; use zerocopy::FromBytes; use zerocopy::FromZeros; use zerocopy::Immutable; @@ -394,6 +395,7 @@ mod ioctls { const MSHV_INVLPGB: u16 = 0x36; const MSHV_TLBSYNC: u16 = 0x37; const MSHV_KICKCPUS: u16 = 0x38; + const MSHV_VTL_SECURE_AVIC_VTL0_PFN: u16 = 0x39; #[repr(C)] #[derive(Copy, Clone)] @@ -571,6 +573,13 @@ mod ioctls { u64 ); + ioctl_readwrite!( + hcl_read_secure_avic_vtl0_pfn, + MSHV_IOCTL, + MSHV_VTL_SECURE_AVIC_VTL0_PFN, + u64 + ); + pub const HCL_CAP_REGISTER_PAGE: u32 = 1; pub const HCL_CAP_VTL_RETURN_ACTION: u32 = 2; pub const HCL_CAP_DR6_SHARED: u32 = 3; @@ -1662,9 +1671,12 @@ enum BackingState { }, Snp { vmsa: VtlArray, 2>, + vtl0_apic_page: MappedPage, + /// VTL1 runs with the alternate interrupt injection. + vtl1_apic_page: MemoryBlock, }, Tdx { - vtl0_apic_page: MappedPage, + vtl0_apic_page: MappedPage, vtl1_apic_page: MemoryBlock, }, } @@ -1728,6 +1740,12 @@ impl HclVp { .map_err(|e| Error::MmapVp(e, Some(Vtl::Vtl1)))?; BackingState::Snp { vmsa: [vmsa_vtl0, vmsa_vtl1].into(), + vtl0_apic_page: MappedPage::new(fd, MSHV_APIC_PAGE_OFFSET | vp as i64) + .map_err(|e| Error::MmapVp(e, Some(Vtl::Vtl0)))?, + vtl1_apic_page: private_dma_client + .ok_or(Error::MissingPrivateMemory)? + .allocate_dma_buffer(HV_PAGE_SIZE as usize) + .map_err(Error::AllocVp)?, } } IsolationType::Tdx => BackingState::Tdx { @@ -2254,6 +2272,8 @@ impl<'a, T: Backing<'a>> ProcessorRunner<'a, T> { | HvX64RegisterName::VsmVpSecureConfigVtl0 | HvX64RegisterName::VsmVpSecureConfigVtl1 | HvX64RegisterName::CrInterceptControl + | HvX64RegisterName::SevAvicGpa + | HvX64RegisterName::GuestVsmPartitionConfig ) )); self.set_vp_registers_hvcall_inner(vtl, ®isters) @@ -3208,6 +3228,21 @@ impl Hcl { vp_pfn } + /// Gets the PFN for the VTL 0 secure AVIC + pub fn secure_avic_vtl0_pfn(&self, cpu_index: u32) -> u64 { + let mut savic_pfn = cpu_index as u64; // input vp, output pfn + + // SAFETY: The ioctl requires no prerequisites other than the Secure AVIC VTL 0 + // should be mapped. This ioctl should never fail as long as the VTL 0 + // Secure AVIC was mapped. + unsafe { + hcl_read_secure_avic_vtl0_pfn(self.mshv_vtl.file.as_raw_fd(), &mut savic_pfn) + .expect("should always succeed"); + } + + savic_pfn + } + /// Returns the isolation type for the partition. pub fn isolation(&self) -> IsolationType { self.isolation diff --git a/openhcl/hcl/src/ioctl/snp.rs b/openhcl/hcl/src/ioctl/snp.rs index 3d5d8c2eb6..dd48dd0daf 100644 --- a/openhcl/hcl/src/ioctl/snp.rs +++ b/openhcl/hcl/src/ioctl/snp.rs @@ -25,12 +25,14 @@ use sidecar_client::SidecarVp; use std::cell::UnsafeCell; use std::os::fd::AsRawFd; use thiserror::Error; +use x86defs::snp::SevAvicPage; use x86defs::snp::SevRmpAdjust; use x86defs::snp::SevVmsa; /// Runner backing for SNP partitions. pub struct Snp<'a> { vmsa: VtlArray<&'a UnsafeCell, 2>, + avic_pages: VtlArray<&'a UnsafeCell, 2>, } /// Error returned by failing SNP operations. @@ -176,11 +178,21 @@ impl MshvVtl { impl<'a> super::private::BackingPrivate<'a> for Snp<'a> { fn new(vp: &'a HclVp, sidecar: Option<&SidecarVp<'_>>, _hcl: &Hcl) -> Result { assert!(sidecar.is_none()); - let super::BackingState::Snp { vmsa } = &vp.backing else { + let super::BackingState::Snp { + vtl0_apic_page, + vtl1_apic_page, + vmsa, + } = &vp.backing + else { return Err(NoRunner::MismatchedIsolation); }; + // SAFETY: The mapping is held for the appropriate lifetime, and the + // APIC page is never accessed as any other type, or by any other location. + let vtl1_apic_page = unsafe { &*vtl1_apic_page.base().cast() }; + Ok(Self { + avic_pages: [vtl0_apic_page.as_ref(), vtl1_apic_page].into(), vmsa: vmsa.each_ref().map(|mp| mp.as_ref()), }) } @@ -242,4 +254,50 @@ impl<'a> ProcessorRunner<'a, Snp<'a>> { }) .into_inner() } + + /// Gets a PFN of the VTL0 secure AVIC page. + /// TODO: Maybe there is a better way other than passing `cpu_index`` here. + pub fn secure_avic_vtl0_pfn(&self, cpu_index: u32) -> u64 { + self.hcl.secure_avic_vtl0_pfn(cpu_index) + } + + /// Gets a reference to the secure AVIC page for the given VTL. + pub fn secure_avic_page(&self, vtl: GuestVtl) -> &SevAvicPage { + // SAFETY: the APIC pages will not be concurrently accessed by the processor + // while this VP is in VTL2. + unsafe { &*self.state.avic_pages[vtl].get() } + } + + /// Gets a mutable reference to the secure AVIC page for the given VTL. + pub fn secure_avic_page_mut(&mut self, vtl: GuestVtl) -> &mut SevAvicPage { + // SAFETY: the AVIC pages will not be concurrently accessed by the processor + // while this VP is in VTL2. + unsafe { &mut *self.state.avic_pages[vtl].get() } + } + + /// Gets a mutable reference to the secure AVIC page for the given VTL. + pub fn secure_avic_page_vmsa_mut( + &mut self, + vtl: GuestVtl, + ) -> (&mut SevAvicPage, VmsaWrapper<'_, &mut SevVmsa>) { + // SAFETY: the AVIC pages will not be concurrently accessed by the processor + // while this VP is in VTL2. + let avic_page = unsafe { &mut *self.state.avic_pages[vtl].get() }; + let vmsa = self.vmsa_mut(vtl); + + (avic_page, vmsa) + } + + /// Gets a mutable reference to the secure AVIC page and the proxy_irr_exit field + pub fn secure_avic_page_proxy_irr_exit_vtl0_mut( + &mut self, + ) -> (&mut SevAvicPage, &mut [u32; 8]) { + // SAFETY: the AVIC pages will not be concurrently accessed by the processor + // while this VP is in VTL2. + let avic_page = unsafe { &mut *self.state.avic_pages[GuestVtl::Vtl0].get() }; + // SAFETY: The `proxy_irr_exit` field of the run page will not be concurrently updated. + let proxy_irr_vtl0 = unsafe { &mut (*self.run.get()).proxy_irr_exit }; + + (avic_page, proxy_irr_vtl0) + } } diff --git a/openhcl/hcl/src/ioctl/tdx.rs b/openhcl/hcl/src/ioctl/tdx.rs index 33604284a7..2f7f0fe327 100644 --- a/openhcl/hcl/src/ioctl/tdx.rs +++ b/openhcl/hcl/src/ioctl/tdx.rs @@ -39,12 +39,12 @@ use x86defs::tdx::TdxGp; use x86defs::tdx::TdxL2Ctls; use x86defs::tdx::TdxL2EnterGuestState; use x86defs::tdx::TdxVmFlags; -use x86defs::vmx::ApicPage; use x86defs::vmx::VmcsField; +use x86defs::vmx::VmxApicPage; /// Runner backing for TDX partitions. pub struct Tdx<'a> { - apic_pages: VtlArray<&'a UnsafeCell, 2>, + apic_pages: VtlArray<&'a UnsafeCell, 2>, } impl MshvVtl { @@ -123,14 +123,14 @@ impl<'a> ProcessorRunner<'a, Tdx<'a>> { } /// Gets a reference to the tdx APIC page for the given VTL. - pub fn tdx_apic_page(&self, vtl: GuestVtl) -> &ApicPage { + pub fn tdx_apic_page(&self, vtl: GuestVtl) -> &VmxApicPage { // SAFETY: the APIC pages will not be concurrently accessed by the processor // while this VP is in VTL2. unsafe { &*self.state.apic_pages[vtl].get() } } /// Gets a mutable reference to the tdx APIC page for the given VTL. - pub fn tdx_apic_page_mut(&mut self, vtl: GuestVtl) -> &mut ApicPage { + pub fn tdx_apic_page_mut(&mut self, vtl: GuestVtl) -> &mut VmxApicPage { // SAFETY: the APIC pages will not be concurrently accessed by the processor // while this VP is in VTL2. unsafe { &mut *self.state.apic_pages[vtl].get() } diff --git a/openhcl/hcl/src/vmsa.rs b/openhcl/hcl/src/vmsa.rs index 85de8f9d6c..4c5803eabf 100644 --- a/openhcl/hcl/src/vmsa.rs +++ b/openhcl/hcl/src/vmsa.rs @@ -7,6 +7,7 @@ use std::array; use std::ops::Deref; use std::ops::DerefMut; +use x86defs::snp::SecureAvicControl; use x86defs::snp::SevEventInjectInfo; use x86defs::snp::SevFeatures; use x86defs::snp::SevSelector; @@ -267,6 +268,11 @@ reg_direct_mut!(v_intr_cntrl, v_intr_cntrl_mut, SevVirtualInterruptControl); reg_direct!(virtual_tom, set_virtual_tom, u64); reg_direct!(event_inject, set_event_inject, SevEventInjectInfo); reg_direct!(guest_error_code, set_guest_error_code, u64); +reg_direct_mut!( + secure_avic_control, + secure_avic_control_mut, + SecureAvicControl +); regss!(es, set_es); regss!(cs, set_cs); regss!(ss, set_ss); diff --git a/openhcl/virt_mshv_vtl/src/cvm_cpuid/snp.rs b/openhcl/virt_mshv_vtl/src/cvm_cpuid/snp.rs index 1781549e36..4380923872 100644 --- a/openhcl/virt_mshv_vtl/src/cvm_cpuid/snp.rs +++ b/openhcl/virt_mshv_vtl/src/cvm_cpuid/snp.rs @@ -215,6 +215,7 @@ impl CpuidArchInitializer for SnpCpuidInitializer { .with_vmpl(true) .with_rmp_query(true) .with_tsc_aux_virtualization(true) + .with_secure_avic(true) .into(), cpuid::ExtendedSevFeaturesEbx::new() .with_cbit_position(0x3f) @@ -430,12 +431,21 @@ impl CpuidArchInitializer for SnpCpuidInitializer { .with_use_ex_processor_masks(true) // If only xAPIC is supported, then the Hyper-V MSRs are // more efficient for EOIs. + // // If X2APIC is supported, then we can use the X2APIC MSRs. These // are as efficient as the Hyper-V MSRs, and they are // compatible with APIC hardware offloads. // However, Lazy EOI on SNP is beneficial and requires the // Hyper-V MSRs to function. Enable it here always. - .with_use_apic_msrs(true) + // + // When Secure AVIC is enabled, x2APIC MSR accesses are + // not intercepted and are always intercepted when Secure AVIC is + // disabled (15.36.21.5 Guest APIC Accesses). As the production + // scenario is to have Secure AVIC enabled, we can disable the use + // of Hyper-V MSRs to save some performance overhead of intercept processing. + // + // Secure AVIC accelerates EOIs (15.36.21.5 Guest APIC Accesses). + .with_use_apic_msrs(false) .with_long_spin_wait_count(!0) .with_use_hypercall_for_remote_flush_and_local_flush_entire(true) .with_use_synthetic_cluster_ipi(true); diff --git a/openhcl/virt_mshv_vtl/src/processor/snp/mod.rs b/openhcl/virt_mshv_vtl/src/processor/snp/mod.rs index 2da625b9ff..2603b7dd33 100644 --- a/openhcl/virt_mshv_vtl/src/processor/snp/mod.rs +++ b/openhcl/virt_mshv_vtl/src/processor/snp/mod.rs @@ -22,11 +22,13 @@ use crate::UhCvmVpState; use crate::UhPartitionInner; use crate::UhPartitionNewParams; use crate::WakeReason; +use crate::devmsr; use crate::processor::UhHypercallHandler; use crate::processor::UhProcessor; use crate::processor::hardware_cvm::apic::ApicBacking; use cvm_tracing::CVM_ALLOWED; use cvm_tracing::CVM_CONFIDENTIAL; +use hcl::protocol::hcl_intr_offload_flags; use hcl::vmsa::VmsaWrapper; use hv1_emulator::hv::ProcessorVtlHv; use hv1_emulator::synic::ProcessorSynic; @@ -68,7 +70,14 @@ use virt_support_x86emu::emulate::emulate_translate_gva; use virt_support_x86emu::translate::TranslationRegisters; use vmcore::vmtime::VmTimeAccess; use x86defs::RFlags; +use x86defs::apic::X2APIC_MSR_BASE; use x86defs::cpuid::CpuidFunction; +use x86defs::snp::SecureAvicControl; +use x86defs::snp::SevAvicIncompleteIpiInfo1; +use x86defs::snp::SevAvicIncompleteIpiInfo2; +use x86defs::snp::SevAvicNoAccelInfo; +use x86defs::snp::SevAvicPage; +use x86defs::snp::SevAvicRegisterNumber; use x86defs::snp::SevEventInjectInfo; use x86defs::snp::SevExitCode; use x86defs::snp::SevInvlpgbEcx; @@ -120,6 +129,7 @@ struct GeneralStats { #[derive(Inspect, Default)] struct ExitStats { automatic_exit: Counter, + bus_lock: Counter, cpuid: Counter, hlt: Counter, intr: Counter, @@ -137,6 +147,8 @@ struct ExitStats { xsetbv: Counter, excp_db: Counter, secure_reg_write: Counter, + avic_no_accel: Counter, + avic_incomplete_ipi: Counter, } enum UhDirectOverlay { @@ -352,7 +364,7 @@ impl HardwareIsolatedBacking for SnpBacked { check_rflags: bool, dev: &impl CpuIo, ) -> bool { - let vmsa = this.runner.vmsa_mut(vtl); + let (avic_page, vmsa) = this.runner.secure_avic_page_vmsa_mut(vtl); if vmsa.event_inject().valid() && vmsa.event_inject().interruption_type() == x86defs::snp::SEV_INTR_TYPE_NMI { @@ -365,6 +377,7 @@ impl HardwareIsolatedBacking for SnpBacked { .access(&mut SnpApicClient { partition: this.partition, vmsa, + avic_page, dev, vmtime: &this.vmtime, vtl, @@ -400,6 +413,7 @@ pub struct SnpBackedShared { tsc_aux_virtualized: bool, #[inspect(debug)] sev_status: SevStatusMsr, + secure_avic: bool, } impl SnpBackedShared { @@ -414,12 +428,21 @@ impl SnpBackedShared { .result(CpuidFunction::ExtendedAddressSpaceSizes.0, 0, &[0; 4])[3], ) .invlpgb_count_max(); - let tsc_aux_virtualized = x86defs::cpuid::ExtendedSevFeaturesEax::from( + let extended_sev_features = x86defs::cpuid::ExtendedSevFeaturesEax::from( params .cpuid .result(CpuidFunction::ExtendedSevFeatures.0, 0, &[0; 4])[0], - ) - .tsc_aux_virtualization(); + ); + let tsc_aux_virtualized = extended_sev_features.tsc_aux_virtualization(); + + let msr = devmsr::MsrDevice::new(0).expect("open msr"); + let secure_avic = + SevStatusMsr::from(msr.read_msr(x86defs::X86X_AMD_MSR_SEV).expect("read msr")) + .secure_avic(); + + if secure_avic { + tracing::info!("Secure AVIC is available"); + } // Query the SEV_FEATURES MSR to determine the features enabled on VTL2's VMSA // and use that to set btb_isolation, prevent_host_ibs, and VMSA register protection. @@ -432,6 +455,7 @@ impl SnpBackedShared { sev_status, invlpgb_count_max, tsc_aux_virtualized, + secure_avic, cvm, }) } @@ -528,6 +552,47 @@ impl BackingPrivate for SnpBacked { this.runner .set_vp_registers_hvcall(Vtl::Vtl0, values) .expect("set_vp_registers hypercall for direct overlays should succeed"); + + let using_secure_avic = this + .runner + .vmsa(GuestVtl::Vtl0) + .sev_features() + .secure_avic(); + tracing::debug!(?using_secure_avic, "Using secure AVIC for VTL0"); + + if using_secure_avic { + // Specification: "SEV-ES Guest-Hypervisor Communication Block Standartization", + // 4.1.16.1 "Backing page support". + + let vtl0_avic_pfn = this.runner.secure_avic_vtl0_pfn(this.inner.cpu_index); + let mut vmsa = this.runner.vmsa_mut(GuestVtl::Vtl0); + let savic_ctrl = SecureAvicControl::from(vmsa.secure_avic_control()) + .with_secure_avic_en(1) + .with_guest_apic_backing_page_ptr(vtl0_avic_pfn); + *(vmsa.secure_avic_control_mut()) = savic_ctrl; + + this.set_apic_offload(GuestVtl::Vtl0, true); + + // Let the hypervisor know so it can improve performance. + + this.runner + .set_vp_register( + GuestVtl::Vtl0, + HvX64RegisterName::SevAvicGpa, + savic_ctrl.into_bits().into(), + ) + .expect("can communicate with the hypervisor"); + } + + // No secure AVIC for VTL 1. + assert!( + !this + .runner + .vmsa(GuestVtl::Vtl1) + .sev_features() + .secure_avic() + ); + this.set_apic_offload(GuestVtl::Vtl1, false); } type StateAccess<'p, 'a> @@ -552,10 +617,52 @@ impl BackingPrivate for SnpBacked { } fn poll_apic(this: &mut UhProcessor<'_, Self>, vtl: GuestVtl, scan_irr: bool) { + // TODO: If the APIC is offloaded, we need to process the IRRs + // from the offloaded page. + // Clear any pending interrupt. this.runner.vmsa_mut(vtl).v_intr_cntrl_mut().set_irq(false); - hardware_cvm::apic::poll_apic_core(this, vtl, scan_irr) + hardware_cvm::apic::poll_apic_core(this, vtl, scan_irr); + + // TODO: handle TMRs. + if this.backing.cvm.lapics[vtl].lapic.is_offloaded() { + debug_assert!(vtl == GuestVtl::Vtl0); + + match this.backing.cvm.lapics[vtl] + .lapic + .push_to_offload(|irr, isr, tmr| { + let (apic_page, proxy_irr_vtl0) = + this.runner.secure_avic_page_proxy_irr_exit_vtl0_mut(); + + for (((((irr, page_irr), isr), page_isr), tmr), proxy_irr_vtl0) in irr + .iter() + .zip(&mut apic_page.irr) + .zip(isr) + .zip(&mut apic_page.isr) + .zip(tmr) + .zip(proxy_irr_vtl0) + { + page_irr.value |= *irr; + page_isr.value |= *isr; + *proxy_irr_vtl0 = *tmr; + } + }) { + Ok(_) => {} + Err(_) => todo!(), + } + + // If there is a pending interrupt, clear the halted and idle state. + // TODO SNP: There are few other bits to take into account, such as the VintCtrl.GIF + // and the RFLAGS.IF ones as well as running in the interrupt shadow. + // Shouldn't be of concern for now as the guests account for these. + if matches!( + this.backing.cvm.lapics[vtl].activity, + MpState::Halted | MpState::Idle + ) { + this.backing.cvm.lapics[vtl].activity = MpState::Running; + } + } } fn request_extint_readiness(_this: &mut UhProcessor<'_, Self>) { @@ -633,6 +740,41 @@ impl BackingPrivate for SnpBacked { } } +impl UhProcessor<'_, SnpBacked> { + // TODO: cribbed from the TDX code. + // Try to avoid duplication. + fn access_apic_without_offload( + &mut self, + vtl: GuestVtl, + f: impl FnOnce(&mut Self) -> R, + ) -> R { + let offloaded = self.backing.cvm.lapics[vtl].lapic.is_offloaded(); + self.set_apic_offload(vtl, false); + let r = f(self); + self.set_apic_offload(vtl, offloaded); + r + } + + fn set_apic_offload(&mut self, vtl: GuestVtl, offload: bool) { + let offloaded = self.backing.cvm.lapics[vtl].lapic.is_offloaded(); + if !offload { + if offloaded { + debug_assert!(vtl == GuestVtl::Vtl0); + + let (irr, isr) = pull_apic_offload(self.runner.secure_avic_page_mut(vtl)); + self.backing.cvm.lapics[vtl] + .lapic + .disable_offload(&irr, &isr); + } + } else { + debug_assert!(vtl == GuestVtl::Vtl0); + if !offloaded { + self.backing.cvm.lapics[vtl].lapic.enable_offload(); + } + } + } +} + fn virt_seg_to_snp(val: SegmentRegister) -> SevSelector { SevSelector { selector: val.selector, @@ -690,9 +832,24 @@ fn init_vmsa( // handle injection and intercepts using trustworthy information. vmsa.sev_features_mut().set_alternate_injection(true); vmsa.sev_features_mut().set_reflect_vc(true); - vmsa.v_intr_cntrl_mut().set_guest_busy(true); vmsa.sev_features_mut().set_debug_swap(true); + // 15.36.16 Interrupt Injection Restrictions + // + // In guests that run with Alternate Injection, bit 63 of the encrypted VIntrCtrl field is defined as a BUSY + // bit. On VMRUN, if VIntrCtrl[BUSY] is set to 1, then the VMRUN fails with a VMEXIT_BUSY error + // code. The BUSY bit enables a VMSA to be temporarily marked non-runnable while software + // modifications are in progress. + if vmsa.sev_features().alternate_injection() { + vmsa.v_intr_cntrl_mut().set_guest_busy(true); + } + + if vtl == GuestVtl::Vtl0 && sev_status.secure_avic() { + vmsa.sev_features_mut().set_secure_avic(true); + vmsa.sev_features_mut().set_guest_intercept_control(true); + vmsa.sev_features_mut().set_alternate_injection(false); + } + let vmpl = match vtl { GuestVtl::Vtl0 => Vmpl::Vmpl2, GuestVtl::Vtl1 => Vmpl::Vmpl1, @@ -711,6 +868,7 @@ fn init_vmsa( struct SnpApicClient<'a, T> { partition: &'a UhPartitionInner, vmsa: VmsaWrapper<'a, &'a mut SevVmsa>, + avic_page: &'a mut SevAvicPage, dev: &'a T, vmtime: &'a VmTimeAccess, vtl: GuestVtl, @@ -743,10 +901,26 @@ impl ApicClient for SnpApicClient<'_, T> { } fn pull_offload(&mut self) -> ([u32; 8], [u32; 8]) { - unreachable!() + assert_eq!(self.vtl, GuestVtl::Vtl0); + pull_apic_offload(self.avic_page) } } +fn pull_apic_offload(page: &mut SevAvicPage) -> ([u32; 8], [u32; 8]) { + let mut irr = [0; 8]; + let mut isr = [0; 8]; + for (((irr, page_irr), isr), page_isr) in irr + .iter_mut() + .zip(page.irr.iter_mut()) + .zip(isr.iter_mut()) + .zip(page.isr.iter_mut()) + { + *irr = std::mem::take(&mut page_irr.value); + *isr = std::mem::take(&mut page_isr.value); + } + (irr, isr) +} + impl UhHypercallHandler<'_, '_, T, SnpBacked> { // Trusted hypercalls from the guest. const TRUSTED_DISPATCHER: hv1_hypercall::Dispatcher = hv1_hypercall::dispatcher!( @@ -1020,12 +1194,13 @@ impl UhProcessor<'_, SnpBacked> { entered_from_vtl: GuestVtl, msr: u32, is_write: bool, + is_fault: bool, ) { if is_write && self.cvm_try_protect_msr_write(entered_from_vtl, msr) { return; } - let vmsa = self.runner.vmsa_mut(entered_from_vtl); + let (avic_page, vmsa) = self.runner.secure_avic_page_vmsa_mut(entered_from_vtl); let gp = if is_write { let value = (vmsa.rax() as u32 as u64) | ((vmsa.rdx() as u32 as u64) << 32); @@ -1034,6 +1209,7 @@ impl UhProcessor<'_, SnpBacked> { .access(&mut SnpApicClient { partition: self.partition, vmsa, + avic_page, dev, vmtime: &self.vmtime, vtl: entered_from_vtl, @@ -1056,6 +1232,7 @@ impl UhProcessor<'_, SnpBacked> { .access(&mut SnpApicClient { partition: self.partition, vmsa, + avic_page, dev, vmtime: &self.vmtime, vtl: entered_from_vtl, @@ -1093,7 +1270,9 @@ impl UhProcessor<'_, SnpBacked> { .with_valid(true), ); } else { - advance_to_next_instruction(&mut vmsa); + if is_fault { + advance_to_next_instruction(&mut vmsa); + } } } @@ -1195,22 +1374,42 @@ impl UhProcessor<'_, SnpBacked> { let mut vmsa = self.runner.vmsa_mut(next_vtl); let last_interrupt_ctrl = vmsa.v_intr_cntrl(); - if vmsa.sev_features().alternate_injection() { - vmsa.v_intr_cntrl_mut().set_guest_busy(false); - } + // OpenHCL runs with: + // * the alternate interrupt injection, and the busy bit is used by the software to + // disallow running the VMSA, OR + // * the secure AVIC, where the hardware might set the busy bit on exits + // ("15.36.16 Interrupt Injection Restrictions", "15.36.21.5 Guest APIC Accesses") + // Clear the guest busy bit uncoditionally as the prerequisites are met in either case. + vmsa.v_intr_cntrl_mut().set_guest_busy(false); self.unlock_tlb_lock(Vtl::Vtl2); let tlb_halt = self.should_halt_for_tlb_unlock(next_vtl); - let halt = self.backing.cvm.lapics[next_vtl].activity != MpState::Running || tlb_halt; + // If we are halted in the kernel due to hlt or idle, and we receive an interrupt + // we'd like to unhalt, inject the interrupt, and resume vtl0 without returning to + // user-mode. To enable this, the kernel must know why are are halted + let activity = self.backing.cvm.lapics[next_vtl].activity; + let kernel_known_state = + matches!(activity, MpState::Running | MpState::Halted | MpState::Idle); + let halted_other = tlb_halt || !kernel_known_state; + + self.runner.set_halted(halt); + self.runner.set_exit_vtl(next_vtl); + if halt && next_vtl == GuestVtl::Vtl1 && !tlb_halt { tracelimit::warn_ratelimited!(CVM_ALLOWED, "halting VTL 1, which might halt the guest"); } - self.runner.set_halted(halt); - - self.runner.set_exit_vtl(next_vtl); + let x2apic_enabled = self.backing.cvm.lapics[next_vtl].lapic.x2apic_enabled(); + let offload_enabled = self.backing.cvm.lapics[next_vtl].lapic.can_offload_irr(); + let offload_flags = hcl_intr_offload_flags::new() + .with_offload_intr_inject(offload_enabled) + .with_offload_x2apic(offload_enabled && x2apic_enabled) + .with_halted_other(halted_other) + .with_halted_hlt(activity == MpState::Halted) + .with_halted_idle(activity == MpState::Idle); + *self.runner.offload_flags_mut() = offload_flags; // Set the lazy EOI bit just before running. let lazy_eoi = self.sync_lazy_eoi(next_vtl); @@ -1221,7 +1420,7 @@ impl UhProcessor<'_, SnpBacked> { .map_err(|e| dev.fatal_error(SnpRunVpError(e).into()))?; let entered_from_vtl = next_vtl; - let mut vmsa = self.runner.vmsa_mut(entered_from_vtl); + let (avic_page, mut vmsa) = self.runner.secure_avic_page_vmsa_mut(entered_from_vtl); // TODO SNP: The guest busy bit needs to be tested and set atomically. let inject = if vmsa.sev_features().alternate_injection() { @@ -1258,7 +1457,15 @@ impl UhProcessor<'_, SnpBacked> { None } } else { - unimplemented!("Only alternate injection is supported for SNP") + // "15.36 Secure Nested Paging (SEV-SNP" + // "15.36.21 Secure AVIC" + // "15.36.21.2 VMRUN and #VMEXIT" + // The processor reinjects events automatically. + assert!( + vmsa.sev_features().secure_avic(), + "secure AVIC must be enabled" + ); + None }; if let Some(inject) = inject { @@ -1272,6 +1479,8 @@ impl UhProcessor<'_, SnpBacked> { self.backing.general_stats[entered_from_vtl] .int_ack .increment(); + // TODO: Account for the offloaded state. + // The guest has acknowledged the interrupt. self.backing.cvm.lapics[entered_from_vtl] .lapic @@ -1287,6 +1496,7 @@ impl UhProcessor<'_, SnpBacked> { .access(&mut SnpApicClient { partition: self.partition, vmsa, + avic_page, dev, vmtime: &self.vmtime, vtl: entered_from_vtl, @@ -1314,8 +1524,8 @@ impl UhProcessor<'_, SnpBacked> { SevExitCode::MSR => { let is_write = vmsa.exit_info1() & 1 != 0; let msr = vmsa.rcx() as u32; - - self.handle_msr_access(dev, entered_from_vtl, msr, is_write); + let is_fault = true; + self.handle_msr_access(dev, entered_from_vtl, msr, is_write, is_fault); if is_write { &mut self.backing.exit_stats[entered_from_vtl].msr_write @@ -1529,12 +1739,18 @@ impl UhProcessor<'_, SnpBacked> { | SevExitCode::PAUSE | SevExitCode::SMI | SevExitCode::VMGEXIT - | SevExitCode::BUSLOCK | SevExitCode::IDLE_HLT => { // Ignore intercept processing if the guest exited due to an automatic exit. &mut self.backing.exit_stats[entered_from_vtl].automatic_exit } + SevExitCode::BUSLOCK => { + // The guest performs a misaligned atomic operation, + // or updating A/D bits in the PTEs. Might help in investigating + // performance issues. + &mut self.backing.exit_stats[entered_from_vtl].bus_lock + } + SevExitCode::VINTR => { // Receipt of a virtual interrupt intercept indicates that a virtual interrupt is ready // for injection but injection cannot complete due to the intercept. Rewind the pending @@ -1591,6 +1807,129 @@ impl UhProcessor<'_, SnpBacked> { &mut self.backing.exit_stats[entered_from_vtl].secure_reg_write } + SevExitCode::AVIC_NOACCEL => { + let no_accel_info = SevAvicNoAccelInfo::from(vmsa.exit_info1()); + tracing::debug!("AVIC no acceleration SEV exit: {no_accel_info:x?}"); + + assert!( + matches!( + no_accel_info.apic_register_number(), + SevAvicRegisterNumber::APIC_ID + | SevAvicRegisterNumber::VERSION + | SevAvicRegisterNumber::TPR + | SevAvicRegisterNumber::APR + | SevAvicRegisterNumber::PPR + | SevAvicRegisterNumber::EOI + | SevAvicRegisterNumber::REMOTE_READ + | SevAvicRegisterNumber::LDR + | SevAvicRegisterNumber::DFR + | SevAvicRegisterNumber::SPURIOUS + | SevAvicRegisterNumber::ISR0 + | SevAvicRegisterNumber::ISR1 + | SevAvicRegisterNumber::ISR2 + | SevAvicRegisterNumber::ISR3 + | SevAvicRegisterNumber::ISR4 + | SevAvicRegisterNumber::ISR5 + | SevAvicRegisterNumber::ISR6 + | SevAvicRegisterNumber::ISR7 + | SevAvicRegisterNumber::TMR0 + | SevAvicRegisterNumber::TMR1 + | SevAvicRegisterNumber::TMR2 + | SevAvicRegisterNumber::TMR3 + | SevAvicRegisterNumber::TMR4 + | SevAvicRegisterNumber::TMR5 + | SevAvicRegisterNumber::TMR6 + | SevAvicRegisterNumber::TMR7 + | SevAvicRegisterNumber::IRR0 + | SevAvicRegisterNumber::IRR1 + | SevAvicRegisterNumber::IRR2 + | SevAvicRegisterNumber::IRR3 + | SevAvicRegisterNumber::IRR4 + | SevAvicRegisterNumber::IRR5 + | SevAvicRegisterNumber::IRR6 + | SevAvicRegisterNumber::IRR7 + | SevAvicRegisterNumber::ERROR + | SevAvicRegisterNumber::ICR_LOW + | SevAvicRegisterNumber::ICR_HIGH + | SevAvicRegisterNumber::TIMER_LVT + | SevAvicRegisterNumber::THERMAL_LVT + | SevAvicRegisterNumber::PERFMON_LVT + | SevAvicRegisterNumber::LINT0_LVT + | SevAvicRegisterNumber::LINT1_LVT + | SevAvicRegisterNumber::ERROR_LVT + | SevAvicRegisterNumber::INITIAL_COUNT + | SevAvicRegisterNumber::CURRENT_COUNT + | SevAvicRegisterNumber::DIVIDER + | SevAvicRegisterNumber::SELF_IPI + ), + "unexpected AVIC register number {:#x?}", + no_accel_info.apic_register_number() + ); + + // Might be a fault (where the hardware doesn't advance the + // instruction pointer) or a trap (where the hardware + // advances the instruction pointer). + let is_write = no_accel_info.write_access(); + let is_fault = matches!( + no_accel_info.apic_register_number(), + SevAvicRegisterNumber::VERSION + | SevAvicRegisterNumber::APR + | SevAvicRegisterNumber::PPR + | SevAvicRegisterNumber::ISR0 + | SevAvicRegisterNumber::ISR1 + | SevAvicRegisterNumber::ISR2 + | SevAvicRegisterNumber::ISR3 + | SevAvicRegisterNumber::ISR4 + | SevAvicRegisterNumber::ISR5 + | SevAvicRegisterNumber::ISR6 + | SevAvicRegisterNumber::ISR7 + | SevAvicRegisterNumber::TMR0 + | SevAvicRegisterNumber::TMR1 + | SevAvicRegisterNumber::TMR2 + | SevAvicRegisterNumber::TMR3 + | SevAvicRegisterNumber::TMR4 + | SevAvicRegisterNumber::TMR5 + | SevAvicRegisterNumber::TMR6 + | SevAvicRegisterNumber::TMR7 + | SevAvicRegisterNumber::IRR0 + | SevAvicRegisterNumber::IRR1 + | SevAvicRegisterNumber::IRR2 + | SevAvicRegisterNumber::IRR3 + | SevAvicRegisterNumber::IRR4 + | SevAvicRegisterNumber::IRR5 + | SevAvicRegisterNumber::IRR6 + | SevAvicRegisterNumber::IRR7 + | SevAvicRegisterNumber::CURRENT_COUNT + ); + let msr = X2APIC_MSR_BASE + no_accel_info.apic_register_number().0; + self.handle_msr_access(dev, entered_from_vtl, msr, is_write, is_fault); + + &mut self.backing.exit_stats[entered_from_vtl].avic_no_accel + } + + SevExitCode::AVIC_INCOMPLETE_IPI => { + let ipi_info1 = SevAvicIncompleteIpiInfo1::from(vmsa.exit_info1()); + let ipi_info2 = SevAvicIncompleteIpiInfo2::from(vmsa.exit_info2()); + let icr = x86defs::apic::Icr::from_bits(vmsa.exit_info1()); + + tracing::debug!( + "AVIC incomplete IPI SEV exit: {ipi_info1:x?} {ipi_info2:x?}, {icr:x?}" + ); + + // This a trap, and the hardware has already advanced the instruction pointer: + // "15.36.21.5 Guest APIC Accesses". + let is_fault = false; + let is_write = true; + let msr = X2APIC_MSR_BASE + x86defs::apic::ApicRegister::ICR0.0 as u32; + + // As the ICR is accessed through the `wrmsr` instruction (secure AVIC allows only + // the x2 APIC access), we already have `rax` and `rdx` set to the desired value by + // the guest. + self.handle_msr_access(dev, entered_from_vtl, msr, is_write, is_fault); + + &mut self.backing.exit_stats[entered_from_vtl].avic_incomplete_ipi + } + _ => { tracing::error!( CVM_CONFIDENTIAL, @@ -1849,11 +2188,13 @@ impl X86EmulatorSupport for UhEmulationState<'_, '_, T, SnpBacked> { fn lapic_read(&mut self, address: u64, data: &mut [u8]) { let vtl = self.vtl; + let (avic_page, vmsa) = self.vp.runner.secure_avic_page_vmsa_mut(vtl); self.vp.backing.cvm.lapics[vtl] .lapic .access(&mut SnpApicClient { partition: self.vp.partition, - vmsa: self.vp.runner.vmsa_mut(vtl), + vmsa, + avic_page, dev: self.devices, vmtime: &self.vp.vmtime, vtl, @@ -1863,11 +2204,13 @@ impl X86EmulatorSupport for UhEmulationState<'_, '_, T, SnpBacked> { fn lapic_write(&mut self, address: u64, data: &[u8]) { let vtl = self.vtl; + let (avic_page, vmsa) = self.vp.runner.secure_avic_page_vmsa_mut(vtl); self.vp.backing.cvm.lapics[vtl] .lapic .access(&mut SnpApicClient { partition: self.vp.partition, - vmsa: self.vp.runner.vmsa_mut(vtl), + vmsa, + avic_page, dev: self.devices, vmtime: &self.vp.vmtime, vtl, @@ -2093,15 +2436,20 @@ impl AccessVpState for UhVpStateAccess<'_, '_, SnpBacked> { } fn apic(&mut self) -> Result { - Ok(self.vp.backing.cvm.lapics[self.vtl].lapic.save()) + self.vp.access_apic_without_offload(self.vtl, |vp| { + Ok(vp.backing.cvm.lapics[self.vtl].lapic.save()) + }) } fn set_apic(&mut self, value: &vp::Apic) -> Result<(), Self::Error> { - self.vp.backing.cvm.lapics[self.vtl] - .lapic - .restore(value) - .map_err(vp_state::Error::InvalidApicBase)?; - Ok(()) + self.vp.access_apic_without_offload(self.vtl, |vp| { + vp.backing.cvm.lapics[self.vtl] + .lapic + .restore(value) + .map_err(vp_state::Error::InvalidApicBase)?; + + Ok(()) + }) } fn xcr(&mut self) -> Result { @@ -2317,9 +2665,36 @@ impl AccessVpState for UhVpStateAccess<'_, '_, SnpBacked> { } } -/// Advances rip to be the same as next_rip. +/// Advances the instruction pointer. +/// +/// The hardware may have provided the next instruction pointer in the VMSA, so we +/// use that if available. That is always the case for the automatic exits (exit on #VC +/// when ReflectVC is set in VMSA SEV features). If the hypervisor interaction si not +/// required, there would be no #VC exit, and the next instruction pointer would be +/// not populated by the hardware. See +/// * 15.35.4 Types of Exits +/// * 15.35.5 #VC Exception +/// in the AMD PPR for more details. fn advance_to_next_instruction(vmsa: &mut VmsaWrapper<'_, &mut SevVmsa>) { - vmsa.set_rip(vmsa.next_rip()); + match SevExitCode(vmsa.guest_error_code()) { + SevExitCode::AVIC_NOACCEL => { + // Access is performed via WRMSR/RDMSR, + // no next RIP is provided. + vmsa.set_rip(vmsa.rip() + 2); + } + _ => vmsa.set_rip(vmsa.next_rip()), + } + + // TODO SNP: provide the precise implementaion for + // the next instruction pointer. For now, as a heuristic, we + // we report on `0`'s in the rip field. The guest would need to + // execute an instruction at the top of the VA space to make the + // insrtruction pointer wrap around to `0` or fault at `0` -- + // seems unlikely. + if vmsa.rip() == 0 { + tracing::warn!("rip is zero, might need to parse the instruction stream"); + } + vmsa.v_intr_cntrl_mut().set_intr_shadow(false); } diff --git a/openhcl/virt_mshv_vtl/src/processor/tdx/mod.rs b/openhcl/virt_mshv_vtl/src/processor/tdx/mod.rs index 116d195e5b..8f979f62db 100644 --- a/openhcl/virt_mshv_vtl/src/processor/tdx/mod.rs +++ b/openhcl/virt_mshv_vtl/src/processor/tdx/mod.rs @@ -79,6 +79,7 @@ use virt_support_x86emu::emulate::emulate_io; use virt_support_x86emu::emulate::emulate_translate_gva; use virt_support_x86emu::translate::TranslationRegisters; use vmcore::vmtime::VmTimeAccess; +use x86defs::ApicRegister; use x86defs::RFlags; use x86defs::X64_CR0_ET; use x86defs::X64_CR0_NE; @@ -100,8 +101,6 @@ use x86defs::tdx::TdxGp; use x86defs::tdx::TdxInstructionInfo; use x86defs::tdx::TdxL2Ctls; use x86defs::tdx::TdxVpEnterRaxResult; -use x86defs::vmx::ApicPage; -use x86defs::vmx::ApicRegister; use x86defs::vmx::CR_ACCESS_TYPE_LMSW; use x86defs::vmx::CR_ACCESS_TYPE_MOV_TO_CR; use x86defs::vmx::CrAccessQualification; @@ -123,6 +122,7 @@ use x86defs::vmx::SecondaryProcessorControls; use x86defs::vmx::VMX_ENTRY_CONTROL_LONG_MODE_GUEST; use x86defs::vmx::VMX_FEATURE_CONTROL_LOCKED; use x86defs::vmx::VmcsField; +use x86defs::vmx::VmxApicPage; use x86defs::vmx::VmxEptExitQualification; use x86defs::vmx::VmxExit; use x86defs::vmx::VmxExitBasic; @@ -3447,7 +3447,7 @@ impl UhProcessor<'_, TdxBacked> { struct TdxApicClient<'a, T> { partition: &'a UhPartitionInner, - apic_page: &'a mut ApicPage, + apic_page: &'a mut VmxApicPage, dev: &'a T, vmtime: &'a VmTimeAccess, vtl: GuestVtl, @@ -3483,7 +3483,7 @@ impl ApicClient for TdxApicClient<'_, T> { } } -fn pull_apic_offload(page: &mut ApicPage) -> ([u32; 8], [u32; 8]) { +fn pull_apic_offload(page: &mut VmxApicPage) -> ([u32; 8], [u32; 8]) { let mut irr = [0; 8]; let mut isr = [0; 8]; for (((irr, page_irr), isr), page_isr) in irr diff --git a/vm/hv1/hvdef/src/lib.rs b/vm/hv1/hvdef/src/lib.rs index fd08bba304..bb7e82edd3 100644 --- a/vm/hv1/hvdef/src/lib.rs +++ b/vm/hv1/hvdef/src/lib.rs @@ -2329,6 +2329,8 @@ registers! { // AMD SEV configuration MSRs SevControl = 0x00090040, + SevGhcbGpa = 0x00090041, + SevAvicGpa = 0x00090043, CrInterceptControl = 0x000E0000, CrInterceptCr0Mask = 0x000E0001, @@ -3178,11 +3180,37 @@ pub struct HvRegisterVsmPartitionStatus { pub reserved: u64, } +open_enum! { + #[derive(IntoBytes, Immutable, KnownLayout, FromBytes)] + pub enum HvSnpInterruptInjection : u8 { + #![allow(non_upper_case_globals)] + HvSnpRestricted = 0x0, + HvSnpNormal = 0x1, + HvSnpAlternate = 0x2, + HvSnpSecureAvic = 0x3, + } +} + +// Support for bitfield structures. +impl HvSnpInterruptInjection { + const fn from_bits(val: u8) -> Self { + HvSnpInterruptInjection(val) + } + + const fn into_bits(self) -> u8 { + self.0 + } +} + #[bitfield(u64)] pub struct HvRegisterGuestVsmPartitionConfig { #[bits(4)] pub maximum_vtl: u8, - #[bits(60)] + #[bits(2)] + pub vtl0_interrupt_injection: HvSnpInterruptInjection, + #[bits(2)] + pub vtl1_interrupt_injection: HvSnpInterruptInjection, + #[bits(56)] pub reserved: u64, } @@ -3645,6 +3673,16 @@ pub struct HvX64RegisterSevControl { pub vmsa_gpa_page_number: u64, } +#[bitfield(u64)] +#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)] +pub struct HvX64RegisterSevAvic { + pub enable_secure_apic: bool, + #[bits(11)] + _rsvd1: u64, + #[bits(52)] + pub avic_gpa_page_number: u64, +} + #[bitfield(u64)] #[derive(IntoBytes, Immutable, KnownLayout, FromBytes)] pub struct HvRegisterReferenceTsc { diff --git a/vm/loader/igvmfilegen/src/file_loader.rs b/vm/loader/igvmfilegen/src/file_loader.rs index baae840fa1..3e1927f43e 100644 --- a/vm/loader/igvmfilegen/src/file_loader.rs +++ b/vm/loader/igvmfilegen/src/file_loader.rs @@ -14,6 +14,7 @@ use crate::vp_context_builder::VpContextBuilder; use crate::vp_context_builder::VpContextPageState; use crate::vp_context_builder::VpContextState; use crate::vp_context_builder::snp::InjectionType; +use crate::vp_context_builder::snp::SecureAvic; use crate::vp_context_builder::snp::SnpHardwareContext; use crate::vp_context_builder::tdx::TdxHardwareContext; use crate::vp_context_builder::vbs::VbsRegister; @@ -149,6 +150,7 @@ pub enum LoaderIsolationType { shared_gpa_boundary_bits: Option, policy: SnpPolicy, injection_type: InjectionType, + secure_avic: SecureAvic, // TODO SNP: SNP Keys? Other data? }, Tdx { @@ -201,6 +203,7 @@ impl IgvmLoaderRegister for X86Register { shared_gpa_boundary_bits, policy, injection_type, + secure_avic, } => { // TODO SNP: assumed that shared_gpa_boundary is always available. let shared_gpa_boundary = @@ -227,6 +230,7 @@ impl IgvmLoaderRegister for X86Register { !with_paravisor, shared_gpa_boundary, injection_type, + secure_avic, )); (platform_header, vec![init_header], vp_context_builder) @@ -900,6 +904,7 @@ impl ImageLoad for IgvmVtlLoader shared_gpa_boundary_bits, policy: _, injection_type: _, + secure_avic: _, } => IsolationConfig { paravisor_present: self.loader.paravisor_present, isolation_type: IsolationType::Snp, @@ -1260,6 +1265,7 @@ mod tests { shared_gpa_boundary_bits: Some(39), policy: SnpPolicy::from((0x1 << 17) | (0x1 << 16) | (0x1f)), injection_type: InjectionType::Restricted, + secure_avic: SecureAvic::Enabled, }, ); let data = vec![0, 5]; diff --git a/vm/loader/igvmfilegen/src/main.rs b/vm/loader/igvmfilegen/src/main.rs index ecee5cad8b..810271b909 100644 --- a/vm/loader/igvmfilegen/src/main.rs +++ b/vm/loader/igvmfilegen/src/main.rs @@ -25,6 +25,7 @@ use igvmfilegen_config::Image; use igvmfilegen_config::LinuxImage; use igvmfilegen_config::ResourceType; use igvmfilegen_config::Resources; +use igvmfilegen_config::SecureAvicType; use igvmfilegen_config::SnpInjectionType; use igvmfilegen_config::UefiConfigType; use loader::importer::Aarch64Register; @@ -181,6 +182,7 @@ fn create_igvm_file( policy, enable_debug, injection_type, + secure_avic, } => LoaderIsolationType::Snp { shared_gpa_boundary_bits, policy: SnpPolicy::from(policy).with_debug(enable_debug as u8), @@ -190,6 +192,10 @@ fn create_igvm_file( vp_context_builder::snp::InjectionType::Restricted } }, + secure_avic: match secure_avic { + SecureAvicType::Enabled => vp_context_builder::snp::SecureAvic::Enabled, + SecureAvicType::Disabled => vp_context_builder::snp::SecureAvic::Disabled, + }, }, ConfigIsolationType::Tdx { enable_debug, diff --git a/vm/loader/igvmfilegen/src/vp_context_builder/snp.rs b/vm/loader/igvmfilegen/src/vp_context_builder/snp.rs index 473e34fad7..c665c382ff 100644 --- a/vm/loader/igvmfilegen/src/vp_context_builder/snp.rs +++ b/vm/loader/igvmfilegen/src/vp_context_builder/snp.rs @@ -29,6 +29,15 @@ pub enum InjectionType { Restricted, } +/// The secure AVIC. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum SecureAvic { + /// Offload AVIC to the hardware. + Enabled, + /// The paravisor emulates APIC. + Disabled, +} + /// A hardware SNP VP context, that is imported as a VMSA. #[derive(Debug)] pub struct SnpHardwareContext { @@ -59,6 +68,7 @@ impl SnpHardwareContext { enlightened_uefi: bool, shared_gpa_boundary: u64, injection_type: InjectionType, + secure_avic: SecureAvic, ) -> Self { let mut vmsa: SevVmsa = FromZeros::new_zeroed(); @@ -85,6 +95,12 @@ impl SnpHardwareContext { if vtl < HCL_SECURE_VTL { vmsa.sev_features .set_alternate_injection(injection_type == InjectionType::Restricted); + if injection_type == InjectionType::Normal { + vmsa.sev_features + .set_secure_avic(secure_avic == SecureAvic::Enabled); + vmsa.sev_features + .set_guest_intercept_control(secure_avic == SecureAvic::Enabled); + } } else { vmsa.sev_features .set_restrict_injection(injection_type == InjectionType::Restricted); @@ -92,6 +108,8 @@ impl SnpHardwareContext { vmsa.sev_features.set_prevent_host_ibs(true); vmsa.sev_features.set_vmsa_reg_prot(true); vmsa.sev_features.set_vtom(false); + vmsa.sev_features + .set_secure_avic(secure_avic == SecureAvic::Enabled); vmsa.virtual_tom = 0; } } diff --git a/vm/loader/igvmfilegen_config/src/lib.rs b/vm/loader/igvmfilegen_config/src/lib.rs index f1fa7d1179..4ce6b9bd3e 100644 --- a/vm/loader/igvmfilegen_config/src/lib.rs +++ b/vm/loader/igvmfilegen_config/src/lib.rs @@ -33,6 +33,16 @@ pub enum SnpInjectionType { Restricted, } +/// Secure AVIC type. +#[derive(Serialize, Deserialize, Debug)] +#[serde(rename_all = "snake_case")] +pub enum SecureAvicType { + /// Offload AVIC to the hardware. + Enabled, + /// The paravisor emulates APIC. + Disabled, +} + /// The isolation type that should be used for the loader. #[derive(Serialize, Deserialize, Debug)] #[serde(rename_all = "snake_case")] @@ -56,6 +66,8 @@ pub enum ConfigIsolationType { enable_debug: bool, /// The interrupt injection type to use for the highest vmpl. injection_type: SnpInjectionType, + /// Secure AVIC + secure_avic: SecureAvicType, }, /// Intel TDX. Tdx { diff --git a/vm/loader/manifests/openhcl-x64-cvm-dev.json b/vm/loader/manifests/openhcl-x64-cvm-dev.json index d5eef86cd0..2bcbea0501 100644 --- a/vm/loader/manifests/openhcl-x64-cvm-dev.json +++ b/vm/loader/manifests/openhcl-x64-cvm-dev.json @@ -9,7 +9,8 @@ "shared_gpa_boundary_bits": 46, "policy": 196639, "enable_debug": true, - "injection_type": "normal" + "injection_type": "normal", + "secure_avic": "enabled" } }, "image": { diff --git a/vm/loader/manifests/openhcl-x64-cvm-release.json b/vm/loader/manifests/openhcl-x64-cvm-release.json index 51b8f3e109..c04d128934 100644 --- a/vm/loader/manifests/openhcl-x64-cvm-release.json +++ b/vm/loader/manifests/openhcl-x64-cvm-release.json @@ -8,8 +8,9 @@ "snp": { "shared_gpa_boundary_bits": 46, "policy": 196639, - "enable_debug": false, - "injection_type": "normal" + "enable_debug": true, + "injection_type": "normal", + "secure_avic": "enabled" } }, "image": { diff --git a/vm/x86/x86defs/src/cpuid.rs b/vm/x86/x86defs/src/cpuid.rs index 2dfcc7d053..9995358b89 100644 --- a/vm/x86/x86defs/src/cpuid.rs +++ b/vm/x86/x86defs/src/cpuid.rs @@ -700,14 +700,18 @@ pub struct ExtendedSevFeaturesEax { pub vmgexit_parameter: bool, pub virtual_tom_msr: bool, pub ibs_virtualization: bool, - #[bits(4)] + #[bits(2)] _reserved1: u32, + pub guest_intercept_control: bool, + pub segmented_rmp: bool, pub vmsa_register_protection: bool, - #[bits(4)] - _reserved2: u32, + _reserved2: bool, + pub secure_avic: bool, + pub allowed_sev_features: bool, + _reserved3: bool, pub nested_virt_msr_snp: bool, #[bits(2)] - _reserved3: u32, + _reserved4: u32, } #[bitfield(u32)] diff --git a/vm/x86/x86defs/src/lib.rs b/vm/x86/x86defs/src/lib.rs index 33499b7af7..9a58bf2930 100644 --- a/vm/x86/x86defs/src/lib.rs +++ b/vm/x86/x86defs/src/lib.rs @@ -275,6 +275,7 @@ pub const X86X_AMD_MSR_NB_CFG: u32 = 0xC001001F; pub const X86X_AMD_MSR_VM_CR: u32 = 0xC0010114; pub const X86X_AMD_MSR_GHCB: u32 = 0xC0010130; pub const X86X_AMD_MSR_SEV: u32 = 0xC0010131; +pub const X86X_AMD_MSR_SECURE_AVIC_CONTROL: u32 = 0xc0010138; pub const X86X_AMD_MSR_OSVW_ID_LENGTH: u32 = 0xc0010140; pub const X86X_AMD_MSR_OSVW_ID_STATUS: u32 = 0xc0010141; pub const X86X_AMD_MSR_DE_CFG: u32 = 0xc0011029; @@ -518,3 +519,10 @@ pub struct X86xMcgStatusRegister { #[bits(61)] pub reserved0: u64, } + +#[repr(C)] +#[derive(Debug, Copy, Clone, IntoBytes, Immutable, KnownLayout, FromBytes)] +pub struct ApicRegister { + pub value: u32, + _reserved: [u32; 3], +} diff --git a/vm/x86/x86defs/src/snp.rs b/vm/x86/x86defs/src/snp.rs index ecfd4f6742..cac382ed4c 100644 --- a/vm/x86/x86defs/src/snp.rs +++ b/vm/x86/x86defs/src/snp.rs @@ -3,7 +3,9 @@ //! AMD SEV-SNP specific definitions. +use crate::ApicRegister; use bitfield_struct::bitfield; +use static_assertions::const_assert_eq; use zerocopy::FromBytes; use zerocopy::Immutable; use zerocopy::IntoBytes; @@ -120,12 +122,16 @@ pub struct SevFeatures { pub vmgexit_param: bool, pub pmc_virt: bool, pub ibs_virt: bool, - rsvd: bool, + pub guest_intercept_control: bool, pub vmsa_reg_prot: bool, pub smt_prot: bool, pub secure_avic: bool, - #[bits(47)] - _unused: u64, + #[bits(4)] + _reserved0: u64, + pub ibpb_on_entry: bool, + #[bits(41)] + _reserved1: u64, + pub allowed_sev_features_enable: bool, } #[bitfield(u64)] @@ -135,16 +141,21 @@ pub struct SevVirtualInterruptControl { pub irq: bool, pub gif: bool, pub intr_shadow: bool, - #[bits(5)] + pub nmi: bool, + pub nmi_mask: bool, + #[bits(3)] _rsvd1: u64, #[bits(4)] pub priority: u64, pub ignore_tpr: bool, - #[bits(11)] + #[bits(5)] _rsvd2: u64, + pub nmi_enable: bool, + #[bits(5)] + _rsvd3: u64, pub vector: u8, #[bits(23)] - _rsvd3: u64, + _rsvd4: u64, pub guest_busy: bool, } @@ -208,6 +219,162 @@ pub struct SevNpfInfo { rsvd38_63: u64, } +/// SEV secure AVIC control register +#[bitfield(u64)] +#[derive(IntoBytes, Immutable, KnownLayout, FromBytes, PartialEq, Eq)] +pub struct SecureAvicControl { + #[bits(1)] + pub secure_avic_en: u64, + #[bits(1)] + pub allowed_nmi: u64, + #[bits(10)] + _rsvd: u64, + #[bits(52)] + pub guest_apic_backing_page_ptr: u64, +} + +/// AVIC exit info1 for the incopmplete IPI exit +#[bitfield(u64)] +pub struct SevAvicIncompleteIpiInfo1 { + pub icr_low: u32, + pub icr_high: u32, +} + +open_enum::open_enum! { + pub enum SevAvicIpiFailure: u32 { + INVALID_TYPE = 0, + NOT_RUNNING = 1, + INVALID_TARGET = 2, + INVALID_BACKING_PAGE = 3, + INVALID_VECTOR = 4, + UNACCELERATED_IPI = 5, + } +} + +impl SevAvicIpiFailure { + const fn into_bits(self) -> u32 { + self.0 + } + + const fn from_bits(bits: u32) -> Self { + Self(bits) + } +} + +/// AVIC exit info2 for the incopmplete IPI exit +#[bitfield(u64)] +pub struct SevAvicIncompleteIpiInfo2 { + #[bits(8)] + pub index: u32, + #[bits(24)] + _mbz: u32, + #[bits(32)] + pub failure: SevAvicIpiFailure, +} + +open_enum::open_enum! { + pub enum SevAvicRegisterNumber: u32 { + /// APIC ID Register. + APIC_ID = 0x2, + /// APIC Version Register. + VERSION = 0x3, + /// Task Priority Register + TPR = 0x8, + /// Arbitration Priority Register. + APR = 0x9, + /// Processor Priority Register. + PPR = 0xA, + /// End Of Interrupt Register. + EOI = 0xB, + /// Remote Read Register + REMOTE_READ = 0xC, + /// Logical Destination Register. + LDR = 0xD, + /// Destination Format Register. + DFR = 0xE, + /// Spurious Interrupt Vector. + SPURIOUS = 0xF, + /// In-Service Registers. + ISR0 = 0x10, + ISR1 = 0x11, + ISR2 = 0x12, + ISR3 = 0x13, + ISR4 = 0x14, + ISR5 = 0x15, + ISR6 = 0x16, + ISR7 = 0x17, + /// Trigger Mode Registers. + TMR0 = 0x18, + TMR1 = 0x19, + TMR2 = 0x1A, + TMR3 = 0x1B, + TMR4 = 0x1C, + TMR5 = 0x1D, + TMR6 = 0x1E, + TMR7 = 0x1F, + /// Interrupt Request Registers. + IRR0 = 0x20, + IRR1 = 0x21, + IRR2 = 0x22, + IRR3 = 0x23, + IRR4 = 0x24, + IRR5 = 0x25, + IRR6 = 0x26, + IRR7 = 0x27, + /// Error Status Register. + ERROR = 0x28, + /// ICR Low. + ICR_LOW = 0x30, + /// ICR High. + ICR_HIGH = 0x31, + /// LVT Timer Register. + TIMER_LVT = 0x32, + /// LVT Thermal Register. + THERMAL_LVT = 0x33, + /// LVT Performance Monitor Register. + PERFMON_LVT = 0x34, + /// LVT Local Int0 Register. + LINT0_LVT = 0x35, + /// LVT Local Int1 Register. + LINT1_LVT = 0x36, + /// LVT Error Register. + ERROR_LVT = 0x37, + /// Initial count Register. + INITIAL_COUNT = 0x38, + /// R/O Current count Register. + CURRENT_COUNT = 0x39, + /// Divide configuration Register. + DIVIDER = 0x3e, + /// Self IPI register, only present in x2APIC. + SELF_IPI = 0x3f, + } +} + +impl SevAvicRegisterNumber { + const fn into_bits(self) -> u32 { + self.0 + } + + const fn from_bits(bits: u32) -> Self { + Self(bits) + } +} + +/// AVIC SEV exit info1 for the no acceleration exit +#[bitfield(u64)] +pub struct SevAvicNoAccelInfo { + #[bits(4)] + _rsvd1: u64, + #[bits(8)] + pub apic_register_number: SevAvicRegisterNumber, + #[bits(20)] + _rsvd2: u64, + #[bits(1)] + pub write_access: bool, + #[bits(31)] + _rsvd3: u64, +} + /// SEV VMSA structure representing CPU state #[repr(C)] #[derive(Debug, Clone, IntoBytes, Immutable, KnownLayout, FromBytes, PartialEq, Eq)] @@ -346,7 +513,7 @@ pub struct SevVmsa { pub rcx: u64, pub rdx: u64, pub rbx: u64, - pub vmsa_reserved8: u64, // MBZ + pub secure_avic_control: SecureAvicControl, pub rbp: u64, pub rsi: u64, pub rdi: u64, @@ -418,6 +585,65 @@ pub struct SevVmsa { pub ymm_registers: [SevXmmRegister; 16], } +#[repr(C)] +#[derive(Debug, Clone, IntoBytes, Immutable, KnownLayout, FromBytes)] +/// Structure representing the SEV-ES AVIC IRR register. +/// +/// If the UpdateIRR bit is set in the VMCB, the guest-controlled AllowedIRR mask +/// is logically AND-ed with the host-controlled RequestedIRR and then is logically +/// OR-ed into the IRR field in the Guest APIC Backing page. +pub struct SevAvicIrrRegister { + pub value: u32, + pub allowed: u32, + _reserved: [u32; 2], +} + +#[repr(C)] +#[derive(Debug, Clone, IntoBytes, Immutable, KnownLayout, FromBytes)] +/// Structure representing the SEV-ES AVIC backing page. +/// Specification: "AMD64 PPR Vol3 System Programming", 15.29.3 AVIC Backing Page. +pub struct SevAvicPage { + pub reserved_0: [ApicRegister; 2], + pub id: ApicRegister, + pub version: ApicRegister, + pub reserved_4: [ApicRegister; 4], + pub tpr: ApicRegister, + pub apr: ApicRegister, + pub ppr: ApicRegister, + pub eoi: ApicRegister, + pub rrd: ApicRegister, + pub ldr: ApicRegister, + pub dfr: ApicRegister, + pub svr: ApicRegister, + pub isr: [ApicRegister; 8], + pub tmr: [ApicRegister; 8], + pub irr: [SevAvicIrrRegister; 8], + pub esr: ApicRegister, + pub reserved_29: [ApicRegister; 6], + pub lvt_cmci: ApicRegister, + pub icr: [ApicRegister; 2], + pub lvt_timer: ApicRegister, + pub lvt_thermal: ApicRegister, + pub lvt_pmc: ApicRegister, + pub lvt_lint0: ApicRegister, + pub lvt_lint1: ApicRegister, + pub lvt_error: ApicRegister, + pub timer_icr: ApicRegister, + pub timer_ccr: ApicRegister, + pub reserved_3a: [ApicRegister; 4], + pub timer_dcr: ApicRegister, + pub self_ipi: ApicRegister, + pub eafr: ApicRegister, + pub eacr: ApicRegister, + pub seoi: ApicRegister, + pub reserved_44: [ApicRegister; 0x5], + pub ier: [ApicRegister; 8], + pub ei_lv_tr: [ApicRegister; 3], + pub reserved_54: [ApicRegister; 0xad], +} + +const_assert_eq!(size_of::(), 4096); + // Info codes for the GHCB MSR protocol. open_enum::open_enum! { pub enum GhcbInfo: u64 { @@ -772,14 +998,16 @@ pub struct SevStatusMsr { pub debug_swap: bool, pub prevent_host_ibs: bool, pub snp_btb_isolation: bool, - pub _rsvd1: bool, + pub vmpl_sss: bool, pub secure_tsc: bool, - pub _rsvd2: bool, + pub vmgexit_param: bool, pub _rsvd3: bool, - pub _rsvd4: bool, + pub ibs_virt: bool, pub _rsvd5: bool, pub vmsa_reg_prot: bool, - #[bits(47)] + pub smt_prot: bool, + pub secure_avic: bool, + #[bits(45)] _unused: u64, } diff --git a/vm/x86/x86defs/src/vmx.rs b/vm/x86/x86defs/src/vmx.rs index c9e80b8525..87e6140e48 100644 --- a/vm/x86/x86defs/src/vmx.rs +++ b/vm/x86/x86defs/src/vmx.rs @@ -5,8 +5,10 @@ // TODO: move VMX defs somewhere? +use crate::ApicRegister; use bitfield_struct::bitfield; use open_enum::open_enum; +use static_assertions::const_assert_eq; use zerocopy::FromBytes; use zerocopy::Immutable; use zerocopy::IntoBytes; @@ -496,16 +498,9 @@ pub struct SecondaryProcessorControls { pub instruction_timeout: bool, } -#[repr(C)] -#[derive(Debug, Copy, Clone, IntoBytes, Immutable, KnownLayout, FromBytes)] -pub struct ApicRegister { - pub value: u32, - _reserved: [u32; 3], -} - #[repr(C)] #[derive(Debug, Clone, IntoBytes, Immutable, KnownLayout, FromBytes)] -pub struct ApicPage { +pub struct VmxApicPage { pub reserved_0: [ApicRegister; 2], pub id: ApicRegister, pub version: ApicRegister, @@ -538,3 +533,5 @@ pub struct ApicPage { pub reserved_3f: ApicRegister, pub reserved_40: [ApicRegister; 0xc0], } + +const_assert_eq!(size_of::(), 4096);