From b573490e7be3491917a29b75023e74b3adac4f72 Mon Sep 17 00:00:00 2001 From: Chris Oo Date: Wed, 3 Jun 2026 11:41:58 -0700 Subject: [PATCH 1/3] kvm: add confidential guest memory UAPI wrappers Add low-level KVM wrappers and constants needed for guest_memfd-backed confidential guests, including memory attributes, x86 SNP VM and launch ioctls, hypercall exits, and Arm CCA realm population support. Note that the Arm CCA bindings are based on the v14 KVM patch series and are subject to change. --- vm/kvm/src/lib.rs | 564 ++++++++++++++++++++++- vmm_core/virt_kvm/src/arch/x86_64/mod.rs | 41 ++ vmm_core/virt_kvm/src/lib.rs | 1 - 3 files changed, 602 insertions(+), 4 deletions(-) diff --git a/vm/kvm/src/lib.rs b/vm/kvm/src/lib.rs index 3d83c79bc1..aa566d5c14 100644 --- a/vm/kvm/src/lib.rs +++ b/vm/kvm/src/lib.rs @@ -23,12 +23,19 @@ use std::sync::atomic::Ordering; use thiserror::Error; mod ioctl { + #[cfg(target_arch = "aarch64")] + use super::KvmArmRmiPopulate; use kvm_bindings::*; + #[cfg(target_arch = "x86_64")] + use nix::errno::Errno; use nix::ioctl_read; use nix::ioctl_readwrite; + use nix::ioctl_readwrite_bad; use nix::ioctl_write_int_bad; use nix::ioctl_write_ptr; use nix::request_code_none; + use nix::request_code_readwrite; + use std::mem::size_of; const KVMIO: u8 = 0xae; ioctl_write_int_bad!(kvm_create_vm, request_code_none!(KVMIO, 0x1)); ioctl_write_int_bad!(kvm_check_extension, request_code_none!(KVMIO, 0x03)); @@ -44,6 +51,12 @@ mod ioctl { 0x46, kvm_userspace_memory_region ); + ioctl_write_ptr!( + kvm_set_user_memory_region2, + KVMIO, + 0x49, + kvm_userspace_memory_region2 + ); ioctl_write_ptr!(kvm_irq_line, KVMIO, 0x61, kvm_irq_level); ioctl_write_ptr!(kvm_set_gsi_routing, KVMIO, 0x6a, kvm_irq_routing); ioctl_write_ptr!(kvm_irqfd, KVMIO, 0x76, kvm_irqfd); @@ -99,8 +112,148 @@ mod ioctl { ioctl_read!(kvm_arm_preferred_target, KVMIO, 0xaf, kvm_vcpu_init); ioctl_write_ptr!(kvm_ioeventfd, KVMIO, 0x79, kvm_ioeventfd); ioctl_write_ptr!(kvm_set_guest_debug, KVMIO, 
0x9b, kvm_guest_debug); + ioctl_write_ptr!( + kvm_set_memory_attributes, + KVMIO, + 0xd2, + kvm_memory_attributes + ); ioctl_readwrite!(kvm_create_device, KVMIO, 0xe0, kvm_create_device); ioctl_write_ptr!(kvm_set_device_attr, KVMIO, 0xe1, kvm_device_attr); + ioctl_readwrite!(kvm_create_guest_memfd, KVMIO, 0xd4, kvm_create_guest_memfd); + #[cfg(target_arch = "aarch64")] + ioctl_readwrite_bad!( + kvm_arm_rmi_populate, + request_code_readwrite!(KVMIO, 0xd7, size_of::()), + KvmArmRmiPopulate + ); + #[cfg(target_arch = "x86_64")] + ioctl_readwrite_bad!( + kvm_memory_encrypt_op, + request_code_readwrite!(KVMIO, 0xba, size_of::()), + kvm_sev_cmd + ); + #[cfg(target_arch = "x86_64")] + /// # Safety + /// + /// `fd` must refer to a valid KVM VM file descriptor. + pub unsafe fn kvm_memory_encrypt_op_supported(fd: libc::c_int) -> nix::Result<()> { + // SAFETY: Calling the KVM_MEMORY_ENCRYPT_OP ioctl with a null argument is + // the documented availability probe for SEV support. + match unsafe { + libc::ioctl( + fd, + request_code_readwrite!(KVMIO, 0xba, size_of::()), + std::ptr::null_mut::(), + ) + } { + 0 => Ok(()), + _ => Err(Errno::last()), + } + } +} + +#[cfg(target_arch = "x86_64")] +const KVM_CAP_VM_TYPES_UAPI: u32 = 235; +#[cfg(target_arch = "x86_64")] +const KVM_CAP_EXIT_HYPERCALL_UAPI: u32 = 201; +#[cfg(target_arch = "x86_64")] +pub const KVM_HC_MAP_GPA_RANGE_UAPI: u64 = 12; +#[cfg(target_arch = "x86_64")] +pub const KVM_MAP_GPA_RANGE_ENCRYPTED_UAPI: u64 = 1 << 4; +#[cfg(target_arch = "x86_64")] +pub const KVM_MAP_GPA_RANGE_DECRYPTED_UAPI: u64 = 0 << 4; +#[cfg(target_arch = "x86_64")] +const KVM_X86_SNP_VM_UAPI: libc::c_int = 4; + +pub const KVM_MEMORY_EXIT_FLAG_PRIVATE_UAPI: u64 = 1 << 3; + +#[cfg(target_arch = "aarch64")] +pub const KVM_CAP_ARM_RMI_UAPI: u32 = 249; +#[cfg(target_arch = "aarch64")] +pub const KVM_ARM_RMI_POPULATE_FLAGS_MEASURE_UAPI: u32 = 1 << 0; +#[cfg(target_arch = "aarch64")] +const KVM_VM_TYPE_ARM_IPA_SIZE_MASK_UAPI: u64 = 0xff; 
+#[cfg(target_arch = "aarch64")] +const KVM_VM_TYPE_ARM_REALM_UAPI: u64 = 1 << 30; + +#[cfg(target_arch = "x86_64")] +pub const KVM_SEV_SNP_PAGE_TYPE_NORMAL_UAPI: u8 = KVM_SEV_SNP_PAGE_TYPE_NORMAL as u8; +#[cfg(target_arch = "x86_64")] +pub const KVM_SEV_SNP_PAGE_TYPE_ZERO_UAPI: u8 = KVM_SEV_SNP_PAGE_TYPE_ZERO as u8; +#[cfg(target_arch = "x86_64")] +pub const KVM_SEV_SNP_PAGE_TYPE_UNMEASURED_UAPI: u8 = KVM_SEV_SNP_PAGE_TYPE_UNMEASURED as u8; +#[cfg(target_arch = "x86_64")] +pub const KVM_SEV_SNP_PAGE_TYPE_SECRETS_UAPI: u8 = KVM_SEV_SNP_PAGE_TYPE_SECRETS as u8; +#[cfg(target_arch = "x86_64")] +pub const KVM_SEV_SNP_PAGE_TYPE_CPUID_UAPI: u8 = KVM_SEV_SNP_PAGE_TYPE_CPUID as u8; + +#[cfg(target_arch = "x86_64")] +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum SevSnpPageType { + Normal, + Zero, + Unmeasured, + Secrets, + Cpuid, +} + +#[cfg(target_arch = "x86_64")] +impl SevSnpPageType { + pub const fn as_uapi(self) -> u8 { + match self { + SevSnpPageType::Normal => KVM_SEV_SNP_PAGE_TYPE_NORMAL_UAPI, + SevSnpPageType::Zero => KVM_SEV_SNP_PAGE_TYPE_ZERO_UAPI, + SevSnpPageType::Unmeasured => KVM_SEV_SNP_PAGE_TYPE_UNMEASURED_UAPI, + SevSnpPageType::Secrets => KVM_SEV_SNP_PAGE_TYPE_SECRETS_UAPI, + SevSnpPageType::Cpuid => KVM_SEV_SNP_PAGE_TYPE_CPUID_UAPI, + } + } +} + +#[cfg(target_arch = "x86_64")] +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum X86VmType { + Snp, +} + +#[cfg(target_arch = "x86_64")] +impl X86VmType { + const fn as_raw(self) -> libc::c_int { + match self { + X86VmType::Snp => KVM_X86_SNP_VM_UAPI, + } + } +} + +#[cfg(target_arch = "aarch64")] +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum Aarch64VmType { + Realm { ipa_bits: u8 }, +} + +#[cfg(target_arch = "aarch64")] +impl Aarch64VmType { + const fn as_raw(self) -> libc::c_int { + match self { + Aarch64VmType::Realm { ipa_bits } => { + (KVM_VM_TYPE_ARM_REALM_UAPI + | ((ipa_bits as u64) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK_UAPI)) + as libc::c_int + } + } + } +} + +#[cfg(target_arch = 
"aarch64")] +#[repr(C)] +#[derive(Debug, Copy, Clone, Default, Eq, PartialEq)] +pub struct KvmArmRmiPopulate { + pub base: u64, + pub size: u64, + pub source_uaddr: u64, + pub flags: u32, + pub reserved: u32, } #[derive(Error, Debug)] @@ -111,8 +264,28 @@ pub enum Error { SignalMsi(#[source] nix::Error), #[error("SetMemoryRegion")] SetMemoryRegion(#[source] nix::Error), + #[error("SetMemoryAttributes")] + SetMemoryAttributes(#[source] nix::Error), + #[error("CreateGuestMemfd")] + CreateGuestMemfd(#[source] nix::Error), #[error("CreateVm")] CreateVm(#[source] nix::Error), + #[cfg(target_arch = "aarch64")] + #[error("ArmRmiPopulate")] + ArmRmiPopulate(#[source] nix::Error), + #[error("missing KVM capability: {0}")] + MissingCapability(&'static str), + #[cfg(target_arch = "x86_64")] + #[error("unsupported x86 VM type: {0:?}")] + UnsupportedX86VmType(X86VmType), + #[cfg(target_arch = "x86_64")] + #[error("MemoryEncryptOp({command}, firmware_error={firmware_error:#x})")] + MemoryEncryptOp { + command: &'static str, + firmware_error: u32, + #[source] + source: nix::Error, + }, #[error("EnableCap({0})")] EnableCap(&'static str, #[source] nix::Error), #[error("CreateVCpu")] @@ -127,6 +300,14 @@ pub enum Error { SetSRegs(#[source] nix::Error), #[error("Run")] Run(#[source] nix::Error), + #[error("RunMemoryFault(flags={flags:#x}, gpa={gpa:#x}, size={size:#x})")] + RunMemoryFault { + flags: u64, + gpa: u64, + size: u64, + #[source] + source: nix::Error, + }, #[error("GetVCpuMmapSize")] GetVCpuMmapSize(#[source] nix::Error), #[error("MmapVCpu")] @@ -283,6 +464,34 @@ impl Kvm { unsafe { ioctl::kvm_check_extension(self.as_fd().as_raw_fd(), extension as i32) } } + pub fn check_private_memory_extensions(&self) -> Result<()> { + if self + .check_extension(KVM_CAP_USER_MEMORY2) + .map_err(Error::CheckExtension)? + == 0 + { + return Err(Error::MissingCapability("KVM_CAP_USER_MEMORY2")); + } + if self + .check_extension(KVM_CAP_GUEST_MEMFD) + .map_err(Error::CheckExtension)? 
+ == 0 + { + return Err(Error::MissingCapability("KVM_CAP_GUEST_MEMFD")); + } + if self + .check_extension(KVM_CAP_MEMORY_ATTRIBUTES) + .map_err(Error::CheckExtension)? + & KVM_MEMORY_ATTRIBUTE_PRIVATE as libc::c_int + == 0 + { + return Err(Error::MissingCapability( + "KVM_CAP_MEMORY_ATTRIBUTES(KVM_MEMORY_ATTRIBUTE_PRIVATE)", + )); + } + Ok(()) + } + pub fn new_vm(&self) -> Result { // On ARM, can request memory isolation which we don't use. // For that, include the `KVM_VM_TYPE_ARM_PROTECTED` flag. @@ -290,6 +499,30 @@ impl Kvm { // IPA on ARM64, and on x86_64 is the only option. let vm_type = self.check_extension(KVM_CAP_ARM_VM_IPA_SIZE).unwrap_or(0); + self.new_vm_with_type(vm_type) + } + + #[cfg(target_arch = "x86_64")] + pub fn new_x86_vm(&self, vm_type: X86VmType) -> Result { + let supported_vm_types = self + .check_extension(KVM_CAP_VM_TYPES_UAPI) + .map_err(Error::CheckExtension)? as u64; + let raw_vm_type = vm_type.as_raw(); + let vm_type_bit = 1_u64 + .checked_shl(raw_vm_type as u32) + .ok_or(Error::UnsupportedX86VmType(vm_type))?; + if supported_vm_types & vm_type_bit == 0 { + return Err(Error::UnsupportedX86VmType(vm_type)); + } + self.new_vm_with_type(raw_vm_type) + } + + #[cfg(target_arch = "aarch64")] + pub fn new_aarch64_vm(&self, vm_type: Aarch64VmType) -> Result { + self.new_vm_with_type(vm_type.as_raw()) + } + + fn new_vm_with_type(&self, vm_type: libc::c_int) -> Result { // SAFETY: Calling IOCTL as documented, with no special requirements. let vm = unsafe { let fd = @@ -362,6 +595,193 @@ pub struct Partition { } impl Partition { + pub fn check_private_memory_extensions(&self) -> Result<()> { + if self + .check_extension(KVM_CAP_USER_MEMORY2) + .map_err(Error::CheckExtension)? + == 0 + { + return Err(Error::MissingCapability("KVM_CAP_USER_MEMORY2")); + } + if self + .check_extension(KVM_CAP_GUEST_MEMFD) + .map_err(Error::CheckExtension)? 
+ == 0 + { + return Err(Error::MissingCapability("KVM_CAP_GUEST_MEMFD")); + } + if self + .check_extension(KVM_CAP_MEMORY_ATTRIBUTES) + .map_err(Error::CheckExtension)? + & KVM_MEMORY_ATTRIBUTE_PRIVATE as libc::c_int + == 0 + { + return Err(Error::MissingCapability( + "KVM_CAP_MEMORY_ATTRIBUTES(KVM_MEMORY_ATTRIBUTE_PRIVATE)", + )); + } + Ok(()) + } + + #[cfg(target_arch = "x86_64")] + pub fn check_sev_snp_launch_extensions(&self) -> Result<()> { + self.check_private_memory_extensions()?; + // SAFETY: This is the documented KVM_MEMORY_ENCRYPT_OP availability + // probe, and does not pass any userspace data pointer to KVM. + unsafe { ioctl::kvm_memory_encrypt_op_supported(self.vm.as_raw_fd()) }.map_err(|err| { + Error::MemoryEncryptOp { + command: "KVM_MEMORY_ENCRYPT_OP(NULL)", + firmware_error: 0, + source: err, + } + })?; + Ok(()) + } + + #[cfg(target_arch = "x86_64")] + pub fn enable_hypercall_exits(&self, hypercall_mask: u64) -> Result<()> { + // SAFETY: Calling IOCTL as documented, with no special requirements. + unsafe { + ioctl::kvm_enable_cap( + self.vm.as_raw_fd(), + &kvm_enable_cap { + cap: KVM_CAP_EXIT_HYPERCALL_UAPI, + args: [hypercall_mask, 0, 0, 0], + ..Default::default() + }, + ) + .map_err(|err| Error::EnableCap("exit_hypercall", err))?; + } + Ok(()) + } + + pub fn check_extension(&self, extension: u32) -> nix::Result { + // SAFETY: Calling IOCTL as documented, with no special requirements. + unsafe { ioctl::kvm_check_extension(self.vm.as_raw_fd(), extension as i32) } + } + + #[cfg(target_arch = "aarch64")] + pub fn arm_rmi_populate(&self, populate: &mut KvmArmRmiPopulate) -> Result<()> { + // SAFETY: `populate` points to a valid KVM_ARM_RMI_POPULATE argument for + // the duration of the ioctl. KVM may update it to report partial progress. 
+ unsafe { ioctl::kvm_arm_rmi_populate(self.vm.as_raw_fd(), populate) } + .map_err(Error::ArmRmiPopulate)?; + Ok(()) + } + + #[cfg(target_arch = "x86_64")] + pub fn sev_snp_init(&self, sev: BorrowedFd<'_>) -> Result<()> { + let mut init = kvm_sev_init::default(); + let mut command = kvm_sev_cmd { + id: sev_cmd_id_KVM_SEV_INIT2, + data: std::ptr::from_mut(&mut init) as u64, + sev_fd: sev.as_raw_fd() as u32, + ..Default::default() + }; + + // SAFETY: `command` and its data pointer refer to stack-allocated C ABI + // structs that remain valid for the duration of the ioctl. + unsafe { + ioctl::kvm_memory_encrypt_op(self.vm.as_raw_fd(), &mut command).map_err(|err| { + Error::MemoryEncryptOp { + command: "KVM_SEV_INIT2", + firmware_error: command.error, + source: err, + } + })?; + } + Ok(()) + } + + #[cfg(target_arch = "x86_64")] + fn sev_snp_cmd( + &self, + sev: BorrowedFd<'_>, + command_name: &'static str, + command_id: sev_cmd_id, + data: &mut T, + ) -> Result<()> { + let mut command = kvm_sev_cmd { + id: command_id, + data: std::ptr::from_mut(data) as u64, + sev_fd: sev.as_raw_fd() as u32, + ..Default::default() + }; + + loop { + // SAFETY: `command` and its data pointer refer to stack-allocated C ABI + // structs that remain valid for the duration of the ioctl. 
+ match unsafe { ioctl::kvm_memory_encrypt_op(self.vm.as_raw_fd(), &mut command) } { + Ok(_) => break, + Err(nix::errno::Errno::EAGAIN) => {} + Err(err) => { + return Err(Error::MemoryEncryptOp { + command: command_name, + firmware_error: command.error, + source: err, + }); + } + } + } + Ok(()) + } + + #[cfg(target_arch = "x86_64")] + pub fn sev_snp_launch_start( + &self, + sev: BorrowedFd<'_>, + data: &mut kvm_sev_snp_launch_start, + ) -> Result<()> { + self.sev_snp_cmd( + sev, + "KVM_SEV_SNP_LAUNCH_START", + sev_cmd_id_KVM_SEV_SNP_LAUNCH_START, + data, + ) + } + + #[cfg(target_arch = "x86_64")] + pub fn sev_snp_launch_update( + &self, + sev: BorrowedFd<'_>, + gfn_start: u64, + uaddr: u64, + len: u64, + page_type: SevSnpPageType, + ) -> Result<()> { + let mut update = kvm_sev_snp_launch_update { + gfn_start, + uaddr, + len, + type_: page_type.as_uapi(), + ..Default::default() + }; + + while update.len != 0 { + self.sev_snp_cmd( + sev, + "KVM_SEV_SNP_LAUNCH_UPDATE", + sev_cmd_id_KVM_SEV_SNP_LAUNCH_UPDATE, + &mut update, + )?; + } + Ok(()) + } + + #[cfg(target_arch = "x86_64")] + pub fn sev_snp_launch_finish( + &self, + sev: BorrowedFd<'_>, + data: &mut kvm_sev_snp_launch_finish, + ) -> Result<()> { + self.sev_snp_cmd( + sev, + "KVM_SEV_SNP_LAUNCH_FINISH", + sev_cmd_id_KVM_SEV_SNP_LAUNCH_FINISH, + data, + ) + } + pub fn enable_split_irqchip(&self, lines: u32) -> Result<()> { // TODO: We are not checking KVM_CAP_ENABLE_CAP_VM first. // TODO: We are not calling KVM_CHECK_EXTENSION first. @@ -506,7 +926,14 @@ impl Partition { Ok(()) } - #[expect(clippy::missing_safety_doc, clippy::undocumented_unsafe_blocks)] + /// Sets or clears a userspace memory slot. + /// + /// # Safety + /// + /// If `size` is nonzero, `data..data + size` must be a valid userspace + /// mapping for KVM to access until the slot is changed or cleared. The + /// caller must also ensure that `addr` and `size` satisfy KVM's memory-slot + /// alignment and range requirements. 
pub unsafe fn set_user_memory_region( &self, slot: u32, @@ -522,6 +949,8 @@ impl Partition { memory_size: size as u64, userspace_addr: data as usize as u64, }; + // SAFETY: the caller guarantees that any non-empty userspace range + // remains valid for KVM while the slot references it. unsafe { ioctl::kvm_set_user_memory_region(self.vm.as_raw_fd(), ®ion) .map_err(Error::SetMemoryRegion)?; @@ -529,6 +958,77 @@ impl Partition { Ok(()) } + /// Sets or clears a userspace memory slot with optional guestmemfd backing. + /// + /// # Safety + /// + /// If `size` is nonzero, `data..data + size` must be a valid userspace + /// mapping for KVM to access until the slot is changed or cleared. The + /// caller must ensure that `addr`, `size`, and any `guestmemfd` offset + /// satisfy KVM's memory-slot alignment and range requirements. If + /// `guest_memfd` is supplied, the file must remain open and valid for as + /// long as KVM may reference the slot. + pub unsafe fn set_user_memory_region2( + &self, + slot: u32, + data: *mut u8, + size: usize, + addr: u64, + readonly: bool, + guest_memfd: Option<(&File, u64)>, + ) -> Result<()> { + let (guest_memfd, guest_memfd_offset, guest_memfd_flag) = guest_memfd + .map(|(file, offset)| (file.as_raw_fd() as u32, offset, KVM_MEM_GUEST_MEMFD)) + .unwrap_or((0, 0, 0)); + let region = kvm_userspace_memory_region2 { + slot, + flags: if readonly { KVM_MEM_READONLY } else { 0 } | guest_memfd_flag, + guest_phys_addr: addr, + memory_size: size as u64, + userspace_addr: data as usize as u64, + guest_memfd_offset, + guest_memfd, + ..Default::default() + }; + // SAFETY: the caller guarantees that any non-empty userspace range and + // optional guestmemfd backing remain valid for KVM while the slot + // references them. 
+ unsafe { + ioctl::kvm_set_user_memory_region2(self.vm.as_raw_fd(), ®ion) + .map_err(Error::SetMemoryRegion)?; + } + Ok(()) + } + + pub fn create_guest_memfd(&self, size: u64) -> Result { + let mut guest_memfd = kvm_create_guest_memfd { + size, + ..Default::default() + }; + // SAFETY: `guest_memfd` is a valid C ABI struct for KVM to read. + let fd = unsafe { + ioctl::kvm_create_guest_memfd(self.vm.as_raw_fd(), &mut guest_memfd) + .map_err(Error::CreateGuestMemfd)? + }; + // SAFETY: On success, KVM returns a new owned file descriptor. + Ok(unsafe { File::from_raw_fd(fd) }) + } + + pub fn set_memory_attributes(&self, addr: u64, size: u64, attributes: u64) -> Result<()> { + let attr = kvm_memory_attributes { + address: addr, + size, + attributes, + ..Default::default() + }; + // SAFETY: `attr` is a valid C ABI struct for KVM to read. + unsafe { + ioctl::kvm_set_memory_attributes(self.vm.as_raw_fd(), &attr) + .map_err(Error::SetMemoryAttributes)?; + } + Ok(()) + } + pub fn set_gsi_routes(&self, routes: &[(u32, RoutingEntry)]) -> Result<()> { const MAX_ROUTES: usize = 2048; assert!(routes.len() <= MAX_ROUTES); @@ -1304,6 +1804,16 @@ impl<'a> VpRunner<'a> { Ok(_) => Ok(true), Err(err) => match err { nix::errno::Errno::EINTR | nix::errno::Errno::EAGAIN => Ok(false), + _ if self.run_data().exit_reason == KVM_EXIT_MEMORY_FAULT => { + // SAFETY: KVM reported KVM_EXIT_MEMORY_FAULT, so this is the active union field. + let memory_fault = unsafe { self.run_data().__bindgen_anon_1.memory_fault }; + Err(Error::RunMemoryFault { + flags: memory_fault.flags, + gpa: memory_fault.gpa, + size: memory_fault.size, + source: err, + }) + } _ => Err(Error::Run(err)), }, } @@ -1443,6 +1953,18 @@ impl<'a> VpRunner<'a> { } } #[cfg(target_arch = "x86_64")] + KVM_EXIT_HYPERCALL => { + // SAFETY: this is the active union field. 
+ let hypercall = unsafe { &mut self.run_data().__bindgen_anon_1.hypercall }; + Exit::Hypercall { + nr: hypercall.nr, + args: hypercall.args, + result: &mut hypercall.ret, + // SAFETY: this is the active field for KVM_EXIT_HYPERCALL. + flags: unsafe { hypercall.__bindgen_anon_1.flags }, + } + } + #[cfg(target_arch = "x86_64")] KVM_EXIT_X86_WRMSR => { // SAFETY: this is the active union field. let msr = unsafe { &mut self.run_data().__bindgen_anon_1.msr }; @@ -1465,7 +1987,6 @@ impl<'a> VpRunner<'a> { error: &mut msr.error, } } - #[cfg(target_arch = "aarch64")] KVM_EXIT_SYSTEM_EVENT => { // SAFETY: this is the active union field. let system_event = unsafe { &self.run_data().__bindgen_anon_1.system_event }; @@ -1535,6 +2056,13 @@ pub enum Exit<'a> { data: &'a [u8], }, #[cfg(target_arch = "x86_64")] + Hypercall { + nr: u64, + args: [u64; 6], + result: &'a mut u64, + flags: u64, + }, + #[cfg(target_arch = "x86_64")] MsrRead { index: u32, data: &'a mut u64, @@ -1581,7 +2109,6 @@ pub enum Exit<'a> { Eoi { irq: u8, }, - #[cfg(target_arch = "aarch64")] SystemEvent { event_type: u32, event_flags: u64, @@ -1651,3 +2178,34 @@ pub struct DebugRegisters { pub dr6: u64, pub dr7: u64, } + +#[cfg(all(test, target_arch = "x86_64"))] +mod tests { + use super::*; + + #[test] + fn sev_snp_page_type_values_match_kvm_uapi() { + assert_eq!(SevSnpPageType::Normal.as_uapi(), 1); + assert_eq!(SevSnpPageType::Zero.as_uapi(), 3); + assert_eq!(SevSnpPageType::Unmeasured.as_uapi(), 4); + assert_eq!(SevSnpPageType::Secrets.as_uapi(), 5); + assert_eq!(SevSnpPageType::Cpuid.as_uapi(), 6); + } + + #[test] + fn sev_snp_launch_update_uses_expected_zero_page_shape() { + let update = kvm_sev_snp_launch_update { + gfn_start: 0x1234, + uaddr: 0, + len: 0x2000, + type_: SevSnpPageType::Zero.as_uapi(), + ..Default::default() + }; + + assert_eq!(update.gfn_start, 0x1234); + assert_eq!(update.uaddr, 0); + assert_eq!(update.len, 0x2000); + assert_eq!(update.type_, KVM_SEV_SNP_PAGE_TYPE_ZERO_UAPI); + 
assert_eq!(update.flags, 0); + } +} diff --git a/vmm_core/virt_kvm/src/arch/x86_64/mod.rs b/vmm_core/virt_kvm/src/arch/x86_64/mod.rs index 0e6b5a31f3..1a1427cf42 100644 --- a/vmm_core/virt_kvm/src/arch/x86_64/mod.rs +++ b/vmm_core/virt_kvm/src/arch/x86_64/mod.rs @@ -1490,6 +1490,23 @@ impl<'p> Processor for KvmProcessor<'p> { KvmHypercallExit::DISPATCHER.dispatch(&self.partition.gm, &mut handler); *result = handler.registers.result; } + kvm::Exit::Hypercall { + nr, + args, + result, + flags, + } => { + // This is only reachable for hypercall exits explicitly + // enabled on the VM. Later SNP support enables + // KVM_HC_MAP_GPA_RANGE and handles it here. + tracelimit::error_ratelimited!( + nr, + ?args, + flags, + "unhandled KVM hypercall" + ); + *result = 1; + } kvm::Exit::Debug { exception: _, pc: _, @@ -1526,6 +1543,30 @@ impl<'p> Processor for KvmProcessor<'p> { tracing::error!(hardware_entry_failure_reason, "VP entry failed"); return Err(dev.fatal_error(KvmRunVpError::InvalidVpState.into())); } + kvm::Exit::SystemEvent { + event_type, + event_flags, + } => { + // KVM reports architectural shutdown/reset/crash + // notifications here; SNP adds SEV termination handling. 
+ tracing::info!(event_type, event_flags, "system event"); + match event_type { + kvm::KVM_SYSTEM_EVENT_SHUTDOWN => { + return Err(VpHaltReason::PowerOff); + } + kvm::KVM_SYSTEM_EVENT_RESET => { + return Err(VpHaltReason::Reset); + } + kvm::KVM_SYSTEM_EVENT_CRASH => { + return Err(VpHaltReason::TripleFault { vtl: Vtl::Vtl0 }); + } + _ => { + return Err(dev.fatal_error( + KvmRunVpError::UnhandledSystemEvent(event_type).into(), + )); + } + } + } } } } diff --git a/vmm_core/virt_kvm/src/lib.rs b/vmm_core/virt_kvm/src/lib.rs index 3b587ce9eb..4083df9176 100644 --- a/vmm_core/virt_kvm/src/lib.rs +++ b/vmm_core/virt_kvm/src/lib.rs @@ -144,7 +144,6 @@ enum KvmRunVpError { InvalidVpState, #[error("failed to run VP")] Run(#[source] kvm::Error), - #[cfg_attr(guest_arch = "x86_64", expect(dead_code))] #[error("unhandled system event type: {0:#x}")] UnhandledSystemEvent(u32), #[cfg(guest_arch = "x86_64")] From 1e5fc23657fef63c79a7a13c37dafe7ddb07c0fd Mon Sep 17 00:00:00 2001 From: Chris Oo Date: Wed, 3 Jun 2026 13:36:02 -0700 Subject: [PATCH 2/3] virt_kvm: move memory mapping code to memory module Move the existing userspace KVM memory range state and partition memory mapping implementation out of lib.rs into memory.rs. This is a mechanical split that prepares the KVM memory path for guestmemfd support without changing behavior. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- vmm_core/virt_kvm/src/lib.rs | 111 +----------------------------- vmm_core/virt_kvm/src/memory.rs | 117 ++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 109 deletions(-) create mode 100644 vmm_core/virt_kvm/src/memory.rs diff --git a/vmm_core/virt_kvm/src/lib.rs b/vmm_core/virt_kvm/src/lib.rs index 4083df9176..bb0e148f80 100644 --- a/vmm_core/virt_kvm/src/lib.rs +++ b/vmm_core/virt_kvm/src/lib.rs @@ -11,12 +11,13 @@ mod arch; mod gsi; +mod memory; pub use arch::Kvm; use guestmem::GuestMemory; use inspect::Inspect; -use memory_range::MemoryRange; +use memory::KvmMemoryRangeState; use parking_lot::Mutex; use std::sync::Arc; use thiserror::Error; @@ -32,7 +33,6 @@ pub fn is_available() -> Result { } use arch::KvmVpInner; -use hvdef::Vtl; use std::sync::atomic::Ordering; use virt::VpIndex; use vmcore::vmtime::VmTimeAccess; @@ -70,21 +70,6 @@ pub enum KvmError { TopologyCpuid(#[source] virt::x86::topology::UnknownVendor), } -#[derive(Debug, Inspect)] -struct KvmMemoryRange { - host_addr: *mut u8, - range: MemoryRange, -} - -unsafe impl Sync for KvmMemoryRange {} -unsafe impl Send for KvmMemoryRange {} - -#[derive(Debug, Default, Inspect)] -struct KvmMemoryRangeState { - #[inspect(flatten, iter_by_index)] - ranges: Vec>, -} - #[derive(Inspect)] pub struct KvmPartition { #[inspect(flatten)] @@ -177,96 +162,4 @@ impl KvmPartitionInner { #[cfg(guest_arch = "aarch64")] self.kvm.vp(vp.vp_info().base.vp_index.index()).force_exit(); } - - /// # Safety - /// - /// `data..data+size` must be and remain an allocated VA range until the - /// partition is destroyed or the region is unmapped. - unsafe fn map_region( - &self, - data: *mut u8, - size: usize, - addr: u64, - readonly: bool, - ) -> anyhow::Result<()> { - let mut state = self.memory.lock(); - - // Memory slots cannot be resized but can be moved within the guest - // address space. Find the existing slot if there is one. 
- let mut slot_to_use = None; - for (slot, range) in state.ranges.iter_mut().enumerate() { - match range { - Some(range) if range.host_addr == data => { - slot_to_use = Some(slot); - break; - } - Some(_) => (), - None => slot_to_use = Some(slot), - } - } - if slot_to_use.is_none() { - slot_to_use = Some(state.ranges.len()); - state.ranges.push(None); - } - let slot_to_use = slot_to_use.unwrap(); - unsafe { - self.kvm - .set_user_memory_region(slot_to_use as u32, data, size, addr, readonly)? - }; - state.ranges[slot_to_use] = Some(KvmMemoryRange { - host_addr: data, - range: MemoryRange::new(addr..addr + size as u64), - }); - Ok(()) - } -} - -impl virt::PartitionMemoryMapper for KvmPartition { - fn memory_mapper(&self, vtl: Vtl) -> Arc { - assert_eq!(vtl, Vtl::Vtl0); - self.inner.clone() - } -} - -// TODO: figure out a better abstraction that works for both KVM and WHP. -impl virt::PartitionMemoryMap for KvmPartitionInner { - unsafe fn map_range( - &self, - data: *mut u8, - size: usize, - addr: u64, - writable: bool, - _exec: bool, - ) -> anyhow::Result<()> { - // SAFETY: guaranteed by caller. - unsafe { self.map_region(data, size, addr, !writable) } - } - - fn unmap_range(&self, addr: u64, size: u64) -> anyhow::Result<()> { - let range = MemoryRange::new(addr..addr + size); - let mut state = self.memory.lock(); - for (slot, entry) in state.ranges.iter_mut().enumerate() { - let Some(kvm_range) = entry else { continue }; - if range.contains(&kvm_range.range) { - // SAFETY: clearing a slot should always be safe since it removes - // and does not add memory references. 
- unsafe { - self.kvm.set_user_memory_region( - slot as u32, - std::ptr::null_mut(), - 0, - 0, - false, - )?; - } - *entry = None; - } else { - assert!( - !range.overlaps(&kvm_range.range), - "can only unmap existing ranges of exact size" - ); - } - } - Ok(()) - } } diff --git a/vmm_core/virt_kvm/src/memory.rs b/vmm_core/virt_kvm/src/memory.rs new file mode 100644 index 0000000000..92f4eabacd --- /dev/null +++ b/vmm_core/virt_kvm/src/memory.rs @@ -0,0 +1,117 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +use crate::KvmPartition; +use crate::KvmPartitionInner; +use inspect::Inspect; +use memory_range::MemoryRange; +use std::sync::Arc; + +#[derive(Debug, Inspect)] +pub(crate) struct KvmMemoryRange { + host_addr: *mut u8, + range: MemoryRange, +} + +unsafe impl Sync for KvmMemoryRange {} +unsafe impl Send for KvmMemoryRange {} + +#[derive(Debug, Default, Inspect)] +pub(crate) struct KvmMemoryRangeState { + #[inspect(flatten, iter_by_index)] + pub(crate) ranges: Vec>, +} + +impl KvmPartitionInner { + /// # Safety + /// + /// `data..data+size` must be and remain an allocated VA range until the + /// partition is destroyed or the region is unmapped. + unsafe fn map_region( + &self, + data: *mut u8, + size: usize, + addr: u64, + readonly: bool, + ) -> anyhow::Result<()> { + let mut state = self.memory.lock(); + + // Memory slots cannot be resized but can be moved within the guest + // address space. Find the existing slot if there is one. + let mut slot_to_use = None; + for (slot, range) in state.ranges.iter_mut().enumerate() { + match range { + Some(range) if range.host_addr == data => { + slot_to_use = Some(slot); + break; + } + Some(_) => (), + None => slot_to_use = Some(slot), + } + } + if slot_to_use.is_none() { + slot_to_use = Some(state.ranges.len()); + state.ranges.push(None); + } + let slot_to_use = slot_to_use.unwrap(); + unsafe { + self.kvm + .set_user_memory_region(slot_to_use as u32, data, size, addr, readonly)? 
+ }; + state.ranges[slot_to_use] = Some(KvmMemoryRange { + host_addr: data, + range: MemoryRange::new(addr..addr + size as u64), + }); + Ok(()) + } +} + +impl virt::PartitionMemoryMapper for KvmPartition { + fn memory_mapper(&self, vtl: hvdef::Vtl) -> Arc { + assert_eq!(vtl, hvdef::Vtl::Vtl0); + self.inner.clone() + } +} + +// TODO: figure out a better abstraction that works for both KVM and WHP. +impl virt::PartitionMemoryMap for KvmPartitionInner { + unsafe fn map_range( + &self, + data: *mut u8, + size: usize, + addr: u64, + writable: bool, + _exec: bool, + ) -> anyhow::Result<()> { + // SAFETY: guaranteed by caller. + unsafe { self.map_region(data, size, addr, !writable) } + } + + fn unmap_range(&self, addr: u64, size: u64) -> anyhow::Result<()> { + let range = MemoryRange::new(addr..addr + size); + let mut state = self.memory.lock(); + for (slot, entry) in state.ranges.iter_mut().enumerate() { + let Some(kvm_range) = entry else { continue }; + if range.contains(&kvm_range.range) { + // SAFETY: clearing a slot should always be safe since it removes + // and does not add memory references. + unsafe { + self.kvm.set_user_memory_region( + slot as u32, + std::ptr::null_mut(), + 0, + 0, + false, + )?; + } + *entry = None; + } else { + assert!( + !range.overlaps(&kvm_range.range), + "can only unmap existing ranges of exact size" + ); + } + } + Ok(()) + } +} From 4ff0791b35dbc444f1deb5589c0e0766b69a979e Mon Sep 17 00:00:00 2001 From: Chris Oo Date: Wed, 3 Jun 2026 13:41:33 -0700 Subject: [PATCH 3/3] virt_kvm: add guestmemfd memory backing foundation Add KVM guestmemfd memory-slot plumbing and neutral private-memory range helpers without enabling confidential VM launch behavior yet. Non-isolated KVM partitions continue to use userspace memory backing. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- vm/kvm/src/lib.rs | 166 +++------ vmm_core/virt_kvm/src/arch/aarch64/mod.rs | 13 +- vmm_core/virt_kvm/src/arch/x86_64/mod.rs | 22 +- vmm_core/virt_kvm/src/lib.rs | 14 + vmm_core/virt_kvm/src/memory.rs | 410 +++++++++++++++++++++- 5 files changed, 486 insertions(+), 139 deletions(-) diff --git a/vm/kvm/src/lib.rs b/vm/kvm/src/lib.rs index aa566d5c14..11c9a11cc6 100644 --- a/vm/kvm/src/lib.rs +++ b/vm/kvm/src/lib.rs @@ -36,7 +36,9 @@ mod ioctl { use nix::request_code_none; use nix::request_code_readwrite; use std::mem::size_of; + const KVMIO: u8 = 0xae; + ioctl_write_int_bad!(kvm_create_vm, request_code_none!(KVMIO, 0x1)); ioctl_write_int_bad!(kvm_check_extension, request_code_none!(KVMIO, 0x03)); ioctl_write_int_bad!(kvm_get_vcpu_mmap_size, request_code_none!(KVMIO, 0x04)); @@ -188,6 +190,17 @@ pub const KVM_SEV_SNP_PAGE_TYPE_SECRETS_UAPI: u8 = KVM_SEV_SNP_PAGE_TYPE_SECRETS #[cfg(target_arch = "x86_64")] pub const KVM_SEV_SNP_PAGE_TYPE_CPUID_UAPI: u8 = KVM_SEV_SNP_PAGE_TYPE_CPUID as u8; +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum VmType { + Default, + #[cfg(target_arch = "x86_64")] + Snp, + #[cfg(target_arch = "aarch64")] + Realm { + ipa_bits: u8, + }, +} + #[cfg(target_arch = "x86_64")] #[derive(Debug, Copy, Clone, Eq, PartialEq)] pub enum SevSnpPageType { @@ -211,40 +224,6 @@ impl SevSnpPageType { } } -#[cfg(target_arch = "x86_64")] -#[derive(Debug, Copy, Clone, Eq, PartialEq)] -pub enum X86VmType { - Snp, -} - -#[cfg(target_arch = "x86_64")] -impl X86VmType { - const fn as_raw(self) -> libc::c_int { - match self { - X86VmType::Snp => KVM_X86_SNP_VM_UAPI, - } - } -} - -#[cfg(target_arch = "aarch64")] -#[derive(Debug, Copy, Clone, Eq, PartialEq)] -pub enum Aarch64VmType { - Realm { ipa_bits: u8 }, -} - -#[cfg(target_arch = "aarch64")] -impl Aarch64VmType { - const fn as_raw(self) -> libc::c_int { - match self { - Aarch64VmType::Realm { ipa_bits } => { - (KVM_VM_TYPE_ARM_REALM_UAPI 
- | ((ipa_bits as u64) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK_UAPI)) - as libc::c_int - } - } - } -} - #[cfg(target_arch = "aarch64")] #[repr(C)] #[derive(Debug, Copy, Clone, Default, Eq, PartialEq)] @@ -275,9 +254,8 @@ pub enum Error { ArmRmiPopulate(#[source] nix::Error), #[error("missing KVM capability: {0}")] MissingCapability(&'static str), - #[cfg(target_arch = "x86_64")] - #[error("unsupported x86 VM type: {0:?}")] - UnsupportedX86VmType(X86VmType), + #[error("unsupported KVM VM type: {0:?}")] + UnsupportedVmType(VmType), #[cfg(target_arch = "x86_64")] #[error("MemoryEncryptOp({command}, firmware_error={firmware_error:#x})")] MemoryEncryptOp { @@ -464,62 +442,48 @@ impl Kvm { unsafe { ioctl::kvm_check_extension(self.as_fd().as_raw_fd(), extension as i32) } } - pub fn check_private_memory_extensions(&self) -> Result<()> { - if self - .check_extension(KVM_CAP_USER_MEMORY2) - .map_err(Error::CheckExtension)? - == 0 - { - return Err(Error::MissingCapability("KVM_CAP_USER_MEMORY2")); - } - if self - .check_extension(KVM_CAP_GUEST_MEMFD) - .map_err(Error::CheckExtension)? - == 0 - { - return Err(Error::MissingCapability("KVM_CAP_GUEST_MEMFD")); - } - if self - .check_extension(KVM_CAP_MEMORY_ATTRIBUTES) - .map_err(Error::CheckExtension)? - & KVM_MEMORY_ATTRIBUTE_PRIVATE as libc::c_int - == 0 - { - return Err(Error::MissingCapability( - "KVM_CAP_MEMORY_ATTRIBUTES(KVM_MEMORY_ATTRIBUTE_PRIVATE)", - )); + pub fn new_vm(&self, vm_type: VmType) -> Result { + let raw_vm_type = self.raw_vm_type(vm_type)?; + self.new_vm_with_type(raw_vm_type) + } + + fn raw_vm_type(&self, vm_type: VmType) -> Result { + match vm_type { + VmType::Default => Ok(self.default_vm_type()), + #[cfg(target_arch = "x86_64")] + VmType::Snp => { + let supported_vm_types = + self.check_extension(KVM_CAP_VM_TYPES_UAPI) + .map_err(Error::CheckExtension)? 
as u64; + let raw_vm_type = KVM_X86_SNP_VM_UAPI; + let vm_type_bit = 1_u64 + .checked_shl(raw_vm_type as u32) + .ok_or(Error::UnsupportedVmType(vm_type))?; + if supported_vm_types & vm_type_bit == 0 { + return Err(Error::UnsupportedVmType(vm_type)); + } + Ok(raw_vm_type) + } + #[cfg(target_arch = "aarch64")] + VmType::Realm { ipa_bits } => Ok((KVM_VM_TYPE_ARM_REALM_UAPI + | ((ipa_bits as u64) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK_UAPI)) + as libc::c_int), } - Ok(()) } - pub fn new_vm(&self) -> Result { + fn default_vm_type(&self) -> libc::c_int { // On ARM, can request memory isolation which we don't use. // For that, include the `KVM_VM_TYPE_ARM_PROTECTED` flag. // Use 0 as the fallback machine type, which implies 40bit // IPA on ARM64, and on x86_64 is the only option. - let vm_type = self.check_extension(KVM_CAP_ARM_VM_IPA_SIZE).unwrap_or(0); - - self.new_vm_with_type(vm_type) - } - - #[cfg(target_arch = "x86_64")] - pub fn new_x86_vm(&self, vm_type: X86VmType) -> Result { - let supported_vm_types = self - .check_extension(KVM_CAP_VM_TYPES_UAPI) - .map_err(Error::CheckExtension)? as u64; - let raw_vm_type = vm_type.as_raw(); - let vm_type_bit = 1_u64 - .checked_shl(raw_vm_type as u32) - .ok_or(Error::UnsupportedX86VmType(vm_type))?; - if supported_vm_types & vm_type_bit == 0 { - return Err(Error::UnsupportedX86VmType(vm_type)); + #[cfg(target_arch = "aarch64")] + { + self.check_extension(KVM_CAP_ARM_VM_IPA_SIZE).unwrap_or(0) + } + #[cfg(not(target_arch = "aarch64"))] + { + 0 } - self.new_vm_with_type(raw_vm_type) - } - - #[cfg(target_arch = "aarch64")] - pub fn new_aarch64_vm(&self, vm_type: Aarch64VmType) -> Result { - self.new_vm_with_type(vm_type.as_raw()) } fn new_vm_with_type(&self, vm_type: libc::c_int) -> Result { @@ -595,37 +559,8 @@ pub struct Partition { } impl Partition { - pub fn check_private_memory_extensions(&self) -> Result<()> { - if self - .check_extension(KVM_CAP_USER_MEMORY2) - .map_err(Error::CheckExtension)? 
- == 0 - { - return Err(Error::MissingCapability("KVM_CAP_USER_MEMORY2")); - } - if self - .check_extension(KVM_CAP_GUEST_MEMFD) - .map_err(Error::CheckExtension)? - == 0 - { - return Err(Error::MissingCapability("KVM_CAP_GUEST_MEMFD")); - } - if self - .check_extension(KVM_CAP_MEMORY_ATTRIBUTES) - .map_err(Error::CheckExtension)? - & KVM_MEMORY_ATTRIBUTE_PRIVATE as libc::c_int - == 0 - { - return Err(Error::MissingCapability( - "KVM_CAP_MEMORY_ATTRIBUTES(KVM_MEMORY_ATTRIBUTE_PRIVATE)", - )); - } - Ok(()) - } - #[cfg(target_arch = "x86_64")] pub fn check_sev_snp_launch_extensions(&self) -> Result<()> { - self.check_private_memory_extensions()?; // SAFETY: This is the documented KVM_MEMORY_ENCRYPT_OP availability // probe, and does not pass any userspace data pointer to KVM. unsafe { ioctl::kvm_memory_encrypt_op_supported(self.vm.as_raw_fd()) }.map_err(|err| { @@ -634,8 +569,7 @@ impl Partition { firmware_error: 0, source: err, } - })?; - Ok(()) + }) } #[cfg(target_arch = "x86_64")] diff --git a/vmm_core/virt_kvm/src/arch/aarch64/mod.rs b/vmm_core/virt_kvm/src/arch/aarch64/mod.rs index 1205876fd6..55fb9b76e8 100644 --- a/vmm_core/virt_kvm/src/arch/aarch64/mod.rs +++ b/vmm_core/virt_kvm/src/arch/aarch64/mod.rs @@ -18,6 +18,7 @@ use crate::KvmRunVpError; use crate::gsi::GsiRouting; use crate::gsi::KvmIrqFdState; use crate::gsi::MsiRouteBuilder; +use crate::memory::KvmMemoryBackingMode; use aarch64defs::SystemReg; use aarch64defs::Vendor; use aarch64defs::gic::GicV2mRegister; @@ -239,7 +240,7 @@ impl Kvm { // Probe GIC version by creating a throwaway VM and attempting to // create a GICv3 device. If that fails, try GICv2. 
let kvm = kvm::Kvm::from(file); - let probe_vm = kvm.new_vm()?; + let probe_vm = kvm.new_vm(kvm::VmType::Default)?; let supports_gic_v3 = if probe_vm .test_create_device(kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V3) .is_ok() @@ -830,6 +831,14 @@ impl virt::ProtoPartition for KvmProtoPartition<'_> { let partition = Arc::new(KvmPartitionInner { kvm: self.vm, memory: Default::default(), + memory_backing_mode: KvmMemoryBackingMode::Userspace, + ram_ranges: config + .mem_layout + .ram() + .iter() + .map(|range| range.range) + .chain(config.mem_layout.vtl2_range()) + .collect(), hv1_enabled: self.config.hv_config.is_some(), gm: config.guest_memory.clone(), vps: self @@ -1145,7 +1154,7 @@ impl virt::Hypervisor for Kvm { _ => 40, }; - let vm = self.kvm.new_vm()?; + let vm = self.kvm.new_vm(kvm::VmType::Default)?; Ok(KvmProtoPartition { vm, diff --git a/vmm_core/virt_kvm/src/arch/x86_64/mod.rs b/vmm_core/virt_kvm/src/arch/x86_64/mod.rs index 1a1427cf42..a3e67ee3d3 100644 --- a/vmm_core/virt_kvm/src/arch/x86_64/mod.rs +++ b/vmm_core/virt_kvm/src/arch/x86_64/mod.rs @@ -17,6 +17,7 @@ use crate::KvmRunVpError; use crate::gsi::GsiRouting; use crate::gsi::KvmIrqFdState; use crate::gsi::MsiRouteBuilder; +use crate::memory::KvmMemoryBackingMode; use guestmem::DoorbellRegistration; use guestmem::GuestMemory; use guestmem::GuestMemoryError; @@ -349,7 +350,7 @@ impl virt::Hypervisor for Kvm { } } - let vm = self.kvm.new_vm()?; + let vm = self.kvm.new_vm(kvm::VmType::Default)?; vm.enable_split_irqchip(virt::irqcon::IRQ_LINES as u32)?; vm.enable_x2apic_api()?; vm.enable_unknown_msr_exits()?; @@ -456,6 +457,14 @@ impl ProtoPartition for KvmProtoPartition<'_> { let partition = Arc::new(KvmPartitionInner { kvm: self.vm, memory: Default::default(), + memory_backing_mode: KvmMemoryBackingMode::Userspace, + ram_ranges: config + .mem_layout + .ram() + .iter() + .map(|range| range.range) + .chain(config.mem_layout.vtl2_range()) + .collect(), hv1_enabled: self.config.hv_config.is_some(), gm: 
config.guest_memory.clone(), vps: self @@ -1492,20 +1501,17 @@ impl<'p> Processor for KvmProcessor<'p> { } kvm::Exit::Hypercall { nr, - args, + args: _, result, flags, } => { // This is only reachable for hypercall exits explicitly // enabled on the VM. Later SNP support enables // KVM_HC_MAP_GPA_RANGE and handles it here. - tracelimit::error_ratelimited!( - nr, - ?args, - flags, - "unhandled KVM hypercall" - ); *result = 1; + return Err( + dev.fatal_error(KvmRunVpError::UnhandledHypercall { nr, flags }.into()) + ); } kvm::Exit::Debug { exception: _, diff --git a/vmm_core/virt_kvm/src/lib.rs b/vmm_core/virt_kvm/src/lib.rs index bb0e148f80..460ea53b49 100644 --- a/vmm_core/virt_kvm/src/lib.rs +++ b/vmm_core/virt_kvm/src/lib.rs @@ -17,7 +17,9 @@ pub use arch::Kvm; use guestmem::GuestMemory; use inspect::Inspect; +use memory::KvmMemoryBackingMode; use memory::KvmMemoryRangeState; +use memory_range::MemoryRange; use parking_lot::Mutex; use std::sync::Arc; use thiserror::Error; @@ -53,6 +55,12 @@ pub enum KvmError { State(#[from] Box>), #[error("invalid state while restoring: {0}")] InvalidState(&'static str), + #[error("unsupported isolation configuration: {0}")] + UnsupportedIsolationConfiguration(&'static str), + #[error("cannot resize KVM guest_memfd memory slot")] + CannotResizeGuestMemfdSlot, + #[error("private memory range is not contained in guest_memfd private memory")] + InvalidPrivateMemoryRange, #[error("misaligned gic base address")] Misaligned, #[error("host does not support GICv2 or GICv3")] @@ -86,6 +94,9 @@ struct KvmPartitionInner { #[inspect(skip)] kvm: kvm::Partition, memory: Mutex, + memory_backing_mode: KvmMemoryBackingMode, + #[inspect(iter_by_index)] + ram_ranges: Vec, hv1_enabled: bool, gm: GuestMemory, #[inspect(skip)] @@ -132,6 +143,9 @@ enum KvmRunVpError { #[error("unhandled system event type: {0:#x}")] UnhandledSystemEvent(u32), #[cfg(guest_arch = "x86_64")] + #[error("unhandled KVM hypercall: nr={nr:#x}, flags={flags:#x}")] + 
UnhandledHypercall { nr: u64, flags: u64 }, + #[cfg(guest_arch = "x86_64")] #[error("failed to inject an extint interrupt")] ExtintInterrupt(#[source] kvm::Error), } diff --git a/vmm_core/virt_kvm/src/memory.rs b/vmm_core/virt_kvm/src/memory.rs index 92f4eabacd..6b820ff7a2 100644 --- a/vmm_core/virt_kvm/src/memory.rs +++ b/vmm_core/virt_kvm/src/memory.rs @@ -1,16 +1,20 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +use crate::KvmError; use crate::KvmPartition; use crate::KvmPartitionInner; use inspect::Inspect; use memory_range::MemoryRange; +use std::fs::File; use std::sync::Arc; #[derive(Debug, Inspect)] pub(crate) struct KvmMemoryRange { host_addr: *mut u8, range: MemoryRange, + guest_memfd_offset: Option, + private_attributes_set: bool, } unsafe impl Sync for KvmMemoryRange {} @@ -22,6 +26,65 @@ pub(crate) struct KvmMemoryRangeState { pub(crate) ranges: Vec>, } +#[derive(Debug, Inspect)] +#[inspect(external_tag)] +pub(crate) enum KvmMemoryBackingMode { + Userspace, + GuestMemfd(KvmGuestMemfdBacking), +} + +#[derive(Debug, Inspect)] +pub(crate) struct KvmGuestMemfdBacking { + #[inspect(skip)] + file: File, + #[inspect(iter_by_index)] + ranges: Vec, + initial_private: bool, +} + +#[derive(Debug, Copy, Clone, Eq, PartialEq, Inspect)] +struct KvmGuestMemfdRange { + range: MemoryRange, + file_offset: u64, +} + +#[derive(Debug)] +enum KvmMemoryBacking<'a> { + Userspace, + GuestMemfd { + file: &'a File, + file_offset: u64, + initial_private: bool, + }, +} + +impl KvmMemoryBackingMode { + #[expect(dead_code)] + pub(crate) fn guest_memfd( + kvm: &kvm::Partition, + ram_ranges: impl IntoIterator, + initial_private: bool, + ) -> Result { + check_private_memory_extensions(kvm)?; + + let mut file_size = 0u64; + let mut ranges = Vec::new(); + for range in ram_ranges { + ranges.push(KvmGuestMemfdRange { + range, + file_offset: file_size, + }); + file_size += range.len(); + } + + Ok(Self::GuestMemfd(KvmGuestMemfdBacking { + file: 
kvm.create_guest_memfd(file_size)?, + ranges, + initial_private, + })) + } +} + impl KvmPartitionInner { /// # Safety /// @@ -34,6 +97,8 @@ impl KvmPartitionInner { addr: u64, readonly: bool, ) -> anyhow::Result<()> { + let range = MemoryRange::new(addr..addr + size as u64); + let backing = self.memory_backing(range)?; let mut state = self.memory.lock(); // Memory slots cannot be resized but can be moved within the guest @@ -54,16 +119,186 @@ impl KvmPartitionInner { state.ranges.push(None); } let slot_to_use = slot_to_use.unwrap(); - unsafe { - self.kvm - .set_user_memory_region(slot_to_use as u32, data, size, addr, readonly)? + if let Some(existing_range) = &state.ranges[slot_to_use] { + if existing_range.guest_memfd_offset.is_some() + && existing_range.range.len() != size as u64 + { + return Err(KvmError::CannotResizeGuestMemfdSlot.into()); + } + if existing_range.private_attributes_set { + self.kvm.set_memory_attributes( + existing_range.range.start(), + existing_range.range.len(), + 0, + )?; + } + if existing_range.guest_memfd_offset.is_some() { + // SAFETY: clearing a slot removes the memory reference. + unsafe { self.clear_slot(slot_to_use, true)? }; + state.ranges[slot_to_use] = None; + } + } + let (guest_memfd_offset, private_attributes_set) = match backing { + KvmMemoryBacking::Userspace => { + // SAFETY: `map_region` requires its caller to keep + // `data..data+size` valid until this guest-physical range is + // unmapped or the partition is destroyed. + unsafe { + self.kvm.set_user_memory_region( + slot_to_use as u32, + data, + size, + addr, + readonly, + )? + }; + (None, false) + } + KvmMemoryBacking::GuestMemfd { + file, + file_offset, + initial_private, + } => { + // SAFETY: `map_region` requires its caller to keep + // `data..data+size` valid until this guest-physical range is + // unmapped or the partition is destroyed. `memory_backing` + // borrows the guest_memfd file from the partition, which owns it + // for at least as long as KVM references it.
+ unsafe { + self.kvm.set_user_memory_region2( + slot_to_use as u32, + data, + size, + addr, + readonly, + Some((file, file_offset)), + )?; + }; + if initial_private { + if let Err(err) = self.kvm.set_memory_attributes( + addr, + size as u64, + kvm::KVM_MEMORY_ATTRIBUTE_PRIVATE as u64, + ) { + // SAFETY: clearing a slot removes the memory reference. + unsafe { self.clear_slot(slot_to_use, true)? }; + state.ranges[slot_to_use] = None; + return Err(err.into()); + } + } + (Some(file_offset), initial_private) + } }; state.ranges[slot_to_use] = Some(KvmMemoryRange { host_addr: data, - range: MemoryRange::new(addr..addr + size as u64), + range, + guest_memfd_offset, + private_attributes_set, }); Ok(()) } + + fn memory_backing(&self, range: MemoryRange) -> Result, KvmError> { + match &self.memory_backing_mode { + KvmMemoryBackingMode::Userspace => Ok(KvmMemoryBacking::Userspace), + KvmMemoryBackingMode::GuestMemfd(backing) => { + match classify_guest_memfd_backing(range, &backing.ranges)? { + Some(file_offset) => Ok(KvmMemoryBacking::GuestMemfd { + file: &backing.file, + file_offset, + initial_private: backing.initial_private, + }), + None => Ok(KvmMemoryBacking::Userspace), + } + } + } + } + + /// # Safety + /// + /// The caller must ensure that clearing the target slot is valid. + unsafe fn clear_slot(&self, slot: usize, guest_memfd_backed: bool) -> Result<(), kvm::Error> { + if guest_memfd_backed { + // SAFETY: the caller ensures clearing this slot is valid. + unsafe { + self.kvm.set_user_memory_region2( + slot as u32, + std::ptr::null_mut(), + 0, + 0, + false, + None, + ) + } + } else { + // SAFETY: the caller ensures clearing this slot is valid. 
+ unsafe { + self.kvm + .set_user_memory_region(slot as u32, std::ptr::null_mut(), 0, 0, false) + } + } + } +} + +fn check_private_memory_extensions(kvm: &kvm::Partition) -> Result<(), KvmError> { + require_kvm_extension(kvm, kvm::KVM_CAP_USER_MEMORY2, "KVM_CAP_USER_MEMORY2")?; + require_kvm_extension(kvm, kvm::KVM_CAP_GUEST_MEMFD, "KVM_CAP_GUEST_MEMFD")?; + let memory_attributes = require_kvm_extension( + kvm, + kvm::KVM_CAP_MEMORY_ATTRIBUTES, + "KVM_CAP_MEMORY_ATTRIBUTES", + )?; + if memory_attributes as u64 & kvm::KVM_MEMORY_ATTRIBUTE_PRIVATE as u64 == 0 { + return Err(kvm::Error::MissingCapability( + "KVM_CAP_MEMORY_ATTRIBUTES(KVM_MEMORY_ATTRIBUTE_PRIVATE)", + ) + .into()); + } + Ok(()) +} + +fn require_kvm_extension( + kvm: &kvm::Partition, + extension: u32, + capability: &'static str, +) -> Result { + let value = kvm + .check_extension(extension) + .map_err(kvm::Error::CheckExtension)?; + if value == 0 { + return Err(kvm::Error::MissingCapability(capability).into()); + } + Ok(value) +} + +fn classify_guest_memfd_backing( + range: MemoryRange, + ram_ranges: &[KvmGuestMemfdRange], +) -> Result, KvmError> { + let mut containing_ranges = ram_ranges + .iter() + .filter(|ram_range| ram_range.range.contains(&range)); + if let Some(ram_range) = containing_ranges.next() { + if containing_ranges.next().is_some() { + return Err(KvmError::UnsupportedIsolationConfiguration( + "KVM guest_memfd mappings must be contained in exactly one RAM range", + )); + } + return Ok(Some( + ram_range.file_offset + (range.start() - ram_range.range.start()), + )); + } + + if ram_ranges + .iter() + .any(|ram_range| ram_range.range.overlaps(&range)) + { + return Err(KvmError::UnsupportedIsolationConfiguration( + "KVM guest_memfd mappings must be fully contained in one RAM range", + )); + } + + Ok(None) } impl virt::PartitionMemoryMapper for KvmPartition { @@ -83,7 +318,10 @@ impl virt::PartitionMemoryMap for KvmPartitionInner { writable: bool, _exec: bool, ) -> anyhow::Result<()> { - // 
SAFETY: guaranteed by caller. + // SAFETY: `PartitionMemoryMap::map_range` requires the caller to keep + // `data..data+size` valid for the lifetime of the mapping. `map_region` + // preserves that lifetime requirement and records the mapped range so + // it can be cleared on unmap. unsafe { self.map_region(data, size, addr, !writable) } } @@ -93,17 +331,17 @@ impl virt::PartitionMemoryMap for KvmPartitionInner { for (slot, entry) in state.ranges.iter_mut().enumerate() { let Some(kvm_range) = entry else { continue }; if range.contains(&kvm_range.range) { - // SAFETY: clearing a slot should always be safe since it removes - // and does not add memory references. - unsafe { - self.kvm.set_user_memory_region( - slot as u32, - std::ptr::null_mut(), - 0, + let guest_memfd_backed = kvm_range.guest_memfd_offset.is_some(); + if kvm_range.private_attributes_set { + self.kvm.set_memory_attributes( + kvm_range.range.start(), + kvm_range.range.len(), 0, - false, )?; } + // SAFETY: clearing a slot should always be safe since it removes + // and does not add memory references. + unsafe { self.clear_slot(slot, guest_memfd_backed)? 
}; *entry = None; } else { assert!( @@ -115,3 +353,149 @@ impl virt::PartitionMemoryMap for KvmPartitionInner { Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + + fn range(start: u64, end: u64) -> MemoryRange { + MemoryRange::new(start..end) + } + + fn guest_memfd_ranges(ranges: &[MemoryRange]) -> Vec { + let mut file_offset = 0; + ranges + .iter() + .map(|&range| { + let guest_memfd_range = KvmGuestMemfdRange { range, file_offset }; + file_offset += range.len(); + guest_memfd_range + }) + .collect() + } + + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + struct KvmPrivateMemoryRange { + gpa: MemoryRange, + hva: *mut u8, + } + + fn private_memory_range_from_slots( + range: MemoryRange, + slots: &[Option], + ) -> Result { + let slot = slots + .iter() + .flatten() + .find(|slot| slot.range.contains(&range)) + .ok_or(KvmError::InvalidPrivateMemoryRange)?; + + if slot.guest_memfd_offset.is_none() || !slot.private_attributes_set { + return Err(KvmError::InvalidPrivateMemoryRange); + } + + let offset = range.start() - slot.range.start(); + Ok(KvmPrivateMemoryRange { + gpa: range, + hva: slot.host_addr.wrapping_add(offset as usize), + }) + } + + #[test] + fn guest_memfd_classifier_selects_contained_ram() { + let ram_ranges = guest_memfd_ranges(&[range(0x1000, 0x9000), range(0x1_0000, 0x2_0000)]); + + assert_eq!( + classify_guest_memfd_backing(range(0x2000, 0x4000), &ram_ranges).unwrap(), + Some(0x1000) + ); + assert_eq!( + classify_guest_memfd_backing(range(0x1_1000, 0x1_3000), &ram_ranges).unwrap(), + Some(0x9000) + ); + } + + #[test] + fn guest_memfd_classifier_keeps_non_ram_userspace() { + let ram_ranges = guest_memfd_ranges(&[range(0x1000, 0x9000), range(0x1_0000, 0x2_0000)]); + + assert_eq!( + classify_guest_memfd_backing(range(0xa000, 0xc000), &ram_ranges).unwrap(), + None + ); + } + + #[test] + fn guest_memfd_classifier_rejects_partial_ram_overlap() { + let ram_ranges = guest_memfd_ranges(&[range(0x1000, 0x9000), range(0x1_0000, 0x2_0000)]); + + 
assert!(matches!( + classify_guest_memfd_backing(range(0x8000, 0xa000), &ram_ranges), + Err(KvmError::UnsupportedIsolationConfiguration(_)) + )); + } + + #[test] + fn guest_memfd_classifier_does_not_merge_adjacent_ram_ranges() { + let ram_ranges = guest_memfd_ranges(&[range(0x1000, 0x3000), range(0x3000, 0x5000)]); + + assert!(matches!( + classify_guest_memfd_backing(range(0x2000, 0x4000), &ram_ranges), + Err(KvmError::UnsupportedIsolationConfiguration(_)) + )); + } + + #[test] + fn guest_memfd_classifier_rejects_ambiguous_ram_containment() { + let ram_ranges = guest_memfd_ranges(&[range(0x1000, 0x5000), range(0x2000, 0x4000)]); + + assert!(matches!( + classify_guest_memfd_backing(range(0x2000, 0x4000), &ram_ranges), + Err(KvmError::UnsupportedIsolationConfiguration(_)) + )); + } + + #[test] + fn private_memory_range_resolves_hva_offset() { + let mut backing = vec![0u8; 0x4000]; + let host_addr = backing.as_mut_ptr(); + let slots = [Some(KvmMemoryRange { + host_addr, + range: range(0x1000, 0x5000), + guest_memfd_offset: Some(0), + private_attributes_set: true, + })]; + + let resolved = private_memory_range_from_slots(range(0x3000, 0x5000), &slots).unwrap(); + + assert_eq!(resolved.gpa, range(0x3000, 0x5000)); + assert_eq!(resolved.hva, host_addr.wrapping_add(0x2000)); + } + + #[test] + fn private_memory_range_rejects_non_private_or_non_guest_memfd_slots() { + let mut backing = vec![0u8; 0x4000]; + let host_addr = backing.as_mut_ptr(); + let userspace_slots = [Some(KvmMemoryRange { + host_addr, + range: range(0x1000, 0x5000), + guest_memfd_offset: None, + private_attributes_set: true, + })]; + assert!(matches!( + private_memory_range_from_slots(range(0x1000, 0x2000), &userspace_slots), + Err(KvmError::InvalidPrivateMemoryRange) + )); + + let shared_slots = [Some(KvmMemoryRange { + host_addr, + range: range(0x1000, 0x5000), + guest_memfd_offset: Some(0), + private_attributes_set: false, + })]; + assert!(matches!( + private_memory_range_from_slots(range(0x1000, 
0x2000), &shared_slots), + Err(KvmError::InvalidPrivateMemoryRange) + )); + } +}