Skip to content

Commit 57812bf

Browse files
committed
feat(vmm): configure kvm userfault if secret free is enabled
This is needed to instruct the kernel to exit to userspace when a vCPU fault occurs and the corresponding bit in the userfault bitmap is set. The userfault bitmap is allocated in a memfd by Firecracker and sent to the UFFD handler.

This also sends 3 fds to the UFFD handler in the handshake:
- UFFD (original)
- guest_memfd: for the handler to be able to populate guest memory
- userfault bitmap memfd: for the handler to be able to disable exits to userspace for the pages that have already been populated

Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
1 parent f07f75c commit 57812bf

File tree

3 files changed

+220
-62
lines changed

3 files changed

+220
-62
lines changed

src/vmm/src/builder.rs

Lines changed: 161 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
//! Enables pre-boot setup, instantiation and booting of a Firecracker VMM.
55
66
use std::fmt::Debug;
7+
use std::fs::File;
78
use std::io;
8-
use std::os::fd::AsFd;
9+
use std::os::fd::{AsFd, AsRawFd};
910
use std::os::unix::fs::MetadataExt;
1011
#[cfg(feature = "gdb")]
1112
use std::sync::mpsc;
@@ -14,7 +15,6 @@ use std::sync::{Arc, Mutex};
1415
use event_manager::SubscriberOps;
1516
use kvm_ioctls::Cap;
1617
use linux_loader::cmdline::Cmdline as LoaderKernelCmdline;
17-
use userfaultfd::Uffd;
1818
use utils::time::TimestampUs;
1919
use vm_allocator::AllocPolicy;
2020
use vm_memory::GuestAddress;
@@ -29,6 +29,7 @@ use crate::cpu_config::templates::{
2929
};
3030
#[cfg(target_arch = "x86_64")]
3131
use crate::device_manager;
32+
use crate::device_manager::acpi::ACPIDeviceError;
3233
use crate::device_manager::pci_mngr::PciManagerError;
3334
use crate::device_manager::{
3435
AttachDeviceError, DeviceManager, DeviceManagerCreateError, DevicePersistError,
@@ -45,16 +46,20 @@ use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend};
4546
use crate::gdb;
4647
use crate::initrd::{InitrdConfig, InitrdError};
4748
use crate::logger::debug;
48-
use crate::persist::{MicrovmState, MicrovmStateError};
49+
use crate::persist::{
50+
GuestMemoryFromFileError, GuestMemoryFromUffdError, MicrovmState, MicrovmStateError,
51+
guest_memory_from_file, guest_memory_from_uffd,
52+
};
4953
use crate::resources::VmResources;
5054
use crate::seccomp::BpfThreadMap;
5155
use crate::snapshot::Persist;
5256
use crate::utils::{mib_to_bytes, u64_to_usize};
5357
use crate::vmm_config::instance_info::InstanceInfo;
5458
use crate::vmm_config::machine_config::MachineConfigError;
5559
use crate::vmm_config::memory_hotplug::MemoryHotplugConfig;
60+
use crate::vmm_config::snapshot::{LoadSnapshotParams, MemBackendType};
5661
use crate::vstate::kvm::{Kvm, KvmError};
57-
use crate::vstate::memory::{GuestRegionMmap, MaybeBounce};
62+
use crate::vstate::memory::{GuestMemoryState, MaybeBounce, bitmap_size, create_memfd};
5863
#[cfg(target_arch = "aarch64")]
5964
use crate::vstate::resources::ResourceAllocator;
6065
use crate::vstate::vcpu::VcpuError;
@@ -382,6 +387,7 @@ pub fn build_microvm_for_boot(
382387
kvm,
383388
vm,
384389
uffd: None,
390+
uffd_socket: None,
385391
vcpus_handles: Vec::new(),
386392
vcpus_exit_evt,
387393
device_manager,
@@ -455,6 +461,17 @@ pub fn build_and_boot_microvm(
455461
Ok(vmm)
456462
}
457463

464+
/// Sub-Error type for [`build_microvm_from_snapshot`] to contain either
465+
/// [`GuestMemoryFromFileError`] or [`GuestMemoryFromUffdError`] within
466+
/// [`BuildMicrovmFromSnapshotError`].
467+
// Wrapper that lets the two guest-memory loading failures (file-backed and
// UFFD-backed) travel as a single `#[from]` source inside
// `BuildMicrovmFromSnapshotError`.
//
// NOTE: `displaydoc::Display` builds each variant's `Display` output from the
// `///` line directly above it (`{0}` is the wrapped error), so those doc
// comments are user-visible text and must not be reworded casually.
#[derive(Debug, thiserror::Error, displaydoc::Display)]
468+
pub enum BuildMicrovmFromSnapshotErrorGuestMemoryError {
469+
/// Error creating guest memory from file: {0}
470+
// Produced when restoring memory via `guest_memory_from_file`.
File(#[from] GuestMemoryFromFileError),
471+
/// Error creating guest memory from uffd: {0}
472+
// Produced when restoring memory via `guest_memory_from_uffd`.
Uffd(#[from] GuestMemoryFromUffdError),
473+
}
474+
458475
/// Error type for [`build_microvm_from_snapshot`].
459476
#[derive(Debug, thiserror::Error, displaydoc::Display)]
460477
pub enum BuildMicrovmFromSnapshotError {
@@ -490,6 +507,55 @@ pub enum BuildMicrovmFromSnapshotError {
490507
SeccompFiltersInternal(#[from] crate::seccomp::InstallationError),
491508
/// Failed to restore devices: {0}
492509
RestoreDevices(#[from] DevicePersistError),
510+
/// Failed to restore ACPI device manager: {0}
511+
ACPIDeviManager(#[from] ACPIDeviceError),
512+
/// VMGenID update failed: {0}
513+
VMGenIDUpdate(std::io::Error),
514+
/// Internal error while restoring microVM: {0}
515+
Internal(#[from] VmmError),
516+
/// Failed to load guest memory: {0}
517+
GuestMemory(#[from] BuildMicrovmFromSnapshotErrorGuestMemoryError),
518+
/// Userfault bitmap memfd error: {0}
519+
UserfaultBitmapMemfd(#[from] crate::vstate::memory::MemoryError),
520+
}
521+
522+
fn memfd_to_slice(memfd: &mut Option<File>) -> Option<&mut [u8]> {
523+
if let Some(bitmap_file) = memfd {
524+
let len = u64_to_usize(
525+
bitmap_file
526+
.metadata()
527+
.expect("Failed to get metadata")
528+
.len(),
529+
);
530+
531+
// SAFETY: the arguments to mmap cannot cause any memory unsafety in the rust sense
532+
let bitmap_addr = unsafe {
533+
libc::mmap(
534+
std::ptr::null_mut(),
535+
len,
536+
libc::PROT_WRITE,
537+
libc::MAP_SHARED,
538+
bitmap_file.as_raw_fd(),
539+
0,
540+
)
541+
};
542+
543+
if bitmap_addr == libc::MAP_FAILED {
544+
panic!(
545+
"Failed to mmap userfault bitmap file: {}",
546+
std::io::Error::last_os_error()
547+
);
548+
}
549+
550+
// SAFETY: `bitmap_addr` is a valid memory address returned by `mmap`.
551+
Some(unsafe { std::slice::from_raw_parts_mut(bitmap_addr.cast(), len) })
552+
} else {
553+
None
554+
}
555+
}
556+
557+
fn memory_size_from_mem_state(mem_state: &GuestMemoryState) -> usize {
558+
mem_state.regions.iter().map(|region| region.size).sum()
493559
}
494560

495561
/// Builds and starts a microVM based on the provided MicrovmState.
@@ -501,26 +567,108 @@ pub fn build_microvm_from_snapshot(
501567
instance_info: &InstanceInfo,
502568
event_manager: &mut EventManager,
503569
microvm_state: MicrovmState,
504-
guest_memory: Vec<GuestRegionMmap>,
505-
uffd: Option<Uffd>,
506570
seccomp_filters: &BpfThreadMap,
571+
params: &LoadSnapshotParams,
507572
vm_resources: &mut VmResources,
508573
) -> Result<Arc<Mutex<Vmm>>, BuildMicrovmFromSnapshotError> {
574+
// TODO: take it from kvm-bindings when userfault support is merged upstream
575+
const KVM_CAP_USERFAULT: u32 = 246;
576+
509577
// Build Vmm.
510578
debug!("event_start: build microvm from snapshot");
511579

512-
let kvm = Kvm::new(microvm_state.kvm_state.kvm_cap_modifiers.clone())
513-
.map_err(StartMicrovmError::Kvm)?;
580+
let secret_free = vm_resources.machine_config.secret_free;
581+
let mut kvm_capabilities = microvm_state.kvm_state.kvm_cap_modifiers.clone();
582+
if secret_free {
583+
kvm_capabilities.push(KvmCapability::Add(Cap::GuestMemfd as u32));
584+
kvm_capabilities.push(KvmCapability::Add(KVM_CAP_GUEST_MEMFD_MMAP));
585+
kvm_capabilities.push(KvmCapability::Add(KVM_CAP_GUEST_MEMFD_NO_DIRECT_MAP));
586+
kvm_capabilities.push(KvmCapability::Add(KVM_CAP_USERFAULT));
587+
}
588+
589+
let kvm = Kvm::new(kvm_capabilities).map_err(StartMicrovmError::Kvm)?;
590+
514591
// Set up Kvm Vm and register memory regions.
515592
// Build custom CPU config if a custom template is provided.
516-
let mut vm = Vm::new(&kvm, false).map_err(StartMicrovmError::Vm)?;
593+
let mut vm = Vm::new(&kvm, secret_free).map_err(StartMicrovmError::Vm)?;
517594

518595
let (mut vcpus, vcpus_exit_evt) = vm
519596
.create_vcpus(vm_resources.machine_config.vcpu_count)
520597
.map_err(StartMicrovmError::Vm)?;
521598

522-
vm.restore_memory_regions(guest_memory, &microvm_state.vm_state.memory, None)
523-
.map_err(StartMicrovmError::Vm)?;
599+
let guest_memfd = match secret_free {
600+
true => Some(
601+
vm.create_guest_memfd(
602+
memory_size_from_mem_state(&microvm_state.vm_state.memory),
603+
GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_NO_DIRECT_MAP,
604+
)
605+
.map_err(VmmError::Vm)?,
606+
),
607+
false => None,
608+
};
609+
610+
let mut userfault_bitmap_memfd = if secret_free {
611+
let bitmap_size =
612+
bitmap_size(memory_size_from_mem_state(&microvm_state.vm_state.memory) as u64);
613+
let bitmap_file = create_memfd(bitmap_size, None)?;
614+
615+
Some(bitmap_file.into_file())
616+
} else {
617+
None
618+
};
619+
620+
let mem_backend_path = &params.mem_backend.backend_path;
621+
let mem_state = &microvm_state.vm_state.memory;
622+
let track_dirty_pages = params.track_dirty_pages;
623+
624+
let (guest_memory, uffd, uffd_socket) = match params.mem_backend.backend_type {
625+
MemBackendType::File => {
626+
if vm_resources.machine_config.huge_pages.is_hugetlbfs() {
627+
return Err(BuildMicrovmFromSnapshotErrorGuestMemoryError::File(
628+
GuestMemoryFromFileError::HugetlbfsSnapshot,
629+
)
630+
.into());
631+
}
632+
(
633+
guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages)
634+
.map_err(BuildMicrovmFromSnapshotErrorGuestMemoryError::File)?,
635+
None,
636+
None,
637+
)
638+
}
639+
MemBackendType::Uffd => {
640+
if vm_resources.machine_config.huge_pages.is_hugetlbfs() && guest_memfd.is_some() {
641+
return Err(BuildMicrovmFromSnapshotErrorGuestMemoryError::Uffd(
642+
GuestMemoryFromUffdError::HugetlbfsSnapshot,
643+
)
644+
.into());
645+
}
646+
guest_memory_from_uffd(
647+
mem_backend_path,
648+
mem_state,
649+
track_dirty_pages,
650+
vm_resources.machine_config.huge_pages,
651+
guest_memfd,
652+
userfault_bitmap_memfd.as_ref(),
653+
)
654+
.map_err(BuildMicrovmFromSnapshotErrorGuestMemoryError::Uffd)?
655+
}
656+
};
657+
658+
let mut userfault_bitmap_slice = memfd_to_slice(&mut userfault_bitmap_memfd);
659+
if let Some(ref mut slice) = userfault_bitmap_slice {
660+
// Set all bits so a fault on any page will cause a VM exit
661+
slice.fill(0xffu8);
662+
}
663+
664+
let userfault_bitmap: Option<u64> = userfault_bitmap_slice.map(|s| s.as_ptr() as u64);
665+
666+
vm.restore_memory_regions(
667+
guest_memory,
668+
&microvm_state.vm_state.memory,
669+
userfault_bitmap,
670+
)
671+
.map_err(StartMicrovmError::Vm)?;
524672

525673
#[cfg(target_arch = "x86_64")]
526674
{
@@ -582,6 +730,7 @@ pub fn build_microvm_from_snapshot(
582730
kvm,
583731
vm,
584732
uffd,
733+
uffd_socket,
585734
vcpus_handles: Vec::new(),
586735
vcpus_exit_evt,
587736
device_manager,
@@ -921,6 +1070,7 @@ pub(crate) mod tests {
9211070
kvm,
9221071
vm: Arc::new(vm),
9231072
uffd: None,
1073+
uffd_socket: None,
9241074
vcpus_handles: Vec::new(),
9251075
vcpus_exit_evt,
9261076
device_manager: default_device_manager(),

src/vmm/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ pub mod initrd;
119119
use std::collections::HashMap;
120120
use std::io;
121121
use std::os::unix::io::AsRawFd;
122+
use std::os::unix::net::UnixStream;
122123
use std::sync::mpsc::RecvTimeoutError;
123124
use std::sync::{Arc, Barrier, Mutex};
124125
use std::time::Duration;
@@ -307,6 +308,8 @@ pub struct Vmm {
307308
// Save UFFD in order to keep it open in the Firecracker process, as well.
308309
#[allow(unused)]
309310
uffd: Option<Uffd>,
311+
// Used for userfault communication with the UFFD handler when secret freedom is enabled
312+
uffd_socket: Option<UnixStream>,
310313
/// Handles to the vcpu threads with vcpu_fds inside them.
311314
pub vcpus_handles: Vec<VcpuHandle>,
312315
// Used by Vcpus and devices to initiate teardown; Vmm should never write here.

0 commit comments

Comments
 (0)