44//! Enables pre-boot setup, instantiation and booting of a Firecracker VMM.
55
66use std:: fmt:: Debug ;
7+ use std:: fs:: File ;
78use std:: io;
8- use std:: os:: fd:: AsFd ;
9+ use std:: os:: fd:: { AsFd , AsRawFd } ;
910use std:: os:: unix:: fs:: MetadataExt ;
1011#[ cfg( feature = "gdb" ) ]
1112use std:: sync:: mpsc;
@@ -14,7 +15,6 @@ use std::sync::{Arc, Mutex};
1415use event_manager:: SubscriberOps ;
1516use kvm_ioctls:: Cap ;
1617use linux_loader:: cmdline:: Cmdline as LoaderKernelCmdline ;
17- use userfaultfd:: Uffd ;
1818use utils:: time:: TimestampUs ;
1919use vm_allocator:: AllocPolicy ;
2020use vm_memory:: GuestAddress ;
@@ -29,6 +29,7 @@ use crate::cpu_config::templates::{
2929} ;
3030#[ cfg( target_arch = "x86_64" ) ]
3131use crate :: device_manager;
32+ use crate :: device_manager:: acpi:: ACPIDeviceError ;
3233use crate :: device_manager:: pci_mngr:: PciManagerError ;
3334use crate :: device_manager:: {
3435 AttachDeviceError , DeviceManager , DeviceManagerCreateError , DevicePersistError ,
@@ -45,16 +46,20 @@ use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend};
4546use crate :: gdb;
4647use crate :: initrd:: { InitrdConfig , InitrdError } ;
4748use crate :: logger:: debug;
48- use crate :: persist:: { MicrovmState , MicrovmStateError } ;
49+ use crate :: persist:: {
50+ GuestMemoryFromFileError , GuestMemoryFromUffdError , MicrovmState , MicrovmStateError ,
51+ guest_memory_from_file, guest_memory_from_uffd,
52+ } ;
4953use crate :: resources:: VmResources ;
5054use crate :: seccomp:: BpfThreadMap ;
5155use crate :: snapshot:: Persist ;
5256use crate :: utils:: { mib_to_bytes, u64_to_usize} ;
5357use crate :: vmm_config:: instance_info:: InstanceInfo ;
5458use crate :: vmm_config:: machine_config:: MachineConfigError ;
5559use crate :: vmm_config:: memory_hotplug:: MemoryHotplugConfig ;
60+ use crate :: vmm_config:: snapshot:: { LoadSnapshotParams , MemBackendType } ;
5661use crate :: vstate:: kvm:: { Kvm , KvmError } ;
57- use crate :: vstate:: memory:: { GuestRegionMmap , MaybeBounce } ;
62+ use crate :: vstate:: memory:: { GuestMemoryState , MaybeBounce , bitmap_size , create_memfd } ;
5863#[ cfg( target_arch = "aarch64" ) ]
5964use crate :: vstate:: resources:: ResourceAllocator ;
6065use crate :: vstate:: vcpu:: VcpuError ;
@@ -382,6 +387,7 @@ pub fn build_microvm_for_boot(
382387 kvm,
383388 vm,
384389 uffd : None ,
390+ uffd_socket : None ,
385391 vcpus_handles : Vec :: new ( ) ,
386392 vcpus_exit_evt,
387393 device_manager,
@@ -455,6 +461,17 @@ pub fn build_and_boot_microvm(
455461 Ok ( vmm)
456462}
457463
464+ /// Sub-Error type for [`build_microvm_from_snapshot`] to contain either
465+ /// [`GuestMemoryFromFileError`] or [`GuestMemoryFromUffdError`] within
466+ /// [`BuildMicrovmFromSnapshotError`].
467+ #[ derive( Debug , thiserror:: Error , displaydoc:: Display ) ]
468+ pub enum BuildMicrovmFromSnapshotErrorGuestMemoryError {
469+ /// Error creating guest memory from file: {0}
470+ File ( #[ from] GuestMemoryFromFileError ) ,
471+ /// Error creating guest memory from uffd: {0}
472+ Uffd ( #[ from] GuestMemoryFromUffdError ) ,
473+ }
474+
458475/// Error type for [`build_microvm_from_snapshot`].
459476#[ derive( Debug , thiserror:: Error , displaydoc:: Display ) ]
460477pub enum BuildMicrovmFromSnapshotError {
@@ -490,6 +507,55 @@ pub enum BuildMicrovmFromSnapshotError {
490507 SeccompFiltersInternal ( #[ from] crate :: seccomp:: InstallationError ) ,
491508 /// Failed to restore devices: {0}
492509 RestoreDevices ( #[ from] DevicePersistError ) ,
510+ /// Failed to restore ACPI device manager: {0}
511+ ACPIDeviManager ( #[ from] ACPIDeviceError ) ,
512+ /// VMGenID update failed: {0}
513+ VMGenIDUpdate ( std:: io:: Error ) ,
514+ /// Internal error while restoring microVM: {0}
515+ Internal ( #[ from] VmmError ) ,
516+ /// Failed to load guest memory: {0}
517+ GuestMemory ( #[ from] BuildMicrovmFromSnapshotErrorGuestMemoryError ) ,
518+ /// Userfault bitmap memfd error: {0}
519+ UserfaultBitmapMemfd ( #[ from] crate :: vstate:: memory:: MemoryError ) ,
520+ }
521+
522+ fn memfd_to_slice ( memfd : & mut Option < File > ) -> Option < & mut [ u8 ] > {
523+ if let Some ( bitmap_file) = memfd {
524+ let len = u64_to_usize (
525+ bitmap_file
526+ . metadata ( )
527+ . expect ( "Failed to get metadata" )
528+ . len ( ) ,
529+ ) ;
530+
531+ // SAFETY: the arguments to mmap cannot cause any memory unsafety in the rust sense
532+ let bitmap_addr = unsafe {
533+ libc:: mmap (
534+ std:: ptr:: null_mut ( ) ,
535+ len,
536+ libc:: PROT_WRITE ,
537+ libc:: MAP_SHARED ,
538+ bitmap_file. as_raw_fd ( ) ,
539+ 0 ,
540+ )
541+ } ;
542+
543+ if bitmap_addr == libc:: MAP_FAILED {
544+ panic ! (
545+ "Failed to mmap userfault bitmap file: {}" ,
546+ std:: io:: Error :: last_os_error( )
547+ ) ;
548+ }
549+
550+ // SAFETY: `bitmap_addr` is a valid memory address returned by `mmap`.
551+ Some ( unsafe { std:: slice:: from_raw_parts_mut ( bitmap_addr. cast ( ) , len) } )
552+ } else {
553+ None
554+ }
555+ }
556+
557+ fn memory_size_from_mem_state ( mem_state : & GuestMemoryState ) -> usize {
558+ mem_state. regions . iter ( ) . map ( |region| region. size ) . sum ( )
493559}
494560
495561/// Builds and starts a microVM based on the provided MicrovmState.
@@ -501,26 +567,108 @@ pub fn build_microvm_from_snapshot(
501567 instance_info : & InstanceInfo ,
502568 event_manager : & mut EventManager ,
503569 microvm_state : MicrovmState ,
504- guest_memory : Vec < GuestRegionMmap > ,
505- uffd : Option < Uffd > ,
506570 seccomp_filters : & BpfThreadMap ,
571+ params : & LoadSnapshotParams ,
507572 vm_resources : & mut VmResources ,
508573) -> Result < Arc < Mutex < Vmm > > , BuildMicrovmFromSnapshotError > {
574+ // TODO: take it from kvm-bindings when userfault support is merged upstream
575+ const KVM_CAP_USERFAULT : u32 = 246 ;
576+
509577 // Build Vmm.
510578 debug ! ( "event_start: build microvm from snapshot" ) ;
511579
512- let kvm = Kvm :: new ( microvm_state. kvm_state . kvm_cap_modifiers . clone ( ) )
513- . map_err ( StartMicrovmError :: Kvm ) ?;
580+ let secret_free = vm_resources. machine_config . secret_free ;
581+ let mut kvm_capabilities = microvm_state. kvm_state . kvm_cap_modifiers . clone ( ) ;
582+ if secret_free {
583+ kvm_capabilities. push ( KvmCapability :: Add ( Cap :: GuestMemfd as u32 ) ) ;
584+ kvm_capabilities. push ( KvmCapability :: Add ( KVM_CAP_GUEST_MEMFD_MMAP ) ) ;
585+ kvm_capabilities. push ( KvmCapability :: Add ( KVM_CAP_GUEST_MEMFD_NO_DIRECT_MAP ) ) ;
586+ kvm_capabilities. push ( KvmCapability :: Add ( KVM_CAP_USERFAULT ) ) ;
587+ }
588+
589+ let kvm = Kvm :: new ( kvm_capabilities) . map_err ( StartMicrovmError :: Kvm ) ?;
590+
514591 // Set up Kvm Vm and register memory regions.
515592 // Build custom CPU config if a custom template is provided.
516- let mut vm = Vm :: new ( & kvm, false ) . map_err ( StartMicrovmError :: Vm ) ?;
593+ let mut vm = Vm :: new ( & kvm, secret_free ) . map_err ( StartMicrovmError :: Vm ) ?;
517594
518595 let ( mut vcpus, vcpus_exit_evt) = vm
519596 . create_vcpus ( vm_resources. machine_config . vcpu_count )
520597 . map_err ( StartMicrovmError :: Vm ) ?;
521598
522- vm. restore_memory_regions ( guest_memory, & microvm_state. vm_state . memory , None )
523- . map_err ( StartMicrovmError :: Vm ) ?;
599+ let guest_memfd = match secret_free {
600+ true => Some (
601+ vm. create_guest_memfd (
602+ memory_size_from_mem_state ( & microvm_state. vm_state . memory ) ,
603+ GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_NO_DIRECT_MAP ,
604+ )
605+ . map_err ( VmmError :: Vm ) ?,
606+ ) ,
607+ false => None ,
608+ } ;
609+
610+ let mut userfault_bitmap_memfd = if secret_free {
611+ let bitmap_size =
612+ bitmap_size ( memory_size_from_mem_state ( & microvm_state. vm_state . memory ) as u64 ) ;
613+ let bitmap_file = create_memfd ( bitmap_size, None ) ?;
614+
615+ Some ( bitmap_file. into_file ( ) )
616+ } else {
617+ None
618+ } ;
619+
620+ let mem_backend_path = & params. mem_backend . backend_path ;
621+ let mem_state = & microvm_state. vm_state . memory ;
622+ let track_dirty_pages = params. track_dirty_pages ;
623+
624+ let ( guest_memory, uffd, uffd_socket) = match params. mem_backend . backend_type {
625+ MemBackendType :: File => {
626+ if vm_resources. machine_config . huge_pages . is_hugetlbfs ( ) {
627+ return Err ( BuildMicrovmFromSnapshotErrorGuestMemoryError :: File (
628+ GuestMemoryFromFileError :: HugetlbfsSnapshot ,
629+ )
630+ . into ( ) ) ;
631+ }
632+ (
633+ guest_memory_from_file ( mem_backend_path, mem_state, track_dirty_pages)
634+ . map_err ( BuildMicrovmFromSnapshotErrorGuestMemoryError :: File ) ?,
635+ None ,
636+ None ,
637+ )
638+ }
639+ MemBackendType :: Uffd => {
640+ if vm_resources. machine_config . huge_pages . is_hugetlbfs ( ) && guest_memfd. is_some ( ) {
641+ return Err ( BuildMicrovmFromSnapshotErrorGuestMemoryError :: Uffd (
642+ GuestMemoryFromUffdError :: HugetlbfsSnapshot ,
643+ )
644+ . into ( ) ) ;
645+ }
646+ guest_memory_from_uffd (
647+ mem_backend_path,
648+ mem_state,
649+ track_dirty_pages,
650+ vm_resources. machine_config . huge_pages ,
651+ guest_memfd,
652+ userfault_bitmap_memfd. as_ref ( ) ,
653+ )
654+ . map_err ( BuildMicrovmFromSnapshotErrorGuestMemoryError :: Uffd ) ?
655+ }
656+ } ;
657+
658+ let mut userfault_bitmap_slice = memfd_to_slice ( & mut userfault_bitmap_memfd) ;
659+ if let Some ( ref mut slice) = userfault_bitmap_slice {
660+ // Set all bits so a fault on any page will cause a VM exit
661+ slice. fill ( 0xffu8 ) ;
662+ }
663+
664+ let userfault_bitmap: Option < u64 > = userfault_bitmap_slice. map ( |s| s. as_ptr ( ) as u64 ) ;
665+
666+ vm. restore_memory_regions (
667+ guest_memory,
668+ & microvm_state. vm_state . memory ,
669+ userfault_bitmap,
670+ )
671+ . map_err ( StartMicrovmError :: Vm ) ?;
524672
525673 #[ cfg( target_arch = "x86_64" ) ]
526674 {
@@ -582,6 +730,7 @@ pub fn build_microvm_from_snapshot(
582730 kvm,
583731 vm,
584732 uffd,
733+ uffd_socket,
585734 vcpus_handles : Vec :: new ( ) ,
586735 vcpus_exit_evt,
587736 device_manager,
@@ -921,6 +1070,7 @@ pub(crate) mod tests {
9211070 kvm,
9221071 vm : Arc :: new ( vm) ,
9231072 uffd : None ,
1073+ uffd_socket : None ,
9241074 vcpus_handles : Vec :: new ( ) ,
9251075 vcpus_exit_evt,
9261076 device_manager : default_device_manager ( ) ,
0 commit comments