Skip to content

Commit b400181

Browse files
committed
feat(vmm): implement secret-free fault handling protocol
It contains two parts: - external: between the VMM thread and the UFFD handler - internal: between vCPUs and the VMM thread An outline of the workflow: - When a vCPU fault occurs, the vCPU exits to userspace - The vCPU thread sends the exit syndrome over the vCPU-to-VMM channel and writes to the eventfd - The VMM thread forwards the syndrome to the UFFD handler via the UDS socket - The UFFD handler populates the page, clears the corresponding bit in the userfault bitmap and sends a reply to Firecracker - The VMM thread receives the reply and updates a vCPU condvar to notify the vCPU that the fault has been resolved - The vCPU resumes execution Note that as a result of this change, the ability to exit the VM gracefully is lost (at least on x86). In the existing implementation, the VMM thread initiated an exit if an event was read from the eventfd but, for an unknown reason, no VcpuResponse::Exited responses were read. Since the exit_evt eventfd is now also used by vCPUs to notify the VMM thread of VM exits caused by page faults, this situation (an eventfd event, but no response in the channel) can also occur because we have already read all VcpuResponse::Userfault responses while handling the previous eventfd event. Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
1 parent fbdb81c commit b400181

File tree

6 files changed

+269
-39
lines changed

6 files changed

+269
-39
lines changed

src/vmm/src/builder.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,8 @@ pub fn build_microvm_for_boot(
189189
// Set up Kvm Vm and register memory regions.
190190
// Build custom CPU config if a custom template is provided.
191191
let mut vm = Vm::new(&kvm, secret_free)?;
192-
let (mut vcpus, vcpus_exit_evt) = vm.create_vcpus(vm_resources.machine_config.vcpu_count)?;
192+
let (mut vcpus, vcpus_exit_evt) =
193+
vm.create_vcpus(vm_resources.machine_config.vcpu_count, secret_free)?;
193194

194195
let guest_memfd = match secret_free {
195196
true => Some(Arc::new(
@@ -593,7 +594,7 @@ pub fn build_microvm_from_snapshot(
593594
let mut vm = Vm::new(&kvm, secret_free).map_err(StartMicrovmError::Vm)?;
594595

595596
let (mut vcpus, vcpus_exit_evt) = vm
596-
.create_vcpus(vm_resources.machine_config.vcpu_count)
597+
.create_vcpus(vm_resources.machine_config.vcpu_count, secret_free)
597598
.map_err(StartMicrovmError::Vm)?;
598599

599600
let guest_memfd = match secret_free {
@@ -1062,7 +1063,7 @@ pub(crate) mod tests {
10621063
pub(crate) fn default_vmm() -> Vmm {
10631064
let (kvm, mut vm) = setup_vm_with_memory(mib_to_bytes(128));
10641065

1065-
let (_, vcpus_exit_evt) = vm.create_vcpus(1).unwrap();
1066+
let (_, vcpus_exit_evt) = vm.create_vcpus(1, false).unwrap();
10661067

10671068
Vmm {
10681069
instance_info: InstanceInfo::default(),

src/vmm/src/lib.rs

Lines changed: 148 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,8 @@ pub mod vstate;
117117
pub mod initrd;
118118

119119
use std::collections::HashMap;
120-
use std::io;
120+
use std::io::{self, Read, Write};
121+
use std::os::fd::RawFd;
121122
use std::os::unix::io::AsRawFd;
122123
use std::os::unix::net::UnixStream;
123124
use std::sync::mpsc::RecvTimeoutError;
@@ -129,6 +130,7 @@ use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEvent
129130
use seccomp::BpfProgram;
130131
use snapshot::Persist;
131132
use userfaultfd::Uffd;
133+
use vm_memory::GuestAddress;
132134
use vmm_sys_util::epoll::EventSet;
133135
use vmm_sys_util::eventfd::EventFd;
134136
use vmm_sys_util::terminal::Terminal;
@@ -145,12 +147,15 @@ use crate::devices::virtio::block::device::Block;
145147
use crate::devices::virtio::mem::{VIRTIO_MEM_DEV_ID, VirtioMem, VirtioMemError, VirtioMemStatus};
146148
use crate::devices::virtio::net::Net;
147149
use crate::logger::{METRICS, MetricsError, error, info, warn};
148-
use crate::persist::{MicrovmState, MicrovmStateError, VmInfo};
150+
use crate::persist::{FaultReply, FaultRequest, MicrovmState, MicrovmStateError, VmInfo};
149151
use crate::rate_limiter::BucketUpdate;
150152
use crate::vmm_config::instance_info::{InstanceInfo, VmState};
151-
use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion};
153+
use crate::vstate::memory::{
154+
GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion,
155+
};
152156
use crate::vstate::vcpu::VcpuState;
153157
pub use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse};
158+
use crate::vstate::vm::UserfaultData;
154159
pub use crate::vstate::vm::Vm;
155160

156161
/// Shorthand type for the EventManager flavour used by Firecracker.
@@ -688,6 +693,111 @@ impl Vmm {
688693
self.shutdown_exit_code = Some(exit_code);
689694
}
690695

696+
fn process_vcpu_userfault(&mut self, vcpu: u32, userfault_data: UserfaultData) {
697+
let offset = self
698+
.vm
699+
.guest_memory()
700+
.gpa_to_offset(GuestAddress(userfault_data.gpa))
701+
.expect("Failed to convert GPA to offset");
702+
703+
let fault_request = FaultRequest {
704+
vcpu,
705+
offset,
706+
flags: userfault_data.flags,
707+
token: None,
708+
};
709+
let fault_request_json =
710+
serde_json::to_string(&fault_request).expect("Failed to serialize fault request");
711+
712+
let written = self
713+
.uffd_socket
714+
.as_ref()
715+
.expect("Uffd socket is not set")
716+
.write(fault_request_json.as_bytes())
717+
.expect("Failed to write to uffd socket");
718+
719+
if written != fault_request_json.len() {
720+
panic!(
721+
"Failed to write the entire fault request to the uffd socket: expected {}, \
722+
written {}",
723+
fault_request_json.len(),
724+
written
725+
);
726+
}
727+
}
728+
729+
fn active_event_in_uffd_socket(&self, source: RawFd, event_set: EventSet) -> bool {
730+
if let Some(uffd_socket) = &self.uffd_socket {
731+
uffd_socket.as_raw_fd() == source && event_set == EventSet::IN
732+
} else {
733+
false
734+
}
735+
}
736+
737+
fn process_uffd_socket(&mut self) {
738+
const BUFFER_SIZE: usize = 4096;
739+
740+
let stream = self.uffd_socket.as_mut().expect("Uffd socket is not set");
741+
742+
let mut buffer = [0u8; BUFFER_SIZE];
743+
let mut current_pos = 0;
744+
745+
loop {
746+
if current_pos < BUFFER_SIZE {
747+
match stream.read(&mut buffer[current_pos..]) {
748+
Ok(0) => break,
749+
Ok(n) => current_pos += n,
750+
Err(e) if e.kind() == io::ErrorKind::WouldBlock => {
751+
if current_pos == 0 {
752+
break;
753+
}
754+
}
755+
Err(e) => panic!("Read error: {}", e),
756+
}
757+
}
758+
759+
let mut parser = serde_json::Deserializer::from_slice(&buffer[..current_pos])
760+
.into_iter::<FaultReply>();
761+
let mut total_consumed = 0;
762+
let mut needs_more = false;
763+
764+
while let Some(result) = parser.next() {
765+
match result {
766+
Ok(fault_reply) => {
767+
let vcpu = fault_reply.vcpu.expect("vCPU must be set");
768+
769+
self.vcpus_handles
770+
.get(vcpu as usize)
771+
.expect("Invalid vcpu index")
772+
.send_userfault_resolved();
773+
774+
total_consumed = parser.byte_offset();
775+
}
776+
Err(e) if e.is_eof() => {
777+
needs_more = true;
778+
break;
779+
}
780+
Err(e) => {
781+
println!(
782+
"Buffer content: {:?}",
783+
std::str::from_utf8(&buffer[..current_pos])
784+
);
785+
panic!("Invalid JSON: {}", e);
786+
}
787+
}
788+
}
789+
790+
if total_consumed > 0 {
791+
buffer.copy_within(total_consumed..current_pos, 0);
792+
current_pos -= total_consumed;
793+
}
794+
795+
if needs_more {
796+
continue;
797+
}
798+
}
799+
}
800+
691801
/// Gets a reference to kvm-ioctls Vm
692802
#[cfg(feature = "gdb")]
693803
pub fn vm(&self) -> &Vm {
@@ -765,38 +875,55 @@ impl MutEventSubscriber for Vmm {
765875
let event_set = event.event_set();
766876

767877
if source == self.vcpus_exit_evt.as_raw_fd() && event_set == EventSet::IN {
768-
// Exit event handling should never do anything more than call 'self.stop()'.
769878
let _ = self.vcpus_exit_evt.read();
770879

771-
let exit_code = 'exit_code: {
772-
// Query each vcpu for their exit_code.
773-
for handle in &self.vcpus_handles {
774-
// Drain all vcpu responses that are pending from this vcpu until we find an
775-
// exit status.
776-
for response in handle.response_receiver().try_iter() {
777-
if let VcpuResponse::Exited(status) = response {
778-
// It could be that some vcpus exited successfully while others
779-
// errored out. Thus make sure that error exits from one vcpu always
780-
// takes precedence over "ok" exits
880+
let mut pending_userfaults = Vec::with_capacity(self.vcpus_handles.len());
881+
let mut should_exit = false;
882+
let mut final_exit_code = FcExitCode::Ok;
883+
884+
// First pass: collect all responses and determine exit status
885+
for (handle, index) in self.vcpus_handles.iter().zip(0u32..) {
886+
for response in handle.response_receiver().try_iter() {
887+
match response {
888+
VcpuResponse::Exited(status) => {
889+
should_exit = true;
781890
if status != FcExitCode::Ok {
782-
break 'exit_code status;
891+
final_exit_code = status;
783892
}
784893
}
894+
VcpuResponse::Userfault(userfault_data) => {
895+
pending_userfaults.push((index, userfault_data));
896+
}
897+
_ => panic!("Unexpected response from vcpu: {:?}", response),
785898
}
786899
}
900+
}
787901

788-
// No CPUs exited with error status code, report "Ok"
789-
FcExitCode::Ok
790-
};
791-
self.stop(exit_code);
792-
} else {
793-
error!("Spurious EventManager event for handler: Vmm");
902+
// Process any pending userfaults
903+
for (index, userfault_data) in pending_userfaults {
904+
self.process_vcpu_userfault(index, userfault_data);
905+
}
906+
907+
// Stop if we received an exit event
908+
if should_exit {
909+
self.stop(final_exit_code);
910+
}
911+
}
912+
913+
if self.active_event_in_uffd_socket(source, event_set) {
914+
self.process_uffd_socket();
794915
}
795916
}
796917

797918
fn init(&mut self, ops: &mut EventOps) {
798919
if let Err(err) = ops.add(Events::new(&self.vcpus_exit_evt, EventSet::IN)) {
799920
error!("Failed to register vmm exit event: {}", err);
800921
}
922+
923+
if let Some(uffd_socket) = self.uffd_socket.as_ref()
924+
&& let Err(err) = ops.add(Events::new(uffd_socket, EventSet::IN))
925+
{
926+
panic!("Failed to register UFFD socket: {}", err);
927+
}
801928
}
802929
}

src/vmm/src/persist.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,10 @@ fn send_uffd_handshake(
616616
let backend_mappings = serde_json::to_string(backend_mappings).unwrap();
617617

618618
let socket = UnixStream::connect(mem_uds_path)?;
619+
socket
620+
.set_nonblocking(true)
621+
.expect("Cannot set non-blocking");
622+
619623
socket.send_with_fds(
620624
&[backend_mappings.as_bytes()],
621625
// In the happy case we can close the fd since the other process has it open and is

0 commit comments

Comments
 (0)