From 1c5fe1ccb6423847bbf2468747132e3e1c569128 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 27 Oct 2025 12:04:12 +0100 Subject: [PATCH 1/8] vmclock ABI: add snapshot safety features Add support for vm_generation_counter and notifications. Keep this separately for now, since currently upstream bindings don't include it. We will recreate this for Linux headers once they are released. Signed-off-by: Babis Chalios --- src/vmm/src/devices/acpi/generated/vmclock_abi.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/vmm/src/devices/acpi/generated/vmclock_abi.rs b/src/vmm/src/devices/acpi/generated/vmclock_abi.rs index 134c8393f0c..80228ad848b 100644 --- a/src/vmm/src/devices/acpi/generated/vmclock_abi.rs +++ b/src/vmm/src/devices/acpi/generated/vmclock_abi.rs @@ -38,6 +38,8 @@ pub const VMCLOCK_FLAG_PERIOD_MAXERROR_VALID: u64 = 16; pub const VMCLOCK_FLAG_TIME_ESTERROR_VALID: u64 = 32; pub const VMCLOCK_FLAG_TIME_MAXERROR_VALID: u64 = 64; pub const VMCLOCK_FLAG_TIME_MONOTONIC: u64 = 128; +pub const VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT: u64 = 256; +pub const VMCLOCK_FLAG_NOTIFICATION_PRESENT: u64 = 512; pub const VMCLOCK_STATUS_UNKNOWN: u8 = 0; pub const VMCLOCK_STATUS_INITIALIZING: u8 = 1; pub const VMCLOCK_STATUS_SYNCHRONIZED: u8 = 2; @@ -153,10 +155,11 @@ pub struct vmclock_abi { pub time_frac_sec: __le64, pub time_esterror_nanosec: __le64, pub time_maxerror_nanosec: __le64, + pub vm_generation_counter: __le64, } #[allow(clippy::unnecessary_operation, clippy::identity_op)] const _: () = { - ["Size of vmclock_abi"][::std::mem::size_of::() - 104usize]; + ["Size of vmclock_abi"][::std::mem::size_of::() - 112usize]; ["Alignment of vmclock_abi"][::std::mem::align_of::() - 8usize]; ["Offset of field: vmclock_abi::magic"][::std::mem::offset_of!(vmclock_abi, magic) - 0usize]; ["Offset of field: vmclock_abi::size"][::std::mem::offset_of!(vmclock_abi, size) - 4usize]; @@ -198,4 +201,6 @@ const _: () = { [::std::mem::offset_of!(vmclock_abi, 
time_esterror_nanosec) - 88usize]; ["Offset of field: vmclock_abi::time_maxerror_nanosec"] [::std::mem::offset_of!(vmclock_abi, time_maxerror_nanosec) - 96usize]; + ["Offset of field: vmclock_abi::vm_generation_counter"] + [::std::mem::offset_of!(vmclock_abi, vm_generation_counter) - 104usize]; }; From 40c6c8632bfc8e3873e3ea0fc61f31ed9609d836 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 21 Oct 2025 14:45:42 +0200 Subject: [PATCH 2/8] vmclock: add snapshot safety features Add support for `vm_generation_counter` field in VMClock ABI. This field is similar to `disruption_marker` but it's only updated on snapshot loading events (not in live migration). It is meant to provide the guest with snapshot safety notifications. Moreover, add support for the notification capability. This capability requires us to send an ACPI notification every time we change the seq_count field to a new even value. This essentially means that we need to send a notification upon resuming from a snapshot just before resuming vCPUs. Signed-off-by: Babis Chalios --- src/vmm/src/device_manager/acpi.rs | 52 +++++++----- src/vmm/src/device_manager/mod.rs | 4 + src/vmm/src/device_manager/persist.rs | 8 +- src/vmm/src/devices/acpi/vmclock.rs | 85 ++++++++++++++++--- .../functional/test_max_devices.py | 6 +- 5 files changed, 118 insertions(+), 37 deletions(-) diff --git a/src/vmm/src/device_manager/acpi.rs b/src/vmm/src/device_manager/acpi.rs index 9764143b5a9..968c5afebb0 100644 --- a/src/vmm/src/device_manager/acpi.rs +++ b/src/vmm/src/device_manager/acpi.rs @@ -1,6 +1,7 @@ // Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +#[cfg(target_arch = "x86_64")] use acpi_tables::{Aml, aml}; use vm_memory::GuestMemoryError; @@ -45,11 +46,13 @@ impl ACPIDeviceManager { #[cfg(target_arch = "x86_64")] pub fn attach_vmclock(&self, vm: &Vm) -> Result<(), ACPIDeviceError> { + vm.register_irq(&self.vmclock.interrupt_evt, self.vmclock.gsi)?; self.vmclock.activate(vm.guest_memory())?; Ok(()) } } +#[cfg(target_arch = "x86_64")] impl Aml for ACPIDeviceManager { fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { // AML for [`VmGenId`] device. @@ -65,30 +68,41 @@ impl Aml for ACPIDeviceManager { &aml::Name::new("_HID".try_into()?, &"ACPI0013")?, &aml::Name::new( "_CRS".try_into()?, - &aml::ResourceTemplate::new(vec![&aml::Interrupt::new( - true, - true, - false, - false, - self.vmgenid.gsi, - )]), + &aml::ResourceTemplate::new(vec![ + &aml::Interrupt::new(true, true, false, false, self.vmgenid.gsi), + &aml::Interrupt::new(true, true, false, false, self.vmclock.gsi), + ]), )?, &aml::Method::new( "_EVT".try_into()?, 1, true, - vec![&aml::If::new( - // We know that the maximum IRQ number fits in a u8. We have up to - // 32 IRQs in x86 and up to 128 in - // ARM (look into - // `vmm::crate::arch::layout::GSI_LEGACY_END`) - #[allow(clippy::cast_possible_truncation)] - &aml::Equal::new(&aml::Arg(0), &(self.vmgenid.gsi as u8)), - vec![&aml::Notify::new( - &aml::Path::new("\\_SB_.VGEN")?, - &0x80usize, - )], - )], + vec![ + &aml::If::new( + // We know that the maximum IRQ number fits in a u8. We have up to + // 32 IRQs in x86 and up to 128 in + // ARM (look into + // `vmm::crate::arch::layout::GSI_LEGACY_END`) + #[allow(clippy::cast_possible_truncation)] + &aml::Equal::new(&aml::Arg(0), &(self.vmgenid.gsi as u8)), + vec![&aml::Notify::new( + &aml::Path::new("\\_SB_.VGEN")?, + &0x80usize, + )], + ), + &aml::If::new( + // We know that the maximum IRQ number fits in a u8. 
We have up to + // 32 IRQs in x86 and up to 128 in + // ARM (look into + // `vmm::crate::arch::layout::GSI_LEGACY_END`) + #[allow(clippy::cast_possible_truncation)] + &aml::Equal::new(&aml::Arg(0), &(self.vmclock.gsi as u8)), + vec![&aml::Notify::new( + &aml::Path::new("\\_SB_.VCLK")?, + &0x80usize, + )], + ), + ], ), ], ) diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index fc245e05539..25c527f884a 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -465,6 +465,10 @@ impl<'a> Persist<'a> for DeviceManager { // Restore ACPI devices let mut acpi_devices = ACPIDeviceManager::restore(constructor_args.vm, &state.acpi_state)?; acpi_devices.vmgenid.notify_guest()?; + #[cfg(target_arch = "x86_64")] + acpi_devices + .vmclock + .post_load_update(constructor_args.vm.guest_memory()); // Restore PCI devices let pci_ctor_args = PciDevicesConstructorArgs { diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 2a0393e57f2..d93b2487a0c 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -192,9 +192,15 @@ impl<'a> Persist<'a> for ACPIDeviceManager { vmgenid: VmGenId::restore((), &state.vmgenid).unwrap(), // Safe to unwrap() here, this will never return an error. 
#[cfg(target_arch = "x86_64")] - vmclock: VmClock::restore(vm.guest_memory(), &state.vmclock).unwrap(), + vmclock: VmClock::restore((), &state.vmclock).unwrap(), }; + #[cfg(target_arch = "x86_64")] + vm.register_irq( + &acpi_devices.vmclock.interrupt_evt, + acpi_devices.vmclock.gsi, + )?; + acpi_devices.attach_vmgenid(vm)?; Ok(acpi_devices) } diff --git a/src/vmm/src/devices/acpi/vmclock.rs b/src/vmm/src/devices/acpi/vmclock.rs index d7882a78ded..97af643e921 100644 --- a/src/vmm/src/devices/acpi/vmclock.rs +++ b/src/vmm/src/devices/acpi/vmclock.rs @@ -6,14 +6,19 @@ use std::mem::offset_of; use std::sync::atomic::{Ordering, fence}; use acpi_tables::{Aml, aml}; -use log::error; +use log::{debug, error}; use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; use vm_memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemoryError}; +use vm_superio::Trigger; +use vmm_sys_util::eventfd::EventFd; +use crate::Vm; use crate::devices::acpi::generated::vmclock_abi::{ - VMCLOCK_COUNTER_INVALID, VMCLOCK_MAGIC, VMCLOCK_STATUS_UNKNOWN, vmclock_abi, + VMCLOCK_COUNTER_INVALID, VMCLOCK_FLAG_NOTIFICATION_PRESENT, + VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT, VMCLOCK_MAGIC, VMCLOCK_STATUS_UNKNOWN, vmclock_abi, }; +use crate::devices::legacy::EventFdTrigger; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; use crate::vstate::resources::ResourceAllocator; @@ -47,6 +52,10 @@ macro_rules! write_vmclock_field { pub struct VmClock { /// Guest address in which we will write the VMclock struct pub guest_address: GuestAddress, + /// Interrupt line for notifying the device about changes + pub interrupt_evt: EventFdTrigger, + /// GSI number allocated for the device. 
+ pub gsi: u32, /// The [`VmClock`] state we are exposing to the guest inner: vmclock_abi, } @@ -62,17 +71,33 @@ impl VmClock { ) .expect("vmclock: could not allocate guest memory for device"); + let gsi = resource_allocator + .allocate_gsi_legacy(1) + .inspect_err(|err| error!("vmclock: Could not allocate GSI for VMClock: {err}")) + .unwrap()[0]; + + let interrupt_evt = EventFdTrigger::new( + EventFd::new(libc::EFD_NONBLOCK) + .inspect_err(|err| { + error!("vmclock: Could not create EventFd for VMClock device: {err}") + }) + .unwrap(), + ); + let mut inner = vmclock_abi { magic: VMCLOCK_MAGIC, size: VMCLOCK_SIZE, version: 1, clock_status: VMCLOCK_STATUS_UNKNOWN, counter_id: VMCLOCK_COUNTER_INVALID, + flags: VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT | VMCLOCK_FLAG_NOTIFICATION_PRESENT, ..Default::default() }; VmClock { guest_address: GuestAddress(addr), + interrupt_evt, + gsi, inner, } } @@ -98,11 +123,23 @@ impl VmClock { self.inner.disruption_marker.wrapping_add(1) ); - // This fence ensures guest sees the `disruption_marker` update. It is matched to a - // read barrier in the guest. + write_vmclock_field!( + self, + mem, + vm_generation_counter, + self.inner.vm_generation_counter.wrapping_add(1) + ); + + // This fence ensures guest sees the `disruption_marker` and `vm_generation_counter` + // updates. It is matched to a read barrier in the guest. 
fence(Ordering::Release); write_vmclock_field!(self, mem, seq_count, self.inner.seq_count.wrapping_add(1)); + self.interrupt_evt + .trigger() + .inspect_err(|err| error!("vmclock: could not send guest notification: {err}")) + .unwrap(); + debug!("vmclock: notifying guest about VMClock updates"); } } @@ -113,31 +150,39 @@ impl VmClock { pub struct VmClockState { /// Guest address in which we write the [`VmClock`] info pub guest_address: u64, + /// GSI used for notifying the guest about device changes + pub gsi: u32, /// Data we expose to the guest pub inner: vmclock_abi, } impl<'a> Persist<'a> for VmClock { type State = VmClockState; - type ConstructorArgs = &'a GuestMemoryMmap; + type ConstructorArgs = (); type Error = Infallible; fn save(&self) -> Self::State { VmClockState { guest_address: self.guest_address.0, + gsi: self.gsi, inner: self.inner, } } - fn restore( - constructor_args: Self::ConstructorArgs, - state: &Self::State, - ) -> Result { + fn restore(vm: Self::ConstructorArgs, state: &Self::State) -> Result { + let interrupt_evt = EventFdTrigger::new( + EventFd::new(libc::EFD_NONBLOCK) + .inspect_err(|err| { + error!("vmclock: Could not create EventFd for VMClock device: {err}") + }) + .unwrap(), + ); let mut vmclock = VmClock { guest_address: GuestAddress(state.guest_address), + interrupt_evt, + gsi: state.gsi, inner: state.inner, }; - vmclock.post_load_update(constructor_args); Ok(vmclock) } } @@ -174,14 +219,20 @@ impl Aml for VmClock { #[cfg(test)] mod tests { use vm_memory::{Bytes, GuestAddress}; + use vmm_sys_util::tempfile::TempFile; - use crate::arch; + use crate::Vm; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::layout; + use crate::arch::{self, Kvm}; use crate::devices::acpi::generated::vmclock_abi::vmclock_abi; use crate::devices::acpi::vmclock::{VMCLOCK_SIZE, VmClock}; - use crate::snapshot::Persist; + use crate::devices::virtio::test_utils::default_mem; + use crate::snapshot::{Persist, Snapshot}; use 
crate::test_utils::single_region_mem; use crate::utils::u64_to_usize; use crate::vstate::resources::ResourceAllocator; + use crate::vstate::vm::tests::setup_vm_with_memory; // We are allocating memory from the end of the system memory portion const VMCLOCK_TEST_GUEST_ADDR: GuestAddress = @@ -211,15 +262,17 @@ mod tests { #[test] fn test_device_save_restore() { let vmclock = default_vmclock(); + // We're using memory inside the system memory portion of the guest RAM. So we need a + // memory region that includes it. let mem = single_region_mem( u64_to_usize(arch::SYSTEM_MEM_START) + u64_to_usize(arch::SYSTEM_MEM_SIZE), ); vmclock.activate(&mem).unwrap(); - let guest_data: vmclock_abi = mem.read_obj(VMCLOCK_TEST_GUEST_ADDR).unwrap(); let state = vmclock.save(); - let vmclock_new = VmClock::restore(&mem, &state).unwrap(); + let mut vmclock_new = VmClock::restore((), &state).unwrap(); + vmclock_new.post_load_update(&mem); let guest_data_new: vmclock_abi = mem.read_obj(VMCLOCK_TEST_GUEST_ADDR).unwrap(); assert_ne!(guest_data_new, vmclock.inner); @@ -228,5 +281,9 @@ mod tests { vmclock.inner.disruption_marker + 1, vmclock_new.inner.disruption_marker ); + assert_eq!( + vmclock.inner.vm_generation_counter + 1, + vmclock_new.inner.vm_generation_counter + ); } } diff --git a/tests/integration_tests/functional/test_max_devices.py b/tests/integration_tests/functional/test_max_devices.py index 54153b27d2d..d2ba3e7ab16 100644 --- a/tests/integration_tests/functional/test_max_devices.py +++ b/tests/integration_tests/functional/test_max_devices.py @@ -22,9 +22,9 @@ def max_devices(uvm): # at the same time is 93. return 93 case "x86_64": - # IRQs are available from 5 to 23. We always use one IRQ for VMGenID device, so - # the maximum number of devices supported at the same time is 18. - return 18 + # IRQs are available from 5 to 23. We always use one IRQ for VMGenID and VMClock + # devices, so the maximum number of devices supported at the same time is 17. 
+ return 17 case _: raise ValueError("Unknown platform") From 734f841fc6ac11e89943391ab96cea6d27a85be0 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 10 Nov 2025 15:54:10 +0100 Subject: [PATCH 3/8] vmclock: test snapshot safety features Extend VMClock integration tests to also account for the vm_generation_counter field and notification support flag. Signed-off-by: Babis Chalios --- tests/host_tools/vmclock-abi.h | 24 +++ tests/host_tools/vmclock.c | 152 +++++++++++++++--- .../functional/test_vmclock.py | 79 +++++++-- 3 files changed, 228 insertions(+), 27 deletions(-) diff --git a/tests/host_tools/vmclock-abi.h b/tests/host_tools/vmclock-abi.h index 2d99b29ac44..5c707e263cb 100644 --- a/tests/host_tools/vmclock-abi.h +++ b/tests/host_tools/vmclock-abi.h @@ -115,6 +115,17 @@ struct vmclock_abi { * bit again after the update, using the about-to-be-valid fields. */ #define VMCLOCK_FLAG_TIME_MONOTONIC (1 << 7) + /* + * If the VM_GEN_COUNTER_PRESENT flag is set, the hypervisor will + * bump the vm_generation_counter field every time the guest is + * loaded from some save state (restored from a snapshot). + */ +#define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT (1 << 8) + /* + * If the NOTIFICATION_PRESENT flag is set, the hypervisor will send + * a notification every time it updates seq_count to a new even number. + */ +#define VMCLOCK_FLAG_NOTIFICATION_PRESENT (1 << 9) __u8 pad[2]; __u8 clock_status; @@ -177,6 +188,19 @@ struct vmclock_abi { __le64 time_frac_sec; /* Units of 1/2^64 of a second */ __le64 time_esterror_nanosec; __le64 time_maxerror_nanosec; + + /* + * This field changes to another non-repeating value when the VM + * is loaded from a snapshot. This event, typically, represents a + * "jump" forward in time. As a result, in this case as well, the + * guest needs to discard any calibrarion against external sources. + * Loading a snapshot in a VM has different semantics than other VM + * events such as live migration, i.e. 
apart from re-adjusting guest + * clocks a guest user space might want to discard UUIDs, reset + * network connections or reseed entropy, etc. As a result, we + * use a dedicated marker for such events. + */ + __le64 vm_generation_counter; }; #endif /* __VMCLOCK_ABI_H__ */ diff --git a/tests/host_tools/vmclock.c b/tests/host_tools/vmclock.c index d69304ac87c..b27d0acdc29 100644 --- a/tests/host_tools/vmclock.c +++ b/tests/host_tools/vmclock.c @@ -1,12 +1,12 @@ // Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -#include #include #include #include #include #include +#include #include #include #include @@ -16,23 +16,26 @@ const char *VMCLOCK_DEV_PATH = "/dev/vmclock0"; -int get_vmclock_handle(struct vmclock_abi **vmclock) +int open_vmclock(void) { int fd = open(VMCLOCK_DEV_PATH, 0); - if (fd == -1) - goto out_err; + if (fd == -1) { + perror("open"); + exit(1); + } - void *ptr = mmap(NULL, sizeof(struct vmclock_abi), PROT_READ, MAP_SHARED, fd, 0); - if (ptr == MAP_FAILED) - goto out_err_mmap; + return fd; +} - *vmclock = ptr; - return 0; +struct vmclock_abi *get_vmclock_handle(int fd) +{ + void *ptr = mmap(NULL, sizeof(struct vmclock_abi), PROT_READ, MAP_SHARED, fd, 0); + if (ptr == MAP_FAILED) { + perror("mmap"); + exit(1); + } -out_err_mmap: - close(fd); -out_err: - return errno; + return ptr; } #define READ_VMCLOCK_FIELD_FN(type, field) \ @@ -56,16 +59,43 @@ type read##_##field (struct vmclock_abi *vmclock) { \ } READ_VMCLOCK_FIELD_FN(uint64_t, disruption_marker); +READ_VMCLOCK_FIELD_FN(uint64_t, vm_generation_counter); -int main() +/* + * Read `vmclock_abi` structure using a file descriptor pointing to + * `/dev/vmclock0`. 
+ */ +void read_vmclock(int fd, struct vmclock_abi *vmclock) { - struct vmclock_abi *vmclock; + int ret; - int err = get_vmclock_handle(&vmclock); - if (err) { - printf("Could not mmap vmclock struct: %s\n", strerror(err)); + /* + * Use `pread()`, since the device doesn't implement lseek(), so + * we can't reset `fp`. + */ + ret = pread(fd, vmclock, sizeof(*vmclock), 0); + if (ret < 0) { + perror("read"); + exit(1); + } else if (ret < (int) sizeof(*vmclock)) { + fprintf(stderr, "We don't handle partial writes (%d). Exiting!\n", ret); exit(1); } +} + +void print_vmclock(struct vmclock_abi *vmclock) +{ + if (vmclock->flags & VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT) { + printf("VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT: true\n"); + } else { + printf("VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT: false\n"); + } + + if (vmclock->flags & VMCLOCK_FLAG_NOTIFICATION_PRESENT) { + printf("VMCLOCK_FLAG_NOTIFICATION_PRESENT: true\n"); + } else { + printf("VMCLOCK_FLAG_NOTIFICATION_PRESENT: false\n"); + } printf("VMCLOCK_MAGIC: 0x%x\n", vmclock->magic); printf("VMCLOCK_SIZE: 0x%x\n", vmclock->size); @@ -73,6 +103,92 @@ int main() printf("VMCLOCK_CLOCK_STATUS: %u\n", vmclock->clock_status); printf("VMCLOCK_COUNTER_ID: %u\n", vmclock->counter_id); printf("VMCLOCK_DISRUPTION_MARKER: %lu\n", read_disruption_marker(vmclock)); + printf("VMCLOCK_VM_GENERATION_COUNTER: %lu\n", read_vm_generation_counter(vmclock)); + fflush(stdout); +} + +void run_poll(int fd) +{ + struct vmclock_abi vmclock; + int epfd, ret, nfds; + struct epoll_event ev; + + read_vmclock(fd, &vmclock); + print_vmclock(&vmclock); + + epfd = epoll_create(1); + if (epfd < 0) { + perror("epoll_create"); + exit(1); + } + + ev.events = EPOLLIN | EPOLLRDNORM; + ev.data.fd = fd; + ret = epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev); + if (ret < 0) { + perror("epoll_add"); + exit(1); + } + + while (1) { + nfds = epoll_wait(epfd, &ev, 1, -1); + if (nfds < 0) { + perror("epoll_wait"); + exit(1); + } + + if (ev.data.fd != fd) { + fprintf(stderr, "Unknown 
file descriptor %d\n", ev.data.fd); + exit(1); + } + + if (ev.events & EPOLLHUP) { + fprintf(stderr, "Device does not support notifications. Stop polling\n"); + exit(1); + } else if (ev.events & EPOLLIN) { + fprintf(stdout, "Got VMClock notification\n"); + read_vmclock(fd, &vmclock); + print_vmclock(&vmclock); + } + } +} + +void print_help_message() +{ + fprintf(stderr, "usage: vmclock MODE\n"); + fprintf(stderr, "Available modes:\n"); + fprintf(stderr, " -r\tRead vmclock_abi using read()\n"); + fprintf(stderr, " -m\tRead vmclock_abi using mmap()\n"); + fprintf(stderr, " -p\tPoll VMClock for changes\n"); +} + +int main(int argc, char *argv[]) +{ + int fd; + struct vmclock_abi vmclock, *vmclock_ptr; + + if (argc != 2) { + print_help_message(); + exit(1); + } + + fd = open_vmclock(); + + if (!strncmp(argv[1], "-r", 3)) { + printf("Reading VMClock with read()\n"); + read_vmclock(fd, &vmclock); + print_vmclock(&vmclock); + } else if (!strncmp(argv[1], "-m", 3)) { + printf("Reading VMClock with mmap()\n"); + vmclock_ptr = get_vmclock_handle(fd); + print_vmclock(vmclock_ptr); + } else if (!strncmp(argv[1], "-p", 3)) { + printf("Polling VMClock\n"); + run_poll(fd); + } else { + print_help_message(); + exit(1); + } return 0; } diff --git a/tests/integration_tests/functional/test_vmclock.py b/tests/integration_tests/functional/test_vmclock.py index b487526abdb..1e838a539f3 100644 --- a/tests/integration_tests/functional/test_vmclock.py +++ b/tests/integration_tests/functional/test_vmclock.py @@ -21,40 +21,100 @@ def vm_with_vmclock(uvm_plain_acpi, bin_vmclock_path): yield basevm -def parse_vmclock(vm): +def parse_vmclock(vm, use_mmap=False): """Parse the VMclock struct inside the guest and return a dictionary with its fields""" - _, stdout, _ = vm.ssh.check_output("/tmp/vmclock") + + cmd = "/tmp/vmclock -m" if use_mmap else "/tmp/vmclock -r" + _, stdout, _ = vm.ssh.check_output(cmd) + fields = stdout.strip().split("\n") + if use_mmap: + assert fields[0] == "Reading VMClock 
with mmap()" + else: + assert fields[0] == "Reading VMClock with read()" + + return dict(item.split(": ") for item in fields if item.startswith("VMCLOCK")) + + +def parse_vmclock_from_poll(vm, expected_notifications): + """Parse the output of the 'vmclock -p' command in the guest""" + + _, stdout, _ = vm.ssh.check_output("cat /tmp/vmclock.out") fields = stdout.strip().split("\n") - return dict(item.split(": ") for item in fields) + + nr_notifications = 0 + for line in fields: + if line == "Got VMClock notification": + nr_notifications += 1 + + assert nr_notifications == expected_notifications + return dict(item.split(": ") for item in fields if item.startswith("VMCLOCK")) @pytest.mark.skipif( platform.machine() != "x86_64", reason="VMClock device is currently supported only on x86 systems", ) -def test_vmclock_fields(vm_with_vmclock): +@pytest.mark.parametrize("use_mmap", [False, True], ids=["read()", "mmap()"]) +def test_vmclock_read_fields(vm_with_vmclock, use_mmap): """Make sure that we expose the expected values in the VMclock struct""" vm = vm_with_vmclock - vmclock = parse_vmclock(vm) + vmclock = parse_vmclock(vm, use_mmap) + assert vmclock["VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT"] == "true" + assert vmclock["VMCLOCK_FLAG_NOTIFICATION_PRESENT"] == "true" assert vmclock["VMCLOCK_MAGIC"] == "0x4b4c4356" assert vmclock["VMCLOCK_SIZE"] == "0x1000" assert vmclock["VMCLOCK_VERSION"] == "1" assert vmclock["VMCLOCK_CLOCK_STATUS"] == "0" assert vmclock["VMCLOCK_COUNTER_ID"] == "255" assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == "0" + assert vmclock["VMCLOCK_VM_GENERATION_COUNTER"] == "0" @pytest.mark.skipif( platform.machine() != "x86_64", reason="VMClock device is currently supported only on x86 systems", ) -def test_snapshot_update(vm_with_vmclock, microvm_factory, snapshot_type): - """Test that `disruption_marker` is updated upon snapshot resume""" +@pytest.mark.parametrize("use_mmap", [False, True], ids=["read()", "mmap()"]) +def 
test_snapshot_update(vm_with_vmclock, microvm_factory, snapshot_type, use_mmap): + """Test that `disruption_marker` and `vm_generation_counter` are updated + upon snapshot resume""" basevm = vm_with_vmclock - vmclock = parse_vmclock(basevm) + vmclock = parse_vmclock(basevm, use_mmap) + assert vmclock["VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT"] == "true" + assert vmclock["VMCLOCK_FLAG_NOTIFICATION_PRESENT"] == "true" + assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == "0" + assert vmclock["VMCLOCK_VM_GENERATION_COUNTER"] == "0" + + snapshot = basevm.make_snapshot(snapshot_type) + basevm.kill() + + for i, vm in enumerate( + microvm_factory.build_n_from_snapshot(snapshot, 5, incremental=True) + ): + vmclock = parse_vmclock(vm, use_mmap) + assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == f"{i+1}" + assert vmclock["VMCLOCK_VM_GENERATION_COUNTER"] == f"{i+1}" + + +# TODO: remove this skip when we backport VMClock snapshot safety patches to 5.10 and 6.1 +@pytest.mark.skip( + reason="Skip until we get guest microVM kernels with support for the notification mechanism", +) +def test_vmclock_notifications(vm_with_vmclock, microvm_factory, snapshot_type): + """Test that Firecracker will send a notification on snapshot load""" + basevm = vm_with_vmclock + + # Launch vmclock utility in polling mode + basevm.ssh.check_output("/tmp/vmclock -p > /tmp/vmclock.out 2>&1 &") + + # We should not have received any notification yet + vmclock = parse_vmclock_from_poll(basevm, 0) + assert vmclock["VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT"] == "true" + assert vmclock["VMCLOCK_FLAG_NOTIFICATION_PRESENT"] == "true" assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == "0" + assert vmclock["VMCLOCK_VM_GENERATION_COUNTER"] == "0" snapshot = basevm.make_snapshot(snapshot_type) basevm.kill() @@ -62,5 +122,6 @@ def test_snapshot_update(vm_with_vmclock, microvm_factory, snapshot_type): for i, vm in enumerate( microvm_factory.build_n_from_snapshot(snapshot, 5, incremental=True) ): - vmclock = parse_vmclock(vm) + vmclock = 
parse_vmclock_from_poll(vm, i + 1) assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == f"{i+1}" + assert vmclock["VMCLOCK_VM_GENERATION_COUNTER"] == f"{i+1}" From 5b6f3c8f418b21fd5cac819a375c67fff513b930 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 5 Dec 2025 12:29:15 +0100 Subject: [PATCH 4/8] vmclock: add support for Aarch64 Expose VMClock device to guest via DT and enable compiling the vmclock for ARM architectures. Keep VMClock tests only on x86 until we get support from the guest kernel. Signed-off-by: Babis Chalios --- src/vmm/src/arch/aarch64/fdt.rs | 14 ++++++++++++++ src/vmm/src/arch/aarch64/output_GICv3.dtb | Bin 2097152 -> 2097152 bytes .../src/arch/aarch64/output_initrd_GICv3.dtb | Bin 2097152 -> 2097152 bytes src/vmm/src/builder.rs | 1 - src/vmm/src/device_manager/acpi.rs | 4 ---- src/vmm/src/device_manager/mod.rs | 2 -- src/vmm/src/device_manager/persist.rs | 5 ----- src/vmm/src/devices/acpi/vmclock.rs | 2 +- .../functional/test_max_devices.py | 6 +++--- 9 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 949435e6c83..5f98431b1d9 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -20,6 +20,7 @@ use crate::arch::{ use crate::device_manager::DeviceManager; use crate::device_manager::mmio::MMIODeviceInfo; use crate::device_manager::pci_mngr::PciDevices; +use crate::devices::acpi::vmclock::{VMCLOCK_SIZE, VmClock}; use crate::devices::acpi::vmgenid::{VMGENID_MEM_SIZE, VmGenId}; use crate::initrd::InitrdConfig; use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap, GuestRegionType}; @@ -97,6 +98,7 @@ pub fn create_fdt( create_psci_node(&mut fdt_writer)?; create_devices_node(&mut fdt_writer, device_manager)?; create_vmgenid_node(&mut fdt_writer, &device_manager.acpi_devices.vmgenid)?; + create_vmclock_node(&mut fdt_writer, &device_manager.acpi_devices.vmclock)?; create_pci_nodes(&mut fdt_writer, 
&device_manager.pci_devices)?; // End Header node. @@ -287,6 +289,18 @@ fn create_vmgenid_node(fdt: &mut FdtWriter, vmgenid: &VmGenId) -> Result<(), Fdt Ok(()) } +fn create_vmclock_node(fdt: &mut FdtWriter, vmclock: &VmClock) -> Result<(), FdtError> { + let vmclock_node = fdt.begin_node(&format!("ptp@{}", vmclock.guest_address.0))?; + fdt.property_string("compatible", "amazon,vmclock")?; + fdt.property_array_u64("reg", &[vmclock.guest_address.0, VMCLOCK_SIZE as u64])?; + fdt.property_array_u32( + "interrupts", + &[GIC_FDT_IRQ_TYPE_SPI, vmclock.gsi, IRQ_TYPE_EDGE_RISING], + )?; + fdt.end_node(vmclock_node)?; + Ok(()) +} + fn create_gic_node(fdt: &mut FdtWriter, gic_device: &GICDevice) -> Result<(), FdtError> { let interrupt = fdt.begin_node("intc")?; fdt.property_string("compatible", gic_device.fdt_compatibility())?; diff --git a/src/vmm/src/arch/aarch64/output_GICv3.dtb b/src/vmm/src/arch/aarch64/output_GICv3.dtb index 979cd68a285710b054e1b7a8f26e1599d41116f5..d1004096059f564f8c031c8aeeb520ba44249412 100644 GIT binary patch delta 261 zcmW;CISxTl6vpv;dCxpNJoE52K!O;fu>}iIh|*YyLUj{`TC{HyDxEdZD{R09M8yC4 zl3&jK&bdk{sbJlr-HVglQkGtbCmAX6ihc8)LcNtcT4;tKaT6J9Rz~FXH+! 
zSxk1TWp%rl&R5f!-u!PCbEJ!Mo_Y|y`rjhVt1d+DUE|;y5*~P=y9pos2q1_M!iXS> e7~-&yKoTj~NF#$Ra>%2AB1$Nua&Zmk^Zo&adq6M% delta 171 zcmWm2D-yym6hP6Jl7w$bpg{O{4LD{nT}}m(1!M++X0ixE05o7^8~t zul)BGF;5ZMi|E_c2X#t2Tig#veRk&Jj6#MMG@7$$qk}Gb=wpB!LpXShFh+q1rkEjM Ujs=!jVT}#84`<5HH&wR&19vPVn*aa+ diff --git a/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb b/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb index 63ab6765036fccb4140b5c2ac84830bebeb976ef..9477bb72d17616a922a1a6b876a6c703ff2d52fb 100644 GIT binary patch delta 262 zcmW;CI}SlX6vpwn?%eCm^?tJf1>zYBTd)9yC_M|2P)(vx3mZ|{z+HgC0z_{E3h^Ib z^2?d;oKZ>{3eG+JeG6glgb-aJtij1idn|d5*@VFF{U%C+E;L4*b4R~NecMCSW@WGD&f(Rju e2%>NhLmUYtkwO|7WRXK21r$+2`R2;X Self { ACPIDeviceManager { vmgenid: VmGenId::new(resource_allocator), - #[cfg(target_arch = "x86_64")] vmclock: VmClock::new(resource_allocator), } } @@ -44,7 +41,6 @@ impl ACPIDeviceManager { Ok(()) } - #[cfg(target_arch = "x86_64")] pub fn attach_vmclock(&self, vm: &Vm) -> Result<(), ACPIDeviceError> { vm.register_irq(&self.vmclock.interrupt_evt, self.vmclock.gsi)?; self.vmclock.activate(vm.guest_memory())?; diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 25c527f884a..1dc24d2feb3 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -237,7 +237,6 @@ impl DeviceManager { Ok(()) } - #[cfg(target_arch = "x86_64")] pub(crate) fn attach_vmclock_device(&mut self, vm: &Vm) -> Result<(), AttachDeviceError> { self.acpi_devices.attach_vmclock(vm)?; Ok(()) @@ -465,7 +464,6 @@ impl<'a> Persist<'a> for DeviceManager { // Restore ACPI devices let mut acpi_devices = ACPIDeviceManager::restore(constructor_args.vm, &state.acpi_state)?; acpi_devices.vmgenid.notify_guest()?; - #[cfg(target_arch = "x86_64")] acpi_devices .vmclock .post_load_update(constructor_args.vm.guest_memory()); diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index d93b2487a0c..7ca2a2bb81d 100644 --- 
a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -15,7 +15,6 @@ use super::mmio::*; #[cfg(target_arch = "aarch64")] use crate::arch::DeviceType; use crate::device_manager::acpi::ACPIDeviceError; -#[cfg(target_arch = "x86_64")] use crate::devices::acpi::vmclock::{VmClock, VmClockState}; use crate::devices::acpi::vmgenid::{VMGenIDState, VmGenId}; #[cfg(target_arch = "aarch64")] @@ -169,7 +168,6 @@ impl fmt::Debug for MMIODevManagerConstructorArgs<'_> { #[derive(Default, Debug, Clone, Serialize, Deserialize)] pub struct ACPIDeviceManagerState { vmgenid: VMGenIDState, - #[cfg(target_arch = "x86_64")] vmclock: VmClockState, } @@ -181,7 +179,6 @@ impl<'a> Persist<'a> for ACPIDeviceManager { fn save(&self) -> Self::State { ACPIDeviceManagerState { vmgenid: self.vmgenid.save(), - #[cfg(target_arch = "x86_64")] vmclock: self.vmclock.save(), } } @@ -191,11 +188,9 @@ impl<'a> Persist<'a> for ACPIDeviceManager { // Safe to unwrap() here, this will never return an error. vmgenid: VmGenId::restore((), &state.vmgenid).unwrap(), // Safe to unwrap() here, this will never return an error. - #[cfg(target_arch = "x86_64")] vmclock: VmClock::restore((), &state.vmclock).unwrap(), }; - #[cfg(target_arch = "x86_64")] vm.register_irq( &acpi_devices.vmclock.interrupt_evt, acpi_devices.vmclock.gsi, diff --git a/src/vmm/src/devices/acpi/vmclock.rs b/src/vmm/src/devices/acpi/vmclock.rs index 97af643e921..94e8c4563a4 100644 --- a/src/vmm/src/devices/acpi/vmclock.rs +++ b/src/vmm/src/devices/acpi/vmclock.rs @@ -27,7 +27,7 @@ use crate::vstate::resources::ResourceAllocator; unsafe impl ByteValued for vmclock_abi {} // We are reserving a physical page to expose the [`VmClock`] data -const VMCLOCK_SIZE: u32 = 0x1000; +pub const VMCLOCK_SIZE: u32 = 0x1000; // Write a value in `vmclock_abi` both in the Firecracker-managed state // and inside guest memory address that corresponds to it. 
diff --git a/tests/integration_tests/functional/test_max_devices.py b/tests/integration_tests/functional/test_max_devices.py index d2ba3e7ab16..bd51d4e53a9 100644 --- a/tests/integration_tests/functional/test_max_devices.py +++ b/tests/integration_tests/functional/test_max_devices.py @@ -18,9 +18,9 @@ def max_devices(uvm): match platform.machine(): case "aarch64": # On aarch64, IRQs are available from 32 to 127. We always use one IRQ each for - # the VMGenID, RTC and serial devices, so the maximum number of devices supported - # at the same time is 93. - return 93 + # the VMGenID, VMClock, RTC and serial devices, so the maximum number of devices + # supported at the same time is 92. + return 92 case "x86_64": # IRQs are available from 5 to 23. We always use one IRQ for VMGenID and VMClock # devices, so the maximum number of devices supported at the same time is 17. From 2fa64a6db4cb4d8a447f8d693a0f137a387d4dec Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 10 Dec 2025 11:45:39 +0100 Subject: [PATCH 5/8] ci: apply patches before building CI kernels Add support to apply patches while building kernels for our CI. For the time being we use this to apply patches for snapshot safety VMClock extensions [1] (backported to 5.10 and 6.1 AL kernels).
[1] https://lkml.org/lkml/2025/12/3/653 Signed-off-by: Babis Chalios --- ...tp-vmclock-add-vm-generation-counter.patch | 60 ++++ ...vmclock-support-device-notifications.patch | 257 ++++++++++++++++++ ...3-dt-bindings-ptp-Add-amazon-vmclock.patch | 76 ++++++ ...-ptp_vmclock-Add-device-tree-support.patch | 180 ++++++++++++ ...tp-vmclock-add-vm-generation-counter.patch | 60 ++++ ...vmclock-support-device-notifications.patch | 257 ++++++++++++++++++ ...3-dt-bindings-ptp-Add-amazon-vmclock.patch | 76 ++++++ ...-ptp_vmclock-Add-device-tree-support.patch | 180 ++++++++++++ resources/rebuild.sh | 11 + 9 files changed, 1157 insertions(+) create mode 100644 resources/patches/vmclock/5.10/0001-ptp-vmclock-add-vm-generation-counter.patch create mode 100644 resources/patches/vmclock/5.10/0002-ptp-vmclock-support-device-notifications.patch create mode 100644 resources/patches/vmclock/5.10/0003-dt-bindings-ptp-Add-amazon-vmclock.patch create mode 100644 resources/patches/vmclock/5.10/0004-ptp-ptp_vmclock-Add-device-tree-support.patch create mode 100644 resources/patches/vmclock/6.1/0001-ptp-vmclock-add-vm-generation-counter.patch create mode 100644 resources/patches/vmclock/6.1/0002-ptp-vmclock-support-device-notifications.patch create mode 100644 resources/patches/vmclock/6.1/0003-dt-bindings-ptp-Add-amazon-vmclock.patch create mode 100644 resources/patches/vmclock/6.1/0004-ptp-ptp_vmclock-Add-device-tree-support.patch diff --git a/resources/patches/vmclock/5.10/0001-ptp-vmclock-add-vm-generation-counter.patch b/resources/patches/vmclock/5.10/0001-ptp-vmclock-add-vm-generation-counter.patch new file mode 100644 index 00000000000..28588e1c924 --- /dev/null +++ b/resources/patches/vmclock/5.10/0001-ptp-vmclock-add-vm-generation-counter.patch @@ -0,0 +1,60 @@ +From a46562c571c6d50e7afc3994b33d0ffb61ff7409 Mon Sep 17 00:00:00 2001 +From: Babis Chalios +Date: Tue, 2 Dec 2025 20:11:32 +0000 +Subject: [PATCH 1/4] ptp: vmclock: add vm generation counter + +Similar to live migration, 
loading a VM from some saved state (aka +snapshot) is also an event that calls for clock adjustments in the +guest. However, guests might want to take more actions as a response to +such events, e.g. as discarding UUIDs, resetting network connections, +reseeding entropy pools, etc. These are actions that guests don't +typically take during live migration, so add a new field in the +vmclock_abi called vm_generation_counter which informs the guest about +such events. + +Hypervisor advertises support for vm_generation_counter through the +VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT flag. Users need to check the +presence of this bit in vmclock_abi flags field before using this flag. + +Signed-off-by: Babis Chalios +Reviewed-by: David Woodhouse +--- + include/uapi/linux/vmclock-abi.h | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h +index d7ca44313bf8..75deb6ae2b27 100644 +--- a/include/uapi/linux/vmclock-abi.h ++++ b/include/uapi/linux/vmclock-abi.h +@@ -119,6 +119,12 @@ struct vmclock_abi { + * bit again after the update, using the about-to-be-valid fields. + */ + #define VMCLOCK_FLAG_TIME_MONOTONIC (1 << 7) ++ /* ++ * If the VM_GEN_COUNTER_PRESENT flag is set, the hypervisor will ++ * bump the vm_generation_counter field every time the guest is ++ * loaded from some save state (restored from a snapshot). ++ */ ++#define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT (1 << 8) + + uint8_t pad[2]; + uint8_t clock_status; +@@ -183,6 +189,15 @@ struct vmclock_abi { + uint64_t time_frac_sec; /* (seconds >> 64) */ + uint64_t time_esterror_picosec; /* (± picoseconds) */ + uint64_t time_maxerror_picosec; /* (± picoseconds) */ ++ ++ /* ++ * This field changes to another non-repeating value when the guest ++ * has been loaded from a snapshot. 
In addition to handling a ++ * disruption in time (which will also be signalled through the ++ * disruption_marker field), a guest may wish to discard UUIDs, ++ * reset network connections, reseed entropy, etc. ++ */ ++ uint64_t vm_generation_counter; + }; + + #endif /* __VMCLOCK_ABI_H__ */ +-- +2.34.1 + diff --git a/resources/patches/vmclock/5.10/0002-ptp-vmclock-support-device-notifications.patch b/resources/patches/vmclock/5.10/0002-ptp-vmclock-support-device-notifications.patch new file mode 100644 index 00000000000..f9cde8c7242 --- /dev/null +++ b/resources/patches/vmclock/5.10/0002-ptp-vmclock-support-device-notifications.patch @@ -0,0 +1,257 @@ +From d0a6bf47dd6cd2a9ed17dbdc32dd34a6ba0f5b5f Mon Sep 17 00:00:00 2001 +From: Babis Chalios +Date: Tue, 2 Dec 2025 20:11:44 +0000 +Subject: [PATCH 2/4] ptp: vmclock: support device notifications + +Add optional support for device notifications in VMClock. When +supported, the hypervisor will send a device notification every time it +updates the seq_count to a new even value. + +Moreover, add support for poll() in VMClock as a means to propagate this +notification to user space. poll() will return a POLLIN event to +listeners every time seq_count changes to a value different than the one +last seen (since open() or last read()/pread()). This means that when +poll() returns a POLLIN event, listeners need to use read() to observe +what has changed and update the reader's view of seq_count. In other +words, after a poll() returned, all subsequent calls to poll() will +immediately return with a POLLIN event until the listener calls read(). + +The device advertises support for the notification mechanism by setting +flag VMCLOCK_FLAG_NOTIFICATION_PRESENT in vmclock_abi flags field. If +the flag is not present the driver won't setup the ACPI notification +handler and poll() will always immediately return POLLHUP. 
+ +Signed-off-by: Babis Chalios +--- + drivers/ptp/ptp_vmclock.c | 130 ++++++++++++++++++++++++++++--- + include/uapi/linux/vmclock-abi.h | 5 ++ + 2 files changed, 126 insertions(+), 9 deletions(-) + +diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c +index 1ce69eada4b2..4673915c43e7 100644 +--- a/drivers/ptp/ptp_vmclock.c ++++ b/drivers/ptp/ptp_vmclock.c +@@ -5,6 +5,9 @@ + * Copyright © 2024 Amazon.com, Inc. or its affiliates. + */ + ++#include "linux/poll.h" ++#include "linux/types.h" ++#include "linux/wait.h" + #include + #include + #include +@@ -37,6 +40,7 @@ struct vmclock_state { + struct resource res; + struct vmclock_abi *clk; + struct miscdevice miscdev; ++ wait_queue_head_t disrupt_wait; + struct ptp_clock_info ptp_clock_info; + struct ptp_clock *ptp_clock; + enum clocksource_ids cs_id, sys_cs_id; +@@ -311,10 +315,15 @@ static const struct ptp_clock_info ptp_vmclock_info = { + .getcrosststamp = ptp_vmclock_getcrosststamp, + }; + ++struct vmclock_file_state { ++ struct vmclock_state *st; ++ atomic_t seq; ++}; ++ + static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma) + { +- struct vmclock_state *st = container_of(fp->private_data, +- struct vmclock_state, miscdev); ++ struct vmclock_file_state *fst = fp->private_data; ++ struct vmclock_state *st = fst->st; + + if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ) + return -EROFS; +@@ -333,11 +342,12 @@ static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma) + static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, + size_t count, loff_t *ppos) + { +- struct vmclock_state *st = container_of(fp->private_data, +- struct vmclock_state, miscdev); ++ struct vmclock_file_state *fst = fp->private_data; ++ struct vmclock_state *st = fst->st; ++ + ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT); + size_t max_count; +- int32_t seq; ++ int32_t seq, old_seq; + + if (*ppos >= PAGE_SIZE) + return 0; +@@ -346,6 +356,7 @@ static ssize_t 
vmclock_miscdev_read(struct file *fp, char __user *buf, + if (count > max_count) + count = max_count; + ++ old_seq = atomic_read(&fst->seq); + while (1) { + seq = st->clk->seq_count & ~1ULL; + virt_rmb(); +@@ -354,8 +365,16 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, + return -EFAULT; + + virt_rmb(); +- if (seq == st->clk->seq_count) +- break; ++ if (seq == st->clk->seq_count) { ++ /* ++ * Either we updated fst->seq to seq (the latest version we observed) ++ * or someone else did (old_seq == seq), so we can break. ++ */ ++ if (atomic_try_cmpxchg(&fst->seq, &old_seq, seq) || ++ old_seq == seq) { ++ break; ++ } ++ } + + if (ktime_after(ktime_get(), deadline)) + return -ETIMEDOUT; +@@ -365,9 +384,57 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, + return count; + } + ++static __poll_t vmclock_miscdev_poll(struct file *fp, poll_table *wait) ++{ ++ struct vmclock_file_state *fst = fp->private_data; ++ struct vmclock_state *st = fst->st; ++ uint32_t seq; ++ ++ /* ++ * Hypervisor will not send us any notifications, so fail immediately ++ * to avoid having caller sleeping for ever. 
++ */ ++ if (!(st->clk->flags & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) ++ return POLLHUP; ++ ++ poll_wait(fp, &st->disrupt_wait, wait); ++ ++ seq = st->clk->seq_count; ++ if (atomic_read(&fst->seq) != seq) ++ return POLLIN | POLLRDNORM; ++ ++ return 0; ++} ++ ++static int vmclock_miscdev_open(struct inode *inode, struct file *fp) ++{ ++ struct vmclock_state *st = container_of(fp->private_data, ++ struct vmclock_state, miscdev); ++ struct vmclock_file_state *fst = kzalloc(sizeof(*fst), GFP_KERNEL); ++ ++ if (!fst) ++ return -ENOMEM; ++ ++ fst->st = st; ++ atomic_set(&fst->seq, 0); ++ ++ fp->private_data = fst; ++ ++ return 0; ++} ++ ++static int vmclock_miscdev_release(struct inode *inode, struct file *fp) ++{ ++ kfree(fp->private_data); ++ return 0; ++} ++ + static const struct file_operations vmclock_miscdev_fops = { +- .mmap = vmclock_miscdev_mmap, +- .read = vmclock_miscdev_read, ++ .open = vmclock_miscdev_open, ++ .release = vmclock_miscdev_release, ++ .mmap = vmclock_miscdev_mmap, ++ .read = vmclock_miscdev_read, ++ .poll = vmclock_miscdev_poll, + }; + + /* module operations */ +@@ -413,6 +480,44 @@ static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data + return AE_ERROR; + } + ++static void ++vmclock_acpi_notification_handler(acpi_handle __always_unused handle, ++ u32 __always_unused event, void *dev) ++{ ++ struct device *device = dev; ++ struct vmclock_state *st = device->driver_data; ++ ++ wake_up_interruptible(&st->disrupt_wait); ++} ++ ++static int vmclock_setup_notification(struct device *dev, struct vmclock_state *st) ++{ ++ struct acpi_device *adev = ACPI_COMPANION(dev); ++ acpi_status status; ++ ++ /* ++ * This should never happen as this function is only called when ++ * has_acpi_companion(dev) is true, but the logic is sufficiently ++ * complex that Coverity can't see the tautology. ++ */ ++ if (!adev) ++ return -ENODEV; ++ ++ /* The device does not support notifications. 
Nothing else to do */ ++ if (!(st->clk->flags & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) ++ return 0; ++ ++ status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY, ++ vmclock_acpi_notification_handler, ++ dev); ++ if (ACPI_FAILURE(status)) { ++ dev_err(dev, "failed to install notification handler"); ++ return -ENODEV; ++ } ++ ++ return 0; ++} ++ + static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) + { + struct acpi_device *adev = ACPI_COMPANION(dev); +@@ -495,6 +600,11 @@ static int vmclock_probe(struct platform_device *pdev) + goto out; + } + ++ init_waitqueue_head(&st->disrupt_wait); ++ ret = vmclock_setup_notification(dev, st); ++ if (ret) ++ return ret; ++ + /* If the structure is big enough, it can be mapped to userspace */ + if (st->clk->size >= PAGE_SIZE) { + st->miscdev.minor = MISC_DYNAMIC_MINOR; +@@ -544,6 +654,8 @@ static int vmclock_probe(struct platform_device *pdev) + goto out; + } + ++ dev->driver_data = st; ++ + dev_info(dev, "%s: registered %s%s%s\n", st->name, + st->miscdev.minor ? "miscdev" : "", + (st->miscdev.minor && st->ptp_clock) ? ", " : "", +diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h +index 75deb6ae2b27..4b7cd2b8532c 100644 +--- a/include/uapi/linux/vmclock-abi.h ++++ b/include/uapi/linux/vmclock-abi.h +@@ -125,6 +125,11 @@ struct vmclock_abi { + * loaded from some save state (restored from a snapshot). + */ + #define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT (1 << 8) ++ /* ++ * If the NOTIFICATION_PRESENT flag is set, the hypervisor will send ++ * a notification every time it updates seq_count to a new even number. 
++ */ ++#define VMCLOCK_FLAG_NOTIFICATION_PRESENT (1 << 9) + + uint8_t pad[2]; + uint8_t clock_status; +-- +2.34.1 + diff --git a/resources/patches/vmclock/5.10/0003-dt-bindings-ptp-Add-amazon-vmclock.patch b/resources/patches/vmclock/5.10/0003-dt-bindings-ptp-Add-amazon-vmclock.patch new file mode 100644 index 00000000000..67fea022740 --- /dev/null +++ b/resources/patches/vmclock/5.10/0003-dt-bindings-ptp-Add-amazon-vmclock.patch @@ -0,0 +1,76 @@ +From d594b01069fb6fabb068379b59bd26e59dbd6661 Mon Sep 17 00:00:00 2001 +From: David Woodhouse +Date: Tue, 2 Dec 2025 20:11:55 +0000 +Subject: [PATCH 3/4] dt-bindings: ptp: Add amazon,vmclock + +The vmclock device provides a PTP clock source and precise timekeeping +across live migration and snapshot/restore operations. + +The binding has a required memory region containing the vmclock_abi +structure and an optional interrupt for clock disruption notifications. + +The full specification is at https://david.woodhou.se/VMClock.pdf + +Signed-off-by: David Woodhouse +Signed-off-by: Babis Chalios +Reviewed-by: Krzysztof Kozlowski +--- + .../bindings/ptp/amazon,vmclock.yaml | 46 +++++++++++++++++++ + 1 file changed, 46 insertions(+) + create mode 100644 Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml + +diff --git a/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml b/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml +new file mode 100644 +index 000000000000..b98fee20ce5f +--- /dev/null ++++ b/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml +@@ -0,0 +1,46 @@ ++# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) ++%YAML 1.2 ++--- ++$id: http://devicetree.org/schemas/ptp/amazon,vmclock.yaml# ++$schema: http://devicetree.org/meta-schemas/core.yaml# ++ ++title: Virtual Machine Clock ++ ++maintainers: ++ - David Woodhouse ++ ++description: ++ The vmclock device provides a precise clock source and allows for ++ accurate timekeeping across live migration and snapshot/restore ++ operations. 
The full specification of the shared data structure ++ is available at https://david.woodhou.se/VMClock.pdf ++ ++properties: ++ compatible: ++ const: amazon,vmclock ++ ++ reg: ++ description: ++ Specifies the shared memory region containing the vmclock_abi structure. ++ maxItems: 1 ++ ++ interrupts: ++ description: ++ Interrupt used to notify when the contents of the vmclock_abi structure ++ have been updated. ++ maxItems: 1 ++ ++required: ++ - compatible ++ - reg ++ ++additionalProperties: false ++ ++examples: ++ - | ++ #include ++ ptp@80000000 { ++ compatible = "amazon,vmclock"; ++ reg = <0x80000000 0x1000>; ++ interrupts = ; ++ }; +-- +2.34.1 + diff --git a/resources/patches/vmclock/5.10/0004-ptp-ptp_vmclock-Add-device-tree-support.patch b/resources/patches/vmclock/5.10/0004-ptp-ptp_vmclock-Add-device-tree-support.patch new file mode 100644 index 00000000000..e7b4fbf568d --- /dev/null +++ b/resources/patches/vmclock/5.10/0004-ptp-ptp_vmclock-Add-device-tree-support.patch @@ -0,0 +1,180 @@ +From a70db7595dac8a3b84d14a8dc62b4067cc152055 Mon Sep 17 00:00:00 2001 +From: David Woodhouse +Date: Tue, 2 Dec 2025 20:12:07 +0000 +Subject: [PATCH 4/4] ptp: ptp_vmclock: Add device tree support + +Add device tree support to the ptp_vmclock driver, allowing it to probe +via device tree in addition to ACPI. + +Handle optional interrupt for clock disruption notifications, mirroring +the ACPI notification behavior. 
+ +Signed-off-by: David Woodhouse +Signed-off-by: Babis Chalios +--- + drivers/ptp/Kconfig | 2 +- + drivers/ptp/ptp_vmclock.c | 83 ++++++++++++++++++++++++++++++++++++--- + 2 files changed, 78 insertions(+), 7 deletions(-) + +diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig +index 44bc88a0a772..8c1aad77d708 100644 +--- a/drivers/ptp/Kconfig ++++ b/drivers/ptp/Kconfig +@@ -121,7 +121,7 @@ config PTP_1588_CLOCK_KVM + config PTP_1588_CLOCK_VMCLOCK + tristate "Virtual machine PTP clock" + depends on X86_TSC || ARM_ARCH_TIMER +- depends on PTP_1588_CLOCK && ACPI && ARCH_SUPPORTS_INT128 ++ depends on PTP_1588_CLOCK && ARCH_SUPPORTS_INT128 + default y + help + This driver adds support for using a virtual precision clock +diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c +index 4673915c43e7..4b8c7fa4ea91 100644 +--- a/drivers/ptp/ptp_vmclock.c ++++ b/drivers/ptp/ptp_vmclock.c +@@ -14,10 +14,13 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include + #include ++#include + #include + #include + #include +@@ -453,6 +456,7 @@ static int vmclock_remove(struct platform_device *pdev) + return 0; + } + ++#ifdef CONFIG_ACPI + static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data) + { + struct vmclock_state *st = data; +@@ -490,7 +494,7 @@ vmclock_acpi_notification_handler(acpi_handle __always_unused handle, + wake_up_interruptible(&st->disrupt_wait); + } + +-static int vmclock_setup_notification(struct device *dev, struct vmclock_state *st) ++static int vmclock_setup_acpi_notification(struct device *dev) + { + struct acpi_device *adev = ACPI_COMPANION(dev); + acpi_status status; +@@ -503,10 +507,6 @@ static int vmclock_setup_notification(struct device *dev, struct vmclock_state * + if (!adev) + return -ENODEV; + +- /* The device does not support notifications. 
Nothing else to do */ +- if (!(st->clk->flags & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) +- return 0; +- + status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY, + vmclock_acpi_notification_handler, + dev); +@@ -540,6 +540,70 @@ static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) + + return 0; + } ++#else ++static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) ++{ ++ return -EINVAL; ++} ++ ++static int vmclock_setup_acpi_notification(struct device *dev) ++{ ++ return -EINVAL; ++} ++ ++#endif ++ ++static irqreturn_t vmclock_of_irq_handler(int __always_unused irq, void *dev) ++{ ++ struct device *device = dev; ++ struct vmclock_state *st = device->driver_data; ++ ++ wake_up_interruptible(&st->disrupt_wait); ++ return IRQ_HANDLED; ++} ++ ++static int vmclock_probe_dt(struct device *dev, struct vmclock_state *st) ++{ ++ struct platform_device *pdev = to_platform_device(dev); ++ struct resource *res; ++ ++ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); ++ if (!res) ++ return -ENODEV; ++ ++ st->res = *res; ++ ++ return 0; ++} ++ ++static int vmclock_setup_of_notification(struct device *dev) ++{ ++ struct platform_device *pdev = to_platform_device(dev); ++ int irq; ++ ++ irq = platform_get_irq(pdev, 0); ++ if (irq < 0) ++ return irq; ++ ++ return devm_request_irq(dev, irq, vmclock_of_irq_handler, IRQF_SHARED, ++ "vmclock", dev); ++} ++ ++static int vmclock_setup_notification(struct device *dev, ++ struct vmclock_state *st) ++{ ++ /* The device does not support notifications. 
Nothing else to do */ ++ if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) ++ return 0; ++ ++ if (has_acpi_companion(dev)) { ++ return vmclock_setup_acpi_notification(dev); ++ } else { ++ return vmclock_setup_of_notification(dev); ++ } ++ ++} ++ + + static void vmclock_put_idx(void *data) + { +@@ -561,7 +625,7 @@ static int vmclock_probe(struct platform_device *pdev) + if (has_acpi_companion(dev)) + ret = vmclock_probe_acpi(dev, st); + else +- ret = -EINVAL; /* Only ACPI for now */ ++ ret = vmclock_probe_dt(dev, st); + + if (ret) { + dev_info(dev, "Failed to obtain physical address: %d\n", ret); +@@ -673,12 +737,19 @@ static const struct acpi_device_id vmclock_acpi_ids[] = { + }; + MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids); + ++static const struct of_device_id vmclock_of_ids[] = { ++ { .compatible = "amazon,vmclock", }, ++ { }, ++}; ++MODULE_DEVICE_TABLE(of, vmclock_of_ids); ++ + static struct platform_driver vmclock_platform_driver = { + .probe = vmclock_probe, + .remove = vmclock_remove, + .driver = { + .name = "vmclock", + .acpi_match_table = vmclock_acpi_ids, ++ .of_match_table = vmclock_of_ids, + }, + }; + +-- +2.34.1 + diff --git a/resources/patches/vmclock/6.1/0001-ptp-vmclock-add-vm-generation-counter.patch b/resources/patches/vmclock/6.1/0001-ptp-vmclock-add-vm-generation-counter.patch new file mode 100644 index 00000000000..28588e1c924 --- /dev/null +++ b/resources/patches/vmclock/6.1/0001-ptp-vmclock-add-vm-generation-counter.patch @@ -0,0 +1,60 @@ +From a46562c571c6d50e7afc3994b33d0ffb61ff7409 Mon Sep 17 00:00:00 2001 +From: Babis Chalios +Date: Tue, 2 Dec 2025 20:11:32 +0000 +Subject: [PATCH 1/4] ptp: vmclock: add vm generation counter + +Similar to live migration, loading a VM from some saved state (aka +snapshot) is also an event that calls for clock adjustments in the +guest. However, guests might want to take more actions as a response to +such events, e.g. 
as discarding UUIDs, resetting network connections, +reseeding entropy pools, etc. These are actions that guests don't +typically take during live migration, so add a new field in the +vmclock_abi called vm_generation_counter which informs the guest about +such events. + +Hypervisor advertises support for vm_generation_counter through the +VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT flag. Users need to check the +presence of this bit in vmclock_abi flags field before using this flag. + +Signed-off-by: Babis Chalios +Reviewed-by: David Woodhouse +--- + include/uapi/linux/vmclock-abi.h | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h +index d7ca44313bf8..75deb6ae2b27 100644 +--- a/include/uapi/linux/vmclock-abi.h ++++ b/include/uapi/linux/vmclock-abi.h +@@ -119,6 +119,12 @@ struct vmclock_abi { + * bit again after the update, using the about-to-be-valid fields. + */ + #define VMCLOCK_FLAG_TIME_MONOTONIC (1 << 7) ++ /* ++ * If the VM_GEN_COUNTER_PRESENT flag is set, the hypervisor will ++ * bump the vm_generation_counter field every time the guest is ++ * loaded from some save state (restored from a snapshot). ++ */ ++#define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT (1 << 8) + + uint8_t pad[2]; + uint8_t clock_status; +@@ -183,6 +189,15 @@ struct vmclock_abi { + uint64_t time_frac_sec; /* (seconds >> 64) */ + uint64_t time_esterror_picosec; /* (± picoseconds) */ + uint64_t time_maxerror_picosec; /* (± picoseconds) */ ++ ++ /* ++ * This field changes to another non-repeating value when the guest ++ * has been loaded from a snapshot. In addition to handling a ++ * disruption in time (which will also be signalled through the ++ * disruption_marker field), a guest may wish to discard UUIDs, ++ * reset network connections, reseed entropy, etc. 
++ */ ++ uint64_t vm_generation_counter; + }; + + #endif /* __VMCLOCK_ABI_H__ */ +-- +2.34.1 + diff --git a/resources/patches/vmclock/6.1/0002-ptp-vmclock-support-device-notifications.patch b/resources/patches/vmclock/6.1/0002-ptp-vmclock-support-device-notifications.patch new file mode 100644 index 00000000000..f9cde8c7242 --- /dev/null +++ b/resources/patches/vmclock/6.1/0002-ptp-vmclock-support-device-notifications.patch @@ -0,0 +1,257 @@ +From d0a6bf47dd6cd2a9ed17dbdc32dd34a6ba0f5b5f Mon Sep 17 00:00:00 2001 +From: Babis Chalios +Date: Tue, 2 Dec 2025 20:11:44 +0000 +Subject: [PATCH 2/4] ptp: vmclock: support device notifications + +Add optional support for device notifications in VMClock. When +supported, the hypervisor will send a device notification every time it +updates the seq_count to a new even value. + +Moreover, add support for poll() in VMClock as a means to propagate this +notification to user space. poll() will return a POLLIN event to +listeners every time seq_count changes to a value different than the one +last seen (since open() or last read()/pread()). This means that when +poll() returns a POLLIN event, listeners need to use read() to observe +what has changed and update the reader's view of seq_count. In other +words, after a poll() returned, all subsequent calls to poll() will +immediately return with a POLLIN event until the listener calls read(). + +The device advertises support for the notification mechanism by setting +flag VMCLOCK_FLAG_NOTIFICATION_PRESENT in vmclock_abi flags field. If +the flag is not present the driver won't setup the ACPI notification +handler and poll() will always immediately return POLLHUP. 
+ +Signed-off-by: Babis Chalios +--- + drivers/ptp/ptp_vmclock.c | 130 ++++++++++++++++++++++++++++--- + include/uapi/linux/vmclock-abi.h | 5 ++ + 2 files changed, 126 insertions(+), 9 deletions(-) + +diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c +index 1ce69eada4b2..4673915c43e7 100644 +--- a/drivers/ptp/ptp_vmclock.c ++++ b/drivers/ptp/ptp_vmclock.c +@@ -5,6 +5,9 @@ + * Copyright © 2024 Amazon.com, Inc. or its affiliates. + */ + ++#include "linux/poll.h" ++#include "linux/types.h" ++#include "linux/wait.h" + #include + #include + #include +@@ -37,6 +40,7 @@ struct vmclock_state { + struct resource res; + struct vmclock_abi *clk; + struct miscdevice miscdev; ++ wait_queue_head_t disrupt_wait; + struct ptp_clock_info ptp_clock_info; + struct ptp_clock *ptp_clock; + enum clocksource_ids cs_id, sys_cs_id; +@@ -311,10 +315,15 @@ static const struct ptp_clock_info ptp_vmclock_info = { + .getcrosststamp = ptp_vmclock_getcrosststamp, + }; + ++struct vmclock_file_state { ++ struct vmclock_state *st; ++ atomic_t seq; ++}; ++ + static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma) + { +- struct vmclock_state *st = container_of(fp->private_data, +- struct vmclock_state, miscdev); ++ struct vmclock_file_state *fst = fp->private_data; ++ struct vmclock_state *st = fst->st; + + if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ) + return -EROFS; +@@ -333,11 +342,12 @@ static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma) + static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, + size_t count, loff_t *ppos) + { +- struct vmclock_state *st = container_of(fp->private_data, +- struct vmclock_state, miscdev); ++ struct vmclock_file_state *fst = fp->private_data; ++ struct vmclock_state *st = fst->st; ++ + ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT); + size_t max_count; +- int32_t seq; ++ int32_t seq, old_seq; + + if (*ppos >= PAGE_SIZE) + return 0; +@@ -346,6 +356,7 @@ static ssize_t 
vmclock_miscdev_read(struct file *fp, char __user *buf, + if (count > max_count) + count = max_count; + ++ old_seq = atomic_read(&fst->seq); + while (1) { + seq = st->clk->seq_count & ~1ULL; + virt_rmb(); +@@ -354,8 +365,16 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, + return -EFAULT; + + virt_rmb(); +- if (seq == st->clk->seq_count) +- break; ++ if (seq == st->clk->seq_count) { ++ /* ++ * Either we updated fst->seq to seq (the latest version we observed) ++ * or someone else did (old_seq == seq), so we can break. ++ */ ++ if (atomic_try_cmpxchg(&fst->seq, &old_seq, seq) || ++ old_seq == seq) { ++ break; ++ } ++ } + + if (ktime_after(ktime_get(), deadline)) + return -ETIMEDOUT; +@@ -365,9 +384,57 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, + return count; + } + ++static __poll_t vmclock_miscdev_poll(struct file *fp, poll_table *wait) ++{ ++ struct vmclock_file_state *fst = fp->private_data; ++ struct vmclock_state *st = fst->st; ++ uint32_t seq; ++ ++ /* ++ * Hypervisor will not send us any notifications, so fail immediately ++ * to avoid having caller sleeping for ever. 
++ */ ++ if (!(st->clk->flags & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) ++ return POLLHUP; ++ ++ poll_wait(fp, &st->disrupt_wait, wait); ++ ++ seq = st->clk->seq_count; ++ if (atomic_read(&fst->seq) != seq) ++ return POLLIN | POLLRDNORM; ++ ++ return 0; ++} ++ ++static int vmclock_miscdev_open(struct inode *inode, struct file *fp) ++{ ++ struct vmclock_state *st = container_of(fp->private_data, ++ struct vmclock_state, miscdev); ++ struct vmclock_file_state *fst = kzalloc(sizeof(*fst), GFP_KERNEL); ++ ++ if (!fst) ++ return -ENOMEM; ++ ++ fst->st = st; ++ atomic_set(&fst->seq, 0); ++ ++ fp->private_data = fst; ++ ++ return 0; ++} ++ ++static int vmclock_miscdev_release(struct inode *inode, struct file *fp) ++{ ++ kfree(fp->private_data); ++ return 0; ++} ++ + static const struct file_operations vmclock_miscdev_fops = { +- .mmap = vmclock_miscdev_mmap, +- .read = vmclock_miscdev_read, ++ .open = vmclock_miscdev_open, ++ .release = vmclock_miscdev_release, ++ .mmap = vmclock_miscdev_mmap, ++ .read = vmclock_miscdev_read, ++ .poll = vmclock_miscdev_poll, + }; + + /* module operations */ +@@ -413,6 +480,44 @@ static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data + return AE_ERROR; + } + ++static void ++vmclock_acpi_notification_handler(acpi_handle __always_unused handle, ++ u32 __always_unused event, void *dev) ++{ ++ struct device *device = dev; ++ struct vmclock_state *st = device->driver_data; ++ ++ wake_up_interruptible(&st->disrupt_wait); ++} ++ ++static int vmclock_setup_notification(struct device *dev, struct vmclock_state *st) ++{ ++ struct acpi_device *adev = ACPI_COMPANION(dev); ++ acpi_status status; ++ ++ /* ++ * This should never happen as this function is only called when ++ * has_acpi_companion(dev) is true, but the logic is sufficiently ++ * complex that Coverity can't see the tautology. ++ */ ++ if (!adev) ++ return -ENODEV; ++ ++ /* The device does not support notifications. 
Nothing else to do */ ++ if (!(st->clk->flags & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) ++ return 0; ++ ++ status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY, ++ vmclock_acpi_notification_handler, ++ dev); ++ if (ACPI_FAILURE(status)) { ++ dev_err(dev, "failed to install notification handler"); ++ return -ENODEV; ++ } ++ ++ return 0; ++} ++ + static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) + { + struct acpi_device *adev = ACPI_COMPANION(dev); +@@ -495,6 +600,11 @@ static int vmclock_probe(struct platform_device *pdev) + goto out; + } + ++ init_waitqueue_head(&st->disrupt_wait); ++ ret = vmclock_setup_notification(dev, st); ++ if (ret) ++ return ret; ++ + /* If the structure is big enough, it can be mapped to userspace */ + if (st->clk->size >= PAGE_SIZE) { + st->miscdev.minor = MISC_DYNAMIC_MINOR; +@@ -544,6 +654,8 @@ static int vmclock_probe(struct platform_device *pdev) + goto out; + } + ++ dev->driver_data = st; ++ + dev_info(dev, "%s: registered %s%s%s\n", st->name, + st->miscdev.minor ? "miscdev" : "", + (st->miscdev.minor && st->ptp_clock) ? ", " : "", +diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h +index 75deb6ae2b27..4b7cd2b8532c 100644 +--- a/include/uapi/linux/vmclock-abi.h ++++ b/include/uapi/linux/vmclock-abi.h +@@ -125,6 +125,11 @@ struct vmclock_abi { + * loaded from some save state (restored from a snapshot). + */ + #define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT (1 << 8) ++ /* ++ * If the NOTIFICATION_PRESENT flag is set, the hypervisor will send ++ * a notification every time it updates seq_count to a new even number. 
++ */ ++#define VMCLOCK_FLAG_NOTIFICATION_PRESENT (1 << 9) + + uint8_t pad[2]; + uint8_t clock_status; +-- +2.34.1 + diff --git a/resources/patches/vmclock/6.1/0003-dt-bindings-ptp-Add-amazon-vmclock.patch b/resources/patches/vmclock/6.1/0003-dt-bindings-ptp-Add-amazon-vmclock.patch new file mode 100644 index 00000000000..67fea022740 --- /dev/null +++ b/resources/patches/vmclock/6.1/0003-dt-bindings-ptp-Add-amazon-vmclock.patch @@ -0,0 +1,76 @@ +From d594b01069fb6fabb068379b59bd26e59dbd6661 Mon Sep 17 00:00:00 2001 +From: David Woodhouse +Date: Tue, 2 Dec 2025 20:11:55 +0000 +Subject: [PATCH 3/4] dt-bindings: ptp: Add amazon,vmclock + +The vmclock device provides a PTP clock source and precise timekeeping +across live migration and snapshot/restore operations. + +The binding has a required memory region containing the vmclock_abi +structure and an optional interrupt for clock disruption notifications. + +The full specification is at https://david.woodhou.se/VMClock.pdf + +Signed-off-by: David Woodhouse +Signed-off-by: Babis Chalios +Reviewed-by: Krzysztof Kozlowski +--- + .../bindings/ptp/amazon,vmclock.yaml | 46 +++++++++++++++++++ + 1 file changed, 46 insertions(+) + create mode 100644 Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml + +diff --git a/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml b/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml +new file mode 100644 +index 000000000000..b98fee20ce5f +--- /dev/null ++++ b/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml +@@ -0,0 +1,46 @@ ++# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) ++%YAML 1.2 ++--- ++$id: http://devicetree.org/schemas/ptp/amazon,vmclock.yaml# ++$schema: http://devicetree.org/meta-schemas/core.yaml# ++ ++title: Virtual Machine Clock ++ ++maintainers: ++ - David Woodhouse ++ ++description: ++ The vmclock device provides a precise clock source and allows for ++ accurate timekeeping across live migration and snapshot/restore ++ operations. 
The full specification of the shared data structure ++ is available at https://david.woodhou.se/VMClock.pdf ++ ++properties: ++ compatible: ++ const: amazon,vmclock ++ ++ reg: ++ description: ++ Specifies the shared memory region containing the vmclock_abi structure. ++ maxItems: 1 ++ ++ interrupts: ++ description: ++ Interrupt used to notify when the contents of the vmclock_abi structure ++ have been updated. ++ maxItems: 1 ++ ++required: ++ - compatible ++ - reg ++ ++additionalProperties: false ++ ++examples: ++ - | ++ #include ++ ptp@80000000 { ++ compatible = "amazon,vmclock"; ++ reg = <0x80000000 0x1000>; ++ interrupts = ; ++ }; +-- +2.34.1 + diff --git a/resources/patches/vmclock/6.1/0004-ptp-ptp_vmclock-Add-device-tree-support.patch b/resources/patches/vmclock/6.1/0004-ptp-ptp_vmclock-Add-device-tree-support.patch new file mode 100644 index 00000000000..e7b4fbf568d --- /dev/null +++ b/resources/patches/vmclock/6.1/0004-ptp-ptp_vmclock-Add-device-tree-support.patch @@ -0,0 +1,180 @@ +From a70db7595dac8a3b84d14a8dc62b4067cc152055 Mon Sep 17 00:00:00 2001 +From: David Woodhouse +Date: Tue, 2 Dec 2025 20:12:07 +0000 +Subject: [PATCH 4/4] ptp: ptp_vmclock: Add device tree support + +Add device tree support to the ptp_vmclock driver, allowing it to probe +via device tree in addition to ACPI. + +Handle optional interrupt for clock disruption notifications, mirroring +the ACPI notification behavior. 
+ +Signed-off-by: David Woodhouse +Signed-off-by: Babis Chalios +--- + drivers/ptp/Kconfig | 2 +- + drivers/ptp/ptp_vmclock.c | 83 ++++++++++++++++++++++++++++++++++++--- + 2 files changed, 78 insertions(+), 7 deletions(-) + +diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig +index 44bc88a0a772..8c1aad77d708 100644 +--- a/drivers/ptp/Kconfig ++++ b/drivers/ptp/Kconfig +@@ -121,7 +121,7 @@ config PTP_1588_CLOCK_KVM + config PTP_1588_CLOCK_VMCLOCK + tristate "Virtual machine PTP clock" + depends on X86_TSC || ARM_ARCH_TIMER +- depends on PTP_1588_CLOCK && ACPI && ARCH_SUPPORTS_INT128 ++ depends on PTP_1588_CLOCK && ARCH_SUPPORTS_INT128 + default y + help + This driver adds support for using a virtual precision clock +diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c +index 4673915c43e7..4b8c7fa4ea91 100644 +--- a/drivers/ptp/ptp_vmclock.c ++++ b/drivers/ptp/ptp_vmclock.c +@@ -14,10 +14,13 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include + #include ++#include + #include + #include + #include +@@ -453,6 +456,7 @@ static int vmclock_remove(struct platform_device *pdev) + return 0; + } + ++#ifdef CONFIG_ACPI + static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data) + { + struct vmclock_state *st = data; +@@ -490,7 +494,7 @@ vmclock_acpi_notification_handler(acpi_handle __always_unused handle, + wake_up_interruptible(&st->disrupt_wait); + } + +-static int vmclock_setup_notification(struct device *dev, struct vmclock_state *st) ++static int vmclock_setup_acpi_notification(struct device *dev) + { + struct acpi_device *adev = ACPI_COMPANION(dev); + acpi_status status; +@@ -503,10 +507,6 @@ static int vmclock_setup_notification(struct device *dev, struct vmclock_state * + if (!adev) + return -ENODEV; + +- /* The device does not support notifications. 
Nothing else to do */ +- if (!(st->clk->flags & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) +- return 0; +- + status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY, + vmclock_acpi_notification_handler, + dev); +@@ -540,6 +540,70 @@ static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) + + return 0; + } ++#else ++static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) ++{ ++ return -EINVAL; ++} ++ ++static int vmclock_setup_acpi_notification(struct device *dev) ++{ ++ return -EINVAL; ++} ++ ++#endif ++ ++static irqreturn_t vmclock_of_irq_handler(int __always_unused irq, void *dev) ++{ ++ struct device *device = dev; ++ struct vmclock_state *st = device->driver_data; ++ ++ wake_up_interruptible(&st->disrupt_wait); ++ return IRQ_HANDLED; ++} ++ ++static int vmclock_probe_dt(struct device *dev, struct vmclock_state *st) ++{ ++ struct platform_device *pdev = to_platform_device(dev); ++ struct resource *res; ++ ++ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); ++ if (!res) ++ return -ENODEV; ++ ++ st->res = *res; ++ ++ return 0; ++} ++ ++static int vmclock_setup_of_notification(struct device *dev) ++{ ++ struct platform_device *pdev = to_platform_device(dev); ++ int irq; ++ ++ irq = platform_get_irq(pdev, 0); ++ if (irq < 0) ++ return irq; ++ ++ return devm_request_irq(dev, irq, vmclock_of_irq_handler, IRQF_SHARED, ++ "vmclock", dev); ++} ++ ++static int vmclock_setup_notification(struct device *dev, ++ struct vmclock_state *st) ++{ ++ /* The device does not support notifications. 
Nothing else to do */ ++ if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) ++ return 0; ++ ++ if (has_acpi_companion(dev)) { ++ return vmclock_setup_acpi_notification(dev); ++ } else { ++ return vmclock_setup_of_notification(dev); ++ } ++ ++} ++ + + static void vmclock_put_idx(void *data) + { +@@ -561,7 +625,7 @@ static int vmclock_probe(struct platform_device *pdev) + if (has_acpi_companion(dev)) + ret = vmclock_probe_acpi(dev, st); + else +- ret = -EINVAL; /* Only ACPI for now */ ++ ret = vmclock_probe_dt(dev, st); + + if (ret) { + dev_info(dev, "Failed to obtain physical address: %d\n", ret); +@@ -673,12 +737,19 @@ static const struct acpi_device_id vmclock_acpi_ids[] = { + }; + MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids); + ++static const struct of_device_id vmclock_of_ids[] = { ++ { .compatible = "amazon,vmclock", }, ++ { }, ++}; ++MODULE_DEVICE_TABLE(of, vmclock_of_ids); ++ + static struct platform_driver vmclock_platform_driver = { + .probe = vmclock_probe, + .remove = vmclock_remove, + .driver = { + .name = "vmclock", + .acpi_match_table = vmclock_acpi_ids, ++ .of_match_table = vmclock_of_ids, + }, + }; + +-- +2.34.1 + diff --git a/resources/rebuild.sh b/resources/rebuild.sh index 505afd555d1..235055e6874 100755 --- a/resources/rebuild.sh +++ b/resources/rebuild.sh @@ -167,6 +167,12 @@ function build_al_kernel { git checkout $(get_tag $KERNEL_VERSION) + # Apply any patchset we have for our kernels + for patchset in ../patches/*; do + echo "Applying patchset ${patchset}/${KERNEL_VERSION}" + git apply ${patchset}/${KERNEL_VERSION}/*.patch + done + arch=$(uname -m) if [ "$arch" = "x86_64" ]; then format="elf" @@ -194,6 +200,11 @@ function build_al_kernel { cp -v $binary_path $OUTPUT_FILE cp -v .config $OUTPUT_FILE.config + # Undo any patches previously applied, so that we can build the same kernel with different + # configs, e.g.
no-acpi + git reset --hard $(get_tag $KERNEL_VERSION) + git clean -f -d + popd &>/dev/null } From 7903ccaf505c22a504bf8fa43b3c3b9a1bef14e3 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 10 Dec 2025 15:43:20 +0100 Subject: [PATCH 6/8] test(vmclock): enable VMClock for Aarch64 kernels Enable CONFIG_PTP_1588_CLOCK_VMCLOCK for both 5.10 and 6.1 kernels Signed-off-by: Babis Chalios --- resources/guest_configs/microvm-kernel-ci-aarch64-5.10.config | 1 + resources/guest_configs/microvm-kernel-ci-aarch64-6.1.config | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/resources/guest_configs/microvm-kernel-ci-aarch64-5.10.config b/resources/guest_configs/microvm-kernel-ci-aarch64-5.10.config index ac44904c1b4..55330d5beda 100644 --- a/resources/guest_configs/microvm-kernel-ci-aarch64-5.10.config +++ b/resources/guest_configs/microvm-kernel-ci-aarch64-5.10.config @@ -1767,6 +1767,7 @@ CONFIG_PTP_1588_CLOCK=y # Enable PHYLIB and NETWORK_PHY_TIMESTAMPING to see the additional clocks. # CONFIG_PTP_1588_CLOCK_KVM=y +CONFIG_PTP_1588_CLOCK_VMCLOCK=y # end of PTP clock support # CONFIG_PINCTRL is not set diff --git a/resources/guest_configs/microvm-kernel-ci-aarch64-6.1.config b/resources/guest_configs/microvm-kernel-ci-aarch64-6.1.config index 26b87a6580b..376112e230e 100644 --- a/resources/guest_configs/microvm-kernel-ci-aarch64-6.1.config +++ b/resources/guest_configs/microvm-kernel-ci-aarch64-6.1.config @@ -1864,7 +1864,7 @@ CONFIG_PTP_1588_CLOCK_OPTIONAL=y # Enable PHYLIB and NETWORK_PHY_TIMESTAMPING to see the additional clocks. 
# CONFIG_PTP_1588_CLOCK_KVM=y -# CONFIG_PTP_1588_CLOCK_VMCLOCK is not set +CONFIG_PTP_1588_CLOCK_VMCLOCK=y # end of PTP clock support # CONFIG_PINCTRL is not set From 99d68b0d98f8e54f68e14c0cb1215e085b316356 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 10 Dec 2025 11:52:34 +0100 Subject: [PATCH 7/8] test(vmclock): enable skipped tests We now apply backported patches that add support for VMClock on Aarch64 systems via DT bindings, so enable tests for Aarch64 as well. Moreover, backported patches add support for the poll() system calls family, so enable the relevant test. Signed-off-by: Babis Chalios --- tests/integration_tests/functional/test_vmclock.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tests/integration_tests/functional/test_vmclock.py b/tests/integration_tests/functional/test_vmclock.py index 1e838a539f3..925c6b021c5 100644 --- a/tests/integration_tests/functional/test_vmclock.py +++ b/tests/integration_tests/functional/test_vmclock.py @@ -2,8 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 """Test VMclock device emulation""" -import platform - import pytest @@ -50,10 +48,6 @@ def parse_vmclock_from_poll(vm, expected_notifications): return dict(item.split(": ") for item in fields if item.startswith("VMCLOCK")) -@pytest.mark.skipif( - platform.machine() != "x86_64", - reason="VMClock device is currently supported only on x86 systems", -) @pytest.mark.parametrize("use_mmap", [False, True], ids=["read()", "mmap()"]) def test_vmclock_read_fields(vm_with_vmclock, use_mmap): """Make sure that we expose the expected values in the VMclock struct""" @@ -71,10 +65,6 @@ def test_vmclock_read_fields(vm_with_vmclock, use_mmap): assert vmclock["VMCLOCK_VM_GENERATION_COUNTER"] == "0" -@pytest.mark.skipif( - platform.machine() != "x86_64", - reason="VMClock device is currently supported only on x86 systems", -) @pytest.mark.parametrize("use_mmap", [False, True], ids=["read()", "mmap()"]) def test_snapshot_update(vm_with_vmclock, 
microvm_factory, snapshot_type, use_mmap): """Test that `disruption_marker` and `vm_generation_counter` are updated @@ -98,10 +88,6 @@ def test_snapshot_update(vm_with_vmclock, microvm_factory, snapshot_type, use_mm assert vmclock["VMCLOCK_VM_GENERATION_COUNTER"] == f"{i+1}" -# TODO: remove this skip when we backport VMClock snapshot safety patches to 5.10 and 6.1 -@pytest.mark.skip( - reason="Skip until we get guest microVM kernels with support for the notification mechanism", -) def test_vmclock_notifications(vm_with_vmclock, microvm_factory, snapshot_type): """Test that Firecracker will send a notification on snapshot load""" basevm = vm_with_vmclock From e419450fc7590351ffdde632c75b19a5b500c546 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 11 Dec 2025 09:56:30 +0100 Subject: [PATCH 8/8] temp: point to CI bucket with VMClock kernels We created a CI bucket that includes VMClock kernels with backported patches for the snapshot safety features. Signed-off-by: Babis Chalios --- tools/devtool | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/devtool b/tools/devtool index dd95860695c..80ee3aa2c6b 100755 --- a/tools/devtool +++ b/tools/devtool @@ -573,7 +573,8 @@ ensure_ci_artifacts() { # Fetch all the artifacts so they are local say "Fetching CI artifacts from S3" FC_VERSION=$(cmd_sh "cd src/firecracker/src; cargo pkgid | cut -d# -f2 | cut -d. -f1-2") - S3_URL=s3://spec.ccfc.min/firecracker-ci/v$FC_VERSION/$(uname -m) + #S3_URL=s3://spec.ccfc.min/firecracker-ci/v$FC_VERSION/$(uname -m) + S3_URL=s3://spec.ccfc.min/firecracker-ci/v1.15-vmclock/$(uname -m) ARTIFACTS=$MICROVM_IMAGES_DIR/$(uname -m) if [ ! -d "$ARTIFACTS" ]; then mkdir -pv $ARTIFACTS