diff --git a/resources/guest_configs/microvm-kernel-ci-aarch64-5.10.config b/resources/guest_configs/microvm-kernel-ci-aarch64-5.10.config index ac44904c1b4..55330d5beda 100644 --- a/resources/guest_configs/microvm-kernel-ci-aarch64-5.10.config +++ b/resources/guest_configs/microvm-kernel-ci-aarch64-5.10.config @@ -1767,6 +1767,7 @@ CONFIG_PTP_1588_CLOCK=y # Enable PHYLIB and NETWORK_PHY_TIMESTAMPING to see the additional clocks. # CONFIG_PTP_1588_CLOCK_KVM=y +CONFIG_PTP_1588_CLOCK_VMCLOCK=y # end of PTP clock support # CONFIG_PINCTRL is not set diff --git a/resources/guest_configs/microvm-kernel-ci-aarch64-6.1.config b/resources/guest_configs/microvm-kernel-ci-aarch64-6.1.config index 26b87a6580b..376112e230e 100644 --- a/resources/guest_configs/microvm-kernel-ci-aarch64-6.1.config +++ b/resources/guest_configs/microvm-kernel-ci-aarch64-6.1.config @@ -1864,7 +1864,7 @@ CONFIG_PTP_1588_CLOCK_OPTIONAL=y # Enable PHYLIB and NETWORK_PHY_TIMESTAMPING to see the additional clocks. # CONFIG_PTP_1588_CLOCK_KVM=y -# CONFIG_PTP_1588_CLOCK_VMCLOCK is not set +CONFIG_PTP_1588_CLOCK_VMCLOCK=y # end of PTP clock support # CONFIG_PINCTRL is not set diff --git a/resources/patches/vmclock/5.10/0001-ptp-vmclock-add-vm-generation-counter.patch b/resources/patches/vmclock/5.10/0001-ptp-vmclock-add-vm-generation-counter.patch new file mode 100644 index 00000000000..28588e1c924 --- /dev/null +++ b/resources/patches/vmclock/5.10/0001-ptp-vmclock-add-vm-generation-counter.patch @@ -0,0 +1,60 @@ +From a46562c571c6d50e7afc3994b33d0ffb61ff7409 Mon Sep 17 00:00:00 2001 +From: Babis Chalios +Date: Tue, 2 Dec 2025 20:11:32 +0000 +Subject: [PATCH 1/4] ptp: vmclock: add vm generation counter + +Similar to live migration, loading a VM from some saved state (aka +snapshot) is also an event that calls for clock adjustments in the +guest. However, guests might want to take more actions as a response to +such events, e.g. 
discarding UUIDs, resetting network connections, +reseeding entropy pools, etc. These are actions that guests don't +typically take during live migration, so add a new field in the +vmclock_abi called vm_generation_counter which informs the guest about +such events. + +Hypervisor advertises support for vm_generation_counter through the +VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT flag. Users need to check the +presence of this bit in vmclock_abi flags field before using this field. + +Signed-off-by: Babis Chalios +Reviewed-by: David Woodhouse +--- + include/uapi/linux/vmclock-abi.h | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h +index d7ca44313bf8..75deb6ae2b27 100644 +--- a/include/uapi/linux/vmclock-abi.h ++++ b/include/uapi/linux/vmclock-abi.h +@@ -119,6 +119,12 @@ struct vmclock_abi { + * bit again after the update, using the about-to-be-valid fields. + */ + #define VMCLOCK_FLAG_TIME_MONOTONIC (1 << 7) ++ /* ++ * If the VM_GEN_COUNTER_PRESENT flag is set, the hypervisor will ++ * bump the vm_generation_counter field every time the guest is ++ * loaded from some save state (restored from a snapshot). ++ */ ++#define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT (1 << 8) + + uint8_t pad[2]; + uint8_t clock_status; +@@ -183,6 +189,15 @@ struct vmclock_abi { + uint64_t time_frac_sec; /* (seconds >> 64) */ + uint64_t time_esterror_picosec; /* (± picoseconds) */ + uint64_t time_maxerror_picosec; /* (± picoseconds) */ ++ ++ /* ++ * This field changes to another non-repeating value when the guest ++ * has been loaded from a snapshot. In addition to handling a ++ * disruption in time (which will also be signalled through the ++ * disruption_marker field), a guest may wish to discard UUIDs, ++ * reset network connections, reseed entropy, etc. 
++ */ ++ uint64_t vm_generation_counter; + }; + + #endif /* __VMCLOCK_ABI_H__ */ +-- +2.34.1 + diff --git a/resources/patches/vmclock/5.10/0002-ptp-vmclock-support-device-notifications.patch b/resources/patches/vmclock/5.10/0002-ptp-vmclock-support-device-notifications.patch new file mode 100644 index 00000000000..f9cde8c7242 --- /dev/null +++ b/resources/patches/vmclock/5.10/0002-ptp-vmclock-support-device-notifications.patch @@ -0,0 +1,257 @@ +From d0a6bf47dd6cd2a9ed17dbdc32dd34a6ba0f5b5f Mon Sep 17 00:00:00 2001 +From: Babis Chalios +Date: Tue, 2 Dec 2025 20:11:44 +0000 +Subject: [PATCH 2/4] ptp: vmclock: support device notifications + +Add optional support for device notifications in VMClock. When +supported, the hypervisor will send a device notification every time it +updates the seq_count to a new even value. + +Moreover, add support for poll() in VMClock as a means to propagate this +notification to user space. poll() will return a POLLIN event to +listeners every time seq_count changes to a value different than the one +last seen (since open() or last read()/pread()). This means that when +poll() returns a POLLIN event, listeners need to use read() to observe +what has changed and update the reader's view of seq_count. In other +words, after poll() has returned, all subsequent calls to poll() will +immediately return with a POLLIN event until the listener calls read(). + +The device advertises support for the notification mechanism by setting +flag VMCLOCK_FLAG_NOTIFICATION_PRESENT in vmclock_abi flags field. If +the flag is not present the driver won't set up the ACPI notification +handler and poll() will always immediately return POLLHUP. 
+ +Signed-off-by: Babis Chalios +--- + drivers/ptp/ptp_vmclock.c | 130 ++++++++++++++++++++++++++++--- + include/uapi/linux/vmclock-abi.h | 5 ++ + 2 files changed, 126 insertions(+), 9 deletions(-) + +diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c +index 1ce69eada4b2..4673915c43e7 100644 +--- a/drivers/ptp/ptp_vmclock.c ++++ b/drivers/ptp/ptp_vmclock.c +@@ -5,6 +5,9 @@ + * Copyright © 2024 Amazon.com, Inc. or its affiliates. + */ + ++#include "linux/poll.h" ++#include "linux/types.h" ++#include "linux/wait.h" + #include + #include + #include +@@ -37,6 +40,7 @@ struct vmclock_state { + struct resource res; + struct vmclock_abi *clk; + struct miscdevice miscdev; ++ wait_queue_head_t disrupt_wait; + struct ptp_clock_info ptp_clock_info; + struct ptp_clock *ptp_clock; + enum clocksource_ids cs_id, sys_cs_id; +@@ -311,10 +315,15 @@ static const struct ptp_clock_info ptp_vmclock_info = { + .getcrosststamp = ptp_vmclock_getcrosststamp, + }; + ++struct vmclock_file_state { ++ struct vmclock_state *st; ++ atomic_t seq; ++}; ++ + static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma) + { +- struct vmclock_state *st = container_of(fp->private_data, +- struct vmclock_state, miscdev); ++ struct vmclock_file_state *fst = fp->private_data; ++ struct vmclock_state *st = fst->st; + + if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ) + return -EROFS; +@@ -333,11 +342,12 @@ static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma) + static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, + size_t count, loff_t *ppos) + { +- struct vmclock_state *st = container_of(fp->private_data, +- struct vmclock_state, miscdev); ++ struct vmclock_file_state *fst = fp->private_data; ++ struct vmclock_state *st = fst->st; ++ + ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT); + size_t max_count; +- int32_t seq; ++ int32_t seq, old_seq; + + if (*ppos >= PAGE_SIZE) + return 0; +@@ -346,6 +356,7 @@ static ssize_t 
vmclock_miscdev_read(struct file *fp, char __user *buf, + if (count > max_count) + count = max_count; + ++ old_seq = atomic_read(&fst->seq); + while (1) { + seq = st->clk->seq_count & ~1ULL; + virt_rmb(); +@@ -354,8 +365,16 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, + return -EFAULT; + + virt_rmb(); +- if (seq == st->clk->seq_count) +- break; ++ if (seq == st->clk->seq_count) { ++ /* ++ * Either we updated fst->seq to seq (the latest version we observed) ++ * or someone else did (old_seq == seq), so we can break. ++ */ ++ if (atomic_try_cmpxchg(&fst->seq, &old_seq, seq) || ++ old_seq == seq) { ++ break; ++ } ++ } + + if (ktime_after(ktime_get(), deadline)) + return -ETIMEDOUT; +@@ -365,9 +384,57 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, + return count; + } + ++static __poll_t vmclock_miscdev_poll(struct file *fp, poll_table *wait) ++{ ++ struct vmclock_file_state *fst = fp->private_data; ++ struct vmclock_state *st = fst->st; ++ uint32_t seq; ++ ++ /* ++ * Hypervisor will not send us any notifications, so fail immediately ++ * to avoid having caller sleeping for ever. 
++ */ ++ if (!(st->clk->flags & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) ++ return POLLHUP; ++ ++ poll_wait(fp, &st->disrupt_wait, wait); ++ ++ seq = st->clk->seq_count; ++ if (atomic_read(&fst->seq) != seq) ++ return POLLIN | POLLRDNORM; ++ ++ return 0; ++} ++ ++static int vmclock_miscdev_open(struct inode *inode, struct file *fp) ++{ ++ struct vmclock_state *st = container_of(fp->private_data, ++ struct vmclock_state, miscdev); ++ struct vmclock_file_state *fst = kzalloc(sizeof(*fst), GFP_KERNEL); ++ ++ if (!fst) ++ return -ENOMEM; ++ ++ fst->st = st; ++ atomic_set(&fst->seq, 0); ++ ++ fp->private_data = fst; ++ ++ return 0; ++} ++ ++static int vmclock_miscdev_release(struct inode *inode, struct file *fp) ++{ ++ kfree(fp->private_data); ++ return 0; ++} ++ + static const struct file_operations vmclock_miscdev_fops = { +- .mmap = vmclock_miscdev_mmap, +- .read = vmclock_miscdev_read, ++ .open = vmclock_miscdev_open, ++ .release = vmclock_miscdev_release, ++ .mmap = vmclock_miscdev_mmap, ++ .read = vmclock_miscdev_read, ++ .poll = vmclock_miscdev_poll, + }; + + /* module operations */ +@@ -413,6 +480,44 @@ static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data + return AE_ERROR; + } + ++static void ++vmclock_acpi_notification_handler(acpi_handle __always_unused handle, ++ u32 __always_unused event, void *dev) ++{ ++ struct device *device = dev; ++ struct vmclock_state *st = device->driver_data; ++ ++ wake_up_interruptible(&st->disrupt_wait); ++} ++ ++static int vmclock_setup_notification(struct device *dev, struct vmclock_state *st) ++{ ++ struct acpi_device *adev = ACPI_COMPANION(dev); ++ acpi_status status; ++ ++ /* ++ * This should never happen as this function is only called when ++ * has_acpi_companion(dev) is true, but the logic is sufficiently ++ * complex that Coverity can't see the tautology. ++ */ ++ if (!adev) ++ return -ENODEV; ++ ++ /* The device does not support notifications. 
Nothing else to do */ ++ if (!(st->clk->flags & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) ++ return 0; ++ ++ status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY, ++ vmclock_acpi_notification_handler, ++ dev); ++ if (ACPI_FAILURE(status)) { ++ dev_err(dev, "failed to install notification handler"); ++ return -ENODEV; ++ } ++ ++ return 0; ++} ++ + static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) + { + struct acpi_device *adev = ACPI_COMPANION(dev); +@@ -495,6 +600,11 @@ static int vmclock_probe(struct platform_device *pdev) + goto out; + } + ++ init_waitqueue_head(&st->disrupt_wait); ++ ret = vmclock_setup_notification(dev, st); ++ if (ret) ++ return ret; ++ + /* If the structure is big enough, it can be mapped to userspace */ + if (st->clk->size >= PAGE_SIZE) { + st->miscdev.minor = MISC_DYNAMIC_MINOR; +@@ -544,6 +654,8 @@ static int vmclock_probe(struct platform_device *pdev) + goto out; + } + ++ dev->driver_data = st; ++ + dev_info(dev, "%s: registered %s%s%s\n", st->name, + st->miscdev.minor ? "miscdev" : "", + (st->miscdev.minor && st->ptp_clock) ? ", " : "", +diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h +index 75deb6ae2b27..4b7cd2b8532c 100644 +--- a/include/uapi/linux/vmclock-abi.h ++++ b/include/uapi/linux/vmclock-abi.h +@@ -125,6 +125,11 @@ struct vmclock_abi { + * loaded from some save state (restored from a snapshot). + */ + #define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT (1 << 8) ++ /* ++ * If the NOTIFICATION_PRESENT flag is set, the hypervisor will send ++ * a notification every time it updates seq_count to a new even number. 
++ */ ++#define VMCLOCK_FLAG_NOTIFICATION_PRESENT (1 << 9) + + uint8_t pad[2]; + uint8_t clock_status; +-- +2.34.1 + diff --git a/resources/patches/vmclock/5.10/0003-dt-bindings-ptp-Add-amazon-vmclock.patch b/resources/patches/vmclock/5.10/0003-dt-bindings-ptp-Add-amazon-vmclock.patch new file mode 100644 index 00000000000..67fea022740 --- /dev/null +++ b/resources/patches/vmclock/5.10/0003-dt-bindings-ptp-Add-amazon-vmclock.patch @@ -0,0 +1,76 @@ +From d594b01069fb6fabb068379b59bd26e59dbd6661 Mon Sep 17 00:00:00 2001 +From: David Woodhouse +Date: Tue, 2 Dec 2025 20:11:55 +0000 +Subject: [PATCH 3/4] dt-bindings: ptp: Add amazon,vmclock + +The vmclock device provides a PTP clock source and precise timekeeping +across live migration and snapshot/restore operations. + +The binding has a required memory region containing the vmclock_abi +structure and an optional interrupt for clock disruption notifications. + +The full specification is at https://david.woodhou.se/VMClock.pdf + +Signed-off-by: David Woodhouse +Signed-off-by: Babis Chalios +Reviewed-by: Krzysztof Kozlowski +--- + .../bindings/ptp/amazon,vmclock.yaml | 46 +++++++++++++++++++ + 1 file changed, 46 insertions(+) + create mode 100644 Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml + +diff --git a/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml b/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml +new file mode 100644 +index 000000000000..b98fee20ce5f +--- /dev/null ++++ b/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml +@@ -0,0 +1,46 @@ ++# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) ++%YAML 1.2 ++--- ++$id: http://devicetree.org/schemas/ptp/amazon,vmclock.yaml# ++$schema: http://devicetree.org/meta-schemas/core.yaml# ++ ++title: Virtual Machine Clock ++ ++maintainers: ++ - David Woodhouse ++ ++description: ++ The vmclock device provides a precise clock source and allows for ++ accurate timekeeping across live migration and snapshot/restore ++ operations. 
The full specification of the shared data structure ++ is available at https://david.woodhou.se/VMClock.pdf ++ ++properties: ++ compatible: ++ const: amazon,vmclock ++ ++ reg: ++ description: ++ Specifies the shared memory region containing the vmclock_abi structure. ++ maxItems: 1 ++ ++ interrupts: ++ description: ++ Interrupt used to notify when the contents of the vmclock_abi structure ++ have been updated. ++ maxItems: 1 ++ ++required: ++ - compatible ++ - reg ++ ++additionalProperties: false ++ ++examples: ++ - | ++ #include ++ ptp@80000000 { ++ compatible = "amazon,vmclock"; ++ reg = <0x80000000 0x1000>; ++ interrupts = ; ++ }; +-- +2.34.1 + diff --git a/resources/patches/vmclock/5.10/0004-ptp-ptp_vmclock-Add-device-tree-support.patch b/resources/patches/vmclock/5.10/0004-ptp-ptp_vmclock-Add-device-tree-support.patch new file mode 100644 index 00000000000..e7b4fbf568d --- /dev/null +++ b/resources/patches/vmclock/5.10/0004-ptp-ptp_vmclock-Add-device-tree-support.patch @@ -0,0 +1,180 @@ +From a70db7595dac8a3b84d14a8dc62b4067cc152055 Mon Sep 17 00:00:00 2001 +From: David Woodhouse +Date: Tue, 2 Dec 2025 20:12:07 +0000 +Subject: [PATCH 4/4] ptp: ptp_vmclock: Add device tree support + +Add device tree support to the ptp_vmclock driver, allowing it to probe +via device tree in addition to ACPI. + +Handle optional interrupt for clock disruption notifications, mirroring +the ACPI notification behavior. 
+ +Signed-off-by: David Woodhouse +Signed-off-by: Babis Chalios +--- + drivers/ptp/Kconfig | 2 +- + drivers/ptp/ptp_vmclock.c | 83 ++++++++++++++++++++++++++++++++++++--- + 2 files changed, 78 insertions(+), 7 deletions(-) + +diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig +index 44bc88a0a772..8c1aad77d708 100644 +--- a/drivers/ptp/Kconfig ++++ b/drivers/ptp/Kconfig +@@ -121,7 +121,7 @@ config PTP_1588_CLOCK_KVM + config PTP_1588_CLOCK_VMCLOCK + tristate "Virtual machine PTP clock" + depends on X86_TSC || ARM_ARCH_TIMER +- depends on PTP_1588_CLOCK && ACPI && ARCH_SUPPORTS_INT128 ++ depends on PTP_1588_CLOCK && ARCH_SUPPORTS_INT128 + default y + help + This driver adds support for using a virtual precision clock +diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c +index 4673915c43e7..4b8c7fa4ea91 100644 +--- a/drivers/ptp/ptp_vmclock.c ++++ b/drivers/ptp/ptp_vmclock.c +@@ -14,10 +14,13 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include + #include ++#include + #include + #include + #include +@@ -453,6 +456,7 @@ static int vmclock_remove(struct platform_device *pdev) + return 0; + } + ++#ifdef CONFIG_ACPI + static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data) + { + struct vmclock_state *st = data; +@@ -490,7 +494,7 @@ vmclock_acpi_notification_handler(acpi_handle __always_unused handle, + wake_up_interruptible(&st->disrupt_wait); + } + +-static int vmclock_setup_notification(struct device *dev, struct vmclock_state *st) ++static int vmclock_setup_acpi_notification(struct device *dev) + { + struct acpi_device *adev = ACPI_COMPANION(dev); + acpi_status status; +@@ -503,10 +507,6 @@ static int vmclock_setup_notification(struct device *dev, struct vmclock_state * + if (!adev) + return -ENODEV; + +- /* The device does not support notifications. 
Nothing else to do */ +- if (!(st->clk->flags & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) +- return 0; +- + status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY, + vmclock_acpi_notification_handler, + dev); +@@ -540,6 +540,70 @@ static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) + + return 0; + } ++#else ++static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) ++{ ++ return -EINVAL; ++} ++ ++static int vmclock_setup_acpi_notification(struct device *dev) ++{ ++ return -EINVAL; ++} ++ ++#endif ++ ++static irqreturn_t vmclock_of_irq_handler(int __always_unused irq, void *dev) ++{ ++ struct device *device = dev; ++ struct vmclock_state *st = device->driver_data; ++ ++ wake_up_interruptible(&st->disrupt_wait); ++ return IRQ_HANDLED; ++} ++ ++static int vmclock_probe_dt(struct device *dev, struct vmclock_state *st) ++{ ++ struct platform_device *pdev = to_platform_device(dev); ++ struct resource *res; ++ ++ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); ++ if (!res) ++ return -ENODEV; ++ ++ st->res = *res; ++ ++ return 0; ++} ++ ++static int vmclock_setup_of_notification(struct device *dev) ++{ ++ struct platform_device *pdev = to_platform_device(dev); ++ int irq; ++ ++ irq = platform_get_irq(pdev, 0); ++ if (irq < 0) ++ return irq; ++ ++ return devm_request_irq(dev, irq, vmclock_of_irq_handler, IRQF_SHARED, ++ "vmclock", dev); ++} ++ ++static int vmclock_setup_notification(struct device *dev, ++ struct vmclock_state *st) ++{ ++ /* The device does not support notifications. 
Nothing else to do */ ++ if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) ++ return 0; ++ ++ if (has_acpi_companion(dev)) { ++ return vmclock_setup_acpi_notification(dev); ++ } else { ++ return vmclock_setup_of_notification(dev); ++ } ++ ++} ++ + + static void vmclock_put_idx(void *data) + { +@@ -561,7 +625,7 @@ static int vmclock_probe(struct platform_device *pdev) + if (has_acpi_companion(dev)) + ret = vmclock_probe_acpi(dev, st); + else +- ret = -EINVAL; /* Only ACPI for now */ ++ ret = vmclock_probe_dt(dev, st); + + if (ret) { + dev_info(dev, "Failed to obtain physical address: %d\n", ret); +@@ -673,12 +737,19 @@ static const struct acpi_device_id vmclock_acpi_ids[] = { + }; + MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids); + ++static const struct of_device_id vmclock_of_ids[] = { ++ { .compatible = "amazon,vmclock", }, ++ { }, ++}; ++MODULE_DEVICE_TABLE(of, vmclock_of_ids); ++ + static struct platform_driver vmclock_platform_driver = { + .probe = vmclock_probe, + .remove = vmclock_remove, + .driver = { + .name = "vmclock", + .acpi_match_table = vmclock_acpi_ids, ++ .of_match_table = vmclock_of_ids, + }, + }; + +-- +2.34.1 + diff --git a/resources/patches/vmclock/6.1/0001-ptp-vmclock-add-vm-generation-counter.patch b/resources/patches/vmclock/6.1/0001-ptp-vmclock-add-vm-generation-counter.patch new file mode 100644 index 00000000000..28588e1c924 --- /dev/null +++ b/resources/patches/vmclock/6.1/0001-ptp-vmclock-add-vm-generation-counter.patch @@ -0,0 +1,60 @@ +From a46562c571c6d50e7afc3994b33d0ffb61ff7409 Mon Sep 17 00:00:00 2001 +From: Babis Chalios +Date: Tue, 2 Dec 2025 20:11:32 +0000 +Subject: [PATCH 1/4] ptp: vmclock: add vm generation counter + +Similar to live migration, loading a VM from some saved state (aka +snapshot) is also an event that calls for clock adjustments in the +guest. However, guests might want to take more actions as a response to +such events, e.g. 
discarding UUIDs, resetting network connections, +reseeding entropy pools, etc. These are actions that guests don't +typically take during live migration, so add a new field in the +vmclock_abi called vm_generation_counter which informs the guest about +such events. + +Hypervisor advertises support for vm_generation_counter through the +VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT flag. Users need to check the +presence of this bit in vmclock_abi flags field before using this field. + +Signed-off-by: Babis Chalios +Reviewed-by: David Woodhouse +--- + include/uapi/linux/vmclock-abi.h | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h +index d7ca44313bf8..75deb6ae2b27 100644 +--- a/include/uapi/linux/vmclock-abi.h ++++ b/include/uapi/linux/vmclock-abi.h +@@ -119,6 +119,12 @@ struct vmclock_abi { + * bit again after the update, using the about-to-be-valid fields. + */ + #define VMCLOCK_FLAG_TIME_MONOTONIC (1 << 7) ++ /* ++ * If the VM_GEN_COUNTER_PRESENT flag is set, the hypervisor will ++ * bump the vm_generation_counter field every time the guest is ++ * loaded from some save state (restored from a snapshot). ++ */ ++#define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT (1 << 8) + + uint8_t pad[2]; + uint8_t clock_status; +@@ -183,6 +189,15 @@ struct vmclock_abi { + uint64_t time_frac_sec; /* (seconds >> 64) */ + uint64_t time_esterror_picosec; /* (± picoseconds) */ + uint64_t time_maxerror_picosec; /* (± picoseconds) */ ++ ++ /* ++ * This field changes to another non-repeating value when the guest ++ * has been loaded from a snapshot. In addition to handling a ++ * disruption in time (which will also be signalled through the ++ * disruption_marker field), a guest may wish to discard UUIDs, ++ * reset network connections, reseed entropy, etc. 
++ */ ++ uint64_t vm_generation_counter; + }; + + #endif /* __VMCLOCK_ABI_H__ */ +-- +2.34.1 + diff --git a/resources/patches/vmclock/6.1/0002-ptp-vmclock-support-device-notifications.patch b/resources/patches/vmclock/6.1/0002-ptp-vmclock-support-device-notifications.patch new file mode 100644 index 00000000000..f9cde8c7242 --- /dev/null +++ b/resources/patches/vmclock/6.1/0002-ptp-vmclock-support-device-notifications.patch @@ -0,0 +1,257 @@ +From d0a6bf47dd6cd2a9ed17dbdc32dd34a6ba0f5b5f Mon Sep 17 00:00:00 2001 +From: Babis Chalios +Date: Tue, 2 Dec 2025 20:11:44 +0000 +Subject: [PATCH 2/4] ptp: vmclock: support device notifications + +Add optional support for device notifications in VMClock. When +supported, the hypervisor will send a device notification every time it +updates the seq_count to a new even value. + +Moreover, add support for poll() in VMClock as a means to propagate this +notification to user space. poll() will return a POLLIN event to +listeners every time seq_count changes to a value different than the one +last seen (since open() or last read()/pread()). This means that when +poll() returns a POLLIN event, listeners need to use read() to observe +what has changed and update the reader's view of seq_count. In other +words, after poll() has returned, all subsequent calls to poll() will +immediately return with a POLLIN event until the listener calls read(). + +The device advertises support for the notification mechanism by setting +flag VMCLOCK_FLAG_NOTIFICATION_PRESENT in vmclock_abi flags field. If +the flag is not present the driver won't set up the ACPI notification +handler and poll() will always immediately return POLLHUP. 
+ +Signed-off-by: Babis Chalios +--- + drivers/ptp/ptp_vmclock.c | 130 ++++++++++++++++++++++++++++--- + include/uapi/linux/vmclock-abi.h | 5 ++ + 2 files changed, 126 insertions(+), 9 deletions(-) + +diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c +index 1ce69eada4b2..4673915c43e7 100644 +--- a/drivers/ptp/ptp_vmclock.c ++++ b/drivers/ptp/ptp_vmclock.c +@@ -5,6 +5,9 @@ + * Copyright © 2024 Amazon.com, Inc. or its affiliates. + */ + ++#include "linux/poll.h" ++#include "linux/types.h" ++#include "linux/wait.h" + #include + #include + #include +@@ -37,6 +40,7 @@ struct vmclock_state { + struct resource res; + struct vmclock_abi *clk; + struct miscdevice miscdev; ++ wait_queue_head_t disrupt_wait; + struct ptp_clock_info ptp_clock_info; + struct ptp_clock *ptp_clock; + enum clocksource_ids cs_id, sys_cs_id; +@@ -311,10 +315,15 @@ static const struct ptp_clock_info ptp_vmclock_info = { + .getcrosststamp = ptp_vmclock_getcrosststamp, + }; + ++struct vmclock_file_state { ++ struct vmclock_state *st; ++ atomic_t seq; ++}; ++ + static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma) + { +- struct vmclock_state *st = container_of(fp->private_data, +- struct vmclock_state, miscdev); ++ struct vmclock_file_state *fst = fp->private_data; ++ struct vmclock_state *st = fst->st; + + if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ) + return -EROFS; +@@ -333,11 +342,12 @@ static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma) + static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, + size_t count, loff_t *ppos) + { +- struct vmclock_state *st = container_of(fp->private_data, +- struct vmclock_state, miscdev); ++ struct vmclock_file_state *fst = fp->private_data; ++ struct vmclock_state *st = fst->st; ++ + ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT); + size_t max_count; +- int32_t seq; ++ int32_t seq, old_seq; + + if (*ppos >= PAGE_SIZE) + return 0; +@@ -346,6 +356,7 @@ static ssize_t 
vmclock_miscdev_read(struct file *fp, char __user *buf, + if (count > max_count) + count = max_count; + ++ old_seq = atomic_read(&fst->seq); + while (1) { + seq = st->clk->seq_count & ~1ULL; + virt_rmb(); +@@ -354,8 +365,16 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, + return -EFAULT; + + virt_rmb(); +- if (seq == st->clk->seq_count) +- break; ++ if (seq == st->clk->seq_count) { ++ /* ++ * Either we updated fst->seq to seq (the latest version we observed) ++ * or someone else did (old_seq == seq), so we can break. ++ */ ++ if (atomic_try_cmpxchg(&fst->seq, &old_seq, seq) || ++ old_seq == seq) { ++ break; ++ } ++ } + + if (ktime_after(ktime_get(), deadline)) + return -ETIMEDOUT; +@@ -365,9 +384,57 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, + return count; + } + ++static __poll_t vmclock_miscdev_poll(struct file *fp, poll_table *wait) ++{ ++ struct vmclock_file_state *fst = fp->private_data; ++ struct vmclock_state *st = fst->st; ++ uint32_t seq; ++ ++ /* ++ * Hypervisor will not send us any notifications, so fail immediately ++ * to avoid having caller sleeping for ever. 
++ */ ++ if (!(st->clk->flags & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) ++ return POLLHUP; ++ ++ poll_wait(fp, &st->disrupt_wait, wait); ++ ++ seq = st->clk->seq_count; ++ if (atomic_read(&fst->seq) != seq) ++ return POLLIN | POLLRDNORM; ++ ++ return 0; ++} ++ ++static int vmclock_miscdev_open(struct inode *inode, struct file *fp) ++{ ++ struct vmclock_state *st = container_of(fp->private_data, ++ struct vmclock_state, miscdev); ++ struct vmclock_file_state *fst = kzalloc(sizeof(*fst), GFP_KERNEL); ++ ++ if (!fst) ++ return -ENOMEM; ++ ++ fst->st = st; ++ atomic_set(&fst->seq, 0); ++ ++ fp->private_data = fst; ++ ++ return 0; ++} ++ ++static int vmclock_miscdev_release(struct inode *inode, struct file *fp) ++{ ++ kfree(fp->private_data); ++ return 0; ++} ++ + static const struct file_operations vmclock_miscdev_fops = { +- .mmap = vmclock_miscdev_mmap, +- .read = vmclock_miscdev_read, ++ .open = vmclock_miscdev_open, ++ .release = vmclock_miscdev_release, ++ .mmap = vmclock_miscdev_mmap, ++ .read = vmclock_miscdev_read, ++ .poll = vmclock_miscdev_poll, + }; + + /* module operations */ +@@ -413,6 +480,44 @@ static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data + return AE_ERROR; + } + ++static void ++vmclock_acpi_notification_handler(acpi_handle __always_unused handle, ++ u32 __always_unused event, void *dev) ++{ ++ struct device *device = dev; ++ struct vmclock_state *st = device->driver_data; ++ ++ wake_up_interruptible(&st->disrupt_wait); ++} ++ ++static int vmclock_setup_notification(struct device *dev, struct vmclock_state *st) ++{ ++ struct acpi_device *adev = ACPI_COMPANION(dev); ++ acpi_status status; ++ ++ /* ++ * This should never happen as this function is only called when ++ * has_acpi_companion(dev) is true, but the logic is sufficiently ++ * complex that Coverity can't see the tautology. ++ */ ++ if (!adev) ++ return -ENODEV; ++ ++ /* The device does not support notifications. 
Nothing else to do */ ++ if (!(st->clk->flags & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) ++ return 0; ++ ++ status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY, ++ vmclock_acpi_notification_handler, ++ dev); ++ if (ACPI_FAILURE(status)) { ++ dev_err(dev, "failed to install notification handler"); ++ return -ENODEV; ++ } ++ ++ return 0; ++} ++ + static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) + { + struct acpi_device *adev = ACPI_COMPANION(dev); +@@ -495,6 +600,11 @@ static int vmclock_probe(struct platform_device *pdev) + goto out; + } + ++ init_waitqueue_head(&st->disrupt_wait); ++ ret = vmclock_setup_notification(dev, st); ++ if (ret) ++ return ret; ++ + /* If the structure is big enough, it can be mapped to userspace */ + if (st->clk->size >= PAGE_SIZE) { + st->miscdev.minor = MISC_DYNAMIC_MINOR; +@@ -544,6 +654,8 @@ static int vmclock_probe(struct platform_device *pdev) + goto out; + } + ++ dev->driver_data = st; ++ + dev_info(dev, "%s: registered %s%s%s\n", st->name, + st->miscdev.minor ? "miscdev" : "", + (st->miscdev.minor && st->ptp_clock) ? ", " : "", +diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h +index 75deb6ae2b27..4b7cd2b8532c 100644 +--- a/include/uapi/linux/vmclock-abi.h ++++ b/include/uapi/linux/vmclock-abi.h +@@ -125,6 +125,11 @@ struct vmclock_abi { + * loaded from some save state (restored from a snapshot). + */ + #define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT (1 << 8) ++ /* ++ * If the NOTIFICATION_PRESENT flag is set, the hypervisor will send ++ * a notification every time it updates seq_count to a new even number. 
++ */ ++#define VMCLOCK_FLAG_NOTIFICATION_PRESENT (1 << 9) + + uint8_t pad[2]; + uint8_t clock_status; +-- +2.34.1 + diff --git a/resources/patches/vmclock/6.1/0003-dt-bindings-ptp-Add-amazon-vmclock.patch b/resources/patches/vmclock/6.1/0003-dt-bindings-ptp-Add-amazon-vmclock.patch new file mode 100644 index 00000000000..67fea022740 --- /dev/null +++ b/resources/patches/vmclock/6.1/0003-dt-bindings-ptp-Add-amazon-vmclock.patch @@ -0,0 +1,76 @@ +From d594b01069fb6fabb068379b59bd26e59dbd6661 Mon Sep 17 00:00:00 2001 +From: David Woodhouse +Date: Tue, 2 Dec 2025 20:11:55 +0000 +Subject: [PATCH 3/4] dt-bindings: ptp: Add amazon,vmclock + +The vmclock device provides a PTP clock source and precise timekeeping +across live migration and snapshot/restore operations. + +The binding has a required memory region containing the vmclock_abi +structure and an optional interrupt for clock disruption notifications. + +The full specification is at https://david.woodhou.se/VMClock.pdf + +Signed-off-by: David Woodhouse +Signed-off-by: Babis Chalios +Reviewed-by: Krzysztof Kozlowski +--- + .../bindings/ptp/amazon,vmclock.yaml | 46 +++++++++++++++++++ + 1 file changed, 46 insertions(+) + create mode 100644 Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml + +diff --git a/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml b/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml +new file mode 100644 +index 000000000000..b98fee20ce5f +--- /dev/null ++++ b/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml +@@ -0,0 +1,46 @@ ++# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) ++%YAML 1.2 ++--- ++$id: http://devicetree.org/schemas/ptp/amazon,vmclock.yaml# ++$schema: http://devicetree.org/meta-schemas/core.yaml# ++ ++title: Virtual Machine Clock ++ ++maintainers: ++ - David Woodhouse ++ ++description: ++ The vmclock device provides a precise clock source and allows for ++ accurate timekeeping across live migration and snapshot/restore ++ operations. 
The full specification of the shared data structure ++ is available at https://david.woodhou.se/VMClock.pdf ++ ++properties: ++ compatible: ++ const: amazon,vmclock ++ ++ reg: ++ description: ++ Specifies the shared memory region containing the vmclock_abi structure. ++ maxItems: 1 ++ ++ interrupts: ++ description: ++ Interrupt used to notify when the contents of the vmclock_abi structure ++ have been updated. ++ maxItems: 1 ++ ++required: ++ - compatible ++ - reg ++ ++additionalProperties: false ++ ++examples: ++ - | ++ #include ++ ptp@80000000 { ++ compatible = "amazon,vmclock"; ++ reg = <0x80000000 0x1000>; ++ interrupts = ; ++ }; +-- +2.34.1 + diff --git a/resources/patches/vmclock/6.1/0004-ptp-ptp_vmclock-Add-device-tree-support.patch b/resources/patches/vmclock/6.1/0004-ptp-ptp_vmclock-Add-device-tree-support.patch new file mode 100644 index 00000000000..e7b4fbf568d --- /dev/null +++ b/resources/patches/vmclock/6.1/0004-ptp-ptp_vmclock-Add-device-tree-support.patch @@ -0,0 +1,180 @@ +From a70db7595dac8a3b84d14a8dc62b4067cc152055 Mon Sep 17 00:00:00 2001 +From: David Woodhouse +Date: Tue, 2 Dec 2025 20:12:07 +0000 +Subject: [PATCH 4/4] ptp: ptp_vmclock: Add device tree support + +Add device tree support to the ptp_vmclock driver, allowing it to probe +via device tree in addition to ACPI. + +Handle optional interrupt for clock disruption notifications, mirroring +the ACPI notification behavior. 
+ +Signed-off-by: David Woodhouse +Signed-off-by: Babis Chalios +--- + drivers/ptp/Kconfig | 2 +- + drivers/ptp/ptp_vmclock.c | 83 ++++++++++++++++++++++++++++++++++++--- + 2 files changed, 78 insertions(+), 7 deletions(-) + +diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig +index 44bc88a0a772..8c1aad77d708 100644 +--- a/drivers/ptp/Kconfig ++++ b/drivers/ptp/Kconfig +@@ -121,7 +121,7 @@ config PTP_1588_CLOCK_KVM + config PTP_1588_CLOCK_VMCLOCK + tristate "Virtual machine PTP clock" + depends on X86_TSC || ARM_ARCH_TIMER +- depends on PTP_1588_CLOCK && ACPI && ARCH_SUPPORTS_INT128 ++ depends on PTP_1588_CLOCK && ARCH_SUPPORTS_INT128 + default y + help + This driver adds support for using a virtual precision clock +diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c +index 4673915c43e7..4b8c7fa4ea91 100644 +--- a/drivers/ptp/ptp_vmclock.c ++++ b/drivers/ptp/ptp_vmclock.c +@@ -14,10 +14,13 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include + #include ++#include + #include + #include + #include +@@ -453,6 +456,7 @@ static int vmclock_remove(struct platform_device *pdev) + return 0; + } + ++#ifdef CONFIG_ACPI + static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data) + { + struct vmclock_state *st = data; +@@ -490,7 +494,7 @@ vmclock_acpi_notification_handler(acpi_handle __always_unused handle, + wake_up_interruptible(&st->disrupt_wait); + } + +-static int vmclock_setup_notification(struct device *dev, struct vmclock_state *st) ++static int vmclock_setup_acpi_notification(struct device *dev) + { + struct acpi_device *adev = ACPI_COMPANION(dev); + acpi_status status; +@@ -503,10 +507,6 @@ static int vmclock_setup_notification(struct device *dev, struct vmclock_state * + if (!adev) + return -ENODEV; + +- /* The device does not support notifications. 
Nothing else to do */ +- if (!(st->clk->flags & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) +- return 0; +- + status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY, + vmclock_acpi_notification_handler, + dev); +@@ -540,6 +540,70 @@ static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) + + return 0; + } ++#else ++static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) ++{ ++ return -EINVAL; ++} ++ ++static int vmclock_setup_acpi_notification(struct device *dev) ++{ ++ return -EINVAL; ++} ++ ++#endif ++ ++static irqreturn_t vmclock_of_irq_handler(int __always_unused irq, void *dev) ++{ ++ struct device *device = dev; ++ struct vmclock_state *st = device->driver_data; ++ ++ wake_up_interruptible(&st->disrupt_wait); ++ return IRQ_HANDLED; ++} ++ ++static int vmclock_probe_dt(struct device *dev, struct vmclock_state *st) ++{ ++ struct platform_device *pdev = to_platform_device(dev); ++ struct resource *res; ++ ++ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); ++ if (!res) ++ return -ENODEV; ++ ++ st->res = *res; ++ ++ return 0; ++} ++ ++static int vmclock_setup_of_notification(struct device *dev) ++{ ++ struct platform_device *pdev = to_platform_device(dev); ++ int irq; ++ ++ irq = platform_get_irq(pdev, 0); ++ if (irq < 0) ++ return irq; ++ ++ return devm_request_irq(dev, irq, vmclock_of_irq_handler, IRQF_SHARED, ++ "vmclock", dev); ++} ++ ++static int vmclock_setup_notification(struct device *dev, ++ struct vmclock_state *st) ++{ ++ /* The device does not support notifications. 
Nothing else to do */ ++ if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) ++ return 0; ++ ++ if (has_acpi_companion(dev)) { ++ return vmclock_setup_acpi_notification(dev); ++ } else { ++ return vmclock_setup_of_notification(dev); ++ } ++ ++} ++ + + static void vmclock_put_idx(void *data) + { +@@ -561,7 +625,7 @@ static int vmclock_probe(struct platform_device *pdev) + if (has_acpi_companion(dev)) + ret = vmclock_probe_acpi(dev, st); + else +- ret = -EINVAL; /* Only ACPI for now */ ++ ret = vmclock_probe_dt(dev, st); + + if (ret) { + dev_info(dev, "Failed to obtain physical address: %d\n", ret); +@@ -673,12 +737,19 @@ static const struct acpi_device_id vmclock_acpi_ids[] = { + }; + MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids); + ++static const struct of_device_id vmclock_of_ids[] = { ++ { .compatible = "amazon,vmclock", }, ++ { }, ++}; ++MODULE_DEVICE_TABLE(of, vmclock_of_ids); ++ + static struct platform_driver vmclock_platform_driver = { + .probe = vmclock_probe, + .remove = vmclock_remove, + .driver = { + .name = "vmclock", + .acpi_match_table = vmclock_acpi_ids, ++ .of_match_table = vmclock_of_ids, + }, + }; + +-- +2.34.1 + diff --git a/resources/rebuild.sh b/resources/rebuild.sh index 505afd555d1..235055e6874 100755 --- a/resources/rebuild.sh +++ b/resources/rebuild.sh @@ -167,6 +167,12 @@ function build_al_kernel { git checkout $(get_tag $KERNEL_VERSION) + # Apply any patchset we have for our kernels + for patchset in ../patches/*; do + echo "Applying patchset ${patchset}/${KERNEL_VERSION}" + git apply ${patchset}/${KERNEL_VERSION}/*.patch + done + arch=$(uname -m) if [ "$arch" = "x86_64" ]; then format="elf" @@ -194,6 +200,11 @@ function build_al_kernel { cp -v $binary_path $OUTPUT_FILE cp -v .config $OUTPUT_FILE.config + # Undo any patches previously applied, so that we can build the same kernel with different + # configs, e.g.
no-acpi + git reset --hard $(get_tag $KERNEL_VERSION) + git clean -f -d + popd &>/dev/null } diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 949435e6c83..5f98431b1d9 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -20,6 +20,7 @@ use crate::arch::{ use crate::device_manager::DeviceManager; use crate::device_manager::mmio::MMIODeviceInfo; use crate::device_manager::pci_mngr::PciDevices; +use crate::devices::acpi::vmclock::{VMCLOCK_SIZE, VmClock}; use crate::devices::acpi::vmgenid::{VMGENID_MEM_SIZE, VmGenId}; use crate::initrd::InitrdConfig; use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap, GuestRegionType}; @@ -97,6 +98,7 @@ pub fn create_fdt( create_psci_node(&mut fdt_writer)?; create_devices_node(&mut fdt_writer, device_manager)?; create_vmgenid_node(&mut fdt_writer, &device_manager.acpi_devices.vmgenid)?; + create_vmclock_node(&mut fdt_writer, &device_manager.acpi_devices.vmclock)?; create_pci_nodes(&mut fdt_writer, &device_manager.pci_devices)?; // End Header node. 
@@ -287,6 +289,18 @@ fn create_vmgenid_node(fdt: &mut FdtWriter, vmgenid: &VmGenId) -> Result<(), Fdt Ok(()) } +fn create_vmclock_node(fdt: &mut FdtWriter, vmclock: &VmClock) -> Result<(), FdtError> { + let vmclock_node = fdt.begin_node(&format!("ptp@{}", vmclock.guest_address.0))?; + fdt.property_string("compatible", "amazon,vmclock")?; + fdt.property_array_u64("reg", &[vmclock.guest_address.0, VMCLOCK_SIZE as u64])?; + fdt.property_array_u32( + "interrupts", + &[GIC_FDT_IRQ_TYPE_SPI, vmclock.gsi, IRQ_TYPE_EDGE_RISING], + )?; + fdt.end_node(vmclock_node)?; + Ok(()) +} + fn create_gic_node(fdt: &mut FdtWriter, gic_device: &GICDevice) -> Result<(), FdtError> { let interrupt = fdt.begin_node("intc")?; fdt.property_string("compatible", gic_device.fdt_compatibility())?; diff --git a/src/vmm/src/arch/aarch64/output_GICv3.dtb b/src/vmm/src/arch/aarch64/output_GICv3.dtb index 979cd68a285..d1004096059 100644 Binary files a/src/vmm/src/arch/aarch64/output_GICv3.dtb and b/src/vmm/src/arch/aarch64/output_GICv3.dtb differ diff --git a/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb b/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb index 63ab6765036..9477bb72d17 100644 Binary files a/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb and b/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb differ diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 332b1ac3cc3..c111576fe2a 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -287,7 +287,6 @@ pub fn build_microvm_for_boot( )?; device_manager.attach_vmgenid_device(&vm)?; - #[cfg(target_arch = "x86_64")] device_manager.attach_vmclock_device(&vm)?; #[cfg(target_arch = "aarch64")] diff --git a/src/vmm/src/device_manager/acpi.rs b/src/vmm/src/device_manager/acpi.rs index 9764143b5a9..3a27a480af1 100644 --- a/src/vmm/src/device_manager/acpi.rs +++ b/src/vmm/src/device_manager/acpi.rs @@ -1,11 +1,11 @@ // Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +#[cfg(target_arch = "x86_64")] use acpi_tables::{Aml, aml}; use vm_memory::GuestMemoryError; use crate::Vm; -#[cfg(target_arch = "x86_64")] use crate::devices::acpi::vmclock::VmClock; use crate::devices::acpi::vmgenid::VmGenId; use crate::vstate::resources::ResourceAllocator; @@ -23,7 +23,6 @@ pub struct ACPIDeviceManager { /// VMGenID device pub vmgenid: VmGenId, /// VMclock device - #[cfg(target_arch = "x86_64")] pub vmclock: VmClock, } @@ -32,7 +31,6 @@ impl ACPIDeviceManager { pub fn new(resource_allocator: &mut ResourceAllocator) -> Self { ACPIDeviceManager { vmgenid: VmGenId::new(resource_allocator), - #[cfg(target_arch = "x86_64")] vmclock: VmClock::new(resource_allocator), } } @@ -43,13 +41,14 @@ impl ACPIDeviceManager { Ok(()) } - #[cfg(target_arch = "x86_64")] pub fn attach_vmclock(&self, vm: &Vm) -> Result<(), ACPIDeviceError> { + vm.register_irq(&self.vmclock.interrupt_evt, self.vmclock.gsi)?; self.vmclock.activate(vm.guest_memory())?; Ok(()) } } +#[cfg(target_arch = "x86_64")] impl Aml for ACPIDeviceManager { fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { // AML for [`VmGenId`] device. @@ -65,30 +64,41 @@ impl Aml for ACPIDeviceManager { &aml::Name::new("_HID".try_into()?, &"ACPI0013")?, &aml::Name::new( "_CRS".try_into()?, - &aml::ResourceTemplate::new(vec![&aml::Interrupt::new( - true, - true, - false, - false, - self.vmgenid.gsi, - )]), + &aml::ResourceTemplate::new(vec![ + &aml::Interrupt::new(true, true, false, false, self.vmgenid.gsi), + &aml::Interrupt::new(true, true, false, false, self.vmclock.gsi), + ]), )?, &aml::Method::new( "_EVT".try_into()?, 1, true, - vec![&aml::If::new( - // We know that the maximum IRQ number fits in a u8. 
We have up to - // 32 IRQs in x86 and up to 128 in - // ARM (look into - // `vmm::crate::arch::layout::GSI_LEGACY_END`) - #[allow(clippy::cast_possible_truncation)] - &aml::Equal::new(&aml::Arg(0), &(self.vmgenid.gsi as u8)), - vec![&aml::Notify::new( - &aml::Path::new("\\_SB_.VGEN")?, - &0x80usize, - )], - )], + vec![ + &aml::If::new( + // We know that the maximum IRQ number fits in a u8. We have up to + // 32 IRQs in x86 and up to 128 in + // ARM (look into + // `vmm::crate::arch::layout::GSI_LEGACY_END`) + #[allow(clippy::cast_possible_truncation)] + &aml::Equal::new(&aml::Arg(0), &(self.vmgenid.gsi as u8)), + vec![&aml::Notify::new( + &aml::Path::new("\\_SB_.VGEN")?, + &0x80usize, + )], + ), + &aml::If::new( + // We know that the maximum IRQ number fits in a u8. We have up to + // 32 IRQs in x86 and up to 128 in + // ARM (look into + // `vmm::crate::arch::layout::GSI_LEGACY_END`) + #[allow(clippy::cast_possible_truncation)] + &aml::Equal::new(&aml::Arg(0), &(self.vmclock.gsi as u8)), + vec![&aml::Notify::new( + &aml::Path::new("\\_SB_.VCLK")?, + &0x80usize, + )], + ), + ], ), ], ) diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index fc245e05539..1dc24d2feb3 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -237,7 +237,6 @@ impl DeviceManager { Ok(()) } - #[cfg(target_arch = "x86_64")] pub(crate) fn attach_vmclock_device(&mut self, vm: &Vm) -> Result<(), AttachDeviceError> { self.acpi_devices.attach_vmclock(vm)?; Ok(()) @@ -465,6 +464,9 @@ impl<'a> Persist<'a> for DeviceManager { // Restore ACPI devices let mut acpi_devices = ACPIDeviceManager::restore(constructor_args.vm, &state.acpi_state)?; acpi_devices.vmgenid.notify_guest()?; + acpi_devices + .vmclock + .post_load_update(constructor_args.vm.guest_memory()); // Restore PCI devices let pci_ctor_args = PciDevicesConstructorArgs { diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 
2a0393e57f2..7ca2a2bb81d 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -15,7 +15,6 @@ use super::mmio::*; #[cfg(target_arch = "aarch64")] use crate::arch::DeviceType; use crate::device_manager::acpi::ACPIDeviceError; -#[cfg(target_arch = "x86_64")] use crate::devices::acpi::vmclock::{VmClock, VmClockState}; use crate::devices::acpi::vmgenid::{VMGenIDState, VmGenId}; #[cfg(target_arch = "aarch64")] @@ -169,7 +168,6 @@ impl fmt::Debug for MMIODevManagerConstructorArgs<'_> { #[derive(Default, Debug, Clone, Serialize, Deserialize)] pub struct ACPIDeviceManagerState { vmgenid: VMGenIDState, - #[cfg(target_arch = "x86_64")] vmclock: VmClockState, } @@ -181,7 +179,6 @@ impl<'a> Persist<'a> for ACPIDeviceManager { fn save(&self) -> Self::State { ACPIDeviceManagerState { vmgenid: self.vmgenid.save(), - #[cfg(target_arch = "x86_64")] vmclock: self.vmclock.save(), } } @@ -191,10 +188,14 @@ impl<'a> Persist<'a> for ACPIDeviceManager { // Safe to unwrap() here, this will never return an error. vmgenid: VmGenId::restore((), &state.vmgenid).unwrap(), // Safe to unwrap() here, this will never return an error. 
- #[cfg(target_arch = "x86_64")] - vmclock: VmClock::restore(vm.guest_memory(), &state.vmclock).unwrap(), + vmclock: VmClock::restore((), &state.vmclock).unwrap(), }; + vm.register_irq( + &acpi_devices.vmclock.interrupt_evt, + acpi_devices.vmclock.gsi, + )?; + acpi_devices.attach_vmgenid(vm)?; Ok(acpi_devices) } diff --git a/src/vmm/src/devices/acpi/generated/vmclock_abi.rs b/src/vmm/src/devices/acpi/generated/vmclock_abi.rs index 134c8393f0c..80228ad848b 100644 --- a/src/vmm/src/devices/acpi/generated/vmclock_abi.rs +++ b/src/vmm/src/devices/acpi/generated/vmclock_abi.rs @@ -38,6 +38,8 @@ pub const VMCLOCK_FLAG_PERIOD_MAXERROR_VALID: u64 = 16; pub const VMCLOCK_FLAG_TIME_ESTERROR_VALID: u64 = 32; pub const VMCLOCK_FLAG_TIME_MAXERROR_VALID: u64 = 64; pub const VMCLOCK_FLAG_TIME_MONOTONIC: u64 = 128; +pub const VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT: u64 = 256; +pub const VMCLOCK_FLAG_NOTIFICATION_PRESENT: u64 = 512; pub const VMCLOCK_STATUS_UNKNOWN: u8 = 0; pub const VMCLOCK_STATUS_INITIALIZING: u8 = 1; pub const VMCLOCK_STATUS_SYNCHRONIZED: u8 = 2; @@ -153,10 +155,11 @@ pub struct vmclock_abi { pub time_frac_sec: __le64, pub time_esterror_nanosec: __le64, pub time_maxerror_nanosec: __le64, + pub vm_generation_counter: __le64, } #[allow(clippy::unnecessary_operation, clippy::identity_op)] const _: () = { - ["Size of vmclock_abi"][::std::mem::size_of::() - 104usize]; + ["Size of vmclock_abi"][::std::mem::size_of::() - 112usize]; ["Alignment of vmclock_abi"][::std::mem::align_of::() - 8usize]; ["Offset of field: vmclock_abi::magic"][::std::mem::offset_of!(vmclock_abi, magic) - 0usize]; ["Offset of field: vmclock_abi::size"][::std::mem::offset_of!(vmclock_abi, size) - 4usize]; @@ -198,4 +201,6 @@ const _: () = { [::std::mem::offset_of!(vmclock_abi, time_esterror_nanosec) - 88usize]; ["Offset of field: vmclock_abi::time_maxerror_nanosec"] [::std::mem::offset_of!(vmclock_abi, time_maxerror_nanosec) - 96usize]; + ["Offset of field: vmclock_abi::vm_generation_counter"] + 
[::std::mem::offset_of!(vmclock_abi, vm_generation_counter) - 104usize]; }; diff --git a/src/vmm/src/devices/acpi/vmclock.rs b/src/vmm/src/devices/acpi/vmclock.rs index d7882a78ded..94e8c4563a4 100644 --- a/src/vmm/src/devices/acpi/vmclock.rs +++ b/src/vmm/src/devices/acpi/vmclock.rs @@ -6,14 +6,19 @@ use std::mem::offset_of; use std::sync::atomic::{Ordering, fence}; use acpi_tables::{Aml, aml}; -use log::error; +use log::{debug, error}; use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; use vm_memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemoryError}; +use vm_superio::Trigger; +use vmm_sys_util::eventfd::EventFd; +use crate::Vm; use crate::devices::acpi::generated::vmclock_abi::{ - VMCLOCK_COUNTER_INVALID, VMCLOCK_MAGIC, VMCLOCK_STATUS_UNKNOWN, vmclock_abi, + VMCLOCK_COUNTER_INVALID, VMCLOCK_FLAG_NOTIFICATION_PRESENT, + VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT, VMCLOCK_MAGIC, VMCLOCK_STATUS_UNKNOWN, vmclock_abi, }; +use crate::devices::legacy::EventFdTrigger; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; use crate::vstate::resources::ResourceAllocator; @@ -22,7 +27,7 @@ use crate::vstate::resources::ResourceAllocator; unsafe impl ByteValued for vmclock_abi {} // We are reserving a physical page to expose the [`VmClock`] data -const VMCLOCK_SIZE: u32 = 0x1000; +pub const VMCLOCK_SIZE: u32 = 0x1000; // Write a value in `vmclock_abi` both in the Firecracker-managed state // and inside guest memory address that corresponds to it. @@ -47,6 +52,10 @@ macro_rules! write_vmclock_field { pub struct VmClock { /// Guest address in which we will write the VMclock struct pub guest_address: GuestAddress, + /// Interrupt line for notifying the device about changes + pub interrupt_evt: EventFdTrigger, + /// GSI number allocated for the device. 
+ pub gsi: u32, /// The [`VmClock`] state we are exposing to the guest inner: vmclock_abi, } @@ -62,17 +71,33 @@ impl VmClock { ) .expect("vmclock: could not allocate guest memory for device"); + let gsi = resource_allocator + .allocate_gsi_legacy(1) + .inspect_err(|err| error!("vmclock: Could not allocate GSI for VMClock: {err}")) + .unwrap()[0]; + + let interrupt_evt = EventFdTrigger::new( + EventFd::new(libc::EFD_NONBLOCK) + .inspect_err(|err| { + error!("vmclock: Could not create EventFd for VMClock device: {err}") + }) + .unwrap(), + ); + let mut inner = vmclock_abi { magic: VMCLOCK_MAGIC, size: VMCLOCK_SIZE, version: 1, clock_status: VMCLOCK_STATUS_UNKNOWN, counter_id: VMCLOCK_COUNTER_INVALID, + flags: VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT | VMCLOCK_FLAG_NOTIFICATION_PRESENT, ..Default::default() }; VmClock { guest_address: GuestAddress(addr), + interrupt_evt, + gsi, inner, } } @@ -98,11 +123,23 @@ impl VmClock { self.inner.disruption_marker.wrapping_add(1) ); - // This fence ensures guest sees the `disruption_marker` update. It is matched to a - // read barrier in the guest. + write_vmclock_field!( + self, + mem, + vm_generation_counter, + self.inner.vm_generation_counter.wrapping_add(1) + ); + + // This fence ensures guest sees the `disruption_marker` and `vm_generation_counter` + // updates. It is matched to a read barrier in the guest. 
fence(Ordering::Release); write_vmclock_field!(self, mem, seq_count, self.inner.seq_count.wrapping_add(1)); + self.interrupt_evt + .trigger() + .inspect_err(|err| error!("vmclock: could not send guest notification: {err}")) + .unwrap(); + debug!("vmclock: notifying guest about VMClock updates"); } } @@ -113,31 +150,39 @@ impl VmClock { pub struct VmClockState { /// Guest address in which we write the [`VmClock`] info pub guest_address: u64, + /// GSI used for notifying the guest about device changes + pub gsi: u32, /// Data we expose to the guest pub inner: vmclock_abi, } impl<'a> Persist<'a> for VmClock { type State = VmClockState; - type ConstructorArgs = &'a GuestMemoryMmap; + type ConstructorArgs = (); type Error = Infallible; fn save(&self) -> Self::State { VmClockState { guest_address: self.guest_address.0, + gsi: self.gsi, inner: self.inner, } } - fn restore( - constructor_args: Self::ConstructorArgs, - state: &Self::State, - ) -> Result { + fn restore(vm: Self::ConstructorArgs, state: &Self::State) -> Result { + let interrupt_evt = EventFdTrigger::new( + EventFd::new(libc::EFD_NONBLOCK) + .inspect_err(|err| { + error!("vmclock: Could not create EventFd for VMClock device: {err}") + }) + .unwrap(), + ); let mut vmclock = VmClock { guest_address: GuestAddress(state.guest_address), + interrupt_evt, + gsi: state.gsi, inner: state.inner, }; - vmclock.post_load_update(constructor_args); Ok(vmclock) } } @@ -174,14 +219,20 @@ impl Aml for VmClock { #[cfg(test)] mod tests { use vm_memory::{Bytes, GuestAddress}; + use vmm_sys_util::tempfile::TempFile; - use crate::arch; + use crate::Vm; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::layout; + use crate::arch::{self, Kvm}; use crate::devices::acpi::generated::vmclock_abi::vmclock_abi; use crate::devices::acpi::vmclock::{VMCLOCK_SIZE, VmClock}; - use crate::snapshot::Persist; + use crate::devices::virtio::test_utils::default_mem; + use crate::snapshot::{Persist, Snapshot}; use 
crate::test_utils::single_region_mem; use crate::utils::u64_to_usize; use crate::vstate::resources::ResourceAllocator; + use crate::vstate::vm::tests::setup_vm_with_memory; // We are allocating memory from the end of the system memory portion const VMCLOCK_TEST_GUEST_ADDR: GuestAddress = @@ -211,15 +262,17 @@ mod tests { #[test] fn test_device_save_restore() { let vmclock = default_vmclock(); + // We're using memory inside the system memory portion of the guest RAM. So we need a + // memory region that includes it. let mem = single_region_mem( u64_to_usize(arch::SYSTEM_MEM_START) + u64_to_usize(arch::SYSTEM_MEM_SIZE), ); vmclock.activate(&mem).unwrap(); - let guest_data: vmclock_abi = mem.read_obj(VMCLOCK_TEST_GUEST_ADDR).unwrap(); let state = vmclock.save(); - let vmclock_new = VmClock::restore(&mem, &state).unwrap(); + let mut vmclock_new = VmClock::restore((), &state).unwrap(); + vmclock_new.post_load_update(&mem); let guest_data_new: vmclock_abi = mem.read_obj(VMCLOCK_TEST_GUEST_ADDR).unwrap(); assert_ne!(guest_data_new, vmclock.inner); @@ -228,5 +281,9 @@ mod tests { vmclock.inner.disruption_marker + 1, vmclock_new.inner.disruption_marker ); + assert_eq!( + vmclock.inner.vm_generation_counter + 1, + vmclock_new.inner.vm_generation_counter + ); } } diff --git a/tests/host_tools/vmclock-abi.h b/tests/host_tools/vmclock-abi.h index 2d99b29ac44..5c707e263cb 100644 --- a/tests/host_tools/vmclock-abi.h +++ b/tests/host_tools/vmclock-abi.h @@ -115,6 +115,17 @@ struct vmclock_abi { * bit again after the update, using the about-to-be-valid fields. */ #define VMCLOCK_FLAG_TIME_MONOTONIC (1 << 7) + /* + * If the VM_GEN_COUNTER_PRESENT flag is set, the hypervisor will + * bump the vm_generation_counter field every time the guest is + * loaded from some save state (restored from a snapshot). 
+ */ +#define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT (1 << 8) + /* + * If the NOTIFICATION_PRESENT flag is set, the hypervisor will send + * a notification every time it updates seq_count to a new even number. + */ +#define VMCLOCK_FLAG_NOTIFICATION_PRESENT (1 << 9) __u8 pad[2]; __u8 clock_status; @@ -177,6 +188,19 @@ struct vmclock_abi { __le64 time_frac_sec; /* Units of 1/2^64 of a second */ __le64 time_esterror_nanosec; __le64 time_maxerror_nanosec; + + /* + * This field changes to another non-repeating value when the VM + * is loaded from a snapshot. This event, typically, represents a + * "jump" forward in time. As a result, in this case as well, the + * guest needs to discard any calibration against external sources. + * Loading a snapshot in a VM has different semantics than other VM + * events such as live migration, i.e. apart from re-adjusting guest + * clocks a guest user space might want to discard UUIDs, reset + * network connections or reseed entropy, etc. As a result, we + * use a dedicated marker for such events. + */ + __le64 vm_generation_counter; }; #endif /* __VMCLOCK_ABI_H__ */ diff --git a/tests/host_tools/vmclock.c b/tests/host_tools/vmclock.c index d69304ac87c..b27d0acdc29 100644 --- a/tests/host_tools/vmclock.c +++ b/tests/host_tools/vmclock.c @@ -1,12 +1,12 @@ // Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 -#include #include #include #include #include #include +#include #include #include #include @@ -16,23 +16,26 @@ const char *VMCLOCK_DEV_PATH = "/dev/vmclock0"; -int get_vmclock_handle(struct vmclock_abi **vmclock) +int open_vmclock(void) { int fd = open(VMCLOCK_DEV_PATH, 0); - if (fd == -1) - goto out_err; + if (fd == -1) { + perror("open"); + exit(1); + } - void *ptr = mmap(NULL, sizeof(struct vmclock_abi), PROT_READ, MAP_SHARED, fd, 0); - if (ptr == MAP_FAILED) - goto out_err_mmap; + return fd; +} - *vmclock = ptr; - return 0; +struct vmclock_abi *get_vmclock_handle(int fd) +{ + void *ptr = mmap(NULL, sizeof(struct vmclock_abi), PROT_READ, MAP_SHARED, fd, 0); + if (ptr == MAP_FAILED) { + perror("mmap"); + exit(1); + } -out_err_mmap: - close(fd); -out_err: - return errno; + return ptr; } #define READ_VMCLOCK_FIELD_FN(type, field) \ @@ -56,16 +59,43 @@ type read##_##field (struct vmclock_abi *vmclock) { \ } READ_VMCLOCK_FIELD_FN(uint64_t, disruption_marker); +READ_VMCLOCK_FIELD_FN(uint64_t, vm_generation_counter); -int main() +/* + * Read `vmclock_abi` structure using a file descriptor pointing to + * `/dev/vmclock0`. + */ +void read_vmclock(int fd, struct vmclock_abi *vmclock) { - struct vmclock_abi *vmclock; + int ret; - int err = get_vmclock_handle(&vmclock); - if (err) { - printf("Could not mmap vmclock struct: %s\n", strerror(err)); + /* + * Use `pread()`, since the device doesn't implement lseek(), so + * we can't reset `fp`. + */ + ret = pread(fd, vmclock, sizeof(*vmclock), 0); + if (ret < 0) { + perror("read"); + exit(1); + } else if (ret < (int) sizeof(*vmclock)) { + fprintf(stderr, "We don't handle partial writes (%d). 
Exiting!\n", ret); exit(1); } +} + +void print_vmclock(struct vmclock_abi *vmclock) +{ + if (vmclock->flags & VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT) { + printf("VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT: true\n"); + } else { + printf("VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT: false\n"); + } + + if (vmclock->flags & VMCLOCK_FLAG_NOTIFICATION_PRESENT) { + printf("VMCLOCK_FLAG_NOTIFICATION_PRESENT: true\n"); + } else { + printf("VMCLOCK_FLAG_NOTIFICATION_PRESENT: false\n"); + } printf("VMCLOCK_MAGIC: 0x%x\n", vmclock->magic); printf("VMCLOCK_SIZE: 0x%x\n", vmclock->size); @@ -73,6 +103,92 @@ int main() printf("VMCLOCK_CLOCK_STATUS: %u\n", vmclock->clock_status); printf("VMCLOCK_COUNTER_ID: %u\n", vmclock->counter_id); printf("VMCLOCK_DISRUPTION_MARKER: %lu\n", read_disruption_marker(vmclock)); + printf("VMCLOCK_VM_GENERATION_COUNTER: %lu\n", read_vm_generation_counter(vmclock)); + fflush(stdout); +} + +void run_poll(int fd) +{ + struct vmclock_abi vmclock; + int epfd, ret, nfds; + struct epoll_event ev; + + read_vmclock(fd, &vmclock); + print_vmclock(&vmclock); + + epfd = epoll_create(1); + if (epfd < 0) { + perror("epoll_create"); + exit(1); + } + + ev.events = EPOLLIN | EPOLLRDNORM; + ev.data.fd = fd; + ret = epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev); + if (ret < 0) { + perror("epoll_add"); + exit(1); + } + + while (1) { + nfds = epoll_wait(epfd, &ev, 1, -1); + if (nfds < 0) { + perror("epoll_wait"); + exit(1); + } + + if (ev.data.fd != fd) { + fprintf(stderr, "Unknown file descriptor %d\n", ev.data.fd); + exit(1); + } + + if (ev.events & EPOLLHUP) { + fprintf(stderr, "Device does not support notifications. 
Stop polling\n"); + exit(1); + } else if (ev.events & EPOLLIN) { + fprintf(stdout, "Got VMClock notification\n"); + read_vmclock(fd, &vmclock); + print_vmclock(&vmclock); + } + } +} + +void print_help_message() +{ + fprintf(stderr, "usage: vmclock MODE\n"); + fprintf(stderr, "Available modes:\n"); + fprintf(stderr, " -r\tRead vmclock_abi using read()\n"); + fprintf(stderr, " -m\tRead vmclock_abi using mmap()\n"); + fprintf(stderr, " -p\tPoll VMClock for changes\n"); +} + +int main(int argc, char *argv[]) +{ + int fd; + struct vmclock_abi vmclock, *vmclock_ptr; + + if (argc != 2) { + print_help_message(); + exit(1); + } + + fd = open_vmclock(); + + if (!strncmp(argv[1], "-r", 3)) { + printf("Reading VMClock with read()\n"); + read_vmclock(fd, &vmclock); + print_vmclock(&vmclock); + } else if (!strncmp(argv[1], "-m", 3)) { + printf("Reading VMClock with mmap()\n"); + vmclock_ptr = get_vmclock_handle(fd); + print_vmclock(vmclock_ptr); + } else if (!strncmp(argv[1], "-p", 3)) { + printf("Polling VMClock\n"); + run_poll(fd); + } else { + print_help_message(); + exit(1); + } return 0; } diff --git a/tests/integration_tests/functional/test_max_devices.py b/tests/integration_tests/functional/test_max_devices.py index 54153b27d2d..bd51d4e53a9 100644 --- a/tests/integration_tests/functional/test_max_devices.py +++ b/tests/integration_tests/functional/test_max_devices.py @@ -18,13 +18,13 @@ def max_devices(uvm): match platform.machine(): case "aarch64": # On aarch64, IRQs are available from 32 to 127. We always use one IRQ each for - # the VMGenID, RTC and serial devices, so the maximum number of devices supported - # at the same time is 93. - return 93 + # the VMGenID, VMClock, RTC and serial devices, so the maximum number of devices + # supported at the same time is 92. + return 92 case "x86_64": - # IRQs are available from 5 to 23. We always use one IRQ for VMGenID device, so - # the maximum number of devices supported at the same time is 18. 
- return 18 + # IRQs are available from 5 to 23. We always use one IRQ for VMGenID and VMClock + # devices, so the maximum number of devices supported at the same time is 17. + return 17 case _: raise ValueError("Unknown platform") diff --git a/tests/integration_tests/functional/test_vmclock.py b/tests/integration_tests/functional/test_vmclock.py index b487526abdb..925c6b021c5 100644 --- a/tests/integration_tests/functional/test_vmclock.py +++ b/tests/integration_tests/functional/test_vmclock.py @@ -2,8 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 """Test VMclock device emulation""" -import platform - import pytest @@ -21,40 +19,88 @@ def vm_with_vmclock(uvm_plain_acpi, bin_vmclock_path): yield basevm -def parse_vmclock(vm): +def parse_vmclock(vm, use_mmap=False): """Parse the VMclock struct inside the guest and return a dictionary with its fields""" - _, stdout, _ = vm.ssh.check_output("/tmp/vmclock") + + cmd = "/tmp/vmclock -m" if use_mmap else "/tmp/vmclock -r" + _, stdout, _ = vm.ssh.check_output(cmd) + fields = stdout.strip().split("\n") + if use_mmap: + assert fields[0] == "Reading VMClock with mmap()" + else: + assert fields[0] == "Reading VMClock with read()" + + return dict(item.split(": ") for item in fields if item.startswith("VMCLOCK")) + + +def parse_vmclock_from_poll(vm, expected_notifications): + """Parse the output of the 'vmclock -p' command in the guest""" + + _, stdout, _ = vm.ssh.check_output("cat /tmp/vmclock.out") fields = stdout.strip().split("\n") - return dict(item.split(": ") for item in fields) + + nr_notifications = 0 + for line in fields: + if line == "Got VMClock notification": + nr_notifications += 1 + + assert nr_notifications == expected_notifications + return dict(item.split(": ") for item in fields if item.startswith("VMCLOCK")) -@pytest.mark.skipif( - platform.machine() != "x86_64", - reason="VMClock device is currently supported only on x86 systems", -) -def test_vmclock_fields(vm_with_vmclock): 
+@pytest.mark.parametrize("use_mmap", [False, True], ids=["read()", "mmap()"]) +def test_vmclock_read_fields(vm_with_vmclock, use_mmap): """Make sure that we expose the expected values in the VMclock struct""" vm = vm_with_vmclock - vmclock = parse_vmclock(vm) + vmclock = parse_vmclock(vm, use_mmap) + assert vmclock["VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT"] == "true" + assert vmclock["VMCLOCK_FLAG_NOTIFICATION_PRESENT"] == "true" assert vmclock["VMCLOCK_MAGIC"] == "0x4b4c4356" assert vmclock["VMCLOCK_SIZE"] == "0x1000" assert vmclock["VMCLOCK_VERSION"] == "1" assert vmclock["VMCLOCK_CLOCK_STATUS"] == "0" assert vmclock["VMCLOCK_COUNTER_ID"] == "255" assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == "0" + assert vmclock["VMCLOCK_VM_GENERATION_COUNTER"] == "0" -@pytest.mark.skipif( - platform.machine() != "x86_64", - reason="VMClock device is currently supported only on x86 systems", -) -def test_snapshot_update(vm_with_vmclock, microvm_factory, snapshot_type): - """Test that `disruption_marker` is updated upon snapshot resume""" +@pytest.mark.parametrize("use_mmap", [False, True], ids=["read()", "mmap()"]) +def test_snapshot_update(vm_with_vmclock, microvm_factory, snapshot_type, use_mmap): + """Test that `disruption_marker` and `vm_generation_counter` are updated + upon snapshot resume""" basevm = vm_with_vmclock - vmclock = parse_vmclock(basevm) + vmclock = parse_vmclock(basevm, use_mmap) + assert vmclock["VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT"] == "true" + assert vmclock["VMCLOCK_FLAG_NOTIFICATION_PRESENT"] == "true" + assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == "0" + assert vmclock["VMCLOCK_VM_GENERATION_COUNTER"] == "0" + + snapshot = basevm.make_snapshot(snapshot_type) + basevm.kill() + + for i, vm in enumerate( + microvm_factory.build_n_from_snapshot(snapshot, 5, incremental=True) + ): + vmclock = parse_vmclock(vm, use_mmap) + assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == f"{i+1}" + assert vmclock["VMCLOCK_VM_GENERATION_COUNTER"] == f"{i+1}" + + +def 
test_vmclock_notifications(vm_with_vmclock, microvm_factory, snapshot_type): + """Test that Firecracker will send a notification on snapshot load""" + basevm = vm_with_vmclock + + # Launch vmclock utility in polling mode + basevm.ssh.check_output("/tmp/vmclock -p > /tmp/vmclock.out 2>&1 &") + + # We should not have received any notification yet + vmclock = parse_vmclock_from_poll(basevm, 0) + assert vmclock["VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT"] == "true" + assert vmclock["VMCLOCK_FLAG_NOTIFICATION_PRESENT"] == "true" assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == "0" + assert vmclock["VMCLOCK_VM_GENERATION_COUNTER"] == "0" snapshot = basevm.make_snapshot(snapshot_type) basevm.kill() @@ -62,5 +108,6 @@ def test_snapshot_update(vm_with_vmclock, microvm_factory, snapshot_type): for i, vm in enumerate( microvm_factory.build_n_from_snapshot(snapshot, 5, incremental=True) ): - vmclock = parse_vmclock(vm) + vmclock = parse_vmclock_from_poll(vm, i + 1) assert vmclock["VMCLOCK_DISRUPTION_MARKER"] == f"{i+1}" + assert vmclock["VMCLOCK_VM_GENERATION_COUNTER"] == f"{i+1}" diff --git a/tools/devtool b/tools/devtool index dd95860695c..80ee3aa2c6b 100755 --- a/tools/devtool +++ b/tools/devtool @@ -573,7 +573,8 @@ ensure_ci_artifacts() { # Fetch all the artifacts so they are local say "Fetching CI artifacts from S3" FC_VERSION=$(cmd_sh "cd src/firecracker/src; cargo pkgid | cut -d# -f2 | cut -d. -f1-2") - S3_URL=s3://spec.ccfc.min/firecracker-ci/v$FC_VERSION/$(uname -m) + #S3_URL=s3://spec.ccfc.min/firecracker-ci/v$FC_VERSION/$(uname -m) + S3_URL=s3://spec.ccfc.min/firecracker-ci/v1.15-vmclock/$(uname -m) ARTIFACTS=$MICROVM_IMAGES_DIR/$(uname -m) if [ ! -d "$ARTIFACTS" ]; then mkdir -pv $ARTIFACTS