summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2026-02-02 18:06:02 -0800
committerJakub Kicinski <kuba@kernel.org>2026-02-02 18:06:03 -0800
commit74ad1dfe2335df7c1fd36139261c2fba6bcaea40 (patch)
tree7ea3bf188ab00d58af414ecd44794e18061011c8
parent8755aae4aa7569f079135a0590dbd0f7adcfacf4 (diff)
parent562f59fe32914a10af8554634866bd0b56257060 (diff)
Merge branch 'ptp-vmclock-add-vm-generation-counter-and-acpi-notification'
Takahiro Itazuri says: ==================== ptp: vmclock: Add VM generation counter and ACPI notification Similarly to live migration, starting a VM from some serialized state (aka snapshot) is an event which calls for adjusting guest clocks, hence a hypervisor should increase the disruption_marker before resuming the VM vCPUs, letting the guest know. However, loading a snapshot, is slightly different than live migration, especially since we can start multiple VMs from the same serialized state. Apart from adjusting clocks, the guest needs to take additional action during such events, e.g. recreate UUIDs, reset network adapters/connections, reseed entropy pools, etc. These actions are not necessary during live migration. This calls for a differentiation between the two triggering events. We differentiate between the two events via an extra field in the vmclock_abi, called vm_generation_counter. Whereas hypervisors should increase the disruption marker in both cases, they should only increase vm_generation_counter when a snapshot is loaded in a VM (not during live migration). Additionally, we attach an ACPI notification to VMClock. Implementing the notification is optional for the device. VMClock device will declare that it implements the notification by setting VMCLOCK_FLAG_NOTIFICATION_PRESENT bit in vmclock_abi flags. Hypervisors that implement the notification must send an ACPI notification every time seq_count changes to an even number. The driver will propagate these notifications to userspace via the poll() interface. ==================== Link: https://patch.msgid.link/20260130173704.12575-1-itazur@amazon.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml46
-rw-r--r--MAINTAINERS1
-rw-r--r--drivers/ptp/Kconfig2
-rw-r--r--drivers/ptp/ptp_vmclock.c236
-rw-r--r--include/uapi/linux/vmclock-abi.h20
5 files changed, 279 insertions, 26 deletions
diff --git a/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml b/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml
new file mode 100644
index 000000000000..357790df876f
--- /dev/null
+++ b/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/ptp/amazon,vmclock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Virtual Machine Clock
+
+maintainers:
+ - David Woodhouse <dwmw2@infradead.org>
+
+description:
+ The vmclock device provides a precise clock source and allows for
+ accurate timekeeping across live migration and snapshot/restore
+ operations. The full specification of the shared data structure is
+ available at https://uapi-group.org/specifications/specs/vmclock/
+
+properties:
+ compatible:
+ const: amazon,vmclock
+
+ reg:
+ description:
+ Specifies the shared memory region containing the vmclock_abi structure.
+ maxItems: 1
+
+ interrupts:
+ description:
+ Interrupt used to notify when the contents of the vmclock_abi structure
+ have been updated.
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ ptp@80000000 {
+ compatible = "amazon,vmclock";
+ reg = <0x80000000 0x1000>;
+ interrupts = <GIC_SPI 36 IRQ_TYPE_EDGE_RISING>;
+ };
diff --git a/MAINTAINERS b/MAINTAINERS
index 0caa8aee5840..98b07da905b0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -21037,6 +21037,7 @@ PTP VMCLOCK SUPPORT
M: David Woodhouse <dwmw2@infradead.org>
L: netdev@vger.kernel.org
S: Maintained
+F: Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml
F: drivers/ptp/ptp_vmclock.c
F: include/uapi/linux/vmclock-abi.h
diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig
index 5f8ea34d11d6..b93640ca08b7 100644
--- a/drivers/ptp/Kconfig
+++ b/drivers/ptp/Kconfig
@@ -134,7 +134,7 @@ config PTP_1588_CLOCK_KVM
config PTP_1588_CLOCK_VMCLOCK
tristate "Virtual machine PTP clock"
depends on X86_TSC || ARM_ARCH_TIMER
- depends on PTP_1588_CLOCK && ACPI && ARCH_SUPPORTS_INT128
+ depends on PTP_1588_CLOCK && ARCH_SUPPORTS_INT128
default PTP_1588_CLOCK_KVM
help
This driver adds support for using a virtual precision clock
diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c
index b3a83b03d9c1..c7c75e19f4dd 100644
--- a/drivers/ptp/ptp_vmclock.c
+++ b/drivers/ptp/ptp_vmclock.c
@@ -5,16 +5,22 @@
* Copyright © 2024 Amazon.com, Inc. or its affiliates.
*/
+#include "linux/poll.h"
+#include "linux/types.h"
+#include "linux/wait.h"
#include <linux/acpi.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/miscdevice.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/of.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
@@ -39,6 +45,7 @@ struct vmclock_state {
struct resource res;
struct vmclock_abi *clk;
struct miscdevice miscdev;
+ wait_queue_head_t disrupt_wait;
struct ptp_clock_info ptp_clock_info;
struct ptp_clock *ptp_clock;
enum clocksource_ids cs_id, sys_cs_id;
@@ -76,13 +83,13 @@ static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta,
static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec)
{
- if (likely(clk->time_type == VMCLOCK_TIME_UTC))
+ if (clk->time_type == VMCLOCK_TIME_TAI)
return true;
- if (clk->time_type == VMCLOCK_TIME_TAI &&
+ if (clk->time_type == VMCLOCK_TIME_UTC &&
(le64_to_cpu(clk->flags) & VMCLOCK_FLAG_TAI_OFFSET_VALID)) {
if (sec)
- *sec += (int16_t)le16_to_cpu(clk->tai_offset_sec);
+ *sec -= (int16_t)le16_to_cpu(clk->tai_offset_sec);
return true;
}
return false;
@@ -343,9 +350,9 @@ static struct ptp_clock *vmclock_ptp_register(struct device *dev,
return NULL;
}
- /* Only UTC, or TAI with offset */
+ /* Accept TAI directly, or UTC with valid offset for conversion to TAI */
if (!tai_adjust(st->clk, NULL)) {
- dev_info(dev, "vmclock does not provide unambiguous UTC\n");
+ dev_info(dev, "vmclock does not provide unambiguous time\n");
return NULL;
}
@@ -357,10 +364,15 @@ static struct ptp_clock *vmclock_ptp_register(struct device *dev,
return ptp_clock_register(&st->ptp_clock_info, dev);
}
+struct vmclock_file_state {
+ struct vmclock_state *st;
+ atomic_t seq;
+};
+
static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
{
- struct vmclock_state *st = container_of(fp->private_data,
- struct vmclock_state, miscdev);
+ struct vmclock_file_state *fst = fp->private_data;
+ struct vmclock_state *st = fst->st;
if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ)
return -EROFS;
@@ -379,11 +391,11 @@ static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
size_t count, loff_t *ppos)
{
- struct vmclock_state *st = container_of(fp->private_data,
- struct vmclock_state, miscdev);
ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
+ struct vmclock_file_state *fst = fp->private_data;
+ struct vmclock_state *st = fst->st;
+ uint32_t seq, old_seq;
size_t max_count;
- uint32_t seq;
if (*ppos >= PAGE_SIZE)
return 0;
@@ -392,6 +404,7 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
if (count > max_count)
count = max_count;
+ old_seq = atomic_read(&fst->seq);
while (1) {
seq = le32_to_cpu(st->clk->seq_count) & ~1U;
/* Pairs with hypervisor wmb */
@@ -402,8 +415,16 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
/* Pairs with hypervisor wmb */
virt_rmb();
- if (seq == le32_to_cpu(st->clk->seq_count))
- break;
+ if (seq == le32_to_cpu(st->clk->seq_count)) {
+ /*
+ * Either we updated fst->seq to seq (the latest version we observed)
+ * or someone else did (old_seq == seq), so we can break.
+ */
+ if (atomic_try_cmpxchg(&fst->seq, &old_seq, seq) ||
+ old_seq == seq) {
+ break;
+ }
+ }
if (ktime_after(ktime_get(), deadline))
return -ETIMEDOUT;
@@ -413,25 +434,63 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
return count;
}
+static __poll_t vmclock_miscdev_poll(struct file *fp, poll_table *wait)
+{
+ struct vmclock_file_state *fst = fp->private_data;
+ struct vmclock_state *st = fst->st;
+ uint32_t seq;
+
+ /*
+ * Hypervisor will not send us any notifications, so fail immediately
+ * to avoid having caller sleeping for ever.
+ */
+ if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT))
+ return POLLHUP;
+
+ poll_wait(fp, &st->disrupt_wait, wait);
+
+ seq = le32_to_cpu(st->clk->seq_count);
+ if (atomic_read(&fst->seq) != seq)
+ return POLLIN | POLLRDNORM;
+
+ return 0;
+}
+
+static int vmclock_miscdev_open(struct inode *inode, struct file *fp)
+{
+ struct vmclock_state *st = container_of(fp->private_data,
+ struct vmclock_state, miscdev);
+ struct vmclock_file_state *fst = kzalloc(sizeof(*fst), GFP_KERNEL);
+
+ if (!fst)
+ return -ENOMEM;
+
+ fst->st = st;
+ atomic_set(&fst->seq, 0);
+
+ fp->private_data = fst;
+
+ return 0;
+}
+
+static int vmclock_miscdev_release(struct inode *inode, struct file *fp)
+{
+ kfree(fp->private_data);
+ return 0;
+}
+
static const struct file_operations vmclock_miscdev_fops = {
.owner = THIS_MODULE,
+ .open = vmclock_miscdev_open,
+ .release = vmclock_miscdev_release,
.mmap = vmclock_miscdev_mmap,
.read = vmclock_miscdev_read,
+ .poll = vmclock_miscdev_poll,
};
/* module operations */
-static void vmclock_remove(void *data)
-{
- struct vmclock_state *st = data;
-
- if (st->ptp_clock)
- ptp_clock_unregister(st->ptp_clock);
-
- if (st->miscdev.minor != MISC_DYNAMIC_MINOR)
- misc_deregister(&st->miscdev);
-}
-
+#if IS_ENABLED(CONFIG_ACPI)
static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data)
{
struct vmclock_state *st = data;
@@ -459,6 +518,40 @@ static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data
return AE_ERROR;
}
+static void
+vmclock_acpi_notification_handler(acpi_handle __always_unused handle,
+ u32 __always_unused event, void *dev)
+{
+ struct device *device = dev;
+ struct vmclock_state *st = device->driver_data;
+
+ wake_up_interruptible(&st->disrupt_wait);
+}
+
+static int vmclock_setup_acpi_notification(struct device *dev)
+{
+ struct acpi_device *adev = ACPI_COMPANION(dev);
+ acpi_status status;
+
+ /*
+ * This should never happen as this function is only called when
+ * has_acpi_companion(dev) is true, but the logic is sufficiently
+ * complex that Coverity can't see the tautology.
+ */
+ if (!adev)
+ return -ENODEV;
+
+ status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY,
+ vmclock_acpi_notification_handler,
+ dev);
+ if (ACPI_FAILURE(status)) {
+ dev_err(dev, "failed to install notification handler");
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st)
{
struct acpi_device *adev = ACPI_COMPANION(dev);
@@ -481,6 +574,82 @@ static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st)
return 0;
}
+#endif /* CONFIG_ACPI */
+
+static irqreturn_t vmclock_of_irq_handler(int __always_unused irq, void *_st)
+{
+ struct vmclock_state *st = _st;
+
+ wake_up_interruptible(&st->disrupt_wait);
+ return IRQ_HANDLED;
+}
+
+static int vmclock_probe_dt(struct device *dev, struct vmclock_state *st)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ struct resource *res;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!res)
+ return -ENODEV;
+
+ st->res = *res;
+
+ return 0;
+}
+
+static int vmclock_setup_of_notification(struct device *dev)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ int irq;
+
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0)
+ return irq;
+
+ return devm_request_irq(dev, irq, vmclock_of_irq_handler, IRQF_SHARED,
+ "vmclock", dev->driver_data);
+}
+
+static int vmclock_setup_notification(struct device *dev,
+ struct vmclock_state *st)
+{
+ /* The device does not support notifications. Nothing else to do */
+ if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT))
+ return 0;
+
+#if IS_ENABLED(CONFIG_ACPI)
+ if (has_acpi_companion(dev))
+ return vmclock_setup_acpi_notification(dev);
+#endif
+ return vmclock_setup_of_notification(dev);
+}
+
+static void vmclock_remove(void *data)
+{
+ struct device *dev = data;
+ struct vmclock_state *st = dev->driver_data;
+
+ if (!st) {
+ dev_err(dev, "%s called with NULL driver_data", __func__);
+ return;
+ }
+
+#if IS_ENABLED(CONFIG_ACPI)
+ if (has_acpi_companion(dev))
+ acpi_remove_notify_handler(ACPI_COMPANION(dev)->handle,
+ ACPI_DEVICE_NOTIFY,
+ vmclock_acpi_notification_handler);
+#endif
+
+ if (st->ptp_clock)
+ ptp_clock_unregister(st->ptp_clock);
+
+ if (st->miscdev.minor != MISC_DYNAMIC_MINOR)
+ misc_deregister(&st->miscdev);
+
+ dev->driver_data = NULL;
+}
static void vmclock_put_idx(void *data)
{
@@ -499,10 +668,12 @@ static int vmclock_probe(struct platform_device *pdev)
if (!st)
return -ENOMEM;
+#if IS_ENABLED(CONFIG_ACPI)
if (has_acpi_companion(dev))
ret = vmclock_probe_acpi(dev, st);
else
- ret = -EINVAL; /* Only ACPI for now */
+#endif
+ ret = vmclock_probe_dt(dev, st);
if (ret) {
dev_info(dev, "Failed to obtain physical address: %d\n", ret);
@@ -545,7 +716,14 @@ static int vmclock_probe(struct platform_device *pdev)
st->miscdev.minor = MISC_DYNAMIC_MINOR;
- ret = devm_add_action_or_reset(&pdev->dev, vmclock_remove, st);
+ init_waitqueue_head(&st->disrupt_wait);
+ dev->driver_data = st;
+
+ ret = devm_add_action_or_reset(&pdev->dev, vmclock_remove, dev);
+ if (ret)
+ return ret;
+
+ ret = vmclock_setup_notification(dev, st);
if (ret)
return ret;
@@ -591,15 +769,23 @@ static int vmclock_probe(struct platform_device *pdev)
static const struct acpi_device_id vmclock_acpi_ids[] = {
{ "AMZNC10C", 0 },
+ { "VMCLOCK", 0 },
{}
};
MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids);
+static const struct of_device_id vmclock_of_ids[] = {
+ { .compatible = "amazon,vmclock", },
+ { },
+};
+MODULE_DEVICE_TABLE(of, vmclock_of_ids);
+
static struct platform_driver vmclock_platform_driver = {
.probe = vmclock_probe,
.driver = {
.name = "vmclock",
.acpi_match_table = vmclock_acpi_ids,
+ .of_match_table = vmclock_of_ids,
},
};
diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h
index 2d99b29ac44a..d320623b0118 100644
--- a/include/uapi/linux/vmclock-abi.h
+++ b/include/uapi/linux/vmclock-abi.h
@@ -115,6 +115,17 @@ struct vmclock_abi {
* bit again after the update, using the about-to-be-valid fields.
*/
#define VMCLOCK_FLAG_TIME_MONOTONIC (1 << 7)
+ /*
+ * If the VM_GEN_COUNTER_PRESENT flag is set, the hypervisor will
+ * bump the vm_generation_counter field every time the guest is
+ * loaded from some save state (restored from a snapshot).
+ */
+#define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT (1 << 8)
+ /*
+ * If the NOTIFICATION_PRESENT flag is set, the hypervisor will send
+ * a notification every time it updates seq_count to a new even number.
+ */
+#define VMCLOCK_FLAG_NOTIFICATION_PRESENT (1 << 9)
__u8 pad[2];
__u8 clock_status;
@@ -177,6 +188,15 @@ struct vmclock_abi {
__le64 time_frac_sec; /* Units of 1/2^64 of a second */
__le64 time_esterror_nanosec;
__le64 time_maxerror_nanosec;
+
+ /*
+ * This field changes to another non-repeating value when the guest
+ * has been loaded from a snapshot. In addition to handling a
+ * disruption in time (which will also be signalled through the
+ * disruption_marker field), a guest may wish to discard UUIDs,
+ * reset network connections, reseed entropy, etc.
+ */
+ __le64 vm_generation_counter;
};
#endif /* __VMCLOCK_ABI_H__ */