summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml46
-rw-r--r--MAINTAINERS1
-rw-r--r--drivers/ptp/Kconfig2
-rw-r--r--drivers/ptp/ptp_vmclock.c236
-rw-r--r--include/uapi/linux/vmclock-abi.h20
5 files changed, 279 insertions, 26 deletions
diff --git a/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml b/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml
new file mode 100644
index 000000000000..357790df876f
--- /dev/null
+++ b/Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/ptp/amazon,vmclock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Virtual Machine Clock
+
+maintainers:
+ - David Woodhouse <dwmw2@infradead.org>
+
+description:
+ The vmclock device provides a precise clock source and allows for
+ accurate timekeeping across live migration and snapshot/restore
+ operations. The full specification of the shared data structure is
+ available at https://uapi-group.org/specifications/specs/vmclock/
+
+properties:
+ compatible:
+ const: amazon,vmclock
+
+ reg:
+ description:
+ Specifies the shared memory region containing the vmclock_abi structure.
+ maxItems: 1
+
+ interrupts:
+ description:
+ Interrupt used to notify when the contents of the vmclock_abi structure
+ have been updated.
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ ptp@80000000 {
+ compatible = "amazon,vmclock";
+ reg = <0x80000000 0x1000>;
+ interrupts = <GIC_SPI 36 IRQ_TYPE_EDGE_RISING>;
+ };
diff --git a/MAINTAINERS b/MAINTAINERS
index 0caa8aee5840..98b07da905b0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -21037,6 +21037,7 @@ PTP VMCLOCK SUPPORT
M: David Woodhouse <dwmw2@infradead.org>
L: netdev@vger.kernel.org
S: Maintained
+F: Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml
F: drivers/ptp/ptp_vmclock.c
F: include/uapi/linux/vmclock-abi.h
diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig
index 5f8ea34d11d6..b93640ca08b7 100644
--- a/drivers/ptp/Kconfig
+++ b/drivers/ptp/Kconfig
@@ -134,7 +134,7 @@ config PTP_1588_CLOCK_KVM
config PTP_1588_CLOCK_VMCLOCK
tristate "Virtual machine PTP clock"
depends on X86_TSC || ARM_ARCH_TIMER
- depends on PTP_1588_CLOCK && ACPI && ARCH_SUPPORTS_INT128
+ depends on PTP_1588_CLOCK && ARCH_SUPPORTS_INT128
default PTP_1588_CLOCK_KVM
help
This driver adds support for using a virtual precision clock
diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c
index b3a83b03d9c1..c7c75e19f4dd 100644
--- a/drivers/ptp/ptp_vmclock.c
+++ b/drivers/ptp/ptp_vmclock.c
@@ -5,16 +5,22 @@
* Copyright © 2024 Amazon.com, Inc. or its affiliates.
*/
+#include <linux/poll.h>
+#include <linux/types.h>
+#include <linux/wait.h>
#include <linux/acpi.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/miscdevice.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/of.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
@@ -39,6 +45,7 @@ struct vmclock_state {
struct resource res;
struct vmclock_abi *clk;
struct miscdevice miscdev;
+ wait_queue_head_t disrupt_wait;
struct ptp_clock_info ptp_clock_info;
struct ptp_clock *ptp_clock;
enum clocksource_ids cs_id, sys_cs_id;
@@ -76,13 +83,13 @@ static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta,
static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec)
{
- if (likely(clk->time_type == VMCLOCK_TIME_UTC))
+ if (clk->time_type == VMCLOCK_TIME_TAI)
return true;
- if (clk->time_type == VMCLOCK_TIME_TAI &&
+ if (clk->time_type == VMCLOCK_TIME_UTC &&
(le64_to_cpu(clk->flags) & VMCLOCK_FLAG_TAI_OFFSET_VALID)) {
if (sec)
- *sec += (int16_t)le16_to_cpu(clk->tai_offset_sec);
+ *sec -= (int16_t)le16_to_cpu(clk->tai_offset_sec);
return true;
}
return false;
@@ -343,9 +350,9 @@ static struct ptp_clock *vmclock_ptp_register(struct device *dev,
return NULL;
}
- /* Only UTC, or TAI with offset */
+ /* Accept TAI directly, or UTC with valid offset for conversion to TAI */
if (!tai_adjust(st->clk, NULL)) {
- dev_info(dev, "vmclock does not provide unambiguous UTC\n");
+ dev_info(dev, "vmclock does not provide unambiguous time\n");
return NULL;
}
@@ -357,10 +364,15 @@ static struct ptp_clock *vmclock_ptp_register(struct device *dev,
return ptp_clock_register(&st->ptp_clock_info, dev);
}
+struct vmclock_file_state {
+ struct vmclock_state *st;
+ atomic_t seq;
+};
+
static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
{
- struct vmclock_state *st = container_of(fp->private_data,
- struct vmclock_state, miscdev);
+ struct vmclock_file_state *fst = fp->private_data;
+ struct vmclock_state *st = fst->st;
if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ)
return -EROFS;
@@ -379,11 +391,11 @@ static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
size_t count, loff_t *ppos)
{
- struct vmclock_state *st = container_of(fp->private_data,
- struct vmclock_state, miscdev);
ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
+ struct vmclock_file_state *fst = fp->private_data;
+ struct vmclock_state *st = fst->st;
+ uint32_t seq, old_seq;
size_t max_count;
- uint32_t seq;
if (*ppos >= PAGE_SIZE)
return 0;
@@ -392,6 +404,7 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
if (count > max_count)
count = max_count;
+ old_seq = atomic_read(&fst->seq);
while (1) {
seq = le32_to_cpu(st->clk->seq_count) & ~1U;
/* Pairs with hypervisor wmb */
@@ -402,8 +415,16 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
/* Pairs with hypervisor wmb */
virt_rmb();
- if (seq == le32_to_cpu(st->clk->seq_count))
- break;
+ if (seq == le32_to_cpu(st->clk->seq_count)) {
+ /*
+ * Either we updated fst->seq to seq (the latest version we observed)
+ * or someone else did (old_seq == seq), so we can break.
+ */
+ if (atomic_try_cmpxchg(&fst->seq, &old_seq, seq) ||
+ old_seq == seq) {
+ break;
+ }
+ }
if (ktime_after(ktime_get(), deadline))
return -ETIMEDOUT;
@@ -413,25 +434,63 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
return count;
}
+static __poll_t vmclock_miscdev_poll(struct file *fp, poll_table *wait)
+{
+ struct vmclock_file_state *fst = fp->private_data;
+ struct vmclock_state *st = fst->st;
+ uint32_t seq;
+
+ /*
+ * The hypervisor will not send us any notifications, so fail
+ * immediately to avoid having the caller sleep forever.
+ */
+ if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT))
+ return EPOLLHUP;
+
+ poll_wait(fp, &st->disrupt_wait, wait);
+
+ seq = le32_to_cpu(st->clk->seq_count);
+ if (atomic_read(&fst->seq) != seq)
+ return EPOLLIN | EPOLLRDNORM;
+
+ return 0;
+}
+
+static int vmclock_miscdev_open(struct inode *inode, struct file *fp)
+{
+ struct vmclock_state *st = container_of(fp->private_data,
+ struct vmclock_state, miscdev);
+ struct vmclock_file_state *fst = kzalloc(sizeof(*fst), GFP_KERNEL);
+
+ if (!fst)
+ return -ENOMEM;
+
+ fst->st = st;
+ atomic_set(&fst->seq, 0);
+
+ fp->private_data = fst;
+
+ return 0;
+}
+
+static int vmclock_miscdev_release(struct inode *inode, struct file *fp)
+{
+ kfree(fp->private_data);
+ return 0;
+}
+
static const struct file_operations vmclock_miscdev_fops = {
.owner = THIS_MODULE,
+ .open = vmclock_miscdev_open,
+ .release = vmclock_miscdev_release,
.mmap = vmclock_miscdev_mmap,
.read = vmclock_miscdev_read,
+ .poll = vmclock_miscdev_poll,
};
/* module operations */
-static void vmclock_remove(void *data)
-{
- struct vmclock_state *st = data;
-
- if (st->ptp_clock)
- ptp_clock_unregister(st->ptp_clock);
-
- if (st->miscdev.minor != MISC_DYNAMIC_MINOR)
- misc_deregister(&st->miscdev);
-}
-
+#if IS_ENABLED(CONFIG_ACPI)
static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data)
{
struct vmclock_state *st = data;
@@ -459,6 +518,40 @@ static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data
return AE_ERROR;
}
+static void
+vmclock_acpi_notification_handler(acpi_handle __always_unused handle,
+ u32 __always_unused event, void *dev)
+{
+ struct device *device = dev;
+ struct vmclock_state *st = device->driver_data;
+
+ wake_up_interruptible(&st->disrupt_wait);
+}
+
+static int vmclock_setup_acpi_notification(struct device *dev)
+{
+ struct acpi_device *adev = ACPI_COMPANION(dev);
+ acpi_status status;
+
+ /*
+ * This should never happen as this function is only called when
+ * has_acpi_companion(dev) is true, but the logic is sufficiently
+ * complex that Coverity can't see the tautology.
+ */
+ if (!adev)
+ return -ENODEV;
+
+ status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY,
+ vmclock_acpi_notification_handler,
+ dev);
+ if (ACPI_FAILURE(status)) {
+ dev_err(dev, "failed to install notification handler\n");
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st)
{
struct acpi_device *adev = ACPI_COMPANION(dev);
@@ -481,6 +574,82 @@ static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st)
return 0;
}
+#endif /* CONFIG_ACPI */
+
+static irqreturn_t vmclock_of_irq_handler(int __always_unused irq, void *_st)
+{
+ struct vmclock_state *st = _st;
+
+ wake_up_interruptible(&st->disrupt_wait);
+ return IRQ_HANDLED;
+}
+
+static int vmclock_probe_dt(struct device *dev, struct vmclock_state *st)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ struct resource *res;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!res)
+ return -ENODEV;
+
+ st->res = *res;
+
+ return 0;
+}
+
+static int vmclock_setup_of_notification(struct device *dev)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ int irq;
+
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0)
+ return irq;
+
+ return devm_request_irq(dev, irq, vmclock_of_irq_handler, IRQF_SHARED,
+ "vmclock", dev->driver_data);
+}
+
+static int vmclock_setup_notification(struct device *dev,
+ struct vmclock_state *st)
+{
+ /* The device does not support notifications. Nothing else to do */
+ if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT))
+ return 0;
+
+#if IS_ENABLED(CONFIG_ACPI)
+ if (has_acpi_companion(dev))
+ return vmclock_setup_acpi_notification(dev);
+#endif
+ return vmclock_setup_of_notification(dev);
+}
+
+static void vmclock_remove(void *data)
+{
+ struct device *dev = data;
+ struct vmclock_state *st = dev->driver_data;
+
+ if (!st) {
+ dev_err(dev, "%s called with NULL driver_data\n", __func__);
+ return;
+ }
+
+#if IS_ENABLED(CONFIG_ACPI)
+ if (has_acpi_companion(dev))
+ acpi_remove_notify_handler(ACPI_COMPANION(dev)->handle,
+ ACPI_DEVICE_NOTIFY,
+ vmclock_acpi_notification_handler);
+#endif
+
+ if (st->ptp_clock)
+ ptp_clock_unregister(st->ptp_clock);
+
+ if (st->miscdev.minor != MISC_DYNAMIC_MINOR)
+ misc_deregister(&st->miscdev);
+
+ dev->driver_data = NULL;
+}
static void vmclock_put_idx(void *data)
{
@@ -499,10 +668,12 @@ static int vmclock_probe(struct platform_device *pdev)
if (!st)
return -ENOMEM;
+#if IS_ENABLED(CONFIG_ACPI)
if (has_acpi_companion(dev))
ret = vmclock_probe_acpi(dev, st);
else
- ret = -EINVAL; /* Only ACPI for now */
+#endif
+ ret = vmclock_probe_dt(dev, st);
if (ret) {
dev_info(dev, "Failed to obtain physical address: %d\n", ret);
@@ -545,7 +716,14 @@ static int vmclock_probe(struct platform_device *pdev)
st->miscdev.minor = MISC_DYNAMIC_MINOR;
- ret = devm_add_action_or_reset(&pdev->dev, vmclock_remove, st);
+ init_waitqueue_head(&st->disrupt_wait);
+ dev->driver_data = st;
+
+ ret = devm_add_action_or_reset(&pdev->dev, vmclock_remove, dev);
+ if (ret)
+ return ret;
+
+ ret = vmclock_setup_notification(dev, st);
if (ret)
return ret;
@@ -591,15 +769,23 @@ static int vmclock_probe(struct platform_device *pdev)
static const struct acpi_device_id vmclock_acpi_ids[] = {
{ "AMZNC10C", 0 },
+ { "VMCLOCK", 0 },
{}
};
MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids);
+static const struct of_device_id vmclock_of_ids[] = {
+ { .compatible = "amazon,vmclock", },
+ { },
+};
+MODULE_DEVICE_TABLE(of, vmclock_of_ids);
+
static struct platform_driver vmclock_platform_driver = {
.probe = vmclock_probe,
.driver = {
.name = "vmclock",
.acpi_match_table = vmclock_acpi_ids,
+ .of_match_table = vmclock_of_ids,
},
};
diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h
index 2d99b29ac44a..d320623b0118 100644
--- a/include/uapi/linux/vmclock-abi.h
+++ b/include/uapi/linux/vmclock-abi.h
@@ -115,6 +115,17 @@ struct vmclock_abi {
* bit again after the update, using the about-to-be-valid fields.
*/
#define VMCLOCK_FLAG_TIME_MONOTONIC (1 << 7)
+ /*
+ * If the VM_GEN_COUNTER_PRESENT flag is set, the hypervisor will
+ * bump the vm_generation_counter field every time the guest is
+ * loaded from some save state (restored from a snapshot).
+ */
+#define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT (1 << 8)
+ /*
+ * If the NOTIFICATION_PRESENT flag is set, the hypervisor will send
+ * a notification every time it updates seq_count to a new even number.
+ */
+#define VMCLOCK_FLAG_NOTIFICATION_PRESENT (1 << 9)
__u8 pad[2];
__u8 clock_status;
@@ -177,6 +188,15 @@ struct vmclock_abi {
__le64 time_frac_sec; /* Units of 1/2^64 of a second */
__le64 time_esterror_nanosec;
__le64 time_maxerror_nanosec;
+
+ /*
+ * This field changes to another non-repeating value when the guest
+ * has been loaded from a snapshot. In addition to handling a
+ * disruption in time (which will also be signalled through the
+ * disruption_marker field), a guest may wish to discard UUIDs,
+ * reset network connections, reseed entropy, etc.
+ */
+ __le64 vm_generation_counter;
};
#endif /* __VMCLOCK_ABI_H__ */