summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorPratyush Yadav <ptyadav@amazon.de>2025-11-25 11:58:44 -0500
committerAndrew Morton <akpm@linux-foundation.org>2025-11-27 14:24:41 -0800
commitb3749f174d686627f702234e64bad976dc432dbc (patch)
treeead5cc27a47c23fd1cbccbb6d69efa8102e1802b /include
parent8def18633e8df54a05cf7d323d0df24c21b320d6 (diff)
mm: memfd_luo: allow preserving memfd
The ability to preserve a memfd allows userspace to use KHO and LUO to transfer its memory contents to the next kernel. This is useful in many ways. For one, it can be used with IOMMUFD as the backing store for IOMMU page tables. Preserving IOMMUFD is essential for performing a hypervisor live update with passthrough devices. memfd support provides the first building block for making that possible. For another, applications with a large amount of memory that takes time to reconstruct, reboots to consume kernel upgrades can be very expensive. memfd with LUO gives those applications reboot-persistent memory that they can use to quickly save and reconstruct that state. While memfd is backed by either hugetlbfs or shmem, currently only support on shmem is added. To be more precise, support for anonymous shmem files is added. The handover to the next kernel is not transparent. All the properties of the file are not preserved; only its memory contents, position, and size. The recreated file gets the UID and GID of the task doing the restore, and the task's cgroup gets charged with the memory. Once preserved, the file cannot grow or shrink, and all its pages are pinned to avoid migrations and swapping. The file can still be read from or written to. Use vmalloc to get the buffer to hold the folios, and preserve it using kho_preserve_vmalloc(). This doesn't have the size limit. Link: https://lkml.kernel.org/r/20251125165850.3389713-15-pasha.tatashin@soleen.com Signed-off-by: Pratyush Yadav <ptyadav@amazon.de> Co-developed-by: Pasha Tatashin <pasha.tatashin@soleen.com> Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com> Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org> Tested-by: David Matlack <dmatlack@google.com> Cc: Aleksander Lobakin <aleksander.lobakin@intel.com> Cc: Alexander Graf <graf@amazon.com> Cc: Alice Ryhl <aliceryhl@google.com> Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com> Cc: anish kumar <yesanishhere@gmail.com> Cc: Anna Schumaker <anna.schumaker@oracle.com> Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org> Cc: Bjorn Helgaas <bhelgaas@google.com> Cc: Borislav Betkov <bp@alien8.de> Cc: Chanwoo Choi <cw00.choi@samsung.com> Cc: Chen Ridong <chenridong@huawei.com> Cc: Chris Li <chrisl@kernel.org> Cc: Christian Brauner <brauner@kernel.org> Cc: Daniel Wagner <wagi@kernel.org> Cc: Danilo Krummrich <dakr@kernel.org> Cc: Dan Williams <dan.j.williams@intel.com> Cc: David Hildenbrand <david@redhat.com> Cc: David Jeffery <djeffery@redhat.com> Cc: David Rientjes <rientjes@google.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Guixin Liu <kanie@linux.alibaba.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Hugh Dickins <hughd@google.com> Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Ira Weiny <ira.weiny@intel.com> Cc: Jann Horn <jannh@google.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com> Cc: Joel Granados <joel.granados@kernel.org> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Lennart Poettering <lennart@poettering.net> Cc: Leon Romanovsky <leon@kernel.org> Cc: Leon Romanovsky <leonro@nvidia.com> Cc: Lukas Wunner <lukas@wunner.de> Cc: Marc Rutland <mark.rutland@arm.com> Cc: Masahiro Yamada <masahiroy@kernel.org> Cc: Matthew Maurer <mmaurer@google.com> Cc: Miguel Ojeda <ojeda@kernel.org> Cc: Myugnjoo Ham <myungjoo.ham@samsung.com> Cc: Parav Pandit <parav@nvidia.com> Cc: Pratyush Yadav <pratyush@kernel.org> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: Saeed Mahameed <saeedm@nvidia.com> Cc: Samiullah Khawaja <skhawaja@google.com> Cc: Song Liu <song@kernel.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Stuart Hayes <stuart.w.hayes@gmail.com> Cc: Tejun Heo <tj@kernel.org> Cc: Thomas Gleinxer <tglx@linutronix.de> Cc: Thomas Weißschuh <linux@weissschuh.net> Cc: Vincent Guittot <vincent.guittot@linaro.org> Cc: William Tu <witu@nvidia.com> Cc: Yoann Congal <yoann.congal@smile.fr> Cc: Zhu Yanjun <yanjun.zhu@linux.dev> Cc: Zijun Hu <quic_zijuhu@quicinc.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'include')
-rw-r--r--include/linux/kho/abi/memfd.h77
1 files changed, 77 insertions, 0 deletions
diff --git a/include/linux/kho/abi/memfd.h b/include/linux/kho/abi/memfd.h
new file mode 100644
index 000000000000..da7d063474a1
--- /dev/null
+++ b/include/linux/kho/abi/memfd.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ *
+ * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
+ * Pratyush Yadav <ptyadav@amazon.de>
+ */
+
+#ifndef _LINUX_KHO_ABI_MEMFD_H
+#define _LINUX_KHO_ABI_MEMFD_H
+
+#include <linux/types.h>
+#include <linux/kexec_handover.h>
+
+/**
+ * DOC: memfd Live Update ABI
+ *
+ * This header defines the ABI for preserving the state of a memfd across a
+ * kexec reboot using the LUO.
+ *
+ * The state is serialized into a packed structure `struct memfd_luo_ser`
+ * which is handed over to the next kernel via the KHO mechanism.
+ *
+ * This interface is a contract. Any modification to the structure layout
+ * constitutes a breaking change. Such changes require incrementing the
+ * version number in the MEMFD_LUO_FH_COMPATIBLE string.
+ */
+
+/**
+ * MEMFD_LUO_FOLIO_DIRTY - The folio is dirty.
+ *
+ * This flag indicates the folio contains data from user. A non-dirty folio is
+ * one that was allocated (say using fallocate(2)) but not written to.
+ */
+#define MEMFD_LUO_FOLIO_DIRTY BIT(0)
+
+/**
+ * MEMFD_LUO_FOLIO_UPTODATE - The folio is up-to-date.
+ *
+ * An up-to-date folio has been zeroed out. shmem zeroes out folios on first
+ * use. This flag tracks which folios need zeroing.
+ */
+#define MEMFD_LUO_FOLIO_UPTODATE BIT(1)
+
+/**
+ * struct memfd_luo_folio_ser - Serialized state of a single folio.
+ * @pfn: The page frame number of the folio.
+ * @flags: Flags to describe the state of the folio.
+ * @index: The page offset (pgoff_t) of the folio within the original file.
+ */
+struct memfd_luo_folio_ser {
+ u64 pfn:52;
+ u64 flags:12;
+ u64 index;
+} __packed;
+
+/**
+ * struct memfd_luo_ser - Main serialization structure for a memfd.
+ * @pos: The file's current position (f_pos).
+ * @size: The total size of the file in bytes (i_size).
+ * @nr_folios: Number of folios in the folios array.
+ * @folios: KHO vmalloc descriptor pointing to the array of
+ * struct memfd_luo_folio_ser.
+ */
+struct memfd_luo_ser {
+ u64 pos;
+ u64 size;
+ u64 nr_folios;
+ struct kho_vmalloc folios;
+} __packed;
+
+/* The compatibility string for memfd file handler */
+#define MEMFD_LUO_FH_COMPATIBLE "memfd-v1"
+
+#endif /* _LINUX_KHO_ABI_MEMFD_H */