Diffstat (limited to 'include')
-rw-r--r--  include/linux/alloc_tag.h              |   2
-rw-r--r--  include/linux/damon.h                  |   2
-rw-r--r--  include/linux/fs.h                     |   9
-rw-r--r--  include/linux/kexec_handover.h         |  13
-rw-r--r--  include/linux/kho/abi/kexec_handover.h |  20
-rw-r--r--  include/linux/kho/abi/kexec_metadata.h |  46
-rw-r--r--  include/linux/liveupdate.h             |  17
-rw-r--r--  include/linux/memcontrol.h             | 191
-rw-r--r--  include/linux/mm.h                     |   5
-rw-r--r--  include/linux/mm_inline.h              |   6
-rw-r--r--  include/linux/mmzone.h                 |  42
-rw-r--r--  include/linux/pgalloc_tag.h            |   2
-rw-r--r--  include/linux/sched.h                  |   2
-rw-r--r--  include/linux/shmem_fs.h               |  14
-rw-r--r--  include/linux/swap.h                   |  25
-rw-r--r--  include/linux/userfaultfd_k.h          |  73
-rw-r--r--  include/trace/events/memcg.h           |  10
-rw-r--r--  include/trace/events/writeback.h       |   3
18 files changed, 286 insertions, 196 deletions
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index d40ac39bfbe8..02de2ede560f 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -163,9 +163,11 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref)
{
WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
}
+void alloc_tag_add_early_pfn(unsigned long pfn);
#else
static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {}
static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
+static inline void alloc_tag_add_early_pfn(unsigned long pfn) {}
#endif
/* Caller should verify both ref and tag to be valid */
diff --git a/include/linux/damon.h b/include/linux/damon.h
index d9a3babbafc1..f2cdb7c3f5e6 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -818,9 +818,11 @@ struct damon_ctx {
/* lists of &struct damon_call_control */
struct list_head call_controls;
+ bool call_controls_obsolete;
struct mutex call_controls_lock;
struct damos_walk_control *walk_control;
+ bool walk_control_obsolete;
struct mutex walk_control_lock;
/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e1d257e6da68..11559c513dfb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2062,20 +2062,13 @@ void compat_set_desc_from_vma(struct vm_area_desc *desc, const struct file *file
const struct vm_area_struct *vma);
int __compat_vma_mmap(struct vm_area_desc *desc, struct vm_area_struct *vma);
int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
-int __vma_check_mmap_hook(struct vm_area_struct *vma);
static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
{
- int err;
-
if (file->f_op->mmap_prepare)
return compat_vma_mmap(file, vma);
- err = file->f_op->mmap(file, vma);
- if (err)
- return err;
-
- return __vma_check_mmap_hook(vma);
+ return file->f_op->mmap(file, vma);
}
static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index ac4129d1d741..8968c56d2d73 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -32,9 +32,9 @@ void kho_restore_free(void *mem);
struct folio *kho_restore_folio(phys_addr_t phys);
struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages);
void *kho_restore_vmalloc(const struct kho_vmalloc *preservation);
-int kho_add_subtree(const char *name, void *fdt);
-void kho_remove_subtree(void *fdt);
-int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
+int kho_add_subtree(const char *name, void *blob, size_t size);
+void kho_remove_subtree(void *blob);
+int kho_retrieve_subtree(const char *name, phys_addr_t *phys, size_t *size);
void kho_memory_init(void);
@@ -97,14 +97,15 @@ static inline void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
return NULL;
}
-static inline int kho_add_subtree(const char *name, void *fdt)
+static inline int kho_add_subtree(const char *name, void *blob, size_t size)
{
return -EOPNOTSUPP;
}
-static inline void kho_remove_subtree(void *fdt) { }
+static inline void kho_remove_subtree(void *blob) { }
-static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
+static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys,
+ size_t *size)
{
return -EOPNOTSUPP;
}
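
For context, a minimal caller sketch of the size-aware API above. The node
name and the example_* helpers are illustrative, not part of the patch, and
preservation of the blob's backing memory is elided:

static int example_save_blob(void *blob, size_t size)
{
	/* The blob size now travels with the registration. */
	return kho_add_subtree("example-node", blob, size);
}

static int example_restore_blob(void)
{
	phys_addr_t phys;
	size_t size;
	int err;

	/* The size is returned too, since the blob need not be an FDT. */
	err = kho_retrieve_subtree("example-node", &phys, &size);
	if (err)
		return err;

	return example_use_blob(phys_to_virt(phys), size);
}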
diff --git a/include/linux/kho/abi/kexec_handover.h b/include/linux/kho/abi/kexec_handover.h
index 6b7d8ef550f9..7e847a2339b0 100644
--- a/include/linux/kho/abi/kexec_handover.h
+++ b/include/linux/kho/abi/kexec_handover.h
@@ -41,25 +41,28 @@
* restore the preserved data.::
*
* / {
- * compatible = "kho-v2";
+ * compatible = "kho-v3";
*
* preserved-memory-map = <0x...>;
*
* <subnode-name-1> {
* preserved-data = <0x...>;
+ * blob-size = <0x...>;
* };
*
* <subnode-name-2> {
* preserved-data = <0x...>;
+ * blob-size = <0x...>;
* };
* ... ...
* <subnode-name-N> {
* preserved-data = <0x...>;
+ * blob-size = <0x...>;
* };
* };
*
* Root KHO Node (/):
- * - compatible: "kho-v2"
+ * - compatible: "kho-v3"
*
* Identifies the overall KHO ABI version.
*
@@ -78,16 +81,25 @@
*
* Physical address pointing to a subnode data blob that is also
* being preserved.
+ *
+ * - blob-size: u64
+ *
+ * Size in bytes of the preserved data blob. This is needed because
+ * blobs may use arbitrary formats (not just FDT), so the size
+ * cannot be determined from the blob content alone.
*/
/* The compatible string for the KHO FDT root node. */
-#define KHO_FDT_COMPATIBLE "kho-v2"
+#define KHO_FDT_COMPATIBLE "kho-v3"
/* The FDT property for the preserved memory map. */
#define KHO_FDT_MEMORY_MAP_PROP_NAME "preserved-memory-map"
/* The FDT property for preserved data blobs. */
-#define KHO_FDT_SUB_TREE_PROP_NAME "preserved-data"
+#define KHO_SUB_TREE_PROP_NAME "preserved-data"
+
+/* The FDT property for the size of preserved data blobs. */
+#define KHO_SUB_TREE_SIZE_PROP_NAME "blob-size"
/**
* DOC: Kexec Handover ABI for vmalloc Preservation
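
A hedged sketch of how a consumer might read the new property pair with
libfdt, assuming both properties are stored as big-endian u64 cells; the
subnode name and example_map_blob() are illustrative:

static int example_parse_subnode(const void *fdt)
{
	const fdt64_t *pdata, *bsize;
	int node, len;

	node = fdt_subnode_offset(fdt, 0, "example-subnode");
	if (node < 0)
		return -ENOENT;

	pdata = fdt_getprop(fdt, node, KHO_SUB_TREE_PROP_NAME, &len);
	bsize = fdt_getprop(fdt, node, KHO_SUB_TREE_SIZE_PROP_NAME, &len);
	if (!pdata || !bsize)
		return -EINVAL;

	/* blob-size is required in v3: the blob may not describe itself. */
	return example_map_blob(fdt64_to_cpu(*pdata), fdt64_to_cpu(*bsize));
}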
diff --git a/include/linux/kho/abi/kexec_metadata.h b/include/linux/kho/abi/kexec_metadata.h
new file mode 100644
index 000000000000..e9e3f7e38a7c
--- /dev/null
+++ b/include/linux/kho/abi/kexec_metadata.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+/**
+ * DOC: Kexec Metadata ABI
+ *
+ * The "kexec-metadata" subtree stores optional metadata about the kexec chain.
+ * It is registered via kho_add_subtree(), keeping it independent from the core
+ * KHO ABI. This allows the metadata format to evolve without affecting other
+ * KHO consumers.
+ *
+ * The metadata is stored as a plain C struct rather than FDT format for
+ * simplicity and direct field access.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Breno Leitao <leitao@debian.org>
+ */
+
+#ifndef _LINUX_KHO_ABI_KEXEC_METADATA_H
+#define _LINUX_KHO_ABI_KEXEC_METADATA_H
+
+#include <linux/types.h>
+#include <linux/utsname.h>
+
+#define KHO_KEXEC_METADATA_VERSION 1
+
+/**
+ * struct kho_kexec_metadata - Kexec metadata passed between kernels
+ * @version: ABI version of this struct (must be first field)
+ * @previous_release: Kernel version string that initiated the kexec
+ * @kexec_count: Number of kexec boots since last cold boot
+ *
+ * This structure is preserved across kexec and allows the new kernel to
+ * identify which kernel it was booted from and how many kexec reboots
+ * have occurred.
+ *
+ * __NEW_UTS_LEN is part of the uABI, so it is safe to use here.
+ */
+struct kho_kexec_metadata {
+ u32 version;
+ char previous_release[__NEW_UTS_LEN + 1];
+ u32 kexec_count;
+} __packed;
+
+#define KHO_METADATA_NODE_NAME "kexec-metadata"
+
+#endif /* _LINUX_KHO_ABI_KEXEC_METADATA_H */
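
A possible producer-side sketch for this struct. The function name is
illustrative; the restore path that carries kexec_count forward from the
previous kernel, and the KHO preservation of the struct's backing memory,
are elided:

static struct kho_kexec_metadata kmeta;

static int example_kexec_metadata_save(void)
{
	kmeta.version = KHO_KEXEC_METADATA_VERSION;
	strscpy(kmeta.previous_release, init_utsname()->release,
		sizeof(kmeta.previous_release));
	kmeta.kexec_count++;	/* restored value, bumped for this kexec */

	return kho_add_subtree(KHO_METADATA_NODE_NAME, &kmeta, sizeof(kmeta));
}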
diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h
index dd11fdc76a5f..30c5a39ff9e9 100644
--- a/include/linux/liveupdate.h
+++ b/include/linux/liveupdate.h
@@ -12,6 +12,7 @@
#include <linux/kho/abi/luo.h>
#include <linux/list.h>
#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/types.h>
#include <uapi/linux/liveupdate.h>
@@ -63,6 +64,7 @@ struct liveupdate_file_op_args {
* finish, in order to do successful finish calls for all
* resources in the session.
* @finish: Required. Final cleanup in the new kernel.
+ * @get_id: Optional. Returns a unique identifier for the file.
* @owner: Module reference
*
* All operations (except can_preserve) receive a pointer to a
@@ -78,6 +80,7 @@ struct liveupdate_file_ops {
int (*retrieve)(struct liveupdate_file_op_args *args);
bool (*can_finish)(struct liveupdate_file_op_args *args);
void (*finish)(struct liveupdate_file_op_args *args);
+ unsigned long (*get_id)(struct file *file);
struct module *owner;
};
@@ -228,12 +231,12 @@ bool liveupdate_enabled(void);
int liveupdate_reboot(void);
int liveupdate_register_file_handler(struct liveupdate_file_handler *fh);
-int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh);
+void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh);
int liveupdate_register_flb(struct liveupdate_file_handler *fh,
struct liveupdate_flb *flb);
-int liveupdate_unregister_flb(struct liveupdate_file_handler *fh,
- struct liveupdate_flb *flb);
+void liveupdate_unregister_flb(struct liveupdate_file_handler *fh,
+ struct liveupdate_flb *flb);
int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp);
int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp);
@@ -255,9 +258,8 @@ static inline int liveupdate_register_file_handler(struct liveupdate_file_handle
return -EOPNOTSUPP;
}
-static inline int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
+static inline void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
{
- return -EOPNOTSUPP;
}
static inline int liveupdate_register_flb(struct liveupdate_file_handler *fh,
@@ -266,10 +268,9 @@ static inline int liveupdate_register_flb(struct liveupdate_file_handler *fh,
return -EOPNOTSUPP;
}
-static inline int liveupdate_unregister_flb(struct liveupdate_file_handler *fh,
- struct liveupdate_flb *flb)
+static inline void liveupdate_unregister_flb(struct liveupdate_file_handler *fh,
+ struct liveupdate_flb *flb)
{
- return -EOPNOTSUPP;
}
static inline int liveupdate_flb_get_incoming(struct liveupdate_flb *flb,
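
A hedged sketch of a handler supplying the new optional callback; names are
illustrative and the required callbacks are elided:

static unsigned long example_get_id(struct file *file)
{
	/* e.g. a stable per-file identifier derived from the inode */
	return file_inode(file)->i_ino;
}

static struct liveupdate_file_ops example_file_ops = {
	.get_id	= example_get_id,
	.owner	= THIS_MODULE,
	/* .retrieve, .can_finish, .finish, etc. elided */
};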
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5173a9f16721..dc3fa687759b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -115,6 +115,16 @@ struct mem_cgroup_per_node {
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
struct mem_cgroup_reclaim_iter iter;
+ /*
+ * objcg is wiped out as a part of the objcg reparenting process.
+ * orig_objcg preserves a pointer (and a reference) to the original
+ * objcg until the end of the memcg's life.
+ */
+ struct obj_cgroup __rcu *objcg;
+ struct obj_cgroup *orig_objcg;
+ /* list of inherited objcgs, protected by objcg_lock */
+ struct list_head objcg_list;
+
#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
/* slab stats for nmi context */
atomic_t slab_reclaimable;
@@ -179,6 +189,7 @@ struct obj_cgroup {
struct list_head list; /* protected by objcg_lock */
struct rcu_head rcu;
};
+ bool is_root;
};
/*
@@ -257,15 +268,6 @@ struct mem_cgroup {
seqlock_t socket_pressure_seqlock;
#endif
int kmemcg_id;
- /*
- * memcg->objcg is wiped out as a part of the objcg repaprenting
- * process. memcg->orig_objcg preserves a pointer (and a reference)
- * to the original objcg until the end of live of memcg.
- */
- struct obj_cgroup __rcu *objcg;
- struct obj_cgroup *orig_objcg;
- /* list of inherited objcgs, protected by objcg_lock */
- struct list_head objcg_list;
struct memcg_vmstats_percpu __percpu *vmstats_percpu;
@@ -367,9 +369,6 @@ enum objext_flags {
#define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1)
#ifdef CONFIG_MEMCG
-
-static inline bool folio_memcg_kmem(struct folio *folio);
-
/*
* After the initialization objcg->memcg is always pointing at
* a valid memcg, but can be atomically swapped to the parent memcg.
@@ -383,43 +382,19 @@ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
}
/*
- * __folio_memcg - Get the memory cgroup associated with a non-kmem folio
- * @folio: Pointer to the folio.
- *
- * Returns a pointer to the memory cgroup associated with the folio,
- * or NULL. This function assumes that the folio is known to have a
- * proper memory cgroup pointer. It's not safe to call this function
- * against some type of folios, e.g. slab folios or ex-slab folios or
- * kmem folios.
- */
-static inline struct mem_cgroup *__folio_memcg(struct folio *folio)
-{
- unsigned long memcg_data = folio->memcg_data;
-
- VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
- VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
- VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio);
-
- return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
-}
-
-/*
- * __folio_objcg - get the object cgroup associated with a kmem folio.
+ * folio_objcg - get the object cgroup associated with a folio.
* @folio: Pointer to the folio.
*
* Returns a pointer to the object cgroup associated with the folio,
* or NULL. This function assumes that the folio is known to have a
- * proper object cgroup pointer. It's not safe to call this function
- * against some type of folios, e.g. slab folios or ex-slab folios or
- * LRU folios.
+ * proper object cgroup pointer.
*/
-static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
+static inline struct obj_cgroup *folio_objcg(struct folio *folio)
{
unsigned long memcg_data = folio->memcg_data;
VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
- VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio);
return (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
}
@@ -433,21 +408,30 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
* proper memory cgroup pointer. It's not safe to call this function
* against some type of folios, e.g. slab folios or ex-slab folios.
*
- * For a non-kmem folio any of the following ensures folio and memcg binding
- * stability:
+ * For a folio any of the following ensures folio and objcg binding stability:
*
* - the folio lock
* - LRU isolation
* - exclusive reference
*
- * For a kmem folio a caller should hold an rcu read lock to protect memcg
- * associated with a kmem folio from being released.
+ * Based on the stable binding of folio and objcg, for a folio any of the
+ * following ensures folio and memcg binding stability:
+ *
+ * - cgroup_mutex
+ * - the lruvec lock
+ *
+ * If the caller only wants to ensure that the memcg page counters are
+ * updated correctly, the binding stability of folio and objcg is
+ * sufficient.
+ *
+ * Note: The caller should hold an rcu read lock or cgroup_mutex to protect
+ * memcg associated with a folio from being released.
*/
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
{
- if (folio_memcg_kmem(folio))
- return obj_cgroup_memcg(__folio_objcg(folio));
- return __folio_memcg(folio);
+ struct obj_cgroup *objcg = folio_objcg(folio);
+
+ return objcg ? obj_cgroup_memcg(objcg) : NULL;
}
/*
@@ -471,15 +455,10 @@ static inline bool folio_memcg_charged(struct folio *folio)
* has an associated memory cgroup pointer or an object cgroups vector or
* an object cgroup.
*
- * For a non-kmem folio any of the following ensures folio and memcg binding
- * stability:
+ * For the page-to-objcg and page-to-memcg binding rules, see folio_memcg().
*
- * - the folio lock
- * - LRU isolation
- * - exclusive reference
- *
- * For a kmem folio a caller should hold an rcu read lock to protect memcg
- * associated with a kmem folio from being released.
+ * A caller should hold an rcu read lock to protect memcg associated with a
+ * page from being released.
*/
static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
{
@@ -488,18 +467,14 @@ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
* for slabs, READ_ONCE() should be used here.
*/
unsigned long memcg_data = READ_ONCE(folio->memcg_data);
+ struct obj_cgroup *objcg;
if (memcg_data & MEMCG_DATA_OBJEXTS)
return NULL;
- if (memcg_data & MEMCG_DATA_KMEM) {
- struct obj_cgroup *objcg;
+ objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
- objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
- return obj_cgroup_memcg(objcg);
- }
-
- return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
+ return objcg ? obj_cgroup_memcg(objcg) : NULL;
}
static inline struct mem_cgroup *page_memcg_check(struct page *page)
@@ -548,6 +523,11 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
return (memcg == root_mem_cgroup);
}
+static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg)
+{
+ return objcg->is_root;
+}
+
static inline bool mem_cgroup_disabled(void)
{
return !cgroup_subsys_enabled(memory_cgrp_subsys);
@@ -735,7 +715,15 @@ out:
* folio_lruvec - return lruvec for isolating/putting an LRU folio
* @folio: Pointer to the folio.
*
- * This function relies on folio->mem_cgroup being stable.
+ * Call with rcu_read_lock() held to ensure the lifetime of the returned lruvec.
+ * Note that this alone will NOT guarantee the stability of the folio->lruvec
+ * association; the folio can be reparented to an ancestor if this races with
+ * cgroup deletion.
+ *
+ * Use folio_lruvec_lock() to ensure both lifetime and stability of the binding.
+ * Once a lruvec is locked, folio_lruvec() can be called on other folios, and
+ * their binding is stable if the returned lruvec matches the one the caller has
+ * locked. Useful for lock batching.
*/
static inline struct lruvec *folio_lruvec(struct folio *folio)
{
@@ -758,15 +746,6 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
unsigned long *flags);
-#ifdef CONFIG_DEBUG_VM
-void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio);
-#else
-static inline
-void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
-{
-}
-#endif
-
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
@@ -774,23 +753,26 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg)
{
+ if (obj_cgroup_is_root(objcg))
+ return true;
return percpu_ref_tryget(&objcg->refcnt);
}
-static inline void obj_cgroup_get(struct obj_cgroup *objcg)
+static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
+ unsigned long nr)
{
- percpu_ref_get(&objcg->refcnt);
+ if (!obj_cgroup_is_root(objcg))
+ percpu_ref_get_many(&objcg->refcnt, nr);
}
-static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
- unsigned long nr)
+static inline void obj_cgroup_get(struct obj_cgroup *objcg)
{
- percpu_ref_get_many(&objcg->refcnt, nr);
+ obj_cgroup_get_many(objcg, 1);
}
static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
- if (objcg)
+ if (objcg && !obj_cgroup_is_root(objcg))
percpu_ref_put(&objcg->refcnt);
}
@@ -885,7 +867,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
return match;
}
-struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio);
+struct cgroup_subsys_state *get_mem_cgroup_css_from_folio(struct folio *folio);
ino_t page_cgroup_ino(struct page *page);
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
@@ -896,7 +878,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
}
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
- int zid, int nr_pages);
+ int zid, long nr_pages);
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
@@ -966,10 +948,15 @@ void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
static inline void count_memcg_folio_events(struct folio *folio,
enum vm_event_item idx, unsigned long nr)
{
- struct mem_cgroup *memcg = folio_memcg(folio);
+ struct mem_cgroup *memcg;
- if (memcg)
- count_memcg_events(memcg, idx, nr);
+ if (!folio_memcg_charged(folio))
+ return;
+
+ rcu_read_lock();
+ memcg = folio_memcg(folio);
+ count_memcg_events(memcg, idx, nr);
+ rcu_read_unlock();
}
static inline void count_memcg_events_mm(struct mm_struct *mm,
@@ -1087,6 +1074,11 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
return true;
}
+static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg)
+{
+ return true;
+}
+
static inline bool mem_cgroup_disabled(void)
{
return true;
@@ -1179,11 +1171,6 @@ static inline struct lruvec *folio_lruvec(struct folio *folio)
return &pgdat->__lruvec;
}
-static inline
-void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
-{
-}
-
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
return NULL;
@@ -1242,6 +1229,7 @@ static inline struct lruvec *folio_lruvec_lock(struct folio *folio)
{
struct pglist_data *pgdat = folio_pgdat(folio);
+ rcu_read_lock();
spin_lock(&pgdat->__lruvec.lru_lock);
return &pgdat->__lruvec;
}
@@ -1250,6 +1238,7 @@ static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
{
struct pglist_data *pgdat = folio_pgdat(folio);
+ rcu_read_lock();
spin_lock_irq(&pgdat->__lruvec.lru_lock);
return &pgdat->__lruvec;
}
@@ -1259,6 +1248,7 @@ static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
{
struct pglist_data *pgdat = folio_pgdat(folio);
+ rcu_read_lock();
spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
return &pgdat->__lruvec;
}
@@ -1479,20 +1469,28 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
}
-static inline void unlock_page_lruvec(struct lruvec *lruvec)
+static inline void lruvec_lock_irq(struct lruvec *lruvec)
+{
+ rcu_read_lock();
+ spin_lock_irq(&lruvec->lru_lock);
+}
+
+static inline void lruvec_unlock(struct lruvec *lruvec)
{
spin_unlock(&lruvec->lru_lock);
+ rcu_read_unlock();
}
-static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
+static inline void lruvec_unlock_irq(struct lruvec *lruvec)
{
spin_unlock_irq(&lruvec->lru_lock);
+ rcu_read_unlock();
}
-static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
- unsigned long flags)
+static inline void lruvec_unlock_irqrestore(struct lruvec *lruvec, unsigned long flags)
{
spin_unlock_irqrestore(&lruvec->lru_lock, flags);
+ rcu_read_unlock();
}
/* Test requires a stable folio->memcg binding, see folio_memcg() */
@@ -1511,7 +1509,7 @@ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio,
if (folio_matches_lruvec(folio, locked_lruvec))
return locked_lruvec;
- unlock_page_lruvec_irq(locked_lruvec);
+ lruvec_unlock_irq(locked_lruvec);
}
return folio_lruvec_lock_irq(folio);
@@ -1525,7 +1523,7 @@ static inline void folio_lruvec_relock_irqsave(struct folio *folio,
if (folio_matches_lruvec(folio, *lruvecp))
return;
- unlock_page_lruvec_irqrestore(*lruvecp, *flags);
+ lruvec_unlock_irqrestore(*lruvecp, *flags);
}
*lruvecp = folio_lruvec_lock_irqsave(folio, flags);
@@ -1549,9 +1547,14 @@ static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
if (mem_cgroup_disabled())
return;
+ if (!folio_memcg_charged(folio))
+ return;
+
+ rcu_read_lock();
memcg = folio_memcg(folio);
- if (unlikely(memcg && &memcg->css != wb->memcg_css))
+ if (unlikely(&memcg->css != wb->memcg_css))
mem_cgroup_track_foreign_dirty_slowpath(folio, wb);
+ rcu_read_unlock();
}
void mem_cgroup_flush_foreign(struct bdi_writeback *wb);
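
Taken together, a hedged caller-side sketch of the reworked rules
(illustrative only; 'folios' is a caller-private list). folio_memcg() now
resolves through the folio's objcg, so the memcg must be accessed under RCU,
and the renamed lruvec_unlock*() helpers drop the RCU read lock taken when
the lruvec was locked:

static void example_count_event(struct folio *folio, enum vm_event_item idx)
{
	if (!folio_memcg_charged(folio))
		return;

	/* RCU keeps the (possibly reparented) memcg alive while in use. */
	rcu_read_lock();
	count_memcg_events(folio_memcg(folio), idx, 1);
	rcu_read_unlock();
}

static void example_isolate_list(struct list_head *folios)
{
	struct lruvec *lruvec = NULL;
	struct folio *folio, *next;

	/* Lock batching: relock only when a folio is on another lruvec. */
	list_for_each_entry_safe(folio, next, folios, lru) {
		lruvec = folio_lruvec_relock_irq(folio, lruvec);
		lruvec_del_folio(lruvec, folio);
	}
	if (lruvec)
		lruvec_unlock_irq(lruvec);
}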
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 255e0f50ea32..0b776907152e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -758,6 +758,8 @@ struct vm_fault {
*/
};
+struct vm_uffd_ops;
+
/*
* These are the virtual MM functions - opening of an area, closing and
* unmapping it (needed to keep files on disk up-to-date etc), pointer
@@ -865,6 +867,9 @@ struct vm_operations_struct {
struct page *(*find_normal_page)(struct vm_area_struct *vma,
unsigned long addr);
#endif /* CONFIG_FIND_NORMAL_PAGE */
+#ifdef CONFIG_USERFAULTFD
+ const struct vm_uffd_ops *uffd_ops;
+#endif
};
#ifdef CONFIG_NUMA_BALANCING
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 7fc2ced00f8f..a171070e15f0 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -348,6 +348,8 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
+
if (lru_gen_add_folio(lruvec, folio, false))
return;
@@ -362,6 +364,8 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
+
if (lru_gen_add_folio(lruvec, folio, true))
return;
@@ -376,6 +380,8 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
+
if (lru_gen_del_folio(lruvec, folio, false))
return;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3bcdda226a91..9adb2ad21da5 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -694,6 +694,9 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg);
void lru_gen_offline_memcg(struct mem_cgroup *memcg);
void lru_gen_release_memcg(struct mem_cgroup *memcg);
void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid);
+void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid);
+bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid);
+void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid);
#else /* !CONFIG_LRU_GEN */
@@ -735,6 +738,20 @@ static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
{
}
+static inline void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid)
+{
+}
+
+static inline bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid)
+{
+ return true;
+}
+
+static inline
+void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid)
+{
+}
+
#endif /* CONFIG_LRU_GEN */
struct lruvec {
@@ -2053,21 +2070,16 @@ static inline struct mem_section *__nr_to_section(unsigned long nr)
extern size_t mem_section_usage_size(void);
/*
- * We use the lower bits of the mem_map pointer to store
- * a little bit of information. The pointer is calculated
- * as mem_map - section_nr_to_pfn(pnum). The result is
- * aligned to the minimum alignment of the two values:
- * 1. All mem_map arrays are page-aligned.
- * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT
- * lowest bits. PFN_SECTION_SHIFT is arch-specific
- * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
- * worst combination is powerpc with 256k pages,
- * which results in PFN_SECTION_SHIFT equal 6.
- * To sum it up, at least 6 bits are available on all architectures.
- * However, we can exceed 6 bits on some other architectures except
- * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available
- * with the worst case of 64K pages on arm64) if we make sure the
- * exceeded bit is not applicable to powerpc.
+ * We use the lower bits of the mem_map pointer to store a little bit of
+ * information. The pointer is calculated as mem_map - section_nr_to_pfn().
+ * The result is aligned to the minimum alignment of the two values:
+ *
+ * 1. All mem_map arrays are page-aligned.
+ * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT lowest bits.
+ *
+ * We always expect a single section to cover full pages. Therefore,
+ * we can safely assume that PFN_SECTION_SHIFT is large enough to
+ * accommodate SECTION_MAP_LAST_BIT. We use BUILD_BUG_ON() to ensure this.
*/
enum {
SECTION_MARKED_PRESENT_BIT,
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index 38a82d65e58e..951d33362268 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -181,7 +181,7 @@ static inline struct alloc_tag *__pgalloc_tag_get(struct page *page)
if (get_page_tag_ref(page, &ref, &handle)) {
alloc_tag_sub_check(&ref);
- if (ref.ct)
+ if (ref.ct && !is_codetag_empty(&ref))
tag = ct_to_alloc_tag(ref.ct);
put_page_tag_ref(handle);
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 004e6d56a499..368c7b4d7cb5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1535,7 +1535,7 @@ struct task_struct {
/* Used by memcontrol for targeted memcg charge: */
struct mem_cgroup *active_memcg;
- /* Cache for current->cgroups->memcg->objcg lookups: */
+ /* Cache for current->cgroups->memcg->nodeinfo[nid]->objcg lookups: */
struct obj_cgroup *objcg;
#endif
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index f6a2d3402d76..93a0ba872ebe 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -221,20 +221,6 @@ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof)
extern bool shmem_charge(struct inode *inode, long pages);
-#ifdef CONFIG_USERFAULTFD
-#ifdef CONFIG_SHMEM
-extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- uffd_flags_t flags,
- struct folio **foliop);
-#else /* !CONFIG_SHMEM */
-#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \
- src_addr, flags, foliop) ({ BUG(); 0; })
-#endif /* CONFIG_SHMEM */
-#endif /* CONFIG_USERFAULTFD */
-
/*
* Used space is stored as unsigned 64-bit value in bytes but
* quota core supports only signed 64-bit values so use that
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4b1f13b5bbad..7a09df6977a5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -310,8 +310,7 @@ extern unsigned long totalreserve_pages;
/* linux/mm/swap.c */
void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
- unsigned int nr_io, unsigned int nr_rotated)
- __releases(lruvec->lru_lock);
+ unsigned int nr_io, unsigned int nr_rotated);
void lru_note_cost_refault(struct folio *);
void folio_add_lru(struct folio *);
void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
@@ -353,6 +352,7 @@ extern void swap_setup(void);
extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx);
#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
@@ -547,6 +547,8 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
return READ_ONCE(memcg->swappiness);
}
+
+void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid);
#else
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
{
@@ -611,5 +613,24 @@ static inline bool mem_cgroup_swap_full(struct folio *folio)
}
#endif
+/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
+ * and including the specified highidx
+ * @zone: The current zone in the iterator
+ * @pgdat: The pgdat which node_zones are being iterated
+ * @idx: The index variable
+ * @highidx: The index of the highest zone to return
+ *
+ * This macro iterates through all managed zones up to and including the specified highidx.
+ * The zone iterator enters an invalid state after macro call and must be reinitialized
+ * before it can be used again.
+ */
+#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \
+ for ((idx) = 0, (zone) = (pgdat)->node_zones; \
+ (idx) <= (highidx); \
+ (idx)++, (zone)++) \
+ if (!managed_zone(zone)) \
+ continue; \
+ else
+
#endif /* __KERNEL__*/
#endif /* _LINUX_SWAP_H */
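
A minimal usage sketch for the for_each_managed_zone_pgdat() iterator added
above (function name illustrative; note the doc's caveat that the zone
variable is unusable after the loop):

static unsigned long example_count_free(struct pglist_data *pgdat)
{
	unsigned long nr = 0;
	struct zone *zone;
	int idx;

	for_each_managed_zone_pgdat(zone, pgdat, idx, ZONE_MOVABLE)
		nr += zone_page_state(zone, NR_FREE_PAGES);

	return nr;
}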
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index d83e349900a3..d2920f98ab86 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -83,6 +83,39 @@ struct userfaultfd_ctx {
extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
+/* VMA userfaultfd operations */
+struct vm_uffd_ops {
+ /* Checks if a VMA can support userfaultfd */
+ bool (*can_userfault)(struct vm_area_struct *vma, vm_flags_t vm_flags);
+ /*
+ * Called to resolve UFFDIO_CONTINUE request.
+ * Should return the folio found at pgoff in the VMA's pagecache if it
+ * exists or ERR_PTR otherwise.
+ * The returned folio is locked and with reference held.
+ */
+ struct folio *(*get_folio_noalloc)(struct inode *inode, pgoff_t pgoff);
+ /*
+ * Called during resolution of UFFDIO_COPY request.
+ * Should allocate and return a folio or NULL if allocation fails.
+ */
+ struct folio *(*alloc_folio)(struct vm_area_struct *vma,
+ unsigned long addr);
+ /*
+ * Called during resolution of UFFDIO_COPY request.
+ * Should only be called with a folio returned by alloc_folio() above.
+ * The folio will be set to locked.
+ * Returns 0 on success, error code on failure.
+ */
+ int (*filemap_add)(struct folio *folio, struct vm_area_struct *vma,
+ unsigned long addr);
+ /*
+ * Called during resolution of UFFDIO_COPY request on the error
+ * handling path.
+ * Should revert the operation of ->filemap_add().
+ */
+ void (*filemap_remove)(struct folio *folio, struct vm_area_struct *vma);
+};
+
/* A combined operation mode + behavior flags. */
typedef unsigned int __bitwise uffd_flags_t;
@@ -114,11 +147,6 @@ static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_at
/* Flags controlling behavior. These behavior changes are mode-independent. */
#define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0)
-extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr, struct page *page,
- bool newly_allocated, uffd_flags_t flags);
-
extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len,
uffd_flags_t flags);
@@ -211,39 +239,8 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
return vma->vm_flags & __VM_UFFD_FLAGS;
}
-static inline bool vma_can_userfault(struct vm_area_struct *vma,
- vm_flags_t vm_flags,
- bool wp_async)
-{
- vm_flags &= __VM_UFFD_FLAGS;
-
- if (vma->vm_flags & VM_DROPPABLE)
- return false;
-
- if ((vm_flags & VM_UFFD_MINOR) &&
- (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
- return false;
-
- /*
- * If wp async enabled, and WP is the only mode enabled, allow any
- * memory type.
- */
- if (wp_async && (vm_flags == VM_UFFD_WP))
- return true;
-
- /*
- * If user requested uffd-wp but not enabled pte markers for
- * uffd-wp, then shmem & hugetlbfs are not supported but only
- * anonymous.
- */
- if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) &&
- !vma_is_anonymous(vma))
- return false;
-
- /* By default, allow any of anon|shmem|hugetlb */
- return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
- vma_is_shmem(vma);
-}
+bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags,
+ bool wp_async);
static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
{
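
A hedged sketch of how a mapping provider might wire up the new vm_uffd_ops
table (handlers and names are illustrative; shmem and hugetlb are the
natural in-tree candidates):

static bool example_can_userfault(struct vm_area_struct *vma,
				  vm_flags_t vm_flags)
{
	/* Accept only the userfaultfd modes this mapping type supports. */
	return vma_is_shmem(vma);
}

static const struct vm_uffd_ops example_uffd_ops = {
	.can_userfault	= example_can_userfault,
	/* .get_folio_noalloc, .alloc_folio, .filemap_add/_remove elided */
};

static const struct vm_operations_struct example_vm_ops = {
	.fault		= filemap_fault,
#ifdef CONFIG_USERFAULTFD
	.uffd_ops	= &example_uffd_ops,
#endif
};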
diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h
index dfe2f51019b4..51b62c5931fc 100644
--- a/include/trace/events/memcg.h
+++ b/include/trace/events/memcg.h
@@ -11,14 +11,14 @@
DECLARE_EVENT_CLASS(memcg_rstat_stats,
- TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+ TP_PROTO(struct mem_cgroup *memcg, int item, long val),
TP_ARGS(memcg, item, val),
TP_STRUCT__entry(
__field(u64, id)
__field(int, item)
- __field(int, val)
+ __field(long, val)
),
TP_fast_assign(
@@ -27,20 +27,20 @@ DECLARE_EVENT_CLASS(memcg_rstat_stats,
__entry->val = val;
),
- TP_printk("memcg_id=%llu item=%d val=%d",
+ TP_printk("memcg_id=%llu item=%d val=%ld",
__entry->id, __entry->item, __entry->val)
);
DEFINE_EVENT(memcg_rstat_stats, mod_memcg_state,
- TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+ TP_PROTO(struct mem_cgroup *memcg, int item, long val),
TP_ARGS(memcg, item, val)
);
DEFINE_EVENT(memcg_rstat_stats, mod_memcg_lruvec_state,
- TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+ TP_PROTO(struct mem_cgroup *memcg, int item, long val),
TP_ARGS(memcg, item, val)
);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index e5cd2b80fd29..bdac0d685a98 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -294,7 +294,10 @@ TRACE_EVENT(track_foreign_dirty,
__entry->ino = inode ? inode->i_ino : 0;
__entry->memcg_id = wb->memcg_css->id;
__entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
+
+ rcu_read_lock();
__entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup);
+ rcu_read_unlock();
),
TP_printk("bdi %s[%llu]: ino=%llu memcg_id=%u cgroup_ino=%llu page_cgroup_ino=%llu",