diff options
Diffstat (limited to 'include/linux')
| -rw-r--r-- | include/linux/alloc_tag.h | 2 | ||||
| -rw-r--r-- | include/linux/damon.h | 2 | ||||
| -rw-r--r-- | include/linux/fs.h | 9 | ||||
| -rw-r--r-- | include/linux/kexec_handover.h | 13 | ||||
| -rw-r--r-- | include/linux/kho/abi/kexec_handover.h | 20 | ||||
| -rw-r--r-- | include/linux/kho/abi/kexec_metadata.h | 46 | ||||
| -rw-r--r-- | include/linux/liveupdate.h | 17 | ||||
| -rw-r--r-- | include/linux/memcontrol.h | 191 | ||||
| -rw-r--r-- | include/linux/mm.h | 5 | ||||
| -rw-r--r-- | include/linux/mm_inline.h | 6 | ||||
| -rw-r--r-- | include/linux/mmzone.h | 42 | ||||
| -rw-r--r-- | include/linux/pgalloc_tag.h | 2 | ||||
| -rw-r--r-- | include/linux/sched.h | 2 | ||||
| -rw-r--r-- | include/linux/shmem_fs.h | 14 | ||||
| -rw-r--r-- | include/linux/swap.h | 25 | ||||
| -rw-r--r-- | include/linux/userfaultfd_k.h | 73 |
16 files changed, 278 insertions, 191 deletions
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index d40ac39bfbe8..02de2ede560f 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -163,9 +163,11 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref) { WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n"); } +void alloc_tag_add_early_pfn(unsigned long pfn); #else static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {} static inline void alloc_tag_sub_check(union codetag_ref *ref) {} +static inline void alloc_tag_add_early_pfn(unsigned long pfn) {} #endif /* Caller should verify both ref and tag to be valid */ diff --git a/include/linux/damon.h b/include/linux/damon.h index d9a3babbafc1..f2cdb7c3f5e6 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -818,9 +818,11 @@ struct damon_ctx { /* lists of &struct damon_call_control */ struct list_head call_controls; + bool call_controls_obsolete; struct mutex call_controls_lock; struct damos_walk_control *walk_control; + bool walk_control_obsolete; struct mutex walk_control_lock; /* diff --git a/include/linux/fs.h b/include/linux/fs.h index e1d257e6da68..11559c513dfb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2062,20 +2062,13 @@ void compat_set_desc_from_vma(struct vm_area_desc *desc, const struct file *file const struct vm_area_struct *vma); int __compat_vma_mmap(struct vm_area_desc *desc, struct vm_area_struct *vma); int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); -int __vma_check_mmap_hook(struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { - int err; - if (file->f_op->mmap_prepare) return compat_vma_mmap(file, vma); - err = file->f_op->mmap(file, vma); - if (err) - return err; - - return __vma_check_mmap_hook(vma); + return file->f_op->mmap(file, vma); } static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc) diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index ac4129d1d741..8968c56d2d73 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -32,9 +32,9 @@ void kho_restore_free(void *mem); struct folio *kho_restore_folio(phys_addr_t phys); struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); -int kho_add_subtree(const char *name, void *fdt); -void kho_remove_subtree(void *fdt); -int kho_retrieve_subtree(const char *name, phys_addr_t *phys); +int kho_add_subtree(const char *name, void *blob, size_t size); +void kho_remove_subtree(void *blob); +int kho_retrieve_subtree(const char *name, phys_addr_t *phys, size_t *size); void kho_memory_init(void); @@ -97,14 +97,15 @@ static inline void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) return NULL; } -static inline int kho_add_subtree(const char *name, void *fdt) +static inline int kho_add_subtree(const char *name, void *blob, size_t size) { return -EOPNOTSUPP; } -static inline void kho_remove_subtree(void *fdt) { } +static inline void kho_remove_subtree(void *blob) { } -static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) +static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys, + size_t *size) { return -EOPNOTSUPP; } diff --git a/include/linux/kho/abi/kexec_handover.h b/include/linux/kho/abi/kexec_handover.h index 6b7d8ef550f9..7e847a2339b0 100644 --- a/include/linux/kho/abi/kexec_handover.h +++ b/include/linux/kho/abi/kexec_handover.h @@ -41,25 +41,28 @@ * restore the preserved data.:: * * / { - * compatible = "kho-v2"; + * compatible = "kho-v3"; * * preserved-memory-map = <0x...>; * * <subnode-name-1> { * preserved-data = <0x...>; + * blob-size = <0x...>; * }; * * <subnode-name-2> { * preserved-data = <0x...>; + * blob-size = <0x...>; * }; * ... ... * <subnode-name-N> { * preserved-data = <0x...>; + * blob-size = <0x...>; * }; * }; * * Root KHO Node (/): - * - compatible: "kho-v2" + * - compatible: "kho-v3" * * Indentifies the overall KHO ABI version. * @@ -78,16 +81,25 @@ * * Physical address pointing to a subnode data blob that is also * being preserved. + * + * - blob-size: u64 + * + * Size in bytes of the preserved data blob. This is needed because + * blobs may use arbitrary formats (not just FDT), so the size + * cannot be determined from the blob content alone. */ /* The compatible string for the KHO FDT root node. */ -#define KHO_FDT_COMPATIBLE "kho-v2" +#define KHO_FDT_COMPATIBLE "kho-v3" /* The FDT property for the preserved memory map. */ #define KHO_FDT_MEMORY_MAP_PROP_NAME "preserved-memory-map" /* The FDT property for preserved data blobs. */ -#define KHO_FDT_SUB_TREE_PROP_NAME "preserved-data" +#define KHO_SUB_TREE_PROP_NAME "preserved-data" + +/* The FDT property for the size of preserved data blobs. */ +#define KHO_SUB_TREE_SIZE_PROP_NAME "blob-size" /** * DOC: Kexec Handover ABI for vmalloc Preservation diff --git a/include/linux/kho/abi/kexec_metadata.h b/include/linux/kho/abi/kexec_metadata.h new file mode 100644 index 000000000000..e9e3f7e38a7c --- /dev/null +++ b/include/linux/kho/abi/kexec_metadata.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +/** + * DOC: Kexec Metadata ABI + * + * The "kexec-metadata" subtree stores optional metadata about the kexec chain. + * It is registered via kho_add_subtree(), keeping it independent from the core + * KHO ABI. This allows the metadata format to evolve without affecting other + * KHO consumers. + * + * The metadata is stored as a plain C struct rather than FDT format for + * simplicity and direct field access. + * + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2026 Breno Leitao <leitao@debian.org> + */ + +#ifndef _LINUX_KHO_ABI_KEXEC_METADATA_H +#define _LINUX_KHO_ABI_KEXEC_METADATA_H + +#include <linux/types.h> +#include <linux/utsname.h> + +#define KHO_KEXEC_METADATA_VERSION 1 + +/** + * struct kho_kexec_metadata - Kexec metadata passed between kernels + * @version: ABI version of this struct (must be first field) + * @previous_release: Kernel version string that initiated the kexec + * @kexec_count: Number of kexec boots since last cold boot + * + * This structure is preserved across kexec and allows the new kernel to + * identify which kernel it was booted from and how many kexec reboots + * have occurred. + * + * __NEW_UTS_LEN is part of uABI, so it safe to use it in here. + */ +struct kho_kexec_metadata { + u32 version; + char previous_release[__NEW_UTS_LEN + 1]; + u32 kexec_count; +} __packed; + +#define KHO_METADATA_NODE_NAME "kexec-metadata" + +#endif /* _LINUX_KHO_ABI_KEXEC_METADATA_H */ diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index dd11fdc76a5f..30c5a39ff9e9 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -12,6 +12,7 @@ #include <linux/kho/abi/luo.h> #include <linux/list.h> #include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/types.h> #include <uapi/linux/liveupdate.h> @@ -63,6 +64,7 @@ struct liveupdate_file_op_args { * finish, in order to do successful finish calls for all * resources in the session. * @finish: Required. Final cleanup in the new kernel. + * @get_id: Optional. Returns a unique identifier for the file. * @owner: Module reference * * All operations (except can_preserve) receive a pointer to a @@ -78,6 +80,7 @@ struct liveupdate_file_ops { int (*retrieve)(struct liveupdate_file_op_args *args); bool (*can_finish)(struct liveupdate_file_op_args *args); void (*finish)(struct liveupdate_file_op_args *args); + unsigned long (*get_id)(struct file *file); struct module *owner; }; @@ -228,12 +231,12 @@ bool liveupdate_enabled(void); int liveupdate_reboot(void); int liveupdate_register_file_handler(struct liveupdate_file_handler *fh); -int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh); +void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh); int liveupdate_register_flb(struct liveupdate_file_handler *fh, struct liveupdate_flb *flb); -int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, - struct liveupdate_flb *flb); +void liveupdate_unregister_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb); int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp); int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp); @@ -255,9 +258,8 @@ static inline int liveupdate_register_file_handler(struct liveupdate_file_handle return -EOPNOTSUPP; } -static inline int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) +static inline void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) { - return -EOPNOTSUPP; } static inline int liveupdate_register_flb(struct liveupdate_file_handler *fh, @@ -266,10 +268,9 @@ static inline int liveupdate_register_flb(struct liveupdate_file_handler *fh, return -EOPNOTSUPP; } -static inline int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, - struct liveupdate_flb *flb) +static inline void liveupdate_unregister_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb) { - return -EOPNOTSUPP; } static inline int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5173a9f16721..dc3fa687759b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -115,6 +115,16 @@ struct mem_cgroup_per_node { unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter; + /* + * objcg is wiped out as a part of the objcg repaprenting process. + * orig_objcg preserves a pointer (and a reference) to the original + * objcg until the end of live of memcg. + */ + struct obj_cgroup __rcu *objcg; + struct obj_cgroup *orig_objcg; + /* list of inherited objcgs, protected by objcg_lock */ + struct list_head objcg_list; + #ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC /* slab stats for nmi context */ atomic_t slab_reclaimable; @@ -179,6 +189,7 @@ struct obj_cgroup { struct list_head list; /* protected by objcg_lock */ struct rcu_head rcu; }; + bool is_root; }; /* @@ -257,15 +268,6 @@ struct mem_cgroup { seqlock_t socket_pressure_seqlock; #endif int kmemcg_id; - /* - * memcg->objcg is wiped out as a part of the objcg repaprenting - * process. memcg->orig_objcg preserves a pointer (and a reference) - * to the original objcg until the end of live of memcg. - */ - struct obj_cgroup __rcu *objcg; - struct obj_cgroup *orig_objcg; - /* list of inherited objcgs, protected by objcg_lock */ - struct list_head objcg_list; struct memcg_vmstats_percpu __percpu *vmstats_percpu; @@ -367,9 +369,6 @@ enum objext_flags { #define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1) #ifdef CONFIG_MEMCG - -static inline bool folio_memcg_kmem(struct folio *folio); - /* * After the initialization objcg->memcg is always pointing at * a valid memcg, but can be atomically swapped to the parent memcg. @@ -383,43 +382,19 @@ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) } /* - * __folio_memcg - Get the memory cgroup associated with a non-kmem folio - * @folio: Pointer to the folio. - * - * Returns a pointer to the memory cgroup associated with the folio, - * or NULL. This function assumes that the folio is known to have a - * proper memory cgroup pointer. It's not safe to call this function - * against some type of folios, e.g. slab folios or ex-slab folios or - * kmem folios. - */ -static inline struct mem_cgroup *__folio_memcg(struct folio *folio) -{ - unsigned long memcg_data = folio->memcg_data; - - VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); - VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); - VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio); - - return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); -} - -/* - * __folio_objcg - get the object cgroup associated with a kmem folio. + * folio_objcg - get the object cgroup associated with a folio. * @folio: Pointer to the folio. * * Returns a pointer to the object cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a - * proper object cgroup pointer. It's not safe to call this function - * against some type of folios, e.g. slab folios or ex-slab folios or - * LRU folios. + * proper object cgroup pointer. */ -static inline struct obj_cgroup *__folio_objcg(struct folio *folio) +static inline struct obj_cgroup *folio_objcg(struct folio *folio) { unsigned long memcg_data = folio->memcg_data; VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); - VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio); return (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } @@ -433,21 +408,30 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) * proper memory cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios. * - * For a non-kmem folio any of the following ensures folio and memcg binding - * stability: + * For a folio any of the following ensures folio and objcg binding stability: * * - the folio lock * - LRU isolation * - exclusive reference * - * For a kmem folio a caller should hold an rcu read lock to protect memcg - * associated with a kmem folio from being released. + * Based on the stable binding of folio and objcg, for a folio any of the + * following ensures folio and memcg binding stability: + * + * - cgroup_mutex + * - the lruvec lock + * + * If the caller only want to ensure that the page counters of memcg are + * updated correctly, ensure that the binding stability of folio and objcg + * is sufficient. + * + * Note: The caller should hold an rcu read lock or cgroup_mutex to protect + * memcg associated with a folio from being released. */ static inline struct mem_cgroup *folio_memcg(struct folio *folio) { - if (folio_memcg_kmem(folio)) - return obj_cgroup_memcg(__folio_objcg(folio)); - return __folio_memcg(folio); + struct obj_cgroup *objcg = folio_objcg(folio); + + return objcg ? obj_cgroup_memcg(objcg) : NULL; } /* @@ -471,15 +455,10 @@ static inline bool folio_memcg_charged(struct folio *folio) * has an associated memory cgroup pointer or an object cgroups vector or * an object cgroup. * - * For a non-kmem folio any of the following ensures folio and memcg binding - * stability: + * The page and objcg or memcg binding rules can refer to folio_memcg(). * - * - the folio lock - * - LRU isolation - * - exclusive reference - * - * For a kmem folio a caller should hold an rcu read lock to protect memcg - * associated with a kmem folio from being released. + * A caller should hold an rcu read lock to protect memcg associated with a + * page from being released. */ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { @@ -488,18 +467,14 @@ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) * for slabs, READ_ONCE() should be used here. */ unsigned long memcg_data = READ_ONCE(folio->memcg_data); + struct obj_cgroup *objcg; if (memcg_data & MEMCG_DATA_OBJEXTS) return NULL; - if (memcg_data & MEMCG_DATA_KMEM) { - struct obj_cgroup *objcg; + objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - return obj_cgroup_memcg(objcg); - } - - return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); + return objcg ? obj_cgroup_memcg(objcg) : NULL; } static inline struct mem_cgroup *page_memcg_check(struct page *page) @@ -548,6 +523,11 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return (memcg == root_mem_cgroup); } +static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg) +{ + return objcg->is_root; +} + static inline bool mem_cgroup_disabled(void) { return !cgroup_subsys_enabled(memory_cgrp_subsys); @@ -735,7 +715,15 @@ out: * folio_lruvec - return lruvec for isolating/putting an LRU folio * @folio: Pointer to the folio. * - * This function relies on folio->mem_cgroup being stable. + * Call with rcu_read_lock() held to ensure the lifetime of the returned lruvec. + * Note that this alone will NOT guarantee the stability of the folio->lruvec + * association; the folio can be reparented to an ancestor if this races with + * cgroup deletion. + * + * Use folio_lruvec_lock() to ensure both lifetime and stability of the binding. + * Once a lruvec is locked, folio_lruvec() can be called on other folios, and + * their binding is stable if the returned lruvec matches the one the caller has + * locked. Useful for lock batching. */ static inline struct lruvec *folio_lruvec(struct folio *folio) { @@ -758,15 +746,6 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio); struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags); -#ifdef CONFIG_DEBUG_VM -void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio); -#else -static inline -void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) -{ -} -#endif - static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; @@ -774,23 +753,26 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg) { + if (obj_cgroup_is_root(objcg)) + return true; return percpu_ref_tryget(&objcg->refcnt); } -static inline void obj_cgroup_get(struct obj_cgroup *objcg) +static inline void obj_cgroup_get_many(struct obj_cgroup *objcg, + unsigned long nr) { - percpu_ref_get(&objcg->refcnt); + if (!obj_cgroup_is_root(objcg)) + percpu_ref_get_many(&objcg->refcnt, nr); } -static inline void obj_cgroup_get_many(struct obj_cgroup *objcg, - unsigned long nr) +static inline void obj_cgroup_get(struct obj_cgroup *objcg) { - percpu_ref_get_many(&objcg->refcnt, nr); + obj_cgroup_get_many(objcg, 1); } static inline void obj_cgroup_put(struct obj_cgroup *objcg) { - if (objcg) + if (objcg && !obj_cgroup_is_root(objcg)) percpu_ref_put(&objcg->refcnt); } @@ -885,7 +867,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, return match; } -struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio); +struct cgroup_subsys_state *get_mem_cgroup_css_from_folio(struct folio *folio); ino_t page_cgroup_ino(struct page *page); static inline bool mem_cgroup_online(struct mem_cgroup *memcg) @@ -896,7 +878,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) } void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int zid, int nr_pages); + int zid, long nr_pages); static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, @@ -966,10 +948,15 @@ void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) { - struct mem_cgroup *memcg = folio_memcg(folio); + struct mem_cgroup *memcg; - if (memcg) - count_memcg_events(memcg, idx, nr); + if (!folio_memcg_charged(folio)) + return; + + rcu_read_lock(); + memcg = folio_memcg(folio); + count_memcg_events(memcg, idx, nr); + rcu_read_unlock(); } static inline void count_memcg_events_mm(struct mm_struct *mm, @@ -1087,6 +1074,11 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return true; } +static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg) +{ + return true; +} + static inline bool mem_cgroup_disabled(void) { return true; @@ -1179,11 +1171,6 @@ static inline struct lruvec *folio_lruvec(struct folio *folio) return &pgdat->__lruvec; } -static inline -void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) -{ -} - static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return NULL; @@ -1242,6 +1229,7 @@ static inline struct lruvec *folio_lruvec_lock(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); + rcu_read_lock(); spin_lock(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } @@ -1250,6 +1238,7 @@ static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); + rcu_read_lock(); spin_lock_irq(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } @@ -1259,6 +1248,7 @@ static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, { struct pglist_data *pgdat = folio_pgdat(folio); + rcu_read_lock(); spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); return &pgdat->__lruvec; } @@ -1479,20 +1469,28 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); } -static inline void unlock_page_lruvec(struct lruvec *lruvec) +static inline void lruvec_lock_irq(struct lruvec *lruvec) +{ + rcu_read_lock(); + spin_lock_irq(&lruvec->lru_lock); +} + +static inline void lruvec_unlock(struct lruvec *lruvec) { spin_unlock(&lruvec->lru_lock); + rcu_read_unlock(); } -static inline void unlock_page_lruvec_irq(struct lruvec *lruvec) +static inline void lruvec_unlock_irq(struct lruvec *lruvec) { spin_unlock_irq(&lruvec->lru_lock); + rcu_read_unlock(); } -static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, - unsigned long flags) +static inline void lruvec_unlock_irqrestore(struct lruvec *lruvec, unsigned long flags) { spin_unlock_irqrestore(&lruvec->lru_lock, flags); + rcu_read_unlock(); } /* Test requires a stable folio->memcg binding, see folio_memcg() */ @@ -1511,7 +1509,7 @@ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, if (folio_matches_lruvec(folio, locked_lruvec)) return locked_lruvec; - unlock_page_lruvec_irq(locked_lruvec); + lruvec_unlock_irq(locked_lruvec); } return folio_lruvec_lock_irq(folio); @@ -1525,7 +1523,7 @@ static inline void folio_lruvec_relock_irqsave(struct folio *folio, if (folio_matches_lruvec(folio, *lruvecp)) return; - unlock_page_lruvec_irqrestore(*lruvecp, *flags); + lruvec_unlock_irqrestore(*lruvecp, *flags); } *lruvecp = folio_lruvec_lock_irqsave(folio, flags); @@ -1549,9 +1547,14 @@ static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, if (mem_cgroup_disabled()) return; + if (!folio_memcg_charged(folio)) + return; + + rcu_read_lock(); memcg = folio_memcg(folio); - if (unlikely(memcg && &memcg->css != wb->memcg_css)) + if (unlikely(&memcg->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(folio, wb); + rcu_read_unlock(); } void mem_cgroup_flush_foreign(struct bdi_writeback *wb); diff --git a/include/linux/mm.h b/include/linux/mm.h index 255e0f50ea32..0b776907152e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -758,6 +758,8 @@ struct vm_fault { */ }; +struct vm_uffd_ops; + /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer @@ -865,6 +867,9 @@ struct vm_operations_struct { struct page *(*find_normal_page)(struct vm_area_struct *vma, unsigned long addr); #endif /* CONFIG_FIND_NORMAL_PAGE */ +#ifdef CONFIG_USERFAULTFD + const struct vm_uffd_ops *uffd_ops; +#endif }; #ifdef CONFIG_NUMA_BALANCING diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 7fc2ced00f8f..a171070e15f0 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -348,6 +348,8 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); + if (lru_gen_add_folio(lruvec, folio, false)) return; @@ -362,6 +364,8 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); + if (lru_gen_add_folio(lruvec, folio, true)) return; @@ -376,6 +380,8 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); + VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); + if (lru_gen_del_folio(lruvec, folio, false)) return; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3bcdda226a91..9adb2ad21da5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -694,6 +694,9 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg); void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid); +void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid); +bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid); +void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid); #else /* !CONFIG_LRU_GEN */ @@ -735,6 +738,20 @@ static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { } +static inline void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid) +{ +} + +static inline bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid) +{ + return true; +} + +static inline +void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid) +{ +} + #endif /* CONFIG_LRU_GEN */ struct lruvec { @@ -2053,21 +2070,16 @@ static inline struct mem_section *__nr_to_section(unsigned long nr) extern size_t mem_section_usage_size(void); /* - * We use the lower bits of the mem_map pointer to store - * a little bit of information. The pointer is calculated - * as mem_map - section_nr_to_pfn(pnum). The result is - * aligned to the minimum alignment of the two values: - * 1. All mem_map arrays are page-aligned. - * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT - * lowest bits. PFN_SECTION_SHIFT is arch-specific - * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the - * worst combination is powerpc with 256k pages, - * which results in PFN_SECTION_SHIFT equal 6. - * To sum it up, at least 6 bits are available on all architectures. - * However, we can exceed 6 bits on some other architectures except - * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available - * with the worst case of 64K pages on arm64) if we make sure the - * exceeded bit is not applicable to powerpc. + * We use the lower bits of the mem_map pointer to store a little bit of + * information. The pointer is calculated as mem_map - section_nr_to_pfn(). + * The result is aligned to the minimum alignment of the two values: + * + * 1. All mem_map arrays are page-aligned. + * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT lowest bits. + * + * We always expect a single section to cover full pages. Therefore, + * we can safely assume that PFN_SECTION_SHIFT is large enough to + * accommodate SECTION_MAP_LAST_BIT. We use BUILD_BUG_ON() to ensure this. */ enum { SECTION_MARKED_PRESENT_BIT, diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 38a82d65e58e..951d33362268 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -181,7 +181,7 @@ static inline struct alloc_tag *__pgalloc_tag_get(struct page *page) if (get_page_tag_ref(page, &ref, &handle)) { alloc_tag_sub_check(&ref); - if (ref.ct) + if (ref.ct && !is_codetag_empty(&ref)) tag = ct_to_alloc_tag(ref.ct); put_page_tag_ref(handle); } diff --git a/include/linux/sched.h b/include/linux/sched.h index 004e6d56a499..368c7b4d7cb5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1535,7 +1535,7 @@ struct task_struct { /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; - /* Cache for current->cgroups->memcg->objcg lookups: */ + /* Cache for current->cgroups->memcg->nodeinfo[nid]->objcg lookups: */ struct obj_cgroup *objcg; #endif diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f6a2d3402d76..93a0ba872ebe 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -221,20 +221,6 @@ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) extern bool shmem_charge(struct inode *inode, long pages); -#ifdef CONFIG_USERFAULTFD -#ifdef CONFIG_SHMEM -extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - uffd_flags_t flags, - struct folio **foliop); -#else /* !CONFIG_SHMEM */ -#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \ - src_addr, flags, foliop) ({ BUG(); 0; }) -#endif /* CONFIG_SHMEM */ -#endif /* CONFIG_USERFAULTFD */ - /* * Used space is stored as unsigned 64-bit value in bytes but * quota core supports only signed 64-bit values so use that diff --git a/include/linux/swap.h b/include/linux/swap.h index 4b1f13b5bbad..7a09df6977a5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -310,8 +310,7 @@ extern unsigned long totalreserve_pages; /* linux/mm/swap.c */ void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, - unsigned int nr_io, unsigned int nr_rotated) - __releases(lruvec->lru_lock); + unsigned int nr_io, unsigned int nr_rotated); void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); @@ -353,6 +352,7 @@ extern void swap_setup(void); extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx); #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) @@ -547,6 +547,8 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) return READ_ONCE(memcg->swappiness); } + +void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid); #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) { @@ -611,5 +613,24 @@ static inline bool mem_cgroup_swap_full(struct folio *folio) } #endif +/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to + * and including the specified highidx + * @zone: The current zone in the iterator + * @pgdat: The pgdat which node_zones are being iterated + * @idx: The index variable + * @highidx: The index of the highest zone to return + * + * This macro iterates through all managed zones up to and including the specified highidx. + * The zone iterator enters an invalid state after macro call and must be reinitialized + * before it can be used again. + */ +#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \ + for ((idx) = 0, (zone) = (pgdat)->node_zones; \ + (idx) <= (highidx); \ + (idx)++, (zone)++) \ + if (!managed_zone(zone)) \ + continue; \ + else + #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index d83e349900a3..d2920f98ab86 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -83,6 +83,39 @@ struct userfaultfd_ctx { extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); +/* VMA userfaultfd operations */ +struct vm_uffd_ops { + /* Checks if a VMA can support userfaultfd */ + bool (*can_userfault)(struct vm_area_struct *vma, vm_flags_t vm_flags); + /* + * Called to resolve UFFDIO_CONTINUE request. + * Should return the folio found at pgoff in the VMA's pagecache if it + * exists or ERR_PTR otherwise. + * The returned folio is locked and with reference held. + */ + struct folio *(*get_folio_noalloc)(struct inode *inode, pgoff_t pgoff); + /* + * Called during resolution of UFFDIO_COPY request. + * Should allocate and return a folio or NULL if allocation fails. + */ + struct folio *(*alloc_folio)(struct vm_area_struct *vma, + unsigned long addr); + /* + * Called during resolution of UFFDIO_COPY request. + * Should only be called with a folio returned by alloc_folio() above. + * The folio will be set to locked. + * Returns 0 on success, error code on failure. + */ + int (*filemap_add)(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr); + /* + * Called during resolution of UFFDIO_COPY request on the error + * handling path. + * Should revert the operation of ->filemap_add(). + */ + void (*filemap_remove)(struct folio *folio, struct vm_area_struct *vma); +}; + /* A combined operation mode + behavior flags. */ typedef unsigned int __bitwise uffd_flags_t; @@ -114,11 +147,6 @@ static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_at /* Flags controlling behavior. These behavior changes are mode-independent. */ #define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0) -extern int mfill_atomic_install_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, struct page *page, - bool newly_allocated, uffd_flags_t flags); - extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, uffd_flags_t flags); @@ -211,39 +239,8 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return vma->vm_flags & __VM_UFFD_FLAGS; } -static inline bool vma_can_userfault(struct vm_area_struct *vma, - vm_flags_t vm_flags, - bool wp_async) -{ - vm_flags &= __VM_UFFD_FLAGS; - - if (vma->vm_flags & VM_DROPPABLE) - return false; - - if ((vm_flags & VM_UFFD_MINOR) && - (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) - return false; - - /* - * If wp async enabled, and WP is the only mode enabled, allow any - * memory type. - */ - if (wp_async && (vm_flags == VM_UFFD_WP)) - return true; - - /* - * If user requested uffd-wp but not enabled pte markers for - * uffd-wp, then shmem & hugetlbfs are not supported but only - * anonymous. - */ - if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && - !vma_is_anonymous(vma)) - return false; - - /* By default, allow any of anon|shmem|hugetlb */ - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || - vma_is_shmem(vma); -} +bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, + bool wp_async); static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) { |
