author     Linus Torvalds <torvalds@linux-foundation.org>  2026-04-19 08:01:17 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2026-04-19 08:01:17 -0700
commit     40735a683bf844a453d7a0f91e5e3daa0abc659b (patch)
tree       e6012cae8a2d6fc4195dba178b7eec7c5ab4362c
parent     faeab166167f5787719eb8683661fd41a3bb1514 (diff)
parent     0b5e8d7999076ac3c490fc18376a404e2626abff (diff)
Merge tag 'mm-stable-2026-04-18-02-14' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull more MM updates from Andrew Morton:

 - "Eliminate Dying Memory Cgroup" (Qi Zheng and Muchun Song)

   Address the longstanding "dying memcg problem" - a situation wherein a
   no-longer-used memory control group will hang around for an extended
   period, pointlessly consuming memory

 - "fix unexpected type conversions and potential overflows" (Qi Zheng)

   Fix a couple of potential 32-bit/64-bit issues which were identified
   during review of the "Eliminate Dying Memory Cgroup" series

 - "kho: history: track previous kernel version and kexec boot count"
   (Breno Leitao)

   Use Kexec Handover (KHO) to pass the previous kernel's version string
   and the number of kexec reboots since the last cold boot to the next
   kernel, and print it at boot time

 - "liveupdate: prevent double preservation" (Pasha Tatashin)

   Teach LUO to avoid managing the same file across different active
   sessions

 - "liveupdate: Fix module unloading and unregister API" (Pasha Tatashin)

   Address an issue with how LUO handles module reference counting and
   unregistration during module unloading

 - "zswap pool per-CPU acomp_ctx simplifications" (Kanchana Sridhar)

   Simplify and clean up the zswap crypto compression handling and
   improve the lifecycle management of the zswap pool's per-CPU acomp_ctx
   resources

 - "mm/damon/core: fix damon_call()/damos_walk() vs kdmond exit race"
   (SeongJae Park)

   Address unlikely but possible leaks and deadlocks in damon_call() and
   damos_walk()

 - "mm/damon/core: validate damos_quota_goal->nid" (SeongJae Park)

   Fix a couple of root-only wild pointer dereferences

 - "Docs/admin-guide/mm/damon: warn commit_inputs vs other params race"
   (SeongJae Park)

   Update the DAMON documentation to warn operators about potential races
   which can occur if the commit_inputs parameter is altered at the wrong
   time

 - "Minor hmm_test fixes and cleanups" (Alistair Popple)

   Bugfixes and a cleanup for the HMM kernel selftests

 - "Modify memfd_luo code" (Chenghao Duan)

   Cleanups, simplifications and speedups to the memfd_luo code

 - "mm, kvm: allow uffd support in guest_memfd" (Mike Rapoport)

   Support for userfaultfd in guest_memfd

 - "selftests/mm: skip several tests when thp is not available"
   (Chunyu Hu)

   Fix several issues in the selftests code which were causing breakage
   when the tests were run on CONFIG_THP=n kernels

 - "mm/mprotect: micro-optimization work" (Pedro Falcato)

   A couple of nice speedups for mprotect()

 - "MAINTAINERS: update KHO and LIVE UPDATE entries" (Pratyush Yadav)

   Document upcoming changes in the maintenance of KHO, LUO, memfd_luo,
   kexec, crash, kdump and probably other kexec-based things - they are
   being moved out of mm.git and into a new git tree

* tag 'mm-stable-2026-04-18-02-14' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (121 commits)
  MAINTAINERS: add page cache reviewer
  mm/vmscan: avoid false-positive -Wuninitialized warning
  MAINTAINERS: update Dave's kdump reviewer email address
  MAINTAINERS: drop include/linux/liveupdate from LIVE UPDATE
  MAINTAINERS: drop include/linux/kho/abi/ from KHO
  MAINTAINERS: update KHO and LIVE UPDATE maintainers
  MAINTAINERS: update kexec/kdump maintainers entries
  mm/migrate_device: remove dead migration entry check in migrate_vma_collect_huge_pmd()
  selftests: mm: skip charge_reserved_hugetlb without killall
  userfaultfd: allow registration of ranges below mmap_min_addr
  mm/vmstat: fix vmstat_shepherd double-scheduling vmstat_update
  mm/hugetlb: fix early boot crash on parameters without '=' separator
  zram: reject unrecognized type= values in recompress_store()
  docs: proc: document ProtectionKey in smaps
  mm/mprotect: special-case small folios when applying permissions
  mm/mprotect: move softleaf code out of the main function
  mm: remove '!root_reclaim' checking in should_abort_scan()
  mm/sparse: fix comment for section map alignment
  mm/page_io: use sio->len for PSWPIN accounting in sio_read_complete()
  selftests/mm: transhuge_stress: skip the test when thp not available
  ...
-rw-r--r--  CREDITS  8
-rw-r--r--  Documentation/admin-guide/mm/damon/lru_sort.rst  4
-rw-r--r--  Documentation/admin-guide/mm/damon/reclaim.rst  4
-rw-r--r--  Documentation/admin-guide/mm/kho.rst  41
-rw-r--r--  Documentation/filesystems/proc.rst  4
-rw-r--r--  MAINTAINERS  29
-rw-r--r--  drivers/block/zram/zram_drv.c  5
-rw-r--r--  fs/buffer.c  4
-rw-r--r--  fs/fs-writeback.c  22
-rw-r--r--  fs/userfaultfd.c  2
-rw-r--r--  include/linux/alloc_tag.h  2
-rw-r--r--  include/linux/damon.h  2
-rw-r--r--  include/linux/fs.h  9
-rw-r--r--  include/linux/kexec_handover.h  13
-rw-r--r--  include/linux/kho/abi/kexec_handover.h  20
-rw-r--r--  include/linux/kho/abi/kexec_metadata.h  46
-rw-r--r--  include/linux/liveupdate.h  17
-rw-r--r--  include/linux/memcontrol.h  191
-rw-r--r--  include/linux/mm.h  5
-rw-r--r--  include/linux/mm_inline.h  6
-rw-r--r--  include/linux/mmzone.h  42
-rw-r--r--  include/linux/pgalloc_tag.h  2
-rw-r--r--  include/linux/sched.h  2
-rw-r--r--  include/linux/shmem_fs.h  14
-rw-r--r--  include/linux/swap.h  25
-rw-r--r--  include/linux/userfaultfd_k.h  73
-rw-r--r--  include/trace/events/memcg.h  10
-rw-r--r--  include/trace/events/writeback.h  3
-rw-r--r--  kernel/cgroup/cgroup.c  9
-rw-r--r--  kernel/liveupdate/kexec_handover.c  158
-rw-r--r--  kernel/liveupdate/kexec_handover_debugfs.c  55
-rw-r--r--  kernel/liveupdate/kexec_handover_internal.h  15
-rw-r--r--  kernel/liveupdate/luo_core.c  11
-rw-r--r--  kernel/liveupdate/luo_file.c  112
-rw-r--r--  kernel/liveupdate/luo_flb.c  182
-rw-r--r--  kernel/liveupdate/luo_internal.h  7
-rw-r--r--  kernel/liveupdate/luo_session.c  46
-rw-r--r--  lib/alloc_tag.c  109
-rw-r--r--  lib/test_hmm.c  130
-rw-r--r--  lib/test_kho.c  5
-rw-r--r--  lib/tests/liveupdate.c  18
-rw-r--r--  mm/Kconfig.debug  11
-rw-r--r--  mm/compaction.c  43
-rw-r--r--  mm/damon/core.c  88
-rw-r--r--  mm/damon/stat.c  5
-rw-r--r--  mm/huge_memory.c  22
-rw-r--r--  mm/hugetlb.c  18
-rw-r--r--  mm/kmemleak.c  2
-rw-r--r--  mm/memblock.c  4
-rw-r--r--  mm/memcontrol-v1.c  31
-rw-r--r--  mm/memcontrol-v1.h  7
-rw-r--r--  mm/memcontrol.c  632
-rw-r--r--  mm/memfd_luo.c  34
-rw-r--r--  mm/mempolicy.c  23
-rw-r--r--  mm/migrate.c  2
-rw-r--r--  mm/migrate_device.c  6
-rw-r--r--  mm/mlock.c  2
-rw-r--r--  mm/mprotect.c  218
-rw-r--r--  mm/page_alloc.c  10
-rw-r--r--  mm/page_io.c  10
-rw-r--r--  mm/percpu.c  2
-rw-r--r--  mm/shmem.c  176
-rw-r--r--  mm/shrinker.c  6
-rw-r--r--  mm/sparse.c  1
-rw-r--r--  mm/swap.c  59
-rw-r--r--  mm/userfaultfd.c  682
-rw-r--r--  mm/util.c  10
-rw-r--r--  mm/vmscan.c  303
-rw-r--r--  mm/vmstat.c  2
-rw-r--r--  mm/workingset.c  30
-rw-r--r--  mm/zswap.c  187
-rw-r--r--  tools/testing/selftests/liveupdate/liveupdate.c  41
-rwxr-xr-x  tools/testing/selftests/mm/charge_reserved_hugetlb.sh  5
-rw-r--r--  tools/testing/selftests/mm/guard-regions.c  4
-rw-r--r--  tools/testing/selftests/mm/hmm-tests.c  83
-rw-r--r--  tools/testing/selftests/mm/hugetlb_dio.c  91
-rw-r--r--  tools/testing/selftests/mm/merge.c  88
-rw-r--r--  tools/testing/selftests/mm/soft-dirty.c  4
-rw-r--r--  tools/testing/selftests/mm/split_huge_page_test.c  19
-rw-r--r--  tools/testing/selftests/mm/thp_settings.c  35
-rw-r--r--  tools/testing/selftests/mm/thp_settings.h  1
-rw-r--r--  tools/testing/selftests/mm/transhuge-stress.c  4
-rw-r--r--  tools/testing/selftests/mm/vm_util.c  24
-rw-r--r--  tools/testing/selftests/mm/vm_util.h  2
84 files changed, 2814 insertions, 1675 deletions
diff --git a/CREDITS b/CREDITS
index bee8609d889f..49dfefcb7575 100644
--- a/CREDITS
+++ b/CREDITS
@@ -1451,6 +1451,14 @@ N: Andy Gospodarek
E: andy@greyhouse.net
D: Maintenance and contributions to the network interface bonding driver.
+N: Vivek Goyal
+E: vgoyal@redhat.com
+D: KDUMP, KEXEC, and VIRTIO FILE SYSTEM
+
+N: Alexander Graf
+E: graf@amazon.com
+D: Kexec Handover (KHO)
+
N: Wolfgang Grandegger
E: wg@grandegger.com
D: Controller Area Network (device drivers)
diff --git a/Documentation/admin-guide/mm/damon/lru_sort.rst b/Documentation/admin-guide/mm/damon/lru_sort.rst
index a7dea7c75a9b..14cc6b2db897 100644
--- a/Documentation/admin-guide/mm/damon/lru_sort.rst
+++ b/Documentation/admin-guide/mm/damon/lru_sort.rst
@@ -79,6 +79,10 @@ of parametrs except ``enabled`` again. Once the re-reading is done, this
parameter is set as ``N``. If invalid parameters are found while the
re-reading, DAMON_LRU_SORT will be disabled.
+Once ``Y`` is written to this parameter, the user must not write to any
+parameters until reading ``commit_inputs`` again returns ``N``. If users
+violate this rule, the kernel may exhibit undefined behavior.
+
active_mem_bp
-------------
diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst
index 47854c461706..d7a0225b4950 100644
--- a/Documentation/admin-guide/mm/damon/reclaim.rst
+++ b/Documentation/admin-guide/mm/damon/reclaim.rst
@@ -71,6 +71,10 @@ of parametrs except ``enabled`` again. Once the re-reading is done, this
parameter is set as ``N``. If invalid parameters are found while the
re-reading, DAMON_RECLAIM will be disabled.
+Once ``Y`` is written to this parameter, the user must not write to any
+parameters until reading ``commit_inputs`` again returns ``N``. If users
+violate this rule, the kernel may exhibit undefined behavior.
+
min_age
-------
diff --git a/Documentation/admin-guide/mm/kho.rst b/Documentation/admin-guide/mm/kho.rst
index cb9a20f64920..2c26e560bd78 100644
--- a/Documentation/admin-guide/mm/kho.rst
+++ b/Documentation/admin-guide/mm/kho.rst
@@ -42,6 +42,45 @@ For example, if you used ``reserve_mem`` command line parameter to create
an early memory reservation, the new kernel will have that memory at the
same physical address as the old kernel.
+Kexec Metadata
+==============
+
+KHO automatically tracks metadata about the kexec chain, passing information
+about the previous kernel to the next kernel. This feature helps diagnose
+bugs that only reproduce when kexecing from specific kernel versions.
+
+On each KHO kexec, the kernel logs the previous kernel's version and the
+number of kexec reboots since the last cold boot::
+
+ [ 0.000000] KHO: exec from: 6.19.0-rc4-next-20260107 (count 1)
+
+The metadata includes:
+
+``previous_release``
+ The kernel version string (from ``uname -r``) of the kernel that
+ initiated the kexec.
+
+``kexec_count``
+ The number of kexec boots since the last cold boot. On cold boot,
+ this counter starts at 0 and increments with each kexec. This helps
+ identify issues that only manifest after multiple consecutive kexec
+ reboots.
+
+Use Cases
+---------
+
+This metadata is particularly useful for debugging kexec transition bugs,
+where a buggy kernel kexecs into a new kernel and the bug manifests only
+in the second kernel. Examples of such bugs include:
+
+- Memory corruption from the previous kernel affecting the new kernel
+- Incorrect hardware state left by the previous kernel
+- Firmware/ACPI state issues that only appear in kexec scenarios
+
+At scale, correlating crashes to the previous kernel version enables
+faster root cause analysis when issues only occur in specific kernel
+transition scenarios.
+
debugfs Interfaces
==================
@@ -80,5 +119,5 @@ stabilized.
it finished to interpret their metadata.
``/sys/kernel/debug/kho/in/sub_fdts/``
- Similar to ``kho/out/sub_fdts/``, but contains sub FDT blobs
+ Similar to ``kho/out/sub_fdts/``, but contains sub blobs
of KHO producers passed from the old kernel.
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 7a4e67a04290..db6167befb7b 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -565,6 +565,10 @@ does not take into account swapped out page of underlying shmem objects.
naturally aligned THP pages of any currently enabled size. 1 if true, 0
otherwise.
+If both the kernel and the CPU support protection keys (pkeys),
+"ProtectionKey" indicates the memory protection key associated with the
+virtual memory area.
+
"VmFlags" field deserves a separate description. This member represents the
kernel flags associated with the particular virtual memory area in two letter
encoded manner. The codes are the following:
diff --git a/MAINTAINERS b/MAINTAINERS
index 13f49378b157..c00df84252aa 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13859,8 +13859,10 @@ F: scripts/Makefile.kcsan
KDUMP
M: Andrew Morton <akpm@linux-foundation.org>
M: Baoquan He <bhe@redhat.com>
-R: Vivek Goyal <vgoyal@redhat.com>
-R: Dave Young <dyoung@redhat.com>
+M: Mike Rapoport <rppt@kernel.org>
+M: Pasha Tatashin <pasha.tatashin@soleen.com>
+M: Pratyush Yadav <pratyush@kernel.org>
+R: Dave Young <ruirui.yang@linux.dev>
L: kexec@lists.infradead.org
S: Maintained
W: http://lse.sourceforge.net/kdump/
@@ -14175,6 +14177,9 @@ F: include/linux/kernfs.h
KEXEC
M: Andrew Morton <akpm@linux-foundation.org>
M: Baoquan He <bhe@redhat.com>
+M: Mike Rapoport <rppt@kernel.org>
+M: Pasha Tatashin <pasha.tatashin@soleen.com>
+M: Pratyush Yadav <pratyush@kernel.org>
L: kexec@lists.infradead.org
W: http://kernel.org/pub/linux/utils/kernel/kexec/
F: include/linux/kexec.h
@@ -14182,18 +14187,18 @@ F: include/uapi/linux/kexec.h
F: kernel/kexec*
KEXEC HANDOVER (KHO)
-M: Alexander Graf <graf@amazon.com>
M: Mike Rapoport <rppt@kernel.org>
M: Pasha Tatashin <pasha.tatashin@soleen.com>
-R: Pratyush Yadav <pratyush@kernel.org>
+M: Pratyush Yadav <pratyush@kernel.org>
+R: Alexander Graf <graf@amazon.com>
L: kexec@lists.infradead.org
L: linux-mm@kvack.org
S: Maintained
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git
F: Documentation/admin-guide/mm/kho.rst
F: Documentation/core-api/kho/*
F: include/linux/kexec_handover.h
F: include/linux/kho/
-F: include/linux/kho/abi/
F: kernel/liveupdate/kexec_handover*
F: lib/test_kho.c
F: tools/testing/selftests/kho/
@@ -14892,15 +14897,15 @@ F: tools/testing/selftests/livepatch/
LIVE UPDATE
M: Pasha Tatashin <pasha.tatashin@soleen.com>
M: Mike Rapoport <rppt@kernel.org>
-R: Pratyush Yadav <pratyush@kernel.org>
+M: Pratyush Yadav <pratyush@kernel.org>
L: linux-kernel@vger.kernel.org
S: Maintained
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git
F: Documentation/core-api/liveupdate.rst
F: Documentation/mm/memfd_preservation.rst
F: Documentation/userspace-api/liveupdate.rst
F: include/linux/kho/abi/
F: include/linux/liveupdate.h
-F: include/linux/liveupdate/
F: include/uapi/linux/liveupdate.h
F: kernel/liveupdate/
F: lib/tests/liveupdate.c
@@ -16859,8 +16864,12 @@ F: mm/migrate_device.c
MEMORY MANAGEMENT - MGLRU (MULTI-GEN LRU)
M: Andrew Morton <akpm@linux-foundation.org>
-M: Axel Rasmussen <axelrasmussen@google.com>
-M: Yuanchu Xie <yuanchu@google.com>
+R: Kairui Song <kasong@tencent.com>
+R: Qi Zheng <qi.zheng@linux.dev>
+R: Shakeel Butt <shakeel.butt@linux.dev>
+R: Barry Song <baohua@kernel.org>
+R: Axel Rasmussen <axelrasmussen@google.com>
+R: Yuanchu Xie <yuanchu@google.com>
R: Wei Xu <weixugc@google.com>
L: linux-mm@kvack.org
S: Maintained
@@ -20115,7 +20124,9 @@ F: kernel/padata.c
PAGE CACHE
M: Matthew Wilcox (Oracle) <willy@infradead.org>
+R: Jan Kara <jack@suse.cz>
L: linux-fsdevel@vger.kernel.org
+L: linux-mm@kvack.org
S: Supported
T: git git://git.infradead.org/users/willy/pagecache.git
F: Documentation/filesystems/locking.rst
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index c2afd1c34f4a..aebc710f0d6a 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -2546,6 +2546,8 @@ static ssize_t recompress_store(struct device *dev,
mode = RECOMPRESS_HUGE;
if (!strcmp(val, "huge_idle"))
mode = RECOMPRESS_IDLE | RECOMPRESS_HUGE;
+ if (!mode)
+ return -EINVAL;
continue;
}
@@ -2678,7 +2680,7 @@ static void zram_bio_discard(struct zram *zram, struct bio *bio)
*/
if (offset) {
if (n <= (PAGE_SIZE - offset))
- return;
+ goto end_bio;
n -= (PAGE_SIZE - offset);
index++;
@@ -2693,6 +2695,7 @@ static void zram_bio_discard(struct zram *zram, struct bio *bio)
n -= PAGE_SIZE;
}
+end_bio:
bio_endio(bio);
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 4d7f84e77d2f..d6e062c42a8d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -822,8 +822,7 @@ struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
long offset;
struct mem_cgroup *memcg, *old_memcg;
- /* The folio lock pins the memcg */
- memcg = folio_memcg(folio);
+ memcg = get_mem_cgroup_from_folio(folio);
old_memcg = set_active_memcg(memcg);
head = NULL;
@@ -844,6 +843,7 @@ struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
}
out:
set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
return head;
/*
* In case anything failed, we just free everything we got.
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3c75ee025bda..e1fbdf9ee769 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -280,15 +280,13 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio)
if (inode_cgwb_enabled(inode)) {
struct cgroup_subsys_state *memcg_css;
- if (folio) {
- memcg_css = mem_cgroup_css_from_folio(folio);
- wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
- } else {
- /* must pin memcg_css, see wb_get_create() */
+ /* must pin memcg_css, see wb_get_create() */
+ if (folio)
+ memcg_css = get_mem_cgroup_css_from_folio(folio);
+ else
memcg_css = task_get_css(current, memory_cgrp_id);
- wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
- css_put(memcg_css);
- }
+ wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ css_put(memcg_css);
}
if (!wb)
@@ -979,16 +977,16 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio
if (!wbc->wb || wbc->no_cgroup_owner)
return;
- css = mem_cgroup_css_from_folio(folio);
+ css = get_mem_cgroup_css_from_folio(folio);
/* dead cgroups shouldn't contribute to inode ownership arbitration */
if (!css_is_online(css))
- return;
+ goto out;
id = css->id;
if (id == wbc->wb_id) {
wbc->wb_bytes += bytes;
- return;
+ goto out;
}
if (id == wbc->wb_lcand_id)
@@ -1001,6 +999,8 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio
wbc->wb_tcand_bytes += bytes;
else
wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
+out:
+ css_put(css);
}
EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index bdc84e5219cd..4b53dc4a3266 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1238,8 +1238,6 @@ static __always_inline int validate_unaligned_range(
return -EINVAL;
if (!len)
return -EINVAL;
- if (start < mmap_min_addr)
- return -EINVAL;
if (start >= task_size)
return -EINVAL;
if (len > task_size - start)
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index d40ac39bfbe8..02de2ede560f 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -163,9 +163,11 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref)
{
WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
}
+void alloc_tag_add_early_pfn(unsigned long pfn);
#else
static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {}
static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
+static inline void alloc_tag_add_early_pfn(unsigned long pfn) {}
#endif
/* Caller should verify both ref and tag to be valid */
diff --git a/include/linux/damon.h b/include/linux/damon.h
index d9a3babbafc1..f2cdb7c3f5e6 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -818,9 +818,11 @@ struct damon_ctx {
/* lists of &struct damon_call_control */
struct list_head call_controls;
+ bool call_controls_obsolete;
struct mutex call_controls_lock;
struct damos_walk_control *walk_control;
+ bool walk_control_obsolete;
struct mutex walk_control_lock;
/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e1d257e6da68..11559c513dfb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2062,20 +2062,13 @@ void compat_set_desc_from_vma(struct vm_area_desc *desc, const struct file *file
const struct vm_area_struct *vma);
int __compat_vma_mmap(struct vm_area_desc *desc, struct vm_area_struct *vma);
int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
-int __vma_check_mmap_hook(struct vm_area_struct *vma);
static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
{
- int err;
-
if (file->f_op->mmap_prepare)
return compat_vma_mmap(file, vma);
- err = file->f_op->mmap(file, vma);
- if (err)
- return err;
-
- return __vma_check_mmap_hook(vma);
+ return file->f_op->mmap(file, vma);
}
static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index ac4129d1d741..8968c56d2d73 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -32,9 +32,9 @@ void kho_restore_free(void *mem);
struct folio *kho_restore_folio(phys_addr_t phys);
struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages);
void *kho_restore_vmalloc(const struct kho_vmalloc *preservation);
-int kho_add_subtree(const char *name, void *fdt);
-void kho_remove_subtree(void *fdt);
-int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
+int kho_add_subtree(const char *name, void *blob, size_t size);
+void kho_remove_subtree(void *blob);
+int kho_retrieve_subtree(const char *name, phys_addr_t *phys, size_t *size);
void kho_memory_init(void);
@@ -97,14 +97,15 @@ static inline void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
return NULL;
}
-static inline int kho_add_subtree(const char *name, void *fdt)
+static inline int kho_add_subtree(const char *name, void *blob, size_t size)
{
return -EOPNOTSUPP;
}
-static inline void kho_remove_subtree(void *fdt) { }
+static inline void kho_remove_subtree(void *blob) { }
-static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
+static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys,
+ size_t *size)
{
return -EOPNOTSUPP;
}
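
The size parameter is what this series adds to the subtree API: producers now register a blob together with its length, and consumers get that length back. As a rough illustration (not part of the diff), the pairing looks like this; the "example-state" node name and function names are made up, and the blob's backing pages still have to be preserved through the usual KHO preservation calls, which are omitted here:

	/* Old kernel: publish a preserved state blob under "example-state". */
	static int example_publish_state(void *blob, size_t len)
	{
		return kho_add_subtree("example-state", blob, len);
	}

	/* New kernel: look the blob up again and learn its size. */
	static int example_fetch_state(phys_addr_t *phys, size_t *len)
	{
		return kho_retrieve_subtree("example-state", phys, len);
	}
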
diff --git a/include/linux/kho/abi/kexec_handover.h b/include/linux/kho/abi/kexec_handover.h
index 6b7d8ef550f9..7e847a2339b0 100644
--- a/include/linux/kho/abi/kexec_handover.h
+++ b/include/linux/kho/abi/kexec_handover.h
@@ -41,25 +41,28 @@
* restore the preserved data.::
*
* / {
- * compatible = "kho-v2";
+ * compatible = "kho-v3";
*
* preserved-memory-map = <0x...>;
*
* <subnode-name-1> {
* preserved-data = <0x...>;
+ * blob-size = <0x...>;
* };
*
* <subnode-name-2> {
* preserved-data = <0x...>;
+ * blob-size = <0x...>;
* };
* ... ...
* <subnode-name-N> {
* preserved-data = <0x...>;
+ * blob-size = <0x...>;
* };
* };
*
* Root KHO Node (/):
- * - compatible: "kho-v2"
+ * - compatible: "kho-v3"
*
* Indentifies the overall KHO ABI version.
*
@@ -78,16 +81,25 @@
*
* Physical address pointing to a subnode data blob that is also
* being preserved.
+ *
+ * - blob-size: u64
+ *
+ * Size in bytes of the preserved data blob. This is needed because
+ * blobs may use arbitrary formats (not just FDT), so the size
+ * cannot be determined from the blob content alone.
*/
/* The compatible string for the KHO FDT root node. */
-#define KHO_FDT_COMPATIBLE "kho-v2"
+#define KHO_FDT_COMPATIBLE "kho-v3"
/* The FDT property for the preserved memory map. */
#define KHO_FDT_MEMORY_MAP_PROP_NAME "preserved-memory-map"
/* The FDT property for preserved data blobs. */
-#define KHO_FDT_SUB_TREE_PROP_NAME "preserved-data"
+#define KHO_SUB_TREE_PROP_NAME "preserved-data"
+
+/* The FDT property for the size of preserved data blobs. */
+#define KHO_SUB_TREE_SIZE_PROP_NAME "blob-size"
/**
* DOC: Kexec Handover ABI for vmalloc Preservation
diff --git a/include/linux/kho/abi/kexec_metadata.h b/include/linux/kho/abi/kexec_metadata.h
new file mode 100644
index 000000000000..e9e3f7e38a7c
--- /dev/null
+++ b/include/linux/kho/abi/kexec_metadata.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+/**
+ * DOC: Kexec Metadata ABI
+ *
+ * The "kexec-metadata" subtree stores optional metadata about the kexec chain.
+ * It is registered via kho_add_subtree(), keeping it independent from the core
+ * KHO ABI. This allows the metadata format to evolve without affecting other
+ * KHO consumers.
+ *
+ * The metadata is stored as a plain C struct rather than FDT format for
+ * simplicity and direct field access.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Breno Leitao <leitao@debian.org>
+ */
+
+#ifndef _LINUX_KHO_ABI_KEXEC_METADATA_H
+#define _LINUX_KHO_ABI_KEXEC_METADATA_H
+
+#include <linux/types.h>
+#include <linux/utsname.h>
+
+#define KHO_KEXEC_METADATA_VERSION 1
+
+/**
+ * struct kho_kexec_metadata - Kexec metadata passed between kernels
+ * @version: ABI version of this struct (must be first field)
+ * @previous_release: Kernel version string that initiated the kexec
+ * @kexec_count: Number of kexec boots since last cold boot
+ *
+ * This structure is preserved across kexec and allows the new kernel to
+ * identify which kernel it was booted from and how many kexec reboots
+ * have occurred.
+ *
+ * __NEW_UTS_LEN is part of uABI, so it safe to use it in here.
+ */
+struct kho_kexec_metadata {
+ u32 version;
+ char previous_release[__NEW_UTS_LEN + 1];
+ u32 kexec_count;
+} __packed;
+
+#define KHO_METADATA_NODE_NAME "kexec-metadata"
+
+#endif /* _LINUX_KHO_ABI_KEXEC_METADATA_H */
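
For illustration only, a consumer in the new kernel could validate and read this blob roughly as follows. The real consumer is kho_in_kexec_metadata() in kernel/liveupdate/kexec_handover.c; the function name below and the direct phys_to_virt() mapping are assumptions of this sketch:

	static int __init example_read_kexec_metadata(void)
	{
		struct kho_kexec_metadata *md;
		phys_addr_t phys;
		size_t size;
		int err;

		err = kho_retrieve_subtree(KHO_METADATA_NODE_NAME, &phys, &size);
		if (err)
			return err;		/* nothing handed over */
		if (size < sizeof(*md))
			return -EINVAL;		/* blob too small for the v1 ABI */

		md = phys_to_virt(phys);
		if (md->version != KHO_KEXEC_METADATA_VERSION)
			return 0;		/* unknown version, ignore */

		pr_info("KHO: exec from: %s (count %u)\n",
			md->previous_release, md->kexec_count);
		return 0;
	}
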
diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h
index dd11fdc76a5f..30c5a39ff9e9 100644
--- a/include/linux/liveupdate.h
+++ b/include/linux/liveupdate.h
@@ -12,6 +12,7 @@
#include <linux/kho/abi/luo.h>
#include <linux/list.h>
#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/types.h>
#include <uapi/linux/liveupdate.h>
@@ -63,6 +64,7 @@ struct liveupdate_file_op_args {
* finish, in order to do successful finish calls for all
* resources in the session.
* @finish: Required. Final cleanup in the new kernel.
+ * @get_id: Optional. Returns a unique identifier for the file.
* @owner: Module reference
*
* All operations (except can_preserve) receive a pointer to a
@@ -78,6 +80,7 @@ struct liveupdate_file_ops {
int (*retrieve)(struct liveupdate_file_op_args *args);
bool (*can_finish)(struct liveupdate_file_op_args *args);
void (*finish)(struct liveupdate_file_op_args *args);
+ unsigned long (*get_id)(struct file *file);
struct module *owner;
};
@@ -228,12 +231,12 @@ bool liveupdate_enabled(void);
int liveupdate_reboot(void);
int liveupdate_register_file_handler(struct liveupdate_file_handler *fh);
-int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh);
+void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh);
int liveupdate_register_flb(struct liveupdate_file_handler *fh,
struct liveupdate_flb *flb);
-int liveupdate_unregister_flb(struct liveupdate_file_handler *fh,
- struct liveupdate_flb *flb);
+void liveupdate_unregister_flb(struct liveupdate_file_handler *fh,
+ struct liveupdate_flb *flb);
int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp);
int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp);
@@ -255,9 +258,8 @@ static inline int liveupdate_register_file_handler(struct liveupdate_file_handle
return -EOPNOTSUPP;
}
-static inline int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
+static inline void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
{
- return -EOPNOTSUPP;
}
static inline int liveupdate_register_flb(struct liveupdate_file_handler *fh,
@@ -266,10 +268,9 @@ static inline int liveupdate_register_flb(struct liveupdate_file_handler *fh,
return -EOPNOTSUPP;
}
-static inline int liveupdate_unregister_flb(struct liveupdate_file_handler *fh,
- struct liveupdate_flb *flb)
+static inline void liveupdate_unregister_flb(struct liveupdate_file_handler *fh,
+ struct liveupdate_flb *flb)
{
- return -EOPNOTSUPP;
}
static inline int liveupdate_flb_get_incoming(struct liveupdate_flb *flb,
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5173a9f16721..dc3fa687759b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -115,6 +115,16 @@ struct mem_cgroup_per_node {
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
struct mem_cgroup_reclaim_iter iter;
+ /*
+ * objcg is wiped out as a part of the objcg repaprenting process.
+ * orig_objcg preserves a pointer (and a reference) to the original
+ * objcg until the end of live of memcg.
+ */
+ struct obj_cgroup __rcu *objcg;
+ struct obj_cgroup *orig_objcg;
+ /* list of inherited objcgs, protected by objcg_lock */
+ struct list_head objcg_list;
+
#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
/* slab stats for nmi context */
atomic_t slab_reclaimable;
@@ -179,6 +189,7 @@ struct obj_cgroup {
struct list_head list; /* protected by objcg_lock */
struct rcu_head rcu;
};
+ bool is_root;
};
/*
@@ -257,15 +268,6 @@ struct mem_cgroup {
seqlock_t socket_pressure_seqlock;
#endif
int kmemcg_id;
- /*
- * memcg->objcg is wiped out as a part of the objcg repaprenting
- * process. memcg->orig_objcg preserves a pointer (and a reference)
- * to the original objcg until the end of live of memcg.
- */
- struct obj_cgroup __rcu *objcg;
- struct obj_cgroup *orig_objcg;
- /* list of inherited objcgs, protected by objcg_lock */
- struct list_head objcg_list;
struct memcg_vmstats_percpu __percpu *vmstats_percpu;
@@ -367,9 +369,6 @@ enum objext_flags {
#define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1)
#ifdef CONFIG_MEMCG
-
-static inline bool folio_memcg_kmem(struct folio *folio);
-
/*
* After the initialization objcg->memcg is always pointing at
* a valid memcg, but can be atomically swapped to the parent memcg.
@@ -383,43 +382,19 @@ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
}
/*
- * __folio_memcg - Get the memory cgroup associated with a non-kmem folio
- * @folio: Pointer to the folio.
- *
- * Returns a pointer to the memory cgroup associated with the folio,
- * or NULL. This function assumes that the folio is known to have a
- * proper memory cgroup pointer. It's not safe to call this function
- * against some type of folios, e.g. slab folios or ex-slab folios or
- * kmem folios.
- */
-static inline struct mem_cgroup *__folio_memcg(struct folio *folio)
-{
- unsigned long memcg_data = folio->memcg_data;
-
- VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
- VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
- VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio);
-
- return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
-}
-
-/*
- * __folio_objcg - get the object cgroup associated with a kmem folio.
+ * folio_objcg - get the object cgroup associated with a folio.
* @folio: Pointer to the folio.
*
* Returns a pointer to the object cgroup associated with the folio,
* or NULL. This function assumes that the folio is known to have a
- * proper object cgroup pointer. It's not safe to call this function
- * against some type of folios, e.g. slab folios or ex-slab folios or
- * LRU folios.
+ * proper object cgroup pointer.
*/
-static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
+static inline struct obj_cgroup *folio_objcg(struct folio *folio)
{
unsigned long memcg_data = folio->memcg_data;
VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
- VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio);
return (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
}
@@ -433,21 +408,30 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
* proper memory cgroup pointer. It's not safe to call this function
* against some type of folios, e.g. slab folios or ex-slab folios.
*
- * For a non-kmem folio any of the following ensures folio and memcg binding
- * stability:
+ * For a folio any of the following ensures folio and objcg binding stability:
*
* - the folio lock
* - LRU isolation
* - exclusive reference
*
- * For a kmem folio a caller should hold an rcu read lock to protect memcg
- * associated with a kmem folio from being released.
+ * Based on the stable binding of folio and objcg, for a folio any of the
+ * following ensures folio and memcg binding stability:
+ *
+ * - cgroup_mutex
+ * - the lruvec lock
+ *
+ * If the caller only want to ensure that the page counters of memcg are
+ * updated correctly, ensure that the binding stability of folio and objcg
+ * is sufficient.
+ *
+ * Note: The caller should hold an rcu read lock or cgroup_mutex to protect
+ * memcg associated with a folio from being released.
*/
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
{
- if (folio_memcg_kmem(folio))
- return obj_cgroup_memcg(__folio_objcg(folio));
- return __folio_memcg(folio);
+ struct obj_cgroup *objcg = folio_objcg(folio);
+
+ return objcg ? obj_cgroup_memcg(objcg) : NULL;
}
/*
@@ -471,15 +455,10 @@ static inline bool folio_memcg_charged(struct folio *folio)
* has an associated memory cgroup pointer or an object cgroups vector or
* an object cgroup.
*
- * For a non-kmem folio any of the following ensures folio and memcg binding
- * stability:
+ * The page and objcg or memcg binding rules can refer to folio_memcg().
*
- * - the folio lock
- * - LRU isolation
- * - exclusive reference
- *
- * For a kmem folio a caller should hold an rcu read lock to protect memcg
- * associated with a kmem folio from being released.
+ * A caller should hold an rcu read lock to protect memcg associated with a
+ * page from being released.
*/
static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
{
@@ -488,18 +467,14 @@ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
* for slabs, READ_ONCE() should be used here.
*/
unsigned long memcg_data = READ_ONCE(folio->memcg_data);
+ struct obj_cgroup *objcg;
if (memcg_data & MEMCG_DATA_OBJEXTS)
return NULL;
- if (memcg_data & MEMCG_DATA_KMEM) {
- struct obj_cgroup *objcg;
+ objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
- objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
- return obj_cgroup_memcg(objcg);
- }
-
- return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
+ return objcg ? obj_cgroup_memcg(objcg) : NULL;
}
static inline struct mem_cgroup *page_memcg_check(struct page *page)
@@ -548,6 +523,11 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
return (memcg == root_mem_cgroup);
}
+static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg)
+{
+ return objcg->is_root;
+}
+
static inline bool mem_cgroup_disabled(void)
{
return !cgroup_subsys_enabled(memory_cgrp_subsys);
@@ -735,7 +715,15 @@ out:
* folio_lruvec - return lruvec for isolating/putting an LRU folio
* @folio: Pointer to the folio.
*
- * This function relies on folio->mem_cgroup being stable.
+ * Call with rcu_read_lock() held to ensure the lifetime of the returned lruvec.
+ * Note that this alone will NOT guarantee the stability of the folio->lruvec
+ * association; the folio can be reparented to an ancestor if this races with
+ * cgroup deletion.
+ *
+ * Use folio_lruvec_lock() to ensure both lifetime and stability of the binding.
+ * Once a lruvec is locked, folio_lruvec() can be called on other folios, and
+ * their binding is stable if the returned lruvec matches the one the caller has
+ * locked. Useful for lock batching.
*/
static inline struct lruvec *folio_lruvec(struct folio *folio)
{
@@ -758,15 +746,6 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
unsigned long *flags);
-#ifdef CONFIG_DEBUG_VM
-void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio);
-#else
-static inline
-void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
-{
-}
-#endif
-
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
@@ -774,23 +753,26 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg)
{
+ if (obj_cgroup_is_root(objcg))
+ return true;
return percpu_ref_tryget(&objcg->refcnt);
}
-static inline void obj_cgroup_get(struct obj_cgroup *objcg)
+static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
+ unsigned long nr)
{
- percpu_ref_get(&objcg->refcnt);
+ if (!obj_cgroup_is_root(objcg))
+ percpu_ref_get_many(&objcg->refcnt, nr);
}
-static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
- unsigned long nr)
+static inline void obj_cgroup_get(struct obj_cgroup *objcg)
{
- percpu_ref_get_many(&objcg->refcnt, nr);
+ obj_cgroup_get_many(objcg, 1);
}
static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
- if (objcg)
+ if (objcg && !obj_cgroup_is_root(objcg))
percpu_ref_put(&objcg->refcnt);
}
@@ -885,7 +867,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
return match;
}
-struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio);
+struct cgroup_subsys_state *get_mem_cgroup_css_from_folio(struct folio *folio);
ino_t page_cgroup_ino(struct page *page);
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
@@ -896,7 +878,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
}
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
- int zid, int nr_pages);
+ int zid, long nr_pages);
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
@@ -966,10 +948,15 @@ void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
static inline void count_memcg_folio_events(struct folio *folio,
enum vm_event_item idx, unsigned long nr)
{
- struct mem_cgroup *memcg = folio_memcg(folio);
+ struct mem_cgroup *memcg;
- if (memcg)
- count_memcg_events(memcg, idx, nr);
+ if (!folio_memcg_charged(folio))
+ return;
+
+ rcu_read_lock();
+ memcg = folio_memcg(folio);
+ count_memcg_events(memcg, idx, nr);
+ rcu_read_unlock();
}
static inline void count_memcg_events_mm(struct mm_struct *mm,
@@ -1087,6 +1074,11 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
return true;
}
+static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg)
+{
+ return true;
+}
+
static inline bool mem_cgroup_disabled(void)
{
return true;
@@ -1179,11 +1171,6 @@ static inline struct lruvec *folio_lruvec(struct folio *folio)
return &pgdat->__lruvec;
}
-static inline
-void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
-{
-}
-
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
return NULL;
@@ -1242,6 +1229,7 @@ static inline struct lruvec *folio_lruvec_lock(struct folio *folio)
{
struct pglist_data *pgdat = folio_pgdat(folio);
+ rcu_read_lock();
spin_lock(&pgdat->__lruvec.lru_lock);
return &pgdat->__lruvec;
}
@@ -1250,6 +1238,7 @@ static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
{
struct pglist_data *pgdat = folio_pgdat(folio);
+ rcu_read_lock();
spin_lock_irq(&pgdat->__lruvec.lru_lock);
return &pgdat->__lruvec;
}
@@ -1259,6 +1248,7 @@ static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
{
struct pglist_data *pgdat = folio_pgdat(folio);
+ rcu_read_lock();
spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
return &pgdat->__lruvec;
}
@@ -1479,20 +1469,28 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
}
-static inline void unlock_page_lruvec(struct lruvec *lruvec)
+static inline void lruvec_lock_irq(struct lruvec *lruvec)
+{
+ rcu_read_lock();
+ spin_lock_irq(&lruvec->lru_lock);
+}
+
+static inline void lruvec_unlock(struct lruvec *lruvec)
{
spin_unlock(&lruvec->lru_lock);
+ rcu_read_unlock();
}
-static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
+static inline void lruvec_unlock_irq(struct lruvec *lruvec)
{
spin_unlock_irq(&lruvec->lru_lock);
+ rcu_read_unlock();
}
-static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
- unsigned long flags)
+static inline void lruvec_unlock_irqrestore(struct lruvec *lruvec, unsigned long flags)
{
spin_unlock_irqrestore(&lruvec->lru_lock, flags);
+ rcu_read_unlock();
}
/* Test requires a stable folio->memcg binding, see folio_memcg() */
@@ -1511,7 +1509,7 @@ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio,
if (folio_matches_lruvec(folio, locked_lruvec))
return locked_lruvec;
- unlock_page_lruvec_irq(locked_lruvec);
+ lruvec_unlock_irq(locked_lruvec);
}
return folio_lruvec_lock_irq(folio);
@@ -1525,7 +1523,7 @@ static inline void folio_lruvec_relock_irqsave(struct folio *folio,
if (folio_matches_lruvec(folio, *lruvecp))
return;
- unlock_page_lruvec_irqrestore(*lruvecp, *flags);
+ lruvec_unlock_irqrestore(*lruvecp, *flags);
}
*lruvecp = folio_lruvec_lock_irqsave(folio, flags);
@@ -1549,9 +1547,14 @@ static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
if (mem_cgroup_disabled())
return;
+ if (!folio_memcg_charged(folio))
+ return;
+
+ rcu_read_lock();
memcg = folio_memcg(folio);
- if (unlikely(memcg && &memcg->css != wb->memcg_css))
+ if (unlikely(&memcg->css != wb->memcg_css))
mem_cgroup_track_foreign_dirty_slowpath(folio, wb);
+ rcu_read_unlock();
}
void mem_cgroup_flush_foreign(struct bdi_writeback *wb);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 255e0f50ea32..0b776907152e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -758,6 +758,8 @@ struct vm_fault {
*/
};
+struct vm_uffd_ops;
+
/*
* These are the virtual MM functions - opening of an area, closing and
* unmapping it (needed to keep files on disk up-to-date etc), pointer
@@ -865,6 +867,9 @@ struct vm_operations_struct {
struct page *(*find_normal_page)(struct vm_area_struct *vma,
unsigned long addr);
#endif /* CONFIG_FIND_NORMAL_PAGE */
+#ifdef CONFIG_USERFAULTFD
+ const struct vm_uffd_ops *uffd_ops;
+#endif
};
#ifdef CONFIG_NUMA_BALANCING
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 7fc2ced00f8f..a171070e15f0 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -348,6 +348,8 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
+
if (lru_gen_add_folio(lruvec, folio, false))
return;
@@ -362,6 +364,8 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
+
if (lru_gen_add_folio(lruvec, folio, true))
return;
@@ -376,6 +380,8 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
+
if (lru_gen_del_folio(lruvec, folio, false))
return;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3bcdda226a91..9adb2ad21da5 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -694,6 +694,9 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg);
void lru_gen_offline_memcg(struct mem_cgroup *memcg);
void lru_gen_release_memcg(struct mem_cgroup *memcg);
void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid);
+void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid);
+bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid);
+void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid);
#else /* !CONFIG_LRU_GEN */
@@ -735,6 +738,20 @@ static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
{
}
+static inline void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid)
+{
+}
+
+static inline bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid)
+{
+ return true;
+}
+
+static inline
+void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid)
+{
+}
+
#endif /* CONFIG_LRU_GEN */
struct lruvec {
@@ -2053,21 +2070,16 @@ static inline struct mem_section *__nr_to_section(unsigned long nr)
extern size_t mem_section_usage_size(void);
/*
- * We use the lower bits of the mem_map pointer to store
- * a little bit of information. The pointer is calculated
- * as mem_map - section_nr_to_pfn(pnum). The result is
- * aligned to the minimum alignment of the two values:
- * 1. All mem_map arrays are page-aligned.
- * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT
- * lowest bits. PFN_SECTION_SHIFT is arch-specific
- * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
- * worst combination is powerpc with 256k pages,
- * which results in PFN_SECTION_SHIFT equal 6.
- * To sum it up, at least 6 bits are available on all architectures.
- * However, we can exceed 6 bits on some other architectures except
- * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available
- * with the worst case of 64K pages on arm64) if we make sure the
- * exceeded bit is not applicable to powerpc.
+ * We use the lower bits of the mem_map pointer to store a little bit of
+ * information. The pointer is calculated as mem_map - section_nr_to_pfn().
+ * The result is aligned to the minimum alignment of the two values:
+ *
+ * 1. All mem_map arrays are page-aligned.
+ * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT lowest bits.
+ *
+ * We always expect a single section to cover full pages. Therefore,
+ * we can safely assume that PFN_SECTION_SHIFT is large enough to
+ * accommodate SECTION_MAP_LAST_BIT. We use BUILD_BUG_ON() to ensure this.
*/
enum {
SECTION_MARKED_PRESENT_BIT,
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index 38a82d65e58e..951d33362268 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -181,7 +181,7 @@ static inline struct alloc_tag *__pgalloc_tag_get(struct page *page)
if (get_page_tag_ref(page, &ref, &handle)) {
alloc_tag_sub_check(&ref);
- if (ref.ct)
+ if (ref.ct && !is_codetag_empty(&ref))
tag = ct_to_alloc_tag(ref.ct);
put_page_tag_ref(handle);
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 004e6d56a499..368c7b4d7cb5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1535,7 +1535,7 @@ struct task_struct {
/* Used by memcontrol for targeted memcg charge: */
struct mem_cgroup *active_memcg;
- /* Cache for current->cgroups->memcg->objcg lookups: */
+ /* Cache for current->cgroups->memcg->nodeinfo[nid]->objcg lookups: */
struct obj_cgroup *objcg;
#endif
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index f6a2d3402d76..93a0ba872ebe 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -221,20 +221,6 @@ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof)
extern bool shmem_charge(struct inode *inode, long pages);
-#ifdef CONFIG_USERFAULTFD
-#ifdef CONFIG_SHMEM
-extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- uffd_flags_t flags,
- struct folio **foliop);
-#else /* !CONFIG_SHMEM */
-#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \
- src_addr, flags, foliop) ({ BUG(); 0; })
-#endif /* CONFIG_SHMEM */
-#endif /* CONFIG_USERFAULTFD */
-
/*
* Used space is stored as unsigned 64-bit value in bytes but
* quota core supports only signed 64-bit values so use that
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4b1f13b5bbad..7a09df6977a5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -310,8 +310,7 @@ extern unsigned long totalreserve_pages;
/* linux/mm/swap.c */
void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
- unsigned int nr_io, unsigned int nr_rotated)
- __releases(lruvec->lru_lock);
+ unsigned int nr_io, unsigned int nr_rotated);
void lru_note_cost_refault(struct folio *);
void folio_add_lru(struct folio *);
void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
@@ -353,6 +352,7 @@ extern void swap_setup(void);
extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx);
#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
@@ -547,6 +547,8 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
return READ_ONCE(memcg->swappiness);
}
+
+void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid);
#else
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
{
@@ -611,5 +613,24 @@ static inline bool mem_cgroup_swap_full(struct folio *folio)
}
#endif
+/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
+ * and including the specified highidx
+ * @zone: The current zone in the iterator
+ * @pgdat: The pgdat which node_zones are being iterated
+ * @idx: The index variable
+ * @highidx: The index of the highest zone to return
+ *
+ * This macro iterates through all managed zones up to and including the specified highidx.
+ * The zone iterator enters an invalid state after macro call and must be reinitialized
+ * before it can be used again.
+ */
+#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \
+ for ((idx) = 0, (zone) = (pgdat)->node_zones; \
+ (idx) <= (highidx); \
+ (idx)++, (zone)++) \
+ if (!managed_zone(zone)) \
+ continue; \
+ else
+
#endif /* __KERNEL__*/
#endif /* _LINUX_SWAP_H */
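
As a usage illustration of the new iterator (the surrounding function is made up for the example), summing the managed pages of a node up to a given zone index looks like this; note that, as the comment above warns, the zone/idx cursors are not reusable after the loop:

	static unsigned long example_managed_pages(pg_data_t *pgdat, int highidx)
	{
		unsigned long managed = 0;
		struct zone *zone;
		int idx;

		/* Visits only managed zones with index 0..highidx. */
		for_each_managed_zone_pgdat(zone, pgdat, idx, highidx)
			managed += zone_managed_pages(zone);

		return managed;
	}
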
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index d83e349900a3..d2920f98ab86 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -83,6 +83,39 @@ struct userfaultfd_ctx {
extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
+/* VMA userfaultfd operations */
+struct vm_uffd_ops {
+ /* Checks if a VMA can support userfaultfd */
+ bool (*can_userfault)(struct vm_area_struct *vma, vm_flags_t vm_flags);
+ /*
+ * Called to resolve UFFDIO_CONTINUE request.
+ * Should return the folio found at pgoff in the VMA's pagecache if it
+ * exists or ERR_PTR otherwise.
+ * The returned folio is locked and with reference held.
+ */
+ struct folio *(*get_folio_noalloc)(struct inode *inode, pgoff_t pgoff);
+ /*
+ * Called during resolution of UFFDIO_COPY request.
+ * Should allocate and return a folio or NULL if allocation fails.
+ */
+ struct folio *(*alloc_folio)(struct vm_area_struct *vma,
+ unsigned long addr);
+ /*
+ * Called during resolution of UFFDIO_COPY request.
+ * Should only be called with a folio returned by alloc_folio() above.
+ * The folio will be set to locked.
+ * Returns 0 on success, error code on failure.
+ */
+ int (*filemap_add)(struct folio *folio, struct vm_area_struct *vma,
+ unsigned long addr);
+ /*
+ * Called during resolution of UFFDIO_COPY request on the error
+ * handling path.
+ * Should revert the operation of ->filemap_add().
+ */
+ void (*filemap_remove)(struct folio *folio, struct vm_area_struct *vma);
+};
+
/* A combined operation mode + behavior flags. */
typedef unsigned int __bitwise uffd_flags_t;
@@ -114,11 +147,6 @@ static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_at
/* Flags controlling behavior. These behavior changes are mode-independent. */
#define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0)
-extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr, struct page *page,
- bool newly_allocated, uffd_flags_t flags);
-
extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len,
uffd_flags_t flags);
@@ -211,39 +239,8 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
return vma->vm_flags & __VM_UFFD_FLAGS;
}
-static inline bool vma_can_userfault(struct vm_area_struct *vma,
- vm_flags_t vm_flags,
- bool wp_async)
-{
- vm_flags &= __VM_UFFD_FLAGS;
-
- if (vma->vm_flags & VM_DROPPABLE)
- return false;
-
- if ((vm_flags & VM_UFFD_MINOR) &&
- (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
- return false;
-
- /*
- * If wp async enabled, and WP is the only mode enabled, allow any
- * memory type.
- */
- if (wp_async && (vm_flags == VM_UFFD_WP))
- return true;
-
- /*
- * If user requested uffd-wp but not enabled pte markers for
- * uffd-wp, then shmem & hugetlbfs are not supported but only
- * anonymous.
- */
- if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) &&
- !vma_is_anonymous(vma))
- return false;
-
- /* By default, allow any of anon|shmem|hugetlb */
- return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
- vma_is_shmem(vma);
-}
+bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags,
+ bool wp_async);
static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
{
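
To show how the new hook is meant to be wired up, here is a deliberately minimal, hypothetical vm_uffd_ops for an anonymous-style mapping; the real shmem and hugetlb implementations in this series also fill in the folio allocation and pagecache callbacks:

	static bool example_can_userfault(struct vm_area_struct *vma,
					  vm_flags_t vm_flags)
	{
		/* Purely illustrative policy: only anonymous VMAs. */
		return vma_is_anonymous(vma);
	}

	static const struct vm_uffd_ops example_uffd_ops = {
		.can_userfault	= example_can_userfault,
		/* .get_folio_noalloc, .alloc_folio, .filemap_add and
		 * .filemap_remove are needed only by file-backed mappings
		 * that support UFFDIO_CONTINUE / UFFDIO_COPY. */
	};

	static const struct vm_operations_struct example_vm_ops = {
		/* ... fault/close/etc. handlers ... */
	#ifdef CONFIG_USERFAULTFD
		.uffd_ops	= &example_uffd_ops,
	#endif
	};
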
diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h
index dfe2f51019b4..51b62c5931fc 100644
--- a/include/trace/events/memcg.h
+++ b/include/trace/events/memcg.h
@@ -11,14 +11,14 @@
DECLARE_EVENT_CLASS(memcg_rstat_stats,
- TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+ TP_PROTO(struct mem_cgroup *memcg, int item, long val),
TP_ARGS(memcg, item, val),
TP_STRUCT__entry(
__field(u64, id)
__field(int, item)
- __field(int, val)
+ __field(long, val)
),
TP_fast_assign(
@@ -27,20 +27,20 @@ DECLARE_EVENT_CLASS(memcg_rstat_stats,
__entry->val = val;
),
- TP_printk("memcg_id=%llu item=%d val=%d",
+ TP_printk("memcg_id=%llu item=%d val=%ld",
__entry->id, __entry->item, __entry->val)
);
DEFINE_EVENT(memcg_rstat_stats, mod_memcg_state,
- TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+ TP_PROTO(struct mem_cgroup *memcg, int item, long val),
TP_ARGS(memcg, item, val)
);
DEFINE_EVENT(memcg_rstat_stats, mod_memcg_lruvec_state,
- TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+ TP_PROTO(struct mem_cgroup *memcg, int item, long val),
TP_ARGS(memcg, item, val)
);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index e5cd2b80fd29..bdac0d685a98 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -294,7 +294,10 @@ TRACE_EVENT(track_foreign_dirty,
__entry->ino = inode ? inode->i_ino : 0;
__entry->memcg_id = wb->memcg_css->id;
__entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
+
+ rcu_read_lock();
__entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup);
+ rcu_read_unlock();
),
TP_printk("bdi %s[%llu]: ino=%llu memcg_id=%u cgroup_ino=%llu page_cgroup_ino=%llu",
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1f084ee71443..43adc96c7f1a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5999,8 +5999,9 @@ out_unlock:
*/
static void css_killed_work_fn(struct work_struct *work)
{
- struct cgroup_subsys_state *css =
- container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup_subsys_state *css;
+
+ css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork);
cgroup_lock();
@@ -6021,8 +6022,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
if (atomic_dec_and_test(&css->online_cnt)) {
- INIT_WORK(&css->destroy_work, css_killed_work_fn);
- queue_work(cgroup_offline_wq, &css->destroy_work);
+ INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
+ queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
}
}
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
index 532f455c5d4f..94762de1fe5f 100644
--- a/kernel/liveupdate/kexec_handover.c
+++ b/kernel/liveupdate/kexec_handover.c
@@ -18,7 +18,9 @@
#include <linux/kexec.h>
#include <linux/kexec_handover.h>
#include <linux/kho_radix_tree.h>
+#include <linux/utsname.h>
#include <linux/kho/abi/kexec_handover.h>
+#include <linux/kho/abi/kexec_metadata.h>
#include <linux/libfdt.h>
#include <linux/list.h>
#include <linux/memblock.h>
@@ -724,12 +726,13 @@ err_disable_kho:
}
/**
- * kho_add_subtree - record the physical address of a sub FDT in KHO root tree.
+ * kho_add_subtree - record the physical address of a sub blob in KHO root tree.
* @name: name of the sub tree.
- * @fdt: the sub tree blob.
+ * @blob: the sub tree blob.
+ * @size: size of the blob in bytes.
*
* Creates a new child node named @name in KHO root FDT and records
- * the physical address of @fdt. The pages of @fdt must also be preserved
+ * the physical address of @blob. The pages of @blob must also be preserved
* by KHO for the new kernel to retrieve it after kexec.
*
* A debugfs blob entry is also created at
@@ -738,10 +741,11 @@ err_disable_kho:
*
* Return: 0 on success, error code on failure
*/
-int kho_add_subtree(const char *name, void *fdt)
+int kho_add_subtree(const char *name, void *blob, size_t size)
{
- phys_addr_t phys = virt_to_phys(fdt);
+ phys_addr_t phys = virt_to_phys(blob);
void *root_fdt = kho_out.fdt;
+ u64 size_u64 = size;
int err = -ENOMEM;
int off, fdt_err;
@@ -758,12 +762,18 @@ int kho_add_subtree(const char *name, void *fdt)
goto out_pack;
}
- err = fdt_setprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME,
+ err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_PROP_NAME,
&phys, sizeof(phys));
if (err < 0)
goto out_pack;
- WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false));
+ err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_SIZE_PROP_NAME,
+ &size_u64, sizeof(size_u64));
+ if (err < 0)
+ goto out_pack;
+
+ WARN_ON_ONCE(kho_debugfs_blob_add(&kho_out.dbg, name, blob,
+ size, false));
out_pack:
fdt_pack(root_fdt);
@@ -772,9 +782,9 @@ out_pack:
}
EXPORT_SYMBOL_GPL(kho_add_subtree);
-void kho_remove_subtree(void *fdt)
+void kho_remove_subtree(void *blob)
{
- phys_addr_t target_phys = virt_to_phys(fdt);
+ phys_addr_t target_phys = virt_to_phys(blob);
void *root_fdt = kho_out.fdt;
int off;
int err;
@@ -790,13 +800,13 @@ void kho_remove_subtree(void *fdt)
const u64 *val;
int len;
- val = fdt_getprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME, &len);
+ val = fdt_getprop(root_fdt, off, KHO_SUB_TREE_PROP_NAME, &len);
if (!val || len != sizeof(phys_addr_t))
continue;
if ((phys_addr_t)*val == target_phys) {
fdt_del_node(root_fdt, off);
- kho_debugfs_fdt_remove(&kho_out.dbg, fdt);
+ kho_debugfs_blob_remove(&kho_out.dbg, blob);
break;
}
}
@@ -1260,6 +1270,8 @@ EXPORT_SYMBOL_GPL(kho_restore_free);
struct kho_in {
phys_addr_t fdt_phys;
phys_addr_t scratch_phys;
+ char previous_release[__NEW_UTS_LEN + 1];
+ u32 kexec_count;
struct kho_debugfs dbg;
};
@@ -1292,16 +1304,17 @@ bool is_kho_boot(void)
EXPORT_SYMBOL_GPL(is_kho_boot);
/**
- * kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
- * @name: the name of the sub FDT passed to kho_add_subtree().
- * @phys: if found, the physical address of the sub FDT is stored in @phys.
+ * kho_retrieve_subtree - retrieve a preserved sub blob by its name.
+ * @name: the name of the sub blob passed to kho_add_subtree().
+ * @phys: if found, the physical address of the sub blob is stored in @phys.
+ * @size: if not NULL and found, the size of the sub blob is stored in @size.
*
- * Retrieve a preserved sub FDT named @name and store its physical
- * address in @phys.
+ * Retrieve a preserved sub blob named @name and store its physical
+ * address in @phys and optionally its size in @size.
*
* Return: 0 on success, error code on failure
*/
-int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
+int kho_retrieve_subtree(const char *name, phys_addr_t *phys, size_t *size)
{
const void *fdt = kho_get_fdt();
const u64 *val;
@@ -1317,12 +1330,22 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
if (offset < 0)
return -ENOENT;
- val = fdt_getprop(fdt, offset, KHO_FDT_SUB_TREE_PROP_NAME, &len);
+ val = fdt_getprop(fdt, offset, KHO_SUB_TREE_PROP_NAME, &len);
if (!val || len != sizeof(*val))
return -EINVAL;
*phys = (phys_addr_t)*val;
+ val = fdt_getprop(fdt, offset, KHO_SUB_TREE_SIZE_PROP_NAME, &len);
+ if (!val || len != sizeof(*val)) {
+ pr_warn("broken KHO subnode '%s': missing or invalid blob-size property\n",
+ name);
+ return -EINVAL;
+ }
+
+ if (size)
+ *size = (size_t)*val;
+
return 0;
}
EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
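With the reworked kerneldoc above, subtrees are now opaque blobs with an explicit size on both the save and restore side. A hedged caller sketch, assuming a hypothetical "example" node name (the real callers are updated later in this series, e.g. memblock and test_kho):

static int example_kho_export(void *blob, size_t size)
{
	/* blob's pages must already be preserved, e.g. via kho_alloc_preserve() */
	return kho_add_subtree("example", blob, size);
}

static int example_kho_import(void)
{
	phys_addr_t phys;
	size_t size;
	int err;

	/* -ENOENT simply means the previous kernel did not export this node */
	err = kho_retrieve_subtree("example", &phys, &size);
	if (err)
		return err;

	pr_info("example blob at %pa, %zu bytes\n", &phys, size);
	return 0;
}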
@@ -1373,6 +1396,96 @@ static __init int kho_out_fdt_setup(void)
return err;
}
+static void __init kho_in_kexec_metadata(void)
+{
+ struct kho_kexec_metadata *metadata;
+ phys_addr_t metadata_phys;
+ size_t blob_size;
+ int err;
+
+ err = kho_retrieve_subtree(KHO_METADATA_NODE_NAME, &metadata_phys,
+ &blob_size);
+ if (err)
+ /* This is fine; the previous kernel didn't export metadata */

+ return;
+
+ /* Check that, at least, "version" is present */
+ if (blob_size < sizeof(u32)) {
+ pr_warn("kexec-metadata blob too small (%zu bytes)\n",
+ blob_size);
+ return;
+ }
+
+ metadata = phys_to_virt(metadata_phys);
+
+ if (metadata->version != KHO_KEXEC_METADATA_VERSION) {
+ pr_warn("kexec-metadata version %u not supported (expected %u)\n",
+ metadata->version, KHO_KEXEC_METADATA_VERSION);
+ return;
+ }
+
+ if (blob_size < sizeof(*metadata)) {
+ pr_warn("kexec-metadata blob too small for v%u (%zu < %zu)\n",
+ metadata->version, blob_size, sizeof(*metadata));
+ return;
+ }
+
+ /*
+ * Copy data to the kernel structure that will persist during
+ * kernel lifetime.
+ */
+ kho_in.kexec_count = metadata->kexec_count;
+ strscpy(kho_in.previous_release, metadata->previous_release,
+ sizeof(kho_in.previous_release));
+
+ pr_info("exec from: %s (count %u)\n",
+ kho_in.previous_release, kho_in.kexec_count);
+}
+
+/*
+ * Create kexec metadata to pass kernel version and boot count to the
+ * next kernel. This keeps the core KHO ABI minimal and allows the
+ * metadata format to evolve independently.
+ */
+static __init int kho_out_kexec_metadata(void)
+{
+ struct kho_kexec_metadata *metadata;
+ int err;
+
+ metadata = kho_alloc_preserve(sizeof(*metadata));
+ if (IS_ERR(metadata))
+ return PTR_ERR(metadata);
+
+ metadata->version = KHO_KEXEC_METADATA_VERSION;
+ strscpy(metadata->previous_release, init_uts_ns.name.release,
+ sizeof(metadata->previous_release));
+ /* kho_in.kexec_count is set to 0 on cold boot */
+ metadata->kexec_count = kho_in.kexec_count + 1;
+
+ err = kho_add_subtree(KHO_METADATA_NODE_NAME, metadata,
+ sizeof(*metadata));
+ if (err)
+ kho_unpreserve_free(metadata);
+
+ return err;
+}
+
+static int __init kho_kexec_metadata_init(const void *fdt)
+{
+ int err;
+
+ if (fdt)
+ kho_in_kexec_metadata();
+
+ /* Populate kexec metadata for the possible next kexec */
+ err = kho_out_kexec_metadata();
+ if (err)
+ pr_warn("failed to initialize kexec-metadata subtree: %d\n",
+ err);
+
+ return err;
+}
+
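The two helpers above only dereference three fields of the handed-over blob; the actual layout lives in <linux/kho/abi/kexec_metadata.h>, which is not part of this hunk. A plausible sketch, where everything beyond the leading version word (and the version value itself) is an assumption:

#define KHO_KEXEC_METADATA_VERSION	1	/* assumed value */

struct kho_kexec_metadata {
	u32 version;		/* checked first; blob must be at least sizeof(u32) */
	u32 kexec_count;	/* kexec reboots since the last cold boot */
	char previous_release[__NEW_UTS_LEN + 1];	/* uname -r of the exporting kernel */
};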
static __init int kho_init(void)
{
struct kho_radix_tree *tree = &kho_out.radix_tree;
@@ -1406,6 +1519,10 @@ static __init int kho_init(void)
if (err)
goto err_free_fdt;
+ err = kho_kexec_metadata_init(fdt);
+ if (err)
+ goto err_free_fdt;
+
if (fdt) {
kho_in_debugfs_init(&kho_in.dbg, fdt);
return 0;
@@ -1430,8 +1547,9 @@ static __init int kho_init(void)
init_cma_reserved_pageblock(pfn_to_page(pfn));
}
- WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt",
- kho_out.fdt, true));
+ WARN_ON_ONCE(kho_debugfs_blob_add(&kho_out.dbg, "fdt",
+ kho_out.fdt,
+ fdt_totalsize(kho_out.fdt), true));
return 0;
diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c
index acf368222682..257ee8a52be6 100644
--- a/kernel/liveupdate/kexec_handover_debugfs.c
+++ b/kernel/liveupdate/kexec_handover_debugfs.c
@@ -24,8 +24,9 @@ struct fdt_debugfs {
struct dentry *file;
};
-static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
- const char *name, const void *fdt)
+static int __kho_debugfs_blob_add(struct list_head *list, struct dentry *dir,
+ const char *name, const void *blob,
+ size_t size)
{
struct fdt_debugfs *f;
struct dentry *file;
@@ -34,8 +35,8 @@ static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
if (!f)
return -ENOMEM;
- f->wrapper.data = (void *)fdt;
- f->wrapper.size = fdt_totalsize(fdt);
+ f->wrapper.data = (void *)blob;
+ f->wrapper.size = size;
file = debugfs_create_blob(name, 0400, dir, &f->wrapper);
if (IS_ERR(file)) {
@@ -49,8 +50,8 @@ static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
return 0;
}
-int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
- const void *fdt, bool root)
+int kho_debugfs_blob_add(struct kho_debugfs *dbg, const char *name,
+ const void *blob, size_t size, bool root)
{
struct dentry *dir;
@@ -59,15 +60,15 @@ int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
else
dir = dbg->sub_fdt_dir;
- return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt);
+ return __kho_debugfs_blob_add(&dbg->fdt_list, dir, name, blob, size);
}
-void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt)
+void kho_debugfs_blob_remove(struct kho_debugfs *dbg, void *blob)
{
struct fdt_debugfs *ff;
list_for_each_entry(ff, &dbg->fdt_list, list) {
- if (ff->wrapper.data == fdt) {
+ if (ff->wrapper.data == blob) {
debugfs_remove(ff->file);
list_del(&ff->list);
kfree(ff);
@@ -113,28 +114,42 @@ __init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt)
goto err_rmdir;
}
- err = __kho_debugfs_fdt_add(&dbg->fdt_list, dir, "fdt", fdt);
+ err = __kho_debugfs_blob_add(&dbg->fdt_list, dir, "fdt", fdt,
+ fdt_totalsize(fdt));
if (err)
goto err_rmdir;
fdt_for_each_subnode(child, fdt, 0) {
int len = 0;
const char *name = fdt_get_name(fdt, child, NULL);
- const u64 *fdt_phys;
+ const u64 *blob_phys;
+ const u64 *blob_size;
+ void *blob;
- fdt_phys = fdt_getprop(fdt, child, KHO_FDT_SUB_TREE_PROP_NAME, &len);
- if (!fdt_phys)
+ blob_phys = fdt_getprop(fdt, child,
+ KHO_SUB_TREE_PROP_NAME, &len);
+ if (!blob_phys)
continue;
- if (len != sizeof(*fdt_phys)) {
- pr_warn("node %s prop fdt has invalid length: %d\n",
- name, len);
+ if (len != sizeof(*blob_phys)) {
+ pr_warn("node %s prop %s has invalid length: %d\n",
+ name, KHO_SUB_TREE_PROP_NAME, len);
continue;
}
- err = __kho_debugfs_fdt_add(&dbg->fdt_list, sub_fdt_dir, name,
- phys_to_virt(*fdt_phys));
+
+ blob_size = fdt_getprop(fdt, child,
+ KHO_SUB_TREE_SIZE_PROP_NAME, &len);
+ if (!blob_size || len != sizeof(*blob_size)) {
+ pr_warn("node %s missing or invalid %s property\n",
+ name, KHO_SUB_TREE_SIZE_PROP_NAME);
+ continue;
+ }
+
+ blob = phys_to_virt(*blob_phys);
+ err = __kho_debugfs_blob_add(&dbg->fdt_list, sub_fdt_dir, name,
+ blob, *blob_size);
if (err) {
- pr_warn("failed to add fdt %s to debugfs: %pe\n", name,
- ERR_PTR(err));
+ pr_warn("failed to add blob %s to debugfs: %pe\n",
+ name, ERR_PTR(err));
continue;
}
}
diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h
index 9a832a35254c..0399ff107775 100644
--- a/kernel/liveupdate/kexec_handover_internal.h
+++ b/kernel/liveupdate/kexec_handover_internal.h
@@ -26,18 +26,19 @@ extern unsigned int kho_scratch_cnt;
int kho_debugfs_init(void);
void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt);
int kho_out_debugfs_init(struct kho_debugfs *dbg);
-int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
- const void *fdt, bool root);
-void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt);
+int kho_debugfs_blob_add(struct kho_debugfs *dbg, const char *name,
+ const void *blob, size_t size, bool root);
+void kho_debugfs_blob_remove(struct kho_debugfs *dbg, void *blob);
#else
static inline int kho_debugfs_init(void) { return 0; }
static inline void kho_in_debugfs_init(struct kho_debugfs *dbg,
const void *fdt) { }
static inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; }
-static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
- const void *fdt, bool root) { return 0; }
-static inline void kho_debugfs_fdt_remove(struct kho_debugfs *dbg,
- void *fdt) { }
+static inline int kho_debugfs_blob_add(struct kho_debugfs *dbg,
+ const char *name, const void *blob,
+ size_t size, bool root) { return 0; }
+static inline void kho_debugfs_blob_remove(struct kho_debugfs *dbg,
+ void *blob) { }
#endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */
#ifdef CONFIG_KEXEC_HANDOVER_DEBUG
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
index 84ac728d63ba..803f51c84275 100644
--- a/kernel/liveupdate/luo_core.c
+++ b/kernel/liveupdate/luo_core.c
@@ -54,6 +54,7 @@
#include <linux/liveupdate.h>
#include <linux/miscdevice.h>
#include <linux/mm.h>
+#include <linux/rwsem.h>
#include <linux/sizes.h>
#include <linux/string.h>
#include <linux/unaligned.h>
@@ -68,6 +69,11 @@ static struct {
u64 liveupdate_num;
} luo_global;
+/*
+ * luo_register_rwlock - Protects registration of file handlers and FLBs.
+ */
+DECLARE_RWSEM(luo_register_rwlock);
+
static int __init early_liveupdate_param(char *buf)
{
return kstrtobool(buf, &luo_global.enabled);
@@ -88,7 +94,7 @@ static int __init luo_early_startup(void)
}
/* Retrieve LUO subtree, and verify its format. */
- err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys);
+ err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys, NULL);
if (err) {
if (err != -ENOENT) {
pr_err("failed to retrieve FDT '%s' from KHO: %pe\n",
@@ -172,7 +178,8 @@ static int __init luo_fdt_setup(void)
if (err)
goto exit_free;
- err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out);
+ err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out,
+ fdt_totalsize(fdt_out));
if (err)
goto exit_free;
luo_global.fdt_out = fdt_out;
diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
index 5acee4174bf0..a0a419085e28 100644
--- a/kernel/liveupdate/luo_file.c
+++ b/kernel/liveupdate/luo_file.c
@@ -108,12 +108,16 @@
#include <linux/liveupdate.h>
#include <linux/module.h>
#include <linux/sizes.h>
+#include <linux/xarray.h>
#include <linux/slab.h>
#include <linux/string.h>
#include "luo_internal.h"
static LIST_HEAD(luo_file_handler_list);
+/* Keep track of files being preserved by LUO */
+static DEFINE_XARRAY(luo_preserved_files);
+
/* 2 4K pages, give space for 128 files per file_set */
#define LUO_FILE_PGCNT 2ul
#define LUO_FILE_MAX \
@@ -203,6 +207,12 @@ static void luo_free_files_mem(struct luo_file_set *file_set)
file_set->files = NULL;
}
+static unsigned long luo_get_id(struct liveupdate_file_handler *fh,
+ struct file *file)
+{
+ return fh->ops->get_id ? fh->ops->get_id(file) : (unsigned long)file;
+}
+
static bool luo_token_is_used(struct luo_file_set *file_set, u64 token)
{
struct luo_file *iter;
@@ -248,6 +258,7 @@ static bool luo_token_is_used(struct luo_file_set *file_set, u64 token)
* Context: Can be called from an ioctl handler during normal system operation.
* Return: 0 on success. Returns a negative errno on failure:
* -EEXIST if the token is already used.
+ * -EBUSY if the file descriptor is already preserved by another session.
* -EBADF if the file descriptor is invalid.
* -ENOSPC if the file_set is full.
* -ENOENT if no compatible handler is found.
@@ -277,20 +288,28 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
goto err_fput;
err = -ENOENT;
+ down_read(&luo_register_rwlock);
list_private_for_each_entry(fh, &luo_file_handler_list, list) {
if (fh->ops->can_preserve(fh, file)) {
- err = 0;
+ if (try_module_get(fh->ops->owner))
+ err = 0;
break;
}
}
+ up_read(&luo_register_rwlock);
/* err is still -ENOENT if no handler was found */
if (err)
goto err_free_files_mem;
+ err = xa_insert(&luo_preserved_files, luo_get_id(fh, file),
+ file, GFP_KERNEL);
+ if (err)
+ goto err_module_put;
+
err = luo_flb_file_preserve(fh);
if (err)
- goto err_free_files_mem;
+ goto err_erase_xa;
luo_file = kzalloc_obj(*luo_file);
if (!luo_file) {
@@ -320,6 +339,10 @@ err_kfree:
kfree(luo_file);
err_flb_unpreserve:
luo_flb_file_unpreserve(fh);
+err_erase_xa:
+ xa_erase(&luo_preserved_files, luo_get_id(fh, file));
+err_module_put:
+ module_put(fh->ops->owner);
err_free_files_mem:
luo_free_files_mem(file_set);
err_fput:
@@ -362,7 +385,10 @@ void luo_file_unpreserve_files(struct luo_file_set *file_set)
args.private_data = luo_file->private_data;
luo_file->fh->ops->unpreserve(&args);
luo_flb_file_unpreserve(luo_file->fh);
+ module_put(luo_file->fh->ops->owner);
+ xa_erase(&luo_preserved_files,
+ luo_get_id(luo_file->fh, luo_file->file));
list_del(&luo_file->list);
file_set->count--;
@@ -606,6 +632,11 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
luo_file->file = args.file;
/* Get reference so we can keep this file in LUO until finish */
get_file(luo_file->file);
+
+ WARN_ON(xa_insert(&luo_preserved_files,
+ luo_get_id(luo_file->fh, luo_file->file),
+ luo_file->file, GFP_KERNEL));
+
*filep = luo_file->file;
luo_file->retrieve_status = 1;
@@ -646,6 +677,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set,
luo_file->fh->ops->finish(&args);
luo_flb_file_finish(luo_file->fh);
+ module_put(luo_file->fh->ops->owner);
}
/**
@@ -701,8 +733,11 @@ int luo_file_finish(struct luo_file_set *file_set)
luo_file_finish_one(file_set, luo_file);
- if (luo_file->file)
+ if (luo_file->file) {
+ xa_erase(&luo_preserved_files,
+ luo_get_id(luo_file->fh, luo_file->file));
fput(luo_file->file);
+ }
list_del(&luo_file->list);
file_set->count--;
mutex_destroy(&luo_file->mutex);
@@ -777,22 +812,28 @@ int luo_file_deserialize(struct luo_file_set *file_set,
bool handler_found = false;
struct luo_file *luo_file;
+ down_read(&luo_register_rwlock);
list_private_for_each_entry(fh, &luo_file_handler_list, list) {
if (!strcmp(fh->compatible, file_ser[i].compatible)) {
- handler_found = true;
+ if (try_module_get(fh->ops->owner))
+ handler_found = true;
break;
}
}
+ up_read(&luo_register_rwlock);
if (!handler_found) {
- pr_warn("No registered handler for compatible '%s'\n",
+ pr_warn("No registered handler for compatible '%.*s'\n",
+ (int)sizeof(file_ser[i].compatible),
file_ser[i].compatible);
return -ENOENT;
}
luo_file = kzalloc_obj(*luo_file);
- if (!luo_file)
+ if (!luo_file) {
+ module_put(fh->ops->owner);
return -ENOMEM;
+ }
luo_file->fh = fh;
luo_file->file = NULL;
@@ -842,41 +883,28 @@ int liveupdate_register_file_handler(struct liveupdate_file_handler *fh)
return -EINVAL;
}
- /*
- * Ensure the system is quiescent (no active sessions).
- * This prevents registering new handlers while sessions are active or
- * while deserialization is in progress.
- */
- if (!luo_session_quiesce())
- return -EBUSY;
-
+ down_write(&luo_register_rwlock);
/* Check for duplicate compatible strings */
list_private_for_each_entry(fh_iter, &luo_file_handler_list, list) {
if (!strcmp(fh_iter->compatible, fh->compatible)) {
pr_err("File handler registration failed: Compatible string '%s' already registered.\n",
fh->compatible);
err = -EEXIST;
- goto err_resume;
+ goto err_unlock;
}
}
- /* Pin the module implementing the handler */
- if (!try_module_get(fh->ops->owner)) {
- err = -EAGAIN;
- goto err_resume;
- }
-
INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, flb_list));
INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, list));
list_add_tail(&ACCESS_PRIVATE(fh, list), &luo_file_handler_list);
- luo_session_resume();
+ up_write(&luo_register_rwlock);
liveupdate_test_register(fh);
return 0;
-err_resume:
- luo_session_resume();
+err_unlock:
+ up_write(&luo_register_rwlock);
return err;
}
@@ -886,41 +914,13 @@ err_resume:
*
* Unregisters the file handler from the liveupdate core. This function
* reverses the operations of liveupdate_register_file_handler().
- *
- * It ensures safe removal by checking that:
- * No live update session is currently in progress.
- * No FLB registered with this file handler.
- *
- * If the unregistration fails, the internal test state is reverted.
- *
- * Return: 0 Success. -EOPNOTSUPP when live update is not enabled. -EBUSY A live
- * update is in progress, can't quiesce live update or FLB is registred with
- * this file handler.
*/
-int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
+void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
{
- int err = -EBUSY;
-
if (!liveupdate_enabled())
- return -EOPNOTSUPP;
-
- liveupdate_test_unregister(fh);
-
- if (!luo_session_quiesce())
- goto err_register;
-
- if (!list_empty(&ACCESS_PRIVATE(fh, flb_list)))
- goto err_resume;
+ return;
+ guard(rwsem_write)(&luo_register_rwlock);
+ luo_flb_unregister_all(fh);
list_del(&ACCESS_PRIVATE(fh, list));
- module_put(fh->ops->owner);
- luo_session_resume();
-
- return 0;
-
-err_resume:
- luo_session_resume();
-err_register:
- liveupdate_test_register(fh);
- return err;
}
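The -EBUSY case documented above comes straight from the xarray: luo_preserve_file() keys every preserved file in luo_preserved_files, and xa_insert() refuses to overwrite an occupied index. A stripped-down sketch of that dedup step, with the id reduced to the same fallback luo_get_id() uses when a handler has no ->get_id():

#include <linux/fs.h>
#include <linux/xarray.h>

static DEFINE_XARRAY(example_preserved);

static int example_track_file(struct file *file)
{
	unsigned long id = (unsigned long)file;	/* fallback id */

	/* xa_insert() returns -EBUSY if the index is already populated */
	return xa_insert(&example_preserved, id, file, GFP_KERNEL);
}

static void example_untrack_file(struct file *file)
{
	xa_erase(&example_preserved, (unsigned long)file);
}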
diff --git a/kernel/liveupdate/luo_flb.c b/kernel/liveupdate/luo_flb.c
index f52e8114837e..00f5494812c4 100644
--- a/kernel/liveupdate/luo_flb.c
+++ b/kernel/liveupdate/luo_flb.c
@@ -89,13 +89,18 @@ struct luo_flb_link {
static struct luo_flb_private *luo_flb_get_private(struct liveupdate_flb *flb)
{
struct luo_flb_private *private = &ACCESS_PRIVATE(flb, private);
+ static DEFINE_SPINLOCK(luo_flb_init_lock);
+ if (smp_load_acquire(&private->initialized))
+ return private;
+
+ guard(spinlock)(&luo_flb_init_lock);
if (!private->initialized) {
mutex_init(&private->incoming.lock);
mutex_init(&private->outgoing.lock);
INIT_LIST_HEAD(&private->list);
private->users = 0;
- private->initialized = true;
+ smp_store_release(&private->initialized, true);
}
return private;
@@ -110,10 +115,15 @@ static int luo_flb_file_preserve_one(struct liveupdate_flb *flb)
struct liveupdate_flb_op_args args = {0};
int err;
+ if (!try_module_get(flb->ops->owner))
+ return -ENODEV;
+
args.flb = flb;
err = flb->ops->preserve(&args);
- if (err)
+ if (err) {
+ module_put(flb->ops->owner);
return err;
+ }
private->outgoing.data = args.data;
private->outgoing.obj = args.obj;
}
@@ -141,6 +151,7 @@ static void luo_flb_file_unpreserve_one(struct liveupdate_flb *flb)
private->outgoing.data = 0;
private->outgoing.obj = NULL;
+ module_put(flb->ops->owner);
}
}
}
@@ -176,12 +187,17 @@ static int luo_flb_retrieve_one(struct liveupdate_flb *flb)
if (!found)
return -ENOENT;
+ if (!try_module_get(flb->ops->owner))
+ return -ENODEV;
+
args.flb = flb;
args.data = private->incoming.data;
err = flb->ops->retrieve(&args);
- if (err)
+ if (err) {
+ module_put(flb->ops->owner);
return err;
+ }
private->incoming.obj = args.obj;
private->incoming.retrieved = true;
@@ -215,6 +231,7 @@ static void luo_flb_file_finish_one(struct liveupdate_flb *flb)
private->incoming.data = 0;
private->incoming.obj = NULL;
private->incoming.finished = true;
+ module_put(flb->ops->owner);
}
}
}
@@ -240,17 +257,20 @@ int luo_flb_file_preserve(struct liveupdate_file_handler *fh)
struct luo_flb_link *iter;
int err = 0;
+ down_read(&luo_register_rwlock);
list_for_each_entry(iter, flb_list, list) {
err = luo_flb_file_preserve_one(iter->flb);
if (err)
goto exit_err;
}
+ up_read(&luo_register_rwlock);
return 0;
exit_err:
list_for_each_entry_continue_reverse(iter, flb_list, list)
luo_flb_file_unpreserve_one(iter->flb);
+ up_read(&luo_register_rwlock);
return err;
}
@@ -272,6 +292,7 @@ void luo_flb_file_unpreserve(struct liveupdate_file_handler *fh)
struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list);
struct luo_flb_link *iter;
+ guard(rwsem_read)(&luo_register_rwlock);
list_for_each_entry_reverse(iter, flb_list, list)
luo_flb_file_unpreserve_one(iter->flb);
}
@@ -292,10 +313,67 @@ void luo_flb_file_finish(struct liveupdate_file_handler *fh)
struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list);
struct luo_flb_link *iter;
+ guard(rwsem_read)(&luo_register_rwlock);
list_for_each_entry_reverse(iter, flb_list, list)
luo_flb_file_finish_one(iter->flb);
}
+static void luo_flb_unregister_one(struct liveupdate_file_handler *fh,
+ struct liveupdate_flb *flb)
+{
+ struct luo_flb_private *private = luo_flb_get_private(flb);
+ struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list);
+ struct luo_flb_link *iter;
+ bool found = false;
+
+ /* Find and remove the link from the file handler's list */
+ list_for_each_entry(iter, flb_list, list) {
+ if (iter->flb == flb) {
+ list_del(&iter->list);
+ kfree(iter);
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ pr_warn("Failed to unregister FLB '%s': not found in file handler '%s'\n",
+ flb->compatible, fh->compatible);
+ return;
+ }
+
+ private->users--;
+
+ /*
+ * If this is the last file handler with which we are registered, remove
+ * from the global list.
+ */
+ if (!private->users) {
+ list_del_init(&private->list);
+ luo_flb_global.count--;
+ }
+}
+
+/**
+ * luo_flb_unregister_all - Unregister all FLBs associated with a file handler.
+ * @fh: The file handler whose FLBs should be unregistered.
+ *
+ * This function iterates through the list of FLBs associated with the given
+ * file handler and unregisters them all one by one.
+ */
+void luo_flb_unregister_all(struct liveupdate_file_handler *fh)
+{
+ struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list);
+ struct luo_flb_link *iter, *tmp;
+
+ if (!liveupdate_enabled())
+ return;
+
+ lockdep_assert_held_write(&luo_register_rwlock);
+ list_for_each_entry_safe(iter, tmp, flb_list, list)
+ luo_flb_unregister_one(fh, iter->flb);
+}
+
/**
* liveupdate_register_flb - Associate an FLB with a file handler and register it globally.
* @fh: The file handler that will now depend on the FLB.
@@ -326,7 +404,6 @@ int liveupdate_register_flb(struct liveupdate_file_handler *fh,
struct luo_flb_link *link __free(kfree) = NULL;
struct liveupdate_flb *gflb;
struct luo_flb_link *iter;
- int err;
if (!liveupdate_enabled())
return -EOPNOTSUPP;
@@ -347,19 +424,12 @@ int liveupdate_register_flb(struct liveupdate_file_handler *fh,
if (!link)
return -ENOMEM;
- /*
- * Ensure the system is quiescent (no active sessions).
- * This acts as a global lock for registration: no other thread can
- * be in this section, and no sessions can be creating/using FDs.
- */
- if (!luo_session_quiesce())
- return -EBUSY;
+ guard(rwsem_write)(&luo_register_rwlock);
/* Check that this FLB is not already linked to this file handler */
- err = -EEXIST;
list_for_each_entry(iter, flb_list, list) {
if (iter->flb == flb)
- goto err_resume;
+ return -EEXIST;
}
/*
@@ -367,25 +437,16 @@ int liveupdate_register_flb(struct liveupdate_file_handler *fh,
* is registered
*/
if (!private->users) {
- if (WARN_ON(!list_empty(&private->list))) {
- err = -EINVAL;
- goto err_resume;
- }
+ if (WARN_ON(!list_empty(&private->list)))
+ return -EINVAL;
- if (luo_flb_global.count == LUO_FLB_MAX) {
- err = -ENOSPC;
- goto err_resume;
- }
+ if (luo_flb_global.count == LUO_FLB_MAX)
+ return -ENOSPC;
/* Check that compatible string is unique in global list */
list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) {
if (!strcmp(gflb->compatible, flb->compatible))
- goto err_resume;
- }
-
- if (!try_module_get(flb->ops->owner)) {
- err = -EAGAIN;
- goto err_resume;
+ return -EEXIST;
}
list_add_tail(&private->list, &luo_flb_global.list);
@@ -396,13 +457,8 @@ int liveupdate_register_flb(struct liveupdate_file_handler *fh,
private->users++;
link->flb = flb;
list_add_tail(&no_free_ptr(link)->list, flb_list);
- luo_session_resume();
return 0;
-
-err_resume:
- luo_session_resume();
- return err;
}
/**
@@ -418,63 +474,17 @@ err_resume:
* the FLB is removed from the global registry and the reference to its
* owner module (acquired during registration) is released.
*
- * Context: This function ensures the session is quiesced (no active FDs
- * being created) during the update. It is typically called from a
- * subsystem's module exit function.
- * Return: 0 on success.
- * -EOPNOTSUPP if live update is disabled.
- * -EBUSY if the live update session is active and cannot be quiesced.
- * -ENOENT if the FLB was not found in the file handler's list.
+ * Context: It is typically called from a subsystem's module exit function.
*/
-int liveupdate_unregister_flb(struct liveupdate_file_handler *fh,
- struct liveupdate_flb *flb)
+void liveupdate_unregister_flb(struct liveupdate_file_handler *fh,
+ struct liveupdate_flb *flb)
{
- struct luo_flb_private *private = luo_flb_get_private(flb);
- struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list);
- struct luo_flb_link *iter;
- int err = -ENOENT;
-
if (!liveupdate_enabled())
- return -EOPNOTSUPP;
+ return;
- /*
- * Ensure the system is quiescent (no active sessions).
- * This acts as a global lock for unregistration.
- */
- if (!luo_session_quiesce())
- return -EBUSY;
+ guard(rwsem_write)(&luo_register_rwlock);
- /* Find and remove the link from the file handler's list */
- list_for_each_entry(iter, flb_list, list) {
- if (iter->flb == flb) {
- list_del(&iter->list);
- kfree(iter);
- err = 0;
- break;
- }
- }
-
- if (err)
- goto err_resume;
-
- private->users--;
- /*
- * If this is the last file-handler with which we are registred, remove
- * from the global list, and relese module reference.
- */
- if (!private->users) {
- list_del_init(&private->list);
- luo_flb_global.count--;
- module_put(flb->ops->owner);
- }
-
- luo_session_resume();
-
- return 0;
-
-err_resume:
- luo_session_resume();
- return err;
+ luo_flb_unregister_one(fh, flb);
}
/**
@@ -492,7 +502,8 @@ err_resume:
*
* Return: 0 on success, or a negative errno on failure. -ENODATA means no
* incoming FLB data, -ENOENT means specific flb not found in the incoming
- * data, and -EOPNOTSUPP when live update is disabled or not configured.
+ * data, -ENODEV if the FLB's module is unloading, and -EOPNOTSUPP when
+ * live update is disabled or not configured.
*/
int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp)
{
@@ -638,6 +649,7 @@ void luo_flb_serialize(void)
struct liveupdate_flb *gflb;
int i = 0;
+ guard(rwsem_read)(&luo_register_rwlock);
list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) {
struct luo_flb_private *private = luo_flb_get_private(gflb);
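luo_flb_get_private() above now uses a classic initialize-once pattern: an acquire-load fast path, and a spinlock-serialized slow path that publishes the flag with a release store so later readers also see the fields initialized before it. A generic sketch of the same pattern (names are illustrative, not LUO API):

#include <linux/cleanup.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>

struct example_priv {
	bool initialized;
	struct mutex lock;
};

static struct example_priv *example_get_priv(struct example_priv *p)
{
	static DEFINE_SPINLOCK(init_lock);

	/* Fast path: a prior release store guarantees ->lock is usable */
	if (smp_load_acquire(&p->initialized))
		return p;

	guard(spinlock)(&init_lock);
	if (!p->initialized) {
		mutex_init(&p->lock);
		/* Publish only after every field above has been set up */
		smp_store_release(&p->initialized, true);
	}
	return p;
}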
diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
index 8083d8739b09..875844d7a41d 100644
--- a/kernel/liveupdate/luo_internal.h
+++ b/kernel/liveupdate/luo_internal.h
@@ -77,14 +77,14 @@ struct luo_session {
struct mutex mutex;
};
+extern struct rw_semaphore luo_register_rwlock;
+
int luo_session_create(const char *name, struct file **filep);
int luo_session_retrieve(const char *name, struct file **filep);
int __init luo_session_setup_outgoing(void *fdt);
int __init luo_session_setup_incoming(void *fdt);
int luo_session_serialize(void);
int luo_session_deserialize(void);
-bool luo_session_quiesce(void);
-void luo_session_resume(void);
int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd);
void luo_file_unpreserve_files(struct luo_file_set *file_set);
@@ -103,16 +103,15 @@ void luo_file_set_destroy(struct luo_file_set *file_set);
int luo_flb_file_preserve(struct liveupdate_file_handler *fh);
void luo_flb_file_unpreserve(struct liveupdate_file_handler *fh);
void luo_flb_file_finish(struct liveupdate_file_handler *fh);
+void luo_flb_unregister_all(struct liveupdate_file_handler *fh);
int __init luo_flb_setup_outgoing(void *fdt);
int __init luo_flb_setup_incoming(void *fdt);
void luo_flb_serialize(void);
#ifdef CONFIG_LIVEUPDATE_TEST
void liveupdate_test_register(struct liveupdate_file_handler *fh);
-void liveupdate_test_unregister(struct liveupdate_file_handler *fh);
#else
static inline void liveupdate_test_register(struct liveupdate_file_handler *fh) { }
-static inline void liveupdate_test_unregister(struct liveupdate_file_handler *fh) { }
#endif
#endif /* _LINUX_LUO_INTERNAL_H */
diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c
index 25ae704d7787..a3327a28fc1f 100644
--- a/kernel/liveupdate/luo_session.c
+++ b/kernel/liveupdate/luo_session.c
@@ -544,7 +544,8 @@ int luo_session_deserialize(void)
session = luo_session_alloc(sh->ser[i].name);
if (IS_ERR(session)) {
- pr_warn("Failed to allocate session [%s] during deserialization %pe\n",
+ pr_warn("Failed to allocate session [%.*s] during deserialization %pe\n",
+ (int)sizeof(sh->ser[i].name),
sh->ser[i].name, session);
return PTR_ERR(session);
}
@@ -606,46 +607,3 @@ err_undo:
return err;
}
-/**
- * luo_session_quiesce - Ensure no active sessions exist and lock session lists.
- *
- * Acquires exclusive write locks on both incoming and outgoing session lists.
- * It then validates no sessions exist in either list.
- *
- * This mechanism is used during file handler un/registration to ensure that no
- * sessions are currently using the handler, and no new sessions can be created
- * while un/registration is in progress.
- *
- * This prevents registering new handlers while sessions are active or
- * while deserialization is in progress.
- *
- * Return:
- * true - System is quiescent (0 sessions) and locked.
- * false - Active sessions exist. The locks are released internally.
- */
-bool luo_session_quiesce(void)
-{
- down_write(&luo_session_global.incoming.rwsem);
- down_write(&luo_session_global.outgoing.rwsem);
-
- if (luo_session_global.incoming.count ||
- luo_session_global.outgoing.count) {
- up_write(&luo_session_global.outgoing.rwsem);
- up_write(&luo_session_global.incoming.rwsem);
- return false;
- }
-
- return true;
-}
-
-/**
- * luo_session_resume - Unlock session lists and resume normal activity.
- *
- * Releases the exclusive locks acquired by a successful call to
- * luo_session_quiesce().
- */
-void luo_session_resume(void)
-{
- up_write(&luo_session_global.outgoing.rwsem);
- up_write(&luo_session_global.incoming.rwsem);
-}
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 58991ab09d84..ed1bdcf1f8ab 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -6,7 +6,9 @@
#include <linux/kallsyms.h>
#include <linux/module.h>
#include <linux/page_ext.h>
+#include <linux/pgalloc_tag.h>
#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
#include <linux/seq_buf.h>
#include <linux/seq_file.h>
#include <linux/string_choices.h>
@@ -758,8 +760,115 @@ static __init bool need_page_alloc_tagging(void)
return mem_profiling_support;
}
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+/*
+ * Track page allocations before page_ext is initialized.
+ * Some pages are allocated before page_ext becomes available, leaving
+ * their codetag uninitialized. Track these early PFNs so we can clear
+ * their codetag refs later to avoid warnings when they are freed.
+ *
+ * Early allocations include:
+ * - Base allocations independent of CPU count
+ * - Per-CPU allocations (e.g., CPU hotplug callbacks during smp_init,
+ * such as trace ring buffers, scheduler per-cpu data)
+ *
+ * For simplicity, we fix the size to 8192.
+ * If insufficient, a warning will be triggered to alert the user.
+ *
+ * TODO: Replace fixed-size array with dynamic allocation using
+ * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion.
+ */
+#define EARLY_ALLOC_PFN_MAX 8192
+
+static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata;
+static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0);
+
+static void __init __alloc_tag_add_early_pfn(unsigned long pfn)
+{
+ int old_idx, new_idx;
+
+ do {
+ old_idx = atomic_read(&early_pfn_count);
+ if (old_idx >= EARLY_ALLOC_PFN_MAX) {
+ pr_warn_once("Early page allocations before page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n",
+ EARLY_ALLOC_PFN_MAX);
+ return;
+ }
+ new_idx = old_idx + 1;
+ } while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, new_idx));
+
+ early_pfns[old_idx] = pfn;
+}
+
+typedef void alloc_tag_add_func(unsigned long pfn);
+static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr __refdata =
+ RCU_INITIALIZER(__alloc_tag_add_early_pfn);
+
+void alloc_tag_add_early_pfn(unsigned long pfn)
+{
+ alloc_tag_add_func *alloc_tag_add;
+
+ if (static_key_enabled(&mem_profiling_compressed))
+ return;
+
+ rcu_read_lock();
+ alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr);
+ if (alloc_tag_add)
+ alloc_tag_add(pfn);
+ rcu_read_unlock();
+}
+
+static void __init clear_early_alloc_pfn_tag_refs(void)
+{
+ unsigned int i;
+
+ if (static_key_enabled(&mem_profiling_compressed))
+ return;
+
+ rcu_assign_pointer(alloc_tag_add_early_pfn_ptr, NULL);
+ /* Make sure we are not racing with __alloc_tag_add_early_pfn() */
+ synchronize_rcu();
+
+ for (i = 0; i < atomic_read(&early_pfn_count); i++) {
+ unsigned long pfn = early_pfns[i];
+
+ if (pfn_valid(pfn)) {
+ struct page *page = pfn_to_page(pfn);
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ /*
+ * An early-allocated page could be freed and reallocated
+ * after its page_ext is initialized but before we clear it.
+ * In that case, it already has a valid tag set.
+ * We should not overwrite that valid tag with CODETAG_EMPTY.
+ *
+ * Note: there is still a small race window between checking
+ * ref.ct and calling set_codetag_empty(). We accept this
+ * race as it's unlikely and the extra complexity of atomic
+ * cmpxchg is not worth it for this debug-only code path.
+ */
+ if (ref.ct) {
+ put_page_tag_ref(handle);
+ continue;
+ }
+
+ set_codetag_empty(&ref);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+ }
+
+ }
+}
+#else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */
+static inline void __init clear_early_alloc_pfn_tag_refs(void) {}
+#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
+
static __init void init_page_alloc_tagging(void)
{
+ clear_early_alloc_pfn_tag_refs();
}
struct page_ext_operations page_alloc_tagging_ops = {
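clear_early_alloc_pfn_tag_refs() above retires the early hook with the standard RCU publish/synchronize dance: readers sample the function pointer under rcu_read_lock(), and the teardown path NULLs it and waits one grace period before touching the now-private data. A generic sketch of that pattern (hook name and argument are illustrative):

#include <linux/rcupdate.h>

typedef void hook_fn(unsigned long arg);

static void default_hook(unsigned long arg) { }
static hook_fn __rcu *hook_ptr = RCU_INITIALIZER(default_hook);

static void call_hook(unsigned long arg)
{
	hook_fn *fn;

	rcu_read_lock();
	fn = rcu_dereference(hook_ptr);
	if (fn)
		fn(arg);
	rcu_read_unlock();
}

static void retire_hook(void)
{
	rcu_assign_pointer(hook_ptr, NULL);
	/* After this returns, no CPU can still be executing the old hook */
	synchronize_rcu();
}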
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 0964d53365e6..213504915737 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -185,11 +185,73 @@ static int dmirror_fops_open(struct inode *inode, struct file *filp)
return 0;
}
+static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk)
+{
+ unsigned long start_pfn = chunk->pagemap.range.start >> PAGE_SHIFT;
+ unsigned long end_pfn = chunk->pagemap.range.end >> PAGE_SHIFT;
+ unsigned long npages = end_pfn - start_pfn + 1;
+ unsigned long i;
+ unsigned long *src_pfns;
+ unsigned long *dst_pfns;
+ unsigned int order = 0;
+
+ src_pfns = kvcalloc(npages, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL);
+ dst_pfns = kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL);
+
+ migrate_device_range(src_pfns, start_pfn, npages);
+ for (i = 0; i < npages; i++) {
+ struct page *dpage, *spage;
+
+ spage = migrate_pfn_to_page(src_pfns[i]);
+ if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ if (WARN_ON(!is_device_private_page(spage) &&
+ !is_device_coherent_page(spage)))
+ continue;
+
+ order = folio_order(page_folio(spage));
+ spage = BACKING_PAGE(spage);
+ if (src_pfns[i] & MIGRATE_PFN_COMPOUND) {
+ dpage = folio_page(folio_alloc(GFP_HIGHUSER_MOVABLE,
+ order), 0);
+ } else {
+ dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL);
+ order = 0;
+ }
+
+ /* TODO Support splitting here */
+ lock_page(dpage);
+ dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
+ if (src_pfns[i] & MIGRATE_PFN_WRITE)
+ dst_pfns[i] |= MIGRATE_PFN_WRITE;
+ if (order)
+ dst_pfns[i] |= MIGRATE_PFN_COMPOUND;
+ folio_copy(page_folio(dpage), page_folio(spage));
+ }
+ migrate_device_pages(src_pfns, dst_pfns, npages);
+ migrate_device_finalize(src_pfns, dst_pfns, npages);
+ kvfree(src_pfns);
+ kvfree(dst_pfns);
+}
+
static int dmirror_fops_release(struct inode *inode, struct file *filp)
{
struct dmirror *dmirror = filp->private_data;
+ struct dmirror_device *mdevice = dmirror->mdevice;
+ int i;
mmu_interval_notifier_remove(&dmirror->notifier);
+
+ if (mdevice->devmem_chunks) {
+ for (i = 0; i < mdevice->devmem_count; i++) {
+ struct dmirror_chunk *devmem =
+ mdevice->devmem_chunks[i];
+
+ dmirror_device_evict_chunk(devmem);
+ }
+ }
+
xa_destroy(&dmirror->pt);
kfree(dmirror);
return 0;
@@ -1377,56 +1439,6 @@ static int dmirror_snapshot(struct dmirror *dmirror,
return ret;
}
-static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk)
-{
- unsigned long start_pfn = chunk->pagemap.range.start >> PAGE_SHIFT;
- unsigned long end_pfn = chunk->pagemap.range.end >> PAGE_SHIFT;
- unsigned long npages = end_pfn - start_pfn + 1;
- unsigned long i;
- unsigned long *src_pfns;
- unsigned long *dst_pfns;
- unsigned int order = 0;
-
- src_pfns = kvcalloc(npages, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL);
- dst_pfns = kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL);
-
- migrate_device_range(src_pfns, start_pfn, npages);
- for (i = 0; i < npages; i++) {
- struct page *dpage, *spage;
-
- spage = migrate_pfn_to_page(src_pfns[i]);
- if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE))
- continue;
-
- if (WARN_ON(!is_device_private_page(spage) &&
- !is_device_coherent_page(spage)))
- continue;
-
- order = folio_order(page_folio(spage));
- spage = BACKING_PAGE(spage);
- if (src_pfns[i] & MIGRATE_PFN_COMPOUND) {
- dpage = folio_page(folio_alloc(GFP_HIGHUSER_MOVABLE,
- order), 0);
- } else {
- dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL);
- order = 0;
- }
-
- /* TODO Support splitting here */
- lock_page(dpage);
- dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
- if (src_pfns[i] & MIGRATE_PFN_WRITE)
- dst_pfns[i] |= MIGRATE_PFN_WRITE;
- if (order)
- dst_pfns[i] |= MIGRATE_PFN_COMPOUND;
- folio_copy(page_folio(dpage), page_folio(spage));
- }
- migrate_device_pages(src_pfns, dst_pfns, npages);
- migrate_device_finalize(src_pfns, dst_pfns, npages);
- kvfree(src_pfns);
- kvfree(dst_pfns);
-}
-
/* Removes free pages from the free list so they can't be re-allocated */
static void dmirror_remove_free_pages(struct dmirror_chunk *devmem)
{
@@ -1726,6 +1738,13 @@ static const struct dev_pagemap_ops dmirror_devmem_ops = {
.folio_split = dmirror_devmem_folio_split,
};
+static void dmirror_device_release(struct device *dev)
+{
+ struct dmirror_device *mdevice = container_of(dev, struct dmirror_device, device);
+
+ dmirror_device_remove_chunks(mdevice);
+}
+
static int dmirror_device_init(struct dmirror_device *mdevice, int id)
{
dev_t dev;
@@ -1737,6 +1756,8 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id)
cdev_init(&mdevice->cdevice, &dmirror_fops);
mdevice->cdevice.owner = THIS_MODULE;
+ mdevice->device.release = dmirror_device_release;
+
device_initialize(&mdevice->device);
mdevice->device.devt = dev;
@@ -1744,12 +1765,16 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id)
if (ret)
goto put_device;
+ /* Build a list of free ZONE_DEVICE struct pages */
+ ret = dmirror_allocate_chunk(mdevice, NULL, false);
+ if (ret)
+ goto put_device;
+
ret = cdev_device_add(&mdevice->cdevice, &mdevice->device);
if (ret)
goto put_device;
- /* Build a list of free ZONE_DEVICE struct pages */
- return dmirror_allocate_chunk(mdevice, NULL, false);
+ return 0;
put_device:
put_device(&mdevice->device);
@@ -1758,7 +1783,6 @@ put_device:
static void dmirror_device_remove(struct dmirror_device *mdevice)
{
- dmirror_device_remove_chunks(mdevice);
cdev_device_del(&mdevice->cdevice, &mdevice->device);
put_device(&mdevice->device);
}
diff --git a/lib/test_kho.c b/lib/test_kho.c
index 7ef9e4061869..aa6a0956bb8b 100644
--- a/lib/test_kho.c
+++ b/lib/test_kho.c
@@ -143,7 +143,8 @@ static int kho_test_preserve(struct kho_test_state *state)
if (err)
goto err_unpreserve_data;
- err = kho_add_subtree(KHO_TEST_FDT, folio_address(state->fdt));
+ err = kho_add_subtree(KHO_TEST_FDT, folio_address(state->fdt),
+ fdt_totalsize(folio_address(state->fdt)));
if (err)
goto err_unpreserve_data;
@@ -318,7 +319,7 @@ static int __init kho_test_init(void)
if (!kho_is_enabled())
return 0;
- err = kho_retrieve_subtree(KHO_TEST_FDT, &fdt_phys);
+ err = kho_retrieve_subtree(KHO_TEST_FDT, &fdt_phys, NULL);
if (!err) {
err = kho_test_restore(fdt_phys);
if (err)
diff --git a/lib/tests/liveupdate.c b/lib/tests/liveupdate.c
index 496d6ef91a30..e4b0ecbee32f 100644
--- a/lib/tests/liveupdate.c
+++ b/lib/tests/liveupdate.c
@@ -135,24 +135,6 @@ void liveupdate_test_register(struct liveupdate_file_handler *fh)
TEST_NFLBS, fh->compatible);
}
-void liveupdate_test_unregister(struct liveupdate_file_handler *fh)
-{
- int err, i;
-
- for (i = 0; i < TEST_NFLBS; i++) {
- struct liveupdate_flb *flb = &test_flbs[i];
-
- err = liveupdate_unregister_flb(fh, flb);
- if (err) {
- pr_err("Failed to unregister %s %pe\n",
- flb->compatible, ERR_PTR(err));
- }
- }
-
- pr_info("Unregistered %d FLBs from file handler: [%s]\n",
- TEST_NFLBS, fh->compatible);
-}
-
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pasha Tatashin <pasha.tatashin@soleen.com>");
MODULE_DESCRIPTION("In-kernel test for LUO mechanism");
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 7638d75b27db..91b3e027b753 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -297,6 +297,17 @@ config DEBUG_KMEMLEAK_AUTO_SCAN
If unsure, say Y.
+config DEBUG_KMEMLEAK_VERBOSE
+ bool "Default kmemleak to verbose mode"
+ depends on DEBUG_KMEMLEAK_AUTO_SCAN
+ help
+ Say Y here to have kmemleak print unreferenced object details
+ (backtrace, hex dump, address) to dmesg when new memory leaks are
+ detected during automatic scanning. This can also be toggled at
+ runtime via /sys/module/kmemleak/parameters/verbose.
+
+ If unsure, say N.
+
config PER_VMA_LOCK_STATS
bool "Statistics for per-vma locks"
depends on PER_VMA_LOCK
diff --git a/mm/compaction.c b/mm/compaction.c
index 1e8f8eca318c..3648ce22c807 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -518,6 +518,24 @@ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
return true;
}
+static struct lruvec *
+compact_folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags,
+ struct compact_control *cc)
+{
+ struct lruvec *lruvec;
+
+ rcu_read_lock();
+retry:
+ lruvec = folio_lruvec(folio);
+ compact_lock_irqsave(&lruvec->lru_lock, flags, cc);
+ if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) {
+ spin_unlock_irqrestore(&lruvec->lru_lock, *flags);
+ goto retry;
+ }
+
+ return lruvec;
+}
+
/*
* Compaction requires the taking of some coarse locks that are potentially
* very heavily contended. The lock should be periodically unlocked to avoid
@@ -839,7 +857,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
{
pg_data_t *pgdat = cc->zone->zone_pgdat;
unsigned long nr_scanned = 0, nr_isolated = 0;
- struct lruvec *lruvec;
+ struct lruvec *lruvec = NULL;
unsigned long flags = 0;
struct lruvec *locked = NULL;
struct folio *folio = NULL;
@@ -913,7 +931,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
*/
if (!(low_pfn % COMPACT_CLUSTER_MAX)) {
if (locked) {
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
locked = NULL;
}
@@ -964,7 +982,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
}
/* for alloc_contig case */
if (locked) {
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
locked = NULL;
}
@@ -1053,7 +1071,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (unlikely(page_has_movable_ops(page)) &&
!PageMovableOpsIsolated(page)) {
if (locked) {
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
locked = NULL;
}
@@ -1153,18 +1171,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (!folio_test_clear_lru(folio))
goto isolate_fail_put;
- lruvec = folio_lruvec(folio);
+ if (locked)
+ lruvec = folio_lruvec(folio);
/* If we already hold the lock, we can skip some rechecking */
- if (lruvec != locked) {
+ if (lruvec != locked || !locked) {
if (locked)
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
- compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
+ lruvec = compact_folio_lruvec_lock_irqsave(folio, &flags, cc);
locked = lruvec;
- lruvec_memcg_debug(lruvec, folio);
-
/*
* Try get exclusive access under lock. If marked for
* skip, the scan is aborted unless the current context
@@ -1226,7 +1243,7 @@ isolate_success_no_list:
isolate_fail_put:
/* Avoid potential deadlock in freeing page under lru_lock */
if (locked) {
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
locked = NULL;
}
folio_put(folio);
@@ -1242,7 +1259,7 @@ isolate_fail:
*/
if (nr_isolated) {
if (locked) {
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
locked = NULL;
}
putback_movable_pages(&cc->migratepages);
@@ -1274,7 +1291,7 @@ isolate_fail:
isolate_abort:
if (locked)
- unlock_page_lruvec_irqrestore(locked, flags);
+ lruvec_unlock_irqrestore(locked, flags);
if (folio) {
folio_set_lru(folio);
folio_put(folio);
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 7f04fc3f8c8c..fa9531d8e7f8 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1573,35 +1573,6 @@ int damon_kdamond_pid(struct damon_ctx *ctx)
return pid;
}
-/*
- * damon_call_handle_inactive_ctx() - handle DAMON call request that added to
- * an inactive context.
- * @ctx: The inactive DAMON context.
- * @control: Control variable of the call request.
- *
- * This function is called in a case that @control is added to @ctx but @ctx is
- * not running (inactive). See if @ctx handled @control or not, and cleanup
- * @control if it was not handled.
- *
- * Returns 0 if @control was handled by @ctx, negative error code otherwise.
- */
-static int damon_call_handle_inactive_ctx(
- struct damon_ctx *ctx, struct damon_call_control *control)
-{
- struct damon_call_control *c;
-
- mutex_lock(&ctx->call_controls_lock);
- list_for_each_entry(c, &ctx->call_controls, list) {
- if (c == control) {
- list_del(&control->list);
- mutex_unlock(&ctx->call_controls_lock);
- return -EINVAL;
- }
- }
- mutex_unlock(&ctx->call_controls_lock);
- return 0;
-}
-
/**
* damon_call() - Invoke a given function on DAMON worker thread (kdamond).
* @ctx: DAMON context to call the function for.
@@ -1619,6 +1590,10 @@ static int damon_call_handle_inactive_ctx(
* synchronization. The return value of the function will be saved in
* &damon_call_control->return_code.
*
+ * Note that this function should be called only after damon_start() for
+ * @ctx has succeeded. Otherwise, this function could end up waiting
+ * indefinitely.
+ *
* Return: 0 on success, negative error code otherwise.
*/
int damon_call(struct damon_ctx *ctx, struct damon_call_control *control)
@@ -1629,10 +1604,12 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control)
INIT_LIST_HEAD(&control->list);
mutex_lock(&ctx->call_controls_lock);
+ if (ctx->call_controls_obsolete) {
+ mutex_unlock(&ctx->call_controls_lock);
+ return -ECANCELED;
+ }
list_add_tail(&control->list, &ctx->call_controls);
mutex_unlock(&ctx->call_controls_lock);
- if (!damon_is_running(ctx))
- return damon_call_handle_inactive_ctx(ctx, control);
if (control->repeat)
return 0;
wait_for_completion(&control->completion);
@@ -1660,6 +1637,10 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control)
* passed at least one &damos->apply_interval_us, kdamond marks the request as
* completed so that damos_walk() can wakeup and return.
*
+ * Note that this function should be called only after damon_start() for
+ * @ctx has succeeded. Otherwise, this function could end up waiting
+ * indefinitely.
+ *
* Return: 0 on success, negative error code otherwise.
*/
int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control)
@@ -1667,19 +1648,16 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control)
init_completion(&control->completion);
control->canceled = false;
mutex_lock(&ctx->walk_control_lock);
+ if (ctx->walk_control_obsolete) {
+ mutex_unlock(&ctx->walk_control_lock);
+ return -ECANCELED;
+ }
if (ctx->walk_control) {
mutex_unlock(&ctx->walk_control_lock);
return -EBUSY;
}
ctx->walk_control = control;
mutex_unlock(&ctx->walk_control_lock);
- if (!damon_is_running(ctx)) {
- mutex_lock(&ctx->walk_control_lock);
- if (ctx->walk_control == control)
- ctx->walk_control = NULL;
- mutex_unlock(&ctx->walk_control_lock);
- return -EINVAL;
- }
wait_for_completion(&control->completion);
if (control->canceled)
return -ECANCELED;
@@ -2239,12 +2217,24 @@ static inline u64 damos_get_some_mem_psi_total(void)
#endif /* CONFIG_PSI */
#ifdef CONFIG_NUMA
+static bool invalid_mem_node(int nid)
+{
+ return nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY);
+}
+
static __kernel_ulong_t damos_get_node_mem_bp(
struct damos_quota_goal *goal)
{
struct sysinfo i;
__kernel_ulong_t numerator;
+ if (invalid_mem_node(goal->nid)) {
+ if (goal->metric == DAMOS_QUOTA_NODE_MEM_USED_BP)
+ return 0;
+ else /* DAMOS_QUOTA_NODE_MEM_FREE_BP */
+ return 10000;
+ }
+
si_meminfo_node(&i, goal->nid);
if (goal->metric == DAMOS_QUOTA_NODE_MEM_USED_BP)
numerator = i.totalram - i.freeram;
@@ -2261,6 +2251,13 @@ static unsigned long damos_get_node_memcg_used_bp(
unsigned long used_pages, numerator;
struct sysinfo i;
+ if (invalid_mem_node(goal->nid)) {
+ if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP)
+ return 0;
+ else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */
+ return 10000;
+ }
+
memcg = mem_cgroup_get_from_id(goal->memcg_id);
if (!memcg) {
if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP)
@@ -2452,7 +2449,8 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
}
/* New charge window starts */
- if (time_after_eq(jiffies, quota->charged_from +
+ if (!time_in_range_open(jiffies, quota->charged_from,
+ quota->charged_from +
msecs_to_jiffies(quota->reset_interval))) {
if (damos_quota_is_set(quota) &&
quota->charged_sz >= quota->esz)
@@ -2952,6 +2950,12 @@ static int kdamond_fn(void *data)
pr_debug("kdamond (%d) starts\n", current->pid);
+ mutex_lock(&ctx->call_controls_lock);
+ ctx->call_controls_obsolete = false;
+ mutex_unlock(&ctx->call_controls_lock);
+ mutex_lock(&ctx->walk_control_lock);
+ ctx->walk_control_obsolete = false;
+ mutex_unlock(&ctx->walk_control_lock);
complete(&ctx->kdamond_started);
kdamond_init_ctx(ctx);
@@ -3062,7 +3066,13 @@ done:
damon_destroy_targets(ctx);
kfree(ctx->regions_score_histogram);
+ mutex_lock(&ctx->call_controls_lock);
+ ctx->call_controls_obsolete = true;
+ mutex_unlock(&ctx->call_controls_lock);
kdamond_call(ctx, true);
+ mutex_lock(&ctx->walk_control_lock);
+ ctx->walk_control_obsolete = true;
+ mutex_unlock(&ctx->walk_control_lock);
damos_walk_cancel(ctx);
pr_debug("kdamond (%d) finishes\n", current->pid);
diff --git a/mm/damon/stat.c b/mm/damon/stat.c
index 60351a719460..99ba346f9e32 100644
--- a/mm/damon/stat.c
+++ b/mm/damon/stat.c
@@ -255,8 +255,11 @@ static int damon_stat_start(void)
if (!damon_stat_context)
return -ENOMEM;
err = damon_start(&damon_stat_context, 1, true);
- if (err)
+ if (err) {
+ damon_destroy_ctx(damon_stat_context);
+ damon_stat_context = NULL;
return err;
+ }
damon_stat_last_refresh_jiffies = jiffies;
call_control.data = damon_stat_context;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 42c983821c03..970e077019b7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1218,13 +1218,29 @@ retry:
static struct deferred_split *folio_split_queue_lock(struct folio *folio)
{
- return split_queue_lock(folio_nid(folio), folio_memcg(folio));
+ struct deferred_split *queue;
+
+ rcu_read_lock();
+ queue = split_queue_lock(folio_nid(folio), folio_memcg(folio));
+ /*
+ * The memcg destruction path also takes the split queue lock when
+ * reparenting, so once this lock is held it is safe to drop the RCU lock.
+ */
+ rcu_read_unlock();
+
+ return queue;
}
static struct deferred_split *
folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
{
- return split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
+ struct deferred_split *queue;
+
+ rcu_read_lock();
+ queue = split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
+ rcu_read_unlock();
+
+ return queue;
}
static inline void split_queue_unlock(struct deferred_split *queue)
@@ -3994,7 +4010,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
folio_ref_unfreeze(folio, folio_cache_ref_count(folio) + 1);
if (do_lru)
- unlock_page_lruvec(lruvec);
+ lruvec_unlock(lruvec);
if (ci)
swap_cluster_unlock(ci);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9413ed497be5..f24bf49be047 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4218,6 +4218,9 @@ static __init int hugetlb_add_param(char *s, int (*setup)(char *))
size_t len;
char *p;
+ if (!s)
+ return -EINVAL;
+
if (hugetlb_param_index >= HUGE_MAX_CMDLINE_ARGS)
return -EINVAL;
@@ -4784,6 +4787,18 @@ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
return 0;
}
+#ifdef CONFIG_USERFAULTFD
+static bool hugetlb_can_userfault(struct vm_area_struct *vma,
+ vm_flags_t vm_flags)
+{
+ return true;
+}
+
+static const struct vm_uffd_ops hugetlb_uffd_ops = {
+ .can_userfault = hugetlb_can_userfault,
+};
+#endif
+
/*
* When a new function is introduced to vm_operations_struct and added
* to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
@@ -4797,6 +4812,9 @@ const struct vm_operations_struct hugetlb_vm_ops = {
.close = hugetlb_vm_op_close,
.may_split = hugetlb_vm_op_split,
.pagesize = hugetlb_vm_op_pagesize,
+#ifdef CONFIG_USERFAULTFD
+ .uffd_ops = &hugetlb_uffd_ops,
+#endif
};
static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio,
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index fa8201e23222..2eff0d6b622b 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -241,7 +241,7 @@ static int kmemleak_skip_disable;
/* If there are leaks that can be reported */
static bool kmemleak_found_leaks;
-static bool kmemleak_verbose;
+static bool kmemleak_verbose = IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_VERBOSE);
module_param_named(verbose, kmemleak_verbose, bool, 0600);
static void kmemleak_disable(void);
diff --git a/mm/memblock.c b/mm/memblock.c
index 5629844b2804..a6a1c91e276d 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2601,7 +2601,7 @@ static int __init prepare_kho_fdt(void)
if (err)
goto err_unpreserve_fdt;
- err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt);
+ err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt, fdt_totalsize(fdt));
if (err)
goto err_unpreserve_fdt;
@@ -2646,7 +2646,7 @@ static void *__init reserve_mem_kho_retrieve_fdt(void)
if (fdt)
return fdt;
- err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys);
+ err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys, NULL);
if (err) {
if (err != -ENOENT)
pr_warn("failed to retrieve FDT '%s' from KHO: %d\n",
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 437cd25784fe..433bba9dfe71 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -613,6 +613,7 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
void memcg1_swapout(struct folio *folio, swp_entry_t entry)
{
struct mem_cgroup *memcg, *swap_memcg;
+ struct obj_cgroup *objcg;
unsigned int nr_entries;
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
@@ -624,12 +625,13 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry)
if (!do_memsw_account())
return;
- memcg = folio_memcg(folio);
-
- VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
- if (!memcg)
+ objcg = folio_objcg(folio);
+ VM_WARN_ON_ONCE_FOLIO(!objcg, folio);
+ if (!objcg)
return;
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
/*
* In case the memcg owning these pages has been offlined and doesn't
* have an ID allocated to it anymore, charge the closest online
@@ -644,7 +646,7 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry)
folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
- if (!mem_cgroup_is_root(memcg))
+ if (!obj_cgroup_is_root(objcg))
page_counter_uncharge(&memcg->memory, nr_entries);
if (memcg != swap_memcg) {
@@ -665,7 +667,8 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry)
preempt_enable_nested();
memcg1_check_events(memcg, folio_nid(folio));
- css_put(&memcg->css);
+ rcu_read_unlock();
+ obj_cgroup_put(objcg);
}
/*
@@ -1884,6 +1887,22 @@ static const unsigned int memcg1_events[] = {
PGMAJFAULT,
};
+void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
+ reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
+}
+
+void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+ int i;
+
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ reparent_memcg_lruvec_state_local(memcg, parent, i);
+}
+
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
unsigned long memory, memsw;
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index 1b969294ea6a..f92f81108d5e 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -73,6 +73,13 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long nr_memory, int nid);
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
+void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
+void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent);
+
+void reparent_memcg_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx);
+void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx);
void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages);
static inline bool memcg1_tcpmem_active(struct mem_cgroup *memcg)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 051b82ebf371..c3d98ab41f1f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -206,26 +206,100 @@ static struct obj_cgroup *obj_cgroup_alloc(void)
return objcg;
}
-static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
- struct mem_cgroup *parent)
+static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent,
+ int nid)
{
struct obj_cgroup *objcg, *iter;
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ struct mem_cgroup_per_node *parent_pn = parent->nodeinfo[nid];
- objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
-
- spin_lock_irq(&objcg_lock);
-
+ objcg = rcu_replace_pointer(pn->objcg, NULL, true);
/* 1) Ready to reparent active objcg. */
- list_add(&objcg->list, &memcg->objcg_list);
+ list_add(&objcg->list, &pn->objcg_list);
/* 2) Reparent active objcg and already reparented objcgs to parent. */
- list_for_each_entry(iter, &memcg->objcg_list, list)
+ list_for_each_entry(iter, &pn->objcg_list, list)
WRITE_ONCE(iter->memcg, parent);
/* 3) Move already reparented objcgs to the parent's list */
- list_splice(&memcg->objcg_list, &parent->objcg_list);
+ list_splice(&pn->objcg_list, &parent_pn->objcg_list);
+
+ return objcg;
+}
+#ifdef CONFIG_MEMCG_V1
+static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force);
+
+static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return;
+
+ /*
+ * Reparent stats exposed non-hierarchically. Flush @memcg's stats first
+ * to read its stats accurately, and conservatively flush @parent's
+ * stats after reparenting to avoid hiding a potentially large stat
+ * update (e.g. from callers of mem_cgroup_flush_stats_ratelimited()).
+ */
+ __mem_cgroup_flush_stats(memcg, true);
+
+ /* The following counts are all non-hierarchical and need to be reparented. */
+ reparent_memcg1_state_local(memcg, parent);
+ reparent_memcg1_lruvec_state_local(memcg, parent);
+
+ __mem_cgroup_flush_stats(parent, true);
+}
+#else
+static inline void reparent_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
+{
+}
+#endif
+
+static inline void reparent_locks(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid)
+{
+ spin_lock_irq(&objcg_lock);
+ spin_lock_nested(&mem_cgroup_lruvec(memcg, NODE_DATA(nid))->lru_lock, 1);
+ spin_lock_nested(&mem_cgroup_lruvec(parent, NODE_DATA(nid))->lru_lock, 2);
+}
+
+static inline void reparent_unlocks(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid)
+{
+ spin_unlock(&mem_cgroup_lruvec(parent, NODE_DATA(nid))->lru_lock);
+ spin_unlock(&mem_cgroup_lruvec(memcg, NODE_DATA(nid))->lru_lock);
spin_unlock_irq(&objcg_lock);
+}
+
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg)
+{
+ struct obj_cgroup *objcg;
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ int nid;
+
+ for_each_node(nid) {
+retry:
+ if (lru_gen_enabled())
+ max_lru_gen_memcg(parent, nid);
+
+ reparent_locks(memcg, parent, nid);
+
+ if (lru_gen_enabled()) {
+ if (!recheck_lru_gen_max_memcg(parent, nid)) {
+ reparent_unlocks(memcg, parent, nid);
+ cond_resched();
+ goto retry;
+ }
+ lru_gen_reparent_memcg(memcg, parent, nid);
+ } else {
+ lru_reparent_memcg(memcg, parent, nid);
+ }
- percpu_ref_kill(&objcg->refcnt);
+ objcg = __memcg_reparent_objcgs(memcg, parent, nid);
+
+ reparent_unlocks(memcg, parent, nid);
+
+ percpu_ref_kill(&objcg->refcnt);
+ }
+
+ reparent_state_local(memcg, parent);
}
/*
@@ -241,7 +315,7 @@ DEFINE_STATIC_KEY_FALSE(memcg_bpf_enabled_key);
EXPORT_SYMBOL(memcg_bpf_enabled_key);
/**
- * mem_cgroup_css_from_folio - css of the memcg associated with a folio
+ * get_mem_cgroup_css_from_folio - acquire a css of the memcg associated with a folio
* @folio: folio of interest
*
* If memcg is bound to the default hierarchy, css of the memcg associated
@@ -251,14 +325,16 @@ EXPORT_SYMBOL(memcg_bpf_enabled_key);
* If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
* is returned.
*/
-struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
+struct cgroup_subsys_state *get_mem_cgroup_css_from_folio(struct folio *folio)
{
- struct mem_cgroup *memcg = folio_memcg(folio);
+ struct mem_cgroup *memcg;
- if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
- memcg = root_mem_cgroup;
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return &root_mem_cgroup->css;
- return &memcg->css;
+ memcg = get_mem_cgroup_from_folio(folio);
+
+ return memcg ? &memcg->css : &root_mem_cgroup->css;
}
/**
@@ -449,6 +525,30 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return x;
}
+#ifdef CONFIG_MEMCG_V1
+static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
+ enum node_stat_item idx, long val);
+
+void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ struct lruvec *parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
+ unsigned long value = lruvec_page_state_local(child_lruvec, idx);
+ struct mem_cgroup_per_node *child_pn, *parent_pn;
+
+ child_pn = container_of(child_lruvec, struct mem_cgroup_per_node, lruvec);
+ parent_pn = container_of(parent_lruvec, struct mem_cgroup_per_node, lruvec);
+
+ __mod_memcg_lruvec_state(child_pn, idx, -value);
+ __mod_memcg_lruvec_state(parent_pn, idx, value);
+ }
+}
+#endif
+
/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
#ifdef CONFIG_MEMCG_V1
@@ -508,7 +608,7 @@ static inline int memcg_events_index(enum vm_event_item idx)
struct memcg_vmstats_percpu {
/* Stats updates since the last flush */
- unsigned int stats_updates;
+ unsigned long stats_updates;
/* Cached pointers for fast iteration in memcg_rstat_updated() */
struct memcg_vmstats_percpu __percpu *parent_pcpu;
@@ -539,7 +639,7 @@ struct memcg_vmstats {
unsigned long events_pending[NR_MEMCG_EVENTS];
/* Stats updates since the last flush */
- atomic_t stats_updates;
+ atomic_long_t stats_updates;
};
/*
@@ -565,16 +665,16 @@ static u64 flush_last_time;
static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
{
- return atomic_read(&vmstats->stats_updates) >
+ return atomic_long_read(&vmstats->stats_updates) >
MEMCG_CHARGE_BATCH * num_online_cpus();
}
-static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val,
+static inline void memcg_rstat_updated(struct mem_cgroup *memcg, long val,
int cpu)
{
struct memcg_vmstats_percpu __percpu *statc_pcpu;
struct memcg_vmstats_percpu *statc;
- unsigned int stats_updates;
+ unsigned long stats_updates;
if (!val)
return;
@@ -597,7 +697,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val,
continue;
stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0);
- atomic_add(stats_updates, &statc->vmstats->stats_updates);
+ atomic_long_add(stats_updates, &statc->vmstats->stats_updates);
}
}
@@ -605,7 +705,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
{
bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);
- trace_memcg_flush_stats(memcg, atomic_read(&memcg->vmstats->stats_updates),
+ trace_memcg_flush_stats(memcg, atomic_long_read(&memcg->vmstats->stats_updates),
force, needs_flush);
if (!force && !needs_flush)
@@ -684,31 +784,64 @@ static int memcg_page_state_unit(int item);
* Normalize the value passed into memcg_rstat_updated() to be in pages. Round
* up non-zero sub-page updates to 1 page as zero page updates are ignored.
*/
-static int memcg_state_val_in_pages(int idx, int val)
+static long memcg_state_val_in_pages(int idx, long val)
{
int unit = memcg_page_state_unit(idx);
+ long res;
if (!val || unit == PAGE_SIZE)
return val;
- else
- return max(val * unit / PAGE_SIZE, 1UL);
+
+ /* Get the absolute value of (val * unit / PAGE_SIZE). */
+ res = mult_frac(abs(val), unit, PAGE_SIZE);
+ /* Round up zero values. */
+ res = res ? : 1;
+
+ return val < 0 ? -res : res;
}
-/**
- * mod_memcg_state - update cgroup memory statistics
- * @memcg: the memory cgroup
- * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
- * @val: delta to add to the counter, can be negative
+#ifdef CONFIG_MEMCG_V1
+/*
+ * Used in mod_memcg_state() and mod_memcg_lruvec_state() to avoid a race
+ * with the reparenting of non-hierarchical local stats.
*/
-void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
- int val)
+static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
{
- int i = memcg_stats_index(idx);
- int cpu;
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return memcg;
- if (mem_cgroup_disabled())
+ rcu_read_lock();
+
+ while (memcg_is_dying(memcg))
+ memcg = parent_mem_cgroup(memcg);
+
+ return memcg;
+}
+
+static inline void get_non_dying_memcg_end(void)
+{
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
+ rcu_read_unlock();
+}
+#else
+static inline struct mem_cgroup *get_non_dying_memcg_start(struct mem_cgroup *memcg)
+{
+ return memcg;
+}
+
+static inline void get_non_dying_memcg_end(void)
+{
+}
+#endif
+
+static void __mod_memcg_state(struct mem_cgroup *memcg,
+ enum memcg_stat_item idx, long val)
+{
+ int i = memcg_stats_index(idx);
+ int cpu;
+
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
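The memcg_state_val_in_pages() rework above moves the rounding onto the magnitude and restores the sign afterwards; the old max(val * unit / PAGE_SIZE, 1UL) form implicitly converted negative deltas to unsigned long before the comparison against 1UL. A standalone approximation, assuming PAGE_SIZE is 4096 and using a plain multiply where the kernel uses mult_frac() to avoid intermediate overflow:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096L

/* Userspace sketch of the reworked rounding in memcg_state_val_in_pages(). */
static long val_in_pages(long val, long unit)
{
	long res;

	if (!val || unit == PAGE_SIZE)
		return val;

	res = labs(val) * unit / PAGE_SIZE;	/* mult_frac(abs(val), unit, PAGE_SIZE) in the kernel */
	res = res ? res : 1;			/* round non-zero sub-page updates up to one page */

	return val < 0 ? -res : res;
}

int main(void)
{
	printf("%ld\n", val_in_pages(-512, 1));	/* prints -1: a negative sub-page delta keeps its sign */
	return 0;
}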
@@ -717,11 +850,29 @@ void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
this_cpu_add(memcg->vmstats_percpu->state[i], val);
val = memcg_state_val_in_pages(idx, val);
memcg_rstat_updated(memcg, val, cpu);
+
trace_mod_memcg_state(memcg, idx, val);
put_cpu();
}
+/**
+ * mod_memcg_state - update cgroup memory statistics
+ * @memcg: the memory cgroup
+ * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
+ * @val: delta to add to the counter, can be negative
+ */
+void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
+ int val)
+{
+ if (mem_cgroup_disabled())
+ return;
+
+ memcg = get_non_dying_memcg_start(memcg);
+ __mod_memcg_state(memcg, idx, val);
+ get_non_dying_memcg_end();
+}
+
#ifdef CONFIG_MEMCG_V1
/* idx can be of type enum memcg_stat_item or node_stat_item. */
unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
@@ -739,23 +890,27 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
#endif
return x;
}
+
+void reparent_memcg_state_local(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent, int idx)
+{
+ unsigned long value = memcg_page_state_local(memcg, idx);
+
+ __mod_memcg_state(memcg, idx, -value);
+ __mod_memcg_state(parent, idx, value);
+}
#endif
-static void mod_memcg_lruvec_state(struct lruvec *lruvec,
- enum node_stat_item idx,
- int val)
+static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
+ enum node_stat_item idx, long val)
{
- struct mem_cgroup_per_node *pn;
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg = pn->memcg;
int i = memcg_stats_index(idx);
int cpu;
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
- pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- memcg = pn->memcg;
-
cpu = get_cpu();
/* Update memcg */
@@ -771,6 +926,23 @@ static void mod_memcg_lruvec_state(struct lruvec *lruvec,
put_cpu();
}
+static void mod_memcg_lruvec_state(struct lruvec *lruvec,
+ enum node_stat_item idx,
+ int val)
+{
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ struct mem_cgroup_per_node *pn;
+ struct mem_cgroup *memcg;
+
+ pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+ memcg = get_non_dying_memcg_start(pn->memcg);
+ pn = memcg->nodeinfo[pgdat->node_id];
+
+ __mod_memcg_lruvec_state(pn, idx, val);
+
+ get_non_dying_memcg_end();
+}
+
/**
* mod_lruvec_state - update lruvec memory statistics
* @lruvec: the lruvec
@@ -991,17 +1163,23 @@ again:
/**
* get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg.
* @folio: folio from which memcg should be extracted.
+ *
+ * See folio_memcg() for folio->objcg/memcg binding rules.
*/
struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
{
- struct mem_cgroup *memcg = folio_memcg(folio);
+ struct mem_cgroup *memcg;
if (mem_cgroup_disabled())
return NULL;
+ if (!folio_memcg_charged(folio))
+ return root_mem_cgroup;
+
rcu_read_lock();
- if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
- memcg = root_mem_cgroup;
+ do {
+ memcg = folio_memcg(folio);
+ } while (unlikely(!css_tryget(&memcg->css)));
rcu_read_unlock();
return memcg;
}
@@ -1198,23 +1376,6 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
}
}
-#ifdef CONFIG_DEBUG_VM
-void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
-{
- struct mem_cgroup *memcg;
-
- if (mem_cgroup_disabled())
- return;
-
- memcg = folio_memcg(folio);
-
- if (!memcg)
- VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
- else
- VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
-}
-#endif
-
/**
* folio_lruvec_lock - Lock the lruvec for a folio.
* @folio: Pointer to the folio.
@@ -1224,14 +1385,20 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
* - folio_test_lru false
* - folio frozen (refcount of 0)
*
- * Return: The lruvec this folio is on with its lock held.
+ * Return: The lruvec this folio is on, with its lock and the RCU read lock held.
*/
struct lruvec *folio_lruvec_lock(struct folio *folio)
{
- struct lruvec *lruvec = folio_lruvec(folio);
+ struct lruvec *lruvec;
+ rcu_read_lock();
+retry:
+ lruvec = folio_lruvec(folio);
spin_lock(&lruvec->lru_lock);
- lruvec_memcg_debug(lruvec, folio);
+ if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) {
+ spin_unlock(&lruvec->lru_lock);
+ goto retry;
+ }
return lruvec;
}
@@ -1246,14 +1413,20 @@ struct lruvec *folio_lruvec_lock(struct folio *folio)
* - folio frozen (refcount of 0)
*
* Return: The lruvec this folio is on with its lock held and interrupts
- * disabled.
+ * disabled, and the RCU read lock held.
*/
struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
{
- struct lruvec *lruvec = folio_lruvec(folio);
+ struct lruvec *lruvec;
+ rcu_read_lock();
+retry:
+ lruvec = folio_lruvec(folio);
spin_lock_irq(&lruvec->lru_lock);
- lruvec_memcg_debug(lruvec, folio);
+ if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ goto retry;
+ }
return lruvec;
}
@@ -1269,15 +1442,21 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
* - folio frozen (refcount of 0)
*
* Return: The lruvec this folio is on with its lock held and interrupts
- * disabled.
+ * disabled, and the RCU read lock held.
*/
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
unsigned long *flags)
{
- struct lruvec *lruvec = folio_lruvec(folio);
+ struct lruvec *lruvec;
+ rcu_read_lock();
+retry:
+ lruvec = folio_lruvec(folio);
spin_lock_irqsave(&lruvec->lru_lock, *flags);
- lruvec_memcg_debug(lruvec, folio);
+ if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) {
+ spin_unlock_irqrestore(&lruvec->lru_lock, *flags);
+ goto retry;
+ }
return lruvec;
}
@@ -1293,7 +1472,7 @@ struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
* to or just after a page is removed from an lru list.
*/
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
- int zid, int nr_pages)
+ int zid, long nr_pages)
{
struct mem_cgroup_per_node *mz;
unsigned long *lru_size;
@@ -1310,7 +1489,7 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
size = *lru_size;
if (WARN_ONCE(size < 0,
- "%s(%p, %d, %d): lru_size %ld\n",
+ "%s(%p, %d, %ld): lru_size %ld\n",
__func__, lruvec, lru, nr_pages, size)) {
VM_BUG_ON(1);
*lru_size = 0;
@@ -2581,17 +2760,17 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
return try_charge_memcg(memcg, gfp_mask, nr_pages);
}
-static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
+static void commit_charge(struct folio *folio, struct obj_cgroup *objcg)
{
VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio);
/*
- * Any of the following ensures page's memcg stability:
+ * Any of the following ensures folio's objcg stability:
*
* - the page lock
* - LRU isolation
* - exclusive reference
*/
- folio->memcg_data = (unsigned long)memcg;
+ folio->memcg_data = (unsigned long)objcg;
}
#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
@@ -2693,14 +2872,26 @@ struct mem_cgroup *mem_cgroup_from_virt(void *p)
static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
{
- struct obj_cgroup *objcg = NULL;
+ int nid = numa_node_id();
+
+ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+ struct obj_cgroup *objcg = rcu_dereference(memcg->nodeinfo[nid]->objcg);
- for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
- objcg = rcu_dereference(memcg->objcg);
if (likely(objcg && obj_cgroup_tryget(objcg)))
- break;
- objcg = NULL;
+ return objcg;
}
+
+ return NULL;
+}
+
+static inline struct obj_cgroup *get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
+{
+ struct obj_cgroup *objcg;
+
+ rcu_read_lock();
+ objcg = __get_obj_cgroup_from_memcg(memcg);
+ rcu_read_unlock();
+
return objcg;
}
@@ -2759,6 +2950,7 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
{
struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
+ int nid = numa_node_id();
if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi())
return NULL;
@@ -2775,53 +2967,39 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
* Objcg reference is kept by the task, so it's safe
* to use the objcg by the current task.
*/
- return objcg;
+ return objcg ? : rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
}
memcg = this_cpu_read(int_active_memcg);
if (unlikely(memcg))
goto from_memcg;
- return NULL;
+ return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
from_memcg:
- objcg = NULL;
- for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
+ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
/*
* Memcg pointer is protected by scope (see set_active_memcg())
* and is pinning the corresponding objcg, so objcg can't go
* away and can be used within the scope without any additional
* protection.
*/
- objcg = rcu_dereference_check(memcg->objcg, 1);
+ objcg = rcu_dereference_check(memcg->nodeinfo[nid]->objcg, 1);
if (likely(objcg))
- break;
+ return objcg;
}
- return objcg;
+ return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
}
struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
{
struct obj_cgroup *objcg;
- if (!memcg_kmem_online())
- return NULL;
-
- if (folio_memcg_kmem(folio)) {
- objcg = __folio_objcg(folio);
+ objcg = folio_objcg(folio);
+ if (objcg)
obj_cgroup_get(objcg);
- } else {
- struct mem_cgroup *memcg;
- rcu_read_lock();
- memcg = __folio_memcg(folio);
- if (memcg)
- objcg = __get_obj_cgroup_from_memcg(memcg);
- else
- objcg = NULL;
- rcu_read_unlock();
- }
return objcg;
}
@@ -2922,7 +3100,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
int ret = 0;
objcg = current_obj_cgroup();
- if (objcg) {
+ if (objcg && !obj_cgroup_is_root(objcg)) {
ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
if (!ret) {
obj_cgroup_get(objcg);
@@ -3251,7 +3429,7 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
* obj_cgroup_get() is used to get a permanent reference.
*/
objcg = current_obj_cgroup();
- if (!objcg)
+ if (!objcg || obj_cgroup_is_root(objcg))
return true;
/*
@@ -3383,33 +3561,20 @@ void folio_split_memcg_refs(struct folio *folio, unsigned old_order,
return;
new_refs = (1 << (old_order - new_order)) - 1;
- css_get_many(&__folio_memcg(folio)->css, new_refs);
+ obj_cgroup_get_many(folio_objcg(folio), new_refs);
}
-static int memcg_online_kmem(struct mem_cgroup *memcg)
+static void memcg_online_kmem(struct mem_cgroup *memcg)
{
- struct obj_cgroup *objcg;
-
if (mem_cgroup_kmem_disabled())
- return 0;
+ return;
if (unlikely(mem_cgroup_is_root(memcg)))
- return 0;
-
- objcg = obj_cgroup_alloc();
- if (!objcg)
- return -ENOMEM;
-
- objcg->memcg = memcg;
- rcu_assign_pointer(memcg->objcg, objcg);
- obj_cgroup_get(objcg);
- memcg->orig_objcg = objcg;
+ return;
static_branch_enable(&memcg_kmem_online_key);
memcg->kmemcg_id = memcg->id.id;
-
- return 0;
}
static void memcg_offline_kmem(struct mem_cgroup *memcg)
@@ -3423,16 +3588,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
return;
parent = parent_mem_cgroup(memcg);
- if (!parent)
- parent = root_mem_cgroup;
-
memcg_reparent_list_lrus(memcg, parent);
-
- /*
- * Objcg's reparenting must be after list_lru's, make sure list_lru
- * helpers won't use parent's list_lru until child is drained.
- */
- memcg_reparent_objcgs(memcg, parent);
}
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -3705,8 +3861,6 @@ struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg, un
break;
}
memcg = parent_mem_cgroup(memcg);
- if (!memcg)
- memcg = root_mem_cgroup;
}
return memcg;
}
@@ -3771,6 +3925,8 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn->lruvec_stats_percpu)
goto fail;
+ INIT_LIST_HEAD(&pn->objcg_list);
+
lruvec_init(&pn->lruvec);
pn->memcg = memcg;
@@ -3785,10 +3941,14 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- obj_cgroup_put(memcg->orig_objcg);
+ for_each_node(node) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+ if (!pn)
+ continue;
- for_each_node(node)
- free_mem_cgroup_per_node_info(memcg->nodeinfo[node]);
+ obj_cgroup_put(pn->orig_objcg);
+ free_mem_cgroup_per_node_info(pn);
+ }
memcg1_free_events(memcg);
kfree(memcg->vmstats);
free_percpu(memcg->vmstats_percpu);
@@ -3859,7 +4019,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
#endif
memcg1_memcg_init(memcg);
memcg->kmemcg_id = -1;
- INIT_LIST_HEAD(&memcg->objcg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
@@ -3935,9 +4094,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct obj_cgroup *objcg;
+ int nid;
- if (memcg_online_kmem(memcg))
- goto remove_id;
+ memcg_online_kmem(memcg);
/*
* A memcg must be visible for expand_shrinker_info()
@@ -3947,6 +4107,20 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (alloc_shrinker_info(memcg))
goto offline_kmem;
+ for_each_node(nid) {
+ objcg = obj_cgroup_alloc();
+ if (!objcg)
+ goto free_objcg;
+
+ if (unlikely(mem_cgroup_is_root(memcg)))
+ objcg->is_root = true;
+
+ objcg->memcg = memcg;
+ rcu_assign_pointer(memcg->nodeinfo[nid]->objcg, objcg);
+ obj_cgroup_get(objcg);
+ memcg->nodeinfo[nid]->orig_objcg = objcg;
+ }
+
if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled())
queue_delayed_work(system_dfl_wq, &stats_flush_dwork,
FLUSH_TIME);
@@ -3969,9 +4143,27 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
xa_store(&mem_cgroup_private_ids, memcg->id.id, memcg, GFP_KERNEL);
return 0;
+free_objcg:
+ for_each_node(nid) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+
+ objcg = rcu_replace_pointer(pn->objcg, NULL, true);
+ if (objcg)
+ percpu_ref_kill(&objcg->refcnt);
+
+ if (pn->orig_objcg) {
+ obj_cgroup_put(pn->orig_objcg);
+ /*
+ * Reset pn->orig_objcg to NULL to prevent
+ * obj_cgroup_put() from being called again in
+ * __mem_cgroup_free().
+ */
+ pn->orig_objcg = NULL;
+ }
+ }
+ free_shrinker_info(memcg);
offline_kmem:
memcg_offline_kmem(memcg);
-remove_id:
mem_cgroup_private_id_remove(memcg);
return -ENOMEM;
}
@@ -3989,6 +4181,12 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
memcg_offline_kmem(memcg);
reparent_deferred_split_queue(memcg);
+ /*
+ * The objcg must be reparented after the list_lru and deferred_split_queue
+ * above, so that their helpers do not mistakenly pick up the parent's
+ * list_lru or deferred_split_queue before the child is drained.
+ */
+ memcg_reparent_objcgs(memcg);
reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
lru_gen_offline_memcg(memcg);
@@ -4221,8 +4419,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
}
WRITE_ONCE(statc->stats_updates, 0);
/* We are in a per-cpu loop here, only do the atomic write once */
- if (atomic_read(&memcg->vmstats->stats_updates))
- atomic_set(&memcg->vmstats->stats_updates, 0);
+ if (atomic_long_read(&memcg->vmstats->stats_updates))
+ atomic_long_set(&memcg->vmstats->stats_updates, 0);
}
static void mem_cgroup_fork(struct task_struct *task)
@@ -4799,16 +4997,20 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
gfp_t gfp)
{
- int ret;
-
- ret = try_charge(memcg, gfp, folio_nr_pages(folio));
- if (ret)
- goto out;
+ int ret = 0;
+ struct obj_cgroup *objcg;
- css_get(&memcg->css);
- commit_charge(folio, memcg);
+ objcg = get_obj_cgroup_from_memcg(memcg);
+ /* Do not account at the root objcg level. */
+ if (!obj_cgroup_is_root(objcg))
+ ret = try_charge_memcg(memcg, gfp, folio_nr_pages(folio));
+ if (ret) {
+ obj_cgroup_put(objcg);
+ return ret;
+ }
+ commit_charge(folio, objcg);
memcg1_commit_charge(folio, memcg);
-out:
+
return ret;
}
@@ -4894,7 +5096,7 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
}
struct uncharge_gather {
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
unsigned long nr_memory;
unsigned long pgpgout;
unsigned long nr_kmem;
@@ -4908,58 +5110,52 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
static void uncharge_batch(const struct uncharge_gather *ug)
{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(ug->objcg);
if (ug->nr_memory) {
- memcg_uncharge(ug->memcg, ug->nr_memory);
+ memcg_uncharge(memcg, ug->nr_memory);
if (ug->nr_kmem) {
- mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem);
- memcg1_account_kmem(ug->memcg, -ug->nr_kmem);
+ mod_memcg_state(memcg, MEMCG_KMEM, -ug->nr_kmem);
+ memcg1_account_kmem(memcg, -ug->nr_kmem);
}
- memcg1_oom_recover(ug->memcg);
+ memcg1_oom_recover(memcg);
}
- memcg1_uncharge_batch(ug->memcg, ug->pgpgout, ug->nr_memory, ug->nid);
+ memcg1_uncharge_batch(memcg, ug->pgpgout, ug->nr_memory, ug->nid);
+ rcu_read_unlock();
/* drop reference from uncharge_folio */
- css_put(&ug->memcg->css);
+ obj_cgroup_put(ug->objcg);
}
static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
{
long nr_pages;
- struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
/*
* Nobody should be changing or seriously looking at
- * folio memcg or objcg at this point, we have fully
- * exclusive access to the folio.
+ * folio objcg at this point, we have fully exclusive
+ * access to the folio.
*/
- if (folio_memcg_kmem(folio)) {
- objcg = __folio_objcg(folio);
- /*
- * This get matches the put at the end of the function and
- * kmem pages do not hold memcg references anymore.
- */
- memcg = get_mem_cgroup_from_objcg(objcg);
- } else {
- memcg = __folio_memcg(folio);
- }
-
- if (!memcg)
+ objcg = folio_objcg(folio);
+ if (!objcg)
return;
- if (ug->memcg != memcg) {
- if (ug->memcg) {
+ if (ug->objcg != objcg) {
+ if (ug->objcg) {
uncharge_batch(ug);
uncharge_gather_clear(ug);
}
- ug->memcg = memcg;
+ ug->objcg = objcg;
ug->nid = folio_nid(folio);
- /* pairs with css_put in uncharge_batch */
- css_get(&memcg->css);
+ /* pairs with obj_cgroup_put in uncharge_batch */
+ obj_cgroup_get(objcg);
}
nr_pages = folio_nr_pages(folio);
@@ -4967,20 +5163,17 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
if (folio_memcg_kmem(folio)) {
ug->nr_memory += nr_pages;
ug->nr_kmem += nr_pages;
-
- folio->memcg_data = 0;
- obj_cgroup_put(objcg);
} else {
/* LRU pages aren't accounted at the root level */
- if (!mem_cgroup_is_root(memcg))
+ if (!obj_cgroup_is_root(objcg))
ug->nr_memory += nr_pages;
ug->pgpgout++;
WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
- folio->memcg_data = 0;
}
- css_put(&memcg->css);
+ folio->memcg_data = 0;
+ obj_cgroup_put(objcg);
}
void __mem_cgroup_uncharge(struct folio *folio)
@@ -5004,7 +5197,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
uncharge_gather_clear(&ug);
for (i = 0; i < folios->nr; i++)
uncharge_folio(folios->folios[i], &ug);
- if (ug.memcg)
+ if (ug.objcg)
uncharge_batch(&ug);
}
@@ -5021,6 +5214,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
{
struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
long nr_pages = folio_nr_pages(new);
VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
@@ -5035,21 +5229,24 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
if (folio_memcg_charged(new))
return;
- memcg = folio_memcg(old);
- VM_WARN_ON_ONCE_FOLIO(!memcg, old);
- if (!memcg)
+ objcg = folio_objcg(old);
+ VM_WARN_ON_ONCE_FOLIO(!objcg, old);
+ if (!objcg)
return;
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
/* Force-charge the new page. The old one will be freed soon */
- if (!mem_cgroup_is_root(memcg)) {
+ if (!obj_cgroup_is_root(objcg)) {
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
}
- css_get(&memcg->css);
- commit_charge(new, memcg);
+ obj_cgroup_get(objcg);
+ commit_charge(new, objcg);
memcg1_commit_charge(new, memcg);
+ rcu_read_unlock();
}
/**
@@ -5065,7 +5262,7 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
*/
void mem_cgroup_migrate(struct folio *old, struct folio *new)
{
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
@@ -5076,18 +5273,18 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
if (mem_cgroup_disabled())
return;
- memcg = folio_memcg(old);
+ objcg = folio_objcg(old);
/*
- * Note that it is normal to see !memcg for a hugetlb folio.
+ * Note that it is normal to see !objcg for a hugetlb folio.
* For e.g, it could have been allocated when memory_hugetlb_accounting
* was not selected.
*/
- VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old);
- if (!memcg)
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !objcg, old);
+ if (!objcg)
return;
- /* Transfer the charge and the css ref */
- commit_charge(new, memcg);
+ /* Transfer the charge and the objcg ref */
+ commit_charge(new, objcg);
/* Warning should never happen, so don't worry about refcount non-0 */
WARN_ON_ONCE(folio_unqueue_deferred_split(old));
@@ -5270,22 +5467,27 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
unsigned int nr_pages = folio_nr_pages(folio);
struct page_counter *counter;
struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
if (do_memsw_account())
return 0;
- memcg = folio_memcg(folio);
-
- VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
- if (!memcg)
+ objcg = folio_objcg(folio);
+ VM_WARN_ON_ONCE_FOLIO(!objcg, folio);
+ if (!objcg)
return 0;
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
if (!entry.val) {
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
+ rcu_read_unlock();
return 0;
}
memcg = mem_cgroup_private_id_get_online(memcg, nr_pages);
+ /* memcg is pinned by the memcg ID. */
+ rcu_read_unlock();
if (!mem_cgroup_is_root(memcg) &&
!page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
@@ -5343,27 +5545,29 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
bool mem_cgroup_swap_full(struct folio *folio)
{
struct mem_cgroup *memcg;
+ bool ret = false;
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (vm_swap_full())
return true;
- if (do_memsw_account())
- return false;
+ if (do_memsw_account() || !folio_memcg_charged(folio))
+ return ret;
+ rcu_read_lock();
memcg = folio_memcg(folio);
- if (!memcg)
- return false;
-
for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
unsigned long usage = page_counter_read(&memcg->swap);
if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
- usage * 2 >= READ_ONCE(memcg->swap.max))
- return true;
+ usage * 2 >= READ_ONCE(memcg->swap.max)) {
+ ret = true;
+ break;
+ }
}
+ rcu_read_unlock();
- return false;
+ return ret;
}
static int __init setup_swap_account(char *s)
@@ -5559,6 +5763,9 @@ void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
+ if (obj_cgroup_is_root(objcg))
+ return;
+
VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
/* PF_MEMALLOC context, charging must succeed */
@@ -5588,6 +5795,9 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
+ if (obj_cgroup_is_root(objcg))
+ return;
+
obj_cgroup_uncharge(objcg, size);
rcu_read_lock();
diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c
index bc7f4f045edf..b02b503c750d 100644
--- a/mm/memfd_luo.c
+++ b/mm/memfd_luo.c
@@ -105,7 +105,6 @@ static int memfd_luo_preserve_folios(struct file *file,
if (!size) {
*nr_foliosp = 0;
*out_folios_ser = NULL;
- memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
return 0;
}
@@ -410,6 +409,7 @@ static int memfd_luo_retrieve_folios(struct file *file,
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
struct folio *folio;
+ long npages, nr_added_pages = 0;
int err = -EIO;
long i;
@@ -456,21 +456,26 @@ static int memfd_luo_retrieve_folios(struct file *file,
if (flags & MEMFD_LUO_FOLIO_DIRTY)
folio_mark_dirty(folio);
- err = shmem_inode_acct_blocks(inode, 1);
+ npages = folio_nr_pages(folio);
+ err = shmem_inode_acct_blocks(inode, npages);
if (err) {
- pr_err("shmem: failed to account folio index %ld: %d\n",
- i, err);
- goto unlock_folio;
+ pr_err("shmem: failed to account folio index %ld(%ld pages): %d\n",
+ i, npages, err);
+ goto remove_from_cache;
}
- shmem_recalc_inode(inode, 1, 0);
+ nr_added_pages += npages;
folio_add_lru(folio);
folio_unlock(folio);
folio_put(folio);
}
+ shmem_recalc_inode(inode, nr_added_pages, 0);
+
return 0;
+remove_from_cache:
+ filemap_remove_folio(folio);
unlock_folio:
folio_unlock(folio);
folio_put(folio);
@@ -481,12 +486,19 @@ put_folios:
*/
for (long j = i + 1; j < nr_folios; j++) {
const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];
+ phys_addr_t phys;
+
+ if (!pfolio->pfn)
+ continue;
- folio = kho_restore_folio(pfolio->pfn);
+ phys = PFN_PHYS(pfolio->pfn);
+ folio = kho_restore_folio(phys);
if (folio)
folio_put(folio);
}
+ shmem_recalc_inode(inode, nr_added_pages, 0);
+
return err;
}
@@ -525,7 +537,7 @@ static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
}
vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
- file->f_inode->i_size = ser->size;
+ i_size_write(file_inode(file), ser->size);
if (ser->nr_folios) {
folios_ser = kho_restore_vmalloc(&ser->folios);
@@ -560,6 +572,11 @@ static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
return shmem_file(file) && !inode->i_nlink;
}
+static unsigned long memfd_luo_get_id(struct file *file)
+{
+ return (unsigned long)file_inode(file);
+}
+
static const struct liveupdate_file_ops memfd_luo_file_ops = {
.freeze = memfd_luo_freeze,
.finish = memfd_luo_finish,
@@ -567,6 +584,7 @@ static const struct liveupdate_file_ops memfd_luo_file_ops = {
.preserve = memfd_luo_preserve,
.unpreserve = memfd_luo_unpreserve,
.can_preserve = memfd_luo_can_preserve,
+ .get_id = memfd_luo_get_id,
.owner = THIS_MODULE,
};
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2e136b738889..99179314a444 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -3706,18 +3706,19 @@ static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
new_wi_state->iw_table[i] = 1;
mutex_lock(&wi_state_lock);
- if (!input) {
- old_wi_state = rcu_dereference_protected(wi_state,
- lockdep_is_held(&wi_state_lock));
- if (!old_wi_state)
- goto update_wi_state;
- if (input == old_wi_state->mode_auto) {
- mutex_unlock(&wi_state_lock);
- return count;
- }
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
- memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
- nr_node_ids * sizeof(u8));
+ if (old_wi_state && input == old_wi_state->mode_auto) {
+ mutex_unlock(&wi_state_lock);
+ kfree(new_wi_state);
+ return count;
+ }
+
+ if (!input) {
+ if (old_wi_state)
+ memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+ nr_node_ids * sizeof(u8));
goto update_wi_state;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 76142a02192b..8a64291ab5b4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -672,6 +672,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
struct lruvec *old_lruvec, *new_lruvec;
struct mem_cgroup *memcg;
+ rcu_read_lock();
memcg = folio_memcg(folio);
old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
@@ -699,6 +700,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
__mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
}
+ rcu_read_unlock();
}
local_irq_enable();
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 2912eba575d5..fbfe5715f635 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -175,12 +175,6 @@ static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start,
return migrate_vma_collect_skip(start, end, walk);
}
- if (softleaf_is_migration(entry)) {
- softleaf_entry_wait_on_locked(entry, ptl);
- spin_unlock(ptl);
- return -EAGAIN;
- }
-
if (softleaf_is_device_private_write(entry))
write = MIGRATE_PFN_WRITE;
} else {
diff --git a/mm/mlock.c b/mm/mlock.c
index fdbd1434a35f..8c227fefa2df 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -205,7 +205,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch)
}
if (lruvec)
- unlock_page_lruvec_irq(lruvec);
+ lruvec_unlock_irq(lruvec);
folios_put(fbatch);
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 110d47a36d4b..9cbf932b028c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -117,9 +117,9 @@ static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
}
/* Set nr_ptes number of ptes, starting from idx */
-static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr,
- pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes,
- int idx, bool set_write, struct mmu_gather *tlb)
+static __always_inline void prot_commit_flush_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, pte_t oldpte, pte_t ptent,
+ int nr_ptes, int idx, bool set_write, struct mmu_gather *tlb)
{
/*
* Advance the position in the batch by idx; note that if idx > 0,
@@ -143,7 +143,7 @@ static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long add
* !PageAnonExclusive() pages, starting from start_idx. Caller must enforce
* that the ptes point to consecutive pages of the same anon large folio.
*/
-static int page_anon_exclusive_sub_batch(int start_idx, int max_len,
+static __always_inline int page_anon_exclusive_sub_batch(int start_idx, int max_len,
struct page *first_page, bool expected_anon_exclusive)
{
int idx;
@@ -169,7 +169,7 @@ static int page_anon_exclusive_sub_batch(int start_idx, int max_len,
* pte of the batch. Therefore, we must individually check all pages and
* retrieve sub-batches.
*/
-static void commit_anon_folio_batch(struct vm_area_struct *vma,
+static __always_inline void commit_anon_folio_batch(struct vm_area_struct *vma,
struct folio *folio, struct page *first_page, unsigned long addr, pte_t *ptep,
pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
{
@@ -188,7 +188,7 @@ static void commit_anon_folio_batch(struct vm_area_struct *vma,
}
}
-static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma,
+static __always_inline void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma,
struct folio *folio, struct page *page, unsigned long addr, pte_t *ptep,
pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
{
@@ -211,6 +211,111 @@ static void set_write_prot_commit_flush_ptes(struct vm_area_struct *vma,
commit_anon_folio_batch(vma, folio, page, addr, ptep, oldpte, ptent, nr_ptes, tlb);
}
+static long change_softleaf_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte, pte_t oldpte, unsigned long cp_flags)
+{
+ const bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+ const bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+ softleaf_t entry = softleaf_from_pte(oldpte);
+ pte_t newpte;
+
+ if (softleaf_is_migration_write(entry)) {
+ const struct folio *folio = softleaf_to_folio(entry);
+
+ /*
+ * A protection check is difficult so
+ * just be safe and disable write
+ */
+ if (folio_test_anon(folio))
+ entry = make_readable_exclusive_migration_entry(swp_offset(entry));
+ else
+ entry = make_readable_migration_entry(swp_offset(entry));
+ newpte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(oldpte))
+ newpte = pte_swp_mksoft_dirty(newpte);
+ } else if (softleaf_is_device_private_write(entry)) {
+ /*
+ * We do not preserve soft-dirtiness. See
+ * copy_nonpresent_pte() for explanation.
+ */
+ entry = make_readable_device_private_entry(swp_offset(entry));
+ newpte = swp_entry_to_pte(entry);
+ if (pte_swp_uffd_wp(oldpte))
+ newpte = pte_swp_mkuffd_wp(newpte);
+ } else if (softleaf_is_marker(entry)) {
+ /*
+ * Ignore error swap entries unconditionally,
+ * because any access should sigbus/sigsegv
+ * anyway.
+ */
+ if (softleaf_is_poison_marker(entry) ||
+ softleaf_is_guard_marker(entry))
+ return 0;
+ /*
+ * If this is uffd-wp pte marker and we'd like
+ * to unprotect it, drop it; the next page
+ * fault will trigger without uffd trapping.
+ */
+ if (uffd_wp_resolve) {
+ pte_clear(vma->vm_mm, addr, pte);
+ return 1;
+ }
+ return 0;
+ } else {
+ newpte = oldpte;
+ }
+
+ if (uffd_wp)
+ newpte = pte_swp_mkuffd_wp(newpte);
+ else if (uffd_wp_resolve)
+ newpte = pte_swp_clear_uffd_wp(newpte);
+
+ if (!pte_same(oldpte, newpte)) {
+ set_pte_at(vma->vm_mm, addr, pte, newpte);
+ return 1;
+ }
+ return 0;
+}
+
+static __always_inline void change_present_ptes(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
+ int nr_ptes, unsigned long end, pgprot_t newprot,
+ struct folio *folio, struct page *page, unsigned long cp_flags)
+{
+ const bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+ const bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+ pte_t ptent, oldpte;
+
+ oldpte = modify_prot_start_ptes(vma, addr, ptep, nr_ptes);
+ ptent = pte_modify(oldpte, newprot);
+
+ if (uffd_wp)
+ ptent = pte_mkuffd_wp(ptent);
+ else if (uffd_wp_resolve)
+ ptent = pte_clear_uffd_wp(ptent);
+
+ /*
+ * In some writable, shared mappings, we might want
+ * to catch actual write access -- see
+ * vma_wants_writenotify().
+ *
+ * In all writable, private mappings, we have to
+ * properly handle COW.
+ *
+ * In both cases, we can sometimes still change PTEs
+ * writable and avoid the write-fault handler, for
+ * example, if a PTE is already dirty and no other
+ * COW or special handling is required.
+ */
+ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
+ !pte_write(ptent))
+ set_write_prot_commit_flush_ptes(vma, folio, page,
+ addr, ptep, oldpte, ptent, nr_ptes, tlb);
+ else
+ prot_commit_flush_ptes(vma, addr, ptep, oldpte, ptent,
+ nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb);
+}
+
static long change_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
@@ -221,7 +326,6 @@ static long change_pte_range(struct mmu_gather *tlb,
bool is_private_single_threaded;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
- bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
int nr_ptes;
tlb_change_page_size(tlb, PAGE_SIZE);
@@ -242,7 +346,6 @@ static long change_pte_range(struct mmu_gather *tlb,
int max_nr_ptes = (end - addr) >> PAGE_SHIFT;
struct folio *folio = NULL;
struct page *page;
- pte_t ptent;
/* Already in the desired state. */
if (prot_numa && pte_protnone(oldpte))
@@ -268,34 +371,20 @@ static long change_pte_range(struct mmu_gather *tlb,
nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte, max_nr_ptes, flags);
- oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes);
- ptent = pte_modify(oldpte, newprot);
-
- if (uffd_wp)
- ptent = pte_mkuffd_wp(ptent);
- else if (uffd_wp_resolve)
- ptent = pte_clear_uffd_wp(ptent);
-
/*
- * In some writable, shared mappings, we might want
- * to catch actual write access -- see
- * vma_wants_writenotify().
- *
- * In all writable, private mappings, we have to
- * properly handle COW.
- *
- * In both cases, we can sometimes still change PTEs
- * writable and avoid the write-fault handler, for
- * example, if a PTE is already dirty and no other
- * COW or special handling is required.
+ * Optimize for the small-folio common case by
+ * special-casing it here. Compiler constant propagation
+ * plus copious amounts of __always_inline does wonders.
*/
- if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
- !pte_write(ptent))
- set_write_prot_commit_flush_ptes(vma, folio, page,
- addr, pte, oldpte, ptent, nr_ptes, tlb);
- else
- prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent,
- nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb);
+ if (likely(nr_ptes == 1)) {
+ change_present_ptes(tlb, vma, addr, pte, 1,
+ end, newprot, folio, page, cp_flags);
+ } else {
+ change_present_ptes(tlb, vma, addr, pte,
+ nr_ptes, end, newprot, folio, page,
+ cp_flags);
+ }
+
pages += nr_ptes;
} else if (pte_none(oldpte)) {
/*
@@ -317,66 +406,7 @@ static long change_pte_range(struct mmu_gather *tlb,
pages++;
}
} else {
- softleaf_t entry = softleaf_from_pte(oldpte);
- pte_t newpte;
-
- if (softleaf_is_migration_write(entry)) {
- const struct folio *folio = softleaf_to_folio(entry);
-
- /*
- * A protection check is difficult so
- * just be safe and disable write
- */
- if (folio_test_anon(folio))
- entry = make_readable_exclusive_migration_entry(
- swp_offset(entry));
- else
- entry = make_readable_migration_entry(swp_offset(entry));
- newpte = swp_entry_to_pte(entry);
- if (pte_swp_soft_dirty(oldpte))
- newpte = pte_swp_mksoft_dirty(newpte);
- } else if (softleaf_is_device_private_write(entry)) {
- /*
- * We do not preserve soft-dirtiness. See
- * copy_nonpresent_pte() for explanation.
- */
- entry = make_readable_device_private_entry(
- swp_offset(entry));
- newpte = swp_entry_to_pte(entry);
- if (pte_swp_uffd_wp(oldpte))
- newpte = pte_swp_mkuffd_wp(newpte);
- } else if (softleaf_is_marker(entry)) {
- /*
- * Ignore error swap entries unconditionally,
- * because any access should sigbus/sigsegv
- * anyway.
- */
- if (softleaf_is_poison_marker(entry) ||
- softleaf_is_guard_marker(entry))
- continue;
- /*
- * If this is uffd-wp pte marker and we'd like
- * to unprotect it, drop it; the next page
- * fault will trigger without uffd trapping.
- */
- if (uffd_wp_resolve) {
- pte_clear(vma->vm_mm, addr, pte);
- pages++;
- }
- continue;
- } else {
- newpte = oldpte;
- }
-
- if (uffd_wp)
- newpte = pte_swp_mkuffd_wp(newpte);
- else if (uffd_wp_resolve)
- newpte = pte_swp_clear_uffd_wp(newpte);
-
- if (!pte_same(oldpte, newpte)) {
- set_pte_at(vma->vm_mm, addr, pte, newpte);
- pages++;
- }
+ pages += change_softleaf_pte(vma, addr, pte, oldpte, cp_flags);
}
} while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
lazy_mmu_mode_disable();
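The comment about constant propagation in the hunk above is the whole point of the __always_inline sprinkling: when change_present_ptes() is called with the literal nr_ptes == 1, the inlined helpers can fold the batch handling away. A toy, self-contained illustration of that mechanism (not kernel code; __always_inline is spelled out here as the GCC attribute):

#include <stdio.h>

#define __always_inline inline __attribute__((__always_inline__))

/* Forced-inline helper: with a literal nr_ptes of 1, the compiler can drop
 * the batched branch entirely, which is what the nr_ptes == 1 special case
 * in change_pte_range() relies on. */
static __always_inline long advance(long addr, int nr_ptes, long page_size)
{
	if (nr_ptes == 1)
		return addr + page_size;
	return addr + (long)nr_ptes * page_size;
}

int main(void)
{
	printf("%ld\n", advance(0, 1, 4096));	/* small-folio fast path */
	printf("%ld\n", advance(0, 8, 4096));	/* batched path */
	return 0;
}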
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f349ca85b70e..65e205111553 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1242,10 +1242,18 @@ void __pgalloc_tag_add(struct page *page, struct task_struct *task,
union pgtag_ref_handle handle;
union codetag_ref ref;
- if (get_page_tag_ref(page, &ref, &handle)) {
+ if (likely(get_page_tag_ref(page, &ref, &handle))) {
alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr);
update_page_tag_ref(handle, &ref);
put_page_tag_ref(handle);
+ } else {
+ /*
+ * page_ext is not available yet; record the pfn so we can clear
+ * the tag ref later, once page_ext has been initialized.
+ */
+ alloc_tag_add_early_pfn(page_to_pfn(page));
+ if (task->alloc_tag)
+ alloc_tag_set_inaccurate(task->alloc_tag);
}
}
diff --git a/mm/page_io.c b/mm/page_io.c
index 330abc5ab7b4..70cea9e24d2f 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -276,10 +276,14 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT);
goto out_unlock;
}
+
+ rcu_read_lock();
if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio))) {
+ rcu_read_unlock();
folio_mark_dirty(folio);
return AOP_WRITEPAGE_ACTIVATE;
}
+ rcu_read_unlock();
__swap_writepage(folio, swap_plug);
return 0;
@@ -307,11 +311,11 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio)
struct cgroup_subsys_state *css;
struct mem_cgroup *memcg;
- memcg = folio_memcg(folio);
- if (!memcg)
+ if (!folio_memcg_charged(folio))
return;
rcu_read_lock();
+ memcg = folio_memcg(folio);
css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
bio_associate_blkg_from_css(bio, css);
rcu_read_unlock();
@@ -493,7 +497,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
folio_mark_uptodate(folio);
folio_unlock(folio);
}
- count_vm_events(PSWPIN, sio->pages);
+ count_vm_events(PSWPIN, sio->len >> PAGE_SHIFT);
} else {
for (p = 0; p < sio->pages; p++) {
struct folio *folio = page_folio(sio->bvec[p].bv_page);
diff --git a/mm/percpu.c b/mm/percpu.c
index a2107bdebf0b..b0676b8054ed 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1622,7 +1622,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
return true;
objcg = current_obj_cgroup();
- if (!objcg)
+ if (!objcg || obj_cgroup_is_root(objcg))
return true;
if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size)))
diff --git a/mm/shmem.c b/mm/shmem.c
index 19bf77925fa1..3b5dc21b323c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3177,119 +3177,99 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
#endif /* CONFIG_TMPFS_QUOTA */
#ifdef CONFIG_USERFAULTFD
-int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- uffd_flags_t flags,
- struct folio **foliop)
-{
- struct inode *inode = file_inode(dst_vma->vm_file);
- struct shmem_inode_info *info = SHMEM_I(inode);
+static struct folio *shmem_mfill_folio_alloc(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct inode *inode = file_inode(vma->vm_file);
struct address_space *mapping = inode->i_mapping;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ pgoff_t pgoff = linear_page_index(vma, addr);
gfp_t gfp = mapping_gfp_mask(mapping);
- pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
- void *page_kaddr;
struct folio *folio;
- int ret;
- pgoff_t max_off;
- if (shmem_inode_acct_blocks(inode, 1)) {
- /*
- * We may have got a page, returned -ENOENT triggering a retry,
- * and now we find ourselves with -ENOMEM. Release the page, to
- * avoid a BUG_ON in our caller.
- */
- if (unlikely(*foliop)) {
- folio_put(*foliop);
- *foliop = NULL;
- }
- return -ENOMEM;
- }
-
- if (!*foliop) {
- ret = -ENOMEM;
- folio = shmem_alloc_folio(gfp, 0, info, pgoff);
- if (!folio)
- goto out_unacct_blocks;
+ if (unlikely(pgoff >= DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)))
+ return NULL;
- if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
- page_kaddr = kmap_local_folio(folio, 0);
- /*
- * The read mmap_lock is held here. Despite the
- * mmap_lock being read recursive a deadlock is still
- * possible if a writer has taken a lock. For example:
- *
- * process A thread 1 takes read lock on own mmap_lock
- * process A thread 2 calls mmap, blocks taking write lock
- * process B thread 1 takes page fault, read lock on own mmap lock
- * process B thread 2 calls mmap, blocks taking write lock
- * process A thread 1 blocks taking read lock on process B
- * process B thread 1 blocks taking read lock on process A
- *
- * Disable page faults to prevent potential deadlock
- * and retry the copy outside the mmap_lock.
- */
- pagefault_disable();
- ret = copy_from_user(page_kaddr,
- (const void __user *)src_addr,
- PAGE_SIZE);
- pagefault_enable();
- kunmap_local(page_kaddr);
-
- /* fallback to copy_from_user outside mmap_lock */
- if (unlikely(ret)) {
- *foliop = folio;
- ret = -ENOENT;
- /* don't free the page */
- goto out_unacct_blocks;
- }
+ folio = shmem_alloc_folio(gfp, 0, info, pgoff);
+ if (!folio)
+ return NULL;
- flush_dcache_folio(folio);
- } else { /* ZEROPAGE */
- clear_user_highpage(&folio->page, dst_addr);
- }
- } else {
- folio = *foliop;
- VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
- *foliop = NULL;
+ if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) {
+ folio_put(folio);
+ return NULL;
}
- VM_BUG_ON(folio_test_locked(folio));
- VM_BUG_ON(folio_test_swapbacked(folio));
+ return folio;
+}
+
+static int shmem_mfill_filemap_add(struct folio *folio,
+ struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t pgoff = linear_page_index(vma, addr);
+ gfp_t gfp = mapping_gfp_mask(mapping);
+ int err;
+
__folio_set_locked(folio);
__folio_set_swapbacked(folio);
- __folio_mark_uptodate(folio);
-
- ret = -EFAULT;
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(pgoff >= max_off))
- goto out_release;
- ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp);
- if (ret)
- goto out_release;
- ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
- if (ret)
- goto out_release;
+ err = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
+ if (err)
+ goto err_unlock;
- ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
- &folio->page, true, flags);
- if (ret)
- goto out_delete_from_cache;
+ if (shmem_inode_acct_blocks(inode, 1)) {
+ err = -ENOMEM;
+ goto err_delete_from_cache;
+ }
+ folio_add_lru(folio);
shmem_recalc_inode(inode, 1, 0);
- folio_unlock(folio);
+
return 0;
-out_delete_from_cache:
+
+err_delete_from_cache:
filemap_remove_folio(folio);
-out_release:
+err_unlock:
folio_unlock(folio);
- folio_put(folio);
-out_unacct_blocks:
- shmem_inode_unacct_blocks(inode, 1);
- return ret;
+ return err;
}
+
+static void shmem_mfill_filemap_remove(struct folio *folio,
+ struct vm_area_struct *vma)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+
+ filemap_remove_folio(folio);
+ shmem_recalc_inode(inode, 0, 0);
+ folio_unlock(folio);
+}
+
+static struct folio *shmem_get_folio_noalloc(struct inode *inode, pgoff_t pgoff)
+{
+ struct folio *folio;
+ int err;
+
+ err = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
+ if (err)
+ return ERR_PTR(err);
+
+ return folio;
+}
+
+static bool shmem_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags)
+{
+ return true;
+}
+
+static const struct vm_uffd_ops shmem_uffd_ops = {
+ .can_userfault = shmem_can_userfault,
+ .get_folio_noalloc = shmem_get_folio_noalloc,
+ .alloc_folio = shmem_mfill_folio_alloc,
+ .filemap_add = shmem_mfill_filemap_add,
+ .filemap_remove = shmem_mfill_filemap_remove,
+};
#endif /* CONFIG_USERFAULTFD */
#ifdef CONFIG_TMPFS
@@ -5325,6 +5305,9 @@ static const struct vm_operations_struct shmem_vm_ops = {
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
#endif
+#ifdef CONFIG_USERFAULTFD
+ .uffd_ops = &shmem_uffd_ops,
+#endif
};
static const struct vm_operations_struct shmem_anon_vm_ops = {
@@ -5334,6 +5317,9 @@ static const struct vm_operations_struct shmem_anon_vm_ops = {
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
#endif
+#ifdef CONFIG_USERFAULTFD
+ .uffd_ops = &shmem_uffd_ops,
+#endif
};
int shmem_init_fs_context(struct fs_context *fc)
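The vm_uffd_ops table added above is what lets the generic userfaultfd code drive shmem without calling back into shmem-specific helpers. As a sketch only (not part of this patch), a hypothetical filesystem could opt in the same way; every myfs_* name below is made up, and only the ops fields and their signatures come from this diff:

/* Illustrative sketch, not from the patch: all myfs_* names are hypothetical. */
static struct folio *myfs_mfill_folio_alloc(struct vm_area_struct *vma,
					    unsigned long addr);
static int myfs_mfill_filemap_add(struct folio *folio,
				  struct vm_area_struct *vma,
				  unsigned long addr);
static void myfs_mfill_filemap_remove(struct folio *folio,
				      struct vm_area_struct *vma);

static bool myfs_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags)
{
	/* refuse MINOR mode if the fs cannot serve UFFDIO_CONTINUE */
	return !(vm_flags & VM_UFFD_MINOR);
}

static const struct vm_uffd_ops myfs_uffd_ops = {
	.can_userfault	= myfs_can_userfault,
	.alloc_folio	= myfs_mfill_folio_alloc,
	.filemap_add	= myfs_mfill_filemap_add,
	.filemap_remove	= myfs_mfill_filemap_remove,
	/* .get_folio_noalloc left NULL: mfill_get_vma() then rejects CONTINUE */
};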
diff --git a/mm/shrinker.c b/mm/shrinker.c
index c23086bccf4d..76b3f750cf65 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -288,14 +288,10 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
int nid, index, offset;
long nr;
- struct mem_cgroup *parent;
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
struct shrinker_info *child_info, *parent_info;
struct shrinker_info_unit *child_unit, *parent_unit;
- parent = parent_mem_cgroup(memcg);
- if (!parent)
- parent = root_mem_cgroup;
-
/* Prevent from concurrent shrinker_info expand */
mutex_lock(&shrinker_mutex);
for_each_node(nid) {
diff --git a/mm/sparse.c b/mm/sparse.c
index 007fd52c621e..effdac6b0ab1 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -403,7 +403,6 @@ failed:
ms = __nr_to_section(pnum);
if (!preinited_vmemmap_section(ms))
ms->section_mem_map = 0;
- ms->section_mem_map = 0;
}
}
diff --git a/mm/swap.c b/mm/swap.c
index 78b4aa811fc6..5cc44f0de987 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -91,7 +91,7 @@ static void page_cache_release(struct folio *folio)
__page_cache_release(folio, &lruvec, &flags);
if (lruvec)
- unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec_unlock_irqrestore(lruvec, flags);
}
void __folio_put(struct folio *folio)
@@ -175,7 +175,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
}
if (lruvec)
- unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec_unlock_irqrestore(lruvec, flags);
folios_put(fbatch);
}
@@ -240,6 +240,7 @@ void folio_rotate_reclaimable(struct folio *folio)
void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
unsigned int nr_io, unsigned int nr_rotated)
__releases(lruvec->lru_lock)
+ __releases(rcu)
{
unsigned long cost;
@@ -253,6 +254,7 @@ void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;
if (!cost) {
spin_unlock_irq(&lruvec->lru_lock);
+ rcu_read_unlock();
return;
}
@@ -285,8 +287,10 @@ void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
spin_unlock_irq(&lruvec->lru_lock);
lruvec = parent_lruvec(lruvec);
- if (!lruvec)
+ if (!lruvec) {
+ rcu_read_unlock();
break;
+ }
spin_lock_irq(&lruvec->lru_lock);
}
}
@@ -349,7 +353,7 @@ void folio_activate(struct folio *folio)
lruvec = folio_lruvec_lock_irq(folio);
lru_activate(lruvec, folio);
- unlock_page_lruvec_irq(lruvec);
+ lruvec_unlock_irq(lruvec);
folio_set_lru(folio);
}
#endif
@@ -412,18 +416,20 @@ static void lru_gen_inc_refs(struct folio *folio)
static bool lru_gen_clear_refs(struct folio *folio)
{
- struct lru_gen_folio *lrugen;
int gen = folio_lru_gen(folio);
int type = folio_is_file_lru(folio);
+ unsigned long seq;
if (gen < 0)
return true;
set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS | BIT(PG_workingset), 0);
- lrugen = &folio_lruvec(folio)->lrugen;
+ rcu_read_lock();
+ seq = READ_ONCE(folio_lruvec(folio)->lrugen.min_seq[type]);
+ rcu_read_unlock();
/* whether can do without shuffling under the LRU lock */
- return gen == lru_gen_from_seq(READ_ONCE(lrugen->min_seq[type]));
+ return gen == lru_gen_from_seq(seq);
}
#else /* !CONFIG_LRU_GEN */
@@ -963,7 +969,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
if (folio_is_zone_device(folio)) {
if (lruvec) {
- unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec_unlock_irqrestore(lruvec, flags);
lruvec = NULL;
}
if (folio_ref_sub_and_test(folio, nr_refs))
@@ -977,7 +983,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
/* hugetlb has its own memcg */
if (folio_test_hugetlb(folio)) {
if (lruvec) {
- unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec_unlock_irqrestore(lruvec, flags);
lruvec = NULL;
}
free_huge_folio(folio);
@@ -991,7 +997,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
j++;
}
if (lruvec)
- unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec_unlock_irqrestore(lruvec, flags);
if (!j) {
folio_batch_reinit(folios);
return;
@@ -1084,6 +1090,39 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
fbatch->nr = j;
}
+#ifdef CONFIG_MEMCG
+static void lruvec_reparent_lru(struct lruvec *child_lruvec,
+ struct lruvec *parent_lruvec,
+ enum lru_list lru, int nid)
+{
+ int zid;
+ struct zone *zone;
+
+ if (lru != LRU_UNEVICTABLE)
+ list_splice_tail_init(&child_lruvec->lists[lru], &parent_lruvec->lists[lru]);
+
+ for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) {
+ unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid);
+
+ mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size);
+ }
+}
+
+void lru_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid)
+{
+ enum lru_list lru;
+ struct lruvec *child_lruvec, *parent_lruvec;
+
+ child_lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ parent_lruvec = mem_cgroup_lruvec(parent, NODE_DATA(nid));
+ parent_lruvec->anon_cost += child_lruvec->anon_cost;
+ parent_lruvec->file_cost += child_lruvec->file_cost;
+
+ for_each_lru(lru)
+ lruvec_reparent_lru(child_lruvec, parent_lruvec, lru, nid);
+}
+#endif
+
static const struct ctl_table swap_sysctl_table[] = {
{
.procname = "page-cluster",
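lru_reparent_memcg() above moves a child memcg's classic LRU lists and per-zone LRU sizes into its parent, one node at a time. The real call site lives in mm/memcontrol.c and is not part of this hunk; the following is an assumed sketch of what such a caller could look like (the locking shown is an assumption for illustration, not taken from the patch):

/* Assumed caller shape; lru_reparent_memcg() itself is defined above. */
static void reparent_all_nodes_sketch(struct mem_cgroup *memcg,
				      struct mem_cgroup *parent)
{
	int nid;

	for_each_node(nid) {
		struct lruvec *child = mem_cgroup_lruvec(memcg, NODE_DATA(nid));

		lruvec_lock_irq(child);		/* assumption: child lock suffices */
		lru_reparent_memcg(memcg, parent, nid);
		lruvec_unlock_irq(child);
	}
}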
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 89879c3ba344..885da1e56466 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -14,12 +14,61 @@
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
-#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
#include "swap.h"
+struct mfill_state {
+ struct userfaultfd_ctx *ctx;
+ unsigned long src_start;
+ unsigned long dst_start;
+ unsigned long len;
+ uffd_flags_t flags;
+
+ struct vm_area_struct *vma;
+ unsigned long src_addr;
+ unsigned long dst_addr;
+ pmd_t *pmd;
+};
+
+static bool anon_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags)
+{
+ /* anonymous memory does not support MINOR mode */
+ if (vm_flags & VM_UFFD_MINOR)
+ return false;
+ return true;
+}
+
+static struct folio *anon_alloc_folio(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct folio *folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
+ addr);
+
+ if (!folio)
+ return NULL;
+
+ if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) {
+ folio_put(folio);
+ return NULL;
+ }
+
+ return folio;
+}
+
+static const struct vm_uffd_ops anon_uffd_ops = {
+ .can_userfault = anon_can_userfault,
+ .alloc_folio = anon_alloc_folio,
+};
+
+static const struct vm_uffd_ops *vma_uffd_ops(struct vm_area_struct *vma)
+{
+ if (vma_is_anonymous(vma))
+ return &anon_uffd_ops;
+ return vma->vm_ops ? vma->vm_ops->uffd_ops : NULL;
+}
+
static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
@@ -143,6 +192,128 @@ static void uffd_mfill_unlock(struct vm_area_struct *vma)
}
#endif
+static void mfill_put_vma(struct mfill_state *state)
+{
+ if (!state->vma)
+ return;
+
+ up_read(&state->ctx->map_changing_lock);
+ uffd_mfill_unlock(state->vma);
+ state->vma = NULL;
+}
+
+static int mfill_get_vma(struct mfill_state *state)
+{
+ struct userfaultfd_ctx *ctx = state->ctx;
+ uffd_flags_t flags = state->flags;
+ struct vm_area_struct *dst_vma;
+ const struct vm_uffd_ops *ops;
+ int err;
+
+ /*
+ * Make sure the vma is not shared, and that the dst range is
+ * both valid and fully within a single existing vma.
+ */
+ dst_vma = uffd_mfill_lock(ctx->mm, state->dst_start, state->len);
+ if (IS_ERR(dst_vma))
+ return PTR_ERR(dst_vma);
+
+ /*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ down_read(&ctx->map_changing_lock);
+ state->vma = dst_vma;
+ err = -EAGAIN;
+ if (atomic_read(&ctx->mmap_changing))
+ goto out_unlock;
+
+ err = -EINVAL;
+
+ /*
+ * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
+ * it will overwrite vm_ops, so vma_is_anonymous must return false.
+ */
+ if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
+ dst_vma->vm_flags & VM_SHARED))
+ goto out_unlock;
+
+ /*
+ * validate 'mode' now that we know the dst_vma: don't allow
+ * a wrprotect copy if the userfaultfd didn't register as WP.
+ */
+ if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
+ goto out_unlock;
+
+ if (is_vm_hugetlb_page(dst_vma))
+ return 0;
+
+ ops = vma_uffd_ops(dst_vma);
+ if (!ops)
+ goto out_unlock;
+
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
+ !ops->get_folio_noalloc)
+ goto out_unlock;
+
+ return 0;
+
+out_unlock:
+ mfill_put_vma(state);
+ return err;
+}
+
+static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ pgd = pgd_offset(mm, address);
+ p4d = p4d_alloc(mm, pgd, address);
+ if (!p4d)
+ return NULL;
+ pud = pud_alloc(mm, p4d, address);
+ if (!pud)
+ return NULL;
+ /*
+ * Note that we didn't necessarily run this because the pmd was
+ * missing; the *pmd may already be established and in
+ * turn it may also be a trans_huge_pmd.
+ */
+ return pmd_alloc(mm, pud, address);
+}
+
+static int mfill_establish_pmd(struct mfill_state *state)
+{
+ struct mm_struct *dst_mm = state->ctx->mm;
+ pmd_t *dst_pmd, dst_pmdval;
+
+ dst_pmd = mm_alloc_pmd(dst_mm, state->dst_addr);
+ if (unlikely(!dst_pmd))
+ return -ENOMEM;
+
+ dst_pmdval = pmdp_get_lockless(dst_pmd);
+ if (unlikely(pmd_none(dst_pmdval)) &&
+ unlikely(__pte_alloc(dst_mm, dst_pmd)))
+ return -ENOMEM;
+
+ dst_pmdval = pmdp_get_lockless(dst_pmd);
+ /*
+ * If the dst_pmd is THP don't override it and just be strict.
+ * (This includes the case where the PMD used to be THP and
+ * changed back to none after __pte_alloc().)
+ */
+ if (unlikely(!pmd_present(dst_pmdval) || pmd_leaf(dst_pmdval)))
+ return -EEXIST;
+ if (unlikely(pmd_bad(dst_pmdval)))
+ return -EFAULT;
+
+ state->pmd = dst_pmd;
+ return 0;
+}
+
/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
unsigned long dst_addr)
@@ -165,10 +336,10 @@ static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
* This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
* and anon, and for both shared and private VMAs.
*/
-int mfill_atomic_install_pte(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr, struct page *page,
- bool newly_allocated, uffd_flags_t flags)
+static int mfill_atomic_install_pte(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr, struct page *page,
+ uffd_flags_t flags)
{
int ret;
struct mm_struct *dst_mm = dst_vma->vm_mm;
@@ -212,9 +383,6 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
goto out_unlock;
if (page_in_cache) {
- /* Usually, cache pages are already added to LRU */
- if (newly_allocated)
- folio_add_lru(folio);
folio_add_file_rmap_pte(folio, page, dst_vma);
} else {
folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE);
@@ -229,6 +397,9 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+ if (page_in_cache)
+ folio_unlock(folio);
+
/* No need to invalidate - it was non-present before */
update_mmu_cache(dst_vma, dst_addr, dst_pte);
ret = 0;
@@ -238,58 +409,100 @@ out:
return ret;
}
-static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- uffd_flags_t flags,
- struct folio **foliop)
+static int mfill_copy_folio_locked(struct folio *folio, unsigned long src_addr)
{
void *kaddr;
int ret;
+
+ kaddr = kmap_local_folio(folio, 0);
+ /*
+ * The read mmap_lock is held here. Despite the
+ * mmap_lock being read recursive a deadlock is still
+ * possible if a writer has taken a lock. For example:
+ *
+ * process A thread 1 takes read lock on own mmap_lock
+ * process A thread 2 calls mmap, blocks taking write lock
+ * process B thread 1 takes page fault, read lock on own mmap lock
+ * process B thread 2 calls mmap, blocks taking write lock
+ * process A thread 1 blocks taking read lock on process B
+ * process B thread 1 blocks taking read lock on process A
+ *
+ * Disable page faults to prevent potential deadlock
+ * and retry the copy outside the mmap_lock.
+ */
+ pagefault_disable();
+ ret = copy_from_user(kaddr, (const void __user *) src_addr,
+ PAGE_SIZE);
+ pagefault_enable();
+ kunmap_local(kaddr);
+
+ if (ret)
+ return -EFAULT;
+
+ flush_dcache_folio(folio);
+ return ret;
+}
+
+static int mfill_copy_folio_retry(struct mfill_state *state, struct folio *folio)
+{
+ unsigned long src_addr = state->src_addr;
+ void *kaddr;
+ int err;
+
+ /* retry copying with the mmap_lock dropped */
+ mfill_put_vma(state);
+
+ kaddr = kmap_local_folio(folio, 0);
+ err = copy_from_user(kaddr, (const void __user *) src_addr, PAGE_SIZE);
+ kunmap_local(kaddr);
+ if (unlikely(err))
+ return -EFAULT;
+
+ flush_dcache_folio(folio);
+
+ /* re-acquire the VMA and PMD; they could have changed underneath us */
+ err = mfill_get_vma(state);
+ if (err)
+ return err;
+
+ err = mfill_establish_pmd(state);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int __mfill_atomic_pte(struct mfill_state *state,
+ const struct vm_uffd_ops *ops)
+{
+ unsigned long dst_addr = state->dst_addr;
+ unsigned long src_addr = state->src_addr;
+ uffd_flags_t flags = state->flags;
struct folio *folio;
+ int ret;
- if (!*foliop) {
- ret = -ENOMEM;
- folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
- dst_addr);
- if (!folio)
- goto out;
+ folio = ops->alloc_folio(state->vma, state->dst_addr);
+ if (!folio)
+ return -ENOMEM;
- kaddr = kmap_local_folio(folio, 0);
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
+ ret = mfill_copy_folio_locked(folio, src_addr);
/*
- * The read mmap_lock is held here. Despite the
- * mmap_lock being read recursive a deadlock is still
- * possible if a writer has taken a lock. For example:
- *
- * process A thread 1 takes read lock on own mmap_lock
- * process A thread 2 calls mmap, blocks taking write lock
- * process B thread 1 takes page fault, read lock on own mmap lock
- * process B thread 2 calls mmap, blocks taking write lock
- * process A thread 1 blocks taking read lock on process B
- * process B thread 1 blocks taking read lock on process A
- *
- * Disable page faults to prevent potential deadlock
- * and retry the copy outside the mmap_lock.
+ * Fallback to copy_from_user outside mmap_lock.
+ * If the retry is successful, mfill_copy_folio_retry() returns
+ * with the locks retaken by mfill_get_vma().
+ * If there was an error, we must call mfill_put_vma() anyway; it
+ * will take care of unlocking if needed.
*/
- pagefault_disable();
- ret = copy_from_user(kaddr, (const void __user *) src_addr,
- PAGE_SIZE);
- pagefault_enable();
- kunmap_local(kaddr);
-
- /* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
- ret = -ENOENT;
- *foliop = folio;
- /* don't free the page */
- goto out;
+ ret = mfill_copy_folio_retry(state, folio);
+ if (ret)
+ goto err_folio_put;
}
-
- flush_dcache_folio(folio);
+ } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
+ clear_user_highpage(&folio->page, state->dst_addr);
} else {
- folio = *foliop;
- *foliop = NULL;
+ VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags);
}
/*
@@ -299,63 +512,65 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
*/
__folio_mark_uptodate(folio);
- ret = -ENOMEM;
- if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
- goto out_release;
+ if (ops->filemap_add) {
+ ret = ops->filemap_add(folio, state->vma, state->dst_addr);
+ if (ret)
+ goto err_folio_put;
+ }
- ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
- &folio->page, true, flags);
+ ret = mfill_atomic_install_pte(state->pmd, state->vma, dst_addr,
+ &folio->page, flags);
if (ret)
- goto out_release;
-out:
- return ret;
-out_release:
+ goto err_filemap_remove;
+
+ return 0;
+
+err_filemap_remove:
+ if (ops->filemap_remove)
+ ops->filemap_remove(folio, state->vma);
+err_folio_put:
folio_put(folio);
- goto out;
+ return ret;
}
-static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr)
+static int mfill_atomic_pte_copy(struct mfill_state *state)
{
- struct folio *folio;
- int ret = -ENOMEM;
-
- folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
- if (!folio)
- return ret;
-
- if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
- goto out_put;
+ const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma);
/*
- * The memory barrier inside __folio_mark_uptodate makes sure that
- * zeroing out the folio become visible before mapping the page
- * using set_pte_at(). See do_anonymous_page().
+ * The normal page fault path for a MAP_PRIVATE mapping in a
+ * file-backed VMA will invoke the fault, fill the hole in the file and
+ * COW it right away. The result generates plain anonymous memory.
+ * So when we are asked to fill a hole in a MAP_PRIVATE mapping, we'll
+ * generate anonymous memory directly without actually filling the
+ * hole. For the MAP_PRIVATE case the robustness check only happens in
+ * the pagetable (to verify it's still none) and not in the page cache.
*/
- __folio_mark_uptodate(folio);
+ if (!(state->vma->vm_flags & VM_SHARED))
+ ops = &anon_uffd_ops;
- ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
- &folio->page, true, 0);
- if (ret)
- goto out_put;
+ return __mfill_atomic_pte(state, ops);
+}
- return 0;
-out_put:
- folio_put(folio);
- return ret;
+static int mfill_atomic_pte_zeroed_folio(struct mfill_state *state)
+{
+ const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma);
+
+ return __mfill_atomic_pte(state, ops);
}
-static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr)
+static int mfill_atomic_pte_zeropage(struct mfill_state *state)
{
+ struct vm_area_struct *dst_vma = state->vma;
+ unsigned long dst_addr = state->dst_addr;
+ pmd_t *dst_pmd = state->pmd;
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
int ret;
- if (mm_forbids_zeropage(dst_vma->vm_mm))
- return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);
+ if (mm_forbids_zeropage(dst_vma->vm_mm) ||
+ (dst_vma->vm_flags & VM_SHARED))
+ return mfill_atomic_pte_zeroed_folio(state);
_dst_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr),
dst_vma->vm_page_prot));
@@ -381,28 +596,29 @@ out:
}
/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
-static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- uffd_flags_t flags)
+static int mfill_atomic_pte_continue(struct mfill_state *state)
{
- struct inode *inode = file_inode(dst_vma->vm_file);
+ struct vm_area_struct *dst_vma = state->vma;
+ const struct vm_uffd_ops *ops = vma_uffd_ops(dst_vma);
+ unsigned long dst_addr = state->dst_addr;
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+ struct inode *inode = file_inode(dst_vma->vm_file);
+ uffd_flags_t flags = state->flags;
+ pmd_t *dst_pmd = state->pmd;
struct folio *folio;
struct page *page;
int ret;
- ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
- /* Our caller expects us to return -EFAULT if we failed to find folio */
- if (ret == -ENOENT)
- ret = -EFAULT;
- if (ret)
- goto out;
- if (!folio) {
- ret = -EFAULT;
- goto out;
+ if (!ops) {
+ VM_WARN_ONCE(1, "UFFDIO_CONTINUE for unsupported VMA");
+ return -EOPNOTSUPP;
}
+ folio = ops->get_folio_noalloc(inode, pgoff);
+ /* Our caller expects us to return -EFAULT if we failed to find folio */
+ if (IS_ERR_OR_NULL(folio))
+ return -EFAULT;
+
page = folio_file_page(folio, pgoff);
if (PageHWPoison(page)) {
ret = -EIO;
@@ -410,30 +626,28 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
}
ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
- page, false, flags);
+ page, flags);
if (ret)
goto out_release;
- folio_unlock(folio);
- ret = 0;
-out:
- return ret;
+ return 0;
+
out_release:
folio_unlock(folio);
folio_put(folio);
- goto out;
+ return ret;
}
/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
-static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- uffd_flags_t flags)
+static int mfill_atomic_pte_poison(struct mfill_state *state)
{
- int ret;
+ struct vm_area_struct *dst_vma = state->vma;
struct mm_struct *dst_mm = dst_vma->vm_mm;
+ unsigned long dst_addr = state->dst_addr;
+ pmd_t *dst_pmd = state->pmd;
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
+ int ret;
_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
ret = -EAGAIN;
@@ -462,27 +676,6 @@ out:
return ret;
}
-static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
-{
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
-
- pgd = pgd_offset(mm, address);
- p4d = p4d_alloc(mm, pgd, address);
- if (!p4d)
- return NULL;
- pud = pud_alloc(mm, p4d, address);
- if (!pud)
- return NULL;
- /*
- * Note that we didn't run this because the pmd was
- * missing, the *pmd may be already established and in
- * turn it may also be a trans_huge_pmd.
- */
- return pmd_alloc(mm, pud, address);
-}
-
#ifdef CONFIG_HUGETLB_PAGE
/*
* mfill_atomic processing for HUGETLB vmas. Note that this routine is
@@ -657,48 +850,21 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */
-static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- uffd_flags_t flags,
- struct folio **foliop)
+static __always_inline ssize_t mfill_atomic_pte(struct mfill_state *state)
{
- ssize_t err;
-
- if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
- return mfill_atomic_pte_continue(dst_pmd, dst_vma,
- dst_addr, flags);
- } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
- return mfill_atomic_pte_poison(dst_pmd, dst_vma,
- dst_addr, flags);
- }
-
- /*
- * The normal page fault path for a shmem will invoke the
- * fault, fill the hole in the file and COW it right away. The
- * result generates plain anonymous memory. So when we are
- * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll
- * generate anonymous memory directly without actually filling
- * the hole. For the MAP_PRIVATE case the robustness check
- * only happens in the pagetable (to verify it's still none)
- * and not in the radix tree.
- */
- if (!(dst_vma->vm_flags & VM_SHARED)) {
- if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
- err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
- dst_addr, src_addr,
- flags, foliop);
- else
- err = mfill_atomic_pte_zeropage(dst_pmd,
- dst_vma, dst_addr);
- } else {
- err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
- dst_addr, src_addr,
- flags, foliop);
- }
-
- return err;
+ uffd_flags_t flags = state->flags;
+
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
+ return mfill_atomic_pte_continue(state);
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON))
+ return mfill_atomic_pte_poison(state);
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
+ return mfill_atomic_pte_copy(state);
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE))
+ return mfill_atomic_pte_zeropage(state);
+
+ VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags);
+ return -EOPNOTSUPP;
}
static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
@@ -707,13 +873,17 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
unsigned long len,
uffd_flags_t flags)
{
- struct mm_struct *dst_mm = ctx->mm;
- struct vm_area_struct *dst_vma;
+ struct mfill_state state = (struct mfill_state){
+ .ctx = ctx,
+ .dst_start = dst_start,
+ .src_start = src_start,
+ .flags = flags,
+ .len = len,
+ .src_addr = src_start,
+ .dst_addr = dst_start,
+ };
+ long copied = 0;
ssize_t err;
- pmd_t *dst_pmd;
- unsigned long src_addr, dst_addr;
- long copied;
- struct folio *folio;
/*
* Sanitize the command parameters:
@@ -725,125 +895,35 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
VM_WARN_ON_ONCE(src_start + len <= src_start);
VM_WARN_ON_ONCE(dst_start + len <= dst_start);
- src_addr = src_start;
- dst_addr = dst_start;
- copied = 0;
- folio = NULL;
-retry:
- /*
- * Make sure the vma is not shared, that the dst range is
- * both valid and fully within a single existing vma.
- */
- dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
- if (IS_ERR(dst_vma)) {
- err = PTR_ERR(dst_vma);
+ err = mfill_get_vma(&state);
+ if (err)
goto out;
- }
-
- /*
- * If memory mappings are changing because of non-cooperative
- * operation (e.g. mremap) running in parallel, bail out and
- * request the user to retry later
- */
- down_read(&ctx->map_changing_lock);
- err = -EAGAIN;
- if (atomic_read(&ctx->mmap_changing))
- goto out_unlock;
-
- err = -EINVAL;
- /*
- * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
- * it will overwrite vm_ops, so vma_is_anonymous must return false.
- */
- if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
- dst_vma->vm_flags & VM_SHARED))
- goto out_unlock;
-
- /*
- * validate 'mode' now that we know the dst_vma: don't allow
- * a wrprotect copy if the userfaultfd didn't register as WP.
- */
- if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
- goto out_unlock;
/*
* If this is a HUGETLB vma, pass off to appropriate routine
*/
- if (is_vm_hugetlb_page(dst_vma))
- return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
+ if (is_vm_hugetlb_page(state.vma))
+ return mfill_atomic_hugetlb(ctx, state.vma, dst_start,
src_start, len, flags);
- if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
- goto out_unlock;
- if (!vma_is_shmem(dst_vma) &&
- uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
- goto out_unlock;
-
- while (src_addr < src_start + len) {
- pmd_t dst_pmdval;
+ while (state.src_addr < src_start + len) {
+ VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
- VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
-
- dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
- if (unlikely(!dst_pmd)) {
- err = -ENOMEM;
+ err = mfill_establish_pmd(&state);
+ if (err)
break;
- }
- dst_pmdval = pmdp_get_lockless(dst_pmd);
- if (unlikely(pmd_none(dst_pmdval)) &&
- unlikely(__pte_alloc(dst_mm, dst_pmd))) {
- err = -ENOMEM;
- break;
- }
- dst_pmdval = pmdp_get_lockless(dst_pmd);
- /*
- * If the dst_pmd is THP don't override it and just be strict.
- * (This includes the case where the PMD used to be THP and
- * changed back to none after __pte_alloc().)
- */
- if (unlikely(!pmd_present(dst_pmdval) ||
- pmd_trans_huge(dst_pmdval))) {
- err = -EEXIST;
- break;
- }
- if (unlikely(pmd_bad(dst_pmdval))) {
- err = -EFAULT;
- break;
- }
/*
* For shmem mappings, khugepaged is allowed to remove page
* tables under us; pte_offset_map_lock() will deal with that.
*/
- err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
- src_addr, flags, &folio);
+ err = mfill_atomic_pte(&state);
cond_resched();
- if (unlikely(err == -ENOENT)) {
- void *kaddr;
-
- up_read(&ctx->map_changing_lock);
- uffd_mfill_unlock(dst_vma);
- VM_WARN_ON_ONCE(!folio);
-
- kaddr = kmap_local_folio(folio, 0);
- err = copy_from_user(kaddr,
- (const void __user *) src_addr,
- PAGE_SIZE);
- kunmap_local(kaddr);
- if (unlikely(err)) {
- err = -EFAULT;
- goto out;
- }
- flush_dcache_folio(folio);
- goto retry;
- } else
- VM_WARN_ON_ONCE(folio);
-
if (!err) {
- dst_addr += PAGE_SIZE;
- src_addr += PAGE_SIZE;
+ state.dst_addr += PAGE_SIZE;
+ state.src_addr += PAGE_SIZE;
copied += PAGE_SIZE;
if (fatal_signal_pending(current))
@@ -853,12 +933,8 @@ retry:
break;
}
-out_unlock:
- up_read(&ctx->map_changing_lock);
- uffd_mfill_unlock(dst_vma);
+ mfill_put_vma(&state);
out:
- if (folio)
- folio_put(folio);
VM_WARN_ON_ONCE(copied < 0);
VM_WARN_ON_ONCE(err > 0);
VM_WARN_ON_ONCE(!copied && !err);
@@ -1938,6 +2014,38 @@ out:
return moved ? moved : err;
}
+bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags,
+ bool wp_async)
+{
+ const struct vm_uffd_ops *ops = vma_uffd_ops(vma);
+
+ if (vma->vm_flags & VM_DROPPABLE)
+ return false;
+
+ vm_flags &= __VM_UFFD_FLAGS;
+
+ /*
+ * If WP is the only mode enabled and context is wp async, allow any
+ * memory type.
+ */
+ if (wp_async && (vm_flags == VM_UFFD_WP))
+ return true;
+
+ /* For any other mode reject VMAs that don't implement vm_uffd_ops */
+ if (!ops)
+ return false;
+
+ /*
+ * If the user requested uffd-wp but has not enabled pte markers for
+ * uffd-wp, then only anonymous memory is supported
+ */
+ if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) &&
+ !vma_is_anonymous(vma))
+ return false;
+
+ return ops->can_userfault(vma, vm_flags);
+}
+
static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
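Taken together, the userfaultfd changes above replace the old retry: label and the *foliop round-trip with a struct mfill_state that carries the VMA, PMD and cursor addresses through the loop. A condensed sketch of the resulting control flow, illustrative only (error paths, hugetlb dispatch and signal checks omitted):

/*
 * Illustrative condensation of the new mfill_atomic() flow; not literal
 * code from the patch.
 */
static ssize_t mfill_atomic_sketch(struct mfill_state *state)
{
	ssize_t err = mfill_get_vma(state);	/* mmap lock + map_changing_lock */

	while (!err && state->src_addr < state->src_start + state->len) {
		err = mfill_establish_pmd(state);	/* was open-coded per page */
		if (err)
			break;
		/*
		 * mfill_atomic_pte() dispatches on the UFFDIO mode; a faulting
		 * copy_from_user() is retried by mfill_copy_folio_retry(),
		 * which drops and re-takes the locks itself, so the old
		 * -ENOENT/"retry:" dance in this caller is gone.
		 */
		err = mfill_atomic_pte(state);
		if (!err) {
			state->src_addr += PAGE_SIZE;
			state->dst_addr += PAGE_SIZE;
		}
	}
	mfill_put_vma(state);
	return err;
}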
diff --git a/mm/util.c b/mm/util.c
index f063fd4de1e8..232c3930a662 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1281,16 +1281,6 @@ int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
}
EXPORT_SYMBOL(compat_vma_mmap);
-int __vma_check_mmap_hook(struct vm_area_struct *vma)
-{
- /* vm_ops->mapped is not valid if mmap() is specified. */
- if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped))
- return -EINVAL;
-
- return 0;
-}
-EXPORT_SYMBOL(__vma_check_mmap_hook);
-
static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
const struct page *page)
{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4bf091b1c8af..bd1b1aa12581 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -269,25 +269,6 @@ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
}
#endif
-/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
- * and including the specified highidx
- * @zone: The current zone in the iterator
- * @pgdat: The pgdat which node_zones are being iterated
- * @idx: The index variable
- * @highidx: The index of the highest zone to return
- *
- * This macro iterates through all managed zones up to and including the specified highidx.
- * The zone iterator enters an invalid state after macro call and must be reinitialized
- * before it can be used again.
- */
-#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \
- for ((idx) = 0, (zone) = (pgdat)->node_zones; \
- (idx) <= (highidx); \
- (idx)++, (zone)++) \
- if (!managed_zone(zone)) \
- continue; \
- else
-
static void set_task_reclaim_state(struct task_struct *task,
struct reclaim_state *rs)
{
@@ -409,8 +390,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
* @lru: lru to use
* @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
*/
-static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
- int zone_idx)
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
{
unsigned long size = 0;
int zid;
@@ -1831,7 +1811,7 @@ bool folio_isolate_lru(struct folio *folio)
folio_get(folio);
lruvec = folio_lruvec_lock_irq(folio);
lruvec_del_folio(lruvec, folio);
- unlock_page_lruvec_irq(lruvec);
+ lruvec_unlock_irq(lruvec);
ret = true;
}
@@ -1885,24 +1865,27 @@ static bool too_many_isolated(struct pglist_data *pgdat, int file,
/*
* move_folios_to_lru() moves folios from private @list to appropriate LRU list.
*
- * Returns the number of pages moved to the given lruvec.
+ * Returns the number of pages moved to the appropriate lruvec.
+ *
+ * Note: The caller must not hold any lruvec lock.
*/
-static unsigned int move_folios_to_lru(struct lruvec *lruvec,
- struct list_head *list)
+static unsigned int move_folios_to_lru(struct list_head *list)
{
int nr_pages, nr_moved = 0;
+ struct lruvec *lruvec = NULL;
struct folio_batch free_folios;
folio_batch_init(&free_folios);
while (!list_empty(list)) {
struct folio *folio = lru_to_folio(list);
+ lruvec = folio_lruvec_relock_irq(folio, lruvec);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
list_del(&folio->lru);
if (unlikely(!folio_evictable(folio))) {
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
folio_putback_lru(folio);
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec = NULL;
continue;
}
@@ -1924,20 +1907,15 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
mem_cgroup_uncharge_folios(&free_folios);
free_unref_folios(&free_folios);
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec = NULL;
}
continue;
}
- /*
- * All pages were isolated from the same lruvec (and isolation
- * inhibits memcg migration).
- */
- VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
lruvec_add_folio(lruvec, folio);
nr_pages = folio_nr_pages(folio);
nr_moved += nr_pages;
@@ -1945,11 +1923,12 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
workingset_age_nonresident(lruvec, nr_pages);
}
+ if (lruvec)
+ lruvec_unlock_irq(lruvec);
+
if (free_folios.nr) {
- spin_unlock_irq(&lruvec->lru_lock);
mem_cgroup_uncharge_folios(&free_folios);
free_unref_folios(&free_folios);
- spin_lock_irq(&lruvec->lru_lock);
}
return nr_moved;
@@ -1998,7 +1977,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
lru_add_drain();
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec_lock_irq(lruvec);
nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list,
&nr_scanned, sc, lru);
@@ -2008,7 +1987,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
mod_lruvec_state(lruvec, item, nr_scanned);
mod_lruvec_state(lruvec, PGSCAN_ANON + file, nr_scanned);
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
if (nr_taken == 0)
return 0;
@@ -2016,16 +1995,16 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
lruvec_memcg(lruvec));
- spin_lock_irq(&lruvec->lru_lock);
- move_folios_to_lru(lruvec, &folio_list);
+ move_folios_to_lru(&folio_list);
mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
stat.nr_demoted);
- __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
+ mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
mod_lruvec_state(lruvec, item, nr_reclaimed);
mod_lruvec_state(lruvec, PGSTEAL_ANON + file, nr_reclaimed);
+ lruvec_lock_irq(lruvec);
lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout,
nr_scanned - nr_reclaimed);
@@ -2104,7 +2083,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
lru_add_drain();
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec_lock_irq(lruvec);
nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, lru);
@@ -2113,7 +2092,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
mod_lruvec_state(lruvec, PGREFILL, nr_scanned);
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
while (!list_empty(&l_hold)) {
struct folio *folio;
@@ -2162,16 +2141,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
/*
* Move folios back to the lru list.
*/
- spin_lock_irq(&lruvec->lru_lock);
-
- nr_activate = move_folios_to_lru(lruvec, &l_active);
- nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
+ nr_activate = move_folios_to_lru(&l_active);
+ nr_deactivate = move_folios_to_lru(&l_inactive);
- __count_vm_events(PGDEACTIVATE, nr_deactivate);
+ count_vm_events(PGDEACTIVATE, nr_deactivate);
count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
+ mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-
+ lruvec_lock_irq(lruvec);
lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
nr_deactivate, nr_rotated, sc->priority, file);
@@ -2886,8 +2863,9 @@ static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk)
return NULL;
clear_bit(key, &mm->lru_gen.bitmap);
+ mmgrab(mm);
- return mmget_not_zero(mm) ? mm : NULL;
+ return mm;
}
void lru_gen_add_mm(struct mm_struct *mm)
@@ -3087,7 +3065,7 @@ done:
reset_bloom_filter(mm_state, walk->seq + 1);
if (*iter)
- mmput_async(*iter);
+ mmdrop(*iter);
*iter = mm;
@@ -3442,8 +3420,10 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
if (folio_nid(folio) != pgdat->node_id)
return NULL;
+ rcu_read_lock();
if (folio_memcg(folio) != memcg)
- return NULL;
+ folio = NULL;
+ rcu_read_unlock();
return folio;
}
@@ -3803,9 +3783,9 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
}
if (walk->batched) {
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec_lock_irq(lruvec);
reset_batch_size(walk);
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
}
cond_resched();
@@ -3965,7 +3945,7 @@ restart:
if (seq < READ_ONCE(lrugen->max_seq))
return false;
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec_lock_irq(lruvec);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
@@ -3980,7 +3960,7 @@ restart:
if (inc_min_seq(lruvec, type, swappiness))
continue;
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
cond_resched();
goto restart;
}
@@ -4015,7 +3995,7 @@ restart:
/* make sure preceding modifications appear */
smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
unlock:
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
return success;
}
@@ -4213,12 +4193,12 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr)
unsigned long addr = pvmw->address;
struct vm_area_struct *vma = pvmw->vma;
struct folio *folio = pfn_folio(pvmw->pfn);
- struct mem_cgroup *memcg = folio_memcg(folio);
+ struct mem_cgroup *memcg;
struct pglist_data *pgdat = folio_pgdat(folio);
- struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
- DEFINE_MAX_SEQ(lruvec);
- int gen = lru_gen_from_seq(max_seq);
+ struct lruvec *lruvec;
+ struct lru_gen_mm_state *mm_state;
+ unsigned long max_seq;
+ int gen;
lockdep_assert_held(pvmw->ptl);
VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
@@ -4253,6 +4233,12 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr)
}
}
+ memcg = get_mem_cgroup_from_folio(folio);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ max_seq = READ_ONCE((lruvec)->lrugen.max_seq);
+ gen = lru_gen_from_seq(max_seq);
+ mm_state = get_mm_state(lruvec);
+
lazy_mmu_mode_enable();
pte -= (addr - start) / PAGE_SIZE;
@@ -4302,6 +4288,8 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw, unsigned int nr)
if (mm_state && suitable_to_scan(i, young))
update_bloom_filter(mm_state, max_seq, pvmw->pmd);
+ mem_cgroup_put(memcg);
+
return true;
}
@@ -4437,6 +4425,148 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
}
+bool recheck_lru_gen_max_memcg(struct mem_cgroup *memcg, int nid)
+{
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+ int type;
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
+ return false;
+ }
+
+ return true;
+}
+
+static void try_to_inc_max_seq_nowalk(struct mem_cgroup *memcg,
+ struct lruvec *lruvec)
+{
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
+ int swappiness = mem_cgroup_swappiness(memcg);
+ DEFINE_MAX_SEQ(lruvec);
+ bool success = false;
+
+ /*
+ * We are not iterating the mm_list here; updating mm_state->seq is just
+ * to make the mm walkers work properly.
+ */
+ if (mm_state) {
+ spin_lock(&mm_list->lock);
+ VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
+ if (max_seq > mm_state->seq) {
+ WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+ success = true;
+ }
+ spin_unlock(&mm_list->lock);
+ } else {
+ success = true;
+ }
+
+ if (success)
+ inc_max_seq(lruvec, max_seq, swappiness);
+}
+
+/*
+ * We need to ensure that the folios of the child memcg can be reparented to the
+ * same gen of the parent memcg, so the gens of the parent memcg need to be
+ * incremented to MAX_NR_GENS before reparenting.
+ */
+void max_lru_gen_memcg(struct mem_cgroup *memcg, int nid)
+{
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+ int type;
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ while (get_nr_gens(lruvec, type) < MAX_NR_GENS) {
+ try_to_inc_max_seq_nowalk(memcg, lruvec);
+ cond_resched();
+ }
+ }
+}
+
+/*
+ * Compared to traditional LRU, MGLRU faces the following challenges:
+ *
+ * 1. Each lruvec has between MIN_NR_GENS and MAX_NR_GENS generations, the
+ * number of generations of the parent and child memcg may be different,
+ * so we cannot simply transfer MGLRU folios in the child memcg to the
+ * parent memcg as we did for traditional LRU folios.
+ * 2. The generation information is stored in folio->flags, but we cannot
+ * traverse these folios while holding the lru lock, otherwise it may
+ * cause a softlockup.
+ * 3. In walk_update_folio(), the gen of folio and corresponding lru size
+ * may be updated, but the folio is not immediately moved to the
+ * corresponding lru list. Therefore, there may be folios of different
+ * generations on an LRU list.
+ * 4. In lru_gen_del_folio(), the generation to which the folio belongs is
+ * found based on the generation information in folio->flags, and the
+ * corresponding LRU size will be updated. Therefore, we need to update
+ * the lru size correctly during reparenting, otherwise the lru size may
+ * be updated incorrectly in lru_gen_del_folio().
+ *
+ * Finally, we choose a compromise method, which is to splice the lru list in
+ * the child memcg to the lru list of the same generation in the parent memcg
+ * during reparenting.
+ *
+ * The same generation has different meanings in the parent and child memcg,
+ * so this compromise will cause an LRU inversion problem. But as the
+ * system runs, the problem corrects itself automatically.
+ */
+static void __lru_gen_reparent_memcg(struct lruvec *child_lruvec, struct lruvec *parent_lruvec,
+ int zone, int type)
+{
+ struct lru_gen_folio *child_lrugen, *parent_lrugen;
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
+ int i;
+
+ child_lrugen = &child_lruvec->lrugen;
+ parent_lrugen = &parent_lruvec->lrugen;
+
+ for (i = 0; i < get_nr_gens(child_lruvec, type); i++) {
+ int gen = lru_gen_from_seq(child_lrugen->max_seq - i);
+ long nr_pages = child_lrugen->nr_pages[gen][type][zone];
+ int child_lru_active = lru_gen_is_active(child_lruvec, gen) ? LRU_ACTIVE : 0;
+ int parent_lru_active = lru_gen_is_active(parent_lruvec, gen) ? LRU_ACTIVE : 0;
+
+ /* Assuming that child pages are colder than parent pages */
+ list_splice_tail_init(&child_lrugen->folios[gen][type][zone],
+ &parent_lrugen->folios[gen][type][zone]);
+
+ WRITE_ONCE(child_lrugen->nr_pages[gen][type][zone], 0);
+ WRITE_ONCE(parent_lrugen->nr_pages[gen][type][zone],
+ parent_lrugen->nr_pages[gen][type][zone] + nr_pages);
+
+ if (lru_gen_is_active(child_lruvec, gen) != lru_gen_is_active(parent_lruvec, gen)) {
+ __update_lru_size(child_lruvec, lru + child_lru_active, zone, -nr_pages);
+ __update_lru_size(parent_lruvec, lru + parent_lru_active, zone, nr_pages);
+ }
+ }
+}
+
+void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent, int nid)
+{
+ struct lruvec *child_lruvec, *parent_lruvec;
+ int type, zid;
+ struct zone *zone;
+ enum lru_list lru;
+
+ child_lruvec = get_lruvec(memcg, nid);
+ parent_lruvec = get_lruvec(parent, nid);
+
+ for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1)
+ for (type = 0; type < ANON_AND_FILE; type++)
+ __lru_gen_reparent_memcg(child_lruvec, parent_lruvec, zid, type);
+
+ for_each_lru(lru) {
+ for_each_managed_zone_pgdat(zone, NODE_DATA(nid), zid, MAX_NR_ZONES - 1) {
+ unsigned long size = mem_cgroup_get_zone_lru_size(child_lruvec, lru, zid);
+
+ mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size);
+ }
+ }
+}
+
#endif /* CONFIG_MEMCG */
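As with the classic-LRU case, the MGLRU reparenting helpers above are invoked from the memcg offline path in mm/memcontrol.c, which is not shown in this diff. The ordering implied by the comments is roughly the following; this is an assumed sketch, not code from the patch:

/* Assumed caller shape: grow the parent's gens first, then splice per node. */
static void lru_gen_reparent_sketch(struct mem_cgroup *memcg,
				    struct mem_cgroup *parent, int nid)
{
	max_lru_gen_memcg(parent, nid);		/* ensure MAX_NR_GENS targets exist */
	lru_gen_reparent_memcg(memcg, parent, nid);
}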
/******************************************************************************
@@ -4630,7 +4760,7 @@ static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
static int get_tier_idx(struct lruvec *lruvec, int type)
{
int tier;
- struct ctrl_pos sp, pv;
+ struct ctrl_pos sp, pv = {};
/*
* To leave a margin for fluctuations, use a larger gain factor (2:3).
@@ -4649,7 +4779,7 @@ static int get_tier_idx(struct lruvec *lruvec, int type)
static int get_type_to_scan(struct lruvec *lruvec, int swappiness)
{
- struct ctrl_pos sp, pv;
+ struct ctrl_pos sp, pv = {};
if (swappiness <= MIN_SWAPPINESS + 1)
return LRU_GEN_FILE;
@@ -4707,7 +4837,7 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec_lock_irq(lruvec);
scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list);
@@ -4716,7 +4846,7 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq)
scanned = 0;
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
if (list_empty(&list))
return scanned;
@@ -4749,14 +4879,14 @@ retry:
set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active));
}
- spin_lock_irq(&lruvec->lru_lock);
-
- move_folios_to_lru(lruvec, &list);
+ move_folios_to_lru(&list);
walk = current->reclaim_state->mm_walk;
if (walk && walk->batched) {
walk->lruvec = lruvec;
+ lruvec_lock_irq(lruvec);
reset_batch_size(walk);
+ lruvec_unlock_irq(lruvec);
}
mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
@@ -4766,8 +4896,6 @@ retry:
mod_lruvec_state(lruvec, item, reclaimed);
mod_lruvec_state(lruvec, PGSTEAL_ANON + type, reclaimed);
- spin_unlock_irq(&lruvec->lru_lock);
-
list_splice_init(&clean, &list);
if (!list_empty(&list)) {
@@ -4843,10 +4971,6 @@ static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
int i;
enum zone_watermarks mark;
- /* don't abort memcg reclaim to ensure fairness */
- if (!root_reclaim(sc))
- return false;
-
if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order)))
return true;
@@ -4900,9 +5024,24 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* If too many file cache in the coldest generation can't be evicted
* due to being dirty, wake up the flusher.
*/
- if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken)
+ if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken) {
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
wakeup_flusher_threads(WB_REASON_VMSCAN);
+ /*
+ * For cgroupv1, dirty throttling is achieved by waking up
+ * the kernel flusher here and later waiting for folios
+ * that are under writeback to finish (see shrink_folio_list()).
+ *
+ * The flusher may not be able to issue writeback quickly
+ * enough for cgroupv1 writeback throttling to work
+ * on a large system.
+ */
+ if (!writeback_throttling_sane(sc))
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
+ }
+
/* whether this lruvec should be rotated */
return nr_to_scan < 0;
}
@@ -5196,7 +5335,7 @@ static void lru_gen_change_state(bool enabled)
for_each_node(nid) {
struct lruvec *lruvec = get_lruvec(memcg, nid);
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec_lock_irq(lruvec);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
VM_WARN_ON_ONCE(!state_is_valid(lruvec));
@@ -5204,12 +5343,12 @@ static void lru_gen_change_state(bool enabled)
lruvec->lrugen.enabled = enabled;
while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
cond_resched();
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec_lock_irq(lruvec);
}
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
}
cond_resched();
@@ -7898,7 +8037,7 @@ void check_move_unevictable_folios(struct folio_batch *fbatch)
if (lruvec) {
__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
- unlock_page_lruvec_irq(lruvec);
+ lruvec_unlock_irq(lruvec);
} else if (pgscanned) {
count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
}
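The reworked move_folios_to_lru() above no longer assumes that every folio on the private list belongs to one lruvec; instead it re-locks per folio and is entered without any lruvec lock held. The pattern reduced to its essentials (a sketch, not code lifted from the patch):

/* Sketch of the per-folio relock pattern used by move_folios_to_lru(). */
static void putback_list_sketch(struct list_head *list)
{
	struct lruvec *lruvec = NULL;

	while (!list_empty(list)) {
		struct folio *folio = lru_to_folio(list);

		/* drops the old lock and takes the right one if it changed */
		lruvec = folio_lruvec_relock_irq(folio, lruvec);
		list_del(&folio->lru);
		lruvec_add_folio(lruvec, folio);
	}
	if (lruvec)
		lruvec_unlock_irq(lruvec);
}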
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c360c1b29ac9..f534972f517d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -2141,7 +2141,7 @@ static void vmstat_shepherd(struct work_struct *w)
if (cpu_is_isolated(cpu))
continue;
- if (!delayed_work_pending(dw) && need_update(cpu))
+ if (!work_busy(&dw->work) && need_update(cpu))
queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
}
diff --git a/mm/workingset.c b/mm/workingset.c
index 37a94979900f..07e6836d0502 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -244,12 +244,15 @@ static void *lru_gen_eviction(struct folio *folio)
int refs = folio_lru_refs(folio);
bool workingset = folio_test_workingset(folio);
int tier = lru_tier_from_refs(refs, workingset);
- struct mem_cgroup *memcg = folio_memcg(folio);
+ struct mem_cgroup *memcg;
struct pglist_data *pgdat = folio_pgdat(folio);
+ unsigned short memcg_id;
BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH >
BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON));
+ rcu_read_lock();
+ memcg = folio_memcg(folio);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
lrugen = &lruvec->lrugen;
min_seq = READ_ONCE(lrugen->min_seq[type]);
@@ -257,8 +260,10 @@ static void *lru_gen_eviction(struct folio *folio)
hist = lru_hist_from_seq(min_seq);
atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
+ memcg_id = mem_cgroup_private_id(memcg);
+ rcu_read_unlock();
- return pack_shadow(mem_cgroup_private_id(memcg), pgdat, token, workingset, type);
+ return pack_shadow(memcg_id, pgdat, token, workingset, type);
}
/*
@@ -541,7 +546,6 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset,
void workingset_refault(struct folio *folio, void *shadow)
{
bool file = folio_is_file_lru(folio);
- struct pglist_data *pgdat;
struct mem_cgroup *memcg;
struct lruvec *lruvec;
bool workingset;
@@ -564,14 +568,12 @@ void workingset_refault(struct folio *folio, void *shadow)
* locked to guarantee folio_memcg() stability throughout.
*/
nr = folio_nr_pages(folio);
- memcg = folio_memcg(folio);
- pgdat = folio_pgdat(folio);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
-
+ memcg = get_mem_cgroup_from_folio(folio);
+ lruvec = mem_cgroup_lruvec(memcg, folio_pgdat(folio));
mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
if (!workingset_test_recent(shadow, file, &workingset, true))
- return;
+ goto out;
folio_set_active(folio);
workingset_age_nonresident(lruvec, nr);
@@ -587,6 +589,8 @@ void workingset_refault(struct folio *folio, void *shadow)
lru_note_cost_refault(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
}
+out:
+ mem_cgroup_put(memcg);
}
/**
@@ -599,8 +603,11 @@ void workingset_activation(struct folio *folio)
* Filter non-memcg pages here, e.g. unmap can call
* mark_page_accessed() on VDSO pages.
*/
- if (mem_cgroup_disabled() || folio_memcg_charged(folio))
+ if (mem_cgroup_disabled() || folio_memcg_charged(folio)) {
+ rcu_read_lock();
workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
+ rcu_read_unlock();
+ }
}
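Several of the hunks above replace a bare folio_memcg() with either an rcu_read_lock() window or a get_mem_cgroup_from_folio()/mem_cgroup_put() pair, because with reparenting a folio's memcg binding can now change under the reader. The two patterns side by side, as an illustrative sketch only:

/* Short, non-sleeping use: pin the binding with RCU alone. */
static void touch_lruvec_sketch(struct folio *folio)
{
	rcu_read_lock();
	workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
	rcu_read_unlock();
}

/* Longer use that may sleep or span other operations: take a reference. */
static void refault_sketch(struct folio *folio)
{
	bool file = folio_is_file_lru(folio);
	struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, folio_pgdat(folio));

	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, folio_nr_pages(folio));
	mem_cgroup_put(memcg);
}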
/*
@@ -684,9 +691,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
mem_cgroup_flush_stats_ratelimited(sc->memcg);
lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
+
for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
- pages += lruvec_page_state_local(lruvec,
- NR_LRU_BASE + i);
+ pages += lruvec_lru_size(lruvec, i, MAX_NR_ZONES - 1);
+
pages += lruvec_page_state_local(
lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
pages += lruvec_page_state_local(
diff --git a/mm/zswap.c b/mm/zswap.c
index 0823cadd02b6..4b5149173b0e 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -242,6 +242,34 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
**********************************/
static void __zswap_pool_empty(struct percpu_ref *ref);
+static void acomp_ctx_free(struct crypto_acomp_ctx *acomp_ctx)
+{
+ if (!acomp_ctx)
+ return;
+
+ /*
+ * If there was an error in allocating @acomp_ctx->req, it
+ * would be set to NULL.
+ */
+ if (acomp_ctx->req)
+ acomp_request_free(acomp_ctx->req);
+
+ acomp_ctx->req = NULL;
+
+ /*
+ * We have to handle several cases here: an error pointer returned by
+ * crypto_alloc_acomp_node(); a NULL initialized by zswap; or
+ * a NULL assigned by a previous call to acomp_ctx_free().
+ */
+ if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
+ crypto_free_acomp(acomp_ctx->acomp);
+
+ acomp_ctx->acomp = NULL;
+
+ kfree(acomp_ctx->buffer);
+ acomp_ctx->buffer = NULL;
+}
+
static struct zswap_pool *zswap_pool_create(char *compressor)
{
struct zswap_pool *pool;
@@ -263,19 +291,27 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
- pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
+ /* Many things rely on the zero-initialization. */
+ pool->acomp_ctx = alloc_percpu_gfp(*pool->acomp_ctx,
+ GFP_KERNEL | __GFP_ZERO);
if (!pool->acomp_ctx) {
pr_err("percpu alloc failed\n");
goto error;
}
- for_each_possible_cpu(cpu)
- mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex);
-
+ /*
+ * This is serialized against CPU hotplug operations. Hence, cores
+ * cannot be offlined until this finishes.
+ */
ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
&pool->node);
+
+ /*
+ * cpuhp_state_add_instance() will not clean up on failure since
+ * we don't register a hotunplug callback.
+ */
if (ret)
- goto error;
+ goto cpuhp_add_fail;
/* being the current pool takes 1 ref; this func expects the
* caller to always add the new pool as the current pool
@@ -292,6 +328,10 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
ref_fail:
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
+
+cpuhp_add_fail:
+ for_each_possible_cpu(cpu)
+ acomp_ctx_free(per_cpu_ptr(pool->acomp_ctx, cpu));
error:
if (pool->acomp_ctx)
free_percpu(pool->acomp_ctx);
@@ -322,9 +362,15 @@ static struct zswap_pool *__zswap_pool_create_fallback(void)
static void zswap_pool_destroy(struct zswap_pool *pool)
{
+ int cpu;
+
zswap_pool_debug("destroying", pool);
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
+
+ for_each_possible_cpu(cpu)
+ acomp_ctx_free(per_cpu_ptr(pool->acomp_ctx, cpu));
+
free_percpu(pool->acomp_ctx);
zs_destroy_pool(pool->zs_pool);
@@ -664,8 +710,10 @@ void zswap_folio_swapin(struct folio *folio)
struct lruvec *lruvec;
if (folio) {
+ rcu_read_lock();
lruvec = folio_lruvec(folio);
atomic_long_inc(&lruvec->zswap_lruvec_state.nr_disk_swapins);
+ rcu_read_unlock();
}
}
@@ -736,39 +784,41 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
- struct crypto_acomp *acomp = NULL;
- struct acomp_req *req = NULL;
- u8 *buffer = NULL;
- int ret;
+ int ret = -ENOMEM;
- buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
- if (!buffer) {
- ret = -ENOMEM;
- goto fail;
+ /*
+ * To handle cases where the CPU goes through online-offline-online
+ * transitions, we return if the acomp_ctx has already been initialized.
+ */
+ if (acomp_ctx->acomp) {
+ WARN_ON_ONCE(IS_ERR(acomp_ctx->acomp));
+ return 0;
}
- acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
- if (IS_ERR(acomp)) {
+ acomp_ctx->buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
+ if (!acomp_ctx->buffer)
+ return ret;
+
+ /*
+ * In case of an error, crypto_alloc_acomp_node() returns an
+ * error pointer, never NULL.
+ */
+ acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
+ if (IS_ERR(acomp_ctx->acomp)) {
pr_err("could not alloc crypto acomp %s : %pe\n",
- pool->tfm_name, acomp);
- ret = PTR_ERR(acomp);
+ pool->tfm_name, acomp_ctx->acomp);
+ ret = PTR_ERR(acomp_ctx->acomp);
goto fail;
}
- req = acomp_request_alloc(acomp);
- if (!req) {
+ /* acomp_request_alloc() returns NULL in case of an error. */
+ acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
+ if (!acomp_ctx->req) {
pr_err("could not alloc crypto acomp_request %s\n",
pool->tfm_name);
- ret = -ENOMEM;
goto fail;
}
- /*
- * Only hold the mutex after completing allocations, otherwise we may
- * recurse into zswap through reclaim and attempt to hold the mutex
- * again resulting in a deadlock.
- */
- mutex_lock(&acomp_ctx->mutex);
crypto_init_wait(&acomp_ctx->wait);
/*
@@ -776,80 +826,17 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
* crypto_wait_req(); if the backend of acomp is scomp, the callback
* won't be called, crypto_wait_req() will return without blocking.
*/
- acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &acomp_ctx->wait);
- acomp_ctx->buffer = buffer;
- acomp_ctx->acomp = acomp;
- acomp_ctx->req = req;
- mutex_unlock(&acomp_ctx->mutex);
+ mutex_init(&acomp_ctx->mutex);
return 0;
fail:
- if (!IS_ERR_OR_NULL(acomp))
- crypto_free_acomp(acomp);
- kfree(buffer);
+ acomp_ctx_free(acomp_ctx);
return ret;
}
-static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
-{
- struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
- struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
- struct acomp_req *req;
- struct crypto_acomp *acomp;
- u8 *buffer;
-
- if (IS_ERR_OR_NULL(acomp_ctx))
- return 0;
-
- mutex_lock(&acomp_ctx->mutex);
- req = acomp_ctx->req;
- acomp = acomp_ctx->acomp;
- buffer = acomp_ctx->buffer;
- acomp_ctx->req = NULL;
- acomp_ctx->acomp = NULL;
- acomp_ctx->buffer = NULL;
- mutex_unlock(&acomp_ctx->mutex);
-
- /*
- * Do the actual freeing after releasing the mutex to avoid subtle
- * locking dependencies causing deadlocks.
- */
- if (!IS_ERR_OR_NULL(req))
- acomp_request_free(req);
- if (!IS_ERR_OR_NULL(acomp))
- crypto_free_acomp(acomp);
- kfree(buffer);
-
- return 0;
-}
-
-static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool)
-{
- struct crypto_acomp_ctx *acomp_ctx;
-
- for (;;) {
- acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
- mutex_lock(&acomp_ctx->mutex);
- if (likely(acomp_ctx->req))
- return acomp_ctx;
- /*
- * It is possible that we were migrated to a different CPU after
- * getting the per-CPU ctx but before the mutex was acquired. If
- * the old CPU got offlined, zswap_cpu_comp_dead() could have
- * already freed ctx->req (among other things) and set it to
- * NULL. Just try again on the new CPU that we ended up on.
- */
- mutex_unlock(&acomp_ctx->mutex);
- }
-}
-
-static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx)
-{
- mutex_unlock(&acomp_ctx->mutex);
-}
-
static bool zswap_compress(struct page *page, struct zswap_entry *entry,
struct zswap_pool *pool)
{
@@ -862,7 +849,9 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
u8 *dst;
bool mapped = false;
- acomp_ctx = acomp_ctx_get_cpu_lock(pool);
+ acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+ mutex_lock(&acomp_ctx->mutex);
+
dst = acomp_ctx->buffer;
sg_init_table(&input, 1);
sg_set_page(&input, page, PAGE_SIZE, 0);
@@ -893,11 +882,14 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
* to the active LRU list in the case.
*/
if (comp_ret || !dlen || dlen >= PAGE_SIZE) {
+ rcu_read_lock();
if (!mem_cgroup_zswap_writeback_enabled(
folio_memcg(page_folio(page)))) {
+ rcu_read_unlock();
comp_ret = comp_ret ? comp_ret : -EINVAL;
goto unlock;
}
+ rcu_read_unlock();
comp_ret = 0;
dlen = PAGE_SIZE;
dst = kmap_local_page(page);
@@ -925,7 +917,7 @@ unlock:
else if (alloc_ret)
zswap_reject_alloc_fail++;
- acomp_ctx_put_unlock(acomp_ctx);
+ mutex_unlock(&acomp_ctx->mutex);
return comp_ret == 0 && alloc_ret == 0;
}
@@ -937,7 +929,8 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
struct crypto_acomp_ctx *acomp_ctx;
int ret = 0, dlen;
- acomp_ctx = acomp_ctx_get_cpu_lock(pool);
+ acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+ mutex_lock(&acomp_ctx->mutex);
zs_obj_read_sg_begin(pool->zs_pool, entry->handle, input, entry->length);
/* zswap entries of length PAGE_SIZE are not compressed. */
@@ -962,7 +955,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
}
zs_obj_read_sg_end(pool->zs_pool, entry->handle);
- acomp_ctx_put_unlock(acomp_ctx);
+ mutex_unlock(&acomp_ctx->mutex);
if (!ret && dlen == PAGE_SIZE)
return true;
@@ -1782,7 +1775,7 @@ static int zswap_setup(void)
ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
"mm/zswap_pool:prepare",
zswap_cpu_comp_prepare,
- zswap_cpu_comp_dead);
+ NULL);
if (ret)
goto hp_fail;
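
Note on the zswap hunks above (illustration, not part of the patch): all per-CPU
acomp_ctx setup now happens in the cpuhp "prepare" callback and the "dead"
callback is gone, so resources allocated when a CPU first comes online are kept
across later offline/online transitions and are only released through
acomp_ctx_free(), either from zswap_pool_destroy() or on a pool-creation error
path. The following is a minimal, userspace-only sketch of that
idempotent-prepare / free-at-destroy pattern; the struct and function names are
invented for the example and it is not kernel code.

/*
 * Standalone userspace sketch of the lifecycle the zswap patch adopts:
 * an idempotent "prepare" that keeps any successfully allocated
 * resources, and a single "free" that tolerates partial initialization.
 */
#include <stdio.h>
#include <stdlib.h>

struct ctx {
	char *buffer;		/* survives offline/online cycles */
	int   initialized;
};

static void ctx_free(struct ctx *c)
{
	if (!c)
		return;
	free(c->buffer);	/* free(NULL) is a no-op, like kfree(NULL) */
	c->buffer = NULL;
	c->initialized = 0;
}

static int ctx_prepare(struct ctx *c)
{
	/* Already set up by a previous online transition: nothing to do. */
	if (c->initialized)
		return 0;

	c->buffer = malloc(4096);
	if (!c->buffer)
		return -1;	/* caller frees everything at destroy time */

	c->initialized = 1;
	return 0;
}

int main(void)
{
	struct ctx c = { 0 };

	/* online -> offline -> online: prepare runs twice, allocates once */
	if (ctx_prepare(&c) || ctx_prepare(&c))
		return 1;
	puts("prepared once, reused on second online");

	ctx_free(&c);		/* only freed when the "pool" is destroyed */
	return 0;
}
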
diff --git a/tools/testing/selftests/liveupdate/liveupdate.c b/tools/testing/selftests/liveupdate/liveupdate.c
index c2878e3d5ef9..37c808fbe1e9 100644
--- a/tools/testing/selftests/liveupdate/liveupdate.c
+++ b/tools/testing/selftests/liveupdate/liveupdate.c
@@ -345,4 +345,45 @@ TEST_F(liveupdate_device, preserve_unsupported_fd)
ASSERT_EQ(close(session_fd), 0);
}
+/*
+ * Test Case: Prevent Double Preservation
+ *
+ * Verifies that a file (memfd) can only be preserved once across all active
+ * sessions. Attempting to preserve it a second time, whether in the same or
+ * a different session, should fail with EBUSY.
+ */
+TEST_F(liveupdate_device, prevent_double_preservation)
+{
+ int session_fd1, session_fd2, mem_fd;
+ int ret;
+
+ self->fd1 = open(LIVEUPDATE_DEV, O_RDWR);
+ if (self->fd1 < 0 && errno == ENOENT)
+ SKIP(return, "%s does not exist", LIVEUPDATE_DEV);
+ ASSERT_GE(self->fd1, 0);
+
+ session_fd1 = create_session(self->fd1, "double-preserve-session-1");
+ ASSERT_GE(session_fd1, 0);
+ session_fd2 = create_session(self->fd1, "double-preserve-session-2");
+ ASSERT_GE(session_fd2, 0);
+
+ mem_fd = memfd_create("test-memfd", 0);
+ ASSERT_GE(mem_fd, 0);
+
+ /* First preservation should succeed */
+ ASSERT_EQ(preserve_fd(session_fd1, mem_fd, 0x1111), 0);
+
+ /* Second preservation in a different session should fail with EBUSY */
+ ret = preserve_fd(session_fd2, mem_fd, 0x2222);
+ EXPECT_EQ(ret, -EBUSY);
+
+ /* Second preservation in the same session (different token) should fail with EBUSY */
+ ret = preserve_fd(session_fd1, mem_fd, 0x3333);
+ EXPECT_EQ(ret, -EBUSY);
+
+ ASSERT_EQ(close(mem_fd), 0);
+ ASSERT_EQ(close(session_fd1), 0);
+ ASSERT_EQ(close(session_fd2), 0);
+}
+
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
index 447769657634..44f4e703deb9 100755
--- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
+++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
@@ -11,6 +11,11 @@ if [[ $(id -u) -ne 0 ]]; then
exit $ksft_skip
fi
+if ! command -v killall >/dev/null 2>&1; then
+ echo "killall not available. Skipping..."
+ exit $ksft_skip
+fi
+
nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
fault_limit_file=limit_in_bytes
diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c
index dbd21d66d383..48e8b1539be3 100644
--- a/tools/testing/selftests/mm/guard-regions.c
+++ b/tools/testing/selftests/mm/guard-regions.c
@@ -21,6 +21,7 @@
#include <sys/uio.h>
#include <unistd.h>
#include "vm_util.h"
+#include "thp_settings.h"
#include "../pidfd/pidfd.h"
@@ -2195,6 +2196,9 @@ TEST_F(guard_regions, collapse)
char *ptr;
int i;
+ if (!thp_available())
+ SKIP(return, "Transparent Hugepages not available\n");
+
/* Need file to be correct size for tests for non-anon. */
if (variant->backing != ANON_BACKED)
ASSERT_EQ(ftruncate(self->fd, size), 0);
diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c
index e8328c89d855..788689497e92 100644
--- a/tools/testing/selftests/mm/hmm-tests.c
+++ b/tools/testing/selftests/mm/hmm-tests.c
@@ -34,6 +34,7 @@
*/
#include <lib/test_hmm_uapi.h>
#include <mm/gup_test.h>
+#include <mm/vm_util.h>
struct hmm_buffer {
void *ptr;
@@ -548,7 +549,7 @@ TEST_F(hmm, anon_write_child)
for (migrate = 0; migrate < 2; ++migrate) {
for (use_thp = 0; use_thp < 2; ++use_thp) {
- npages = ALIGN(use_thp ? TWOMEG : HMM_BUFFER_SIZE,
+ npages = ALIGN(use_thp ? read_pmd_pagesize() : HMM_BUFFER_SIZE,
self->page_size) >> self->page_shift;
ASSERT_NE(npages, 0);
size = npages << self->page_shift;
@@ -728,7 +729,7 @@ TEST_F(hmm, anon_write_huge)
int *ptr;
int ret;
- size = 2 * TWOMEG;
+ size = 2 * read_pmd_pagesize();
buffer = malloc(sizeof(*buffer));
ASSERT_NE(buffer, NULL);
@@ -744,7 +745,7 @@ TEST_F(hmm, anon_write_huge)
buffer->fd, 0);
ASSERT_NE(buffer->ptr, MAP_FAILED);
- size = TWOMEG;
+ size /= 2;
npages = size >> self->page_shift;
map = (void *)ALIGN((uintptr_t)buffer->ptr, size);
ret = madvise(map, size, MADV_HUGEPAGE);
@@ -771,54 +772,6 @@ TEST_F(hmm, anon_write_huge)
}
/*
- * Read numeric data from raw and tagged kernel status files. Used to read
- * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag).
- */
-static long file_read_ulong(char *file, const char *tag)
-{
- int fd;
- char buf[2048];
- int len;
- char *p, *q;
- long val;
-
- fd = open(file, O_RDONLY);
- if (fd < 0) {
- /* Error opening the file */
- return -1;
- }
-
- len = read(fd, buf, sizeof(buf));
- close(fd);
- if (len < 0) {
- /* Error in reading the file */
- return -1;
- }
- if (len == sizeof(buf)) {
- /* Error file is too large */
- return -1;
- }
- buf[len] = '\0';
-
- /* Search for a tag if provided */
- if (tag) {
- p = strstr(buf, tag);
- if (!p)
- return -1; /* looks like the line we want isn't there */
- p += strlen(tag);
- } else
- p = buf;
-
- val = strtol(p, &q, 0);
- if (*q != ' ') {
- /* Error parsing the file */
- return -1;
- }
-
- return val;
-}
-
-/*
* Write huge TLBFS page.
*/
TEST_F(hmm, anon_write_hugetlbfs)
@@ -826,15 +779,13 @@ TEST_F(hmm, anon_write_hugetlbfs)
struct hmm_buffer *buffer;
unsigned long npages;
unsigned long size;
- unsigned long default_hsize;
+ unsigned long default_hsize = default_huge_page_size();
unsigned long i;
int *ptr;
int ret;
- default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:");
- if (default_hsize < 0 || default_hsize*1024 < default_hsize)
+ if (!default_hsize)
SKIP(return, "Huge page size could not be determined");
- default_hsize = default_hsize*1024; /* KB to B */
size = ALIGN(TWOMEG, default_hsize);
npages = size >> self->page_shift;
@@ -1606,7 +1557,7 @@ TEST_F(hmm, compound)
struct hmm_buffer *buffer;
unsigned long npages;
unsigned long size;
- unsigned long default_hsize;
+ unsigned long default_hsize = default_huge_page_size();
int *ptr;
unsigned char *m;
int ret;
@@ -1614,10 +1565,8 @@ TEST_F(hmm, compound)
/* Skip test if we can't allocate a hugetlbfs page. */
- default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:");
- if (default_hsize < 0 || default_hsize*1024 < default_hsize)
+ if (!default_hsize)
SKIP(return, "Huge page size could not be determined");
- default_hsize = default_hsize*1024; /* KB to B */
size = ALIGN(TWOMEG, default_hsize);
npages = size >> self->page_shift;
@@ -2106,7 +2055,7 @@ TEST_F(hmm, migrate_anon_huge_empty)
int *ptr;
int ret;
- size = TWOMEG;
+ size = read_pmd_pagesize();
buffer = malloc(sizeof(*buffer));
ASSERT_NE(buffer, NULL);
@@ -2158,7 +2107,7 @@ TEST_F(hmm, migrate_anon_huge_zero)
int ret;
int val;
- size = TWOMEG;
+ size = read_pmd_pagesize();
buffer = malloc(sizeof(*buffer));
ASSERT_NE(buffer, NULL);
@@ -2221,7 +2170,7 @@ TEST_F(hmm, migrate_anon_huge_free)
int *ptr;
int ret;
- size = TWOMEG;
+ size = read_pmd_pagesize();
buffer = malloc(sizeof(*buffer));
ASSERT_NE(buffer, NULL);
@@ -2280,7 +2229,7 @@ TEST_F(hmm, migrate_anon_huge_fault)
int *ptr;
int ret;
- size = TWOMEG;
+ size = read_pmd_pagesize();
buffer = malloc(sizeof(*buffer));
ASSERT_NE(buffer, NULL);
@@ -2332,7 +2281,7 @@ TEST_F(hmm, migrate_partial_unmap_fault)
{
struct hmm_buffer *buffer;
unsigned long npages;
- unsigned long size = TWOMEG;
+ unsigned long size = read_pmd_pagesize();
unsigned long i;
void *old_ptr;
void *map;
@@ -2398,7 +2347,7 @@ TEST_F(hmm, migrate_remap_fault)
{
struct hmm_buffer *buffer;
unsigned long npages;
- unsigned long size = TWOMEG;
+ unsigned long size = read_pmd_pagesize();
unsigned long i;
void *old_ptr, *new_ptr = NULL;
void *map;
@@ -2498,7 +2447,7 @@ TEST_F(hmm, migrate_anon_huge_err)
int *ptr;
int ret;
- size = TWOMEG;
+ size = read_pmd_pagesize();
buffer = malloc(sizeof(*buffer));
ASSERT_NE(buffer, NULL);
@@ -2593,7 +2542,7 @@ TEST_F(hmm, migrate_anon_huge_zero_err)
int *ptr;
int ret;
- size = TWOMEG;
+ size = read_pmd_pagesize();
buffer = malloc(sizeof(*buffer));
ASSERT_NE(buffer, NULL);
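
Note on the hmm-tests hunks above (illustration, not part of the patch): the
hard-coded TWOMEG sizes are replaced with read_pmd_pagesize(), so the THP cases
use the PMD-mappable size of the running kernel, which is not 2 MiB on every
configuration (arm64 with 64K base pages maps 512 MiB per PMD, for example).
Below is a rough userspace sketch of what such a helper amounts to; the real
vm_util implementation differs in detail, so treat this as illustrative only.

/*
 * Minimal sketch: ask sysfs for the PMD-mappable THP size instead of
 * assuming 2 MiB. Error handling is simplified relative to vm_util.
 */
#include <stdio.h>

static unsigned long pmd_pagesize(void)
{
	FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
	unsigned long size = 0;

	if (!f)
		return 0;	/* THP not available; callers should skip */
	if (fscanf(f, "%lu", &size) != 1)
		size = 0;
	fclose(f);
	return size;
}

int main(void)
{
	printf("PMD-mappable THP size: %lu bytes\n", pmd_pagesize());
	return 0;
}
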
diff --git a/tools/testing/selftests/mm/hugetlb_dio.c b/tools/testing/selftests/mm/hugetlb_dio.c
index 9ac62eb4c97d..31a054fa8134 100644
--- a/tools/testing/selftests/mm/hugetlb_dio.c
+++ b/tools/testing/selftests/mm/hugetlb_dio.c
@@ -17,12 +17,57 @@
#include <unistd.h>
#include <string.h>
#include <sys/mman.h>
+#include <sys/syscall.h>
#include "vm_util.h"
#include "kselftest.h"
-void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
+#ifndef STATX_DIOALIGN
+#define STATX_DIOALIGN 0x00002000U
+#endif
+
+static int get_dio_alignment(int fd)
+{
+ struct statx stx;
+ int ret;
+
+ ret = syscall(__NR_statx, fd, "", AT_EMPTY_PATH, STATX_DIOALIGN, &stx);
+ if (ret < 0)
+ return -1;
+
+ /*
+ * If STATX_DIOALIGN is unsupported, assume no alignment
+ * constraint and let the test proceed.
+ */
+ if (!(stx.stx_mask & STATX_DIOALIGN) || !stx.stx_dio_offset_align)
+ return 1;
+
+ return stx.stx_dio_offset_align;
+}
+
+static bool check_dio_alignment(unsigned int start_off,
+ unsigned int end_off, unsigned int align)
+{
+ unsigned int writesize = end_off - start_off;
+
+ /*
+ * The kernel's DIO path checks that file offset, length, and
+ * buffer address are all multiples of dio_offset_align. When
+ * this test case's parameters don't satisfy that, the write
+ * would fail with -EINVAL before exercising the hugetlb unpin
+ * path, so skip.
+ */
+ if (start_off % align != 0 || writesize % align != 0) {
+ ksft_test_result_skip("DIO align=%u incompatible with offset %u writesize %u\n",
+ align, start_off, writesize);
+ return false;
+ }
+
+ return true;
+}
+
+static void run_dio_using_hugetlb(int fd, unsigned int start_off,
+ unsigned int end_off, unsigned int align)
{
- int fd;
char *buffer = NULL;
char *orig_buffer = NULL;
size_t h_pagesize = 0;
@@ -32,6 +77,9 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
const int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
const int mmap_prot = PROT_READ | PROT_WRITE;
+ if (!check_dio_alignment(start_off, end_off, align))
+ return;
+
writesize = end_off - start_off;
/* Get the default huge page size */
@@ -39,10 +87,9 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
if (!h_pagesize)
ksft_exit_fail_msg("Unable to determine huge page size\n");
- /* Open the file to DIO */
- fd = open("/tmp", O_TMPFILE | O_RDWR | O_DIRECT, 0664);
- if (fd < 0)
- ksft_exit_fail_perror("Error opening file\n");
+ /* Reset file position since fd is shared across tests */
+ if (lseek(fd, 0, SEEK_SET) < 0)
+ ksft_exit_fail_perror("lseek failed\n");
/* Get the free huge pages before allocation */
free_hpage_b = get_free_hugepages();
@@ -71,7 +118,6 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
/* unmap the huge page */
munmap(orig_buffer, h_pagesize);
- close(fd);
/* Get the free huge pages after unmap*/
free_hpage_a = get_free_hugepages();
@@ -89,37 +135,38 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
int main(void)
{
- size_t pagesize = 0;
- int fd;
+ int fd, align;
+ const size_t pagesize = psize();
ksft_print_header();
- /* Open the file to DIO */
- fd = open("/tmp", O_TMPFILE | O_RDWR | O_DIRECT, 0664);
- if (fd < 0)
- ksft_exit_skip("Unable to allocate file: %s\n", strerror(errno));
- close(fd);
-
/* Check if huge pages are free */
if (!get_free_hugepages())
ksft_exit_skip("No free hugepage, exiting\n");
- ksft_set_plan(4);
+ fd = open("/tmp", O_TMPFILE | O_RDWR | O_DIRECT, 0664);
+ if (fd < 0)
+ ksft_exit_skip("Unable to allocate file: %s\n", strerror(errno));
- /* Get base page size */
- pagesize = psize();
+ align = get_dio_alignment(fd);
+ if (align < 0)
+ ksft_exit_skip("Unable to obtain DIO alignment: %s\n",
+ strerror(errno));
+ ksft_set_plan(4);
/* start and end is aligned to pagesize */
- run_dio_using_hugetlb(0, (pagesize * 3));
+ run_dio_using_hugetlb(fd, 0, (pagesize * 3), align);
/* start is aligned but end is not aligned */
- run_dio_using_hugetlb(0, (pagesize * 3) - (pagesize / 2));
+ run_dio_using_hugetlb(fd, 0, (pagesize * 3) - (pagesize / 2), align);
/* start is unaligned and end is aligned */
- run_dio_using_hugetlb(pagesize / 2, (pagesize * 3));
+ run_dio_using_hugetlb(fd, pagesize / 2, (pagesize * 3), align);
/* both start and end are unaligned */
- run_dio_using_hugetlb(pagesize / 2, (pagesize * 3) + (pagesize / 2));
+ run_dio_using_hugetlb(fd, pagesize / 2, (pagesize * 3) + (pagesize / 2), align);
+
+ close(fd);
ksft_finished();
}
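
Note on the hugetlb_dio hunks above (illustration, not part of the patch): the
test now probes the tmpfile's direct-I/O alignment once with
statx(STATX_DIOALIGN) and skips the sub-tests whose offset or length the
filesystem would reject with -EINVAL before the hugetlb unpin path is ever
reached. A standalone sketch of the same probe is below, using the glibc
statx() wrapper instead of a raw syscall; it assumes headers new enough to
declare stx_dio_offset_align.

/*
 * Sketch of a STATX_DIOALIGN probe for a direct-I/O tmpfile. Requires
 * a reasonably recent kernel and libc; older systems simply report the
 * field as unsupported, in which case no alignment constraint is assumed.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

#ifndef STATX_DIOALIGN
#define STATX_DIOALIGN 0x00002000U
#endif

int main(void)
{
	struct statx stx;
	int fd = open("/tmp", O_TMPFILE | O_RDWR | O_DIRECT, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (statx(fd, "", AT_EMPTY_PATH, STATX_DIOALIGN, &stx) == 0 &&
	    (stx.stx_mask & STATX_DIOALIGN) && stx.stx_dio_offset_align)
		printf("DIO offset alignment: %u bytes\n",
		       stx.stx_dio_offset_align);
	else
		printf("STATX_DIOALIGN not reported; no constraint assumed\n");
	close(fd);
	return 0;
}
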
diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c
index 10b686102b79..519e5ac02db7 100644
--- a/tools/testing/selftests/mm/merge.c
+++ b/tools/testing/selftests/mm/merge.c
@@ -48,6 +48,19 @@ static pid_t do_fork(struct procmap_fd *procmap)
return 0;
}
+#ifdef __NR_mseal
+static int sys_mseal(void *ptr, size_t len, unsigned long flags)
+{
+ return syscall(__NR_mseal, (unsigned long)ptr, len, flags);
+}
+#else
+static int sys_mseal(void *ptr, size_t len, unsigned long flags)
+{
+ errno = ENOSYS;
+ return -1;
+}
+#endif
+
FIXTURE_SETUP(merge)
{
self->page_size = psize();
@@ -1217,6 +1230,81 @@ TEST_F(merge, mremap_correct_placed_faulted)
ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 15 * page_size);
}
+TEST_F(merge, merge_vmas_with_mseal)
+{
+ unsigned int page_size = self->page_size;
+ struct procmap_fd *procmap = &self->procmap;
+ char *ptr, *ptr2, *ptr3;
+ /* We need our own carveout, as we cannot munmap() once sealed. */
+ char *carveout;
+
+ /* Invalid mseal() call to see if implemented. */
+ ASSERT_EQ(sys_mseal(NULL, 0, ~0UL), -1);
+ if (errno == ENOSYS)
+ SKIP(return, "mseal not supported, skipping.");
+
+ /* Map carveout. */
+ carveout = mmap(NULL, 5 * page_size, PROT_NONE,
+ MAP_PRIVATE | MAP_ANON, -1, 0);
+ ASSERT_NE(carveout, MAP_FAILED);
+
+ /*
+ * Map 3 separate VMAs:
+ *
+ * |-----------|-----------|-----------|
+ * | RW | RWE | RO |
+ * |-----------|-----------|-----------|
+ * ptr ptr2 ptr3
+ */
+ ptr = mmap(&carveout[page_size], page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+ ptr2 = mmap(&carveout[2 * page_size], page_size,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ ASSERT_NE(ptr2, MAP_FAILED);
+ ptr3 = mmap(&carveout[3 * page_size], page_size, PROT_READ,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ ASSERT_NE(ptr3, MAP_FAILED);
+
+ /*
+ * mseal the second VMA:
+ *
+ * |-----------|-----------|-----------|
+ * | RW | RWES | RO |
+ * |-----------|-----------|-----------|
+ * ptr ptr2 ptr3
+ */
+ ASSERT_EQ(sys_mseal(ptr2, page_size, 0), 0);
+
+ /* Make first VMA mergeable upon mseal. */
+ ASSERT_EQ(mprotect(ptr, page_size,
+ PROT_READ | PROT_WRITE | PROT_EXEC), 0);
+ /*
+ * At this point we have:
+ *
+ * |-----------|-----------|-----------|
+ * | RWE | RWES | RO |
+ * |-----------|-----------|-----------|
+ * ptr ptr2 ptr3
+ *
+ * Now mseal all of the VMAs.
+ */
+ ASSERT_EQ(sys_mseal(ptr, 3 * page_size, 0), 0);
+
+ /*
+ * We should end up with:
+ *
+ * |-----------------------|-----------|
+ * | RWES | ROS |
+ * |-----------------------|-----------|
+ * ptr ptr3
+ */
+ ASSERT_TRUE(find_vma_procmap(procmap, ptr));
+ ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr);
+ ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 2 * page_size);
+}
+
TEST_F(merge_with_fork, mremap_faulted_to_unfaulted_prev)
{
struct procmap_fd *procmap = &self->procmap;
diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c
index 59c0dbe99a9b..bcfcac99b436 100644
--- a/tools/testing/selftests/mm/soft-dirty.c
+++ b/tools/testing/selftests/mm/soft-dirty.c
@@ -82,7 +82,9 @@ static void test_hugepage(int pagemap_fd, int pagesize)
int i, ret;
if (!thp_is_enabled()) {
- ksft_test_result_skip("Transparent Hugepages not available\n");
+ ksft_print_msg("Transparent Hugepages not available\n");
+ ksft_test_result_skip("Test %s huge page allocation\n", __func__);
+ ksft_test_result_skip("Test %s huge page dirty bit\n", __func__);
return;
}
diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
index e0167111bdd1..500d07c4938b 100644
--- a/tools/testing/selftests/mm/split_huge_page_test.c
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -21,6 +21,7 @@
#include <time.h>
#include "vm_util.h"
#include "kselftest.h"
+#include "thp_settings.h"
uint64_t pagesize;
unsigned int pageshift;
@@ -255,21 +256,6 @@ static int check_after_split_folio_orders(char *vaddr_start, size_t len,
return status;
}
-static void write_file(const char *path, const char *buf, size_t buflen)
-{
- int fd;
- ssize_t numwritten;
-
- fd = open(path, O_WRONLY);
- if (fd == -1)
- ksft_exit_fail_msg("%s open failed: %s\n", path, strerror(errno));
-
- numwritten = write(fd, buf, buflen - 1);
- close(fd);
- if (numwritten < 1)
- ksft_exit_fail_msg("Write failed\n");
-}
-
static void write_debugfs(const char *fmt, ...)
{
char input[INPUT_MAX];
@@ -772,6 +758,9 @@ int main(int argc, char **argv)
ksft_finished();
}
+ if (!thp_is_enabled())
+ ksft_exit_skip("Transparent Hugepages not available\n");
+
if (argc > 1)
optional_xfs_path = argv[1];
diff --git a/tools/testing/selftests/mm/thp_settings.c b/tools/testing/selftests/mm/thp_settings.c
index 574bd0f8ae48..e748ebfb3d4e 100644
--- a/tools/testing/selftests/mm/thp_settings.c
+++ b/tools/testing/selftests/mm/thp_settings.c
@@ -6,6 +6,7 @@
#include <string.h>
#include <unistd.h>
+#include "vm_util.h"
#include "thp_settings.h"
#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"
@@ -64,29 +65,6 @@ int read_file(const char *path, char *buf, size_t buflen)
return (unsigned int) numread;
}
-int write_file(const char *path, const char *buf, size_t buflen)
-{
- int fd;
- ssize_t numwritten;
-
- fd = open(path, O_WRONLY);
- if (fd == -1) {
- printf("open(%s)\n", path);
- exit(EXIT_FAILURE);
- return 0;
- }
-
- numwritten = write(fd, buf, buflen - 1);
- close(fd);
- if (numwritten < 1) {
- printf("write(%s)\n", buf);
- exit(EXIT_FAILURE);
- return 0;
- }
-
- return (unsigned int) numwritten;
-}
-
unsigned long read_num(const char *path)
{
char buf[21];
@@ -104,10 +82,7 @@ void write_num(const char *path, unsigned long num)
char buf[21];
sprintf(buf, "%ld", num);
- if (!write_file(path, buf, strlen(buf) + 1)) {
- perror(path);
- exit(EXIT_FAILURE);
- }
+ write_file(path, buf, strlen(buf) + 1);
}
int thp_read_string(const char *name, const char * const strings[])
@@ -165,11 +140,7 @@ void thp_write_string(const char *name, const char *val)
printf("%s: Pathname is too long\n", __func__);
exit(EXIT_FAILURE);
}
-
- if (!write_file(path, val, strlen(val) + 1)) {
- perror(path);
- exit(EXIT_FAILURE);
- }
+ write_file(path, val, strlen(val) + 1);
}
unsigned long thp_read_num(const char *name)
diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/thp_settings.h
index 76eeb712e5f1..7748a9009191 100644
--- a/tools/testing/selftests/mm/thp_settings.h
+++ b/tools/testing/selftests/mm/thp_settings.h
@@ -63,7 +63,6 @@ struct thp_settings {
};
int read_file(const char *path, char *buf, size_t buflen);
-int write_file(const char *path, const char *buf, size_t buflen);
unsigned long read_num(const char *path);
void write_num(const char *path, unsigned long num);
diff --git a/tools/testing/selftests/mm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c
index bcad47c09518..7a9f1035099b 100644
--- a/tools/testing/selftests/mm/transhuge-stress.c
+++ b/tools/testing/selftests/mm/transhuge-stress.c
@@ -17,6 +17,7 @@
#include <sys/mman.h>
#include "vm_util.h"
#include "kselftest.h"
+#include "thp_settings.h"
int backing_fd = -1;
int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE;
@@ -37,6 +38,9 @@ int main(int argc, char **argv)
ksft_print_header();
+ if (!thp_is_enabled())
+ ksft_exit_skip("Transparent Hugepages not available\n");
+
ram = sysconf(_SC_PHYS_PAGES);
if (ram > SIZE_MAX / psize() / 4)
ram = SIZE_MAX / 4;
diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
index a6d4ff7dfdc0..db94564f4431 100644
--- a/tools/testing/selftests/mm/vm_util.c
+++ b/tools/testing/selftests/mm/vm_util.c
@@ -764,3 +764,27 @@ int unpoison_memory(unsigned long pfn)
return ret > 0 ? 0 : -errno;
}
+
+void write_file(const char *path, const char *buf, size_t buflen)
+{
+ int fd, saved_errno;
+ ssize_t numwritten;
+
+ if (buflen < 2)
+ ksft_exit_fail_msg("Incorrect buffer len: %zu\n", buflen);
+
+ fd = open(path, O_WRONLY);
+ if (fd == -1)
+ ksft_exit_fail_msg("%s open failed: %s\n", path, strerror(errno));
+
+ numwritten = write(fd, buf, buflen - 1);
+ saved_errno = errno;
+ close(fd);
+ errno = saved_errno;
+ if (numwritten < 0)
+ ksft_exit_fail_msg("%s write(%.*s) failed: %s\n", path, (int)(buflen - 1),
+ buf, strerror(errno));
+ if (numwritten != buflen - 1)
+ ksft_exit_fail_msg("%s write(%.*s) is truncated, expected %zu bytes, got %zd bytes\n",
+ path, (int)(buflen - 1), buf, buflen - 1, numwritten);
+}
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
index e9c4e24769c1..1a07305ceff4 100644
--- a/tools/testing/selftests/mm/vm_util.h
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -166,3 +166,5 @@ int unpoison_memory(unsigned long pfn);
#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0)
#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1))
+
+void write_file(const char *path, const char *buf, size_t buflen);