From c354ec9cee909136e168f4f4900d56a491c4d6c5 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 3 Mar 2025 13:20:38 +0200 Subject: ptrace_get_syscall_info: factor out ptrace_get_syscall_info_op Move the code that calculates the type of the system call stop out of ptrace_get_syscall_info() into a separate function ptrace_get_syscall_info_op() which is going to be used later to implement PTRACE_SET_SYSCALL_INFO API. Link: https://lkml.kernel.org/r/20250303112038.GE24170@strace.io Signed-off-by: Dmitry V. Levin Reviewed-by: Oleg Nesterov Cc: Alexander Gordeev Cc: Alexey Gladkov (Intel) Cc: Andreas Larsson Cc: anton ivanov Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Brian Cain Cc: Charlie Jenkins Cc: Christian Borntraeger Cc: Christian Zankel Cc: Christophe Leroy Cc: Dave Hansen Cc: Davide Berardi Cc: David S. Miller Cc: Dinh Nguyen Cc: Eugene Syromiatnikov Cc: Eugene Syromyatnikov Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Maciej W. Rozycki Cc: Madhavan Srinivasan Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Frysinger Cc: Naveen N Rao Cc: Nicholas Piggin Cc: Renzo Davoi Cc: Richard Weinberger Cc: Rich Felker Cc: Russel King Cc: Shuah Khan Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- kernel/ptrace.c | 58 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d5f89f9ef29f..22e7d74cf4cd 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -921,7 +921,6 @@ ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, unsigned long args[ARRAY_SIZE(info->entry.args)]; int i; - info->op = PTRACE_SYSCALL_INFO_ENTRY; info->entry.nr = syscall_get_nr(child, regs); syscall_get_arguments(child, regs, args); for (i = 0; i < ARRAY_SIZE(args); i++) @@ -943,7 +942,6 @@ ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, * diverge significantly enough. */ ptrace_get_syscall_info_entry(child, regs, info); - info->op = PTRACE_SYSCALL_INFO_SECCOMP; info->seccomp.ret_data = child->ptrace_message; /* ret_data is the last field in struct ptrace_syscall_info.seccomp */ @@ -954,7 +952,6 @@ static unsigned long ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, struct ptrace_syscall_info *info) { - info->op = PTRACE_SYSCALL_INFO_EXIT; info->exit.rval = syscall_get_error(child, regs); info->exit.is_error = !!info->exit.rval; if (!info->exit.is_error) @@ -965,19 +962,8 @@ ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, } static int -ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, - void __user *datavp) +ptrace_get_syscall_info_op(struct task_struct *child) { - struct pt_regs *regs = task_pt_regs(child); - struct ptrace_syscall_info info = { - .op = PTRACE_SYSCALL_INFO_NONE, - .arch = syscall_get_arch(child), - .instruction_pointer = instruction_pointer(regs), - .stack_pointer = user_stack_pointer(regs), - }; - unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); - unsigned long write_size; - /* * This does not need lock_task_sighand() to access * child->last_siginfo because ptrace_freeze_traced() @@ -988,18 +974,42 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, case SIGTRAP | 0x80: switch (child->ptrace_message) { case PTRACE_EVENTMSG_SYSCALL_ENTRY: - actual_size = ptrace_get_syscall_info_entry(child, regs, - &info); - break; + return PTRACE_SYSCALL_INFO_ENTRY; case PTRACE_EVENTMSG_SYSCALL_EXIT: - actual_size = ptrace_get_syscall_info_exit(child, regs, - &info); - break; + return PTRACE_SYSCALL_INFO_EXIT; + default: + return PTRACE_SYSCALL_INFO_NONE; } - break; case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8): - actual_size = ptrace_get_syscall_info_seccomp(child, regs, - &info); + return PTRACE_SYSCALL_INFO_SECCOMP; + default: + return PTRACE_SYSCALL_INFO_NONE; + } +} + +static int +ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, + void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info = { + .op = ptrace_get_syscall_info_op(child), + .arch = syscall_get_arch(child), + .instruction_pointer = instruction_pointer(regs), + .stack_pointer = user_stack_pointer(regs), + }; + unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); + unsigned long write_size; + + switch (info.op) { + case PTRACE_SYSCALL_INFO_ENTRY: + actual_size = ptrace_get_syscall_info_entry(child, regs, &info); + break; + case PTRACE_SYSCALL_INFO_EXIT: + actual_size = ptrace_get_syscall_info_exit(child, regs, &info); + break; + case PTRACE_SYSCALL_INFO_SECCOMP: + actual_size = ptrace_get_syscall_info_seccomp(child, regs, &info); break; } -- cgit v1.2.3 From 26bb32768fe6552de044f782a58b3272073fbfc0 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 3 Mar 2025 13:20:44 +0200 Subject: ptrace: introduce PTRACE_SET_SYSCALL_INFO request PTRACE_SET_SYSCALL_INFO is a generic ptrace API that complements PTRACE_GET_SYSCALL_INFO by letting the ptracer modify details of system calls the tracee is blocked in. This API allows ptracers to obtain and modify system call details in a straightforward and architecture-agnostic way, providing a consistent way of manipulating the system call number and arguments across architectures. As in case of PTRACE_GET_SYSCALL_INFO, PTRACE_SET_SYSCALL_INFO also does not aim to address numerous architecture-specific system call ABI peculiarities, like differences in the number of system call arguments for such system calls as pread64 and preadv. The current implementation supports changing only those bits of system call information that are used by strace system call tampering, namely, syscall number, syscall arguments, and syscall return value. Support of changing additional details returned by PTRACE_GET_SYSCALL_INFO, such as instruction pointer and stack pointer, could be added later if needed, by using struct ptrace_syscall_info.flags to specify the additional details that should be set. Currently, "flags" and "reserved" fields of struct ptrace_syscall_info must be initialized with zeroes; "arch", "instruction_pointer", and "stack_pointer" fields are currently ignored. PTRACE_SET_SYSCALL_INFO currently supports only PTRACE_SYSCALL_INFO_ENTRY, PTRACE_SYSCALL_INFO_EXIT, and PTRACE_SYSCALL_INFO_SECCOMP operations. Other operations could be added later if needed. Ideally, PTRACE_SET_SYSCALL_INFO should have been introduced along with PTRACE_GET_SYSCALL_INFO, but it didn't happen. The last straw that convinced me to implement PTRACE_SET_SYSCALL_INFO was apparent failure to provide an API of changing the first system call argument on riscv architecture. ptrace(2) man page: long ptrace(enum __ptrace_request request, pid_t pid, void *addr, void *data); ... PTRACE_SET_SYSCALL_INFO Modify information about the system call that caused the stop. The "data" argument is a pointer to struct ptrace_syscall_info that specifies the system call information to be set. The "addr" argument should be set to sizeof(struct ptrace_syscall_info)). Link: https://lore.kernel.org/all/59505464-c84a-403d-972f-d4b2055eeaac@gmail.com/ Link: https://lkml.kernel.org/r/20250303112044.GF24170@strace.io Signed-off-by: Dmitry V. Levin Reviewed-by: Alexey Gladkov Reviewed-by: Charlie Jenkins Tested-by: Charlie Jenkins Reviewed-by: Eugene Syromiatnikov Reviewed-by: Oleg Nesterov Cc: Alexander Gordeev Cc: Andreas Larsson Cc: anton ivanov Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Brian Cain Cc: Christian Borntraeger Cc: Christian Zankel Cc: Christophe Leroy Cc: Dave Hansen Cc: Davide Berardi Cc: David S. Miller Cc: Dinh Nguyen Cc: Eugene Syromyatnikov Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Maciej W. Rozycki Cc: Madhavan Srinivasan Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Frysinger Cc: Naveen N Rao Cc: Nicholas Piggin Cc: Renzo Davoi Cc: Richard Weinberger Cc: Rich Felker Cc: Russel King Cc: Shuah Khan Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- kernel/ptrace.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 120 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 22e7d74cf4cd..75a84efad40f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -944,7 +944,10 @@ ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, ptrace_get_syscall_info_entry(child, regs, info); info->seccomp.ret_data = child->ptrace_message; - /* ret_data is the last field in struct ptrace_syscall_info.seccomp */ + /* + * ret_data is the last non-reserved field + * in struct ptrace_syscall_info.seccomp + */ return offsetofend(struct ptrace_syscall_info, seccomp.ret_data); } @@ -1016,6 +1019,118 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, write_size = min(actual_size, user_size); return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size; } + +static int +ptrace_set_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + unsigned long args[ARRAY_SIZE(info->entry.args)]; + int nr = info->entry.nr; + int i; + + /* + * Check that the syscall number specified in info->entry.nr + * is either a value of type "int" or a sign-extended value + * of type "int". + */ + if (nr != info->entry.nr) + return -ERANGE; + + for (i = 0; i < ARRAY_SIZE(args); i++) { + args[i] = info->entry.args[i]; + /* + * Check that the syscall argument specified in + * info->entry.args[i] is either a value of type + * "unsigned long" or a sign-extended value of type "long". + */ + if (args[i] != info->entry.args[i]) + return -ERANGE; + } + + syscall_set_nr(child, regs, nr); + /* + * If the syscall number is set to -1, setting syscall arguments is not + * just pointless, it would also clobber the syscall return value on + * those architectures that share the same register both for the first + * argument of syscall and its return value. + */ + if (nr != -1) + syscall_set_arguments(child, regs, args); + + return 0; +} + +static int +ptrace_set_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + /* + * info->entry is currently a subset of info->seccomp, + * info->seccomp.ret_data is currently ignored. + */ + return ptrace_set_syscall_info_entry(child, regs, info); +} + +static int +ptrace_set_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + long rval = info->exit.rval; + + /* + * Check that the return value specified in info->exit.rval + * is either a value of type "long" or a sign-extended value + * of type "long". + */ + if (rval != info->exit.rval) + return -ERANGE; + + if (info->exit.is_error) + syscall_set_return_value(child, regs, rval, 0); + else + syscall_set_return_value(child, regs, 0, rval); + + return 0; +} + +static int +ptrace_set_syscall_info(struct task_struct *child, unsigned long user_size, + const void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info; + + if (user_size < sizeof(info)) + return -EINVAL; + + /* + * The compatibility is tracked by info.op and info.flags: if user-space + * does not instruct us to use unknown extra bits from future versions + * of ptrace_syscall_info, we are not going to read them either. + */ + if (copy_from_user(&info, datavp, sizeof(info))) + return -EFAULT; + + /* Reserved for future use. */ + if (info.flags || info.reserved) + return -EINVAL; + + /* Changing the type of the system call stop is not supported yet. */ + if (ptrace_get_syscall_info_op(child) != info.op) + return -EINVAL; + + switch (info.op) { + case PTRACE_SYSCALL_INFO_ENTRY: + return ptrace_set_syscall_info_entry(child, regs, &info); + case PTRACE_SYSCALL_INFO_EXIT: + return ptrace_set_syscall_info_exit(child, regs, &info); + case PTRACE_SYSCALL_INFO_SECCOMP: + return ptrace_set_syscall_info_seccomp(child, regs, &info); + default: + /* Other types of system call stops are not supported yet. */ + return -EINVAL; + } +} #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ int ptrace_request(struct task_struct *child, long request, @@ -1234,6 +1349,10 @@ int ptrace_request(struct task_struct *child, long request, case PTRACE_GET_SYSCALL_INFO: ret = ptrace_get_syscall_info(child, addr, datavp); break; + + case PTRACE_SET_SYSCALL_INFO: + ret = ptrace_set_syscall_info(child, addr, datavp); + break; #endif case PTRACE_SECCOMP_GET_FILTER: -- cgit v1.2.3 From b56e64466554dfd39a7695f5025f583d8a20288d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Mar 2025 12:37:11 +0100 Subject: kernel/events/uprobes: pass VMA instead of MM to remove_breakpoint() Patch series "kernel/events/uprobes: uprobe_write_opcode() rewrite", v3. Currently, uprobe_write_opcode() implements COW-breaking manually, which is really far from ideal. Further, there is interest in supporting uprobes on hugetlb pages [1], and leaving at least the COW-breaking to the core will make this much easier. Also, I think the current code doesn't really handle some things properly (see patch #3) when replacing/zapping pages. Let's rewrite it, to leave COW-breaking to the fault handler, and handle registration/unregistration by temporarily unmapping the anonymous page, modifying it, and mapping it again. We still have to implement zapping of anonymous pages ourselves, unfortunately. We could look into not performing the temporary unmapping if we can perform the write atomically, which would likely also make adding hugetlb support a lot easier. But, limited (e.g., only PMD/PUD) hugetlb support could be added on top of this with some tweaking. Note that we now won't have to allocate another anonymous folio when unregistering (which will be beneficial for hugetlb as well), we can simply modify the already-mapped one from the registration (if any). When registering a uprobe, we'll first trigger a ptrace-like write fault to break COW, to then modify the already-mapped page. Briefly sanity tested with perf probes and with the bpf uprobes selftest. This patch (of 3): Pass VMA instead of MM to remove_breakpoint() and remove the "MM" argument from install_breakpoint(), because it can easily be derived from the VMA. Link: https://lkml.kernel.org/r/20250321113713.204682-1-david@redhat.com Link: https://lkml.kernel.org/r/20250321113713.204682-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Cc: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andrii Nakryiko Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Ingo Molnar Cc: Kan Liang Cc: Mark Rutland Cc: "Masami Hiramatsu (Google)" Cc: Matthew Wilcox (Oracle) Cc: Namhyung kim Cc: Russel King Cc: tongtiangen Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8d783b5882b6..8dc23ca9f66f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1134,10 +1134,10 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) return ret; } -static int -install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long vaddr) +static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, + unsigned long vaddr) { + struct mm_struct *mm = vma->vm_mm; bool first_uprobe; int ret; @@ -1162,9 +1162,11 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, return ret; } -static int -remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) +static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, + unsigned long vaddr) { + struct mm_struct *mm = vma->vm_mm; + set_bit(MMF_RECALC_UPROBES, &mm->flags); return set_orig_insn(&uprobe->arch, mm, vaddr); } @@ -1296,10 +1298,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (is_register) { /* consult only the "caller", new consumer. */ if (consumer_filter(new, mm)) - err = install_breakpoint(uprobe, mm, vma, info->vaddr); + err = install_breakpoint(uprobe, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { if (!filter_chain(uprobe, mm)) - err |= remove_breakpoint(uprobe, mm, info->vaddr); + err |= remove_breakpoint(uprobe, vma, info->vaddr); } unlock: @@ -1472,7 +1474,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) continue; vaddr = offset_to_vaddr(vma, uprobe->offset); - err |= remove_breakpoint(uprobe, mm, vaddr); + err |= remove_breakpoint(uprobe, vma, vaddr); } mmap_read_unlock(mm); @@ -1610,7 +1612,7 @@ int uprobe_mmap(struct vm_area_struct *vma) if (!fatal_signal_pending(current) && filter_chain(uprobe, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); - install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); + install_breakpoint(uprobe, vma, vaddr); } put_uprobe(uprobe); } -- cgit v1.2.3 From 8a5577428e8e586a107ee5f39eb4c86d32c971cd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Mar 2025 12:37:12 +0100 Subject: kernel/events/uprobes: pass VMA to set_swbp(), set_orig_insn() and uprobe_write_opcode() We already have the VMA, no need to look it up using get_user_page_vma_remote(). We can now switch to get_user_pages_remote(). Link: https://lkml.kernel.org/r/20250321113713.204682-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andrii Nakryiko Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: "Masami Hiramatsu (Google)" Cc: Matthew Wilcox (Oracle) Cc: Namhyung kim Cc: Russel King Cc: tongtiangen Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8dc23ca9f66f..c33d710e8db7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -474,19 +474,19 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, * * uprobe_write_opcode - write the opcode at a given virtual address. * @auprobe: arch specific probepoint information. - * @mm: the probed process address space. + * @vma: the probed virtual memory area. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * * Called with mm->mmap_lock held for read or write. * Return 0 (success) or a negative errno. */ -int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, - unsigned long vaddr, uprobe_opcode_t opcode) +int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr, uprobe_opcode_t opcode) { + struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; struct page *old_page, *new_page; - struct vm_area_struct *vma; int ret, is_register, ref_ctr_updated = 0; bool orig_page_huge = false; unsigned int gup_flags = FOLL_FORCE; @@ -498,9 +498,9 @@ retry: if (is_register) gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ - old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); - if (IS_ERR(old_page)) - return PTR_ERR(old_page); + ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &old_page, NULL); + if (ret != 1) + return ret; ret = verify_opcode(old_page, vaddr, &opcode); if (ret <= 0) @@ -590,30 +590,31 @@ put_old: /** * set_swbp - store breakpoint at a given address. * @auprobe: arch specific probepoint information. - * @mm: the probed process address space. + * @vma: the probed virtual memory area. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, store the breakpoint instruction at @vaddr. * Return 0 (success) or a negative errno. */ -int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) { - return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN); } /** * set_orig_insn - Restore the original instruction. - * @mm: the probed process address space. + * @vma: the probed virtual memory area. * @auprobe: arch specific probepoint information. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, restore the original opcode (opcode) at @vaddr. * Return 0 (success) or a negative errno. */ -int __weak -set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +int __weak set_orig_insn(struct arch_uprobe *auprobe, + struct vm_area_struct *vma, unsigned long vaddr) { - return uprobe_write_opcode(auprobe, mm, vaddr, + return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn); } @@ -1153,7 +1154,7 @@ static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, if (first_uprobe) set_bit(MMF_HAS_UPROBES, &mm->flags); - ret = set_swbp(&uprobe->arch, mm, vaddr); + ret = set_swbp(&uprobe->arch, vma, vaddr); if (!ret) clear_bit(MMF_RECALC_UPROBES, &mm->flags); else if (first_uprobe) @@ -1168,7 +1169,7 @@ static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; set_bit(MMF_RECALC_UPROBES, &mm->flags); - return set_orig_insn(&uprobe->arch, mm, vaddr); + return set_orig_insn(&uprobe->arch, vma, vaddr); } struct map_info { -- cgit v1.2.3 From 6e3092d788be1de0aac56b81fc17551c76645cdf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Mar 2025 12:37:13 +0100 Subject: kernel/events/uprobes: uprobe_write_opcode() rewrite uprobe_write_opcode() does some pretty low-level things that really, it shouldn't be doing: for example, manually breaking COW by allocating anonymous folios and replacing mapped pages. Further, it does seem to do some shaky things: for example, writing to possible COW-shared anonymous pages or zapping anonymous pages that might be pinned. We're also not taking care of uffd, uffd-wp, softdirty ... although rather corner cases here. Let's just get it right like ordinary ptrace writes would. Let's rewrite the code, leaving COW-breaking to core-MM, triggered by FOLL_FORCE|FOLL_WRITE (note that the code was already using FOLL_FORCE). We'll use GUP to lookup/faultin the page and break COW if required. Then, we'll walk the page tables using a folio_walk to perform our page modification atomically by temporarily unmap the PTE + flushing the TLB. Likely, we could avoid the temporary unmap in case we can just atomically write the instruction, but that will be a separate project. Unfortunately, we still have to implement the zapping logic manually, because we only want to zap in specific circumstances (e.g., page content identical). Note that we can now handle large folios (compound pages) and the shared zeropage just fine, so drop these checks. Link: https://lkml.kernel.org/r/20250321113713.204682-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andrii Nakryiko Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: "Masami Hiramatsu (Google)" Cc: Matthew Wilcox (Oracle) Cc: Namhyung kim Cc: Russel King Cc: tongtiangen Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 312 ++++++++++++++++++++++++------------------------ 1 file changed, 158 insertions(+), 154 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c33d710e8db7..4c965ba77f9f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -29,6 +29,7 @@ #include #include #include /* check_stable_address_space */ +#include #include @@ -151,91 +152,6 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start); } -/** - * __replace_page - replace page in vma by new page. - * based on replace_page in mm/ksm.c - * - * @vma: vma that holds the pte pointing to page - * @addr: address the old @page is mapped at - * @old_page: the page we are replacing by new_page - * @new_page: the modified page we replace page by - * - * If @new_page is NULL, only unmap @old_page. - * - * Returns 0 on success, negative error code otherwise. - */ -static int __replace_page(struct vm_area_struct *vma, unsigned long addr, - struct page *old_page, struct page *new_page) -{ - struct folio *old_folio = page_folio(old_page); - struct folio *new_folio; - struct mm_struct *mm = vma->vm_mm; - DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); - int err; - struct mmu_notifier_range range; - pte_t pte; - - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, - addr + PAGE_SIZE); - - if (new_page) { - new_folio = page_folio(new_page); - err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL); - if (err) - return err; - } - - /* For folio_free_swap() below */ - folio_lock(old_folio); - - mmu_notifier_invalidate_range_start(&range); - err = -EAGAIN; - if (!page_vma_mapped_walk(&pvmw)) - goto unlock; - VM_BUG_ON_PAGE(addr != pvmw.address, old_page); - pte = ptep_get(pvmw.pte); - - /* - * Handle PFN swap PTES, such as device-exclusive ones, that actually - * map pages: simply trigger GUP again to fix it up. - */ - if (unlikely(!pte_present(pte))) { - page_vma_mapped_walk_done(&pvmw); - goto unlock; - } - - if (new_page) { - folio_get(new_folio); - folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE); - folio_add_lru_vma(new_folio, vma); - } else - /* no new page, just dec_mm_counter for old_page */ - dec_mm_counter(mm, MM_ANONPAGES); - - if (!folio_test_anon(old_folio)) { - dec_mm_counter(mm, mm_counter_file(old_folio)); - inc_mm_counter(mm, MM_ANONPAGES); - } - - flush_cache_page(vma, addr, pte_pfn(pte)); - ptep_clear_flush(vma, addr, pvmw.pte); - if (new_page) - set_pte_at(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); - - folio_remove_rmap_pte(old_folio, old_page, vma); - if (!folio_mapped(old_folio)) - folio_free_swap(old_folio); - page_vma_mapped_walk_done(&pvmw); - folio_put(old_folio); - - err = 0; - unlock: - mmu_notifier_invalidate_range_end(&range); - folio_unlock(old_folio); - return err; -} - /** * is_swbp_insn - check if instruction is breakpoint instruction. * @insn: instruction to be checked. @@ -463,6 +379,95 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, return ret; } +static bool orig_page_is_identical(struct vm_area_struct *vma, + unsigned long vaddr, struct page *page, bool *pmd_mappable) +{ + const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT; + struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping, + index); + struct page *orig_page; + bool identical; + + if (IS_ERR(orig_folio)) + return false; + orig_page = folio_file_page(orig_folio, index); + + *pmd_mappable = folio_test_pmd_mappable(orig_folio); + identical = folio_test_uptodate(orig_folio) && + pages_identical(page, orig_page); + folio_put(orig_folio); + return identical; +} + +static int __uprobe_write_opcode(struct vm_area_struct *vma, + struct folio_walk *fw, struct folio *folio, + unsigned long opcode_vaddr, uprobe_opcode_t opcode) +{ + const unsigned long vaddr = opcode_vaddr & PAGE_MASK; + const bool is_register = !!is_swbp_insn(&opcode); + bool pmd_mappable; + + /* For now, we'll only handle PTE-mapped folios. */ + if (fw->level != FW_LEVEL_PTE) + return -EFAULT; + + /* + * See can_follow_write_pte(): we'd actually prefer a writable PTE here, + * but the VMA might not be writable. + */ + if (!pte_write(fw->pte)) { + if (!PageAnonExclusive(fw->page)) + return -EFAULT; + if (unlikely(userfaultfd_pte_wp(vma, fw->pte))) + return -EFAULT; + /* SOFTDIRTY is handled via pte_mkdirty() below. */ + } + + /* + * We'll temporarily unmap the page and flush the TLB, such that we can + * modify the page atomically. + */ + flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); + fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); + copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + + /* + * When unregistering, we may only zap a PTE if uffd is disabled and + * there are no unexpected folio references ... + */ + if (is_register || userfaultfd_missing(vma) || + (folio_ref_count(folio) != folio_mapcount(folio) + 1 + + folio_test_swapcache(folio) * folio_nr_pages(folio))) + goto remap; + + /* + * ... and the mapped page is identical to the original page that + * would get faulted in on next access. + */ + if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable)) + goto remap; + + dec_mm_counter(vma->vm_mm, MM_ANONPAGES); + folio_remove_rmap_pte(folio, fw->page, vma); + if (!folio_mapped(folio) && folio_test_swapcache(folio) && + folio_trylock(folio)) { + folio_free_swap(folio); + folio_unlock(folio); + } + folio_put(folio); + + return pmd_mappable; +remap: + /* + * Make sure that our copy_to_page() changes become visible before the + * set_pte_at() write. + */ + smp_wmb(); + /* We modified the page. Make sure to mark the PTE dirty. */ + set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte)); + return 0; +} + /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for @@ -475,116 +480,115 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, * uprobe_write_opcode - write the opcode at a given virtual address. * @auprobe: arch specific probepoint information. * @vma: the probed virtual memory area. - * @vaddr: the virtual address to store the opcode. - * @opcode: opcode to be written at @vaddr. + * @opcode_vaddr: the virtual address to store the opcode. + * @opcode: opcode to be written at @opcode_vaddr. * * Called with mm->mmap_lock held for read or write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, - unsigned long vaddr, uprobe_opcode_t opcode) + const unsigned long opcode_vaddr, uprobe_opcode_t opcode) { + const unsigned long vaddr = opcode_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; - struct page *old_page, *new_page; int ret, is_register, ref_ctr_updated = 0; - bool orig_page_huge = false; unsigned int gup_flags = FOLL_FORCE; + struct mmu_notifier_range range; + struct folio_walk fw; + struct folio *folio; + struct page *page; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); -retry: + if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) + return -EINVAL; + + /* + * When registering, we have to break COW to get an exclusive anonymous + * page that we can safely modify. Use FOLL_WRITE to trigger a write + * fault if required. When unregistering, we might be lucky and the + * anon page is already gone. So defer write faults until really + * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode() + * cannot deal with PMDs yet. + */ if (is_register) - gup_flags |= FOLL_SPLIT_PMD; - /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &old_page, NULL); - if (ret != 1) - return ret; + gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; - ret = verify_opcode(old_page, vaddr, &opcode); +retry: + ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL); if (ret <= 0) - goto put_old; - - if (is_zero_page(old_page)) { - ret = -EINVAL; - goto put_old; - } + goto out; + folio = page_folio(page); - if (WARN(!is_register && PageCompound(old_page), - "uprobe unregister should never work on compound page\n")) { - ret = -EINVAL; - goto put_old; + ret = verify_opcode(page, opcode_vaddr, &opcode); + if (ret <= 0) { + folio_put(folio); + goto out; } /* We are going to replace instruction, update ref_ctr. */ if (!ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); - if (ret) - goto put_old; + if (ret) { + folio_put(folio); + goto out; + } ref_ctr_updated = 1; } ret = 0; - if (!is_register && !PageAnon(old_page)) - goto put_old; - - ret = anon_vma_prepare(vma); - if (ret) - goto put_old; - - ret = -ENOMEM; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); - if (!new_page) - goto put_old; - - __SetPageUptodate(new_page); - copy_highpage(new_page, old_page); - copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + if (unlikely(!folio_test_anon(folio))) { + VM_WARN_ON_ONCE(is_register); + folio_put(folio); + goto out; + } if (!is_register) { - struct page *orig_page; - pgoff_t index; - - VM_BUG_ON_PAGE(!PageAnon(old_page), old_page); - - index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT; - orig_page = find_get_page(vma->vm_file->f_inode->i_mapping, - index); - - if (orig_page) { - if (PageUptodate(orig_page) && - pages_identical(new_page, orig_page)) { - /* let go new_page */ - put_page(new_page); - new_page = NULL; - - if (PageCompound(orig_page)) - orig_page_huge = true; - } - put_page(orig_page); - } + /* + * In the common case, we'll be able to zap the page when + * unregistering. So trigger MMU notifiers now, as we won't + * be able to do it under PTL. + */ + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, + vaddr, vaddr + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); + } + + ret = -EAGAIN; + /* Walk the page tables again, to perform the actual update. */ + if (folio_walk_start(&fw, vma, vaddr, 0)) { + if (fw.page == page) + ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode); + folio_walk_end(&fw, vma); } - ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page); - if (new_page) - put_page(new_page); -put_old: - put_page(old_page); + if (!is_register) + mmu_notifier_invalidate_range_end(&range); - if (unlikely(ret == -EAGAIN)) + folio_put(folio); + switch (ret) { + case -EFAULT: + gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; + fallthrough; + case -EAGAIN: goto retry; + default: + break; + } +out: /* Revert back reference counter if instruction update failed. */ - if (ret && is_register && ref_ctr_updated) + if (ret < 0 && is_register && ref_ctr_updated) update_ref_ctr(uprobe, mm, -1); /* try collapse pmd for compound page */ - if (!ret && orig_page_huge) + if (ret > 0) collapse_pte_mapped_thp(mm, vaddr, false); - return ret; + return ret < 0 ? ret : 0; } /** -- cgit v1.2.3 From 8adce0857769d596c5b000d118119e3bb1d63a32 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Thu, 24 Apr 2025 16:28:05 -0400 Subject: cpuset: rename cpuset_node_allowed to cpuset_current_node_allowed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "vmscan: enforce mems_effective during demotion", v5. Change reclaim to respect cpuset.mems_effective during demotion when possible. Presently, reclaim explicitly ignores cpuset.mems_effective when demoting, which may cause the cpuset settings to violated. Implement cpuset_node_allowed() to check the cpuset.mems_effective associated wih the mem_cgroup of the lruvec being scanned. This only applies to cgroup/cpuset v2, as cpuset exists in a different hierarchy than mem_cgroup in v1. This requires renaming the existing cpuset_node_allowed() to be cpuset_current_now_allowed() - which is more descriptive anyway - to implement the new cpuset_node_allowed() which takes a target cgroup. This patch (of 2): Rename cpuset_node_allowed to reflect that the function checks the current task's cpuset.mems. This allows us to make a new cpuset_node_allowed function that checks a target cgroup's cpuset.mems. Link: https://lkml.kernel.org/r/20250424202806.52632-1-gourry@gourry.net Link: https://lkml.kernel.org/r/20250424202806.52632-2-gourry@gourry.net Signed-off-by: Gregory Price Acked-by: Waiman Long Acked-by: Tejun Heo Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- kernel/cgroup/cpuset.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 306b60430091..54f6af362191 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4164,7 +4164,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) } /* - * cpuset_node_allowed - Can we allocate on a memory node? + * cpuset_current_node_allowed - Can current task allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * @@ -4203,7 +4203,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ -bool cpuset_node_allowed(int node, gfp_t gfp_mask) +bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ bool allowed; /* is allocation in zone z allowed? */ -- cgit v1.2.3 From 7d709f49babc28907b0ac60228f522d2e6216add Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Thu, 24 Apr 2025 16:28:06 -0400 Subject: vmscan,cgroup: apply mems_effective to reclaim MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is possible for a reclaimer to cause demotions of an lruvec belonging to a cgroup with cpuset.mems set to exclude some nodes. Attempt to apply this limitation based on the lruvec's memcg and prevent demotion. Notably, this may still allow demotion of shared libraries or any memory first instantiated in another cgroup. This means cpusets still cannot cannot guarantee complete isolation when demotion is enabled, and the docs have been updated to reflect this. This is useful for isolating workloads on a multi-tenant system from certain classes of memory more consistently - with the noted exceptions. Note on locking: The cgroup_get_e_css reference protects the css->effective_mems, and calls of this interface would be subject to the same race conditions associated with a non-atomic access to cs->effective_mems. So while this interface cannot make strong guarantees of correctness, it can therefore avoid taking a global or rcu_read_lock for performance. Link: https://lkml.kernel.org/r/20250424202806.52632-3-gourry@gourry.net Signed-off-by: Gregory Price Suggested-by: Shakeel Butt Suggested-by: Waiman Long Acked-by: Tejun Heo Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Reviewed-by: Waiman Long Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- kernel/cgroup/cpuset.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 54f6af362191..83639a12883d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4237,6 +4237,42 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) return allowed; } +bool cpuset_node_allowed(struct cgroup *cgroup, int nid) +{ + struct cgroup_subsys_state *css; + struct cpuset *cs; + bool allowed; + + /* + * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy + * and mems_allowed is likely to be empty even if we could get to it, + * so return true to avoid taking a global lock on the empty check. + */ + if (!cpuset_v2()) + return true; + + css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys); + if (!css) + return true; + + /* + * Normally, accessing effective_mems would require the cpuset_mutex + * or callback_lock - but node_isset is atomic and the reference + * taken via cgroup_get_e_css is sufficient to protect css. + * + * Since this interface is intended for use by migration paths, we + * relax locking here to avoid taking global locks - while accepting + * there may be rare scenarios where the result may be innaccurate. + * + * Reclaim and migration are subject to these same race conditions, and + * cannot make strong isolation guarantees, so this is acceptable. + */ + cs = container_of(css, struct cpuset, css); + allowed = node_isset(nid, cs->effective_mems); + css_put(css); + return allowed; +} + /** * cpuset_spread_node() - On which node to begin search for a page * @rotor: round robin rotor -- cgit v1.2.3 From 3dc92c311498c4d307cfdd0c6c3ac9355b50f683 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 9 May 2025 00:46:22 -0700 Subject: kexec: add Kexec HandOver (KHO) generation helpers Add the infrastructure to generate Kexec HandOver metadata. Kexec HandOver is a mechanism that allows Linux to preserve state - arbitrary properties as well as memory locations - across kexec. It does so using 2 concepts: 1) KHO FDT - Every KHO kexec carries a KHO specific flattened device tree blob that describes preserved memory regions. Device drivers can register to KHO to serialize and preserve their states before kexec. 2) Scratch Regions - CMA regions that we allocate in the first kernel. CMA gives us the guarantee that no handover pages land in those regions, because handover pages must be at a static physical memory location. We use these regions as the place to load future kexec images so that they won't collide with any handover data. Link: https://lkml.kernel.org/r/20250509074635.3187114-5-changyuanl@google.com Signed-off-by: Alexander Graf Co-developed-by: Mike Rapoport (Microsoft) Signed-off-by: Mike Rapoport (Microsoft) Co-developed-by: Pratyush Yadav Signed-off-by: Pratyush Yadav Co-developed-by: Changyuan Lyu Signed-off-by: Changyuan Lyu Cc: Andy Lutomirski Cc: Anthony Yznaga Cc: Arnd Bergmann Cc: Ashish Kalra Cc: Ben Herrenschmidt Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David Woodhouse Cc: Eric Biederman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Gowans Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Krzysztof Kozlowski Cc: Marc Rutland Cc: Paolo Bonzini Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Rob Herring Cc: Saravana Kannan Cc: Stanislav Kinsburskii Cc: Steven Rostedt Cc: Thomas Gleinxer Cc: Thomas Lendacky Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/Makefile | 1 + kernel/kexec_handover.c | 557 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 558 insertions(+) create mode 100644 kernel/kexec_handover.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 434929de17ef..97c09847db42 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -80,6 +80,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o +obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c new file mode 100644 index 000000000000..e541d3d5003d --- /dev/null +++ b/kernel/kexec_handover.c @@ -0,0 +1,557 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover.c - kexec handover metadata processing + * Copyright (C) 2023 Alexander Graf + * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport + * Copyright (C) 2025 Google LLC, Changyuan Lyu + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +/* + * KHO is tightly coupled with mm init and needs access to some of mm + * internal APIs. + */ +#include "../mm/internal.h" + +#define KHO_FDT_COMPATIBLE "kho-v1" +#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map" +#define PROP_SUB_FDT "fdt" + +static bool kho_enable __ro_after_init; + +bool kho_is_enabled(void) +{ + return kho_enable; +} +EXPORT_SYMBOL_GPL(kho_is_enabled); + +static int __init kho_parse_enable(char *p) +{ + return kstrtobool(p, &kho_enable); +} +early_param("kho", kho_parse_enable); + +struct kho_serialization { + struct page *fdt; + struct list_head fdt_list; + struct dentry *sub_fdt_dir; +}; + +/* + * With KHO enabled, memory can become fragmented because KHO regions may + * be anywhere in physical address space. The scratch regions give us a + * safe zones that we will never see KHO allocations from. This is where we + * can later safely load our new kexec images into and then use the scratch + * area for early allocations that happen before page allocator is + * initialized. + */ +static struct kho_scratch *kho_scratch; +static unsigned int kho_scratch_cnt; + +/* + * The scratch areas are scaled by default as percent of memory allocated from + * memblock. A user can override the scale with command line parameter: + * + * kho_scratch=N% + * + * It is also possible to explicitly define size for a lowmem, a global and + * per-node scratch areas: + * + * kho_scratch=l[KMG],n[KMG],m[KMG] + * + * The explicit size definition takes precedence over scale definition. + */ +static unsigned int scratch_scale __initdata = 200; +static phys_addr_t scratch_size_global __initdata; +static phys_addr_t scratch_size_pernode __initdata; +static phys_addr_t scratch_size_lowmem __initdata; + +static int __init kho_parse_scratch_size(char *p) +{ + size_t len; + unsigned long sizes[3]; + int i; + + if (!p) + return -EINVAL; + + len = strlen(p); + if (!len) + return -EINVAL; + + /* parse nn% */ + if (p[len - 1] == '%') { + /* unsigned int max is 4,294,967,295, 10 chars */ + char s_scale[11] = {}; + int ret = 0; + + if (len > ARRAY_SIZE(s_scale)) + return -EINVAL; + + memcpy(s_scale, p, len - 1); + ret = kstrtouint(s_scale, 10, &scratch_scale); + if (!ret) + pr_notice("scratch scale is %d%%\n", scratch_scale); + return ret; + } + + /* parse ll[KMG],mm[KMG],nn[KMG] */ + for (i = 0; i < ARRAY_SIZE(sizes); i++) { + char *endp = p; + + if (i > 0) { + if (*p != ',') + return -EINVAL; + p += 1; + } + + sizes[i] = memparse(p, &endp); + if (!sizes[i] || endp == p) + return -EINVAL; + p = endp; + } + + scratch_size_lowmem = sizes[0]; + scratch_size_global = sizes[1]; + scratch_size_pernode = sizes[2]; + scratch_scale = 0; + + pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lldMiB\n", + (u64)(scratch_size_lowmem >> 20), + (u64)(scratch_size_global >> 20), + (u64)(scratch_size_pernode >> 20)); + + return 0; +} +early_param("kho_scratch", kho_parse_scratch_size); + +static void __init scratch_size_update(void) +{ + phys_addr_t size; + + if (!scratch_scale) + return; + + size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT, + NUMA_NO_NODE); + size = size * scratch_scale / 100; + scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES); + + size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE, + NUMA_NO_NODE); + size = size * scratch_scale / 100 - scratch_size_lowmem; + scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES); +} + +static phys_addr_t __init scratch_size_node(int nid) +{ + phys_addr_t size; + + if (scratch_scale) { + size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE, + nid); + size = size * scratch_scale / 100; + } else { + size = scratch_size_pernode; + } + + return round_up(size, CMA_MIN_ALIGNMENT_BYTES); +} + +/** + * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec + * + * With KHO we can preserve arbitrary pages in the system. To ensure we still + * have a large contiguous region of memory when we search the physical address + * space for target memory, let's make sure we always have a large CMA region + * active. This CMA region will only be used for movable pages which are not a + * problem for us during KHO because we can just move them somewhere else. + */ +static void __init kho_reserve_scratch(void) +{ + phys_addr_t addr, size; + int nid, i = 0; + + if (!kho_enable) + return; + + scratch_size_update(); + + /* FIXME: deal with node hot-plug/remove */ + kho_scratch_cnt = num_online_nodes() + 2; + size = kho_scratch_cnt * sizeof(*kho_scratch); + kho_scratch = memblock_alloc(size, PAGE_SIZE); + if (!kho_scratch) + goto err_disable_kho; + + /* + * reserve scratch area in low memory for lowmem allocations in the + * next kernel + */ + size = scratch_size_lowmem; + addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0, + ARCH_LOW_ADDRESS_LIMIT); + if (!addr) + goto err_free_scratch_desc; + + kho_scratch[i].addr = addr; + kho_scratch[i].size = size; + i++; + + /* reserve large contiguous area for allocations without nid */ + size = scratch_size_global; + addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES); + if (!addr) + goto err_free_scratch_areas; + + kho_scratch[i].addr = addr; + kho_scratch[i].size = size; + i++; + + for_each_online_node(nid) { + size = scratch_size_node(nid); + addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES, + 0, MEMBLOCK_ALLOC_ACCESSIBLE, + nid, true); + if (!addr) + goto err_free_scratch_areas; + + kho_scratch[i].addr = addr; + kho_scratch[i].size = size; + i++; + } + + return; + +err_free_scratch_areas: + for (i--; i >= 0; i--) + memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size); +err_free_scratch_desc: + memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch)); +err_disable_kho: + kho_enable = false; +} + +struct fdt_debugfs { + struct list_head list; + struct debugfs_blob_wrapper wrapper; + struct dentry *file; +}; + +static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, + const char *name, const void *fdt) +{ + struct fdt_debugfs *f; + struct dentry *file; + + f = kmalloc(sizeof(*f), GFP_KERNEL); + if (!f) + return -ENOMEM; + + f->wrapper.data = (void *)fdt; + f->wrapper.size = fdt_totalsize(fdt); + + file = debugfs_create_blob(name, 0400, dir, &f->wrapper); + if (IS_ERR(file)) { + kfree(f); + return PTR_ERR(file); + } + + f->file = file; + list_add(&f->list, list); + + return 0; +} + +/** + * kho_add_subtree - record the physical address of a sub FDT in KHO root tree. + * @ser: serialization control object passed by KHO notifiers. + * @name: name of the sub tree. + * @fdt: the sub tree blob. + * + * Creates a new child node named @name in KHO root FDT and records + * the physical address of @fdt. The pages of @fdt must also be preserved + * by KHO for the new kernel to retrieve it after kexec. + * + * A debugfs blob entry is also created at + * ``/sys/kernel/debug/kho/out/sub_fdts/@name``. + * + * Return: 0 on success, error code on failure + */ +int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt) +{ + int err = 0; + u64 phys = (u64)virt_to_phys(fdt); + void *root = page_to_virt(ser->fdt); + + err |= fdt_begin_node(root, name); + err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys)); + err |= fdt_end_node(root); + + if (err) + return err; + + return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt); +} +EXPORT_SYMBOL_GPL(kho_add_subtree); + +struct kho_out { + struct blocking_notifier_head chain_head; + + struct dentry *dir; + + struct mutex lock; /* protects KHO FDT finalization */ + + struct kho_serialization ser; + bool finalized; +}; + +static struct kho_out kho_out = { + .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head), + .lock = __MUTEX_INITIALIZER(kho_out.lock), + .ser = { + .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list), + }, + .finalized = false, +}; + +int register_kho_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&kho_out.chain_head, nb); +} +EXPORT_SYMBOL_GPL(register_kho_notifier); + +int unregister_kho_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&kho_out.chain_head, nb); +} +EXPORT_SYMBOL_GPL(unregister_kho_notifier); + +/* Handling for debug/kho/out */ + +static struct dentry *debugfs_root; + +static int kho_out_update_debugfs_fdt(void) +{ + int err = 0; + struct fdt_debugfs *ff, *tmp; + + if (kho_out.finalized) { + err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir, + "fdt", page_to_virt(kho_out.ser.fdt)); + } else { + list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) { + debugfs_remove(ff->file); + list_del(&ff->list); + kfree(ff); + } + } + + return err; +} + +static int kho_abort(void) +{ + int err; + + err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT, + NULL); + err = notifier_to_errno(err); + + if (err) + pr_err("Failed to abort KHO finalization: %d\n", err); + + return err; +} + +static int kho_finalize(void) +{ + int err = 0; + void *fdt = page_to_virt(kho_out.ser.fdt); + + err |= fdt_create(fdt, PAGE_SIZE); + err |= fdt_finish_reservemap(fdt); + err |= fdt_begin_node(fdt, ""); + err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE); + if (err) + goto abort; + + err = blocking_notifier_call_chain(&kho_out.chain_head, + KEXEC_KHO_FINALIZE, &kho_out.ser); + err = notifier_to_errno(err); + if (err) + goto abort; + + err |= fdt_end_node(fdt); + err |= fdt_finish(fdt); + +abort: + if (err) { + pr_err("Failed to convert KHO state tree: %d\n", err); + kho_abort(); + } + + return err; +} + +static int kho_out_finalize_get(void *data, u64 *val) +{ + mutex_lock(&kho_out.lock); + *val = kho_out.finalized; + mutex_unlock(&kho_out.lock); + + return 0; +} + +static int kho_out_finalize_set(void *data, u64 _val) +{ + int ret = 0; + bool val = !!_val; + + mutex_lock(&kho_out.lock); + + if (val == kho_out.finalized) { + if (kho_out.finalized) + ret = -EEXIST; + else + ret = -ENOENT; + goto unlock; + } + + if (val) + ret = kho_finalize(); + else + ret = kho_abort(); + + if (ret) + goto unlock; + + kho_out.finalized = val; + ret = kho_out_update_debugfs_fdt(); + +unlock: + mutex_unlock(&kho_out.lock); + return ret; +} + +DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get, + kho_out_finalize_set, "%llu\n"); + +static int scratch_phys_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].addr); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_phys); + +static int scratch_len_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].size); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_len); + +static __init int kho_out_debugfs_init(void) +{ + struct dentry *dir, *f, *sub_fdt_dir; + + dir = debugfs_create_dir("out", debugfs_root); + if (IS_ERR(dir)) + return -ENOMEM; + + sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); + if (IS_ERR(sub_fdt_dir)) + goto err_rmdir; + + f = debugfs_create_file("scratch_phys", 0400, dir, NULL, + &scratch_phys_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("scratch_len", 0400, dir, NULL, + &scratch_len_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("finalize", 0600, dir, NULL, + &fops_kho_out_finalize); + if (IS_ERR(f)) + goto err_rmdir; + + kho_out.dir = dir; + kho_out.ser.sub_fdt_dir = sub_fdt_dir; + return 0; + +err_rmdir: + debugfs_remove_recursive(dir); + return -ENOENT; +} + +static __init int kho_init(void) +{ + int err = 0; + + if (!kho_enable) + return 0; + + kho_out.ser.fdt = alloc_page(GFP_KERNEL); + if (!kho_out.ser.fdt) { + err = -ENOMEM; + goto err_free_scratch; + } + + debugfs_root = debugfs_create_dir("kho", NULL); + if (IS_ERR(debugfs_root)) { + err = -ENOENT; + goto err_free_fdt; + } + + err = kho_out_debugfs_init(); + if (err) + goto err_free_fdt; + + for (int i = 0; i < kho_scratch_cnt; i++) { + unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr); + unsigned long count = kho_scratch[i].size >> PAGE_SHIFT; + unsigned long pfn; + + for (pfn = base_pfn; pfn < base_pfn + count; + pfn += pageblock_nr_pages) + init_cma_reserved_pageblock(pfn_to_page(pfn)); + } + + return 0; + +err_free_fdt: + put_page(kho_out.ser.fdt); + kho_out.ser.fdt = NULL; +err_free_scratch: + for (int i = 0; i < kho_scratch_cnt; i++) { + void *start = __va(kho_scratch[i].addr); + void *end = start + kho_scratch[i].size; + + free_reserved_area(start, end, -1, ""); + } + kho_enable = false; + return err; +} +late_initcall(kho_init); + +void __init kho_memory_init(void) +{ + kho_reserve_scratch(); +} -- cgit v1.2.3 From c609c144b0e8dbc19712ff8c8a0929be38afe58d Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 9 May 2025 00:46:23 -0700 Subject: kexec: add KHO parsing support When we have a KHO kexec, we get an FDT blob and scratch region to populate the state of the system. Provide helper functions that allow architecture code to easily handle memory reservations based on them and give device drivers visibility into the KHO FDT and memory reservations so they can recover their own state. Include a fix from Arnd Bergmann https://lore.kernel.org/lkml/20250424093302.3894961-1-arnd@kernel.org/. Link: https://lkml.kernel.org/r/20250509074635.3187114-6-changyuanl@google.com Signed-off-by: Alexander Graf Signed-off-by: Arnd Bergmann Co-developed-by: Mike Rapoport (Microsoft) Signed-off-by: Mike Rapoport (Microsoft) Co-developed-by: Changyuan Lyu Signed-off-by: Changyuan Lyu Cc: Andy Lutomirski Cc: Anthony Yznaga Cc: Ashish Kalra Cc: Ben Herrenschmidt Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David Woodhouse Cc: Eric Biederman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Gowans Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Krzysztof Kozlowski Cc: Marc Rutland Cc: Paolo Bonzini Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Pratyush Yadav Cc: Rob Herring Cc: Saravana Kannan Cc: Stanislav Kinsburskii Cc: Steven Rostedt Cc: Thomas Gleinxer Cc: Thomas Lendacky Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 233 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 232 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index e541d3d5003d..59f3cf9557f5 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -17,6 +17,9 @@ #include #include #include + +#include + /* * KHO is tightly coupled with mm init and needs access to some of mm * internal APIs. @@ -501,9 +504,112 @@ err_rmdir: return -ENOENT; } +struct kho_in { + struct dentry *dir; + phys_addr_t fdt_phys; + phys_addr_t scratch_phys; + struct list_head fdt_list; +}; + +static struct kho_in kho_in = { + .fdt_list = LIST_HEAD_INIT(kho_in.fdt_list), +}; + +static const void *kho_get_fdt(void) +{ + return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL; +} + +/** + * kho_retrieve_subtree - retrieve a preserved sub FDT by its name. + * @name: the name of the sub FDT passed to kho_add_subtree(). + * @phys: if found, the physical address of the sub FDT is stored in @phys. + * + * Retrieve a preserved sub FDT named @name and store its physical + * address in @phys. + * + * Return: 0 on success, error code on failure + */ +int kho_retrieve_subtree(const char *name, phys_addr_t *phys) +{ + const void *fdt = kho_get_fdt(); + const u64 *val; + int offset, len; + + if (!fdt) + return -ENOENT; + + if (!phys) + return -EINVAL; + + offset = fdt_subnode_offset(fdt, 0, name); + if (offset < 0) + return -ENOENT; + + val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len); + if (!val || len != sizeof(*val)) + return -EINVAL; + + *phys = (phys_addr_t)*val; + + return 0; +} +EXPORT_SYMBOL_GPL(kho_retrieve_subtree); + +/* Handling for debugfs/kho/in */ + +static __init int kho_in_debugfs_init(const void *fdt) +{ + struct dentry *sub_fdt_dir; + int err, child; + + kho_in.dir = debugfs_create_dir("in", debugfs_root); + if (IS_ERR(kho_in.dir)) + return PTR_ERR(kho_in.dir); + + sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir); + if (IS_ERR(sub_fdt_dir)) { + err = PTR_ERR(sub_fdt_dir); + goto err_rmdir; + } + + err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt); + if (err) + goto err_rmdir; + + fdt_for_each_subnode(child, fdt, 0) { + int len = 0; + const char *name = fdt_get_name(fdt, child, NULL); + const u64 *fdt_phys; + + fdt_phys = fdt_getprop(fdt, child, "fdt", &len); + if (!fdt_phys) + continue; + if (len != sizeof(*fdt_phys)) { + pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n", + name, len); + continue; + } + err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name, + phys_to_virt(*fdt_phys)); + if (err) { + pr_warn("failed to add fdt `%s` to debugfs: %d\n", name, + err); + continue; + } + } + + return 0; + +err_rmdir: + debugfs_remove_recursive(kho_in.dir); + return err; +} + static __init int kho_init(void) { int err = 0; + const void *fdt = kho_get_fdt(); if (!kho_enable) return 0; @@ -524,6 +630,20 @@ static __init int kho_init(void) if (err) goto err_free_fdt; + if (fdt) { + err = kho_in_debugfs_init(fdt); + /* + * Failure to create /sys/kernel/debug/kho/in does not prevent + * reviving state from KHO and setting up KHO for the next + * kexec. + */ + if (err) + pr_err("failed exposing handover FDT in debugfs: %d\n", + err); + + return 0; + } + for (int i = 0; i < kho_scratch_cnt; i++) { unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr); unsigned long count = kho_scratch[i].size >> PAGE_SHIFT; @@ -551,7 +671,118 @@ err_free_scratch: } late_initcall(kho_init); +static void __init kho_release_scratch(void) +{ + phys_addr_t start, end; + u64 i; + + memmap_init_kho_scratch_pages(); + + /* + * Mark scratch mem as CMA before we return it. That way we + * ensure that no kernel allocations happen on it. That means + * we can reuse it as scratch memory again later. + */ + __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, + MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) { + ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start)); + ulong end_pfn = pageblock_align(PFN_UP(end)); + ulong pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) + set_pageblock_migratetype(pfn_to_page(pfn), + MIGRATE_CMA); + } +} + void __init kho_memory_init(void) { - kho_reserve_scratch(); + if (kho_in.scratch_phys) { + kho_scratch = phys_to_virt(kho_in.scratch_phys); + kho_release_scratch(); + } else { + kho_reserve_scratch(); + } +} + +void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, + phys_addr_t scratch_phys, u64 scratch_len) +{ + void *fdt = NULL; + struct kho_scratch *scratch = NULL; + int err = 0; + unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch); + + /* Validate the input FDT */ + fdt = early_memremap(fdt_phys, fdt_len); + if (!fdt) { + pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys); + err = -EFAULT; + goto out; + } + err = fdt_check_header(fdt); + if (err) { + pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n", + fdt_phys, err); + err = -EINVAL; + goto out; + } + err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE); + if (err) { + pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n", + fdt_phys, KHO_FDT_COMPATIBLE, err); + err = -EINVAL; + goto out; + } + + scratch = early_memremap(scratch_phys, scratch_len); + if (!scratch) { + pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n", + scratch_phys, scratch_len); + err = -EFAULT; + goto out; + } + + /* + * We pass a safe contiguous blocks of memory to use for early boot + * purporses from the previous kernel so that we can resize the + * memblock array as needed. + */ + for (int i = 0; i < scratch_cnt; i++) { + struct kho_scratch *area = &scratch[i]; + u64 size = area->size; + + memblock_add(area->addr, size); + err = memblock_mark_kho_scratch(area->addr, size); + if (WARN_ON(err)) { + pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d", + &area->addr, &size, err); + goto out; + } + pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size); + } + + memblock_reserve(scratch_phys, scratch_len); + + /* + * Now that we have a viable region of scratch memory, let's tell + * the memblocks allocator to only use that for any allocations. + * That way we ensure that nothing scribbles over in use data while + * we initialize the page tables which we will need to ingest all + * memory reservations from the previous kernel. + */ + memblock_set_kho_scratch_only(); + + kho_in.fdt_phys = fdt_phys; + kho_in.scratch_phys = scratch_phys; + kho_scratch_cnt = scratch_cnt; + pr_info("found kexec handover data. Will skip init for some devices\n"); + +out: + if (fdt) + early_memunmap(fdt, fdt_len); + if (scratch) + early_memunmap(scratch, scratch_len); + if (err) + pr_warn("disabling KHO revival: %d\n", err); } -- cgit v1.2.3 From fc33e4b44b2717feba2f6f07ce7943a96499c9ec Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Fri, 9 May 2025 00:46:24 -0700 Subject: kexec: enable KHO support for memory preservation Introduce APIs allowing KHO users to preserve memory across kexec and get access to that memory after boot of the kexeced kernel kho_preserve_folio() - record a folio to be preserved over kexec kho_restore_folio() - recreates the folio from the preserved memory kho_preserve_phys() - record physically contiguous range to be preserved over kexec. The memory preservations are tracked by two levels of xarrays to manage chunks of per-order 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a 1TB x86 system would fit inside a single 512 byte bitmap. For order 0 allocations each bitmap will cover 16M of address space. Thus, for 16G of memory at most 512K of bitmap memory will be needed for order 0. At serialization time all bitmaps are recorded in a linked list of pages for the next kernel to process and the physical address of the list is recorded in KHO FDT. The next kernel then processes that list, reserves the memory ranges and later, when a user requests a folio or a physical range, KHO restores corresponding memory map entries. Link: https://lkml.kernel.org/r/20250509074635.3187114-7-changyuanl@google.com Suggested-by: Jason Gunthorpe Signed-off-by: Mike Rapoport (Microsoft) Co-developed-by: Changyuan Lyu Signed-off-by: Changyuan Lyu Cc: Alexander Graf Cc: Andy Lutomirski Cc: Anthony Yznaga Cc: Arnd Bergmann Cc: Ashish Kalra Cc: Ben Herrenschmidt Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David Woodhouse Cc: Eric Biederman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Gowans Cc: Jonathan Corbet Cc: Krzysztof Kozlowski Cc: Marc Rutland Cc: Paolo Bonzini Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Pratyush Yadav Cc: Rob Herring Cc: Saravana Kannan Cc: Stanislav Kinsburskii Cc: Steven Rostedt Cc: Thomas Gleinxer Cc: Thomas Lendacky Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 411 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 411 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 59f3cf9557f5..9cc818cefd15 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -9,6 +9,7 @@ #define pr_fmt(fmt) "KHO: " fmt #include +#include #include #include #include @@ -44,12 +45,307 @@ static int __init kho_parse_enable(char *p) } early_param("kho", kho_parse_enable); +/* + * Keep track of memory that is to be preserved across KHO. + * + * The serializing side uses two levels of xarrays to manage chunks of per-order + * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a + * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations + * each bitmap will cover 16M of address space. Thus, for 16G of memory at most + * 512K of bitmap memory will be needed for order 0. + * + * This approach is fully incremental, as the serialization progresses folios + * can continue be aggregated to the tracker. The final step, immediately prior + * to kexec would serialize the xarray information into a linked list for the + * successor kernel to parse. + */ + +#define PRESERVE_BITS (512 * 8) + +struct kho_mem_phys_bits { + DECLARE_BITMAP(preserve, PRESERVE_BITS); +}; + +struct kho_mem_phys { + /* + * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized + * to order. + */ + struct xarray phys_bits; +}; + +struct kho_mem_track { + /* Points to kho_mem_phys, each order gets its own bitmap tree */ + struct xarray orders; +}; + +struct khoser_mem_chunk; + struct kho_serialization { struct page *fdt; struct list_head fdt_list; struct dentry *sub_fdt_dir; + struct kho_mem_track track; + /* First chunk of serialized preserved memory map */ + struct khoser_mem_chunk *preserved_mem_map; +}; + +static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) +{ + void *elm, *res; + + elm = xa_load(xa, index); + if (elm) + return elm; + + elm = kzalloc(sz, GFP_KERNEL); + if (!elm) + return ERR_PTR(-ENOMEM); + + res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); + if (xa_is_err(res)) + res = ERR_PTR(xa_err(res)); + + if (res) { + kfree(elm); + return res; + } + + return elm; +} + +static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, + unsigned long end_pfn) +{ + struct kho_mem_phys_bits *bits; + struct kho_mem_phys *physxa; + + while (pfn < end_pfn) { + const unsigned int order = + min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + const unsigned long pfn_high = pfn >> order; + + physxa = xa_load(&track->orders, order); + if (!physxa) + continue; + + bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); + if (!bits) + continue; + + clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); + + pfn += 1 << order; + } +} + +static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, + unsigned int order) +{ + struct kho_mem_phys_bits *bits; + struct kho_mem_phys *physxa; + const unsigned long pfn_high = pfn >> order; + + might_sleep(); + + physxa = xa_load_or_alloc(&track->orders, order, sizeof(*physxa)); + if (IS_ERR(physxa)) + return PTR_ERR(physxa); + + bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS, + sizeof(*bits)); + if (IS_ERR(bits)) + return PTR_ERR(bits); + + set_bit(pfn_high % PRESERVE_BITS, bits->preserve); + + return 0; +} + +/* almost as free_reserved_page(), just don't free the page */ +static void kho_restore_page(struct page *page) +{ + ClearPageReserved(page); + init_page_count(page); + adjust_managed_page_count(page, 1); +} + +/** + * kho_restore_folio - recreates the folio from the preserved memory. + * @phys: physical address of the folio. + * + * Return: pointer to the struct folio on success, NULL on failure. + */ +struct folio *kho_restore_folio(phys_addr_t phys) +{ + struct page *page = pfn_to_online_page(PHYS_PFN(phys)); + unsigned long order; + + if (!page) + return NULL; + + order = page->private; + if (order) { + if (order > MAX_PAGE_ORDER) + return NULL; + + prep_compound_page(page, order); + } else { + kho_restore_page(page); + } + + return page_folio(page); +} +EXPORT_SYMBOL_GPL(kho_restore_folio); + +/* Serialize and deserialize struct kho_mem_phys across kexec + * + * Record all the bitmaps in a linked list of pages for the next kernel to + * process. Each chunk holds bitmaps of the same order and each block of bitmaps + * starts at a given physical address. This allows the bitmaps to be sparse. The + * xarray is used to store them in a tree while building up the data structure, + * but the KHO successor kernel only needs to process them once in order. + * + * All of this memory is normal kmalloc() memory and is not marked for + * preservation. The successor kernel will remain isolated to the scratch space + * until it completes processing this list. Once processed all the memory + * storing these ranges will be marked as free. + */ + +struct khoser_mem_bitmap_ptr { + phys_addr_t phys_start; + DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *); +}; + +struct khoser_mem_chunk_hdr { + DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *); + unsigned int order; + unsigned int num_elms; }; +#define KHOSER_BITMAP_SIZE \ + ((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \ + sizeof(struct khoser_mem_bitmap_ptr)) + +struct khoser_mem_chunk { + struct khoser_mem_chunk_hdr hdr; + struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE]; +}; + +static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); + +static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, + unsigned long order) +{ + struct khoser_mem_chunk *chunk; + + chunk = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!chunk) + return NULL; + chunk->hdr.order = order; + if (cur_chunk) + KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk); + return chunk; +} + +static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) +{ + struct khoser_mem_chunk *chunk = first_chunk; + + while (chunk) { + struct khoser_mem_chunk *tmp = chunk; + + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + kfree(tmp); + } +} + +static int kho_mem_serialize(struct kho_serialization *ser) +{ + struct khoser_mem_chunk *first_chunk = NULL; + struct khoser_mem_chunk *chunk = NULL; + struct kho_mem_phys *physxa; + unsigned long order; + + xa_for_each(&ser->track.orders, order, physxa) { + struct kho_mem_phys_bits *bits; + unsigned long phys; + + chunk = new_chunk(chunk, order); + if (!chunk) + goto err_free; + + if (!first_chunk) + first_chunk = chunk; + + xa_for_each(&physxa->phys_bits, phys, bits) { + struct khoser_mem_bitmap_ptr *elm; + + if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) { + chunk = new_chunk(chunk, order); + if (!chunk) + goto err_free; + } + + elm = &chunk->bitmaps[chunk->hdr.num_elms]; + chunk->hdr.num_elms++; + elm->phys_start = (phys * PRESERVE_BITS) + << (order + PAGE_SHIFT); + KHOSER_STORE_PTR(elm->bitmap, bits); + } + } + + ser->preserved_mem_map = first_chunk; + + return 0; + +err_free: + kho_mem_ser_free(first_chunk); + return -ENOMEM; +} + +static void deserialize_bitmap(unsigned int order, + struct khoser_mem_bitmap_ptr *elm) +{ + struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap); + unsigned long bit; + + for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) { + int sz = 1 << (order + PAGE_SHIFT); + phys_addr_t phys = + elm->phys_start + (bit << (order + PAGE_SHIFT)); + struct page *page = phys_to_page(phys); + + memblock_reserve(phys, sz); + memblock_reserved_mark_noinit(phys, sz); + page->private = order; + } +} + +static void __init kho_mem_deserialize(const void *fdt) +{ + struct khoser_mem_chunk *chunk; + const phys_addr_t *mem; + int len; + + mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len); + + if (!mem || len != sizeof(*mem)) { + pr_err("failed to get preserved memory bitmaps\n"); + return; + } + + chunk = *mem ? phys_to_virt(*mem) : NULL; + while (chunk) { + unsigned int i; + + for (i = 0; i != chunk->hdr.num_elms; i++) + deserialize_bitmap(chunk->hdr.order, + &chunk->bitmaps[i]); + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + } +} + /* * With KHO enabled, memory can become fragmented because KHO regions may * be anywhere in physical address space. The scratch regions give us a @@ -324,6 +620,9 @@ static struct kho_out kho_out = { .lock = __MUTEX_INITIALIZER(kho_out.lock), .ser = { .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list), + .track = { + .orders = XARRAY_INIT(kho_out.ser.track.orders, 0), + }, }, .finalized = false, }; @@ -340,6 +639,73 @@ int unregister_kho_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_kho_notifier); +/** + * kho_preserve_folio - preserve a folio across kexec. + * @folio: folio to preserve. + * + * Instructs KHO to preserve the whole folio across kexec. The order + * will be preserved as well. + * + * Return: 0 on success, error code on failure + */ +int kho_preserve_folio(struct folio *folio) +{ + const unsigned long pfn = folio_pfn(folio); + const unsigned int order = folio_order(folio); + struct kho_mem_track *track = &kho_out.ser.track; + + if (kho_out.finalized) + return -EBUSY; + + return __kho_preserve_order(track, pfn, order); +} +EXPORT_SYMBOL_GPL(kho_preserve_folio); + +/** + * kho_preserve_phys - preserve a physically contiguous range across kexec. + * @phys: physical address of the range. + * @size: size of the range. + * + * Instructs KHO to preserve the memory range from @phys to @phys + @size + * across kexec. + * + * Return: 0 on success, error code on failure + */ +int kho_preserve_phys(phys_addr_t phys, size_t size) +{ + unsigned long pfn = PHYS_PFN(phys); + unsigned long failed_pfn = 0; + const unsigned long start_pfn = pfn; + const unsigned long end_pfn = PHYS_PFN(phys + size); + int err = 0; + struct kho_mem_track *track = &kho_out.ser.track; + + if (kho_out.finalized) + return -EBUSY; + + if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) + return -EINVAL; + + while (pfn < end_pfn) { + const unsigned int order = + min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + + err = __kho_preserve_order(track, pfn, order); + if (err) { + failed_pfn = pfn; + break; + } + + pfn += 1 << order; + } + + if (err) + __kho_unpreserve(track, start_pfn, failed_pfn); + + return err; +} +EXPORT_SYMBOL_GPL(kho_preserve_phys); + /* Handling for debug/kho/out */ static struct dentry *debugfs_root; @@ -366,6 +732,25 @@ static int kho_out_update_debugfs_fdt(void) static int kho_abort(void) { int err; + unsigned long order; + struct kho_mem_phys *physxa; + + xa_for_each(&kho_out.ser.track.orders, order, physxa) { + struct kho_mem_phys_bits *bits; + unsigned long phys; + + xa_for_each(&physxa->phys_bits, phys, bits) + kfree(bits); + + xa_destroy(&physxa->phys_bits); + kfree(physxa); + } + xa_destroy(&kho_out.ser.track.orders); + + if (kho_out.ser.preserved_mem_map) { + kho_mem_ser_free(kho_out.ser.preserved_mem_map); + kho_out.ser.preserved_mem_map = NULL; + } err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT, NULL); @@ -380,12 +765,25 @@ static int kho_abort(void) static int kho_finalize(void) { int err = 0; + u64 *preserved_mem_map; void *fdt = page_to_virt(kho_out.ser.fdt); err |= fdt_create(fdt, PAGE_SIZE); err |= fdt_finish_reservemap(fdt); err |= fdt_begin_node(fdt, ""); err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE); + /** + * Reserve the preserved-memory-map property in the root FDT, so + * that all property definitions will precede subnodes created by + * KHO callers. + */ + err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP, + sizeof(*preserved_mem_map), + (void **)&preserved_mem_map); + if (err) + goto abort; + + err = kho_preserve_folio(page_folio(kho_out.ser.fdt)); if (err) goto abort; @@ -395,6 +793,12 @@ static int kho_finalize(void) if (err) goto abort; + err = kho_mem_serialize(&kho_out.ser); + if (err) + goto abort; + + *preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map); + err |= fdt_end_node(fdt); err |= fdt_finish(fdt); @@ -697,9 +1101,16 @@ static void __init kho_release_scratch(void) void __init kho_memory_init(void) { + struct folio *folio; + if (kho_in.scratch_phys) { kho_scratch = phys_to_virt(kho_in.scratch_phys); kho_release_scratch(); + + kho_mem_deserialize(kho_get_fdt()); + folio = kho_restore_folio(kho_in.fdt_phys); + if (!folio) + pr_warn("failed to restore folio for KHO fdt\n"); } else { kho_reserve_scratch(); } -- cgit v1.2.3 From 3bdecc3c93f9f68d11ed54971dde169b6ead9d78 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 9 May 2025 00:46:25 -0700 Subject: kexec: add KHO support to kexec file loads Kexec has 2 modes: A user space driven mode and a kernel driven mode. For the kernel driven mode, kernel code determines the physical addresses of all target buffers that the payload gets copied into. With KHO, we can only safely copy payloads into the "scratch area". Teach the kexec file loader about it, so it only allocates for that area. In addition, enlighten it with support to ask the KHO subsystem for its respective payloads to copy into target memory. Also teach the KHO subsystem how to fill the images for file loads. Link: https://lkml.kernel.org/r/20250509074635.3187114-8-changyuanl@google.com Signed-off-by: Alexander Graf Co-developed-by: Mike Rapoport (Microsoft) Signed-off-by: Mike Rapoport (Microsoft) Co-developed-by: Changyuan Lyu Signed-off-by: Changyuan Lyu Cc: Andy Lutomirski Cc: Anthony Yznaga Cc: Arnd Bergmann Cc: Ashish Kalra Cc: Ben Herrenschmidt Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David Woodhouse Cc: Eric Biederman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Gowans Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Krzysztof Kozlowski Cc: Marc Rutland Cc: Paolo Bonzini Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Pratyush Yadav Cc: Rob Herring Cc: Saravana Kannan Cc: Stanislav Kinsburskii Cc: Steven Rostedt Cc: Thomas Gleinxer Cc: Thomas Lendacky Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/kexec_file.c | 13 ++++++++++ kernel/kexec_handover.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/kexec_internal.h | 16 ++++++++++++ 3 files changed, 96 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index fba686487e3b..77758c533122 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -253,6 +253,11 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, /* IMA needs to pass the measurement list to the next kernel. */ ima_add_kexec_buffer(image); + /* If KHO is active, add its images to the list */ + ret = kho_fill_kimage(image); + if (ret) + goto out; + /* Call image load handler */ ldata = kexec_image_load_default(image); @@ -648,6 +653,14 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) return 0; + /* + * If KHO is active, only use KHO scratch memory. All other memory + * could potentially be handed over. + */ + ret = kho_locate_mem_hole(kbuf, locate_mem_hole_callback); + if (ret <= 0) + return ret; + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 9cc818cefd15..69b953551677 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -26,6 +26,7 @@ * internal APIs. */ #include "../mm/internal.h" +#include "kexec_internal.h" #define KHO_FDT_COMPATIBLE "kho-v1" #define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map" @@ -1197,3 +1198,69 @@ out: if (err) pr_warn("disabling KHO revival: %d\n", err); } + +/* Helper functions for kexec_file_load */ + +int kho_fill_kimage(struct kimage *image) +{ + ssize_t scratch_size; + int err = 0; + struct kexec_buf scratch; + + if (!kho_enable) + return 0; + + image->kho.fdt = page_to_phys(kho_out.ser.fdt); + + scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt; + scratch = (struct kexec_buf){ + .image = image, + .buffer = kho_scratch, + .bufsz = scratch_size, + .mem = KEXEC_BUF_MEM_UNKNOWN, + .memsz = scratch_size, + .buf_align = SZ_64K, /* Makes it easier to map */ + .buf_max = ULONG_MAX, + .top_down = true, + }; + err = kexec_add_buffer(&scratch); + if (err) + return err; + image->kho.scratch = &image->segment[image->nr_segments - 1]; + + return 0; +} + +static int kho_walk_scratch(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + int ret = 0; + int i; + + for (i = 0; i < kho_scratch_cnt; i++) { + struct resource res = { + .start = kho_scratch[i].addr, + .end = kho_scratch[i].addr + kho_scratch[i].size - 1, + }; + + /* Try to fit the kimage into our KHO scratch region */ + ret = func(&res, kbuf); + if (ret) + break; + } + + return ret; +} + +int kho_locate_mem_hole(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + int ret; + + if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH) + return 1; + + ret = kho_walk_scratch(kbuf, func); + + return ret == 1 ? 0 : -EADDRNOTAVAIL; +} diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index d35d9792402d..30a733a55a67 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -39,4 +39,20 @@ extern size_t kexec_purgatory_size; #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } #endif /* CONFIG_KEXEC_FILE */ + +struct kexec_buf; + +#ifdef CONFIG_KEXEC_HANDOVER +int kho_locate_mem_hole(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)); +int kho_fill_kimage(struct kimage *image); +#else +static inline int kho_locate_mem_hole(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 1; +} + +static inline int kho_fill_kimage(struct kimage *image) { return 0; } +#endif /* CONFIG_KEXEC_HANDOVER */ #endif /* LINUX_KEXEC_INTERNAL_H */ -- cgit v1.2.3 From 4e1d010e3bda2e0e4147e26490dbb1989ef65fc1 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 9 May 2025 00:46:26 -0700 Subject: kexec: add config option for KHO We have all generic code in place now to support Kexec with KHO. This patch adds a config option that depends on architecture support to enable KHO support. Link: https://lkml.kernel.org/r/20250509074635.3187114-9-changyuanl@google.com Signed-off-by: Alexander Graf Co-developed-by: Mike Rapoport (Microsoft) Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Changyuan Lyu Cc: Andy Lutomirski Cc: Anthony Yznaga Cc: Arnd Bergmann Cc: Ashish Kalra Cc: Ben Herrenschmidt Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David Woodhouse Cc: Eric Biederman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Gowans Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Krzysztof Kozlowski Cc: Marc Rutland Cc: Paolo Bonzini Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Pratyush Yadav Cc: Rob Herring Cc: Saravana Kannan Cc: Stanislav Kinsburskii Cc: Steven Rostedt Cc: Thomas Gleinxer Cc: Thomas Lendacky Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/Kconfig.kexec | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 4d111f871951..4fa212909d69 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -95,6 +95,20 @@ config KEXEC_JUMP Jump between original kernel and kexeced kernel and invoke code in physical address mode via KEXEC +config KEXEC_HANDOVER + bool "kexec handover" + depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE + select MEMBLOCK_KHO_SCRATCH + select KEXEC_FILE + select DEBUG_FS + select LIBFDT + select CMA + help + Allow kexec to hand over state across kernels by generating and + passing additional metadata to the target kernel. This is useful + to keep data or state alive across the kexec. For this to work, + both source and target kernels need to have this option enabled. + config CRASH_DUMP bool "kernel crash dumps" default ARCH_DEFAULT_CRASH_DUMP -- cgit v1.2.3 From 312eca8a14c5f58de62692517afd4738099e6fac Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 23 Apr 2025 14:33:40 +0100 Subject: mm, PM: use for_each_valid_pfn() in kernel/power/snapshot.c Link: https://lkml.kernel.org/r/20250423133821.789413-5-dwmw2@infradead.org Signed-off-by: David Woodhouse Acked-by: Mike Rapoport (Microsoft) Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: David Hildenbrand Cc: Marc Rutland Cc: Marc Zyngier Cc: Ruihan Li Cc: Will Deacon Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- kernel/power/snapshot.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 4e6e24e8b854..2af36cfe35cd 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1094,16 +1094,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm) ((unsigned long long) region->end_pfn << PAGE_SHIFT) - 1); - for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - if (pfn_valid(pfn)) { - /* - * It is safe to ignore the result of - * mem_bm_set_bit_check() here, since we won't - * touch the PFNs for which the error is - * returned anyway. - */ - mem_bm_set_bit_check(bm, pfn); - } + for_each_valid_pfn(pfn, region->start_pfn, region->end_pfn) { + /* + * It is safe to ignore the result of + * mem_bm_set_bit_check() here, since we won't + * touch the PFNs for which the error is + * returned anyway. + */ + mem_bm_set_bit_check(bm, pfn); + } } } @@ -1255,21 +1254,20 @@ static void mark_free_pages(struct zone *zone) spin_lock_irqsave(&zone->lock, flags); max_zone_pfn = zone_end_pfn(zone); - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); + for_each_valid_pfn(pfn, zone->zone_start_pfn, max_zone_pfn) { + page = pfn_to_page(pfn); - if (!--page_count) { - touch_nmi_watchdog(); - page_count = WD_PAGE_COUNT; - } + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } - if (page_zone(page) != zone) - continue; + if (page_zone(page) != zone) + continue; - if (!swsusp_page_is_forbidden(page)) - swsusp_unset_page_free(page); - } + if (!swsusp_page_is_forbidden(page)) + swsusp_unset_page_free(page); + } for_each_migratetype_order(order, t) { list_for_each_entry(page, -- cgit v1.2.3 From 1f6c6ac03db4a8fface0ad34ba12e5ffb4d51327 Mon Sep 17 00:00:00 2001 From: Libo Chen Date: Wed, 23 Apr 2025 19:45:22 -0700 Subject: sched/numa: skip VMA scanning on memory pinned to one NUMA node via cpuset.mems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "sched/numa: Skip VMA scanning on memory pinned to one NUMA node via cpuset.mems", v5. This patch (of 2): When the memory of the current task is pinned to one NUMA node by cgroup, there is no point in continuing the rest of VMA scanning and hinting page faults as they will just be overhead. With this change, there will be no more unnecessary PTE updates or page faults in this scenario. We have seen up to a 6x improvement on a typical java workload running on VMs with memory and CPU pinned to one NUMA node via cpuset in a two-socket AARCH64 system. With the same pinning, on a 18-cores-per-socket Intel platform, we have seen 20% improvment in a microbench that creates a 30-vCPU selftest KVM guest with 4GB memory, where each vCPU reads 4KB pages in a fixed number of loops. Link: https://lkml.kernel.org/r/20250424024523.2298272-1-libo.chen@oracle.com Link: https://lkml.kernel.org/r/20250424024523.2298272-2-libo.chen@oracle.com Signed-off-by: Libo Chen Tested-by: Chen Yu Tested-by: Srikanth Aithal Tested-by: Venkat Rao Bagalkote Cc: "Chen, Tim C" Cc: Chris Hyser Cc: Daniel Jordan Cc: Ingo Molnar Cc: Juri Lelli Cc: K Prateek Nayak Cc: Lorenzo Stoakes Cc: Madadi Vineeth Reddy Cc: Mel Gorman Cc: Michal Koutný Cc: Peter Zijlstra Cc: Raghavendra K T Cc: Steven Rostedt Cc: Tejun Heo Cc: Vincent Guittot Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0fb9bf995a47..b3b715e8a7cb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3329,6 +3329,13 @@ static void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; + /* + * Memory is pinned to only one NUMA node via cpuset.mems, naturally + * no page can be migrated. + */ + if (cpusets_enabled() && nodes_weight(cpuset_current_mems_allowed) == 1) + return; + if (!mm->numa_next_scan) { mm->numa_next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); -- cgit v1.2.3 From 3fc567e4c0b71d6c59ba26c5d6e54cf3c490dd3a Mon Sep 17 00:00:00 2001 From: Libo Chen Date: Wed, 23 Apr 2025 19:45:23 -0700 Subject: sched/numa: add tracepoint that tracks the skipping of numa balancing due to cpuset memory pinning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unlike sched_skip_vma_numa tracepoint which tracks skipped VMAs, this tracks the task subjected to cpuset.mems pinning and prints out its allowed memory node mask. Link: https://lkml.kernel.org/r/20250424024523.2298272-3-libo.chen@oracle.com Signed-off-by: Libo Chen Cc: "Chen, Tim C" Cc: Chen Yu Cc: Chris Hyser Cc: Daniel Jordan Cc: Ingo Molnar Cc: Juri Lelli Cc: K Prateek Nayak Cc: Lorenzo Stoakes Cc: Madadi Vineeth Reddy Cc: Mel Gorman Cc: Michal Koutný Cc: Peter Zijlstra Cc: Raghavendra K T Cc: Srikanth Aithal Cc: Steven Rostedt Cc: Tejun Heo Cc: Venkat Rao Bagalkote Cc: Vincent Guittot Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b3b715e8a7cb..cef163c174bd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3333,8 +3333,10 @@ static void task_numa_work(struct callback_head *work) * Memory is pinned to only one NUMA node via cpuset.mems, naturally * no page can be migrated. */ - if (cpusets_enabled() && nodes_weight(cpuset_current_mems_allowed) == 1) + if (cpusets_enabled() && nodes_weight(cpuset_current_mems_allowed) == 1) { + trace_sched_skip_cpuset_numa(current, &cpuset_current_mems_allowed); return; + } if (!mm->numa_next_scan) { mm->numa_next_scan = now + -- cgit v1.2.3 From 26a8f57760c12ccd860b36ac5b73daede4396aa3 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 28 Apr 2025 16:28:16 +0100 Subject: mm: move dup_mmap() to mm This is a key step in our being able to abstract and isolate VMA allocation and destruction logic. This function is the last one where vm_area_free() and vm_area_dup() are directly referenced outside of mmap, so having this in mm allows us to isolate these. We do the same for the nommu version which is substantially simpler. We place the declaration for dup_mmap() in mm/internal.h and have kernel/fork.c import this in order to prevent improper use of this functionality elsewhere in the kernel. While we're here, we remove the useless #ifdef CONFIG_MMU check around mmap_read_lock_maybe_expand() in mmap.c, mmap.c is compiled only if CONFIG_MMU is set. Link: https://lkml.kernel.org/r/e49aad3d00212f5539d9fa5769bfda4ce451db3e.1745853549.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Suggested-by: Pedro Falcato Reviewed-by: Pedro Falcato Reviewed-by: Liam R. Howlett Reviewed-by: Suren Baghdasaryan Reviewed-by: David Hildenbrand Reviewed-by: Kees Cook Reviewed-by: Vlastimil Babka Cc: Al Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jann Horn Signed-off-by: Andrew Morton --- kernel/fork.c | 189 ++-------------------------------------------------------- 1 file changed, 4 insertions(+), 185 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 168681fc4b25..ac9f9267a473 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -112,6 +112,9 @@ #include #include +/* For dup_mmap(). */ +#include "../mm/internal.h" + #include #define CREATE_TRACE_POINTS @@ -589,7 +592,7 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); -static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) { struct file *exe_file; @@ -604,183 +607,6 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) } #ifdef CONFIG_MMU -static __latent_entropy int dup_mmap(struct mm_struct *mm, - struct mm_struct *oldmm) -{ - struct vm_area_struct *mpnt, *tmp; - int retval; - unsigned long charge = 0; - LIST_HEAD(uf); - VMA_ITERATOR(vmi, mm, 0); - - if (mmap_write_lock_killable(oldmm)) - return -EINTR; - flush_cache_dup_mm(oldmm); - uprobe_dup_mmap(oldmm, mm); - /* - * Not linked in yet - no deadlock potential: - */ - mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); - - /* No ordering required: file already has been exposed. */ - dup_mm_exe_file(mm, oldmm); - - mm->total_vm = oldmm->total_vm; - mm->data_vm = oldmm->data_vm; - mm->exec_vm = oldmm->exec_vm; - mm->stack_vm = oldmm->stack_vm; - - /* Use __mt_dup() to efficiently build an identical maple tree. */ - retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); - if (unlikely(retval)) - goto out; - - mt_clear_in_rcu(vmi.mas.tree); - for_each_vma(vmi, mpnt) { - struct file *file; - - vma_start_write(mpnt); - if (mpnt->vm_flags & VM_DONTCOPY) { - retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, - mpnt->vm_end, GFP_KERNEL); - if (retval) - goto loop_out; - - vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); - continue; - } - charge = 0; - /* - * Don't duplicate many vmas if we've been oom-killed (for - * example) - */ - if (fatal_signal_pending(current)) { - retval = -EINTR; - goto loop_out; - } - if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned long len = vma_pages(mpnt); - - if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ - goto fail_nomem; - charge = len; - } - tmp = vm_area_dup(mpnt); - if (!tmp) - goto fail_nomem; - - /* track_pfn_copy() will later take care of copying internal state. */ - if (unlikely(tmp->vm_flags & VM_PFNMAP)) - untrack_pfn_clear(tmp); - - retval = vma_dup_policy(mpnt, tmp); - if (retval) - goto fail_nomem_policy; - tmp->vm_mm = mm; - retval = dup_userfaultfd(tmp, &uf); - if (retval) - goto fail_nomem_anon_vma_fork; - if (tmp->vm_flags & VM_WIPEONFORK) { - /* - * VM_WIPEONFORK gets a clean slate in the child. - * Don't prepare anon_vma until fault since we don't - * copy page for current vma. - */ - tmp->anon_vma = NULL; - } else if (anon_vma_fork(tmp, mpnt)) - goto fail_nomem_anon_vma_fork; - vm_flags_clear(tmp, VM_LOCKED_MASK); - /* - * Copy/update hugetlb private vma information. - */ - if (is_vm_hugetlb_page(tmp)) - hugetlb_dup_vma_private(tmp); - - /* - * Link the vma into the MT. After using __mt_dup(), memory - * allocation is not necessary here, so it cannot fail. - */ - vma_iter_bulk_store(&vmi, tmp); - - mm->map_count++; - - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - - file = tmp->vm_file; - if (file) { - struct address_space *mapping = file->f_mapping; - - get_file(file); - i_mmap_lock_write(mapping); - if (vma_is_shared_maywrite(tmp)) - mapping_allow_writable(mapping); - flush_dcache_mmap_lock(mapping); - /* insert tmp into the share list, just after mpnt */ - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); - i_mmap_unlock_write(mapping); - } - - if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(tmp, mpnt); - - if (retval) { - mpnt = vma_next(&vmi); - goto loop_out; - } - } - /* a new mm has just been created */ - retval = arch_dup_mmap(oldmm, mm); -loop_out: - vma_iter_free(&vmi); - if (!retval) { - mt_set_in_rcu(vmi.mas.tree); - ksm_fork(mm, oldmm); - khugepaged_fork(mm, oldmm); - } else { - - /* - * The entire maple tree has already been duplicated. If the - * mmap duplication fails, mark the failure point with - * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, - * stop releasing VMAs that have not been duplicated after this - * point. - */ - if (mpnt) { - mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); - mas_store(&vmi.mas, XA_ZERO_ENTRY); - /* Avoid OOM iterating a broken tree */ - set_bit(MMF_OOM_SKIP, &mm->flags); - } - /* - * The mm_struct is going to exit, but the locks will be dropped - * first. Set the mm_struct as unstable is advisable as it is - * not fully initialised. - */ - set_bit(MMF_UNSTABLE, &mm->flags); - } -out: - mmap_write_unlock(mm); - flush_tlb_mm(oldmm); - mmap_write_unlock(oldmm); - if (!retval) - dup_userfaultfd_complete(&uf); - else - dup_userfaultfd_fail(&uf); - return retval; - -fail_nomem_anon_vma_fork: - mpol_put(vma_policy(tmp)); -fail_nomem_policy: - vm_area_free(tmp); -fail_nomem: - retval = -ENOMEM; - vm_unacct_memory(charge); - goto loop_out; -} - static inline int mm_alloc_pgd(struct mm_struct *mm) { mm->pgd = pgd_alloc(mm); @@ -794,13 +620,6 @@ static inline void mm_free_pgd(struct mm_struct *mm) pgd_free(mm, mm->pgd); } #else -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) -{ - mmap_write_lock(oldmm); - dup_mm_exe_file(mm, oldmm); - mmap_write_unlock(oldmm); - return 0; -} #define mm_alloc_pgd(mm) (0) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ -- cgit v1.2.3 From 3e43e260f1e44d21861815faa905a1829027600f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 28 Apr 2025 16:28:17 +0100 Subject: mm: perform VMA allocation, freeing, duplication in mm Right now these are performed in kernel/fork.c which is odd and a violation of separation of concerns, as well as preventing us from integrating this and related logic into userland VMA testing going forward. There is a fly in the ointment - nommu - mmap.c is not compiled if CONFIG_MMU not set, and neither is vma.c. To square the circle, let's add a new file - vma_init.c. This will be compiled for both CONFIG_MMU and nommu builds, and will also form part of the VMA userland testing. This allows us to de-duplicate code, while maintaining separation of concerns and the ability for us to userland test this logic. Update the VMA userland tests accordingly, additionally adding a detach_free_vma() helper function to correctly detach VMAs before freeing them in test code, as this change was triggering the assert for this. [akpm@linux-foundation.org: remove stray newline, per Liam] Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Pedro Falcato Reviewed-by: David Hildenbrand Reviewed-by: Kees Cook Reviewed-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Cc: Al Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jann Horn Signed-off-by: Andrew Morton --- kernel/fork.c | 88 ----------------------------------------------------------- 1 file changed, 88 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ac9f9267a473..9e4616dacd82 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -431,88 +431,9 @@ struct kmem_cache *files_cachep; /* SLAB cache for fs_struct structures (tsk->fs) */ struct kmem_cache *fs_cachep; -/* SLAB cache for vm_area_struct structures */ -static struct kmem_cache *vm_area_cachep; - /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - - vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (!vma) - return NULL; - - vma_init(vma, mm); - - return vma; -} - -static void vm_area_init_from(const struct vm_area_struct *src, - struct vm_area_struct *dest) -{ - dest->vm_mm = src->vm_mm; - dest->vm_ops = src->vm_ops; - dest->vm_start = src->vm_start; - dest->vm_end = src->vm_end; - dest->anon_vma = src->anon_vma; - dest->vm_pgoff = src->vm_pgoff; - dest->vm_file = src->vm_file; - dest->vm_private_data = src->vm_private_data; - vm_flags_init(dest, src->vm_flags); - memcpy(&dest->vm_page_prot, &src->vm_page_prot, - sizeof(dest->vm_page_prot)); - /* - * src->shared.rb may be modified concurrently when called from - * dup_mmap(), but the clone will reinitialize it. - */ - data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); - memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, - sizeof(dest->vm_userfaultfd_ctx)); -#ifdef CONFIG_ANON_VMA_NAME - dest->anon_name = src->anon_name; -#endif -#ifdef CONFIG_SWAP - memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, - sizeof(dest->swap_readahead_info)); -#endif -#ifndef CONFIG_MMU - dest->vm_region = src->vm_region; -#endif -#ifdef CONFIG_NUMA - dest->vm_policy = src->vm_policy; -#endif -} - -struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) -{ - struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - - if (!new) - return NULL; - - ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); - ASSERT_EXCLUSIVE_WRITER(orig->vm_file); - vm_area_init_from(orig, new); - vma_lock_init(new, true); - INIT_LIST_HEAD(&new->anon_vma_chain); - vma_numab_state_init(new); - dup_anon_vma_name(orig, new); - - return new; -} - -void vm_area_free(struct vm_area_struct *vma) -{ - /* The vma should be detached while being destroyed. */ - vma_assert_detached(vma); - vma_numab_state_free(vma); - free_anon_vma_name(vma); - kmem_cache_free(vm_area_cachep, vma); -} - static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { @@ -3033,11 +2954,6 @@ void __init mm_cache_init(void) void __init proc_caches_init(void) { - struct kmem_cache_args args = { - .use_freeptr_offset = true, - .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), - }; - sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| @@ -3054,10 +2970,6 @@ void __init proc_caches_init(void) sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), &args, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| - SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } -- cgit v1.2.3 From 2aad4edf6e1018b28b7000faec56b7b6e585c8e1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 16 May 2025 17:34:46 -0700 Subject: mm: rename try_alloc_pages() to alloc_pages_nolock() The "try_" prefix is confusing, since it made people believe that try_alloc_pages() is analogous to spin_trylock() and NULL return means EAGAIN. This is not the case. If it returns NULL there is no reason to call it again. It will most likely return NULL again. Hence rename it to alloc_pages_nolock() to make it symmetrical to free_pages_nolock() and document that NULL means ENOMEM. Link: https://lkml.kernel.org/r/20250517003446.60260-1-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Acked-by: Harry Yoo Cc: Andrii Nakryiko Cc: Kumar Kartikeya Dwivedi Cc: Michal Hocko Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Signed-off-by: Andrew Morton --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 64c3393e8270..9cdb4f22640f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -578,7 +578,7 @@ static bool can_alloc_pages(void) static struct page *__bpf_alloc_page(int nid) { if (!can_alloc_pages()) - return try_alloc_pages(nid, 0); + return alloc_pages_nolock(nid, 0); return alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT -- cgit v1.2.3