From a7cd9a5376aa188529393a802329645dc8c5a63c Mon Sep 17 00:00:00 2001
From: Miaohe Lin
Date: Wed, 23 Mar 2022 16:05:35 -0700
Subject: kernel/ksysfs.c: use helper macro __ATTR_RW

Use the helper macro __ATTR_RW to define the kobj_attribute to make the
code clearer. Minor readability improvement.

Link: https://lkml.kernel.org/r/20220222112034.48298-1-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin
Cc: Greg Kroah-Hartman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/ksysfs.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 35859da8bd4f..b1292a57c2a5 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -24,8 +24,7 @@
 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
 
 #define KERNEL_ATTR_RW(_name) \
-static struct kobj_attribute _name##_attr = \
-        __ATTR(_name, 0644, _name##_show, _name##_store)
+static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
 
 /* current uevent sequence number */
 static ssize_t uevent_seqnum_show(struct kobject *kobj,
--
cgit v1.2.3


From b1e2c8df0f007f34c373925471c7b493c9e16620 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Wed, 23 Mar 2022 16:06:29 -0700
Subject: cgroup: use irqsave in cgroup_rstat_flush_locked().

All callers of cgroup_rstat_flush_locked() acquire cgroup_rstat_lock
either with spin_lock_irq() or spin_lock_irqsave().
cgroup_rstat_flush_locked() itself acquires cgroup_rstat_cpu_lock, which
is a raw_spinlock_t. This lock is also acquired in cgroup_rstat_updated()
in IRQ context and therefore requires the _irqsave() locking suffix in
cgroup_rstat_flush_locked().

Since there is no difference between spinlock_t and raw_spinlock_t on
!RT, lockdep does not complain here. On RT, lockdep complains because
interrupts were not disabled here and a deadlock is possible.

Acquire the raw_spinlock_t with disabled interrupts.

Link: https://lkml.kernel.org/r/20220301122143.1521823-2-bigeasy@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior
Acked-by: Tejun Heo
Cc: Johannes Weiner
Cc: Thomas Gleixner
Cc: Zefan Li

From: Sebastian Andrzej Siewior
Subject: cgroup: add a comment to cgroup_rstat_flush_locked().

Add a comment explaining why spin_lock_irq() -> raw_spin_lock_irqsave()
is needed.

Link: https://lkml.kernel.org/r/Yh+DOK73hfVV5ThX@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior
Acked-by: Tejun Heo
Cc: Johannes Weiner
Cc: Thomas Gleixner
Cc: Zefan Li
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/cgroup/rstat.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 9d331ba44870..ba7a660184e4 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -153,8 +153,17 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
 		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
 						       cpu);
 		struct cgroup *pos = NULL;
+		unsigned long flags;
 
-		raw_spin_lock(cpu_lock);
+		/*
+		 * The _irqsave() is needed because cgroup_rstat_lock is
+		 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+		 * this lock with the _irq() suffix only disables interrupts on
+		 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+		 * interrupts on both configurations. The _irqsave() ensures
+		 * that interrupts are always disabled and later restored.
+		 */
+		raw_spin_lock_irqsave(cpu_lock, flags);
 		while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
 			struct cgroup_subsys_state *css;
 
@@ -166,7 +175,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
 			css->ss->css_rstat_flush(css, cpu);
 			rcu_read_unlock();
 		}
-		raw_spin_unlock(cpu_lock);
+		raw_spin_unlock_irqrestore(cpu_lock, flags);
 
 		/* if @may_sleep, play nice and yield if necessary */
 		if (may_sleep && (need_resched() ||
--
cgit v1.2.3


From 1a2383e8b84c0451fd9b1eec3b9aab16f30b597c Mon Sep 17 00:00:00 2001
From: Tiezhu Yang
Date: Wed, 23 Mar 2022 16:06:51 -0700
Subject: panic: unset panic_on_warn inside panic()

In the current code, the following three places need to unset
panic_on_warn before calling panic() to avoid recursive panics:

kernel/kcsan/report.c: print_report()
kernel/sched/core.c: __schedule_bug()
mm/kfence/report.c: kfence_report_error()

In order to avoid copy-pasting "panic_on_warn = 0" all over the place, it
is better to move it inside panic() and then remove it from the other
places.

Link: https://lkml.kernel.org/r/1644324666-15947-4-git-send-email-yangtiezhu@loongson.cn
Signed-off-by: Tiezhu Yang
Reviewed-by: Marco Elver
Cc: Andrey Ryabinin
Cc: Baoquan He
Cc: Jonathan Corbet
Cc: Xuefeng Li
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/panic.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 55b50e052ec3..95ba825522dd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -185,6 +185,16 @@ void panic(const char *fmt, ...)
 	int old_cpu, this_cpu;
 	bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
 
+	if (panic_on_warn) {
+		/*
+		 * This thread may hit another WARN() in the panic path.
+		 * Resetting this prevents additional WARN() from panicking the
+		 * system on this thread. Other threads are blocked by the
+		 * panic_mutex in panic().
+		 */
+		panic_on_warn = 0;
+	}
+
 	/*
 	 * Disable local interrupts. This will prevent panic_smp_self_stop
 	 * from deadlocking the first cpu that invokes the panic, since
@@ -576,16 +586,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 	if (regs)
 		show_regs(regs);
 
-	if (panic_on_warn) {
-		/*
-		 * This thread may hit another WARN() in the panic path.
-		 * Resetting this prevents additional WARN() from panicking the
-		 * system on this thread. Other threads are blocked by the
-		 * panic_mutex in panic().
-		 */
-		panic_on_warn = 0;
+	if (panic_on_warn)
 		panic("panic_on_warn set ...\n");
-	}
 
 	if (!regs)
 		dump_stack();
--
cgit v1.2.3


From 92333baaceb399c7878c7f868a0746d7d22f9b77 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn
Date: Wed, 23 Mar 2022 16:07:00 -0700
Subject: taskstats: remove unneeded dead assignment

Running make clang-analyzer on an x86_64 defconfig caught my attention
with:

kernel/taskstats.c:120:2: warning: Value stored to 'rc' is never read \
[clang-analyzer-deadcode.DeadStores]
        rc = 0;
        ^

Commit d94a041519f3 ("taskstats: free skb, avoid returns in
send_cpu_listeners") made send_cpu_listeners() not return a value, and
hence the rc variable remained only to be used within the loop, where it
is always assigned before being read, so it needs no other
initialisation.

So, simply remove this unneeded dead initializing assignment. As
compilers detect and optimize away this unneeded assignment anyway, the
resulting object code is identical before and after this change.

No functional change. No change to object code.
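To make the flagged pattern concrete, here is a minimal self-contained
reduction (hypothetical example code, not taken from the kernel tree):
the early store is dead because every read of rc inside the loop is
preceded by a fresh assignment, so the declaration can simply move into
the narrower scope.

/* Hypothetical reduction of the dead-store pattern, not kernel code. */
static int count_positive(const int *vals, int n)
{
        int total = 0;
        int i, rc;

        rc = 0;                 /* dead store: overwritten before any read */
        for (i = 0; i < n; i++) {
                rc = vals[i] > 0;       /* every read of rc follows this store */
                total += rc;
        }
        return total;           /* the fix: declare rc inside the loop instead */
}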
[akpm@linux-foundation.org: reduce scope of `rc']
Link: https://lkml.kernel.org/r/20220307093942.21310-1-lukas.bulwahn@gmail.com
Signed-off-by: Lukas Bulwahn
Reviewed-by: Nick Desaulniers
Cc: Balbir Singh
Cc: Tom Rix
Cc: Nathan Chancellor
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/taskstats.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 2b4898b4752e..bcac5a9043aa 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -113,13 +113,14 @@ static void send_cpu_listeners(struct sk_buff *skb,
 	struct listener *s, *tmp;
 	struct sk_buff *skb_next, *skb_cur = skb;
 	void *reply = genlmsg_data(genlhdr);
-	int rc, delcount = 0;
+	int delcount = 0;
 
 	genlmsg_end(skb, reply);
 
-	rc = 0;
 	down_read(&listeners->sem);
 	list_for_each_entry(s, &listeners->list, list) {
+		int rc;
+
 		skb_next = NULL;
 		if (!list_is_last(&s->list, &listeners->list)) {
 			skb_next = skb_clone(skb_cur, GFP_KERNEL);
--
cgit v1.2.3


From 8d470a45d1a65e6a308aeee5da7f5b37d3303c04 Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli"
Date: Wed, 23 Mar 2022 16:07:06 -0700
Subject: panic: add option to dump all CPUs backtraces in panic_print

Currently the "panic_print" parameter/sysctl allows some interesting
debug information to be printed during a panic event. This is useful,
for example, in cases where the user cannot kdump due to resource limits,
or when the user collects panic logs on a serial output (or pstore) and
prefers a fast reboot instead of a kdump.

Currently, however, there is no way to see all CPUs' backtraces in a
panic using "panic_print", even on architectures that support that. We
do have the "oops_all_cpu_backtrace" sysctl, but although the two
partially overlap in functionality, they are orthogonal in nature:
"panic_print" is a panic tuning (and there are panics without oopses,
such as direct calls to panic() or other paths that don't go through the
oops_enter() function), while the original purpose of
"oops_all_cpu_backtrace" is to provide more information on oopses for
cases in which the user desires to continue running the kernel even
after an oops, i.e., non-panic scenarios.

So, we hereby introduce an additional bit for "panic_print" to allow
dumping all CPUs' backtraces during a panic event.

Link: https://lkml.kernel.org/r/20211109202848.610874-3-gpiccoli@igalia.com
Signed-off-by: Guilherme G. Piccoli
Reviewed-by: Feng Tang
Cc: Iurii Zaikin
Cc: Kees Cook
Cc: Luis Chamberlain
Cc: Samuel Iglesias Gonsalvez
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/panic.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 95ba825522dd..3c3fb36d8d41 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -66,6 +66,7 @@ EXPORT_SYMBOL_GPL(panic_timeout);
 #define PANIC_PRINT_LOCK_INFO		0x00000008
 #define PANIC_PRINT_FTRACE_INFO		0x00000010
 #define PANIC_PRINT_ALL_PRINTK_MSG	0x00000020
+#define PANIC_PRINT_ALL_CPU_BT		0x00000040
 unsigned long panic_print;
 
 ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -152,6 +153,9 @@ static void panic_print_sys_info(void)
 	if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
 		console_flush_on_panic(CONSOLE_REPLAY_ALL);
 
+	if (panic_print & PANIC_PRINT_ALL_CPU_BT)
+		trigger_all_cpu_backtrace();
+
 	if (panic_print & PANIC_PRINT_TASK_INFO)
 		show_state();
 
--
cgit v1.2.3


From f953f140f318a641c443b0b8c618155ed90a7a10 Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli"
Piccoli" Date: Wed, 23 Mar 2022 16:07:09 -0700 Subject: panic: move panic_print before kmsg dumpers The panic_print setting allows users to collect more information in a panic event, like memory stats, tasks, CPUs backtraces, etc. This is an interesting debug mechanism, but currently the print event happens *after* kmsg_dump(), meaning that pstore, for example, cannot collect a dmesg with the panic_print extra information. This patch changes that in 2 steps: (a) The panic_print setting allows to replay the existing kernel log buffer to the console (bit 5), besides the extra information dump. This functionality makes sense only at the end of the panic() function. So, we hereby allow to distinguish the two situations by a new boolean parameter in the function panic_print_sys_info(). (b) With the above change, we can safely call panic_print_sys_info() before kmsg_dump(), allowing to dump the extra information when using pstore or other kmsg dumpers. The additional messages from panic_print could overwrite the oldest messages when the buffer is full. The only reasonable solution is to use a large enough log buffer, hence we added an advice into the kernel parameters documentation about that. Link: https://lkml.kernel.org/r/20220214141308.841525-1-gpiccoli@igalia.com Signed-off-by: Guilherme G. Piccoli Acked-by: Baoquan He Reviewed-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Cc: Feng Tang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 3c3fb36d8d41..eb4dfb932c85 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -148,10 +148,13 @@ void nmi_panic(struct pt_regs *regs, const char *msg) } EXPORT_SYMBOL(nmi_panic); -static void panic_print_sys_info(void) +static void panic_print_sys_info(bool console_flush) { - if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) - console_flush_on_panic(CONSOLE_REPLAY_ALL); + if (console_flush) { + if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) + console_flush_on_panic(CONSOLE_REPLAY_ALL); + return; + } if (panic_print & PANIC_PRINT_ALL_CPU_BT) trigger_all_cpu_backtrace(); @@ -286,6 +289,8 @@ void panic(const char *fmt, ...) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + panic_print_sys_info(false); + kmsg_dump(KMSG_DUMP_PANIC); /* @@ -316,7 +321,7 @@ void panic(const char *fmt, ...) debug_locks_off(); console_flush_on_panic(CONSOLE_FLUSH_PENDING); - panic_print_sys_info(); + panic_print_sys_info(true); if (!panic_blink) panic_blink = no_blink; -- cgit v1.2.3 From 17581aa13680e1c38759cbb0ed32d04aa8849bea Mon Sep 17 00:00:00 2001 From: Aleksandr Nogikh Date: Wed, 23 Mar 2022 16:07:12 -0700 Subject: kcov: split ioctl handling into locked and unlocked parts Patch series "kcov: improve mmap processing", v3. Subsequent mmaps of the same kcov descriptor currently do not update the virtual memory of the task and yet return 0 (success). This is counter-intuitive and may lead to unexpected memory access errors. Also, this unnecessarily limits the functionality of kcov to only the simplest usage scenarios. Kcov instances are effectively forever attached to their first address spaces and it becomes impossible to e.g. reuse the same kcov handle in forked child processes without mmapping the memory first. This is exactly what we tried to do in syzkaller and inadvertently came upon this behavior. This patch series addresses the problem described above. 
This patch (of 3):

Currently all ioctls are de facto processed under a spinlock in order to
serialise them. This, however, prohibits the use of vmalloc and other
memory management functions in the implementations of those ioctls,
unnecessarily complicating any further changes to the code.

Let all ioctls first be processed inside the kcov_ioctl() function, which
executes the ones that are not compatible with a spinlock and then passes
control to kcov_ioctl_locked() for all the others. KCOV_REMOTE_ENABLE is
processed both in kcov_ioctl() and kcov_ioctl_locked(), as the steps are
easily separable.

Although it is still compatible with a spinlock, move KCOV_INIT_TRACE
handling to kcov_ioctl(), so that the changes in the next commit are
easier to follow.

Link: https://lkml.kernel.org/r/20220117153634.150357-1-nogikh@google.com
Link: https://lkml.kernel.org/r/20220117153634.150357-2-nogikh@google.com
Signed-off-by: Aleksandr Nogikh
Reviewed-by: Dmitry Vyukov
Reviewed-by: Andrey Konovalov
Cc: Marco Elver
Cc: Alexander Potapenko
Cc: Taras Madan
Cc: Sebastian Andrzej Siewior
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/kcov.c | 68 ++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 37 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kcov.c b/kernel/kcov.c
index 36ca640c4f8e..e1be7301500b 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -564,31 +564,12 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
 			     unsigned long arg)
 {
 	struct task_struct *t;
-	unsigned long size, unused;
+	unsigned long flags, unused;
 	int mode, i;
 	struct kcov_remote_arg *remote_arg;
 	struct kcov_remote *remote;
-	unsigned long flags;
 
 	switch (cmd) {
-	case KCOV_INIT_TRACE:
-		/*
-		 * Enable kcov in trace mode and setup buffer size.
-		 * Must happen before anything else.
-		 */
-		if (kcov->mode != KCOV_MODE_DISABLED)
-			return -EBUSY;
-		/*
-		 * Size must be at least 2 to hold current position and one PC.
-		 * Later we allocate size * sizeof(unsigned long) memory,
-		 * that must not overflow.
-		 */
-		size = arg;
-		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
-			return -EINVAL;
-		kcov->size = size;
-		kcov->mode = KCOV_MODE_INIT;
-		return 0;
 	case KCOV_ENABLE:
 		/*
 		 * Enable coverage for the current task.
@@ -692,9 +673,32 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 	struct kcov_remote_arg *remote_arg = NULL;
 	unsigned int remote_num_handles;
 	unsigned long remote_arg_size;
-	unsigned long flags;
+	unsigned long size, flags;
 
-	if (cmd == KCOV_REMOTE_ENABLE) {
+	kcov = filep->private_data;
+	switch (cmd) {
+	case KCOV_INIT_TRACE:
+		/*
+		 * Enable kcov in trace mode and setup buffer size.
+		 * Must happen before anything else.
+		 *
+		 * First check the size argument - it must be at least 2
+		 * to hold the current position and one PC. Later we allocate
+		 * size * sizeof(unsigned long) memory, that must not overflow.
+		 */
+		size = arg;
+		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+			return -EINVAL;
+		spin_lock_irqsave(&kcov->lock, flags);
+		if (kcov->mode != KCOV_MODE_DISABLED) {
+			spin_unlock_irqrestore(&kcov->lock, flags);
+			return -EBUSY;
+		}
+		kcov->size = size;
+		kcov->mode = KCOV_MODE_INIT;
+		spin_unlock_irqrestore(&kcov->lock, flags);
+		return 0;
+	case KCOV_REMOTE_ENABLE:
 		if (get_user(remote_num_handles, (unsigned __user *)(arg +
 				offsetof(struct kcov_remote_arg, num_handles))))
 			return -EFAULT;
@@ -710,16 +714,18 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 			return -EINVAL;
 		}
 		arg = (unsigned long)remote_arg;
+		fallthrough;
+	default:
+		/*
+		 * All other commands can be normally executed under a spin lock, so we
+		 * obtain and release it here in order to simplify kcov_ioctl_locked().
+		 */
+		spin_lock_irqsave(&kcov->lock, flags);
+		res = kcov_ioctl_locked(kcov, cmd, arg);
+		spin_unlock_irqrestore(&kcov->lock, flags);
+		kfree(remote_arg);
+		return res;
 	}
-
-	kcov = filep->private_data;
-	spin_lock_irqsave(&kcov->lock, flags);
-	res = kcov_ioctl_locked(kcov, cmd, arg);
-	spin_unlock_irqrestore(&kcov->lock, flags);
-
-	kfree(remote_arg);
-
-	return res;
 }
 
 static const struct file_operations kcov_fops = {
--
cgit v1.2.3


From b3d7fe86fbd06638c71dd851ba921adf50d912ce Mon Sep 17 00:00:00 2001
From: Aleksandr Nogikh
Date: Wed, 23 Mar 2022 16:07:15 -0700
Subject: kcov: properly handle subsequent mmap calls

Allocate the kcov buffer when entering KCOV_MODE_INIT in order to untie
the mmapping of a kcov instance from the actual coverage collection
process. Modify kcov_mmap() so that it can be reliably used any number
of times once KCOV_MODE_INIT has succeeded.

These changes to the user-facing interface of the tool only weaken the
preconditions, so all existing user space code should remain compatible
with the new version.
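With the relaxed preconditions, a flow like the following fragment
becomes possible (an illustrative sketch only, reusing fd and the KCOV_*
constants and headers from the earlier sketch, error handling again
omitted): the buffer exists from the KCOV_INIT_TRACE ioctl onward, so a
forked child can map it into its own address space and enable collection
for itself.

        /* Parent: allocate the coverage buffer once. */
        ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE);

        if (fork() == 0) {
                /* Child: a fresh mmap() now reliably maps the same buffer. */
                unsigned long *cover = mmap(NULL,
                                COVER_SIZE * sizeof(unsigned long),
                                PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

                ioctl(fd, KCOV_ENABLE, KCOV_TRACE_PC);
                /* ... run the workload, then inspect cover[] ... */
                ioctl(fd, KCOV_DISABLE, 0);
                _exit(0);
        }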
Link: https://lkml.kernel.org/r/20220117153634.150357-3-nogikh@google.com
Signed-off-by: Aleksandr Nogikh
Reviewed-by: Dmitry Vyukov
Reviewed-by: Andrey Konovalov
Cc: Alexander Potapenko
Cc: Marco Elver
Cc: Sebastian Andrzej Siewior
Cc: Taras Madan
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/kcov.c | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kcov.c b/kernel/kcov.c
index e1be7301500b..475524bd900a 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -459,37 +459,28 @@ void kcov_task_exit(struct task_struct *t)
 
 static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
 {
 	int res = 0;
-	void *area;
 	struct kcov *kcov = vma->vm_file->private_data;
 	unsigned long size, off;
 	struct page *page;
 	unsigned long flags;
 
-	area = vmalloc_user(vma->vm_end - vma->vm_start);
-	if (!area)
-		return -ENOMEM;
-
 	spin_lock_irqsave(&kcov->lock, flags);
 	size = kcov->size * sizeof(unsigned long);
-	if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
+	if (kcov->area == NULL || vma->vm_pgoff != 0 ||
 	    vma->vm_end - vma->vm_start != size) {
 		res = -EINVAL;
 		goto exit;
 	}
-	if (!kcov->area) {
-		kcov->area = area;
-		vma->vm_flags |= VM_DONTEXPAND;
-		spin_unlock_irqrestore(&kcov->lock, flags);
-		for (off = 0; off < size; off += PAGE_SIZE) {
-			page = vmalloc_to_page(kcov->area + off);
-			if (vm_insert_page(vma, vma->vm_start + off, page))
-				WARN_ONCE(1, "vm_insert_page() failed");
-		}
-		return 0;
+	spin_unlock_irqrestore(&kcov->lock, flags);
+	vma->vm_flags |= VM_DONTEXPAND;
+	for (off = 0; off < size; off += PAGE_SIZE) {
+		page = vmalloc_to_page(kcov->area + off);
+		if (vm_insert_page(vma, vma->vm_start + off, page))
+			WARN_ONCE(1, "vm_insert_page() failed");
 	}
+	return 0;
 exit:
 	spin_unlock_irqrestore(&kcov->lock, flags);
-	vfree(area);
 	return res;
 }
@@ -674,6 +665,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 	unsigned int remote_num_handles;
 	unsigned long remote_arg_size;
 	unsigned long size, flags;
+	void *area;
 
 	kcov = filep->private_data;
 	switch (cmd) {
@@ -683,17 +675,21 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 		 * Must happen before anything else.
 		 *
 		 * First check the size argument - it must be at least 2
-		 * to hold the current position and one PC. Later we allocate
-		 * size * sizeof(unsigned long) memory, that must not overflow.
+		 * to hold the current position and one PC.
 		 */
 		size = arg;
 		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
 			return -EINVAL;
+		area = vmalloc_user(size * sizeof(unsigned long));
+		if (area == NULL)
+			return -ENOMEM;
 		spin_lock_irqsave(&kcov->lock, flags);
 		if (kcov->mode != KCOV_MODE_DISABLED) {
 			spin_unlock_irqrestore(&kcov->lock, flags);
+			vfree(area);
 			return -EBUSY;
 		}
+		kcov->area = area;
 		kcov->size = size;
 		kcov->mode = KCOV_MODE_INIT;
 		spin_unlock_irqrestore(&kcov->lock, flags);
--
cgit v1.2.3


From 0cbcc92917c5de80f15c24d033566539ad696892 Mon Sep 17 00:00:00 2001
From: Miaohe Lin
Date: Wed, 23 Mar 2022 16:07:18 -0700
Subject: kernel/resource: fix kfree() of bootmem memory again

Since commit ebff7d8f270d ("mem hotunplug: fix kfree() of bootmem
memory"), we can get a resource allocated during boot via
alloc_resource(), and such a resource must be released using
free_resource(). However, many callers use kfree() directly, which
results in a kernel BUG. In order to fix this without fixing every call
site, just leak a couple of bytes in such a corner case.
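As background, a short illustrative helper (a sketch, not code from this
patch) for why PageSlab() is the right discriminator: kfree() locates
the owning slab cache through the page that backs the pointer, and an
allocation handed out by memblock before the slab allocator is up is
backed by plain pages without slab metadata, so passing it to kfree()
trips the slab sanity checks.

#include <linux/mm.h>
#include <linux/slab.h>

/* Illustrative only: mirrors the check used by the fixed free_resource(). */
static bool safe_to_kfree(const void *ptr)
{
        /*
         * Slab-backed memory has PG_slab set on its head page; early
         * memblock allocations do not, and kfree() would misread their
         * page state as slab metadata and BUG.
         */
        return ptr && PageSlab(virt_to_head_page(ptr));
}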
Link: https://lkml.kernel.org/r/20220217083619.19305-1-linmiaohe@huawei.com
Fixes: ebff7d8f270d ("mem hotunplug: fix kfree() of bootmem memory")
Signed-off-by: Miaohe Lin
Suggested-by: David Hildenbrand
Cc: Dan Williams
Cc: Alistair Popple
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/resource.c | 41 ++++++++---------------------------------
 1 file changed, 8 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 9c08d6e9eef2..34eaee179689 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -56,14 +56,6 @@ struct resource_constraint {
 
 static DEFINE_RWLOCK(resource_lock);
 
-/*
- * For memory hotplug, there is no way to free resource entries allocated
- * by boot mem after the system is up. So for reusing the resource entry
- * we need to remember the resource.
- */
-static struct resource *bootmem_resource_free;
-static DEFINE_SPINLOCK(bootmem_resource_lock);
-
 static struct resource *next_resource(struct resource *p)
 {
 	if (p->child)
@@ -160,36 +152,19 @@ __initcall(ioresources_init);
 
 static void free_resource(struct resource *res)
 {
-	if (!res)
-		return;
-
-	if (!PageSlab(virt_to_head_page(res))) {
-		spin_lock(&bootmem_resource_lock);
-		res->sibling = bootmem_resource_free;
-		bootmem_resource_free = res;
-		spin_unlock(&bootmem_resource_lock);
-	} else {
+	/**
+	 * If the resource was allocated using memblock early during boot
+	 * we'll leak it here: we can only return full pages back to the
+	 * buddy and trying to be smart and reusing them eventually in
+	 * alloc_resource() overcomplicates resource handling.
+	 */
+	if (res && PageSlab(virt_to_head_page(res)))
 		kfree(res);
-	}
 }
 
 static struct resource *alloc_resource(gfp_t flags)
 {
-	struct resource *res = NULL;
-
-	spin_lock(&bootmem_resource_lock);
-	if (bootmem_resource_free) {
-		res = bootmem_resource_free;
-		bootmem_resource_free = res->sibling;
-	}
-	spin_unlock(&bootmem_resource_lock);
-
-	if (res)
-		memset(res, 0, sizeof(struct resource));
-	else
-		res = kzalloc(sizeof(struct resource), flags);
-
-	return res;
+	return kzalloc(sizeof(struct resource), flags);
 }
 
 /* Return the conflict entry if you can't request it */
--
cgit v1.2.3