From 49ca6153208f6efc409c1deb82dd5bcbb519d7e1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 24 Aug 2021 09:39:31 +0200 Subject: bpf: Relicense disassembler as GPL-2.0-only OR BSD-2-Clause Some time ago we dual-licensed both libbpf and bpftool through commits 1bc38b8ff6cc ("libbpf: relicense libbpf as LGPL-2.1 OR BSD-2-Clause") and 907b22365115 ("tools: bpftool: dual license all files"). The latter missed the disasm.{c,h} which we pull in via kernel/bpf/ such that we have a single source for verifier as well as bpftool asm dumping, see also f4ac7e0b5cc8 ("bpf: move instruction printing into a separate file"). It is currently GPL-2.0-only and missed the conversion in 907b22365115, therefore relicense the two as GPL-2.0-only OR BSD-2-Clause as well. Spotted-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Thomas Graf Acked-by: Brendan Jackman Acked-by: Jakub Kicinski Acked-by: Jiri Olsa Acked-by: Simon Horman Acked-by: Martin KaFai Lau Acked-by: Xu Kuohai Acked-by: Edward Cree --- kernel/bpf/disasm.c | 2 +- kernel/bpf/disasm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index ca3cd9aaa6ce..7b4afb7d96db 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-only +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index e546b18d27da..a4b040793f44 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ -- cgit v1.2.3 From a61cb6017df0a9be072a35259e6e9ae7aa0ef6b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Sep 2021 14:40:00 +0200 Subject: dma-mapping: fix the kerneldoc for dma_map_sg_attrs Add the missing description for the nents parameter, and fix a trivial misalignment. Fixes: fffe3cc8c219 ("dma-mapping: allow map_sg() ops to return negative error codes") Reported-by: Stephen Rothwell Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 7ee5284bff58..06fec5547e7c 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -206,7 +206,8 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, /** * dma_map_sg_attrs - Map the given buffer for DMA * @dev: The device for which to perform the DMA operation - * @sg: The sg_table object describing the buffer + * @sg: The sg_table object describing the buffer + * @nents: Number of entries to map * @dir: DMA direction * @attrs: Optional DMA attributes for the map operation * -- cgit v1.2.3 From 2f1aaf3ea666b737ad717b3d88667225aca23149 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 9 Sep 2021 08:49:59 -0700 Subject: bpf, mm: Fix lockdep warning triggered by stack_map_get_build_id_offset() Currently the bpf selftest "get_stack_raw_tp" triggered the warning: [ 1411.304463] WARNING: CPU: 3 PID: 140 at include/linux/mmap_lock.h:164 find_vma+0x47/0xa0 [ 1411.304469] Modules linked in: bpf_testmod(O) [last unloaded: bpf_testmod] [ 1411.304476] CPU: 3 PID: 140 Comm: systemd-journal Tainted: G W O 5.14.0+ #53 [ 1411.304479] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 1411.304481] RIP: 0010:find_vma+0x47/0xa0 [ 1411.304484] Code: de 48 89 ef e8 ba f5 fe ff 48 85 c0 74 2e 48 83 c4 08 5b 5d c3 48 8d bf 28 01 00 00 be ff ff ff ff e8 2d 9f d8 00 85 c0 75 d4 <0f> 0b 48 89 de 48 8 [ 1411.304487] RSP: 0018:ffffabd440403db8 EFLAGS: 00010246 [ 1411.304490] RAX: 0000000000000000 RBX: 00007f00ad80a0e0 RCX: 0000000000000000 [ 1411.304492] RDX: 0000000000000001 RSI: ffffffff9776b144 RDI: ffffffff977e1b0e [ 1411.304494] RBP: ffff9cf5c2f50000 R08: ffff9cf5c3eb25d8 R09: 00000000fffffffe [ 1411.304496] R10: 0000000000000001 R11: 00000000ef974e19 R12: ffff9cf5c39ae0e0 [ 1411.304498] R13: 0000000000000000 R14: 0000000000000000 R15: ffff9cf5c39ae0e0 [ 1411.304501] FS: 00007f00ae754780(0000) GS:ffff9cf5fba00000(0000) knlGS:0000000000000000 [ 1411.304504] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1411.304506] CR2: 000000003e34343c CR3: 0000000103a98005 CR4: 0000000000370ee0 [ 1411.304508] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1411.304510] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1411.304512] Call Trace: [ 1411.304517] stack_map_get_build_id_offset+0x17c/0x260 [ 1411.304528] __bpf_get_stack+0x18f/0x230 [ 1411.304541] bpf_get_stack_raw_tp+0x5a/0x70 [ 1411.305752] RAX: 0000000000000000 RBX: 5541f689495641d7 RCX: 0000000000000000 [ 1411.305756] RDX: 0000000000000001 RSI: ffffffff9776b144 RDI: ffffffff977e1b0e [ 1411.305758] RBP: ffff9cf5c02b2f40 R08: ffff9cf5ca7606c0 R09: ffffcbd43ee02c04 [ 1411.306978] bpf_prog_32007c34f7726d29_bpf_prog1+0xaf/0xd9c [ 1411.307861] R10: 0000000000000001 R11: 0000000000000044 R12: ffff9cf5c2ef60e0 [ 1411.307865] R13: 0000000000000005 R14: 0000000000000000 R15: ffff9cf5c2ef6108 [ 1411.309074] bpf_trace_run2+0x8f/0x1a0 [ 1411.309891] FS: 00007ff485141700(0000) GS:ffff9cf5fae00000(0000) knlGS:0000000000000000 [ 1411.309896] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1411.311221] syscall_trace_enter.isra.20+0x161/0x1f0 [ 1411.311600] CR2: 00007ff48514d90e CR3: 0000000107114001 CR4: 0000000000370ef0 [ 1411.312291] do_syscall_64+0x15/0x80 [ 1411.312941] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1411.313803] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 1411.314223] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1411.315082] RIP: 0033:0x7f00ad80a0e0 [ 1411.315626] Call Trace: [ 1411.315632] stack_map_get_build_id_offset+0x17c/0x260 To reproduce, first build `test_progs` binary: make -C tools/testing/selftests/bpf -j60 and then run the binary at tools/testing/selftests/bpf directory: ./test_progs -t get_stack_raw_tp The warning is due to commit 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked() annotations to find_vma*()") which added mmap_assert_locked() in find_vma() function. The mmap_assert_locked() function asserts that mm->mmap_lock needs to be held. But this is not the case for bpf_get_stack() or bpf_get_stackid() helper (kernel/bpf/stackmap.c), which uses mmap_read_trylock_non_owner() instead. Since mm->mmap_lock is not held in bpf_get_stack[id]() use case, the above warning is emitted during test run. This patch fixed the issue by (1). using mmap_read_trylock() instead of mmap_read_trylock_non_owner() to satisfy lockdep checking in find_vma(), and (2). droping lockdep for mmap_lock right before the irq_work_queue(). The function mmap_read_trylock_non_owner() is also removed since after this patch nobody calls it any more. Fixes: 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked() annotations to find_vma*()") Suggested-by: Jason Gunthorpe Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Reviewed-by: Liam R. Howlett Cc: Luigi Rizzo Cc: Jason Gunthorpe Cc: linux-mm@kvack.org Link: https://lore.kernel.org/bpf/20210909155000.1610299-1-yhs@fb.com --- kernel/bpf/stackmap.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index e8eefdf8cf3e..09a3fd97d329 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -179,7 +179,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * with build_id. */ if (!user || !current || !current->mm || irq_work_busy || - !mmap_read_trylock_non_owner(current->mm)) { + !mmap_read_trylock(current->mm)) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; @@ -204,9 +204,15 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } if (!work) { - mmap_read_unlock_non_owner(current->mm); + mmap_read_unlock(current->mm); } else { work->mm = current->mm; + + /* The lock will be released once we're out of interrupt + * context. Tell lockdep that we've released it now so + * it doesn't complain that we forgot to release it. + */ + rwsem_release(¤t->mm->mmap_lock.dep_map, _RET_IP_); irq_work_queue(&work->irq_work); } } -- cgit v1.2.3 From 510e1a724ab1bf38150be2c1acabb303f98d0047 Mon Sep 17 00:00:00 2001 From: Hamza Mahfooz Date: Fri, 10 Sep 2021 19:53:37 -0400 Subject: dma-debug: prevent an error message from causing runtime problems For some drivers, that use the DMA API. This error message can be reached several millions of times per second, causing spam to the kernel's printk buffer and bringing the CPU usage up to 100% (so, it should be rate limited). However, since there is at least one driver that is in the mainline and suffers from the error condition, it is more useful to err_printk() here instead of just rate limiting the error message (in hopes that it will make it easier for other drivers that suffer from this issue to be spotted). Link: https://lkml.kernel.org/r/fd67fbac-64bf-f0ea-01e1-5938ccfab9d0@arm.com Reported-by: Jeremy Linton Signed-off-by: Hamza Mahfooz Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 6c90c69e5311..95445bd6eb72 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -567,7 +567,8 @@ static void add_dma_entry(struct dma_debug_entry *entry) pr_err("cacheline tracking ENOMEM, dma-debug disabled\n"); global_disable = true; } else if (rc == -EEXIST) { - pr_err("cacheline tracking EEXIST, overlapping mappings aren't supported\n"); + err_printk(entry->dev, entry, + "cacheline tracking EEXIST, overlapping mappings aren't supported\n"); } } -- cgit v1.2.3 From b94f9ac79a7395c2d6171cc753cc27942df0be73 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 9 Sep 2021 22:42:56 -0400 Subject: cgroup/cpuset: Change references of cpuset_mutex to cpuset_rwsem Since commit 1243dc518c9d ("cgroup/cpuset: Convert cpuset_mutex to percpu_rwsem"), cpuset_mutex has been replaced by cpuset_rwsem which is a percpu rwsem. However, the comments in kernel/cgroup/cpuset.c still reference cpuset_mutex which are now incorrect. Change all the references of cpuset_mutex to cpuset_rwsem. Fixes: 1243dc518c9d ("cgroup/cpuset: Convert cpuset_mutex to percpu_rwsem") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 56 ++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index df1ccf4558f8..2a9695ccb65f 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -311,17 +311,19 @@ static struct cpuset top_cpuset = { if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) /* - * There are two global locks guarding cpuset structures - cpuset_mutex and + * There are two global locks guarding cpuset structures - cpuset_rwsem and * callback_lock. We also require taking task_lock() when dereferencing a * task's cpuset pointer. See "The task_lock() exception", at the end of this - * comment. + * comment. The cpuset code uses only cpuset_rwsem write lock. Other + * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to + * prevent change to cpuset structures. * * A task must hold both locks to modify cpusets. If a task holds - * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it + * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it * is the only task able to also acquire callback_lock and be able to * modify cpusets. It can perform various checks on the cpuset structure * first, knowing nothing will change. It can also allocate memory while - * just holding cpuset_mutex. While it is performing these checks, various + * just holding cpuset_rwsem. While it is performing these checks, various * callback routines can briefly acquire callback_lock to query cpusets. * Once it is ready to make the changes, it takes callback_lock, blocking * everyone else. @@ -393,7 +395,7 @@ static inline bool is_in_v2_mode(void) * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. * - * Call with callback_lock or cpuset_mutex held. + * Call with callback_lock or cpuset_rwsem held. */ static void guarantee_online_cpus(struct task_struct *tsk, struct cpumask *pmask) @@ -435,7 +437,7 @@ out_unlock: * One way or another, we guarantee to return some non-empty subset * of node_states[N_MEMORY]. * - * Call with callback_lock or cpuset_mutex held. + * Call with callback_lock or cpuset_rwsem held. */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { @@ -447,7 +449,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) /* * update task's spread flag if cpuset's page/slab spread flag is set * - * Call with callback_lock or cpuset_mutex held. + * Call with callback_lock or cpuset_rwsem held. */ static void cpuset_update_task_spread_flag(struct cpuset *cs, struct task_struct *tsk) @@ -468,7 +470,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cpuset_mutex. + * are only set if the other's are set. Call holding cpuset_rwsem. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -577,7 +579,7 @@ static inline void free_cpuset(struct cpuset *cs) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * cpuset_mutex held. + * cpuset_rwsem held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -700,7 +702,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_unlock(); } -/* Must be called with cpuset_mutex held. */ +/* Must be called with cpuset_rwsem held. */ static inline int nr_cpusets(void) { /* jump label reference count + the top-level cpuset */ @@ -726,7 +728,7 @@ static inline int nr_cpusets(void) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * - * Must be called with cpuset_mutex held. + * Must be called with cpuset_rwsem held. * * The three key local variables below are: * cp - cpuset pointer, used (together with pos_css) to perform a @@ -1005,7 +1007,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_mutex held. Takes cpus_read_lock(). + * Call with cpuset_rwsem held. Takes cpus_read_lock(). */ static void rebuild_sched_domains_locked(void) { @@ -1078,7 +1080,7 @@ void rebuild_sched_domains(void) * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed * * Iterate through each task of @cs updating its cpus_allowed to the - * effective cpuset's. As this function is called with cpuset_mutex held, + * effective cpuset's. As this function is called with cpuset_rwsem held, * cpuset membership stays stable. */ static void update_tasks_cpumask(struct cpuset *cs) @@ -1347,7 +1349,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, * * On legacy hierarchy, effective_cpus will be the same with cpu_allowed. * - * Called with cpuset_mutex held + * Called with cpuset_rwsem held */ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) { @@ -1704,12 +1706,12 @@ static void *cpuset_being_rebound; * @cs: the cpuset in which each task's mems_allowed mask needs to be changed * * Iterate through each task of @cs updating its mems_allowed to the - * effective cpuset's. As this function is called with cpuset_mutex held, + * effective cpuset's. As this function is called with cpuset_rwsem held, * cpuset membership stays stable. */ static void update_tasks_nodemask(struct cpuset *cs) { - static nodemask_t newmems; /* protected by cpuset_mutex */ + static nodemask_t newmems; /* protected by cpuset_rwsem */ struct css_task_iter it; struct task_struct *task; @@ -1722,7 +1724,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold - * the global cpuset_mutex, we know that no other rebind effort + * the global cpuset_rwsem, we know that no other rebind effort * will be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. @@ -1768,7 +1770,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * * On legacy hierarchy, effective_mems will be the same with mems_allowed. * - * Called with cpuset_mutex held + * Called with cpuset_rwsem held */ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) { @@ -1821,7 +1823,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * - * Call with cpuset_mutex held. May take callback_lock during call. + * Call with cpuset_rwsem held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_lock, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. @@ -1911,7 +1913,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) * @cs: the cpuset in which each task's spread flags needs to be changed * * Iterate through each task of @cs updating its spread flags. As this - * function is called with cpuset_mutex held, cpuset membership stays + * function is called with cpuset_rwsem held, cpuset membership stays * stable. */ static void update_tasks_flags(struct cpuset *cs) @@ -1931,7 +1933,7 @@ static void update_tasks_flags(struct cpuset *cs) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared * - * Call with cpuset_mutex held. + * Call with cpuset_rwsem held. */ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, @@ -1980,7 +1982,7 @@ out: * cs: the cpuset to update * new_prs: new partition root state * - * Call with cpuset_mutex held. + * Call with cpuset_rwsem held. */ static int update_prstate(struct cpuset *cs, int new_prs) { @@ -2167,7 +2169,7 @@ static int fmeter_getrate(struct fmeter *fmp) static struct cpuset *cpuset_attach_old_cs; -/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ +/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */ static int cpuset_can_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; @@ -2219,7 +2221,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) } /* - * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach() + * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach() * but we can't allocate it dynamically there. Define it global and * allocate from cpuset_init(). */ @@ -2227,7 +2229,7 @@ static cpumask_var_t cpus_attach; static void cpuset_attach(struct cgroup_taskset *tset) { - /* static buf protected by cpuset_mutex */ + /* static buf protected by cpuset_rwsem */ static nodemask_t cpuset_attach_nodemask_to; struct task_struct *task; struct task_struct *leader; @@ -2417,7 +2419,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, * operation like this one can lead to a deadlock through kernfs * active_ref protection. Let's break the protection. Losing the * protection is okay as we check whether @cs is online after - * grabbing cpuset_mutex anyway. This only happens on the legacy + * grabbing cpuset_rwsem anyway. This only happens on the legacy * hierarchies. */ css_get(&cs->css); @@ -3672,7 +3674,7 @@ void __cpuset_memory_pressure_bump(void) * - Used for /proc//cpuset. * - No need to task_lock(tsk) on this tsk->cpuset reference, as it * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cpuset_mutex, keeping cpuset_attach() from changing it + * and we take cpuset_rwsem, keeping cpuset_attach() from changing it * anyway. */ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, -- cgit v1.2.3 From 0e6491b559704da720f6da09dd0a52c4df44c514 Mon Sep 17 00:00:00 2001 From: Bixuan Cui Date: Sat, 11 Sep 2021 08:55:57 +0800 Subject: bpf: Add oversize check before call kvcalloc() Commit 7661809d493b ("mm: don't allow oversized kvmalloc() calls") add the oversize check. When the allocation is larger than what kmalloc() supports, the following warning triggered: WARNING: CPU: 0 PID: 8408 at mm/util.c:597 kvmalloc_node+0x108/0x110 mm/util.c:597 Modules linked in: CPU: 0 PID: 8408 Comm: syz-executor221 Not tainted 5.14.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:kvmalloc_node+0x108/0x110 mm/util.c:597 Call Trace: kvmalloc include/linux/mm.h:806 [inline] kvmalloc_array include/linux/mm.h:824 [inline] kvcalloc include/linux/mm.h:829 [inline] check_btf_line kernel/bpf/verifier.c:9925 [inline] check_btf_info kernel/bpf/verifier.c:10049 [inline] bpf_check+0xd634/0x150d0 kernel/bpf/verifier.c:13759 bpf_prog_load kernel/bpf/syscall.c:2301 [inline] __sys_bpf+0x11181/0x126e0 kernel/bpf/syscall.c:4587 __do_sys_bpf kernel/bpf/syscall.c:4691 [inline] __se_sys_bpf kernel/bpf/syscall.c:4689 [inline] __x64_sys_bpf+0x78/0x90 kernel/bpf/syscall.c:4689 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Reported-by: syzbot+f3e749d4c662818ae439@syzkaller.appspotmail.com Signed-off-by: Bixuan Cui Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210911005557.45518-1-cuibixuan@huawei.com --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 047ac4b4703b..e76b55917905 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9912,6 +9912,8 @@ static int check_btf_line(struct bpf_verifier_env *env, nr_linfo = attr->line_info_cnt; if (!nr_linfo) return 0; + if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info)) + return -EINVAL; rec_size = attr->line_info_rec_size; if (rec_size < MIN_BPF_LINEINFO_SIZE || -- cgit v1.2.3 From 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 Sep 2021 01:07:57 +0200 Subject: bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used. Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") embedded per-socket cgroup information into sock->sk_cgrp_data and in order to save 8 bytes in struct sock made both mutually exclusive, that is, when cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2 falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp). The assumption made was "there is no reason to mix the two and this is in line with how legacy and v2 compatibility is handled" as stated in bd1060a1d671. However, with Kubernetes more widely supporting cgroups v2 as well nowadays, this assumption no longer holds, and the possibility of the v1/v2 mixed mode with the v2 root fallback being hit becomes a real security issue. Many of the cgroup v2 BPF programs are also used for policy enforcement, just to pick _one_ example, that is, to programmatically deny socket related system calls like connect(2) or bind(2). A v2 root fallback would implicitly cause a policy bypass for the affected Pods. In production environments, we have recently seen this case due to various circumstances: i) a different 3rd party agent and/or ii) a container runtime such as [0] in the user's environment configuring legacy cgroup v1 net_cls tags, which triggered implicitly mentioned root fallback. Another case is Kubernetes projects like kind [1] which create Kubernetes nodes in a container and also add cgroup namespaces to the mix, meaning programs which are attached to the cgroup v2 root of the cgroup namespace get attached to a non-root cgroup v2 path from init namespace point of view. And the latter's root is out of reach for agents on a kind Kubernetes node to configure. Meaning, any entity on the node setting cgroup v1 net_cls tag will trigger the bypass despite cgroup v2 BPF programs attached to the namespace root. Generally, this mutual exclusiveness does not hold anymore in today's user environments and makes cgroup v2 usage from BPF side fragile and unreliable. This fix adds proper struct cgroup pointer for the cgroup v2 case to struct sock_cgroup_data in order to address these issues; this implicitly also fixes the tradeoffs being made back then with regards to races and refcount leaks as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF programs always operate as expected. [0] https://github.com/nestybox/sysbox/ [1] https://kind.sigs.k8s.io/ Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Stanislav Fomichev Acked-by: Tejun Heo Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net --- kernel/cgroup/cgroup.c | 50 ++++++++++---------------------------------------- 1 file changed, 10 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 881ce1470beb..8afa8690d288 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6572,74 +6572,44 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) */ #ifdef CONFIG_SOCK_CGROUP_DATA -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) - -DEFINE_SPINLOCK(cgroup_sk_update_lock); -static bool cgroup_sk_alloc_disabled __read_mostly; - -void cgroup_sk_alloc_disable(void) -{ - if (cgroup_sk_alloc_disabled) - return; - pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); - cgroup_sk_alloc_disabled = true; -} - -#else - -#define cgroup_sk_alloc_disabled false - -#endif - void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) { - skcd->no_refcnt = 1; - return; - } - /* Don't associate the sock with unrelated interrupted task's cgroup. */ if (in_interrupt()) return; rcu_read_lock(); - while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { - skcd->val = (unsigned long)cset->dfl_cgrp; + skcd->cgroup = cset->dfl_cgrp; cgroup_bpf_get(cset->dfl_cgrp); break; } cpu_relax(); } - rcu_read_unlock(); } void cgroup_sk_clone(struct sock_cgroup_data *skcd) { - if (skcd->val) { - if (skcd->no_refcnt) - return; - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). - */ - cgroup_get(sock_cgroup_ptr(skcd)); - cgroup_bpf_get(sock_cgroup_ptr(skcd)); - } + struct cgroup *cgrp = sock_cgroup_ptr(skcd); + + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ + cgroup_get(cgrp); + cgroup_bpf_get(cgrp); } void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); - if (skcd->no_refcnt) - return; cgroup_bpf_put(cgrp); cgroup_put(cgrp); } -- cgit v1.2.3 From 356ed64991c6847a0c4f2e8fa3b1133f7a14f1fc Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 14 Sep 2021 10:33:51 +0800 Subject: bpf: Handle return value of BPF_PROG_TYPE_STRUCT_OPS prog Currently if a function ptr in struct_ops has a return value, its caller will get a random return value from it, because the return value of related BPF_PROG_TYPE_STRUCT_OPS prog is just dropped. So adding a new flag BPF_TRAMP_F_RET_FENTRY_RET to tell bpf trampoline to save and return the return value of struct_ops prog if ret_size of the function ptr is greater than 0. Also restricting the flag to be used alone. Fixes: 85d33df357b6 ("bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS") Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210914023351.3664499-1-houtao1@huawei.com --- kernel/bpf/bpf_struct_ops.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index d6731c32864e..9abcc33f02cf 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -368,6 +368,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, const struct btf_type *mtype, *ptype; struct bpf_prog *prog; u32 moff; + u32 flags; moff = btf_member_bit_offset(t, member) / 8; ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL); @@ -431,10 +432,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; + flags = st_ops->func_models[i].ret_size > 0 ? + BPF_TRAMP_F_RET_FENTRY_RET : 0; err = arch_prepare_bpf_trampoline(NULL, image, st_map->image + PAGE_SIZE, - &st_ops->func_models[i], 0, - tprogs, NULL); + &st_ops->func_models[i], + flags, tprogs, NULL); if (err < 0) goto reset_unlock; -- cgit v1.2.3 From 77e02cf57b6cff9919949defb7fd9b8ac16399a2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 14 Sep 2021 13:23:22 -0700 Subject: memblock: introduce saner 'memblock_free_ptr()' interface The boot-time allocation interface for memblock is a mess, with 'memblock_alloc()' returning a virtual pointer, but then you are supposed to free it with 'memblock_free()' that takes a _physical_ address. Not only is that all kinds of strange and illogical, but it actually causes bugs, when people then use it like a normal allocation function, and it fails spectacularly on a NULL pointer: https://lore.kernel.org/all/20210912140820.GD25450@xsang-OptiPlex-9020/ or just random memory corruption if the debug checks don't catch it: https://lore.kernel.org/all/61ab2d0c-3313-aaab-514c-e15b7aa054a0@suse.cz/ I really don't want to apply patches that treat the symptoms, when the fundamental cause is this horribly confusing interface. I started out looking at just automating a sane replacement sequence, but because of this mix or virtual and physical addresses, and because people have used the "__pa()" macro that can take either a regular kernel pointer, or just the raw "unsigned long" address, it's all quite messy. So this just introduces a new saner interface for freeing a virtual address that was allocated using 'memblock_alloc()', and that was kept as a regular kernel pointer. And then it converts a couple of users that are obvious and easy to test, including the 'xbc_nodes' case in lib/bootconfig.c that caused problems. Reported-by: kernel test robot Fixes: 40caa127f3c7 ("init: bootconfig: Remove all bootconfig data when the init memory is removed") Cc: Steven Rostedt Cc: Mike Rapoport Cc: Andrew Morton Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Vlastimil Babka Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 825277e1e742..a8d0a58deebc 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1166,9 +1166,9 @@ void __init setup_log_buf(int early) return; err_free_descs: - memblock_free(__pa(new_descs), new_descs_size); + memblock_free_ptr(new_descs, new_descs_size); err_free_log_buf: - memblock_free(__pa(new_log_buf), new_log_buf_len); + memblock_free_ptr(new_log_buf, new_log_buf_len); } static bool __read_mostly ignore_loglevel; -- cgit v1.2.3 From b89a05b21f46150ac10a962aa50109250b56b03b Mon Sep 17 00:00:00 2001 From: Baptiste Lepers Date: Mon, 6 Sep 2021 11:53:10 +1000 Subject: events: Reuse value read using READ_ONCE instead of re-reading it In perf_event_addr_filters_apply, the task associated with the event (event->ctx->task) is read using READ_ONCE at the beginning of the function, checked, and then re-read from event->ctx->task, voiding all guarantees of the checks. Reuse the value that was read by READ_ONCE to ensure the consistency of the task struct throughout the function. Fixes: 375637bc52495 ("perf/core: Introduce address range filtering") Signed-off-by: Baptiste Lepers Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210906015310.12802-1-baptiste.lepers@gmail.com --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 744e8726c5b2..0c000cb01eeb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10193,7 +10193,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) return; if (ifh->nr_file_filters) { - mm = get_task_mm(event->ctx->task); + mm = get_task_mm(task); if (!mm) goto restart; -- cgit v1.2.3 From 7687201e37fabf2b7cf2b828f7ca46bf30e2948f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Sep 2021 12:59:17 +0200 Subject: locking/rwbase: Properly match set_and_save_state() to restore_state() Noticed while looking at the readers race. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Acked-by: Will Deacon Link: https://lkml.kernel.org/r/20210909110203.828203010@infradead.org --- kernel/locking/rwbase_rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 4ba15088e640..542b0170e4f5 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -220,7 +220,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, for (; atomic_read(&rwb->readers);) { /* Optimized out for rwlocks */ if (rwbase_signal_pending_state(state, current)) { - __set_current_state(TASK_RUNNING); + rwbase_restore_current_state(); __rwbase_write_unlock(rwb, 0, flags); return -EINTR; } -- cgit v1.2.3 From 616be87eac9fa2ab2dca1069712f7236e50f3bf6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Sep 2021 12:59:18 +0200 Subject: locking/rwbase: Extract __rwbase_write_trylock() The code in rwbase_write_lock() is a little non-obvious vs the read+set 'trylock', extract the sequence into a helper function to clarify the code. This also provides a single site to fix fast-path ordering. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/YUCq3L+u44NDieEJ@hirez.programming.kicks-ass.net --- kernel/locking/rwbase_rt.c | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 542b0170e4f5..48fbbccdce3f 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -196,6 +196,19 @@ static inline void rwbase_write_downgrade(struct rwbase_rt *rwb) __rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags); } +static inline bool __rwbase_write_trylock(struct rwbase_rt *rwb) +{ + /* Can do without CAS because we're serialized by wait_lock. */ + lockdep_assert_held(&rwb->rtmutex.wait_lock); + + if (!atomic_read(&rwb->readers)) { + atomic_set(&rwb->readers, WRITER_BIAS); + return 1; + } + + return 0; +} + static int __sched rwbase_write_lock(struct rwbase_rt *rwb, unsigned int state) { @@ -210,34 +223,30 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, atomic_sub(READER_BIAS, &rwb->readers); raw_spin_lock_irqsave(&rtm->wait_lock, flags); - /* - * set_current_state() for rw_semaphore - * current_save_and_set_rtlock_wait_state() for rwlock - */ - rwbase_set_and_save_current_state(state); + if (__rwbase_write_trylock(rwb)) + goto out_unlock; - /* Block until all readers have left the critical section. */ - for (; atomic_read(&rwb->readers);) { + rwbase_set_and_save_current_state(state); + for (;;) { /* Optimized out for rwlocks */ if (rwbase_signal_pending_state(state, current)) { rwbase_restore_current_state(); __rwbase_write_unlock(rwb, 0, flags); return -EINTR; } + + if (__rwbase_write_trylock(rwb)) + break; + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + rwbase_schedule(); + raw_spin_lock_irqsave(&rtm->wait_lock, flags); - /* - * Schedule and wait for the readers to leave the critical - * section. The last reader leaving it wakes the waiter. - */ - if (atomic_read(&rwb->readers) != 0) - rwbase_schedule(); set_current_state(state); - raw_spin_lock_irqsave(&rtm->wait_lock, flags); } - - atomic_set(&rwb->readers, WRITER_BIAS); rwbase_restore_current_state(); + +out_unlock: raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); return 0; } @@ -253,8 +262,7 @@ static inline int rwbase_write_trylock(struct rwbase_rt *rwb) atomic_sub(READER_BIAS, &rwb->readers); raw_spin_lock_irqsave(&rtm->wait_lock, flags); - if (!atomic_read(&rwb->readers)) { - atomic_set(&rwb->readers, WRITER_BIAS); + if (__rwbase_write_trylock(rwb)) { raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); return 1; } -- cgit v1.2.3 From 81121524f1c798c9481bd7900450b72ee7ac2eef Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 9 Sep 2021 12:59:19 +0200 Subject: locking/rwbase: Take care of ordering guarantee for fastpath reader Readers of rwbase can lock and unlock without taking any inner lock, if that happens, we need the ordering provided by atomic operations to satisfy the ordering semantics of lock/unlock. Without that, considering the follow case: { X = 0 initially } CPU 0 CPU 1 ===== ===== rt_write_lock(); X = 1 rt_write_unlock(): atomic_add(READER_BIAS - WRITER_BIAS, ->readers); // ->readers is READER_BIAS. rt_read_lock(): if ((r = atomic_read(->readers)) < 0) // True atomic_try_cmpxchg(->readers, r, r + 1); // succeed. r1 = X; // r1 may be 0, because nothing prevent the reordering // of "X=1" and atomic_add() on CPU 1. Therefore audit every usage of atomic operations that may happen in a fast path, and add necessary barriers. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20210909110203.953991276@infradead.org --- kernel/locking/rwbase_rt.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 48fbbccdce3f..88191f6e252c 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -41,6 +41,12 @@ * The risk of writer starvation is there, but the pathological use cases * which trigger it are not necessarily the typical RT workloads. * + * Fast-path orderings: + * The lock/unlock of readers can run in fast paths: lock and unlock are only + * atomic ops, and there is no inner lock to provide ACQUIRE and RELEASE + * semantics of rwbase_rt. Atomic ops should thus provide _acquire() + * and _release() (or stronger). + * * Common code shared between RT rw_semaphore and rwlock */ @@ -53,6 +59,7 @@ static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb) * set. */ for (r = atomic_read(&rwb->readers); r < 0;) { + /* Fully-ordered if cmpxchg() succeeds, provides ACQUIRE */ if (likely(atomic_try_cmpxchg(&rwb->readers, &r, r + 1))) return 1; } @@ -162,6 +169,8 @@ static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb, /* * rwb->readers can only hit 0 when a writer is waiting for the * active readers to leave the critical section. + * + * dec_and_test() is fully ordered, provides RELEASE. */ if (unlikely(atomic_dec_and_test(&rwb->readers))) __rwbase_read_unlock(rwb, state); @@ -172,7 +181,11 @@ static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias, { struct rt_mutex_base *rtm = &rwb->rtmutex; - atomic_add(READER_BIAS - bias, &rwb->readers); + /* + * _release() is needed in case that reader is in fast path, pairing + * with atomic_try_cmpxchg() in rwbase_read_trylock(), provides RELEASE + */ + (void)atomic_add_return_release(READER_BIAS - bias, &rwb->readers); raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); rwbase_rtmutex_unlock(rtm); } @@ -201,7 +214,11 @@ static inline bool __rwbase_write_trylock(struct rwbase_rt *rwb) /* Can do without CAS because we're serialized by wait_lock. */ lockdep_assert_held(&rwb->rtmutex.wait_lock); - if (!atomic_read(&rwb->readers)) { + /* + * _acquire is needed in case the reader is in the fast path, pairing + * with rwbase_read_unlock(), provides ACQUIRE. + */ + if (!atomic_read_acquire(&rwb->readers)) { atomic_set(&rwb->readers, WRITER_BIAS); return 1; } -- cgit v1.2.3 From 20c36ce2164f1774b487d443ece99b754bc6ad43 Mon Sep 17 00:00:00 2001 From: Bixuan Cui Date: Thu, 16 Sep 2021 10:52:03 +0800 Subject: irqdomain: Change the type of 'size' in __irq_domain_add() to be consistent The 'size' is used in struct_size(domain, revmap, size) and its input parameter type is 'size_t'(unsigned int). Changing the size to 'unsigned int' to make the type consistent. Signed-off-by: Bixuan Cui Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210916025203.44841-1-cuibixuan@huawei.com --- kernel/irq/irqdomain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 62be16135e7c..bfa289ed57ab 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -136,7 +136,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); * Allocates and initializes an irq_domain structure. * Returns pointer to IRQ domain, or NULL on failure. */ -struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, +struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size, irq_hw_number_t hwirq_max, int direct_max, const struct irq_domain_ops *ops, void *host_data) -- cgit v1.2.3 From 8646e53633f314e4d746a988240d3b951a92f94a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Sep 2021 13:30:26 -0700 Subject: KVM: rseq: Update rseq when processing NOTIFY_RESUME on xfer to KVM guest Invoke rseq's NOTIFY_RESUME handler when processing the flag prior to transferring to a KVM guest, which is roughly equivalent to an exit to userspace and processes many of the same pending actions. While the task cannot be in an rseq critical section as the KVM path is reachable only by via ioctl(KVM_RUN), the side effects that apply to rseq outside of a critical section still apply, e.g. the current CPU needs to be updated if the task is migrated. Clearing TIF_NOTIFY_RESUME without informing rseq can lead to segfaults and other badness in userspace VMMs that use rseq in combination with KVM, e.g. due to the CPU ID being stale after task migration. Fixes: 72c3c0fe54a3 ("x86/kvm: Use generic xfer to guest work function") Reported-by: Peter Foley Bisected-by: Doug Evans Acked-by: Mathieu Desnoyers Cc: Shakeel Butt Cc: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210901203030.1292304-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- kernel/entry/kvm.c | 4 +++- kernel/rseq.c | 14 +++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index 49972ee99aff..049fd06b4c3d 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -19,8 +19,10 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) if (ti_work & _TIF_NEED_RESCHED) schedule(); - if (ti_work & _TIF_NOTIFY_RESUME) + if (ti_work & _TIF_NOTIFY_RESUME) { tracehook_notify_resume(NULL); + rseq_handle_notify_resume(NULL, NULL); + } ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); if (ret) diff --git a/kernel/rseq.c b/kernel/rseq.c index 35f7bd0fced0..6d45ac3dae7f 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -282,9 +282,17 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) if (unlikely(t->flags & PF_EXITING)) return; - ret = rseq_ip_fixup(regs); - if (unlikely(ret < 0)) - goto error; + + /* + * regs is NULL if and only if the caller is in a syscall path. Skip + * fixup and leave rseq_cs as is so that rseq_sycall() will detect and + * kill a misbehaving userspace on debug kernels. + */ + if (regs) { + ret = rseq_ip_fixup(regs); + if (unlikely(ret < 0)) + goto error; + } if (unlikely(rseq_update_cpu_id(t))) goto error; return; -- cgit v1.2.3 From a68de80f61f6af397bc06fb391ff2e571c9c4d80 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Sep 2021 13:30:27 -0700 Subject: entry: rseq: Call rseq_handle_notify_resume() in tracehook_notify_resume() Invoke rseq_handle_notify_resume() from tracehook_notify_resume() now that the two function are always called back-to-back by architectures that have rseq. The rseq helper is stubbed out for architectures that don't support rseq, i.e. this is a nop across the board. Note, tracehook_notify_resume() is horribly named and arguably does not belong in tracehook.h as literally every line of code in it has nothing to do with tracing. But, that's been true since commit a42c6ded827d ("move key_repace_session_keyring() into tracehook_notify_resume()") first usurped tracehook_notify_resume() back in 2012. Punt cleaning that mess up to future patches. No functional change intended. Acked-by: Mathieu Desnoyers Signed-off-by: Sean Christopherson Message-Id: <20210901203030.1292304-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- kernel/entry/common.c | 4 +--- kernel/entry/kvm.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index bf16395b9e13..d5a61d565ad5 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -171,10 +171,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) handle_signal_work(regs, ti_work); - if (ti_work & _TIF_NOTIFY_RESUME) { + if (ti_work & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } /* Architecture specific TIF work */ arch_exit_to_user_mode_work(regs, ti_work); diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index 049fd06b4c3d..49972ee99aff 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -19,10 +19,8 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) if (ti_work & _TIF_NEED_RESCHED) schedule(); - if (ti_work & _TIF_NOTIFY_RESUME) { + if (ti_work & _TIF_NOTIFY_RESUME) tracehook_notify_resume(NULL); - rseq_handle_notify_resume(NULL, NULL); - } ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); if (ret) -- cgit v1.2.3 From 8cd9da85d2bd87ce889043e7b1735723dd10eb89 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 13 Sep 2021 16:53:32 +0200 Subject: posix-cpu-timers: Prevent spuriously armed 0-value itimer Resetting/stopping an itimer eventually leads to it being reprogrammed with an actual "0" value. As a result the itimer expires on the next tick, triggering an unexpected signal. To fix this, make sure that struct signal_struct::it[CPUCLOCK_PROF/VIRT]::expires is set to 0 when setitimer() passes a 0 it_value, indicating that the timer must stop. Fixes: 406dd42bd1ba ("posix-cpu-timers: Force next expiration recalc after itimer reset") Reported-by: Victor Stinner Reported-by: Chris Hixon Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210913145332.232023-1-frederic@kernel.org --- kernel/time/posix-cpu-timers.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index ee736861b18f..643d412ac623 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1404,7 +1404,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, } } - *newval += now; + if (*newval) + *newval += now; } /* -- cgit v1.2.3 From 5afedf670caf30a2b5a52da96eb7eac7dee6a9c9 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Thu, 23 Sep 2021 21:49:21 +0800 Subject: blktrace: Fix uaf in blk_trace access after removing by sysfs There is an use-after-free problem triggered by following process: P1(sda) P2(sdb) echo 0 > /sys/block/sdb/trace/enable blk_trace_remove_queue synchronize_rcu blk_trace_free relay_close rcu_read_lock __blk_add_trace trace_note_tsk (Iterate running_trace_list) relay_close_buf relay_destroy_buf kfree(buf) trace_note(sdb's bt) relay_reserve buf->offset <- nullptr deference (use-after-free) !!! rcu_read_unlock [ 502.714379] BUG: kernel NULL pointer dereference, address: 0000000000000010 [ 502.715260] #PF: supervisor read access in kernel mode [ 502.715903] #PF: error_code(0x0000) - not-present page [ 502.716546] PGD 103984067 P4D 103984067 PUD 17592b067 PMD 0 [ 502.717252] Oops: 0000 [#1] SMP [ 502.720308] RIP: 0010:trace_note.isra.0+0x86/0x360 [ 502.732872] Call Trace: [ 502.733193] __blk_add_trace.cold+0x137/0x1a3 [ 502.733734] blk_add_trace_rq+0x7b/0xd0 [ 502.734207] blk_add_trace_rq_issue+0x54/0xa0 [ 502.734755] blk_mq_start_request+0xde/0x1b0 [ 502.735287] scsi_queue_rq+0x528/0x1140 ... [ 502.742704] sg_new_write.isra.0+0x16e/0x3e0 [ 502.747501] sg_ioctl+0x466/0x1100 Reproduce method: ioctl(/dev/sda, BLKTRACESETUP, blk_user_trace_setup[buf_size=127]) ioctl(/dev/sda, BLKTRACESTART) ioctl(/dev/sdb, BLKTRACESETUP, blk_user_trace_setup[buf_size=127]) ioctl(/dev/sdb, BLKTRACESTART) echo 0 > /sys/block/sdb/trace/enable & // Add delay(mdelay/msleep) before kernel enters blk_trace_free() ioctl$SG_IO(/dev/sda, SG_IO, ...) // Enters trace_note_tsk() after blk_trace_free() returned // Use mdelay in rcu region rather than msleep(which may schedule out) Remove blk_trace from running_list before calling blk_trace_free() by sysfs if blk_trace is at Blktrace_running state. Fixes: c71a896154119f ("blktrace: add ftrace plugin") Signed-off-by: Zhihao Cheng Link: https://lore.kernel.org/r/20210923134921.109194-1-chengzhihao1@huawei.com Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c221e4c3f625..fa91f398f28b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1605,6 +1605,14 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt == NULL) return -EINVAL; + if (bt->trace_state == Blktrace_running) { + bt->trace_state = Blktrace_stopped; + spin_lock_irq(&running_trace_lock); + list_del_init(&bt->running_list); + spin_unlock_irq(&running_trace_lock); + relay_flush(bt->rchan); + } + put_probe_ref(); synchronize_rcu(); blk_trace_free(bt); -- cgit v1.2.3 From 8a98ae12fbefdb583a7696de719a1d57e5e940a2 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 22 Sep 2021 12:11:52 +0100 Subject: bpf: Exempt CAP_BPF from checks against bpf_jit_limit When introducing CAP_BPF, bpf_jit_charge_modmem() was not changed to treat programs with CAP_BPF as privileged for the purpose of JIT memory allocation. This means that a program without CAP_BPF can block a program with CAP_BPF from loading a program. Fix this by checking bpf_capable() in bpf_jit_charge_modmem(). Fixes: 2c78ee898d8f ("bpf: Implement CAP_BPF") Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210922111153.19843-1-lmb@cloudflare.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9f4636d021b1..d6b7dfdd8066 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -827,7 +827,7 @@ int bpf_jit_charge_modmem(u32 pages) { if (atomic_long_add_return(pages, &bpf_jit_current) > (bpf_jit_limit >> PAGE_SHIFT)) { - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { atomic_long_sub(pages, &bpf_jit_current); return -EPERM; } -- cgit v1.2.3 From 78cc316e9583067884eb8bd154301dc1e9ee945c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 27 Sep 2021 14:39:20 +0200 Subject: bpf, cgroup: Assign cgroup in cgroup_sk_alloc when called from interrupt If cgroup_sk_alloc() is called from interrupt context, then just assign the root cgroup to skcd->cgroup. Prior to commit 8520e224f547 ("bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode") we would just return, and later on in sock_cgroup_ptr(), we were NULL-testing the cgroup in fast-path, and iff indeed NULL returning the root cgroup (v ?: &cgrp_dfl_root.cgrp). Rather than re-adding the NULL-test to the fast-path we can just assign it once from cgroup_sk_alloc() given v1/v2 handling has been simplified. The migration from NULL test with returning &cgrp_dfl_root.cgrp to assigning &cgrp_dfl_root.cgrp directly does /not/ change behavior for callers of sock_cgroup_ptr(). syzkaller was able to trigger a splat in the legacy netrom code base, where the RX handler in nr_rx_frame() calls nr_make_new() which calls sk_alloc() and therefore cgroup_sk_alloc() with in_interrupt() condition. Thus the NULL skcd->cgroup, where it trips over on cgroup_sk_free() side given it expects a non-NULL object. There are a few other candidates aside from netrom which have similar pattern where in their accept-like implementation, they just call to sk_alloc() and thus cgroup_sk_alloc() instead of sk_clone_lock() with the corresponding cgroup_sk_clone() which then inherits the cgroup from the parent socket. None of them are related to core protocols where BPF cgroup programs are running from. However, in future, they should follow to implement a similar inheritance mechanism. Additionally, with a !CONFIG_CGROUP_NET_PRIO and !CONFIG_CGROUP_NET_CLASSID configuration, the same issue was exposed also prior to 8520e224f547 due to commit e876ecc67db8 ("cgroup: memcg: net: do not associate sock with unrelated cgroup") which added the early in_interrupt() return back then. Fixes: 8520e224f547 ("bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode") Fixes: e876ecc67db8 ("cgroup: memcg: net: do not associate sock with unrelated cgroup") Reported-by: syzbot+df709157a4ecaf192b03@syzkaller.appspotmail.com Reported-by: syzbot+533f389d4026d86a2a95@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Tested-by: syzbot+df709157a4ecaf192b03@syzkaller.appspotmail.com Tested-by: syzbot+533f389d4026d86a2a95@syzkaller.appspotmail.com Acked-by: Tejun Heo Link: https://lore.kernel.org/bpf/20210927123921.21535-1-daniel@iogearbox.net --- kernel/cgroup/cgroup.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8afa8690d288..570b0c97392a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6574,22 +6574,29 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - /* Don't associate the sock with unrelated interrupted task's cgroup. */ - if (in_interrupt()) - return; + struct cgroup *cgroup; rcu_read_lock(); + /* Don't associate the sock with unrelated interrupted task's cgroup. */ + if (in_interrupt()) { + cgroup = &cgrp_dfl_root.cgrp; + cgroup_get(cgroup); + goto out; + } + while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { - skcd->cgroup = cset->dfl_cgrp; - cgroup_bpf_get(cset->dfl_cgrp); + cgroup = cset->dfl_cgrp; break; } cpu_relax(); } +out: + skcd->cgroup = cgroup; + cgroup_bpf_get(cgroup); rcu_read_unlock(); } -- cgit v1.2.3 From 0d67e332e6df72f43eaa21228daa3a79e23093f3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 27 Sep 2021 14:15:10 +0200 Subject: module: fix clang CFI with MODULE_UNLOAD=n When CONFIG_MODULE_UNLOAD is disabled, the module->exit member is not defined, causing a build failure: kernel/module.c:4493:8: error: no member named 'exit' in 'struct module' mod->exit = *exit; add an #ifdef block around this. Fixes: cf68fffb66d6 ("add support for Clang CFI") Acked-by: Kees Cook Reviewed-by: Sami Tolvanen Reviewed-by: Miroslav Benes Signed-off-by: Arnd Bergmann Signed-off-by: Jessica Yu --- kernel/module.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 40ec9a030eec..5c26a76e800b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4489,8 +4489,10 @@ static void cfi_init(struct module *mod) /* Fix init/exit functions to point to the CFI jump table */ if (init) mod->init = *init; +#ifdef CONFIG_MODULE_UNLOAD if (exit) mod->exit = *exit; +#endif cfi_module_add(mod, module_addr_min); #endif -- cgit v1.2.3 From 30e29a9a2bc6a4888335a6ede968b75cd329657a Mon Sep 17 00:00:00 2001 From: Tatsuhiko Yasumatsu Date: Thu, 30 Sep 2021 22:55:45 +0900 Subject: bpf: Fix integer overflow in prealloc_elems_and_freelist() In prealloc_elems_and_freelist(), the multiplication to calculate the size passed to bpf_map_area_alloc() could lead to an integer overflow. As a result, out-of-bounds write could occur in pcpu_freelist_populate() as reported by KASAN: [...] [ 16.968613] BUG: KASAN: slab-out-of-bounds in pcpu_freelist_populate+0xd9/0x100 [ 16.969408] Write of size 8 at addr ffff888104fc6ea0 by task crash/78 [ 16.970038] [ 16.970195] CPU: 0 PID: 78 Comm: crash Not tainted 5.15.0-rc2+ #1 [ 16.970878] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 [ 16.972026] Call Trace: [ 16.972306] dump_stack_lvl+0x34/0x44 [ 16.972687] print_address_description.constprop.0+0x21/0x140 [ 16.973297] ? pcpu_freelist_populate+0xd9/0x100 [ 16.973777] ? pcpu_freelist_populate+0xd9/0x100 [ 16.974257] kasan_report.cold+0x7f/0x11b [ 16.974681] ? pcpu_freelist_populate+0xd9/0x100 [ 16.975190] pcpu_freelist_populate+0xd9/0x100 [ 16.975669] stack_map_alloc+0x209/0x2a0 [ 16.976106] __sys_bpf+0xd83/0x2ce0 [...] The possibility of this overflow was originally discussed in [0], but was overlooked. Fix the integer overflow by changing elem_size to u64 from u32. [0] https://lore.kernel.org/bpf/728b238e-a481-eb50-98e9-b0f430ab01e7@gmail.com/ Fixes: 557c0c6e7df8 ("bpf: convert stackmap to pre-allocation") Signed-off-by: Tatsuhiko Yasumatsu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210930135545.173698-1-th.yasumatsu@gmail.com --- kernel/bpf/stackmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 09a3fd97d329..6e75bbee39f0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -63,7 +63,8 @@ static inline int stack_map_data_size(struct bpf_map *map) static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { - u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; + u64 elem_size = sizeof(struct stack_map_bucket) + + (u64)smap->map.value_size; int err; smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries, -- cgit v1.2.3 From f792565326825ed806626da50c6f9a928f1079c1 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 29 Sep 2021 12:43:13 -0700 Subject: perf/core: fix userpage->time_enabled of inactive events Users of rdpmc rely on the mmapped user page to calculate accurate time_enabled. Currently, userpage->time_enabled is only updated when the event is added to the pmu. As a result, inactive event (due to counter multiplexing) does not have accurate userpage->time_enabled. This can be reproduced with something like: /* open 20 task perf_event "cycles", to create multiplexing */ fd = perf_event_open(); /* open task perf_event "cycles" */ userpage = mmap(fd); /* use mmap and rdmpc */ while (true) { time_enabled_mmap = xxx; /* use logic in perf_event_mmap_page */ time_enabled_read = read(fd).time_enabled; if (time_enabled_mmap > time_enabled_read) BUG(); } Fix this by updating userpage for inactive events in merge_sched_in. Suggested-by: Peter Zijlstra (Intel) Reported-and-tested-by: Lucian Grijincu Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210929194313.2398474-1-songliubraving@fb.com --- kernel/events/core.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0c000cb01eeb..f23ca260307f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3707,6 +3707,29 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, return 0; } +static inline bool event_update_userpage(struct perf_event *event) +{ + if (likely(!atomic_read(&event->mmap_count))) + return false; + + perf_event_update_time(event); + perf_set_shadow_time(event, event->ctx); + perf_event_update_userpage(event); + + return true; +} + +static inline void group_update_userpage(struct perf_event *group_event) +{ + struct perf_event *event; + + if (!event_update_userpage(group_event)) + return; + + for_each_sibling_event(event, group_event) + event_update_userpage(event); +} + static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; @@ -3725,14 +3748,15 @@ static int merge_sched_in(struct perf_event *event, void *data) } if (event->state == PERF_EVENT_STATE_INACTIVE) { + *can_add_hw = 0; if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + } else { + ctx->rotate_necessary = 1; + perf_mux_hrtimer_restart(cpuctx); + group_update_userpage(event); } - - *can_add_hw = 0; - ctx->rotate_necessary = 1; - perf_mux_hrtimer_restart(cpuctx); } return 0; @@ -6324,6 +6348,8 @@ accounting: ring_buffer_attach(event, rb); + perf_event_update_time(event); + perf_set_shadow_time(event, event->ctx); perf_event_init_userpage(event); perf_event_update_userpage(event); } else { -- cgit v1.2.3 From 2630cde26711dab0d0b56a8be1616475be646d13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 17 Sep 2021 17:30:37 +0200 Subject: sched/fair: Add ancestors of unthrottled undecayed cfs_rq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit a7b359fc6a37 ("sched/fair: Correctly insert cfs_rq's to list on unthrottle") we add cfs_rqs with no runnable tasks but not fully decayed into the load (leaf) list. We may ignore adding some ancestors and therefore breaking tmp_alone_branch invariant. This broke LTP test cfs_bandwidth01 and it was partially fixed in commit fdaba61ef8a2 ("sched/fair: Ensure that the CFS parent is added after unthrottling"). I noticed the named test still fails even with the fix (but with low probability, 1 in ~1000 executions of the test). The reason is when bailing out of unthrottle_cfs_rq early, we may miss adding ancestors of the unthrottled cfs_rq, thus, not joining tmp_alone_branch properly. Fix this by adding ancestors if we notice the unthrottled cfs_rq was added to the load list. Fixes: a7b359fc6a37 ("sched/fair: Correctly insert cfs_rq's to list on unthrottle") Signed-off-by: Michal Koutný Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Odin Ugedal Link: https://lore.kernel.org/r/20210917153037.11176-1-mkoutny@suse.com --- kernel/sched/fair.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ff69f245b939..f6a05d9b5443 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4936,8 +4936,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); - if (!cfs_rq->load.weight) + /* Nothing to run but something to decay (on_list)? Complete the branch */ + if (!cfs_rq->load.weight) { + if (cfs_rq->on_list) + goto unthrottle_throttle; return; + } task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; -- cgit v1.2.3 From 703066188f63d66cc6b9d678e5b5ef1213c5938e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 27 Sep 2021 12:46:35 +0100 Subject: sched/fair: Null terminate buffer when updating tunable_scaling This patch null-terminates the temporary buffer in sched_scaling_write() so kstrtouint() does not return failure and checks the value is valid. Before: $ cat /sys/kernel/debug/sched/tunable_scaling 1 $ echo 0 > /sys/kernel/debug/sched/tunable_scaling -bash: echo: write error: Invalid argument $ cat /sys/kernel/debug/sched/tunable_scaling 1 After: $ cat /sys/kernel/debug/sched/tunable_scaling 1 $ echo 0 > /sys/kernel/debug/sched/tunable_scaling $ cat /sys/kernel/debug/sched/tunable_scaling 0 $ echo 3 > /sys/kernel/debug/sched/tunable_scaling -bash: echo: write error: Invalid argument Fixes: 8a99b6833c88 ("sched: Move SCHED_DEBUG sysctl to debugfs") Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Link: https://lore.kernel.org/r/20210927114635.GH3959@techsingularity.net --- kernel/sched/debug.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 49716228efb4..17a653b67006 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -173,16 +173,22 @@ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[16]; + unsigned int scaling; if (cnt > 15) cnt = 15; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; + buf[cnt] = '\0'; - if (kstrtouint(buf, 10, &sysctl_sched_tunable_scaling)) + if (kstrtouint(buf, 10, &scaling)) return -EINVAL; + if (scaling >= SCHED_TUNABLESCALING_END) + return -EINVAL; + + sysctl_sched_tunable_scaling = scaling; if (sched_update_scaling()) return -EINVAL; -- cgit v1.2.3 From 424b650f35c77defbb3cbd6e5221d3697af42250 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Wed, 22 Sep 2021 10:51:22 +0800 Subject: tracing: Fix missing osnoise tracer on max_latency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The compiler warns when the data are actually unused: kernel/trace/trace.c:1712:13: error: ‘trace_create_maxlat_file’ defined but not used [-Werror=unused-function] 1712 | static void trace_create_maxlat_file(struct trace_array *tr, | ^~~~~~~~~~~~~~~~~~~~~~~~ [Why] CONFIG_HWLAT_TRACER=n, CONFIG_TRACER_MAX_TRACE=n, CONFIG_OSNOISE_TRACER=y gcc report warns. [How] Now trace_create_maxlat_file will only take effect when CONFIG_HWLAT_TRACER=y or CONFIG_TRACER_MAX_TRACE=y. In fact, after adding osnoise trace, it also needs to take effect. Link: https://lore.kernel.org/all/c1d9e328-ad7c-920b-6c24-9e1598a6421c@infradead.org/ Link: https://lkml.kernel.org/r/20210922025122.3268022-1-liu.yun@linux.dev Fixes: bce29ac9ce0b ("trace: Add osnoise tracer") Cc: Daniel Bristot de Oliveira Suggested-by: Steven Rostedt Reviewed-by: Daniel Bristot de Oliveira Tested-by: Randy Dunlap # build-tested Signed-off-by: Jackie Liu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7896d30d90f7..bc677cd64224 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1744,16 +1744,15 @@ void latency_fsnotify(struct trace_array *tr) irq_work_queue(&tr->fsnotify_irqwork); } -/* - * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ - * defined(CONFIG_FSNOTIFY) - */ -#else +#elif defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ + || defined(CONFIG_OSNOISE_TRACER) #define trace_create_maxlat_file(tr, d_tracer) \ trace_create_file("tracing_max_latency", 0644, d_tracer, \ &tr->max_latency, &tracing_max_lat_fops) +#else +#define trace_create_maxlat_file(tr, d_tracer) do { } while (0) #endif #ifdef CONFIG_TRACER_MAX_TRACE @@ -9473,9 +9472,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) create_trace_options_dir(tr); -#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) trace_create_maxlat_file(tr, d_tracer); -#endif if (ftrace_create_function_files(tr, d_tracer)) MEM_FAIL(1, "Could not allocate function filter files"); -- cgit v1.2.3 From 6675880fc4b7c5137a2640b0725505c21b1ac525 Mon Sep 17 00:00:00 2001 From: Vamshi K Sthambamkadi Date: Fri, 8 Oct 2021 12:48:06 +0530 Subject: tracing: Fix memory leak in eprobe_register() kmemleak report: unreferenced object 0xffff900a70ec7ec0 (size 32): comm "ftracetest", pid 2770, jiffies 4295042510 (age 311.464s) hex dump (first 32 bytes): c8 31 23 45 0a 90 ff ff 40 85 c7 6e 0a 90 ff ff .1#E....@..n.... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<000000009d3751fd>] kmem_cache_alloc_trace+0x2a2/0x440 [<0000000088b8124b>] eprobe_register+0x1e3/0x350 [<000000002a9a0517>] __ftrace_event_enable_disable+0x7c/0x240 [<0000000019109321>] event_enable_write+0x93/0xe0 [<000000007d85b320>] vfs_write+0xb9/0x260 [<00000000b94c5e41>] ksys_write+0x67/0xe0 [<000000005a08c81d>] __x64_sys_write+0x1a/0x20 [<00000000240bf576>] do_syscall_64+0x3b/0xc0 [<0000000043d5d9f6>] entry_SYSCALL_64_after_hwframe+0x44/0xae unreferenced object 0xffff900a56bbf280 (size 128): comm "ftracetest", pid 2770, jiffies 4295042510 (age 311.464s) hex dump (first 32 bytes): ff ff ff ff ff ff ff ff 00 00 00 00 01 00 00 00 ................ 80 69 3b b2 ff ff ff ff 20 69 3b b2 ff ff ff ff .i;..... i;..... backtrace: [<000000009d3751fd>] kmem_cache_alloc_trace+0x2a2/0x440 [<00000000c4e90fad>] eprobe_register+0x1fc/0x350 [<000000002a9a0517>] __ftrace_event_enable_disable+0x7c/0x240 [<0000000019109321>] event_enable_write+0x93/0xe0 [<000000007d85b320>] vfs_write+0xb9/0x260 [<00000000b94c5e41>] ksys_write+0x67/0xe0 [<000000005a08c81d>] __x64_sys_write+0x1a/0x20 [<00000000240bf576>] do_syscall_64+0x3b/0xc0 [<0000000043d5d9f6>] entry_SYSCALL_64_after_hwframe+0x44/0xae In new_eprobe_trigger(), allocated edata and trigger variables are never freed. To fix, free memory in disable_eprobe(). Link: https://lkml.kernel.org/r/20211008071802.GA2098@cosmos Fixes: 7491e2c442781 ("tracing: Add a probe that attaches to trace events") Signed-off-by: Vamshi K Sthambamkadi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_eprobe.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 3044b762cbd7..570d081929fb 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -632,6 +632,13 @@ static int disable_eprobe(struct trace_eprobe *ep, trace_event_trigger_enable_disable(file, 0); update_cond_flag(file); + + /* Make sure nothing is using the edata or trigger */ + tracepoint_synchronize_unregister(); + + kfree(edata); + kfree(trigger); + return 0; } -- cgit v1.2.3 From b26503b15631229583e925de774e95b11d8144e8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 6 Oct 2021 18:28:30 +0100 Subject: tracing: Fix missing * in comment block There is a missing * in a comment block, add it in. Link: https://lkml.kernel.org/r/20211006172830.1025336-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a6061a69aa84..f01e442716e2 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2506,7 +2506,7 @@ find_synthetic_field_var(struct hist_trigger_data *target_hist_data, * events. However, for convenience, users are allowed to directly * specify an event field in an action, which will be automatically * converted into a variable on their behalf. - + * * If a user specifies a field on an event that isn't the event the * histogram currently being defined (the target event histogram), the * only way that can be accomplished is if a new hist trigger is -- cgit v1.2.3 From 57116ce17b04fde2fe30f0859df69d8dbe5809f6 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 6 Oct 2021 13:58:52 +0200 Subject: workqueue: fix state-dump console deadlock Console drivers often queue work while holding locks also taken in their console write paths, something which can lead to deadlocks on SMP when dumping workqueue state (e.g. sysrq-t or on suspend failures). For serial console drivers this could look like: CPU0 CPU1 ---- ---- show_workqueue_state(); lock(&pool->lock); lock(&port->lock); schedule_work(); lock(&pool->lock); printk(); lock(console_owner); lock(&port->lock); where workqueues are, for example, used to push data to the line discipline, process break signals and handle modem-status changes. Line disciplines and serdev drivers can also queue work on write-wakeup notifications, etc. Reworking every console driver to avoid queuing work while holding locks also taken in their write paths would complicate drivers and is neither desirable or feasible. Instead use the deferred-printk mechanism to avoid printing while holding pool locks when dumping workqueue state. Note that there are a few WARN_ON() assertions in the workqueue code which could potentially also trigger a deadlock. Hopefully the ongoing printk rework will provide a general solution for this eventually. This was originally reported after a lockdep splat when executing sysrq-t with the imx serial driver. Fixes: 3494fc30846d ("workqueue: dump workqueues on sysrq-t") Cc: stable@vger.kernel.org # 4.0 Reported-by: Fabio Estevam Tested-by: Fabio Estevam Signed-off-by: Johan Hovold Reviewed-by: John Ogness Signed-off-by: Tejun Heo --- kernel/workqueue.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 33a6b4a2443d..1b3eb1e9531f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4830,8 +4830,16 @@ void show_workqueue_state(void) for_each_pwq(pwq, wq) { raw_spin_lock_irqsave(&pwq->pool->lock, flags); - if (pwq->nr_active || !list_empty(&pwq->inactive_works)) + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { + /* + * Defer printing to avoid deadlocks in console + * drivers that queue work while holding locks + * also taken in their write paths. + */ + printk_deferred_enter(); show_pwq(pwq); + printk_deferred_exit(); + } raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); /* * We could be printing a lot from atomic context, e.g. @@ -4849,7 +4857,12 @@ void show_workqueue_state(void) raw_spin_lock_irqsave(&pool->lock, flags); if (pool->nr_workers == pool->nr_idle) goto next_pool; - + /* + * Defer printing to avoid deadlocks in console drivers that + * queue work while holding locks also taken in their write + * paths. + */ + printk_deferred_enter(); pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); pr_cont(" hung=%us workers=%d", @@ -4864,6 +4877,7 @@ void show_workqueue_state(void) first = false; } pr_cont("\n"); + printk_deferred_exit(); next_pool: raw_spin_unlock_irqrestore(&pool->lock, flags); /* -- cgit v1.2.3 From 7d5fda1c841f9f3930eed61b92d542137f93fc30 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 13 Oct 2021 16:51:12 -0400 Subject: tracing: Fix event probe removal from dynamic events When an event probe is to be removed via the API that created it via the dynamic events, an -ENOENT error is returned. This is because the removal of the event probe does not expect to see the event system and name that the event probe is attached to, even though that's part of the API to create it. As the removal of probes is to use the same API as they are created. In fact, the removal is not consistent with the kprobes and uprobes removal. Fix that by allowing various ways to remove the eprobe. The eprobe is created with: e:[GROUP/]NAME SYSTEM/EVENT [OPTIONS] Have it get removed by echoing in the following into dynamic_events: # Remove all eprobes with NAME echo '-:NAME' >> dynamic_events # Remove a specific eprobe echo '-:GROUP/NAME' >> dynamic_events echo '-:GROUP/NAME SYSTEM/EVENT' >> dynamic_events echo '-:NAME SYSTEM/EVENT' >> dynamic_events echo '-:GROUP/NAME SYSTEM/EVENT OPTIONS' >> dynamic_events echo '-:NAME SYSTEM/EVENT OPTIONS' >> dynamic_events Link: https://lkml.kernel.org/r/20211012081925.0e19cc4f@gandalf.local.home Link: https://lkml.kernel.org/r/20211013205533.630722129@goodmis.org Suggested-by: Masami Hiramatsu Acked-by: Masami Hiramatsu Fixes: 7491e2c442781 ("tracing: Add a probe that attaches to trace events") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_eprobe.c | 54 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 570d081929fb..c4a15aef36af 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -119,10 +119,58 @@ static bool eprobe_dyn_event_match(const char *system, const char *event, int argc, const char **argv, struct dyn_event *ev) { struct trace_eprobe *ep = to_trace_eprobe(ev); + const char *slash; - return strcmp(trace_probe_name(&ep->tp), event) == 0 && - (!system || strcmp(trace_probe_group_name(&ep->tp), system) == 0) && - trace_probe_match_command_args(&ep->tp, argc, argv); + /* + * We match the following: + * event only - match all eprobes with event name + * system and event only - match all system/event probes + * + * The below has the above satisfied with more arguments: + * + * attached system/event - If the arg has the system and event + * the probe is attached to, match + * probes with the attachment. + * + * If any more args are given, then it requires a full match. + */ + + /* + * If system exists, but this probe is not part of that system + * do not match. + */ + if (system && strcmp(trace_probe_group_name(&ep->tp), system) != 0) + return false; + + /* Must match the event name */ + if (strcmp(trace_probe_name(&ep->tp), event) != 0) + return false; + + /* No arguments match all */ + if (argc < 1) + return true; + + /* First argument is the system/event the probe is attached to */ + + slash = strchr(argv[0], '/'); + if (!slash) + slash = strchr(argv[0], '.'); + if (!slash) + return false; + + if (strncmp(ep->event_system, argv[0], slash - argv[0])) + return false; + if (strcmp(ep->event_name, slash + 1)) + return false; + + argc--; + argv++; + + /* If there are no other args, then match */ + if (argc < 1) + return true; + + return trace_probe_match_command_args(&ep->tp, argc, argv); } static struct dyn_event_operations eprobe_dyn_event_ops = { -- cgit v1.2.3