From 41a55567b9e31cb852670684404654ec4fd0d8d6 Mon Sep 17 00:00:00 2001 From: David Gow Date: Wed, 13 Jul 2022 08:52:20 +0800 Subject: module: kunit: Load .kunit_test_suites section when CONFIG_KUNIT=m The new KUnit module handling has KUnit test suites listed in a .kunit_test_suites section of each module. This should be loaded when the module is, but at the moment this only happens if KUnit is built-in. Also load this when KUnit is enabled as a module: it'll not be usable unless KUnit is loaded, but such modules are likely to depend on KUnit anyway, so it's unlikely to ever be loaded needlessly. Fixes: 3d6e44623841 ("kunit: unify module and builtin suite definitions") Signed-off-by: David Gow Reviewed-by: Brendan Higgins Tested-by: Geert Uytterhoeven Signed-off-by: Shuah Khan --- kernel/module/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 6a477c622544..a4e4d84b6f4e 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2099,7 +2099,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->static_call_sites), &mod->num_static_call_sites); #endif -#ifdef CONFIG_KUNIT +#if IS_ENABLED(CONFIG_KUNIT) mod->kunit_suites = section_objs(info, ".kunit_test_suites", sizeof(*mod->kunit_suites), &mod->num_kunit_suites); -- cgit v1.2.3 From 2b97cf76289a4fcae66d7959b0d74a87207d7068 Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Sat, 6 Aug 2022 20:05:08 +0800 Subject: sched/psi: Zero the memory of struct psi_group After commit 5f69a6577bc3 ("psi: dont alloc memory for psi by default"), the memory used by struct psi_group is no longer allocated and zeroed in cgroup_create(). Since the memory of struct psi_group is not zeroed, the data in this memory is random, which will lead to inaccurate psi statistics when creating a new cgroup. So we use kzlloc() to allocate and zero the struct psi_group and remove the redundant zeroing in group_init(). Steps to reproduce: 1. Use cgroup v2 and enable CONFIG_PSI 2. 
Create a new cgroup, and query psi statistics mkdir /sys/fs/cgroup/test cat /sys/fs/cgroup/test/cpu.pressure some avg10=0.00 avg60=0.00 avg300=47927752200.00 total=12884901 full avg10=561815124.00 avg60=125835394188.00 avg300=1077090462000.00 total=10273561772 cat /sys/fs/cgroup/test/io.pressure some avg10=1040093132823.95 avg60=1203770351379.21 avg300=3862252669559.46 total=4294967296 full avg10=921884564601.39 avg60=0.00 avg300=1984507298.35 total=442381631 cat /sys/fs/cgroup/test/memory.pressure some avg10=232476085778.11 avg60=0.00 avg300=0.00 total=0 full avg10=0.00 avg60=0.00 avg300=2585658472280.57 total=12884901 Fixes: commit 5f69a6577bc3 ("psi: dont alloc memory for psi by default") Signed-off-by: Hao Jia Reviewed-by: Ingo Molnar Acked-by: Johannes Weiner Signed-off-by: Tejun Heo --- kernel/sched/psi.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ec66b40bdd40..5ee615a59fe1 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -190,12 +190,8 @@ static void group_init(struct psi_group *group) /* Init trigger-related members */ mutex_init(&group->trigger_lock); INIT_LIST_HEAD(&group->triggers); - memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); - group->poll_states = 0; group->poll_min_period = U32_MAX; - memset(group->polling_total, 0, sizeof(group->polling_total)); group->polling_next_update = ULLONG_MAX; - group->polling_until = 0; init_waitqueue_head(&group->poll_wait); timer_setup(&group->poll_timer, poll_timer_fn, 0); rcu_assign_pointer(group->poll_task, NULL); @@ -957,7 +953,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup) if (static_branch_likely(&psi_disabled)) return 0; - cgroup->psi = kmalloc(sizeof(struct psi_group), GFP_KERNEL); + cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL); if (!cgroup->psi) return -ENOMEM; -- cgit v1.2.3 From 76b079ef4cc954fc2c2e0333a01855b0b2b6bdee Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Sat, 6 Aug 2022 20:05:09 +0800 Subject: sched/psi: Remove unused parameter nbytes of psi_trigger_create() psi_trigger_create()'s 'nbytes' parameter is not used, so we can remove it. Signed-off-by: Hao Jia Reviewed-by: Ingo Molnar Acked-by: Johannes Weiner Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- kernel/sched/psi.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index ffaccd6373f1..df7df5843b4f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3698,7 +3698,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, } psi = cgroup_ino(cgrp) == 1 ? 
&psi_system : cgrp->psi; - new = psi_trigger_create(psi, buf, nbytes, res); + new = psi_trigger_create(psi, buf, res); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 5ee615a59fe1..ecb4b4ff4ce0 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1087,7 +1087,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) } struct psi_trigger *psi_trigger_create(struct psi_group *group, - char *buf, size_t nbytes, enum psi_res res) + char *buf, enum psi_res res) { struct psi_trigger *t; enum psi_states state; @@ -1316,7 +1316,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, return -EBUSY; } - new = psi_trigger_create(&psi_system, buf, nbytes, res); + new = psi_trigger_create(&psi_system, buf, res); if (IS_ERR(new)) { mutex_unlock(&seq->lock); return PTR_ERR(new); -- cgit v1.2.3 From 4f7e7236435ca0abe005c674ebd6892c6e83aeb3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 15 Aug 2022 13:27:38 -1000 Subject: cgroup: Fix threadgroup_rwsem <-> cpus_read_lock() deadlock Bringing up a CPU may involve creating and destroying tasks which requires read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside cpus_read_lock(). However, cpuset's ->attach(), which may be called with thredagroup_rwsem write-locked, also wants to disable CPU hotplug and acquires cpus_read_lock(), leading to a deadlock. Fix it by guaranteeing that ->attach() is always called with CPU hotplug disabled and removing cpus_read_lock() call from cpuset_attach(). Signed-off-by: Tejun Heo Reviewed-and-tested-by: Imran Khan Reported-and-tested-by: Xuewen Yan Fixes: 05c7b7a92cc8 ("cgroup/cpuset: Fix a race between cpuset_attach() and cpu hotplug") Cc: stable@vger.kernel.org # v5.17+ --- kernel/cgroup/cgroup.c | 77 +++++++++++++++++++++++++++++++++++--------------- kernel/cgroup/cpuset.c | 3 +- 2 files changed, 55 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index df7df5843b4f..e1387499b336 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2369,6 +2369,47 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) } EXPORT_SYMBOL_GPL(task_cgroup_path); +/** + * cgroup_attach_lock - Lock for ->attach() + * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem + * + * cgroup migration sometimes needs to stabilize threadgroups against forks and + * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() + * implementations (e.g. cpuset), also need to disable CPU hotplug. + * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can + * lead to deadlocks. + * + * Bringing up a CPU may involve creating and destroying tasks which requires + * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside + * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while + * write-locking threadgroup_rwsem, the locking order is reversed and we end up + * waiting for an on-going CPU hotplug operation which in turn is waiting for + * the threadgroup_rwsem to be released to create new tasks. For more details: + * + * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu + * + * Resolve the situation by always acquiring cpus_read_lock() before optionally + * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that + * CPU hotplug is disabled on entry. 
+ */ +static void cgroup_attach_lock(bool lock_threadgroup) +{ + cpus_read_lock(); + if (lock_threadgroup) + percpu_down_write(&cgroup_threadgroup_rwsem); +} + +/** + * cgroup_attach_unlock - Undo cgroup_attach_lock() + * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem + */ +static void cgroup_attach_unlock(bool lock_threadgroup) +{ + if (lock_threadgroup) + percpu_up_write(&cgroup_threadgroup_rwsem); + cpus_read_unlock(); +} + /** * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task @@ -2841,8 +2882,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *locked) - __acquires(&cgroup_threadgroup_rwsem) + bool *threadgroup_locked) { struct task_struct *tsk; pid_t pid; @@ -2859,12 +2899,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, * Therefore, we can skip the global lock. */ lockdep_assert_held(&cgroup_mutex); - if (pid || threadgroup) { - percpu_down_write(&cgroup_threadgroup_rwsem); - *locked = true; - } else { - *locked = false; - } + *threadgroup_locked = pid || threadgroup; + cgroup_attach_lock(*threadgroup_locked); rcu_read_lock(); if (pid) { @@ -2895,17 +2931,14 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, goto out_unlock_rcu; out_unlock_threadgroup: - if (*locked) { - percpu_up_write(&cgroup_threadgroup_rwsem); - *locked = false; - } + cgroup_attach_unlock(*threadgroup_locked); + *threadgroup_locked = false; out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task, bool locked) - __releases(&cgroup_threadgroup_rwsem) +void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) { struct cgroup_subsys *ss; int ssid; @@ -2913,8 +2946,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked) /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - if (locked) - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(threadgroup_locked); + for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); @@ -3000,8 +3033,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) * write-locking can be skipped safely. 
*/ has_tasks = !list_empty(&mgctx.preloaded_src_csets); - if (has_tasks) - percpu_down_write(&cgroup_threadgroup_rwsem); + cgroup_attach_lock(has_tasks); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); @@ -3022,8 +3054,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - if (has_tasks) - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(has_tasks); return ret; } @@ -4971,13 +5002,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, struct task_struct *task; const struct cred *saved_cred; ssize_t ret; - bool locked; + bool threadgroup_locked; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &locked); + task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -5003,7 +5034,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, locked); + cgroup_procs_write_finish(task, threadgroup_locked); out_unlock: cgroup_kn_unlock(of->kn); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 58aadfda9b8b..1f3a55297f39 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2289,7 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); - cpus_read_lock(); + lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ percpu_down_write(&cpuset_rwsem); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); @@ -2343,7 +2343,6 @@ static void cpuset_attach(struct cgroup_taskset *tset) wake_up(&cpuset_attach_wq); percpu_up_write(&cpuset_rwsem); - cpus_read_unlock(); } /* The various types of files and directories in a cpuset file system */ -- cgit v1.2.3 From 14b20b784f59bdd95f6f1cfb112c9818bcec4d84 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 16 Aug 2022 20:55:16 +0000 Subject: bpf: Restrict bpf_sys_bpf to CAP_PERFMON The verifier cannot perform sufficient validation of any pointers passed into bpf_attr and treats them as integers rather than pointers. The helper will then read from arbitrary pointers passed into it. Restrict the helper to CAP_PERFMON since the security model in BPF of arbitrary kernel read is CAP_BPF + CAP_PERFMON. Fixes: af2ac3e13e45 ("bpf: Prepare bpf syscall to be used from kernel and user space.") Signed-off-by: YiFei Zhu Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220816205517.682470-1-zhuyifei@google.com --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a4d40d98428a..27760627370d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5197,7 +5197,7 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sys_bpf: - return &bpf_sys_bpf_proto; + return !perfmon_capable() ? 
NULL : &bpf_sys_bpf_proto; case BPF_FUNC_btf_find_by_name_kind: return &bpf_btf_find_by_name_kind_proto; case BPF_FUNC_sys_close: -- cgit v1.2.3 From 7d6620f107bae6ed687ff07668e8e8f855487aa9 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 13 Aug 2022 21:40:30 +0800 Subject: bpf, cgroup: Fix kernel BUG in purge_effective_progs Syzkaller reported a triggered kernel BUG as follows: ------------[ cut here ]------------ kernel BUG at kernel/bpf/cgroup.c:925! invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 PID: 194 Comm: detach Not tainted 5.19.0-14184-g69dac8e431af #8 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:__cgroup_bpf_detach+0x1f2/0x2a0 Code: 00 e8 92 60 30 00 84 c0 75 d8 4c 89 e0 31 f6 85 f6 74 19 42 f6 84 28 48 05 00 00 02 75 0e 48 8b 80 c0 00 00 00 48 85 c0 75 e5 <0f> 0b 48 8b 0c5 RSP: 0018:ffffc9000055bdb0 EFLAGS: 00000246 RAX: 0000000000000000 RBX: ffff888100ec0800 RCX: ffffc900000f1000 RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff888100ec4578 RBP: 0000000000000000 R08: ffff888100ec0800 R09: 0000000000000040 R10: 0000000000000000 R11: 0000000000000000 R12: ffff888100ec4000 R13: 000000000000000d R14: ffffc90000199000 R15: ffff888100effb00 FS: 00007f68213d2b80(0000) GS:ffff88813bc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055f74a0e5850 CR3: 0000000102836000 CR4: 00000000000006e0 Call Trace: cgroup_bpf_prog_detach+0xcc/0x100 __sys_bpf+0x2273/0x2a00 __x64_sys_bpf+0x17/0x20 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f68214dbcb9 Code: 08 44 89 e0 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff8 RSP: 002b:00007ffeb487db68 EFLAGS: 00000246 ORIG_RAX: 0000000000000141 RAX: ffffffffffffffda RBX: 000000000000000b RCX: 00007f68214dbcb9 RDX: 0000000000000090 RSI: 00007ffeb487db70 RDI: 0000000000000009 RBP: 0000000000000003 R08: 0000000000000012 R09: 0000000b00000003 R10: 00007ffeb487db70 R11: 0000000000000246 R12: 00007ffeb487dc20 R13: 0000000000000004 R14: 0000000000000001 R15: 000055f74a1011b0 Modules linked in: ---[ end trace 0000000000000000 ]--- Repetition steps: For the following cgroup tree, root | cg1 | cg2 1. attach prog2 to cg2, and then attach prog1 to cg1, both bpf progs attach type is NONE or OVERRIDE. 2. write 1 to /proc/thread-self/fail-nth for failslab. 3. detach prog1 for cg1, and then kernel BUG occur. Failslab injection will cause kmalloc fail and fall back to purge_effective_progs. The problem is that cg2 have attached another prog, so when go through cg2 layer, iteration will add pos to 1, and subsequent operations will be skipped by the following condition, and cg will meet NULL in the end. `if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))` The NULL cg means no link or prog match, this is as expected, and it's not a bug. So here just skip the no match situation. 
Fixes: 4c46091ee985 ("bpf: Fix KASAN use-after-free Read in compute_effective_progs") Signed-off-by: Pu Lehui Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220813134030.1972696-1-pulehui@huawei.com --- kernel/bpf/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 59b7eb60d5b4..4a400cd63731 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -921,8 +921,10 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog, pos++; } } + + /* no link or prog match, skip the cgroup of this layer */ + continue; found: - BUG_ON(!cg); progs = rcu_dereference_protected( desc->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); -- cgit v1.2.3 From a8faed3a02eeb75857a3b5d660fa80fe79db77a3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 7 Aug 2022 15:09:34 -0700 Subject: kernel/sys_ni: add compat entry for fadvise64_64 When CONFIG_ADVISE_SYSCALLS is not set/enabled and CONFIG_COMPAT is set/enabled, the riscv compat_syscall_table references 'compat_sys_fadvise64_64', which is not defined: riscv64-linux-ld: arch/riscv/kernel/compat_syscall_table.o:(.rodata+0x6f8): undefined reference to `compat_sys_fadvise64_64' Add 'fadvise64_64' to kernel/sys_ni.c as a conditional COMPAT function so that when CONFIG_ADVISE_SYSCALLS is not set, there is a fallback function available. Link: https://lkml.kernel.org/r/20220807220934.5689-1-rdunlap@infradead.org Fixes: d3ac21cacc24 ("mm: Support compiling out madvise and fadvise") Signed-off-by: Randy Dunlap Suggested-by: Arnd Bergmann Reviewed-by: Arnd Bergmann Cc: Josh Triplett Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Signed-off-by: Andrew Morton --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a492f159624f..860b2dcf3ac4 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -277,6 +277,7 @@ COND_SYSCALL(landlock_restrict_self); /* mm/fadvise.c */ COND_SYSCALL(fadvise64_64); +COND_SYSCALL_COMPAT(fadvise64_64); /* mm/, CONFIG_MMU only */ COND_SYSCALL(swapon); -- cgit v1.2.3 From 9c80e79906b4ca440d09e7f116609262bb747909 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 12 Aug 2022 19:05:09 -0700 Subject: kprobes: don't call disarm_kprobe() for disabled kprobes The assumption in __disable_kprobe() is wrong, and it could try to disarm an already disarmed kprobe and fire the WARN_ONCE() below. [0] We can easily reproduce this issue. 1. Write 0 to /sys/kernel/debug/kprobes/enabled. # echo 0 > /sys/kernel/debug/kprobes/enabled 2. Run execsnoop. At this time, one kprobe is disabled. # /usr/share/bcc/tools/execsnoop & [1] 2460 PCOMM PID PPID RET ARGS # cat /sys/kernel/debug/kprobes/list ffffffff91345650 r __x64_sys_execve+0x0 [FTRACE] ffffffff91345650 k __x64_sys_execve+0x0 [DISABLED][FTRACE] 3. Write 1 to /sys/kernel/debug/kprobes/enabled, which changes kprobes_all_disarmed to false but does not arm the disabled kprobe. # echo 1 > /sys/kernel/debug/kprobes/enabled # cat /sys/kernel/debug/kprobes/list ffffffff91345650 r __x64_sys_execve+0x0 [FTRACE] ffffffff91345650 k __x64_sys_execve+0x0 [DISABLED][FTRACE] 4. Kill execsnoop, when __disable_kprobe() calls disarm_kprobe() for the disabled kprobe and hits the WARN_ONCE() in __disarm_kprobe_ftrace(). 
# fg /usr/share/bcc/tools/execsnoop ^C Actually, WARN_ONCE() is fired twice, and __unregister_kprobe_top() misses some cleanups and leaves the aggregated kprobe in the hash table. Then, __unregister_trace_kprobe() initialises tk->rp.kp.list and creates an infinite loop like this. aggregated kprobe.list -> kprobe.list -. ^ | '.__.' In this situation, these commands fall into the infinite loop and result in RCU stall or soft lockup. cat /sys/kernel/debug/kprobes/list : show_kprobe_addr() enters into the infinite loop with RCU. /usr/share/bcc/tools/execsnoop : warn_kprobe_rereg() holds kprobe_mutex, and __get_valid_kprobe() is stuck in the loop. To avoid the issue, make sure we don't call disarm_kprobe() for disabled kprobes. [0] Failed to disarm kprobe-ftrace at __x64_sys_execve+0x0/0x40 (error -2) WARNING: CPU: 6 PID: 2460 at kernel/kprobes.c:1130 __disarm_kprobe_ftrace.isra.19 (kernel/kprobes.c:1129) Modules linked in: ena CPU: 6 PID: 2460 Comm: execsnoop Not tainted 5.19.0+ #28 Hardware name: Amazon EC2 c5.2xlarge/, BIOS 1.0 10/16/2017 RIP: 0010:__disarm_kprobe_ftrace.isra.19 (kernel/kprobes.c:1129) Code: 24 8b 02 eb c1 80 3d c4 83 f2 01 00 75 d4 48 8b 75 00 89 c2 48 c7 c7 90 fa 0f 92 89 04 24 c6 05 ab 83 01 e8 e4 94 f0 ff <0f> 0b 8b 04 24 eb b1 89 c6 48 c7 c7 60 fa 0f 92 89 04 24 e8 cc 94 RSP: 0018:ffff9e6ec154bd98 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffffffff930f7b00 RCX: 0000000000000001 RDX: 0000000080000001 RSI: ffffffff921461c5 RDI: 00000000ffffffff RBP: ffff89c504286da8 R08: 0000000000000000 R09: c0000000fffeffff R10: 0000000000000000 R11: ffff9e6ec154bc28 R12: ffff89c502394e40 R13: ffff89c502394c00 R14: ffff9e6ec154bc00 R15: 0000000000000000 FS: 00007fe800398740(0000) GS:ffff89c812d80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000c00057f010 CR3: 0000000103b54006 CR4: 00000000007706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: __disable_kprobe (kernel/kprobes.c:1716) disable_kprobe (kernel/kprobes.c:2392) __disable_trace_kprobe (kernel/trace/trace_kprobe.c:340) disable_trace_kprobe (kernel/trace/trace_kprobe.c:429) perf_trace_event_unreg.isra.2 (./include/linux/tracepoint.h:93 kernel/trace/trace_event_perf.c:168) perf_kprobe_destroy (kernel/trace/trace_event_perf.c:295) _free_event (kernel/events/core.c:4971) perf_event_release_kernel (kernel/events/core.c:5176) perf_release (kernel/events/core.c:5186) __fput (fs/file_table.c:321) task_work_run (./include/linux/sched.h:2056 (discriminator 1) kernel/task_work.c:179 (discriminator 1)) exit_to_user_mode_prepare (./include/linux/resume_user_mode.h:49 kernel/entry/common.c:169 kernel/entry/common.c:201) syscall_exit_to_user_mode (./arch/x86/include/asm/jump_label.h:55 ./arch/x86/include/asm/nospec-branch.h:384 ./arch/x86/include/asm/entry-common.h:94 kernel/entry/common.c:133 kernel/entry/common.c:296) do_syscall_64 (arch/x86/entry/common.c:87) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) RIP: 0033:0x7fe7ff210654 Code: 15 79 89 20 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb be 0f 1f 00 8b 05 9a cd 20 00 48 63 ff 85 c0 75 11 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 3a f3 c3 48 83 ec 18 48 89 7c 24 08 e8 34 fc RSP: 002b:00007ffdbd1d3538 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 RAX: 0000000000000000 RBX: 0000000000000008 RCX: 00007fe7ff210654 RDX: 0000000000000000 RSI: 0000000000002401 RDI: 0000000000000008 RBP: 0000000000000000 R08: 94ae31d6fda838a4 
R0900007fe8001c9d30 R10: 00007ffdbd1d34b0 R11: 0000000000000246 R12: 00007ffdbd1d3600 R13: 0000000000000000 R14: fffffffffffffffc R15: 00007ffdbd1d3560 Link: https://lkml.kernel.org/r/20220813020509.90805-1-kuniyu@amazon.com Fixes: 69d54b916d83 ("kprobes: makes kprobes/enabled works correctly for optimized kprobes.") Signed-off-by: Kuniyuki Iwashima Reported-by: Ayushman Dutta Cc: "Naveen N. Rao" Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Masami Hiramatsu Cc: Wang Nan Cc: Kuniyuki Iwashima Cc: Kuniyuki Iwashima Cc: Ayushman Dutta Cc: Signed-off-by: Andrew Morton --- kernel/kprobes.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 80697e5e03e4..08350e35aba2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1707,11 +1707,12 @@ static struct kprobe *__disable_kprobe(struct kprobe *p) /* Try to disarm and disable this/parent probe */ if (p == orig_p || aggr_kprobe_disabled(orig_p)) { /* - * If 'kprobes_all_disarmed' is set, 'orig_p' - * should have already been disarmed, so - * skip unneed disarming process. + * Don't be lazy here. Even if 'kprobes_all_disarmed' + * is false, 'orig_p' might not have been armed yet. + * Note arm_all_kprobes() __tries__ to arm all kprobes + * on the best effort basis. */ - if (!kprobes_all_disarmed) { + if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) { ret = disarm_kprobe(orig_p, true); if (ret) { p->flags &= ~KPROBE_FLAG_DISABLED; -- cgit v1.2.3 From 123d6455771ec577ce65f8d1bda548fb0eb7ef21 Mon Sep 17 00:00:00 2001 From: Wang Jingjin Date: Mon, 1 Aug 2022 16:47:45 +0800 Subject: ftrace: Fix build warning for ops_references_rec() not used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The change that made IPMODIFY and DIRECT ops work together needed access to the ops_references_ip() function, which it pulled out of the module only code. But now if both CONFIG_MODULES and CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS is not set, we get the below warning: ‘ops_references_rec’ defined but not used. Since ops_references_rec() only calls ops_references_ip() replace the usage of ops_references_rec() with ops_references_ip() and encompass the function with an #ifdef of DIRECT_CALLS || MODULES being defined. Link: https://lkml.kernel.org/r/20220801084745.1187987-1-wangjingjin1@huawei.com Fixes: 53cd885bc5c3 ("ftrace: Allow IPMODIFY and DIRECT ops on the same function") Signed-off-by: Wang Jingjin Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 79 +++++++++++++++++++++------------------------------ 1 file changed, 33 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 126c769d36c3..439e2ab6905e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1861,8 +1861,6 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, ftrace_hash_rec_update_modify(ops, filter_hash, 1); } -static bool ops_references_ip(struct ftrace_ops *ops, unsigned long ip); - /* * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK * or no-needed to update, -EBUSY if it detects a conflict of the flag @@ -3118,49 +3116,6 @@ static inline int ops_traces_mod(struct ftrace_ops *ops) ftrace_hash_empty(ops->func_hash->notrace_hash); } -/* - * Check if the current ops references the given ip. - * - * If the ops traces all functions, then it was already accounted for. - * If the ops does not trace the current record function, skip it. 
- * If the ops ignores the function via notrace filter, skip it. - */ -static bool -ops_references_ip(struct ftrace_ops *ops, unsigned long ip) -{ - /* If ops isn't enabled, ignore it */ - if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return false; - - /* If ops traces all then it includes this function */ - if (ops_traces_mod(ops)) - return true; - - /* The function must be in the filter */ - if (!ftrace_hash_empty(ops->func_hash->filter_hash) && - !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip)) - return false; - - /* If in notrace hash, we ignore it too */ - if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip)) - return false; - - return true; -} - -/* - * Check if the current ops references the record. - * - * If the ops traces all functions, then it was already accounted for. - * If the ops does not trace the current record function, skip it. - * If the ops ignores the function via notrace filter, skip it. - */ -static bool -ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) -{ - return ops_references_ip(ops, rec->ip); -} - static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { bool init_nop = ftrace_need_init_nop(); @@ -6822,6 +6777,38 @@ static int ftrace_get_trampoline_kallsym(unsigned int symnum, return -ERANGE; } +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) || defined(CONFIG_MODULES) +/* + * Check if the current ops references the given ip. + * + * If the ops traces all functions, then it was already accounted for. + * If the ops does not trace the current record function, skip it. + * If the ops ignores the function via notrace filter, skip it. + */ +static bool +ops_references_ip(struct ftrace_ops *ops, unsigned long ip) +{ + /* If ops isn't enabled, ignore it */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return false; + + /* If ops traces all then it includes this function */ + if (ops_traces_mod(ops)) + return true; + + /* The function must be in the filter */ + if (!ftrace_hash_empty(ops->func_hash->filter_hash) && + !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip)) + return false; + + /* If in notrace hash, we ignore it too */ + if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip)) + return false; + + return true; +} +#endif + #ifdef CONFIG_MODULES #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) @@ -6834,7 +6821,7 @@ static int referenced_filters(struct dyn_ftrace *rec) int cnt = 0; for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops_references_rec(ops, rec)) { + if (ops_references_ip(ops, rec->ip)) { if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT)) continue; if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY)) -- cgit v1.2.3 From ad982c3be4e60c7d39c03f782733503cbd88fd2a Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 22 Aug 2022 10:29:05 +0800 Subject: audit: fix potential double free on error path from fsnotify_add_inode_mark Audit_alloc_mark() assign pathname to audit_mark->path, on error path from fsnotify_add_inode_mark(), fsnotify_put_mark will free memory of audit_mark->path, but the caller of audit_alloc_mark will free the pathname again, so there will be double free problem. Fix this by resetting audit_mark->path to NULL pointer on error path from fsnotify_add_inode_mark(). 
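The ownership rule behind the one-line fix can be shown outside the kernel. Below is a minimal, self-contained C sketch (the struct and helper names are invented for illustration, not the audit code itself): once teardown may free a borrowed pointer, the error path has to drop that reference first, so the caller's own free of the pathname does not become a double free.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct mark {
	char *path;	/* borrowed from the caller, like audit_mark->path */
};

/* teardown frees ->path, like fsnotify_put_mark() ending in the mark's free routine */
static void put_mark(struct mark *m)
{
	free(m->path);
	free(m);
}

static struct mark *alloc_mark(char *pathname, int fail)
{
	struct mark *m = calloc(1, sizeof(*m));

	if (!m)
		return NULL;
	m->path = pathname;

	if (fail) {		/* models fsnotify_add_inode_mark() failing */
		m->path = NULL;	/* the fix: hand ownership back to the caller */
		put_mark(m);
		return NULL;
	}
	return m;
}

int main(void)
{
	char *pathname = strdup("/etc/passwd");
	struct mark *m = alloc_mark(pathname, 1);

	if (!m)
		free(pathname);	/* caller frees once; without the reset above this would be a double free */
	printf("no double free\n");
	return 0;
}

With the m->path = NULL line removed, the same program frees pathname twice, which is exactly the pattern the patch closes.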
Cc: stable@vger.kernel.org Fixes: 7b1293234084d ("fsnotify: Add group pointer in fsnotify_init_mark()") Signed-off-by: Gaosheng Cui Reviewed-by: Jan Kara Signed-off-by: Paul Moore --- kernel/audit_fsnotify.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 6432a37ac1c9..c565fbf66ac8 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -102,6 +102,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0); if (ret < 0) { + audit_mark->path = NULL; fsnotify_put_mark(&audit_mark->mark); audit_mark = ERR_PTR(ret); } -- cgit v1.2.3 From 763f4fb76e24959c370cdaa889b2492ba6175580 Mon Sep 17 00:00:00 2001 From: Jing-Ting Wu Date: Tue, 23 Aug 2022 13:41:46 +0800 Subject: cgroup: Fix race condition at rebind_subsystems() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: The rebind_subsystems() is no lock held when move css object from A list to B list,then let B's head be treated as css node at list_for_each_entry_rcu(). Solution: Add grace period before invalidating the removed rstat_css_node. Reported-by: Jing-Ting Wu Suggested-by: Michal Koutný Signed-off-by: Jing-Ting Wu Tested-by: Jing-Ting Wu Link: https://lore.kernel.org/linux-arm-kernel/d8f0bc5e2fb6ed259f9334c83279b4c011283c41.camel@mediatek.com/T/ Acked-by: Mukesh Ojha Fixes: a7df69b81aac ("cgroup: rstat: support cgroup1") Cc: stable@vger.kernel.org # v5.13+ Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e1387499b336..e4bb5d57f4d1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1820,6 +1820,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) if (ss->css_rstat_flush) { list_del_rcu(&css->rstat_css_node); + synchronize_rcu(); list_add_rcu(&css->rstat_css_node, &dcgrp->rstat_css_list); } -- cgit v1.2.3 From 0947ae1121083d363d522ff7518ee72b55bd8d29 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 14:58:04 -0700 Subject: bpf: Fix a data-race around bpf_jit_limit. While reading bpf_jit_limit, it can be changed concurrently via sysctl, WRITE_ONCE() in __do_proc_doulongvec_minmax(). The size of bpf_jit_limit is long, so we need to add a paired READ_ONCE() to avoid load-tearing. 
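The required pairing can be modelled in a few lines of userspace C. In this sketch the macros are simplified stand-ins for the kernel's READ_ONCE()/WRITE_ONCE(), and jit_limit/charge() merely model bpf_jit_limit and bpf_jit_charge_modmem(); the point is that the writer publishes the long-sized knob with a single store and the reader consumes it with a single load, so the compiler can neither tear nor re-read either access.

#include <stdio.h>

/* simplified stand-ins for the kernel macros: a volatile access forces the
 * compiler to emit exactly one load/store of the long-sized value */
#define WRITE_ONCE(x, val)	(*(volatile typeof(x) *)&(x) = (val))
#define READ_ONCE(x)		(*(volatile typeof(x) *)&(x))

static long jit_limit = 1L << 20;	/* models bpf_jit_limit */
static long jit_current;		/* models bpf_jit_current */

/* models the sysctl handler, which already publishes with WRITE_ONCE() */
static void set_limit(long new_limit)
{
	WRITE_ONCE(jit_limit, new_limit);
}

/* models bpf_jit_charge_modmem(): the read pairs with READ_ONCE() */
static int charge(long size)
{
	jit_current += size;
	if (jit_current > READ_ONCE(jit_limit)) {
		jit_current -= size;
		return -1;
	}
	return 0;
}

int main(void)
{
	set_limit(4096);
	printf("charge 4000: %d\n", charge(4000));	/* fits under the limit -> 0 */
	printf("charge 4000: %d\n", charge(4000));	/* exceeds the limit -> -1 */
	return 0;
}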
Fixes: ede95a63b5e8 ("bpf: add bpf_jit_limit knob to restrict unpriv allocations") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220823215804.2177-1-kuniyu@amazon.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c1e10d088dbb..3d9eb3ae334c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -971,7 +971,7 @@ pure_initcall(bpf_jit_charge_init); int bpf_jit_charge_modmem(u32 size) { - if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) { + if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) { if (!bpf_capable()) { atomic_long_sub(size, &bpf_jit_current); return -EPERM; -- cgit v1.2.3 From 43626dade36fa74d3329046f4ae2d7fdefe401c6 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 25 Aug 2022 17:38:38 +0900 Subject: cgroup: Add missing cpus_read_lock() to cgroup_attach_task_all() syzbot is hitting percpu_rwsem_assert_held(&cpu_hotplug_lock) warning at cpuset_attach() [1], for commit 4f7e7236435ca0ab ("cgroup: Fix threadgroup_rwsem <-> cpus_read_lock() deadlock") missed that cpuset_attach() is also called from cgroup_attach_task_all(). Add cpus_read_lock() like what cgroup_procs_write_start() does. Link: https://syzkaller.appspot.com/bug?extid=29d3a3b4d86c8136ad9e [1] Reported-by: syzbot Signed-off-by: Tetsuo Handa Fixes: 4f7e7236435ca0ab ("cgroup: Fix threadgroup_rwsem <-> cpus_read_lock() deadlock") Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 2ade21b54dc4..ff6a8099eb2a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -59,6 +59,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; mutex_lock(&cgroup_mutex); + cpus_read_lock(); percpu_down_write(&cgroup_threadgroup_rwsem); for_each_root(root) { struct cgroup *from_cgrp; @@ -72,6 +73,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) break; } percpu_up_write(&cgroup_threadgroup_rwsem); + cpus_read_unlock(); mutex_unlock(&cgroup_mutex); return retval; -- cgit v1.2.3 From 2fc31465c5373b5ca4edf2e5238558cb62902311 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Aug 2022 20:52:59 +0200 Subject: bpf: Do mark_chain_precision for ARG_CONST_ALLOC_SIZE_OR_ZERO Precision markers need to be propagated whenever we have an ARG_CONST_* style argument, as the verifier cannot consider imprecise scalars to be equivalent for the purposes of states_equal check when such arguments refine the return value (in this case, set mem_size for PTR_TO_MEM). The resultant mem_size for the R0 is derived from the constant value, and if the verifier incorrectly prunes states considering them equivalent where such arguments exist (by seeing that both registers have reg->precise as false in regsafe), we can end up with invalid programs passing the verifier which can do access beyond what should have been the correct mem_size in that explored state. 
To show a concrete example of the problem: 0000000000000000 : 0: r2 = *(u32 *)(r1 + 80) 1: r1 = *(u32 *)(r1 + 76) 2: r3 = r1 3: r3 += 4 4: if r3 > r2 goto +18 5: w2 = 0 6: *(u32 *)(r1 + 0) = r2 7: r1 = *(u32 *)(r1 + 0) 8: r2 = 1 9: if w1 == 0 goto +1 10: r2 = -1 0000000000000058 : 11: r1 = 0 ll 13: r3 = 0 14: call bpf_ringbuf_reserve 15: if r0 == 0 goto +7 16: r1 = r0 17: r1 += 16777215 18: w2 = 0 19: *(u8 *)(r1 + 0) = r2 20: r1 = r0 21: r2 = 0 22: call bpf_ringbuf_submit 00000000000000b8 : 23: w0 = 0 24: exit For the first case, the single line execution's exploration will prune the search at insn 14 for the branch insn 9's second leg as it will be verified first using r2 = -1 (UINT_MAX), while as w1 at insn 9 will always be 0 so at runtime we don't get error for being greater than UINT_MAX/4 from bpf_ringbuf_reserve. The verifier during regsafe just sees reg->precise as false for both r2 registers in both states, hence considers them equal for purposes of states_equal. If we propagated precise markers using the backtracking support, we would use the precise marking to then ensure that old r2 (UINT_MAX) was within the new r2 (1) and this would never be true, so the verification would rightfully fail. The end result is that the out of bounds access at instruction 19 would be permitted without this fix. Note that reg->precise is always set to true when user does not have CAP_BPF (or when subprog count is greater than 1 (i.e. use of any static or global functions)), hence this is only a problem when precision marks need to be explicitly propagated (i.e. privileged users with CAP_BPF). A simplified test case has been included in the next patch to prevent future regressions. Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220823185300.406-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 096fdac70165..30c6eebce146 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6066,6 +6066,9 @@ skip_type_check: return -EACCES; } meta->mem_size = reg->var_off.value; + err = mark_chain_precision(env, regno); + if (err) + return err; break; case ARG_PTR_TO_INT: case ARG_PTR_TO_LONG: -- cgit v1.2.3 From d4fefa4801a1c2f9c0c7a48fbb0fdf384e89a4ab Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 25 Aug 2022 15:32:40 -0400 Subject: audit: move audit_return_fixup before the filters The success and return_code are needed by the filters. Move audit_return_fixup() before the filters. This was causing syscall auditing events to be missed. 
Link: https://github.com/linux-audit/audit-kernel/issues/138 Cc: stable@vger.kernel.org Fixes: 12c5e81d3fd0 ("audit: prepare audit_context for use in calling contexts beyond syscalls") Signed-off-by: Richard Guy Briggs [PM: manual merge required] Signed-off-by: Paul Moore --- kernel/auditsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index dd8d9ab747c3..79a5da1bc5bb 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1940,6 +1940,7 @@ void __audit_uring_exit(int success, long code) goto out; } + audit_return_fixup(ctx, success, code); if (ctx->context == AUDIT_CTX_SYSCALL) { /* * NOTE: See the note in __audit_uring_entry() about the case @@ -1981,7 +1982,6 @@ void __audit_uring_exit(int success, long code) audit_filter_inodes(current, ctx); if (ctx->current_state != AUDIT_STATE_RECORD) goto out; - audit_return_fixup(ctx, success, code); audit_log_exit(); out: @@ -2065,13 +2065,13 @@ void __audit_syscall_exit(int success, long return_code) if (!list_empty(&context->killed_trees)) audit_kill_trees(context); + audit_return_fixup(context, success, return_code); /* run through both filters to ensure we set the filterkey properly */ audit_filter_syscall(current, context); audit_filter_inodes(current, context); if (context->current_state < AUDIT_STATE_RECORD) goto out; - audit_return_fixup(context, success, return_code); audit_log_exit(); out: -- cgit v1.2.3 From a657182a5c5150cdfacb6640aad1d2712571a409 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 Aug 2022 23:26:47 +0200 Subject: bpf: Don't use tnum_range on array range checking for poke descriptors Hsin-Wei reported a KASAN splat triggered by their BPF runtime fuzzer which is based on a customized syzkaller: BUG: KASAN: slab-out-of-bounds in bpf_int_jit_compile+0x1257/0x13f0 Read of size 8 at addr ffff888004e90b58 by task syz-executor.0/1489 CPU: 1 PID: 1489 Comm: syz-executor.0 Not tainted 5.19.0 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Call Trace: dump_stack_lvl+0x9c/0xc9 print_address_description.constprop.0+0x1f/0x1f0 ? bpf_int_jit_compile+0x1257/0x13f0 kasan_report.cold+0xeb/0x197 ? kvmalloc_node+0x170/0x200 ? bpf_int_jit_compile+0x1257/0x13f0 bpf_int_jit_compile+0x1257/0x13f0 ? arch_prepare_bpf_dispatcher+0xd0/0xd0 ? rcu_read_lock_sched_held+0x43/0x70 bpf_prog_select_runtime+0x3e8/0x640 ? bpf_obj_name_cpy+0x149/0x1b0 bpf_prog_load+0x102f/0x2220 ? __bpf_prog_put.constprop.0+0x220/0x220 ? find_held_lock+0x2c/0x110 ? __might_fault+0xd6/0x180 ? lock_downgrade+0x6e0/0x6e0 ? lock_is_held_type+0xa6/0x120 ? __might_fault+0x147/0x180 __sys_bpf+0x137b/0x6070 ? bpf_perf_link_attach+0x530/0x530 ? new_sync_read+0x600/0x600 ? __fget_files+0x255/0x450 ? lock_downgrade+0x6e0/0x6e0 ? fput+0x30/0x1a0 ? ksys_write+0x1a8/0x260 __x64_sys_bpf+0x7a/0xc0 ? syscall_enter_from_user_mode+0x21/0x70 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f917c4e2c2d The problem here is that a range of tnum_range(0, map->max_entries - 1) has limited ability to represent the concrete tight range with the tnum as the set of resulting states from value + mask can result in a superset of the actual intended range, and as such a tnum_in(range, reg->var_off) check may yield true when it shouldn't, for example tnum_range(0, 2) would result in 00XX -> v = 0000, m = 0011 such that the intended set of {0, 1, 2} is here represented by a less precise superset of {0, 1, 2, 3}. 
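The imprecision is easy to reproduce with a standalone re-implementation of the two tnum helpers (a simplified sketch modelled on kernel/bpf/tnum.c, not the kernel build itself): the known constant 3 passes a tnum_range(0, 2) membership check even though it is outside the intended bound.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct tnum { uint64_t value, mask; };

static int fls64(uint64_t x)
{
	return x ? 64 - __builtin_clzll(x) : 0;
}

/* mirrors tnum_range(): min ^ max decides how many low bits become unknown */
static struct tnum tnum_range(uint64_t min, uint64_t max)
{
	uint64_t chi = min ^ max, delta;
	int bits = fls64(chi);

	if (bits > 63)
		return (struct tnum){ .value = 0, .mask = ~0ULL };
	delta = (1ULL << bits) - 1;
	return (struct tnum){ .value = min & ~delta, .mask = delta };
}

/* mirrors tnum_in(): is every value representable by b also representable by a? */
static bool tnum_in(struct tnum a, struct tnum b)
{
	if (b.mask & ~a.mask)
		return false;
	b.value &= ~a.mask;
	return a.value == b.value;
}

int main(void)
{
	struct tnum range = tnum_range(0, 2);		/* map->max_entries == 3 */
	struct tnum key = { .value = 3, .mask = 0 };	/* known-constant index 3 */

	/* prints value=0 mask=3, i.e. the superset {0, 1, 2, 3} */
	printf("range: value=%llu mask=%llu\n",
	       (unsigned long long)range.value, (unsigned long long)range.mask);
	/* prints 1: the out-of-range constant 3 is accepted by the tnum check */
	printf("tnum_in(range, 3) = %d\n", tnum_in(range, key));
	return 0;
}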
As the register is known const scalar, really just use the concrete reg->var_off.value for the upper index check. Fixes: d2e4c1e6c294 ("bpf: Constant map key tracking for prog array pokes") Reported-by: Hsin-Wei Hung Signed-off-by: Daniel Borkmann Cc: Shung-Hsi Yu Acked-by: John Fastabend Link: https://lore.kernel.org/r/984b37f9fdf7ac36831d2137415a4a915744c1b6.1661462653.git.daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 30c6eebce146..3eadb14e090b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7033,8 +7033,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; struct bpf_reg_state *regs = cur_regs(env), *reg; struct bpf_map *map = meta->map_ptr; - struct tnum range; - u64 val; + u64 val, max; int err; if (func_id != BPF_FUNC_tail_call) @@ -7044,10 +7043,11 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return -EINVAL; } - range = tnum_range(0, map->max_entries - 1); reg = ®s[BPF_REG_3]; + val = reg->var_off.value; + max = map->max_entries; - if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) { + if (!(register_is_const(reg) && val < max)) { bpf_map_key_store(aux, BPF_MAP_KEY_POISON); return 0; } @@ -7055,8 +7055,6 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, err = mark_chain_precision(env, BPF_REG_3); if (err) return err; - - val = reg->var_off.value; if (bpf_map_key_unseen(aux)) bpf_map_key_store(aux, val); else if (!bpf_map_key_poisoned(aux) && -- cgit v1.2.3 From 8238b4579866b7c1bb99883cfe102a43db5506ff Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 26 Aug 2022 09:17:08 -0400 Subject: wait_on_bit: add an acquire memory barrier There are several places in the kernel where wait_on_bit is not followed by a memory barrier (for example, in drivers/md/dm-bufio.c:new_read). On architectures with weak memory ordering, it may happen that memory accesses that follow wait_on_bit are reordered before wait_on_bit and they may return invalid data. Fix this class of bugs by introducing a new function "test_bit_acquire" that works like test_bit, but has acquire memory ordering semantics. Signed-off-by: Mikulas Patocka Acked-by: Will Deacon Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- kernel/sched/wait_bit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index d4788f810b55..0b1cd985dc27 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) ret = (*action)(&wbq_entry->key, mode); - } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); + } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); finish_wait(wq_head, &wbq_entry->wq_entry); -- cgit v1.2.3 From f09bddbd86619bf6213c96142a3b6b6a84818798 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Mon, 8 Aug 2022 13:54:10 -0700 Subject: vmcoreinfo: add kallsyms_num_syms symbol The rest of the kallsyms symbols are useless without knowing the number of symbols in the table. 
In an earlier patch, I somehow dropped the kallsyms_num_syms symbol, so add it back in. Link: https://lkml.kernel.org/r/20220808205410.18590-1-stephen.s.brennan@oracle.com Fixes: 5fd8fea935a1 ("vmcoreinfo: include kallsyms symbols") Signed-off-by: Stephen Brennan Cc: Baoquan He Cc: Dave Young Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/crash_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 07b26df453a9..a0eb4d5cf557 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -494,6 +494,7 @@ static int __init crash_save_vmcoreinfo_init(void) #ifdef CONFIG_KALLSYMS VMCOREINFO_SYMBOL(kallsyms_names); + VMCOREINFO_SYMBOL(kallsyms_num_syms); VMCOREINFO_SYMBOL(kallsyms_token_table); VMCOREINFO_SYMBOL(kallsyms_token_index); #ifdef CONFIG_KALLSYMS_BASE_RELATIVE -- cgit v1.2.3 From c2e406596571659451f4b95e37ddfd5a8ef1d0dc Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 2 Sep 2022 14:31:07 +0200 Subject: sched/debug: fix dentry leak in update_sched_domain_debugfs Kuyo reports that the pattern of using debugfs_remove(debugfs_lookup()) leaks a dentry and with a hotplug stress test, the machine eventually runs out of memory. Fix this up by using the newly created debugfs_lookup_and_remove() call instead which properly handles the dentry reference counting logic. Cc: Major Chen Cc: stable Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Daniel Bristot de Oliveira Cc: Valentin Schneider Cc: Matthias Brugger Reported-by: Kuyo Chang Tested-by: Kuyo Chang Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220902123107.109274-2-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- kernel/sched/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index bb3d63bdf4ae..667876da8382 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -416,7 +416,7 @@ void update_sched_domain_debugfs(void) char buf[32]; snprintf(buf, sizeof(buf), "cpu%d", cpu); - debugfs_remove(debugfs_lookup(buf, sd_dentry)); + debugfs_lookup_and_remove(buf, sd_dentry); d_cpu = debugfs_create_dir(buf, sd_dentry); i = 0; -- cgit v1.2.3 From 85eaeb5058f0f04dffb124c97c86b4f18db0b833 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 24 Aug 2022 09:10:36 +0300 Subject: IB/core: Fix a nested dead lock as part of ODP flow Fix a nested dead lock as part of ODP flow by using mmput_async(). From the below call trace [1] can see that calling mmput() once we have the umem_odp->umem_mutex locked as required by ib_umem_odp_map_dma_and_lock() might trigger in the same task the exit_mmap()->__mmu_notifier_release()->mlx5_ib_invalidate_range() which may dead lock when trying to lock the same mutex. Moving to use mmput_async() will solve the problem as the above exit_mmap() flow will be called in other task and will be executed once the lock will be available. 
[1] [64843.077665] task:kworker/u133:2 state:D stack: 0 pid:80906 ppid: 2 flags:0x00004000 [64843.077672] Workqueue: mlx5_ib_page_fault mlx5_ib_eqe_pf_action [mlx5_ib] [64843.077719] Call Trace: [64843.077722] [64843.077724] __schedule+0x23d/0x590 [64843.077729] schedule+0x4e/0xb0 [64843.077735] schedule_preempt_disabled+0xe/0x10 [64843.077740] __mutex_lock.constprop.0+0x263/0x490 [64843.077747] __mutex_lock_slowpath+0x13/0x20 [64843.077752] mutex_lock+0x34/0x40 [64843.077758] mlx5_ib_invalidate_range+0x48/0x270 [mlx5_ib] [64843.077808] __mmu_notifier_release+0x1a4/0x200 [64843.077816] exit_mmap+0x1bc/0x200 [64843.077822] ? walk_page_range+0x9c/0x120 [64843.077828] ? __cond_resched+0x1a/0x50 [64843.077833] ? mutex_lock+0x13/0x40 [64843.077839] ? uprobe_clear_state+0xac/0x120 [64843.077860] mmput+0x5f/0x140 [64843.077867] ib_umem_odp_map_dma_and_lock+0x21b/0x580 [ib_core] [64843.077931] pagefault_real_mr+0x9a/0x140 [mlx5_ib] [64843.077962] pagefault_mr+0xb4/0x550 [mlx5_ib] [64843.077992] pagefault_single_data_segment.constprop.0+0x2ac/0x560 [mlx5_ib] [64843.078022] mlx5_ib_eqe_pf_action+0x528/0x780 [mlx5_ib] [64843.078051] process_one_work+0x22b/0x3d0 [64843.078059] worker_thread+0x53/0x410 [64843.078065] ? process_one_work+0x3d0/0x3d0 [64843.078073] kthread+0x12a/0x150 [64843.078079] ? set_kthread_struct+0x50/0x50 [64843.078085] ret_from_fork+0x22/0x30 [64843.078093] Fixes: 36f30e486dce ("IB/core: Improve ODP to use hmm_range_fault()") Reviewed-by: Maor Gottlieb Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/74d93541ea533ef7daec6f126deb1072500aeb16.1661251841.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 90c85b17bf69..8a9e92068b15 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1225,6 +1225,7 @@ void mmput_async(struct mm_struct *mm) schedule_work(&mm->async_put_work); } } +EXPORT_SYMBOL_GPL(mmput_async); #endif /** -- cgit v1.2.3 From baf2c002402702accc30dcc8f19199f303638306 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 23 Aug 2022 17:20:29 +0200 Subject: rv/monitors: Make monitor's automata definition static Monitor's automata definition is only used locally, so make them static for all existing monitors. 
Link: https://lore.kernel.org/all/202208210332.gtHXje45-lkp@intel.com Link: https://lore.kernel.org/all/202208210358.6HH3OrVs-lkp@intel.com Link: https://lkml.kernel.org/r/a50e27c3738d6ef809f4201857229fed64799234.1661266564.git.bristot@kernel.org Fixes: ccc319dcb450 ("rv/monitor: Add the wwnr monitor") Fixes: 8812d21219b9 ("rv/monitor: Add the wip monitor skeleton created by dot2k") Reported-by: kernel test robot Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/monitors/wip/wip.h | 2 +- kernel/trace/rv/monitors/wwnr/wwnr.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h index c1c47e2305ef..dacc37b62a2c 100644 --- a/kernel/trace/rv/monitors/wip/wip.h +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -27,7 +27,7 @@ struct automaton_wip { bool final_states[state_max_wip]; }; -struct automaton_wip automaton_wip = { +static struct automaton_wip automaton_wip = { .state_names = { "preemptive", "non_preemptive" diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h index d1afe55cdd4c..118e576b91b4 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.h +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -27,7 +27,7 @@ struct automaton_wwnr { bool final_states[state_max_wwnr]; }; -struct automaton_wwnr automaton_wwnr = { +static struct automaton_wwnr automaton_wwnr = { .state_names = { "not_running", "running" -- cgit v1.2.3 From 54be5509422ebee333709c92441aa7185ca182fa Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Mon, 29 Aug 2022 10:10:48 -0700 Subject: tracepoint: Allow trace events in modules with TAINT_TEST Commit 2852ca7fba9f ("panic: Taint kernel if tests are run") introduced a new taint type, TAINT_TEST, to signal that an in-kernel test module has been loaded. TAINT_TEST taint type defaults into a 'bad_taint' list for kernel tracing and blocks the creation of trace events. This causes a problem for CXL testing where loading the cxl_test module makes all CXL modules out-of-tree, blocking any trace events. Trace events are in development for CXL at the moment and this issue was found in test with v6.0-rc1. Link: https://lkml.kernel.org/r/20220829171048.263065-1-alison.schofield@intel.com Fixes: 2852ca7fba9f7 ("panic: Taint kernel if tests are run") Reported-by: Ira Weiny Suggested-by: Dan Williams Tested-by: Ira Weiny Reviewed-by: David Gow Signed-off-by: Alison Schofield Signed-off-by: Steven Rostedt (Google) --- kernel/tracepoint.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 64ea283f2f86..ef42c1a11920 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -571,7 +571,8 @@ static void for_each_tracepoint_range( bool trace_module_has_bad_taint(struct module *mod) { return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) | - (1 << TAINT_UNSIGNED_MODULE)); + (1 << TAINT_UNSIGNED_MODULE) | + (1 << TAINT_TEST)); } static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list); @@ -647,7 +648,7 @@ static int tracepoint_module_coming(struct module *mod) /* * We skip modules that taint the kernel, especially those with different * module headers (for forced load), to make sure we don't cause a crash. - * Staging, out-of-tree, and unsigned GPL modules are fine. + * Staging, out-of-tree, unsigned GPL, and test modules are fine. 
*/ if (trace_module_has_bad_taint(mod)) return 0; -- cgit v1.2.3 From 54c3931957f6a6194d5972eccc36d052964b2abe Mon Sep 17 00:00:00 2001 From: Yipeng Zou Date: Thu, 1 Sep 2022 18:45:14 +0800 Subject: tracing: hold caller_addr to hardirq_{enable,disable}_ip Currently, The arguments passing to lockdep_hardirqs_{on,off} was fixed in CALLER_ADDR0. The function trace_hardirqs_on_caller should have been intended to use caller_addr to represent the address that caller wants to be traced. For example, lockdep log in riscv showing the last {enabled,disabled} at __trace_hardirqs_{on,off} all the time(if called by): [ 57.853175] hardirqs last enabled at (2519): __trace_hardirqs_on+0xc/0x14 [ 57.853848] hardirqs last disabled at (2520): __trace_hardirqs_off+0xc/0x14 After use trace_hardirqs_xx_caller, we can get more effective information: [ 53.781428] hardirqs last enabled at (2595): restore_all+0xe/0x66 [ 53.782185] hardirqs last disabled at (2596): ret_from_exception+0xa/0x10 Link: https://lkml.kernel.org/r/20220901104515.135162-2-zouyipeng@huawei.com Cc: stable@vger.kernel.org Fixes: c3bc8fd637a96 ("tracing: Centralize preemptirq tracepoints and unify their usage") Signed-off-by: Yipeng Zou Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_preemptirq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 95b58bd757ce..1e130da1b742 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -95,14 +95,14 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) } lockdep_hardirqs_on_prepare(); - lockdep_hardirqs_on(CALLER_ADDR0); + lockdep_hardirqs_on(caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); NOKPROBE_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { - lockdep_hardirqs_off(CALLER_ADDR0); + lockdep_hardirqs_off(caller_addr); if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); -- cgit v1.2.3 From cecf8e128ec69149fe53c9a7bafa505a4bee25d9 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sun, 4 Sep 2022 13:12:29 +0900 Subject: tracing: Fix to check event_mutex is held while accessing trigger list Since the check_user_trigger() is called outside of RCU read lock, this list_for_each_entry_rcu() caused a suspicious RCU usage warning. # echo hist:keys=pid > events/sched/sched_stat_runtime/trigger # cat events/sched/sched_stat_runtime/trigger [ 43.167032] [ 43.167418] ============================= [ 43.167992] WARNING: suspicious RCU usage [ 43.168567] 5.19.0-rc5-00029-g19ebe4651abf #59 Not tainted [ 43.169283] ----------------------------- [ 43.169863] kernel/trace/trace_events_trigger.c:145 RCU-list traversed in non-reader section!! ... However, this file->triggers list is safe when it is accessed under event_mutex is held. To fix this warning, adds a lockdep_is_held check to the list_for_each_entry_rcu(). 
Link: https://lkml.kernel.org/r/166226474977.223837.1992182913048377113.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events") Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cb866c3141af..918730d74932 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -142,7 +142,8 @@ static bool check_user_trigger(struct trace_event_file *file) { struct event_trigger_data *data; - list_for_each_entry_rcu(data, &file->triggers, list) { + list_for_each_entry_rcu(data, &file->triggers, list, + lockdep_is_held(&event_mutex)) { if (data->flags & EVENT_TRIGGER_FL_PROBE) continue; return true; -- cgit v1.2.3 From 93d71986a60c37395e0a1d2f7fe170ef1e0a8ba4 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 6 Sep 2022 22:12:10 +0800 Subject: rv/reactor: add __init/__exit annotations to module init/exit funcs Add missing __init/__exit annotations to module init/exit funcs. Link: https://lkml.kernel.org/r/20220906141210.132607-1-xiujianfeng@huawei.com Fixes: 135b881ea885 ("rv/reactor: Add the printk reactor") Fixes: e88043c0ac16 ("rv/reactor: Add the panic reactor") Signed-off-by: Xiu Jianfeng Acked-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/reactor_panic.c | 4 ++-- kernel/trace/rv/reactor_printk.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c index b698d05dd069..d65f6c25a87c 100644 --- a/kernel/trace/rv/reactor_panic.c +++ b/kernel/trace/rv/reactor_panic.c @@ -24,13 +24,13 @@ static struct rv_reactor rv_panic = { .react = rv_panic_reaction }; -static int register_react_panic(void) +static int __init register_react_panic(void) { rv_register_reactor(&rv_panic); return 0; } -static void unregister_react_panic(void) +static void __exit unregister_react_panic(void) { rv_unregister_reactor(&rv_panic); } diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c index 31899f953af4..4b6b7106a477 100644 --- a/kernel/trace/rv/reactor_printk.c +++ b/kernel/trace/rv/reactor_printk.c @@ -23,13 +23,13 @@ static struct rv_reactor rv_printk = { .react = rv_printk_reaction }; -static int register_react_printk(void) +static int __init register_react_printk(void) { rv_register_reactor(&rv_printk); return 0; } -static void unregister_react_printk(void) +static void __exit unregister_react_printk(void) { rv_unregister_reactor(&rv_printk); } -- cgit v1.2.3 From 81c12e922b97b94f64e84aa9bfe5a9d1fca2dc25 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 31 Aug 2022 00:38:18 -0600 Subject: Revert "swiotlb: panic if nslabs is too small" This reverts commit 0bf28fc40d89b1a3e00d1b79473bad4e9ca20ad1. Reasons: 1. new panic()s shouldn't be added [1]. 2. It does no "cleanup" but breaks MIPS [2]. 
v2: properly solved the conflict [3] with commit 20347fca71a38 ("swiotlb: split up the global swiotlb lock") Reported-by: kernel test robot Reported-by: Dan Carpenter [1] https://lore.kernel.org/r/CAHk-=wit-DmhMfQErY29JSPjFgebx_Ld+pnerc4J2Ag990WwAA@mail.gmail.com/ [2] https://lore.kernel.org/r/20220820012031.1285979-1-yuzhao@google.com/ [3] https://lore.kernel.org/r/202208310701.LKr1WDCh-lkp@intel.com/ Fixes: 0bf28fc40d89b ("swiotlb: panic if nslabs is too small") Signed-off-by: Yu Zhao Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index c5a9190b218f..dd8863987e0c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -326,9 +326,6 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, swiotlb_adjust_nareas(num_possible_cpus()); nslabs = default_nslabs; - if (nslabs < IO_TLB_MIN_SLABS) - panic("%s: nslabs = %lu too small\n", __func__, nslabs); - /* * By default allocate the bounce buffer memory from low memory, but * allow to pick a location everywhere for hypervisors with guest @@ -341,8 +338,7 @@ retry: else tlb = memblock_alloc_low(bytes, PAGE_SIZE); if (!tlb) { - pr_warn("%s: Failed to allocate %zu bytes tlb structure\n", - __func__, bytes); + pr_warn("%s: failed to allocate tlb structure\n", __func__); return; } -- cgit v1.2.3 From 2995b8002cefc7b0b00b8f9a0ce36601a8c390c0 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 15 Aug 2022 20:28:40 +0100 Subject: dma-debug: improve search for partial syncs When bucket_find_contains() tries to find the original entry for a partial sync, it manages to constrain its search in a way that is both too restrictive and not restrictive enough. A driver which only uses single mappings rather than scatterlists might not set max_seg_size, but could still technically perform a partial sync at an offset of more than 64KB into a sufficiently large mapping, so we could stop searching too early before reaching a legitimate entry. Conversely, if no valid entry is present and max_range is large enough, we can pointlessly search buckets that we've already searched, or that represent an impossible wrapping around the bottom of the address space. At worst, the (legitimate) case of max_seg_size == UINT_MAX can make the loop infinite. Replace the fragile and frankly hard-to-follow "range" logic with a simple counted loop for the number of possible hash buckets below the given address. 
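In other words, the search is bounded by how many hash buckets can possibly sit at or below the address being resolved, rather than by the device's segment-size limit. A simplified standalone sketch of that idea follows; find_containing_entry(), lookup_one_bucket() and the constant values are illustrative stand-ins rather than the dma-debug implementation, whose actual change appears in the diff below.

#include <linux/minmax.h>
#include <linux/types.h>

/* Illustrative placeholders; the real constants live in kernel/dma/debug.c. */
#define HASH_FN_SHIFT	13		/* one bucket covers 1 << 13 bytes of DMA address space */
#define HASH_SIZE	16384ULL	/* number of hash buckets */

struct dma_debug_entry;

/* Stand-in for the real per-bucket search. */
struct dma_debug_entry *lookup_one_bucket(u64 dev_addr);

/*
 * Sketch of the bounded walk: only buckets at or below dev_addr can contain
 * a mapping that includes dev_addr, so at most that many iterations are ever
 * needed, regardless of the device's max_seg_size.
 */
static struct dma_debug_entry *find_containing_entry(u64 dev_addr)
{
	int limit = min(HASH_SIZE, (dev_addr >> HASH_FN_SHIFT) + 1);

	for (int i = 0; i < limit; i++) {
		struct dma_debug_entry *entry = lookup_one_bucket(dev_addr);

		if (entry)
			return entry;
		/* Nothing in this bucket: step down one bucket and retry. */
		dev_addr -= 1 << HASH_FN_SHIFT;
	}
	return NULL;
}

Because limit can never exceed the number of buckets, the walk terminates even in the previously pathological max_seg_size == UINT_MAX case.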
Reported-by: Yunfei Wang Signed-off-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 2caafd13f8aa..18c93c2276ca 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -350,11 +350,10 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket, unsigned long *flags) { - unsigned int max_range = dma_get_max_seg_size(ref->dev); struct dma_debug_entry *entry, index = *ref; - unsigned int range = 0; + int limit = min(HASH_SIZE, (index.dev_addr >> HASH_FN_SHIFT) + 1); - while (range <= max_range) { + for (int i = 0; i < limit; i++) { entry = __hash_bucket_find(*bucket, ref, containing_match); if (entry) @@ -364,7 +363,6 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket, * Nothing found, go back a hash bucket */ put_hash_bucket(*bucket, *flags); - range += (1 << HASH_FN_SHIFT); index.dev_addr -= (1 << HASH_FN_SHIFT); *bucket = get_hash_bucket(&index, flags); } -- cgit v1.2.3 From 3f0461613ebcdc8c4073e235053d06d5aa58750f Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Fri, 19 Aug 2022 16:45:37 +0800 Subject: swiotlb: avoid potential left shift overflow The second operand passed to slot_addr() is declared as int or unsigned int in all call sites. The left-shift to get the offset of a slot can overflow if swiotlb size is larger than 4G. Convert the macro to an inline function and declare the second argument as phys_addr_t to avoid the potential overflow. Fixes: 26a7e094783d ("swiotlb: refactor swiotlb_tbl_map_single") Signed-off-by: Chao Gao Reviewed-by: Dongli Zhang Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index dd8863987e0c..1ce8977d911c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -575,7 +575,10 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size } } -#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT)) +static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx) +{ + return start + (idx << IO_TLB_SHIFT); +} /* * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. -- cgit v1.2.3 From 43b919017fe755ccd2b19afb2dc2d3da8f840038 Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Fri, 26 Aug 2022 17:50:46 +0800 Subject: swiotlb: fix a typo "overwirte" isn't a word. It should be "overwrite". Signed-off-by: Chao Gao Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 1ce8977d911c..0ef6b12f961d 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -764,7 +764,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, /* * When dir == DMA_FROM_DEVICE we could omit the copy from the orig * to the tlb buffer, if we knew for sure the device will - * overwirte the entire current content. But we don't. Thus + * overwrite the entire current content. But we don't. Thus * unconditional bounce may prevent leaking swiotlb content (i.e. * kernel memory) to user-space. 
*/ -- cgit v1.2.3 From 9fc18f6d56d5b79d527c17a8100a0965d18345cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 21 Aug 2022 16:06:44 +0200 Subject: dma-mapping: mark dma_supported static Now that the remaining users in drivers are gone, this function can be marked static. Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 49cbf3e33de7..27f272381cf2 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -707,7 +707,7 @@ int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, } EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous); -int dma_supported(struct device *dev, u64 mask) +static int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -721,7 +721,6 @@ int dma_supported(struct device *dev, u64 mask) return 1; return ops->dma_supported(dev, mask); } -EXPORT_SYMBOL(dma_supported); bool dma_pci_p2pdma_supported(struct device *dev) { -- cgit v1.2.3 From 1efda38d6f9ba26ac88b359c6277f1172db03f1e Mon Sep 17 00:00:00 2001 From: "Christian A. Ehrhardt" Date: Wed, 7 Sep 2022 22:09:17 +0200 Subject: kprobes: Prohibit probes in gate area The system call gate area counts as kernel text but trying to install a kprobe in this area fails with an Oops later on. To fix this explicitly disallow the gate area for kprobes. Found by syzkaller with the following reproducer: perf_event_open$cgroup(&(0x7f00000001c0)={0x6, 0x80, 0x0, 0x0, 0x0, 0x0, 0x80ffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_config_ext={0x0, 0xffffffffff600000}}, 0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x0) Sample report: BUG: unable to handle page fault for address: fffffbfff3ac6000 PGD 6dfcb067 P4D 6dfcb067 PUD 6df8f067 PMD 6de4d067 PTE 0 Oops: 0000 [#1] PREEMPT SMP KASAN NOPTI CPU: 0 PID: 21978 Comm: syz-executor.2 Not tainted 6.0.0-rc3-00363-g7726d4c3e60b-dirty #6 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:__insn_get_emulate_prefix arch/x86/lib/insn.c:91 [inline] RIP: 0010:insn_get_emulate_prefix arch/x86/lib/insn.c:106 [inline] RIP: 0010:insn_get_prefixes.part.0+0xa8/0x1110 arch/x86/lib/insn.c:134 Code: 49 be 00 00 00 00 00 fc ff df 48 8b 40 60 48 89 44 24 08 e9 81 00 00 00 e8 e5 4b 39 ff 4c 89 fa 4c 89 f9 48 c1 ea 03 83 e1 07 <42> 0f b6 14 32 38 ca 7f 08 84 d2 0f 85 06 10 00 00 48 89 d8 48 89 RSP: 0018:ffffc900088bf860 EFLAGS: 00010246 RAX: 0000000000040000 RBX: ffffffff9b9bebc0 RCX: 0000000000000000 RDX: 1ffffffff3ac6000 RSI: ffffc90002d82000 RDI: ffffc900088bf9e8 RBP: ffffffff9d630001 R08: 0000000000000000 R09: ffffc900088bf9e8 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000001 R13: ffffffff9d630000 R14: dffffc0000000000 R15: ffffffff9d630000 FS: 00007f63eef63640(0000) GS:ffff88806d000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: fffffbfff3ac6000 CR3: 0000000029d90005 CR4: 0000000000770ef0 PKRU: 55555554 Call Trace: insn_get_prefixes arch/x86/lib/insn.c:131 [inline] insn_get_opcode arch/x86/lib/insn.c:272 [inline] insn_get_modrm+0x64a/0x7b0 arch/x86/lib/insn.c:343 insn_get_sib+0x29a/0x330 arch/x86/lib/insn.c:421 insn_get_displacement+0x350/0x6b0 arch/x86/lib/insn.c:464 insn_get_immediate arch/x86/lib/insn.c:632 [inline] 
insn_get_length arch/x86/lib/insn.c:707 [inline] insn_decode+0x43a/0x490 arch/x86/lib/insn.c:747 can_probe+0xfc/0x1d0 arch/x86/kernel/kprobes/core.c:282 arch_prepare_kprobe+0x79/0x1c0 arch/x86/kernel/kprobes/core.c:739 prepare_kprobe kernel/kprobes.c:1160 [inline] register_kprobe kernel/kprobes.c:1641 [inline] register_kprobe+0xb6e/0x1690 kernel/kprobes.c:1603 __register_trace_kprobe kernel/trace/trace_kprobe.c:509 [inline] __register_trace_kprobe+0x26a/0x2d0 kernel/trace/trace_kprobe.c:477 create_local_trace_kprobe+0x1f7/0x350 kernel/trace/trace_kprobe.c:1833 perf_kprobe_init+0x18c/0x280 kernel/trace/trace_event_perf.c:271 perf_kprobe_event_init+0xf8/0x1c0 kernel/events/core.c:9888 perf_try_init_event+0x12d/0x570 kernel/events/core.c:11261 perf_init_event kernel/events/core.c:11325 [inline] perf_event_alloc.part.0+0xf7f/0x36a0 kernel/events/core.c:11619 perf_event_alloc kernel/events/core.c:12059 [inline] __do_sys_perf_event_open+0x4a8/0x2a00 kernel/events/core.c:12157 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f63ef7efaed Code: 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f63eef63028 EFLAGS: 00000246 ORIG_RAX: 000000000000012a RAX: ffffffffffffffda RBX: 00007f63ef90ff80 RCX: 00007f63ef7efaed RDX: 0000000000000000 RSI: ffffffffffffffff RDI: 00000000200001c0 RBP: 00007f63ef86019c R08: 0000000000000000 R09: 0000000000000000 R10: ffffffffffffffff R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000002 R14: 00007f63ef90ff80 R15: 00007f63eef43000 Modules linked in: CR2: fffffbfff3ac6000 ---[ end trace 0000000000000000 ]--- RIP: 0010:__insn_get_emulate_prefix arch/x86/lib/insn.c:91 [inline] RIP: 0010:insn_get_emulate_prefix arch/x86/lib/insn.c:106 [inline] RIP: 0010:insn_get_prefixes.part.0+0xa8/0x1110 arch/x86/lib/insn.c:134 Code: 49 be 00 00 00 00 00 fc ff df 48 8b 40 60 48 89 44 24 08 e9 81 00 00 00 e8 e5 4b 39 ff 4c 89 fa 4c 89 f9 48 c1 ea 03 83 e1 07 <42> 0f b6 14 32 38 ca 7f 08 84 d2 0f 85 06 10 00 00 48 89 d8 48 89 RSP: 0018:ffffc900088bf860 EFLAGS: 00010246 RAX: 0000000000040000 RBX: ffffffff9b9bebc0 RCX: 0000000000000000 RDX: 1ffffffff3ac6000 RSI: ffffc90002d82000 RDI: ffffc900088bf9e8 RBP: ffffffff9d630001 R08: 0000000000000000 R09: ffffc900088bf9e8 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000001 R13: ffffffff9d630000 R14: dffffc0000000000 R15: ffffffff9d630000 FS: 00007f63eef63640(0000) GS:ffff88806d000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: fffffbfff3ac6000 CR3: 0000000029d90005 CR4: 0000000000770ef0 PKRU: 55555554 ================================================================== Link: https://lkml.kernel.org/r/20220907200917.654103-1-lk@c--e.de cc: "Naveen N. Rao" cc: Anil S Keshavamurthy cc: "David S. Miller" Cc: stable@vger.kernel.org Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Christian A. 
Ehrhardt Signed-off-by: Steven Rostedt (Google) --- kernel/kprobes.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 08350e35aba2..ca9d834d0b84 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1562,6 +1562,7 @@ static int check_kprobe_address_safe(struct kprobe *p, /* Ensure it is not in reserved area nor out of text */ if (!(core_kernel_text((unsigned long) p->addr) || is_module_text_address((unsigned long) p->addr)) || + in_gate_area_no_mm((unsigned long) p->addr) || within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr) || static_call_text_reserved(p->addr, p->addr) || -- cgit v1.2.3
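The fix is a one-line addition, but the resulting policy is easier to see assembled in one place. In the syzkaller reproducer above, the probed address 0xffffffffff600000 is the x86-64 vsyscall page, which lies in the gate area: it is treated as kernel text yet cannot be safely decoded or probed. Below is a minimal sketch of the strengthened vetting; addr_is_probeable() is a hypothetical helper, and the real logic lives in check_kprobe_address_safe() alongside further checks such as the kprobe blacklist and jump-label/static-call reservations.

#include <linux/kernel.h>	/* core_kernel_text() */
#include <linux/module.h>	/* is_module_text_address() */
#include <linux/mm.h>		/* in_gate_area_no_mm() */

/*
 * Sketch only: an address must be genuine kernel or module text, and must
 * additionally not sit in the gate area, which is reported as kernel text
 * but has no instructions that kprobes can safely decode and patch.
 */
static bool addr_is_probeable(unsigned long addr)
{
	if (!core_kernel_text(addr) && !is_module_text_address(addr))
		return false;
	if (in_gate_area_no_mm(addr))
		return false;
	return true;
}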