From 90574a9c02f1ed46d9d8fec222fbcf375eb90e9b Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 6 Nov 2020 04:40:05 +0100 Subject: printk: remove unneeded dead-store assignment make clang-analyzer on x86_64 defconfig caught my attention with: kernel/printk/printk_ringbuffer.c:885:3: warning: Value stored to 'desc' is never read [clang-analyzer-deadcode.DeadStores] desc = to_desc(desc_ring, head_id); ^ Commit b6cf8b3f3312 ("printk: add lockless ringbuffer") introduced desc_reserve() with this unneeded dead-store assignment. As discussed with John Ogness privately, this is probably just some minor left-over from previous iterations of the ringbuffer implementation. So, simply remove this unneeded dead assignment to make clang-analyzer happy. As compilers will detect this unneeded assignment and optimize this anyway, the resulting object code is identical before and after this change. No functional change. No change to object code. Signed-off-by: Lukas Bulwahn Reviewed-by: Sergey Senozhatsky Reviewed-by: John Ogness Reviewed-by: Nathan Chancellor Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20201106034005.18822-1-lukas.bulwahn@gmail.com --- kernel/printk/printk_ringbuffer.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 24a960a89aa8..dd43c4cf16fb 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -882,8 +882,6 @@ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */ do { - desc = to_desc(desc_ring, head_id); - id = DESC_ID(head_id + 1); id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id); -- cgit v1.2.3 From abbaa433de07076fb8ef524b77ce55d94bad5fc5 Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Sat, 7 Nov 2020 15:45:44 +0800 Subject: bpf: Fix passing zero to PTR_ERR() in bpf_btf_printf_prepare There is a bug when passing zero to PTR_ERR() and return. Fix the smatch error. Fixes: c4d0bfb45068 ("bpf: Add bpf_snprintf_btf helper") Signed-off-by: Wang Qing Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/1604735144-686-1-git-send-email-wangqing@vivo.com --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4517c8b66518..5113fd423cdf 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1198,7 +1198,7 @@ static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, *btf = bpf_get_btf_vmlinux(); if (IS_ERR_OR_NULL(*btf)) - return PTR_ERR(*btf); + return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL; if (ptr->type_id > 0) *btf_id = ptr->type_id; -- cgit v1.2.3 From c583bcb8f5edd48c1798798e341f78afb9bf4f6f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 24 Sep 2020 15:11:55 -0700 Subject: rcu: Don't invoke try_invoke_on_locked_down_task() with irqs disabled The try_invoke_on_locked_down_task() function requires that interrupts be enabled, but it is called with interrupts disabled from rcu_print_task_stall(), resulting in an "IRQs not enabled as expected" diagnostic. 
This commit therefore updates rcu_print_task_stall() to accumulate a list of the first few tasks while holding the current leaf rcu_node structure's ->lock, then releases that lock and only then uses try_invoke_on_locked_down_task() to attempt to obtain per-task detailed information. Of course, as soon as ->lock is released, the task might exit, so the get_task_struct() function is used to prevent the task structure from going away in the meantime. Link: https://lore.kernel.org/lkml/000000000000903d5805ab908fc4@google.com/ Fixes: 5bef8da66a9c ("rcu: Add per-task state to RCU CPU stall warnings") Reported-by: syzbot+cb3b69ae80afd6535b0e@syzkaller.appspotmail.com Reported-by: syzbot+f04854e1c5c9e913cc27@syzkaller.appspotmail.com Tested-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 0fde39b8daab..ca21d28a0f98 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -249,13 +249,16 @@ static bool check_slow_task(struct task_struct *t, void *arg) /* * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each. + * sections, printing out the tid of each of the first few of them. */ -static int rcu_print_task_stall(struct rcu_node *rnp) +static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) { + int i = 0; int ndetected = 0; struct rcu_stall_chk_rdr rscr; struct task_struct *t; + struct task_struct *ts[8]; if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; @@ -264,6 +267,14 @@ static int rcu_print_task_stall(struct rcu_node *rnp) t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + get_task_struct(t); + ts[i++] = t; + if (i >= ARRAY_SIZE(ts)) + break; + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + for (i--; i; i--) { + t = ts[i]; if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr)) pr_cont(" P%d", t->pid); else @@ -273,6 +284,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) ".q"[rscr.rs.b.need_qs], ".e"[rscr.rs.b.exp_hint], ".l"[rscr.on_blkd_list]); + put_task_struct(t); ndetected++; } pr_cont("\n"); @@ -293,8 +305,9 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ -static int rcu_print_task_stall(struct rcu_node *rnp) +static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return 0; } #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ @@ -472,7 +485,6 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); rcu_for_each_leaf_node(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); - ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { for_each_leaf_node_possible_cpu(rnp, cpu) if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { @@ -480,7 +492,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) ndetected++; } } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + ndetected += rcu_print_task_stall(rnp, flags); // Releases rnp->lock. 
} for_each_possible_cpu(cpu) -- cgit v1.2.3 From f16e631333a8f12ae8128826e695db4b2a528407 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Wed, 11 Nov 2020 13:03:46 +0800 Subject: bpf: Fix unsigned 'datasec_id' compared with zero in check_pseudo_btf_id The unsigned variable datasec_id is assigned a return value from the call to check_pseudo_btf_id(), which may return negative error code. This fixes the following coccicheck warning: ./kernel/bpf/verifier.c:9616:5-15: WARNING: Unsigned expression compared with zero: datasec_id > 0 Fixes: eaa6bcb71ef6 ("bpf: Introduce bpf_per_cpu_ptr()") Reported-by: Tosk Robot Signed-off-by: Kaixu Xia Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: John Fastabend Cc: Hao Luo Link: https://lore.kernel.org/bpf/1605071026-25906-1-git-send-email-kaixuxia@tencent.com --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6200519582a6..6204ec705d80 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9572,12 +9572,13 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn_aux_data *aux) { - u32 datasec_id, type, id = insn->imm; const struct btf_var_secinfo *vsi; const struct btf_type *datasec; const struct btf_type *t; const char *sym_name; bool percpu = false; + u32 type, id = insn->imm; + s32 datasec_id; u64 addr; int i; -- cgit v1.2.3 From f782e2c300a717233b64697affda3ea7aac00b2b Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Fri, 13 Nov 2020 17:17:56 +0000 Subject: bpf: Relax return code check for subprograms Currently verifier enforces return code checks for subprograms in the same manner as it does for program entry points. This prevents returning arbitrary scalar values from subprograms. Scalar type of returned values is checked by btf_prepare_func_args() and hence it should be safe to allow only scalars for now. Relax return code checks for subprograms and allow any correct scalar values. 
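As an illustration only (this snippet is not part of the patch; the section name, function names and use of libbpf's bpf_helpers.h are assumptions), a function-by-function verified subprogram may now return an arbitrary scalar, while the entry point keeps its program-type specific return-range check:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Kept out of line so it is verified as a separate subprogram. */
__attribute__((noinline))
int lookup_mark(__u32 ifindex)
{
	return ifindex * 100 + 42;	/* arbitrary scalar, now accepted for a subprogram */
}

SEC("classifier")
int prog(struct __sk_buff *skb)
{
	return lookup_mark(skb->ifindex) & 0x7;
}

char LICENSE[] SEC("license") = "GPL";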
Fixes: 51c39bb1d5d10 (bpf: Introduce function-by-function verification) Signed-off-by: Dmitrii Banshchikov Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201113171756.90594-1-me@ubique.spb.ru --- kernel/bpf/verifier.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6204ec705d80..1388bf733071 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7786,9 +7786,11 @@ static int check_return_code(struct bpf_verifier_env *env) struct tnum range = tnum_range(0, 1); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; + const bool is_subprog = env->cur_state->frame[0]->subprogno; /* LSM and struct_ops func-ptr's return type could be "void" */ - if ((prog_type == BPF_PROG_TYPE_STRUCT_OPS || + if (!is_subprog && + (prog_type == BPF_PROG_TYPE_STRUCT_OPS || prog_type == BPF_PROG_TYPE_LSM) && !prog->aux->attach_func_proto->type) return 0; @@ -7808,6 +7810,16 @@ static int check_return_code(struct bpf_verifier_env *env) return -EACCES; } + reg = cur_regs(env) + BPF_REG_0; + if (is_subprog) { + if (reg->type != SCALAR_VALUE) { + verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", + reg_type_str[reg->type]); + return -EINVAL; + } + return 0; + } + switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || @@ -7861,7 +7873,6 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; } - reg = cur_regs(env) + BPF_REG_0; if (reg->type != SCALAR_VALUE) { verbose(env, "At program exit the register R0 is not a known value (%s)\n", reg_type_str[reg->type]); -- cgit v1.2.3 From 8e1ac4299a6e8726de42310d9c1379f188140c71 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 12 Nov 2020 11:12:01 +0000 Subject: sched/fair: Fix overutilized update in enqueue_task_fair() enqueue_task_fair() attempts to skip the overutilized update for new tasks as their util_avg is not accurate yet. However, the flag we check to do so is overwritten earlier on in the function, which makes the condition pretty much a nop. Fix this by saving the flag early on. Fixes: 2802bf3cd936 ("sched/fair: Add over-utilization/tipping point indicator") Reported-by: Rick Yiu Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20201112111201.2081902-1-qperret@google.com --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8e563cf2b5e7..56a8ca93a971 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5477,6 +5477,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); + int task_new = !(flags & ENQUEUE_WAKEUP); /* * The code below (indirectly) updates schedutil which looks at @@ -5549,7 +5550,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * into account, but that is not straightforward to implement, * and the following generally works well enough in practice. 
*/ - if (flags & ENQUEUE_WAKEUP) + if (!task_new) update_overutilized_status(rq); enqueue_throttle: -- cgit v1.2.3 From ec618b84f6e15281cc3660664d34cd0dd2f2579e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Sep 2020 13:50:42 +0200 Subject: sched: Fix rq->nr_iowait ordering schedule() ttwu() deactivate_task(); if (p->on_rq && ...) // false atomic_dec(&task_rq(p)->nr_iowait); if (prev->in_iowait) atomic_inc(&rq->nr_iowait); Allows nr_iowait to be decremented before it gets incremented, resulting in more dodgy IO-wait numbers than usual. Note that because we can now do ttwu_queue_wakelist() before p->on_cpu==0, we lose the natural ordering and have to further delay the decrement. Fixes: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") Reported-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20201117093829.GD3121429@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2003a7d5ab5..9f0ebfb0d17b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2501,7 +2501,12 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, #ifdef CONFIG_SMP if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; + else #endif + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } activate_task(rq, p, en_flags); ttwu_do_wakeup(rq, p, wake_flags, rf); @@ -2888,11 +2893,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) goto unlock; - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - #ifdef CONFIG_SMP /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be @@ -2963,6 +2963,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } + wake_flags |= WF_MIGRATED; psi_ttwu_dequeue(p); set_task_cpu(p, cpu); -- cgit v1.2.3 From 2279f540ea7d05f22d2f0c4224319330228586bc Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 17 Nov 2020 07:14:32 +0100 Subject: sched/deadline: Fix priority inheritance with multiple scheduling classes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Glenn reported that "an application [he developed produces] a BUG in deadline.c when a SCHED_DEADLINE task contends with CFS tasks on nested PTHREAD_PRIO_INHERIT mutexes. I believe the bug is triggered when a CFS task that was boosted by a SCHED_DEADLINE task boosts another CFS task (nested priority inheritance). ------------[ cut here ]------------ kernel BUG at kernel/sched/deadline.c:1462! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 12 PID: 19171 Comm: dl_boost_bug Tainted: ... Hardware name: ... RIP: 0010:enqueue_task_dl+0x335/0x910 Code: ... 
RSP: 0018:ffffc9000c2bbc68 EFLAGS: 00010002 RAX: 0000000000000009 RBX: ffff888c0af94c00 RCX: ffffffff81e12500 RDX: 000000000000002e RSI: ffff888c0af94c00 RDI: ffff888c10b22600 RBP: ffffc9000c2bbd08 R08: 0000000000000009 R09: 0000000000000078 R10: ffffffff81e12440 R11: ffffffff81e1236c R12: ffff888bc8932600 R13: ffff888c0af94eb8 R14: ffff888c10b22600 R15: ffff888bc8932600 FS: 00007fa58ac55700(0000) GS:ffff888c10b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa58b523230 CR3: 0000000bf44ab003 CR4: 00000000007606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? intel_pstate_update_util_hwp+0x13/0x170 rt_mutex_setprio+0x1cc/0x4b0 task_blocks_on_rt_mutex+0x225/0x260 rt_spin_lock_slowlock_locked+0xab/0x2d0 rt_spin_lock_slowlock+0x50/0x80 hrtimer_grab_expiry_lock+0x20/0x30 hrtimer_cancel+0x13/0x30 do_nanosleep+0xa0/0x150 hrtimer_nanosleep+0xe1/0x230 ? __hrtimer_init_sleeper+0x60/0x60 __x64_sys_nanosleep+0x8d/0xa0 do_syscall_64+0x4a/0x100 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7fa58b52330d ... ---[ end trace 0000000000000002 ]— He also provided a simple reproducer creating the situation below: So the execution order of locking steps are the following (N1 and N2 are non-deadline tasks. D1 is a deadline task. M1 and M2 are mutexes that are enabled * with priority inheritance.) Time moves forward as this timeline goes down: N1 N2 D1 | | | | | | Lock(M1) | | | | | | Lock(M2) | | | | | | Lock(M2) | | | | Lock(M1) | | (!!bug triggered!) | Daniel reported a similar situation as well, by just letting ksoftirqd run with DEADLINE (and eventually block on a mutex). Problem is that boosted entities (Priority Inheritance) use static DEADLINE parameters of the top priority waiter. However, there might be cases where top waiter could be a non-DEADLINE entity that is currently boosted by a DEADLINE entity from a different lock chain (i.e., nested priority chains involving entities of non-DEADLINE classes). In this case, top waiter static DEADLINE parameters could be null (initialized to 0 at fork()) and replenish_dl_entity() would hit a BUG(). Fix this by keeping track of the original donor and using its parameters when a task is boosted. 
Reported-by: Glenn Elliott Reported-by: Daniel Bristot de Oliveira Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Tested-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201117061432.517340-1-juri.lelli@redhat.com --- kernel/sched/core.c | 11 +++--- kernel/sched/deadline.c | 97 +++++++++++++++++++++++++++---------------------- 2 files changed, 59 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9f0ebfb0d17b..e7e453492cff 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4912,20 +4912,21 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) if (!dl_prio(p->normal_prio) || (pi_task && dl_prio(pi_task->prio) && dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.dl_boosted = 1; + p->dl.pi_se = pi_task->dl.pi_se; queue_flag |= ENQUEUE_REPLENISH; - } else - p->dl.dl_boosted = 0; + } else { + p->dl.pi_se = &p->dl; + } p->sched_class = &dl_sched_class; } else if (rt_prio(prio)) { if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; + p->dl.pi_se = &p->dl; if (oldprio < prio) queue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; + p->dl.pi_se = &p->dl; if (rt_prio(oldprio)) p->rt.timeout = 0; p->sched_class = &fair_sched_class; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6d93f4518734..949bc5c083c1 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,6 +43,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +#ifdef CONFIG_RT_MUTEXES +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se->pi_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return pi_of(dl_se) != dl_se; +} +#else +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return false; +} +#endif + #ifdef CONFIG_SMP static inline struct dl_bw *dl_bw_of(int i) { @@ -698,7 +720,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - WARN_ON(dl_se->dl_boosted); + WARN_ON(is_dl_boosted(dl_se)); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); /* @@ -736,21 +758,20 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * could happen are, typically, a entity voluntarily trying to overcome its * runtime, or it just underestimated it during sched_setattr(). */ -static void replenish_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void replenish_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - BUG_ON(pi_se->dl_runtime <= 0); + BUG_ON(pi_of(dl_se)->dl_runtime <= 0); /* * This could be the case for a !-dl task that is boosted. * Just go with full inherited parameters. */ if (dl_se->dl_deadline == 0) { - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } if (dl_se->dl_yielded && dl_se->runtime > 0) @@ -763,8 +784,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * arbitrary large. 
*/ while (dl_se->runtime <= 0) { - dl_se->deadline += pi_se->dl_period; - dl_se->runtime += pi_se->dl_runtime; + dl_se->deadline += pi_of(dl_se)->dl_period; + dl_se->runtime += pi_of(dl_se)->dl_runtime; } /* @@ -778,8 +799,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { printk_deferred_once("sched: DL replenish lagged too much\n"); - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } if (dl_se->dl_yielded) @@ -812,8 +833,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * task with deadline equal to period this is the same of using * dl_period instead of dl_deadline in the equation above. */ -static bool dl_entity_overflow(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, u64 t) +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t) { u64 left, right; @@ -835,9 +855,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, * of anything below microseconds resolution is actually fiction * (but still we want to give the user that illusion >;). */ - left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); + left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); right = ((dl_se->deadline - t) >> DL_SCALE) * - (pi_se->dl_runtime >> DL_SCALE); + (pi_of(dl_se)->dl_runtime >> DL_SCALE); return dl_time_before(right, left); } @@ -922,24 +942,23 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) * Please refer to the comments update_dl_revised_wakeup() function to find * more about the Revised CBS rule. */ -static void update_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void update_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); if (dl_time_before(dl_se->deadline, rq_clock(rq)) || - dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + dl_entity_overflow(dl_se, rq_clock(rq))) { if (unlikely(!dl_is_implicit(dl_se) && !dl_time_before(dl_se->deadline, rq_clock(rq)) && - !dl_se->dl_boosted)){ + !is_dl_boosted(dl_se))) { update_dl_revised_wakeup(dl_se, rq); return; } - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } } @@ -1038,7 +1057,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * The task might have been boosted by someone else and might be in the * boosting/deboosting path, its not throttled. */ - if (dl_se->dl_boosted) + if (is_dl_boosted(dl_se)) goto unlock; /* @@ -1066,7 +1085,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * but do not enqueue -- wait for our wakeup to do that. 
*/ if (!task_on_rq_queued(p)) { - replenish_dl_entity(dl_se, dl_se); + replenish_dl_entity(dl_se); goto unlock; } @@ -1156,7 +1175,7 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) if (dl_time_before(dl_se->deadline, rq_clock(rq)) && dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { - if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p))) return; dl_se->dl_throttled = 1; if (dl_se->runtime > 0) @@ -1287,7 +1306,7 @@ throttle: dl_se->dl_overrun = 1; __dequeue_task_dl(rq, curr, 0); - if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) @@ -1481,8 +1500,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) } static void -enqueue_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, int flags) +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) { BUG_ON(on_dl_rq(dl_se)); @@ -1493,9 +1511,9 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, */ if (flags & ENQUEUE_WAKEUP) { task_contending(dl_se, flags); - update_dl_entity(dl_se, pi_se); + update_dl_entity(dl_se); } else if (flags & ENQUEUE_REPLENISH) { - replenish_dl_entity(dl_se, pi_se); + replenish_dl_entity(dl_se); } else if ((flags & ENQUEUE_RESTORE) && dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { @@ -1512,19 +1530,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { - struct task_struct *pi_task = rt_mutex_get_top_task(p); - struct sched_dl_entity *pi_se = &p->dl; - - /* - * Use the scheduling parameters of the top pi-waiter task if: - * - we have a top pi-waiter which is a SCHED_DEADLINE task AND - * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is - * smaller than our deadline OR we are a !SCHED_DEADLINE task getting - * boosted due to a SCHED_DEADLINE pi-waiter). - * Otherwise we keep our runtime and deadline. - */ - if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { - pi_se = &pi_task->dl; + if (is_dl_boosted(&p->dl)) { /* * Because of delays in the detection of the overrun of a * thread's runtime, it might be the case that a thread @@ -1557,7 +1563,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * the throttle. 
*/ p->dl.dl_throttled = 0; - BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); + BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH); return; } @@ -1594,7 +1600,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) return; } - enqueue_dl_entity(&p->dl, pi_se, flags); + enqueue_dl_entity(&p->dl, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); @@ -2787,11 +2793,14 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_density = 0; - dl_se->dl_boosted = 0; dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; dl_se->dl_overrun = 0; + +#ifdef CONFIG_RT_MUTEXES + dl_se->pi_se = dl_se; +#endif } bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) -- cgit v1.2.3 From 43be4388e94b915799a24f0eaf664bf95b85231f Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 13 Nov 2020 19:05:03 +0800 Subject: lockdep: Put graph lock/unlock under lock_recursion protection A warning was hit when running xfstests/generic/068 in a Hyper-V guest: [...] ------------[ cut here ]------------ [...] DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled()) [...] WARNING: CPU: 2 PID: 1350 at kernel/locking/lockdep.c:5280 check_flags.part.0+0x165/0x170 [...] ... [...] Workqueue: events pwq_unbound_release_workfn [...] RIP: 0010:check_flags.part.0+0x165/0x170 [...] ... [...] Call Trace: [...] lock_is_held_type+0x72/0x150 [...] ? lock_acquire+0x16e/0x4a0 [...] rcu_read_lock_sched_held+0x3f/0x80 [...] __send_ipi_one+0x14d/0x1b0 [...] hv_send_ipi+0x12/0x30 [...] __pv_queued_spin_unlock_slowpath+0xd1/0x110 [...] __raw_callee_save___pv_queued_spin_unlock_slowpath+0x11/0x20 [...] .slowpath+0x9/0xe [...] lockdep_unregister_key+0x128/0x180 [...] pwq_unbound_release_workfn+0xbb/0xf0 [...] process_one_work+0x227/0x5c0 [...] worker_thread+0x55/0x3c0 [...] ? process_one_work+0x5c0/0x5c0 [...] kthread+0x153/0x170 [...] ? __kthread_bind_mask+0x60/0x60 [...] ret_from_fork+0x1f/0x30 The cause of the problem is we have call chain lockdep_unregister_key() -> lockdep_unlock() -> arch_spin_unlock() -> __pv_queued_spin_unlock_slowpath() -> pv_kick() -> __send_ipi_one() -> trace_hyperv_send_ipi_one(). Although this particular warning is triggered because Hyper-V has a trace point in ipi sending, but in general arch_spin_unlock() may call another function having a trace point in it, so put the arch_spin_lock() and arch_spin_unlock() after lock_recursion protection to fix this problem and avoid similiar problems. 
Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201113110512.1056501-1-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d9fb9e19d2ed..c1418b47f625 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -108,19 +108,21 @@ static inline void lockdep_lock(void) { DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + __this_cpu_inc(lockdep_recursion); arch_spin_lock(&__lock); __owner = current; - __this_cpu_inc(lockdep_recursion); } static inline void lockdep_unlock(void) { + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current)) return; - __this_cpu_dec(lockdep_recursion); __owner = NULL; arch_spin_unlock(&__lock); + __this_cpu_dec(lockdep_recursion); } static inline bool lockdep_assert_locked(void) -- cgit v1.2.3 From cf23705244c947151179f929774fabf71e239eee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Fri, 30 Oct 2020 13:38:48 +0100 Subject: ptrace: Set PF_SUPERPRIV when checking capability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 69f594a38967 ("ptrace: do not audit capability check when outputing /proc/pid/stat") replaced the use of ns_capable() with has_ns_capability{,_noaudit}() which doesn't set PF_SUPERPRIV. Commit 6b3ad6649a4c ("ptrace: reintroduce usage of subjective credentials in ptrace_has_cap()") replaced has_ns_capability{,_noaudit}() with security_capable(), which doesn't set PF_SUPERPRIV neither. Since commit 98f368e9e263 ("kernel: Add noaudit variant of ns_capable()"), a new ns_capable_noaudit() helper is available. Let's use it! As a result, the signature of ptrace_has_cap() is restored to its original one. Cc: Christian Brauner Cc: Eric Paris Cc: Jann Horn Cc: Kees Cook Cc: Oleg Nesterov Cc: Serge E. Hallyn Cc: Tyler Hicks Cc: stable@vger.kernel.org Fixes: 6b3ad6649a4c ("ptrace: reintroduce usage of subjective credentials in ptrace_has_cap()") Fixes: 69f594a38967 ("ptrace: do not audit capability check when outputing /proc/pid/stat") Signed-off-by: Mickaël Salaün Reviewed-by: Jann Horn Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20201030123849.770769-2-mic@digikod.net --- kernel/ptrace.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 43d6179508d6..79de1294f8eb 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -264,17 +264,11 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) return ret; } -static bool ptrace_has_cap(const struct cred *cred, struct user_namespace *ns, - unsigned int mode) +static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode) { - int ret; - if (mode & PTRACE_MODE_NOAUDIT) - ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); - else - ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NONE); - - return ret == 0; + return ns_capable_noaudit(ns, CAP_SYS_PTRACE); + return ns_capable(ns, CAP_SYS_PTRACE); } /* Returns 0 on success, -errno on denial. 
*/ @@ -326,7 +320,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) gid_eq(caller_gid, tcred->sgid) && gid_eq(caller_gid, tcred->gid)) goto ok; - if (ptrace_has_cap(cred, tcred->user_ns, mode)) + if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; rcu_read_unlock(); return -EPERM; @@ -345,7 +339,7 @@ ok: mm = task->mm; if (mm && ((get_dumpable(mm) != SUID_DUMP_USER) && - !ptrace_has_cap(cred, mm->user_ns, mode))) + !ptrace_has_cap(mm->user_ns, mode))) return -EPERM; return security_ptrace_access_check(task, mode); -- cgit v1.2.3 From fb14528e443646dd3fd02df4437fcf5265b66baa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Fri, 30 Oct 2020 13:38:49 +0100 Subject: seccomp: Set PF_SUPERPRIV when checking capability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the use of security_capable(current_cred(), ...) with ns_capable_noaudit() which set PF_SUPERPRIV. Since commit 98f368e9e263 ("kernel: Add noaudit variant of ns_capable()"), a new ns_capable_noaudit() helper is available. Let's use it! Cc: Jann Horn Cc: Kees Cook Cc: Tyler Hicks Cc: Will Drewry Cc: stable@vger.kernel.org Fixes: e2cfabdfd075 ("seccomp: add system call filtering using BPF") Signed-off-by: Mickaël Salaün Reviewed-by: Jann Horn Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20201030123849.770769-3-mic@digikod.net --- kernel/seccomp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 8ad7a293255a..53a7d1512dd7 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include @@ -558,8 +558,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) * behavior of privileged children. */ if (!task_no_new_privs(current) && - security_capable(current_cred(), current_user_ns(), - CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) != 0) + !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); /* Allocate a new seccomp_filter */ -- cgit v1.2.3 From 6fa6d28051e9fcaa1570e69648ea13a353a5d218 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Tue, 17 Nov 2020 12:05:45 -0800 Subject: lib/strncpy_from_user.c: Mask out bytes after NUL terminator. do_strncpy_from_user() may copy some extra bytes after the NUL terminator into the destination buffer. This usually does not matter for normal string operations. However, when BPF programs key BPF maps with strings, this matters a lot. A BPF program may read strings from user memory by calling the bpf_probe_read_user_str() helper which eventually calls do_strncpy_from_user(). The program can then key a map with the destination buffer. BPF map keys are fixed-width and string-agnostic, meaning that map keys are treated as a set of bytes. The issue is when do_strncpy_from_user() overcopies bytes after the NUL terminator, it can result in seemingly identical strings occupying multiple slots in a BPF map. This behavior is subtle and totally unexpected by the user. This commit masks out the bytes following the NUL while preserving long-sized stride in the fast path. 
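To make the failure mode concrete (editorial illustration, not part of the patch; plain userspace C with no kernel interfaces), two fixed-width keys that hold the same C string but differ in the bytes after the NUL terminator compare equal as strings yet distinct as raw map keys:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Same C string "foo", but the first key carries junk copied past the NUL. */
	char junk_key[8]   = { 'f', 'o', 'o', '\0', 'J', 'U', 'N', 'K' };
	char masked_key[8] = { 'f', 'o', 'o', '\0',  0,   0,   0,   0  };

	printf("as strings:         %s\n", strcmp(junk_key, masked_key) ? "different" : "identical");
	printf("as 8-byte map keys: %s\n", memcmp(junk_key, masked_key, 8) ? "different" : "identical");
	return 0;
}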
Fixes: 6ae08ae3dea2 ("bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers") Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/21efc982b3e9f2f7b0379eed642294caaa0c27a7.1605642949.git.dxu@dxuuu.xyz --- kernel/trace/bpf_trace.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5113fd423cdf..048c655315f1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -181,6 +181,16 @@ bpf_probe_read_user_str_common(void *dst, u32 size, { int ret; + /* + * NB: We rely on strncpy_from_user() not copying junk past the NUL + * terminator into `dst`. + * + * strncpy_from_user() does long-sized strides in the fast path. If the + * strncpy does not mask out the bytes after the NUL in `unsafe_ptr`, + * then there could be junk after the NUL in `dst`. If user takes `dst` + * and keys a hash map with it, then semantically identical strings can + * occupy multiple entries in the map. + */ ret = strncpy_from_user_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); -- cgit v1.2.3 From 2801a5da5b25b7af9dd2addd19b2315c02d17b64 Mon Sep 17 00:00:00 2001 From: Luo Meng Date: Wed, 18 Nov 2020 22:49:31 +0900 Subject: fail_function: Remove a redundant mutex unlock Fix a mutex_unlock() issue where before copy_from_user() is not called mutex_locked. Fixes: 4b1a29a7f542 ("error-injection: Support fault injection framework") Reported-by: Hulk Robot Signed-off-by: Luo Meng Signed-off-by: Masami Hiramatsu Signed-off-by: Alexei Starovoitov Acked-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/160570737118.263807.8358435412898356284.stgit@devnote2 --- kernel/fail_function.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 63b349168da7..b0b1ad93fa95 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -253,7 +253,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, if (copy_from_user(buf, buffer, count)) { ret = -EFAULT; - goto out; + goto out_free; } buf[count] = '\0'; sym = strstrip(buf); @@ -307,8 +307,9 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, ret = count; } out: - kfree(buf); mutex_unlock(&fei_lock); +out_free: + kfree(buf); return ret; } -- cgit v1.2.3 From 58c644ba512cfbc2e39b758dd979edd1d6d00e27 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Nov 2020 11:50:35 +0100 Subject: sched/idle: Fix arch_cpu_idle() vs tracing We call arch_cpu_idle() with RCU disabled, but then use local_irq_{en,dis}able(), which invokes tracing, which relies on RCU. Switch all arch_cpu_idle() implementations to use raw_local_irq_{en,dis}able() and carefully manage the lockdep,rcu,tracing state like we do in entry. 
(XXX: we really should change arch_cpu_idle() to not return with interrupts enabled) Reported-by: Sven Schnelle Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mark Rutland Tested-by: Mark Rutland Link: https://lkml.kernel.org/r/20201120114925.594122626@infradead.org --- kernel/sched/idle.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 24d0ee26377d..c6932b8f4467 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -78,7 +78,7 @@ void __weak arch_cpu_idle_dead(void) { } void __weak arch_cpu_idle(void) { cpu_idle_force_poll = 1; - local_irq_enable(); + raw_local_irq_enable(); } /** @@ -94,9 +94,35 @@ void __cpuidle default_idle_call(void) trace_cpu_idle(1, smp_processor_id()); stop_critical_timings(); + + /* + * arch_cpu_idle() is supposed to enable IRQs, however + * we can't do that because of RCU and tracing. + * + * Trace IRQs enable here, then switch off RCU, and have + * arch_cpu_idle() use raw_local_irq_enable(). Note that + * rcu_idle_enter() relies on lockdep IRQ state, so switch that + * last -- this is very similar to the entry code. + */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(_THIS_IP_); rcu_idle_enter(); + lockdep_hardirqs_on(_THIS_IP_); + arch_cpu_idle(); + + /* + * OK, so IRQs are enabled here, but RCU needs them disabled to + * turn itself back on.. funny thing is that disabling IRQs + * will cause tracing, which needs RCU. Jump through hoops to + * make it 'work'. + */ + raw_local_irq_disable(); + lockdep_hardirqs_off(_THIS_IP_); rcu_idle_exit(); + lockdep_hardirqs_on(_THIS_IP_); + raw_local_irq_enable(); + start_critical_timings(); trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } -- cgit v1.2.3 From 8ff00399b153440c1c83e20c43020385b416415b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 26 Nov 2020 20:25:29 +1000 Subject: kernel/cpu: add arch override for clear_tasks_mm_cpumask() mm handling powerpc/64s keeps a counter in the mm which counts bits set in mm_cpumask as well as other things. This means it can't use generic code to clear bits out of the mask and doesn't adjust the arch specific counter. Add an arch override that allows powerpc/64s to use clear_tasks_mm_cpumask(). 
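The powerpc side lives outside kernel/ and is not shown in the diff below. As a rough sketch only (the context.active_cpus counter name is an assumption modelled on powerpc/64s, not taken from this patch), an architecture would define the override in one of its headers so the generic fallback is not used:

/* Sketch of an arch override; the per-mm active_cpus counter is assumed. */
#define arch_clear_mm_cpumask_cpu(cpu, mm)				\
	do {								\
		if (cpumask_test_cpu(cpu, mm_cpumask(mm))) {		\
			atomic_dec(&(mm)->context.active_cpus);		\
			cpumask_clear_cpu(cpu, mm_cpumask(mm));		\
		}							\
	} while (0)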
Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Acked-by: Peter Zijlstra (Intel) Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201126102530.691335-4-npiggin@gmail.com --- kernel/cpu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 6ff2578ecf17..2b8d7a5db383 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -815,6 +815,10 @@ void __init cpuhp_threads_init(void) } #ifdef CONFIG_HOTPLUG_CPU +#ifndef arch_clear_mm_cpumask_cpu +#define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm)) +#endif + /** * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU * @cpu: a CPU id @@ -850,7 +854,7 @@ void clear_tasks_mm_cpumask(int cpu) t = find_lock_task_mm(p); if (!t) continue; - cpumask_clear_cpu(cpu, mm_cpumask(t->mm)); + arch_clear_mm_cpumask_cpu(cpu, t->mm); task_unlock(t); } rcu_read_unlock(); -- cgit v1.2.3 From 4ad9921af4f18490980369f7d60f90ade0195812 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 26 Nov 2020 12:54:36 +0106 Subject: printk: finalize records with trailing newlines Any record with a trailing newline (LOG_NEWLINE flag) cannot be continued because the newline has been stripped and will not be visible if the message is appended. This was already handled correctly when committing in log_output() but was not handled correctly when committing in log_store(). Fixes: f5f022e53b87 ("printk: reimplement log_cont using record extension") Link: https://lore.kernel.org/r/20201126114836.14750-1-john.ogness@linutronix.de Reported-by: Kefeng Wang Signed-off-by: John Ogness Tested-by: Kefeng Wang Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fe64a49344bf..bc1e3b5a97bd 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -528,8 +528,8 @@ static int log_store(u32 caller_id, int facility, int level, if (dev_info) memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); - /* insert message */ - if ((flags & LOG_CONT) || !(flags & LOG_NEWLINE)) + /* A message without a trailing newline can be continued. */ + if (!(flags & LOG_NEWLINE)) prb_commit(&e); else prb_final_commit(&e); -- cgit v1.2.3 From bb4c6910c8b41623104c2e64a30615682689a54d Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Thu, 26 Nov 2020 09:28:51 +0100 Subject: genirq/irqdomain: Add an irq_create_mapping_affinity() function There is currently no way to convey the affinity of an interrupt via irq_create_mapping(), which creates issues for devices that expect that affinity to be managed by the kernel. In order to sort this out, rename irq_create_mapping() to irq_create_mapping_affinity() with an additional affinity parameter that can be passed down to irq_domain_alloc_descs(). irq_create_mapping() is re-implemented as a wrapper around irq_create_mapping_affinity(). No functional change. 
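The wrapper is added in include/linux/irqdomain.h and therefore does not appear in the kernel/-limited diff below; it presumably reduces to forwarding with a NULL affinity descriptor, along these lines:

/* Sketch of the compatibility wrapper (lives in include/linux/irqdomain.h,
 * outside the diff shown here): existing callers keep the old signature and
 * simply pass no affinity hint. */
static inline unsigned int irq_create_mapping(struct irq_domain *domain,
					      irq_hw_number_t hwirq)
{
	return irq_create_mapping_affinity(domain, hwirq, NULL);
}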
Fixes: e75eafb9b039 ("genirq/msi: Switch to new irq spreading infrastructure") Signed-off-by: Laurent Vivier Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kurz Cc: Michael Ellerman Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201126082852.1178497-2-lvivier@redhat.com --- kernel/irq/irqdomain.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf8b374b892d..e4ca69608f3b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -624,17 +624,19 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) EXPORT_SYMBOL_GPL(irq_create_direct_mapping); /** - * irq_create_mapping() - Map a hardware interrupt into linux irq space + * irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space * @domain: domain owning this hardware interrupt or NULL for default domain * @hwirq: hardware irq number in that domain space + * @affinity: irq affinity * * Only one mapping per hardware interrupt is permitted. Returns a linux * irq number. * If the sense/trigger is to be specified, set_irq_type() should be called * on the number returned from that call. */ -unsigned int irq_create_mapping(struct irq_domain *domain, - irq_hw_number_t hwirq) +unsigned int irq_create_mapping_affinity(struct irq_domain *domain, + irq_hw_number_t hwirq, + const struct irq_affinity_desc *affinity) { struct device_node *of_node; int virq; @@ -660,7 +662,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Allocate a virtual interrupt number */ - virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL); + virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), + affinity); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; @@ -676,7 +679,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, return virq; } -EXPORT_SYMBOL_GPL(irq_create_mapping); +EXPORT_SYMBOL_GPL(irq_create_mapping_affinity); /** * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs -- cgit v1.2.3 From 55ea4cf403800af2ce6b125bc3d853117e0c0456 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 27 Nov 2020 11:20:58 -0500 Subject: ring-buffer: Update write stamp with the correct ts The write stamp, used to calculate deltas between events, was updated with the stale "ts" value in the "info" structure, and not with the updated "ts" variable. This caused the deltas between events to be inaccurate, and when crossing into a new sub buffer, had time go backwards. Link: https://lkml.kernel.org/r/20201124223917.795844-1-elavila@google.com Cc: stable@vger.kernel.org Fixes: a389d86f7fd09 ("ring-buffer: Have nested events still record running time stamp") Reported-by: "J. 
Avila" Tested-by: Daniel Mentz Tested-by: Will McVicker Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index dc83b3fa9fe7..bccaf88d3706 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3291,7 +3291,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* Nothing came after this event between C and E */ info->delta = ts - info->after; (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, - info->after, info->ts); + info->after, ts); info->ts = ts; } else { /* -- cgit v1.2.3 From 8785f51a17083eee7c37606079c6447afc6ba102 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 28 Nov 2020 10:15:17 +0100 Subject: ring-buffer: Set the right timestamp in the slow path of __rb_reserve_next() In the slow path of __rb_reserve_next() a nested event(s) can happen between evaluating the timestamp delta of the current event and updating write_stamp via local_cmpxchg(); in this case the delta is not valid anymore and it should be set to 0 (same timestamp as the interrupting event), since the event that we are currently processing is not the last event in the buffer. Link: https://lkml.kernel.org/r/X8IVJcp1gRE+FJCJ@xps-13-7390 Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: stable@vger.kernel.org Link: https://lwn.net/Articles/831207 Fixes: a389d86f7fd0 ("ring-buffer: Have nested events still record running time stamp") Signed-off-by: Andrea Righi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bccaf88d3706..35d91b20d47a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3287,11 +3287,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, ts = rb_time_stamp(cpu_buffer->buffer); barrier(); /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && - info->after < ts) { + info->after < ts && + rb_time_cmpxchg(&cpu_buffer->write_stamp, + info->after, ts)) { /* Nothing came after this event between C and E */ info->delta = ts - info->after; - (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, - info->after, ts); info->ts = ts; } else { /* -- cgit v1.2.3 From 310e3a4b5a4fc718a72201c1e4cf5c64ac6f5442 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Wed, 18 Nov 2020 15:05:20 +0300 Subject: tracing: Remove WARN_ON in start_thread() This patch reverts commit 978defee11a5 ("tracing: Do a WARN_ON() if start_thread() in hwlat is called when thread exists") .start hook can be legally called several times if according tracer is stopped screen window 1 [root@localhost ~]# echo 1 > /sys/kernel/tracing/events/kmem/kfree/enable [root@localhost ~]# echo 1 > /sys/kernel/tracing/options/pause-on-trace [root@localhost ~]# less -F /sys/kernel/tracing/trace screen window 2 [root@localhost ~]# cat /sys/kernel/debug/tracing/tracing_on 0 [root@localhost ~]# echo hwlat > /sys/kernel/debug/tracing/current_tracer [root@localhost ~]# echo 1 > /sys/kernel/debug/tracing/tracing_on [root@localhost ~]# cat /sys/kernel/debug/tracing/tracing_on 0 [root@localhost ~]# echo 2 > /sys/kernel/debug/tracing/tracing_on triggers warning in dmesg: WARNING: CPU: 3 PID: 1403 at kernel/trace/trace_hwlat.c:371 hwlat_tracer_start+0xc9/0xd0 Link: https://lkml.kernel.org/r/bd4d3e70-400d-9c82-7b73-a2d695e86b58@virtuozzo.com Cc: Ingo Molnar Cc: 
stable@vger.kernel.org Fixes: 978defee11a5 ("tracing: Do a WARN_ON() if start_thread() in hwlat is called when thread exists") Signed-off-by: Vasily Averin Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index c9ad5c6fbaad..d071fc271eef 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -368,7 +368,7 @@ static int start_kthread(struct trace_array *tr) struct task_struct *kthread; int next_cpu; - if (WARN_ON(hwlat_kthread)) + if (hwlat_kthread) return 0; /* Just pick the first CPU on first iteration */ -- cgit v1.2.3 From 8fa655a3a0013a0c2a2aada6f39a93ee6fc25549 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 25 Nov 2020 14:56:54 -0800 Subject: tracing: Fix alignment of static buffer With 5.9 kernel on ARM64, I found ftrace_dump output was broken but it had no problem with normal output "cat /sys/kernel/debug/tracing/trace". With investigation, it seems coping the data into temporal buffer seems to break the align binary printf expects if the static buffer is not aligned with 4-byte. IIUC, get_arg in bstr_printf expects that args has already right align to be decoded and seq_buf_bprintf says ``the arguments are saved in a 32bit word array that is defined by the format string constraints``. So if we don't keep the align under copy to temporal buffer, the output will be broken by shifting some bytes. This patch fixes it. Link: https://lkml.kernel.org/r/20201125225654.1618966-1-minchan@kernel.org Cc: Fixes: 8e99cf91b99bb ("tracing: Do not allocate buffer in trace_find_next_entry() in atomic") Signed-off-by: Namhyung Kim Signed-off-by: Minchan Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 410cfeb16db5..7d53c5bdea3e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3534,7 +3534,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, } #define STATIC_TEMP_BUF_SIZE 128 -static char static_temp_buf[STATIC_TEMP_BUF_SIZE]; +static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4); /* Find the next real entry, without updating the iterator itself */ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, -- cgit v1.2.3 From 4c75b0ff4e4bf7a45b5aef9639799719c28d0073 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 26 Nov 2020 23:38:38 +0530 Subject: ftrace: Fix updating FTRACE_FL_TRAMP On powerpc, kprobe-direct.tc triggered FTRACE_WARN_ON() in ftrace_get_addr_new() followed by the below message: Bad trampoline accounting at: 000000004222522f (wake_up_process+0xc/0x20) (f0000001) The set of steps leading to this involved: - modprobe ftrace-direct-too - enable_probe - modprobe ftrace-direct - rmmod ftrace-direct <-- trigger The problem turned out to be that we were not updating flags in the ftrace record properly. From the above message about the trampoline accounting being bad, it can be seen that the ftrace record still has FTRACE_FL_TRAMP set though ftrace-direct module is going away. This happens because we are checking if any ftrace_ops has the FTRACE_FL_TRAMP flag set _before_ updating the filter hash. The fix for this is to look for any _other_ ftrace_ops that also needs FTRACE_FL_TRAMP. 
Link: https://lkml.kernel.org/r/56c113aa9c3e10c19144a36d9684c7882bf09af5.1606412433.git.naveen.n.rao@linux.vnet.ibm.com Cc: stable@vger.kernel.org Fixes: a124692b698b0 ("ftrace: Enable trampoline when rec count returns back to one") Signed-off-by: Naveen N. Rao Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8185f7240095..9c1bba8cc51b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1629,6 +1629,8 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) static struct ftrace_ops * ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); static struct ftrace_ops * +ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude); +static struct ftrace_ops * ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, @@ -1778,7 +1780,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, * to it. */ if (ftrace_rec_count(rec) == 1 && - ftrace_find_tramp_ops_any(rec)) + ftrace_find_tramp_ops_any_other(rec, ops)) rec->flags |= FTRACE_FL_TRAMP; else rec->flags &= ~FTRACE_FL_TRAMP; @@ -2244,6 +2246,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) return NULL; } +static struct ftrace_ops * +ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude) +{ + struct ftrace_ops *op; + unsigned long ip = rec->ip; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + + if (op == op_exclude || !op->trampoline) + continue; + + if (hash_contains_ip(ip, op->func_hash)) + return op; + } while_for_each_ftrace_op(op); + + return NULL; +} + static struct ftrace_ops * ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *op) -- cgit v1.2.3 From 49a962c075dfa41c78e34784772329bc8784d217 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 26 Nov 2020 23:38:39 +0530 Subject: ftrace: Fix DYNAMIC_FTRACE_WITH_DIRECT_CALLS dependency DYNAMIC_FTRACE_WITH_DIRECT_CALLS should depend on DYNAMIC_FTRACE_WITH_REGS since we need ftrace_regs_caller(). Link: https://lkml.kernel.org/r/fc4b257ea8689a36f086d2389a9ed989496ca63a.1606412433.git.naveen.n.rao@linux.vnet.ibm.com Cc: stable@vger.kernel.org Fixes: 763e34e74bb7d5c ("ftrace: Add register_ftrace_direct()") Signed-off-by: Naveen N. Rao Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a4020c0b4508..e1bf5228fb69 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -202,7 +202,7 @@ config DYNAMIC_FTRACE_WITH_REGS config DYNAMIC_FTRACE_WITH_DIRECT_CALLS def_bool y - depends on DYNAMIC_FTRACE + depends on DYNAMIC_FTRACE_WITH_REGS depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS config FUNCTION_PROFILER -- cgit v1.2.3 From 68e10d5ff512b503dcba1246ad5620f32035e135 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 30 Nov 2020 23:16:03 -0500 Subject: ring-buffer: Always check to put back before stamp when crossing pages The current ring buffer logic checks to see if the updating of the event buffer was interrupted, and if it is, it will try to fix up the before stamp with the write stamp to make them equal again. 
This logic is flawed, because if the update was not interrupted, the two stamps are guaranteed to be different, as the current event just updated the before stamp before allocation. This guarantees that the next event (this one or another interrupting one) will think it interrupted the time updates of a previous event and inject an absolute time stamp to compensate. The correct logic is to always update the timestamps when traversing to a new sub buffer. Cc: stable@vger.kernel.org Fixes: a389d86f7fd09 ("ring-buffer: Have nested events still record running time stamp") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 35d91b20d47a..a6268e09160a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3234,14 +3234,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) { - if (tail != w) { - /* before and after may now different, fix it up*/ - b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); - a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); - if (a_ok && b_ok && info->before != info->after) - (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, - info->before, info->after); - } + /* before and after may now different, fix it up*/ + b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); + if (a_ok && b_ok && info->before != info->after) + (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, + info->before, info->after); return rb_move_tail(cpu_buffer, tail, info); } -- cgit v1.2.3 From bcee5278958802b40ee8b26679155a6d9231783e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 4 Dec 2020 16:36:16 -0500 Subject: tracing: Fix userstacktrace option for instances When the instances were able to use their own options, the userstacktrace option was left hardcoded for the top level. This made the instance userstacktrace option basically a nop, which will confuse users who set it and then see that nothing happens (I was confused when it happened to me!) Cc: stable@vger.kernel.org Fixes: 16270145ce6b ("tracing: Add trace options for core options to instances") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7d53c5bdea3e..06134189e9a7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -163,7 +163,8 @@ static union trace_eval_map_item *trace_eval_maps; #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ int tracing_set_tracer(struct trace_array *tr, const char *buf); -static void ftrace_trace_userstack(struct trace_buffer *buffer, +static void ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned long flags, int pc); #define MAX_TRACER_SIZE 100 @@ -2870,7 +2871,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, * two. They are not that meaningful. */ ftrace_trace_stack(tr, buffer, flags, regs ?
0 : STACK_SKIP, pc, regs); - ftrace_trace_userstack(buffer, flags, pc); + ftrace_trace_userstack(tr, buffer, flags, pc); } /* @@ -3056,13 +3057,14 @@ EXPORT_SYMBOL_GPL(trace_dump_stack); static DEFINE_PER_CPU(int, user_stack_count); static void -ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc) +ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned long flags, int pc) { struct trace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; - if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE)) + if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE)) return; /* @@ -3101,7 +3103,8 @@ ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc) preempt_enable(); } #else /* CONFIG_USER_STACKTRACE_SUPPORT */ -static void ftrace_trace_userstack(struct trace_buffer *buffer, +static void ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned long flags, int pc) { } -- cgit v1.2.3 From 2ecedd7569080fd05c1a457e8af2165afecfa29f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 3 Dec 2020 21:07:04 -0800 Subject: membarrier: Add an actual barrier before rseq_preempt() It seems that most RSEQ membarrier users will expect any stores done before the membarrier() syscall to be visible to the target task(s). While this is extremely likely to be true in practice, nothing actually guarantees it by a strict reading of the x86 manuals. Rather than providing this guarantee by accident and potentially causing a problem down the road, just add an explicit barrier. Fixes: 70216e18e519 ("membarrier: Provide core serializing command, *_SYNC_CORE") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/d3e7197e034fa4852afcf370ca49c30496e58e40.1607058304.git.luto@kernel.org --- kernel/sched/membarrier.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index e23e74d52db5..7d98ef5d3bcd 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -40,6 +40,14 @@ static void ipi_mb(void *info) static void ipi_rseq(void *info) { + /* + * Ensure that all stores done by the calling thread are visible + * to the current task before the current task resumes. We could + * probably optimize this away on most architectures, but by the + * time we've already sent an IPI, the cost of the extra smp_mb() + * is negligible. + */ + smp_mb(); rseq_preempt(current); } -- cgit v1.2.3 From 758c9373d84168dc7d039cf85a0e920046b17b41 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 3 Dec 2020 21:07:05 -0800 Subject: membarrier: Explicitly sync remote cores when SYNC_CORE is requested membarrier() does not explicitly sync_core() remote CPUs; instead, it relies on the assumption that an IPI will result in a core sync. On x86, this may be true in practice, but it's not architecturally reliable. In particular, the SDM and APM do not appear to guarantee that interrupt delivery is serializing. While IRET does serialize, IPI return can schedule, thereby switching to another task in the same mm that was sleeping in a syscall. The new task could then SYSRET back to usermode without ever executing IRET. Make this more robust by explicitly calling sync_core_before_usermode() on remote cores. 
(This also helps people who search the kernel tree for instances of sync_core() and sync_core_before_usermode() -- one might be surprised that the core membarrier code doesn't currently show up in such a search.) Fixes: 70216e18e519 ("membarrier: Provide core serializing command, *_SYNC_CORE") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/776b448d5f7bd6b12690707f5ed67bcda7f1d427.1607058304.git.luto@kernel.org --- kernel/sched/membarrier.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 7d98ef5d3bcd..1c278dff4f2d 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -38,6 +38,23 @@ static void ipi_mb(void *info) smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_sync_core(void *info) +{ + /* + * The smp_mb() in membarrier after all the IPIs is supposed to + * ensure that memory on remote CPUs that occur before the IPI + * become visible to membarrier()'s caller -- see scenario B in + * the big comment at the top of this file. + * + * A sync_core() would provide this guarantee, but + * sync_core_before_usermode() might end up being deferred until + * after membarrier()'s smp_mb(). + */ + smp_mb(); /* IPIs should be serializing but paranoid. */ + + sync_core_before_usermode(); +} + static void ipi_rseq(void *info) { /* @@ -162,6 +179,7 @@ static int membarrier_private_expedited(int flags, int cpu_id) if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; + ipi_func = ipi_sync_core; } else if (flags == MEMBARRIER_FLAG_RSEQ) { if (!IS_ENABLED(CONFIG_RSEQ)) return -EINVAL; -- cgit v1.2.3 From e45cdc71d1fa5ac3a57b23acc31eb959e4f60135 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 3 Dec 2020 21:07:06 -0800 Subject: membarrier: Execute SYNC_CORE on the calling thread membarrier()'s MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE is documented as syncing the core on all sibling threads but not necessarily the calling thread. This behavior is fundamentally buggy and cannot be used safely. Suppose a user program has two threads. Thread A is on CPU 0 and thread B is on CPU 1. Thread A modifies some text and calls membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE). Then thread B executes the modified code. At any point after membarrier() decides which CPUs to target, thread A could be preempted and replaced by thread B on CPU 0. This could even happen on exit from the membarrier() syscall. If this happens, thread B will end up running on CPU 0 without having synced. In principle, this could be fixed by arranging for the scheduler to issue sync_core_before_usermode() whenever switching between two threads in the same mm if there is any possibility of a concurrent membarrier() call, but this would have considerable overhead. Instead, make membarrier() sync the calling CPU as well. As an optimization, this avoids an extra smp_mb() in the default barrier-only mode and an extra rseq preempt on the caller.
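For context on how this command is consumed, below is a minimal userspace sketch of the register-then-sync pattern a JIT might use. It assumes a kernel and <linux/membarrier.h> that provide the PRIVATE_EXPEDITED_SYNC_CORE commands, and it only prints errors rather than handling them.

#include <linux/membarrier.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static long membarrier_call(int cmd, unsigned int flags, int cpu_id)
{
    return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int main(void)
{
    /* A process must opt in once before using the expedited command. */
    if (membarrier_call(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0))
        perror("membarrier register");

    /* ... write new instructions into an executable mapping here ... */

    /*
     * With the change above, this core-syncs the calling thread as well,
     * so the caller may be migrated and still safely jump to the new code.
     */
    if (membarrier_call(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0))
        perror("membarrier sync-core");

    return 0;
}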
Fixes: 70216e18e519 ("membarrier: Provide core serializing command, *_SYNC_CORE") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://lore.kernel.org/r/250ded637696d490c69bef1877148db86066881c.1607058304.git.luto@kernel.org --- kernel/sched/membarrier.c | 51 ++++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 1c278dff4f2d..9d8df34bea75 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -194,7 +194,8 @@ static int membarrier_private_expedited(int flags, int cpu_id) return -EPERM; } - if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) + if (flags != MEMBARRIER_FLAG_SYNC_CORE && + (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)) return 0; /* @@ -213,8 +214,6 @@ static int membarrier_private_expedited(int flags, int cpu_id) if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id)) goto out; - if (cpu_id == raw_smp_processor_id()) - goto out; rcu_read_lock(); p = rcu_dereference(cpu_rq(cpu_id)->curr); if (!p || p->mm != mm) { @@ -229,16 +228,6 @@ static int membarrier_private_expedited(int flags, int cpu_id) for_each_online_cpu(cpu) { struct task_struct *p; - /* - * Skipping the current CPU is OK even through we can be - * migrated at any point. The current CPU, at the point - * where we read raw_smp_processor_id(), is ensured to - * be in program order with respect to the caller - * thread. Therefore, we can skip this CPU from the - * iteration. - */ - if (cpu == raw_smp_processor_id()) - continue; p = rcu_dereference(cpu_rq(cpu)->curr); if (p && p->mm == mm) __cpumask_set_cpu(cpu, tmpmask); @@ -246,12 +235,38 @@ static int membarrier_private_expedited(int flags, int cpu_id) rcu_read_unlock(); } - preempt_disable(); - if (cpu_id >= 0) + if (cpu_id >= 0) { + /* + * smp_call_function_single() will call ipi_func() if cpu_id + * is the calling CPU. + */ smp_call_function_single(cpu_id, ipi_func, NULL, 1); - else - smp_call_function_many(tmpmask, ipi_func, NULL, 1); - preempt_enable(); + } else { + /* + * For regular membarrier, we can save a few cycles by + * skipping the current cpu -- we're about to do smp_mb() + * below, and if we migrate to a different cpu, this cpu + * and the new cpu will execute a full barrier in the + * scheduler. + * + * For SYNC_CORE, we do need a barrier on the current cpu -- + * otherwise, if we are migrated and replaced by a different + * task in the same mm just before, during, or after + * membarrier, we will end up with some thread in the mm + * running without a core sync. + * + * For RSEQ, don't rseq_preempt() the caller. User code + * is not supposed to issue syscalls at all from inside an + * rseq critical section. + */ + if (flags != MEMBARRIER_FLAG_SYNC_CORE) { + preempt_disable(); + smp_call_function_many(tmpmask, ipi_func, NULL, true); + preempt_enable(); + } else { + on_each_cpu_mask(tmpmask, ipi_func, NULL, true); + } + } out: if (cpu_id < 0) -- cgit v1.2.3 From b02709587ea3d699a608568ee8157d8db4fd8cae Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 8 Dec 2020 19:01:51 +0100 Subject: bpf: Fix propagation of 32-bit signed bounds from 64-bit bounds. The 64-bit signed bounds should not affect 32-bit signed bounds unless the verifier knows that upper 32-bits are either all 1s or all 0s. 
For example, a register with smin_value==1 doesn't mean that s32_min_value is also equal to 1, since smax_value could be larger than a 32-bit subregister can hold. The verifier refines the smax/s32_max return value from certain helpers in do_refine_retval_range(). Teach the verifier to recognize that the smin/s32_min value is also bounded. When both the smin and smax bounds fit into the 32-bit subregister, the verifier can propagate those bounds. Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Reported-by: Jean-Philippe Brucker Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1388bf733071..53fe6ef6d931 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1298,9 +1298,7 @@ static void __reg_combine_32_into_64(struct bpf_reg_state *reg) static bool __reg64_bound_s32(s64 a) { - if (a > S32_MIN && a < S32_MAX) - return true; - return false; + return a > S32_MIN && a < S32_MAX; } static bool __reg64_bound_u32(u64 a) @@ -1314,10 +1312,10 @@ static void __reg_combine_64_into_32(struct bpf_reg_state *reg) { __mark_reg32_unbounded(reg); - if (__reg64_bound_s32(reg->smin_value)) + if (__reg64_bound_s32(reg->smin_value) && __reg64_bound_s32(reg->smax_value)) { reg->s32_min_value = (s32)reg->smin_value; - if (__reg64_bound_s32(reg->smax_value)) reg->s32_max_value = (s32)reg->smax_value; + } if (__reg64_bound_u32(reg->umin_value)) reg->u32_min_value = (u32)reg->umin_value; if (__reg64_bound_u32(reg->umax_value)) @@ -4895,6 +4893,8 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, ret_reg->smax_value = meta->msize_max_value; ret_reg->s32_max_value = meta->msize_max_value; + ret_reg->smin_value = -MAX_ERRNO; + ret_reg->s32_min_value = -MAX_ERRNO; __reg_deduce_bounds(ret_reg); __reg_bound_offset(ret_reg); __update_reg_bounds(ret_reg); -- cgit v1.2.3 From 6e7b64b9dd6d96537d816ea07ec26b7dedd397b9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 11 Dec 2020 13:36:46 -0800 Subject: elfcore: fix building with clang kernel/elfcore.c only contains weak symbols, which triggers a bug with clang in combination with recordmcount: "Cannot find symbol for section 2: .text. kernel/elfcore.o: failed". Move the empty stubs into linux/elfcore.h as inline functions. As only two architectures use these, just use the architecture-specific Kconfig symbols to key off the declaration.
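The general pattern applied here can be sketched as a header, shown below with invented names. This is not the actual linux/elfcore.h contents, and the config symbol is hypothetical; the point is that static inline fallbacks in a header leave no stub-only object file for recordmcount to trip over.

/* demo_elfcore.h -- illustrative only */
#ifndef DEMO_ELFCORE_H
#define DEMO_ELFCORE_H

#include <stddef.h>

struct demo_coredump_params;            /* opaque for this sketch */

#ifdef CONFIG_ARCH_HAS_EXTRA_NOTES      /* hypothetical Kconfig symbol */
/* Real implementations provided by the architectures that select it. */
int demo_core_extra_phdrs(void);
int demo_core_write_extra_data(struct demo_coredump_params *cprm);
size_t demo_core_extra_data_size(void);
#else
/* Inline no-op fallbacks replace the former __weak out-of-line stubs. */
static inline int demo_core_extra_phdrs(void) { return 0; }
static inline int demo_core_write_extra_data(struct demo_coredump_params *cprm) { return 1; }
static inline size_t demo_core_extra_data_size(void) { return 0; }
#endif

#endif /* DEMO_ELFCORE_H */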
Link: https://lkml.kernel.org/r/20201204165742.3815221-2-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Barret Rhoden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 - kernel/elfcore.c | 26 -------------------------- 2 files changed, 27 deletions(-) delete mode 100644 kernel/elfcore.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index af601b9bda0e..6c9f19911be0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -97,7 +97,6 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o -obj-$(CONFIG_ELFCORE) += elfcore.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_TRACE_CLOCK) += trace/ diff --git a/kernel/elfcore.c b/kernel/elfcore.c deleted file mode 100644 index 57fb4dcff434..000000000000 --- a/kernel/elfcore.c +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include - -Elf_Half __weak elf_core_extra_phdrs(void) -{ - return 0; -} - -int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) -{ - return 1; -} - -int __weak elf_core_write_extra_data(struct coredump_params *cprm) -{ - return 1; -} - -size_t __weak elf_core_extra_data_size(void) -{ - return 0; -} -- cgit v1.2.3 From b7906b70a2337e445b8dca3ce7ba8976b6ebd07d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 11 Dec 2020 22:36:25 +0100 Subject: bpf: Fix enum names for bpf_this_cpu_ptr() and bpf_per_cpu_ptr() helpers Remove the bpf_ prefix, which causes these helpers to be reported in the verifier dump as bpf_bpf_this_cpu_ptr() and bpf_bpf_per_cpu_ptr(), respectively. Let's fix it while it is still possible, before the UAPI freezes on these helpers. Fixes: eaa6bcb71ef6 ("bpf: Introduce bpf_per_cpu_ptr()") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Linus Torvalds --- kernel/bpf/helpers.c | 4 ++-- kernel/trace/bpf_trace.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 25520f5eeaf6..deda1185237b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -717,9 +717,9 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_snprintf_btf_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; - case BPF_FUNC_bpf_per_cpu_ptr: + case BPF_FUNC_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; - case BPF_FUNC_bpf_this_cpu_ptr: + case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; default: break; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 048c655315f1..a125ea5e04cd 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1337,9 +1337,9 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; - case BPF_FUNC_bpf_per_cpu_ptr: + case BPF_FUNC_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; - case BPF_FUNC_bpf_this_cpu_ptr: + case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; default: return NULL; -- cgit v1.2.3
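As a closing illustration of why the enum spelling leaks into diagnostics: the name printed for a helper is formed by prefixing "bpf_" to the enum suffix, so a suffix that already carries the prefix comes out doubled. The macro and array below are a stand-alone model with invented names, not the kernel's actual helper tables.

#include <stdio.h>

/* Stand-in for the helper list macro; only two entries for the demo. */
#define DEMO_FUNC_MAPPER(FN) \
    FN(per_cpu_ptr)          \
    FN(this_cpu_ptr)

#define DEMO_HELPER_NAME(x) "bpf_" #x,

static const char *demo_helper_names[] = {
    DEMO_FUNC_MAPPER(DEMO_HELPER_NAME)
};

int main(void)
{
    /*
     * Prints bpf_per_cpu_ptr and bpf_this_cpu_ptr; an entry spelled
     * bpf_per_cpu_ptr in the mapper would come out as bpf_bpf_per_cpu_ptr.
     */
    for (unsigned int i = 0; i < sizeof(demo_helper_names) / sizeof(demo_helper_names[0]); i++)
        printf("%s\n", demo_helper_names[i]);
    return 0;
}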