From fbd44a607a1a5019bc32c3615cead8c5ee8f89c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 May 2013 20:22:36 +0200 Subject: tick: Use zalloc_cpumask_var for allocating offstack cpumasks commit b352bc1cbc (tick: Convert broadcast cpu bitmaps to cpumask_var_t) broke CONFIG_CPUMASK_OFFSTACK in a very subtle way. Instead of allocating the cpumasks with zalloc_cpumask_var it uses alloc_cpumask_var, so we can get random data there, which of course confuses the logic completely and causes random failures. Reported-and-tested-by: Dave Jones Reported-and-tested-by: Yinghai Lu Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305032015060.2990@ionos Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 61d00a8cdf2f..d70cdc42c829 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -785,11 +785,11 @@ bool tick_broadcast_oneshot_available(void) void __init tick_broadcast_init(void) { - alloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); - alloc_cpumask_var(&tmpmask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); #ifdef CONFIG_TICK_ONESHOT - alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); - alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); - alloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); #endif } -- cgit v1.2.3 From 524eff183f51d080a83b348d0ea97c08b3607b9a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 6 May 2013 18:27:17 +0200 Subject: perf: Fix EXIT event notification The perf_event_task_ctx() function needs to be called with preemption disabled, since it's checking for currently scheduled cpu against event cpu. We disable preemption for task related perf event context if there's one defined, leaving up to the chance which cpu it gets scheduled in. Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Stephane Eranian Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1367857638-27631-2-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6b41c1899a8b..38b68a05c3c6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4474,7 +4474,7 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, static void perf_event_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; + struct perf_event_context *ctx, *task_ctx = task_event->task_ctx; struct pmu *pmu; int ctxn; @@ -4485,20 +4485,22 @@ static void perf_event_task_event(struct perf_task_event *task_event) goto next; perf_event_task_ctx(&cpuctx->ctx, task_event); - ctx = task_event->task_ctx; - if (!ctx) { - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_event_task_ctx(ctx, task_event); - } + if (task_ctx) + goto next; + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_task_ctx(ctx, task_event); next: put_cpu_ptr(pmu->pmu_cpu_context); } - if (task_event->task_ctx) - perf_event_task_ctx(task_event->task_ctx, task_event); + if (task_ctx) { + preempt_disable(); + perf_event_task_ctx(task_ctx, task_event); + preempt_enable(); + } rcu_read_unlock(); } -- cgit v1.2.3 From 52d857a8784a09576215c71cebf368d61c12a754 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 6 May 2013 18:27:18 +0200 Subject: perf: Factor out auxiliary events notification Add perf_event_aux() function to send out all types of auxiliary events - mmap, task, comm events. For each type there's match and output functions defined and used as callbacks during perf_event_aux processing. This way we can centralize the pmu/context iterating and event matching logic. Also since lot of the code was duplicated, this patch reduces the .text size about 2kB on my setup: snipped output from 'objdump -x kernel/events/core.o' before: Idx Name Size 0 .text 0000d313 after: Idx Name Size 0 .text 0000cad3 Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1367857638-27631-3-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 242 +++++++++++++++++++-------------------------------- 1 file changed, 89 insertions(+), 153 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 38b68a05c3c6..9dc297faf7c0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4394,6 +4394,64 @@ perf_event_read_event(struct perf_event *event, perf_output_end(&handle); } +typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data); +typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); + +static void +perf_event_aux_ctx(struct perf_event_context *ctx, + perf_event_aux_match_cb match, + perf_event_aux_output_cb output, + void *data) +{ + struct perf_event *event; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + continue; + if (!event_filter_match(event)) + continue; + if (match(event, data)) + output(event, data); + } +} + +static void +perf_event_aux(perf_event_aux_match_cb match, + perf_event_aux_output_cb output, + void *data, + struct perf_event_context *task_ctx) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct pmu *pmu; + int ctxn; + + rcu_read_lock(); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->unique_pmu != pmu) + goto next; + perf_event_aux_ctx(&cpuctx->ctx, match, output, data); + if (task_ctx) + goto next; + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_aux_ctx(ctx, match, output, data); +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } + + if (task_ctx) { + preempt_disable(); + perf_event_aux_ctx(task_ctx, match, output, data); + preempt_enable(); + } + rcu_read_unlock(); +} + /* * task tracking -- fork/exit * @@ -4416,8 +4474,9 @@ struct perf_task_event { }; static void perf_event_task_output(struct perf_event *event, - struct perf_task_event *task_event) + void *data) { + struct perf_task_event *task_event = data; struct perf_output_handle handle; struct perf_sample_data sample; struct task_struct *task = task_event->task; @@ -4445,64 +4504,11 @@ out: task_event->event_id.header.size = size; } -static int perf_event_task_match(struct perf_event *event) -{ - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if (event->attr.comm || event->attr.mmap || - event->attr.mmap_data || event->attr.task) - return 1; - - return 0; -} - -static void perf_event_task_ctx(struct perf_event_context *ctx, - struct perf_task_event *task_event) -{ - struct perf_event *event; - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_task_match(event)) - perf_event_task_output(event, task_event); - } -} - -static void perf_event_task_event(struct perf_task_event *task_event) +static int perf_event_task_match(struct perf_event *event, + void *data __maybe_unused) { - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx, *task_ctx = task_event->task_ctx; - struct pmu *pmu; - int ctxn; - - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_task_ctx(&cpuctx->ctx, task_event); - - if (task_ctx) - goto next; - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_event_task_ctx(ctx, task_event); -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - if (task_ctx) { - preempt_disable(); - perf_event_task_ctx(task_ctx, task_event); - preempt_enable(); - } - - rcu_read_unlock(); + return event->attr.comm || event->attr.mmap || + event->attr.mmap_data || event->attr.task; } static void perf_event_task(struct task_struct *task, @@ -4533,7 +4539,10 @@ static void perf_event_task(struct task_struct *task, }, }; - perf_event_task_event(&task_event); + perf_event_aux(perf_event_task_match, + perf_event_task_output, + &task_event, + task_ctx); } void perf_event_fork(struct task_struct *task) @@ -4559,8 +4568,9 @@ struct perf_comm_event { }; static void perf_event_comm_output(struct perf_event *event, - struct perf_comm_event *comm_event) + void *data) { + struct perf_comm_event *comm_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = comm_event->event_id.header.size; @@ -4587,39 +4597,16 @@ out: comm_event->event_id.header.size = size; } -static int perf_event_comm_match(struct perf_event *event) -{ - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if (event->attr.comm) - return 1; - - return 0; -} - -static void perf_event_comm_ctx(struct perf_event_context *ctx, - struct perf_comm_event *comm_event) +static int perf_event_comm_match(struct perf_event *event, + void *data __maybe_unused) { - struct perf_event *event; - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_comm_match(event)) - perf_event_comm_output(event, comm_event); - } + return event->attr.comm; } static void perf_event_comm_event(struct perf_comm_event *comm_event) { - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; char comm[TASK_COMM_LEN]; unsigned int size; - struct pmu *pmu; - int ctxn; memset(comm, 0, sizeof(comm)); strlcpy(comm, comm_event->task->comm, sizeof(comm)); @@ -4629,24 +4616,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->comm_size = size; comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_comm_ctx(&cpuctx->ctx, comm_event); - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_event_comm_ctx(ctx, comm_event); -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - rcu_read_unlock(); + perf_event_aux(perf_event_comm_match, + perf_event_comm_output, + comm_event, + NULL); } void perf_event_comm(struct task_struct *task) @@ -4708,8 +4682,9 @@ struct perf_mmap_event { }; static void perf_event_mmap_output(struct perf_event *event, - struct perf_mmap_event *mmap_event) + void *data) { + struct perf_mmap_event *mmap_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = mmap_event->event_id.header.size; @@ -4736,46 +4711,24 @@ out: } static int perf_event_mmap_match(struct perf_event *event, - struct perf_mmap_event *mmap_event, - int executable) + void *data) { - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if ((!executable && event->attr.mmap_data) || - (executable && event->attr.mmap)) - return 1; - - return 0; -} - -static void perf_event_mmap_ctx(struct perf_event_context *ctx, - struct perf_mmap_event *mmap_event, - int executable) -{ - struct perf_event *event; + struct perf_mmap_event *mmap_event = data; + struct vm_area_struct *vma = mmap_event->vma; + int executable = vma->vm_flags & VM_EXEC; - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_mmap_match(event, mmap_event, executable)) - perf_event_mmap_output(event, mmap_event); - } + return (!executable && event->attr.mmap_data) || + (executable && event->attr.mmap); } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) { - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; struct vm_area_struct *vma = mmap_event->vma; struct file *file = vma->vm_file; unsigned int size; char tmp[16]; char *buf = NULL; const char *name; - struct pmu *pmu; - int ctxn; memset(tmp, 0, sizeof(tmp)); @@ -4831,27 +4784,10 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, - vma->vm_flags & VM_EXEC); - - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) { - perf_event_mmap_ctx(ctx, mmap_event, - vma->vm_flags & VM_EXEC); - } -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - rcu_read_unlock(); + perf_event_aux(perf_event_mmap_match, + perf_event_mmap_output, + mmap_event, + NULL); kfree(buf); } -- cgit v1.2.3 From d3251859168b0b12841e1b90d6d768ab478dc23d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 May 2013 11:10:17 -0700 Subject: workqueue: workqueue_congested() shouldn't translate WORK_CPU_UNBOUND into node number df2d5ae499 ("workqueue: map an unbound workqueues to multiple per-node pool_workqueues") made unbound workqueues to map to multiple per-node pool_workqueues and accordingly updated workqueue_contested() so that, for unbound workqueues, it maps the specified @cpu to the NUMA node number to obtain the matching pool_workqueue to query the congested state. Before this change, workqueue_congested() ignored @cpu for unbound workqueues as there was only one pool_workqueue and some users (fscache) called it with WORK_CPU_UNBOUND. After the commit, this causes the following oops as WORK_CPU_UNBOUND gets translated to garbage by cpu_to_node(). BUG: unable to handle kernel paging request at ffff8803598d98b8 IP: [] unbound_pwq_by_node+0xa1/0xfa PGD 2421067 PUD 0 Oops: 0000 [#1] SMP CPU: 1 PID: 2689 Comm: cat Tainted: GF 3.9.0-fsdevel+ #4 task: ffff88003d801040 ti: ffff880025806000 task.ti: ffff880025806000 RIP: 0010:[] [] unbound_pwq_by_node+0xa1/0xfa RSP: 0018:ffff880025807ad8 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffff8800388a2400 RCX: 0000000000000003 RDX: ffff880025807fd8 RSI: ffffffff81a31420 RDI: ffff88003d8016e0 RBP: ffff880025807ae8 R08: ffff88003d801730 R09: ffffffffa00b4898 R10: ffffffff81044217 R11: ffff88003d801040 R12: 0000000064206e97 R13: ffff880036059d98 R14: ffff880038cc8080 R15: ffff880038cc82d0 FS: 00007f21afd9c740(0000) GS:ffff88003d100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffff8803598d98b8 CR3: 000000003df49000 CR4: 00000000000007e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: ffff8800388a2400 0000000000000002 ffff880025807b18 ffffffff810442ce ffffffff81044217 ffff880000000002 ffff8800371b4080 ffff88003d112ec0 ffff880025807b38 ffffffffa00810b0 ffff880036059d88 ffff880036059be8 Call Trace: [] workqueue_congested+0xb7/0x12c [] fscache_enqueue_object+0xb2/0xe8 [fscache] [] __fscache_acquire_cookie+0x3b9/0x56c [fscache] [] nfs_fscache_set_inode_cookie+0xee/0x132 [nfs] [] do_open+0x9/0xd [nfs] [] do_dentry_open+0x175/0x24b [] finish_open+0x41/0x51 Fix it by using smp_processor_id() if @cpu is WORK_CPU_UNBOUND. Signed-off-by: Tejun Heo Reported-by: David Howells Tested-and-Acked-by: David Howells --- kernel/workqueue.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4aa9f5bc6b2d..1ae602809efb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4311,6 +4311,12 @@ bool current_is_workqueue_rescuer(void) * no synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. * + * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU. + * Note that both per-cpu and unbound workqueues may be associated with + * multiple pool_workqueues which have separate congested states. A + * workqueue being congested on one CPU doesn't mean the workqueue is also + * contested on other CPUs / NUMA nodes. + * * RETURNS: * %true if congested, %false otherwise. */ @@ -4321,6 +4327,9 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) rcu_read_lock_sched(); + if (cpu == WORK_CPU_UNBOUND) + cpu = smp_processor_id(); + if (!(wq->flags & WQ_UNBOUND)) pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); else -- cgit v1.2.3 From 4b0c0f294f60abcdd20994a8341a95c8ac5eeb96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 May 2013 15:02:50 +0200 Subject: tick: Cleanup NOHZ per cpu data on cpu down Prarit reported a crash on CPU offline/online. The reason is that on CPU down the NOHZ related per cpu data of the dead cpu is not cleaned up. If at cpu online an interrupt happens before the per cpu tick device is registered the irq_enter() check potentially sees stale data and dereferences a NULL pointer. Cleanup the data after the cpu is dead. Reported-by: Prarit Bhargava Cc: stable@vger.kernel.org Cc: Mike Galbraith Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305031451561.2886@ionos Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 225f8bf19095..0eed1db2d792 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -904,7 +904,7 @@ void tick_cancel_sched_timer(int cpu) hrtimer_cancel(&ts->sched_timer); # endif - ts->nohz_mode = NOHZ_MODE_INACTIVE; + memset(ts, 0, sizeof(*ts)); } #endif -- cgit v1.2.3 From d6cbf35dac8a3dadb9103379820c96d7c85df3d9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 14 May 2013 19:44:20 +0800 Subject: cgroup: initialize xattr before calling d_instantiate() cgroup_create_file() calls d_instantiate(), which may decide to look at the xattrs on the file. Smack always does this and SELinux can be configured to do so. But cgroup_add_file() didn't initialize xattrs before calling cgroup_create_file(), which finally leads to dereferencing NULL dentry->d_fsdata. This bug has been there since cgroup xattr was introduced. Cc: # 3.8.x Reported-by: Ivan Bulatovic Reported-by: Casey Schaufler Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a9926275f80..38b136553044 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2699,13 +2699,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, goto out; } + cfe->type = (void *)cft; + cfe->dentry = dentry; + dentry->d_fsdata = cfe; + simple_xattrs_init(&cfe->xattrs); + mode = cgroup_file_mode(cft); error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); if (!error) { - cfe->type = (void *)cft; - cfe->dentry = dentry; - dentry->d_fsdata = cfe; - simple_xattrs_init(&cfe->xattrs); list_add_tail(&cfe->node, &parent->files); cfe = NULL; } -- cgit v1.2.3 From f7ea0fd639c2c48d3c61b6eec75362be290c6874 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 May 2013 21:40:27 +0200 Subject: tick: Don't invoke tick_nohz_stop_sched_tick() if the cpu is offline commit 5b39939a4 (nohz: Move ts->idle_calls incrementation into strict idle logic) moved code out of tick_nohz_stop_sched_tick() and missed to bail out when the cpu is offline. That's causing subsequent failures as an offline CPU is supposed to die and not to fiddle with nohz magic. Return false in can_stop_idle_tick() if the cpu is offline. Reported-and-tested-by: Jiri Kosina Reported-and-tested-by: Prarit Bhargava Cc: Frederic Weisbecker Cc: Borislav Petkov Cc: Tony Luck Cc: x86@kernel.org Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305132138160.2863@ionos Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 0eed1db2d792..012142187db9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -469,6 +469,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) tick_do_timer_cpu = TICK_DO_TIMER_NONE; + return false; } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) -- cgit v1.2.3 From b47430d3adbedbfdb5979ba4874f5dadf94f16b1 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 14 May 2013 04:01:27 +0530 Subject: rcu/idle: Wrap cpu-idle poll mode within rcu_idle_enter/exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bjørn Mork reported the following warning when running powertop. [ 49.289034] ------------[ cut here ]------------ [ 49.289055] WARNING: at kernel/rcutree.c:502 rcu_eqs_exit_common.isra.48+0x3d/0x125() [ 49.289244] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.10.0-bisect-rcu-warn+ #107 [ 49.289251] ffffffff8157d8c8 ffffffff81801e28 ffffffff8137e4e3 ffffffff81801e68 [ 49.289260] ffffffff8103094f ffffffff81801e68 0000000000000000 ffff88023afcd9b0 [ 49.289268] 0000000000000000 0140000000000000 ffff88023bee7700 ffffffff81801e78 [ 49.289276] Call Trace: [ 49.289285] [] dump_stack+0x19/0x1b [ 49.289293] [] warn_slowpath_common+0x62/0x7b [ 49.289300] [] warn_slowpath_null+0x15/0x17 [ 49.289306] [] rcu_eqs_exit_common.isra.48+0x3d/0x125 [ 49.289314] [] ? trace_hardirqs_off_caller+0x37/0xa6 [ 49.289320] [] rcu_idle_exit+0x85/0xa8 [ 49.289327] [] trace_cpu_idle_rcuidle+0xae/0xff [ 49.289334] [] cpu_startup_entry+0x72/0x115 [ 49.289341] [] rest_init+0x149/0x150 [ 49.289347] [] ? csum_partial_copy_generic+0x16c/0x16c [ 49.289355] [] start_kernel+0x3f0/0x3fd [ 49.289362] [] ? repair_env_string+0x5a/0x5a [ 49.289368] [] x86_64_start_reservations+0x2a/0x2c [ 49.289375] [] x86_64_start_kernel+0xcd/0xd1 [ 49.289379] ---[ end trace 07a1cc95e29e9036 ]--- The warning is that 'rdtp->dynticks' has an unexpected value, which roughly translates to - the calls to rcu_idle_enter() and rcu_idle_exit() were not made in the correct order, or otherwise messed up. And Bjørn's painstaking debugging indicated that this happens when the idle loop enters the poll mode. Looking at the poll function cpu_idle_poll(), and the implementation of trace_cpu_idle_rcuidle(), the problem becomes very clear: cpu_idle_poll() lacks calls to rcu_idle_enter/exit(), and trace_cpu_idle_rcuidle() calls them in the reverse order - first rcu_idle_exit(), and then rcu_idle_enter(). Hence the even/odd alternative sequencing of rdtp->dynticks goes for a toss. And powertop readily triggers this because powertop uses the idle-tracing infrastructure extensively. So, to fix this, wrap the code in cpu_idle_poll() within rcu_idle_enter/exit(), so that it blends properly with the calls inside trace_cpu_idle_rcuidle() and thus get the function ordering right. Reported-and-tested-by: Bjørn Mork Cc: Paul McKenney Cc: Steven Rostedt Cc: Dipankar Sarma Signed-off-by: Srivatsa S. Bhat Link: http://lkml.kernel.org/r/519169BF.4080208@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner --- kernel/cpu/idle.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index 8b86c0c68edf..d5585f5e038e 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -40,11 +40,13 @@ __setup("hlt", cpu_idle_nopoll_setup); static inline int cpu_idle_poll(void) { + rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + rcu_idle_exit(); return 1; } -- cgit v1.2.3 From 42a5cf46cd56f46267d2a9fcf2655f4078cd3042 Mon Sep 17 00:00:00 2001 From: Tirupathi Reddy Date: Tue, 14 May 2013 13:59:02 +0530 Subject: timer: Don't reinitialize the cpu base lock during CPU_UP_PREPARE An inactive timer's base can refer to a offline cpu's base. In the current code, cpu_base's lock is blindly reinitialized each time a CPU is brought up. If a CPU is brought online during the period that another thread is trying to modify an inactive timer on that CPU with holding its timer base lock, then the lock will be reinitialized under its feet. This leads to following SPIN_BUG(). <0> BUG: spinlock already unlocked on CPU#3, kworker/u:3/1466 <0> lock: 0xe3ebe000, .magic: dead4ead, .owner: kworker/u:3/1466, .owner_cpu: 1 <4> [] (unwind_backtrace+0x0/0x11c) from [] (do_raw_spin_unlock+0x40/0xcc) <4> [] (do_raw_spin_unlock+0x40/0xcc) from [] (_raw_spin_unlock+0x8/0x30) <4> [] (_raw_spin_unlock+0x8/0x30) from [] (mod_timer+0x294/0x310) <4> [] (mod_timer+0x294/0x310) from [] (queue_delayed_work_on+0x104/0x120) <4> [] (queue_delayed_work_on+0x104/0x120) from [] (sdhci_msm_bus_voting+0x88/0x9c) <4> [] (sdhci_msm_bus_voting+0x88/0x9c) from [] (sdhci_disable+0x40/0x48) <4> [] (sdhci_disable+0x40/0x48) from [] (mmc_release_host+0x4c/0xb0) <4> [] (mmc_release_host+0x4c/0xb0) from [] (mmc_sd_detect+0x90/0xfc) <4> [] (mmc_sd_detect+0x90/0xfc) from [] (mmc_rescan+0x7c/0x2c4) <4> [] (mmc_rescan+0x7c/0x2c4) from [] (process_one_work+0x27c/0x484) <4> [] (process_one_work+0x27c/0x484) from [] (worker_thread+0x210/0x3b0) <4> [] (worker_thread+0x210/0x3b0) from [] (kthread+0x80/0x8c) <4> [] (kthread+0x80/0x8c) from [] (kernel_thread_exit+0x0/0x8) As an example, this particular crash occurred when CPU #3 is executing mod_timer() on an inactive timer whose base is refered to offlined CPU #2. The code locked the timer_base corresponding to CPU #2. Before it could proceed, CPU #2 came online and reinitialized the spinlock corresponding to its base. Thus now CPU #3 held a lock which was reinitialized. When CPU #3 finally ended up unlocking the old cpu_base corresponding to CPU #2, we hit the above SPIN_BUG(). CPU #0 CPU #3 CPU #2 ------ ------- ------- ..... ...... mod_timer() lock_timer_base spin_lock_irqsave(&base->lock) cpu_up(2) ..... ...... init_timers_cpu() .... ..... spin_lock_init(&base->lock) ..... spin_unlock_irqrestore(&base->lock) ...... Allocation of per_cpu timer vector bases is done only once under "tvec_base_done[]" check. In the current code, spinlock_initialization of base->lock isn't under this check. When a CPU is up each time the base lock is reinitialized. Move base spinlock initialization under the check. Signed-off-by: Tirupathi Reddy Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1368520142-4136-1-git-send-email-tirupath@codeaurora.org Signed-off-by: Thomas Gleixner --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 09bca8ce9771..7376589adc28 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1539,12 +1539,12 @@ static int __cpuinit init_timers_cpu(int cpu) boot_done = 1; base = &boot_tvec_bases; } + spin_lock_init(&base->lock); tvec_base_done[cpu] = 1; } else { base = per_cpu(tvec_bases, cpu); } - spin_lock_init(&base->lock); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); -- cgit v1.2.3 From 6faf72834d9d0c0dc6632604eaeffb621e87fcf9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 May 2013 06:53:37 -0700 Subject: rcu: Fix comparison sense in rcu_needs_cpu() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit c0f4dfd4f (rcu: Make RCU_FAST_NO_HZ take advantage of numbered callbacks) introduced a bug that can result in excessively long grace periods. This bug reverse the senes of the "if" statement checking for lazy callbacks, so that RCU takes a lazy approach when there are in fact non-lazy callbacks. This can result in excessive boot, suspend, and resume times. This commit therefore fixes the sense of this "if" statement. Reported-by: Borislav Petkov Reported-by: Bjørn Mork Reported-by: Joerg Roedel Signed-off-by: Paul E. McKenney Tested-by: Bjørn Mork Tested-by: Joerg Roedel --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 170814dc418f..6d939a645da1 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1667,7 +1667,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) rdtp->last_accelerate = jiffies; /* Request timer delay depending on laziness, and round. */ - if (rdtp->all_lazy) { + if (!rdtp->all_lazy) { *dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; } else { -- cgit v1.2.3 From 8f174b1175a10903ade40f36eb6c896412877ca0 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 1 May 2013 00:07:00 +0900 Subject: workqueue: correct handling of the pool spin_lock When we fail to mutex_trylock(), we release the pool spin_lock and do mutex_lock(). After that, we should regrab the pool spin_lock, but, regrabbing is missed in current code. So correct it. Cc: Lai Jiangshan Signed-off-by: Joonsoo Kim Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1ae602809efb..286847b90225 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2059,6 +2059,7 @@ static bool manage_workers(struct worker *worker) if (unlikely(!mutex_trylock(&pool->manager_mutex))) { spin_unlock_irq(&pool->lock); mutex_lock(&pool->manager_mutex); + spin_lock_irq(&pool->lock); ret = true; } -- cgit v1.2.3 From ad7b1f841f8a54c6d61ff181451f55b68175e15a Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Mon, 6 May 2013 17:44:55 -0400 Subject: workqueue: Make schedule_work() available again to non GPL modules Commit 8425e3d5bdbe ("workqueue: inline trivial wrappers") changed schedule_work() and schedule_delayed_work() to inline wrappers, but these rely on some symbols that are EXPORT_SYMBOL_GPL, while the original functions were EXPORT_SYMBOL. This has the effect of changing the licensing requirement for these functions and making them unavailable to non GPL modules. Make them available again by removing the restriction on the required symbols. Signed-off-by: Marc Dionne Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 286847b90225..02916f421385 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -296,7 +296,7 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; struct workqueue_struct *system_wq __read_mostly; -EXPORT_SYMBOL_GPL(system_wq); +EXPORT_SYMBOL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; EXPORT_SYMBOL_GPL(system_highpri_wq); struct workqueue_struct *system_long_wq __read_mostly; @@ -1411,7 +1411,7 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, local_irq_restore(flags); return ret; } -EXPORT_SYMBOL_GPL(queue_work_on); +EXPORT_SYMBOL(queue_work_on); void delayed_work_timer_fn(unsigned long __data) { @@ -1485,7 +1485,7 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, local_irq_restore(flags); return ret; } -EXPORT_SYMBOL_GPL(queue_delayed_work_on); +EXPORT_SYMBOL(queue_delayed_work_on); /** * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU -- cgit v1.2.3 From b4f711ee03d28f776fd2324fd0bd999cc428e4d2 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 24 Apr 2013 11:32:56 -0700 Subject: time: Revert ALWAYS_USE_PERSISTENT_CLOCK compile time optimizaitons Kay Sievers noted that the ALWAYS_USE_PERSISTENT_CLOCK config, which enables some minor compile time optimization to avoid uncessary code in mostly the suspend/resume path could cause problems for userland. In particular, the dependency for RTC_HCTOSYS on !ALWAYS_USE_PERSISTENT_CLOCK, which avoids setting the time twice and simplifies suspend/resume, has the side effect of causing the /sys/class/rtc/rtcN/hctosys flag to always be zero, and this flag is commonly used by udev to setup the /dev/rtc symlink to /dev/rtcN, which can cause pain for older applications. While the udev rules could use some work to be less fragile, breaking userland should strongly be avoided. Additionally the compile time optimizations are fairly minor, and the code being optimized is likely to be reworked in the future, so lets revert this change. Reported-by: Kay Sievers Signed-off-by: John Stultz Cc: stable #3.9 Cc: Feng Tang Cc: Jason Gunthorpe Link: http://lkml.kernel.org/r/1366828376-18124-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/Kconfig | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 24510d84efd7..b69692250af4 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -12,11 +12,6 @@ config CLOCKSOURCE_WATCHDOG config ARCH_CLOCKSOURCE_DATA bool -# Platforms has a persistent clock -config ALWAYS_USE_PERSISTENT_CLOCK - bool - default n - # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool -- cgit v1.2.3 From 615ee5443ff9bedd356dc6865f3e9c276ce434ea Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 26 Mar 2013 11:35:16 -0400 Subject: rcu: Don't allocate bootmem from rcu_init() When rcu_init() is called we already have slab working, allocating bootmem at that point results in warnings and an allocation from slab. This commit therefore changes alloc_bootmem_cpumask_var() to alloc_cpumask_var() in rcu_bootup_announce_oddness(), which is called from rcu_init(). Signed-off-by: Sasha Levin Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Tested-by: Robin Holt [paulmck: convert to zalloc_cpumask_var(), as suggested by Yinghai Lu.] --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 6d939a645da1..3db5a375d8dd 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -88,7 +88,7 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_RCU_NOCB_CPU #ifndef CONFIG_RCU_NOCB_CPU_NONE if (!have_rcu_nocb_mask) { - alloc_bootmem_cpumask_var(&rcu_nocb_mask); + zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL); have_rcu_nocb_mask = true; } #ifdef CONFIG_RCU_NOCB_CPU_ZERO -- cgit v1.2.3 From 60705c89460fdc7227f2d153b68b3f34814738a4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 14 May 2013 15:40:48 -0400 Subject: tracing: Fix leaks of filter preds Special preds are created when folding a series of preds that can be done in serial. These are allocated in an ops field of the pred structure. But they were never freed, causing memory leaks. This was discovered using the kmemleak checker: unreferenced object 0xffff8800797fd5e0 (size 32): comm "swapper/0", pid 1, jiffies 4294690605 (age 104.608s) hex dump (first 32 bytes): 00 00 01 00 03 00 05 00 07 00 09 00 0b 00 0d 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x73/0x98 [] kmemleak_alloc_recursive.constprop.42+0x16/0x18 [] __kmalloc+0xd7/0x125 [] kcalloc.constprop.24+0x2d/0x2f [] fold_pred_tree_cb+0xa9/0xf4 [] walk_pred_tree+0x47/0xcc [] replace_preds.isra.20+0x6f8/0x72f [] create_filter+0x4e/0x8b [] ftrace_test_event_filter+0x5a/0x155 [] do_one_initcall+0xa0/0x137 [] kernel_init_freeable+0x14d/0x1dc [] kernel_init+0xe/0xdb [] ret_from_fork+0x7c/0xb0 [] 0xffffffffffffffff Cc: Tom Zanussi Cc: stable@vger.kernel.org # 2.6.39+ Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index a6361178de5a..e1b653f7e1ca 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -750,7 +750,11 @@ static int filter_set_pred(struct event_filter *filter, static void __free_preds(struct event_filter *filter) { + int i; + if (filter->preds) { + for (i = 0; i < filter->n_preds; i++) + kfree(filter->preds[i].ops); kfree(filter->preds); filter->preds = NULL; } -- cgit v1.2.3 From c02c7e65d9b13670e34bc523744cf4f6e99c198a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 13 May 2013 20:58:34 +0900 Subject: tracing/kprobes: Use rcu_dereference_raw for tp->files Use rcu_dereference_raw() for accessing tp->files. Because the write-side uses rcu_assign_pointer() for memory barrier, the read-side also has to use rcu_dereference_raw() with read memory barrier. Link: http://lkml.kernel.org/r/20130513115834.6545.17022.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Tom Zanussi Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 47 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 636d45fe69b3..0a3d8d5c483d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -185,9 +185,14 @@ static struct trace_probe *find_trace_probe(const char *event, static int trace_probe_nr_files(struct trace_probe *tp) { - struct ftrace_event_file **file = tp->files; + struct ftrace_event_file **file; int ret = 0; + /* + * Since all tp->files updater is protected by probe_enable_lock, + * we don't need to lock an rcu_read_lock. + */ + file = rcu_dereference_raw(tp->files); if (file) while (*(file++)) ret++; @@ -209,9 +214,10 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) mutex_lock(&probe_enable_lock); if (file) { - struct ftrace_event_file **new, **old = tp->files; + struct ftrace_event_file **new, **old; int n = trace_probe_nr_files(tp); + old = rcu_dereference_raw(tp->files); /* 1 is for new one and 1 is for stopper */ new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *), GFP_KERNEL); @@ -251,11 +257,17 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) static int trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file) { + struct ftrace_event_file **files; int i; - if (tp->files) { - for (i = 0; tp->files[i]; i++) - if (tp->files[i] == file) + /* + * Since all tp->files updater is protected by probe_enable_lock, + * we don't need to lock an rcu_read_lock. + */ + files = rcu_dereference_raw(tp->files); + if (files) { + for (i = 0; files[i]; i++) + if (files[i] == file) return i; } @@ -274,10 +286,11 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) mutex_lock(&probe_enable_lock); if (file) { - struct ftrace_event_file **new, **old = tp->files; + struct ftrace_event_file **new, **old; int n = trace_probe_nr_files(tp); int i, j; + old = rcu_dereference_raw(tp->files); if (n == 0 || trace_probe_file_index(tp, file) < 0) { ret = -EINVAL; goto out_unlock; @@ -872,9 +885,16 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, static __kprobes void kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) { - struct ftrace_event_file **file = tp->files; + /* + * Note: preempt is already disabled around the kprobe handler. + * However, we still need an smp_read_barrier_depends() corresponding + * to smp_wmb() in rcu_assign_pointer() to access the pointer. + */ + struct ftrace_event_file **file = rcu_dereference_raw(tp->files); + + if (unlikely(!file)) + return; - /* Note: preempt is already disabled around the kprobe handler */ while (*file) { __kprobe_trace_func(tp, regs, *file); file++; @@ -925,9 +945,16 @@ static __kprobes void kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, struct pt_regs *regs) { - struct ftrace_event_file **file = tp->files; + /* + * Note: preempt is already disabled around the kprobe handler. + * However, we still need an smp_read_barrier_depends() corresponding + * to smp_wmb() in rcu_assign_pointer() to access the pointer. + */ + struct ftrace_event_file **file = rcu_dereference_raw(tp->files); + + if (unlikely(!file)) + return; - /* Note: preempt is already disabled around the kprobe handler */ while (*file) { __kretprobe_trace_func(tp, ri, regs, *file); file++; -- cgit v1.2.3 From 3d1fc7b0880c4db612a3d3211a808659e28af588 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 13 May 2013 20:58:37 +0900 Subject: tracing/kprobes: Fix a sparse warning for incorrect type in assignment Fix a sparse warning about the rcu operated pointer is defined without __rcu address space. Link: http://lkml.kernel.org/r/20130513115837.6545.23322.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Tom Zanussi Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 0a3d8d5c483d..81c5109b9d00 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -35,7 +35,7 @@ struct trace_probe { const char *symbol; /* symbol name */ struct ftrace_event_class class; struct ftrace_event_call call; - struct ftrace_event_file **files; + struct ftrace_event_file * __rcu *files; ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; -- cgit v1.2.3 From b62fdd97fcae17e483b005bafd13fadbd9840672 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 13 May 2013 20:58:39 +0900 Subject: tracing/kprobes: Make print_*probe_event static According to sparse warning, print_*probe_event static because those functions are not directly called from outside. Link: http://lkml.kernel.org/r/20130513115839.6545.83067.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Tom Zanussi Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 81c5109b9d00..9f46e98ba8f2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -962,7 +962,7 @@ kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, } /* Event entry printers */ -enum print_line_t +static enum print_line_t print_kprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) { @@ -998,7 +998,7 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } -enum print_line_t +static enum print_line_t print_kretprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) { -- cgit v1.2.3 From 1be0c25da56e860992af972a60321563ca2cfcd1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 May 2013 14:24:24 -0700 Subject: workqueue: don't perform NUMA-aware allocations on offline nodes in wq_numa_init() wq_numa_init() builds per-node cpumasks which are later used to make unbound workqueues NUMA-aware. The cpumasks are allocated using alloc_cpumask_var_node() for all possible nodes. Unfortunately, on machines with off-line nodes, this leads to NUMA-aware allocations on existing bug offline nodes, which in turn triggers BUG in the memory allocation code. Fix it by using NUMA_NO_NODE for cpumask allocations for offline nodes. kernel BUG at include/linux/gfp.h:323! invalid opcode: 0000 [#1] SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.9.0+ #1 Hardware name: ProLiant BL465c G7, BIOS A19 12/10/2011 task: ffff880234608000 ti: ffff880234602000 task.ti: ffff880234602000 RIP: 0010:[] [] new_slab+0x2ad/0x340 RSP: 0000:ffff880234603bf8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff880237404b40 RCX: 00000000000000d0 RDX: 0000000000000001 RSI: 0000000000000003 RDI: 00000000002052d0 RBP: ffff880234603c28 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000001 R11: ffffffff812e3aa8 R12: 0000000000000001 R13: ffff8802378161c0 R14: 0000000000030027 R15: 00000000000040d0 FS: 0000000000000000(0000) GS:ffff880237800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffff88043fdff000 CR3: 00000000018d5000 CR4: 00000000000007f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: ffff880234603c28 0000000000000001 00000000000000d0 ffff8802378161c0 ffff880237404b40 ffff880237404b40 ffff880234603d28 ffffffff815edba1 ffff880237816140 0000000000000000 ffff88023740e1c0 Call Trace: [] __slab_alloc+0x330/0x4f2 [] kmem_cache_alloc_node_trace+0xa5/0x200 [] alloc_cpumask_var_node+0x28/0x90 [] wq_numa_init+0x10d/0x1be [] init_workqueues+0x64/0x341 [] do_one_initcall+0xea/0x1a0 [] kernel_init_freeable+0xb7/0x1ec [] kernel_init+0xe/0xf0 [] ret_from_fork+0x7c/0xb0 Code: 45 84 ac 00 00 00 f0 41 80 4d 00 40 e9 f6 fe ff ff 66 0f 1f 84 00 00 00 00 00 e8 eb 4b ff ff 49 89 c5 e9 05 fe ff ff <0f> 0b 4c 8b 73 38 44 89 ff 81 cf 00 00 20 00 4c 89 f6 48 c1 ee Signed-off-by: Tejun Heo Reported-and-Tested-by: Lingzhu Xiang --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 02916f421385..ee8e29a2320c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4905,7 +4905,8 @@ static void __init wq_numa_init(void) BUG_ON(!tbl); for_each_node(node) - BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node)); + BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, + node_online(node) ? node : NUMA_NO_NODE)); for_each_possible_cpu(cpu) { node = cpu_to_node(cpu); -- cgit v1.2.3 From 6ed0106667d76589cb648c27edb4f4ffbf9d59ca Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 16 May 2013 20:48:49 +0900 Subject: tracing: Return -EBUSY when event_enable_func() fails to get module Since try_module_get() returns false( = 0) when it fails to pindown a module, event_enable_func() returns 0 which means "succeed". This can cause a kernel panic when the entry is removed, because the event is already released. This fixes the bug by returning -EBUSY, because the reason why it fails is that the module is being removed at that time. Link: http://lkml.kernel.org/r/20130516114848.13508.97899.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Tom Zanussi Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7a0cf68027cc..27963e2bf4bf 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2072,8 +2072,10 @@ event_enable_func(struct ftrace_hash *hash, out_reg: /* Don't let event modules unload while probe registered */ ret = try_module_get(file->event_call->mod); - if (!ret) + if (!ret) { + ret = -EBUSY; goto out_free; + } ret = __ftrace_event_enable_disable(file, 1, 1); if (ret < 0) -- cgit v1.2.3 From 264b83c07a84223f0efd0d1db9ccc66d6f88288f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 16 May 2013 17:43:55 +0200 Subject: usermodehelper: check subprocess_info->path != NULL argv_split(empty_or_all_spaces) happily succeeds, it simply returns argc == 0 and argv[0] == NULL. Change call_usermodehelper_exec() to check sub_info->path != NULL to avoid the crash. This is the minimal fix, todo: - perhaps we should change argv_split() to return NULL or change the callers. - kill or justify ->path[0] check - narrow the scope of helper_lock() Signed-off-by: Oleg Nesterov Acked-By: Lucas De Marchi Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- kernel/kmod.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 1296e72e4161..8241906c4b61 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -569,6 +569,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) int retval = 0; helper_lock(); + if (!sub_info->path) { + retval = -EINVAL; + goto out; + } + if (sub_info->path[0] == '\0') goto out; -- cgit v1.2.3 From 06c9494c0e9bdfcaa14d6d2b096a0ff7abe8494f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 May 2013 20:33:01 +0100 Subject: kmemleak: Scan all allocated, writeable and not executable module sections Instead of just picking data sections by name (names that start with .data, .bss or .ref.data), use the section flags and scan all sections that are allocated, writable and not executable. Which should cover all sections of a module that might reference data. Signed-off-by: Steven Rostedt [catalin.marinas@arm.com: removed unused 'name' variable] [catalin.marinas@arm.com: collapsed 'if' blocks] Signed-off-by: Catalin Marinas Acked-by: Rusty Russell --- kernel/module.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index b049939177f6..06f496a5d182 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2431,10 +2431,10 @@ static void kmemleak_load_module(const struct module *mod, kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); for (i = 1; i < info->hdr->e_shnum; i++) { - const char *name = info->secstrings + info->sechdrs[i].sh_name; - if (!(info->sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - if (!strstarts(name, ".data") && !strstarts(name, ".bss")) + /* Scan all writable sections that's not executable */ + if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) || + !(info->sechdrs[i].sh_flags & SHF_WRITE) || + (info->sechdrs[i].sh_flags & SHF_EXECINSTR)) continue; kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, -- cgit v1.2.3 From 89c837351db0b9b52fd572ec8b0445a42e59b75c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 May 2013 20:46:23 +0100 Subject: kmemleak: No need for scanning specific module sections As kmemleak now scans all module sections that are allocated, writable and non executable, there's no need to scan individual sections that might reference data. Signed-off-by: Steven Rostedt Signed-off-by: Catalin Marinas Acked-by: Rusty Russell --- kernel/module.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 06f496a5d182..cab4bce49c23 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2769,24 +2769,11 @@ static void find_module_sections(struct module *mod, struct load_info *info) mod->trace_events = section_objs(info, "_ftrace_events", sizeof(*mod->trace_events), &mod->num_trace_events); - /* - * This section contains pointers to allocated objects in the trace - * code and not scanning it leads to false positives. - */ - kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * - mod->num_trace_events, GFP_KERNEL); #endif #ifdef CONFIG_TRACING mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", sizeof(*mod->trace_bprintk_fmt_start), &mod->num_trace_bprintk_fmt); - /* - * This section contains pointers to allocated objects in the trace - * code and not scanning it leads to false positives. - */ - kmemleak_scan_area(mod->trace_bprintk_fmt_start, - sizeof(*mod->trace_bprintk_fmt_start) * - mod->num_trace_bprintk_fmt, GFP_KERNEL); #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD /* sechdrs[0].sh_size is always zero */ -- cgit v1.2.3 From fbe06b7bae7c9cf6ab05168fce5ee93b2f4bae7c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 17 May 2013 11:49:10 -0700 Subject: x86, range: fix missing merge during add range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Christian found v3.9 does not work with E350 with EFI is enabled. [ 1.658832] Trying to unpack rootfs image as initramfs... [ 1.679935] BUG: unable to handle kernel paging request at ffff88006e3fd000 [ 1.686940] IP: [] memset+0x1f/0xb0 [ 1.692010] PGD 1f77067 PUD 1f7a067 PMD 61420067 PTE 0 but early memtest report all memory could be accessed without problem. early page table is set in following sequence: [ 0.000000] init_memory_mapping: [mem 0x00000000-0x000fffff] [ 0.000000] init_memory_mapping: [mem 0x6e600000-0x6e7fffff] [ 0.000000] init_memory_mapping: [mem 0x6c000000-0x6e5fffff] [ 0.000000] init_memory_mapping: [mem 0x00100000-0x6bffffff] [ 0.000000] init_memory_mapping: [mem 0x6e800000-0x6ea07fff] but later efi_enter_virtual_mode try set mapping again wrongly. [ 0.010644] pid_max: default: 32768 minimum: 301 [ 0.015302] init_memory_mapping: [mem 0x640c5000-0x6e3fcfff] that means it fails with pfn_range_is_mapped. It turns out that we have a bug in add_range_with_merge and it does not merge range properly when new add one fill the hole between two exsiting ranges. In the case when [mem 0x00100000-0x6bffffff] is the hole between [mem 0x00000000-0x000fffff] and [mem 0x6c000000-0x6e7fffff]. Fix the add_range_with_merge by calling itself recursively. Reported-by: "Christian König" Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/CAE9FiQVofGoSk7q5-0irjkBxemqK729cND4hov-1QCBJDhxpgQ@mail.gmail.com Cc: v3.9 Signed-off-by: H. Peter Anvin --- kernel/range.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/range.c b/kernel/range.c index 071b0ab455cb..eb911dbce267 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -48,9 +48,11 @@ int add_range_with_merge(struct range *range, int az, int nr_range, final_start = min(range[i].start, start); final_end = max(range[i].end, end); - range[i].start = final_start; - range[i].end = final_end; - return nr_range; + /* clear it and add it back for further merge */ + range[i].start = 0; + range[i].end = 0; + return add_range_with_merge(range, az, nr_range, + final_start, final_end); } /* Need to add it: */ -- cgit v1.2.3 From ca1643186d3dce6171d8f171e516b02496360a9e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 23 May 2013 11:51:10 -0400 Subject: tracing: Fix crash when ftrace=nop on the kernel command line If ftrace= is on the kernel command line, when that tracer is registered, it will be initiated by tracing_set_tracer() to execute that tracer. The nop tracer is just a stub tracer that is used to have no tracer enabled. It is assigned at early bootup as it is the default tracer. But if ftrace=nop is on the kernel command line, the registering of the nop tracer will call tracing_set_tracer() which will try to execute the nop tracer. But it expects tr->current_trace to be assigned something as it usually is assigned to the nop tracer. As it hasn't been assigned to anything yet, it causes the system to crash. The simple fix is to move the tr->current_trace = nop before registering the nop tracer. The functionality is still the same as the nop tracer doesn't do anything anyway. Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ae6fa2d1cdf7..4d79485b3237 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6216,10 +6216,15 @@ __init static int tracer_alloc_buffers(void) trace_init_cmdlines(); - register_tracer(&nop_trace); - + /* + * register_tracer() might reference current_trace, so it + * needs to be set before we register anything. This is + * just a bootstrap of current_trace anyway. + */ global_trace.current_trace = &nop_trace; + register_tracer(&nop_trace); + /* All seems OK, enable tracing */ tracing_disabled = 0; -- cgit v1.2.3 From 7805d000db30a3787a4c969bab6ae4d8a5fd8ce6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 May 2013 10:50:24 +0900 Subject: cgroup: fix a subtle bug in descendant pre-order walk When cgroup_next_descendant_pre() initiates a walk, it checks whether the subtree root doesn't have any children and if not returns NULL. Later code assumes that the subtree isn't empty. This is broken because the subtree may become empty inbetween, which can lead to the traversal escaping the subtree by walking to the sibling of the subtree root. There's no reason to have the early exit path. Remove it along with the later assumption that the subtree isn't empty. This simplifies the code a bit and fixes the subtle bug. While at it, fix the comment of cgroup_for_each_descendant_pre() which was incorrectly referring to ->css_offline() instead of ->css_online(). Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Cc: stable@vger.kernel.org --- kernel/cgroup.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 38b136553044..31e9ef319070 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2954,11 +2954,8 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, WARN_ON_ONCE(!rcu_read_lock_held()); /* if first iteration, pretend we just visited @cgroup */ - if (!pos) { - if (list_empty(&cgroup->children)) - return NULL; + if (!pos) pos = cgroup; - } /* visit the first child if exists */ next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); @@ -2966,14 +2963,14 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, return next; /* no child, visit my or the closest ancestor's next sibling */ - do { + while (pos != cgroup) { next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); if (&next->sibling != &pos->parent->children) return next; pos = pos->parent; - } while (pos != cgroup); + } return NULL; } -- cgit v1.2.3 From 387b8b3e37cb1c257fb607787f73815c30d22859 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 24 May 2013 15:55:25 -0700 Subject: auditfilter.c: fix kernel-doc warnings Fix kernel-doc warnings in kernel/auditfilter.c: Warning(kernel/auditfilter.c:1029): Excess function parameter 'loginuid' description in 'audit_receive_filter' Warning(kernel/auditfilter.c:1029): Excess function parameter 'sessionid' description in 'audit_receive_filter' Warning(kernel/auditfilter.c:1029): Excess function parameter 'sid' description in 'audit_receive_filter' Signed-off-by: Randy Dunlap Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 83a2970295d1..6bd4a90d1991 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1021,9 +1021,6 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re * @seq: netlink audit message sequence (serial) number * @data: payload data * @datasz: size of payload data - * @loginuid: loginuid of sender - * @sessionid: sessionid for netlink audit message - * @sid: SE Linux Security ID of sender */ int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz) { -- cgit v1.2.3 From 2938d2757fc99c26aa678ce4eba910c4a77c3a55 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 May 2013 09:33:01 +0200 Subject: tick: Cure broadcast false positive pending bit warning commit 26517f3e (tick: Avoid programming the local cpu timer if broadcast pending) added a warning if the cpu enters broadcast mode again while the pending bit is still set. Meelis reported that the warning triggers. There are two corner cases which have been not considered: 1) cpuidle calls clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER) twice. That can result in the following scenario CPU0 CPU1 cpuidle_idle_call() clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER) set cpu in tick_broadcast_oneshot_mask broadcast interrupt event expired for cpu1 set pending bit acpi_idle_enter_simple() clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER) WARN_ON(pending bit) Move the WARN_ON into the section where we enter broadcast mode so it wont provide false positives on the second call. 2) safe_halt() enables interrupts, so a broadcast interrupt can be delivered befor the broadcast mode is disabled. That sets the pending bit for the CPU which receives the broadcast interrupt. Though the interrupt is delivered right away from the broadcast handler and leaves the pending bit stale. Clear the pending bit for the current cpu in the broadcast handler. Reported-and-tested-by: Meelis Roos Cc: Len Brown Cc: Frederic Weisbecker Cc: Borislav Petkov Cc: Rafael J. Wysocki Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305271841130.4220@ionos Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 24938d577669..0c739423b0f9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -511,6 +511,12 @@ again: } } + /* + * Remove the current cpu from the pending mask. The event is + * delivered immediately in tick_do_broadcast() ! + */ + cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask); + /* Take care of enforced broadcast requests */ cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); cpumask_clear(tick_broadcast_force_mask); @@ -575,8 +581,8 @@ void tick_broadcast_oneshot_control(unsigned long reason) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { - WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { + WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); /* * We only reprogram the broadcast timer if we -- cgit v1.2.3 From 7b959fc582741227a1c4cba710d6aff8fb183128 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 22 May 2013 18:34:09 +0900 Subject: kprobes: Fix to free gone and unused optprobes Fix to free gone and unused optprobes. This bug will cause a kernel panic if the user reuses the killed and unused probe. Reported at: http://sourceware.org/ml/systemtap/2013-q2/msg00142.html In the normal path, an optprobe on an init function is unregistered when a module goes live. unregister_kprobe(kp) -> __unregister_kprobe_top ->__disable_kprobe ->disarm_kprobe(ap == op) ->__disarm_kprobe ->unoptimize_kprobe : the op is queued on unoptimizing_list and do nothing in __unregister_kprobe_bottom After a while (usually wait 5 jiffies), kprobe_optimizer runs to unoptimize and free optprobe. kprobe_optimizer ->do_unoptimize_kprobes ->arch_unoptimize_kprobes : moved to free_list ->do_free_cleaned_kprobes ->hlist_del: the op is removed ->free_aggr_kprobe ->arch_remove_optimized_kprobe ->arch_remove_kprobe ->kfree: the op is freed Here, if kprobes_module_callback is called and the delayed unoptimizing probe is picked BEFORE kprobe_optimizer runs, kprobes_module_callback ->kill_kprobe ->kill_optimized_kprobe : dequeued from unoptimizing_list <=!!! ->arch_remove_optimized_kprobe ->arch_remove_kprobe (but op is not freed, and on the kprobe hash table) This doesn't happen if the probe unregistration is done AFTER kprobes_module_callback is called (because at that time the op is gone), and kprobe-tracer does it. To fix this bug, this patch changes kprobes_module_callback to enqueue the op to freeing_list at kill_optimized_kprobe only if the op is unused. The unused probes on freeing_list will be freed in do_free_cleaned_kprobes. Note that this calls arch_remove_*kprobe twice on the same probe. Thus those functions have to check the double free. Fortunately, most of arch codes already checked that except for mips. This will be fixed in the next patch. Signed-off-by: Masami Hiramatsu Cc: Timo Juhani Lindfors Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Frank Ch. Eigler Cc: systemtap@sourceware.org Cc: yrl.pp-manager.tt@hitachi.com Cc: David S. Miller Cc: "David S. Miller" Link: http://lkml.kernel.org/r/20130522093409.9084.63554.stgit@mhiramat-M0-7522 [ Minor edits. ] Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3fed7f0cbcdf..bddf3b201a48 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -467,6 +467,7 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) /* Optimization staging list, protected by kprobe_mutex */ static LIST_HEAD(optimizing_list); static LIST_HEAD(unoptimizing_list); +static LIST_HEAD(freeing_list); static void kprobe_optimizer(struct work_struct *work); static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); @@ -504,7 +505,7 @@ static __kprobes void do_optimize_kprobes(void) * Unoptimize (replace a jump with a breakpoint and remove the breakpoint * if need) kprobes listed on unoptimizing_list. */ -static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) +static __kprobes void do_unoptimize_kprobes(void) { struct optimized_kprobe *op, *tmp; @@ -515,9 +516,9 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) /* Ditto to do_optimize_kprobes */ get_online_cpus(); mutex_lock(&text_mutex); - arch_unoptimize_kprobes(&unoptimizing_list, free_list); + arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); /* Loop free_list for disarming */ - list_for_each_entry_safe(op, tmp, free_list, list) { + list_for_each_entry_safe(op, tmp, &freeing_list, list) { /* Disarm probes if marked disabled */ if (kprobe_disabled(&op->kp)) arch_disarm_kprobe(&op->kp); @@ -536,11 +537,11 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) } /* Reclaim all kprobes on the free_list */ -static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) +static __kprobes void do_free_cleaned_kprobes(void) { struct optimized_kprobe *op, *tmp; - list_for_each_entry_safe(op, tmp, free_list, list) { + list_for_each_entry_safe(op, tmp, &freeing_list, list) { BUG_ON(!kprobe_unused(&op->kp)); list_del_init(&op->list); free_aggr_kprobe(&op->kp); @@ -556,8 +557,6 @@ static __kprobes void kick_kprobe_optimizer(void) /* Kprobe jump optimizer */ static __kprobes void kprobe_optimizer(struct work_struct *work) { - LIST_HEAD(free_list); - mutex_lock(&kprobe_mutex); /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); @@ -566,7 +565,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) * kprobes before waiting for quiesence period. */ - do_unoptimize_kprobes(&free_list); + do_unoptimize_kprobes(); /* * Step 2: Wait for quiesence period to ensure all running interrupts @@ -581,7 +580,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) do_optimize_kprobes(); /* Step 4: Free cleaned kprobes after quiesence period */ - do_free_cleaned_kprobes(&free_list); + do_free_cleaned_kprobes(); mutex_unlock(&module_mutex); mutex_unlock(&kprobe_mutex); @@ -723,8 +722,19 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p) if (!list_empty(&op->list)) /* Dequeue from the (un)optimization queue */ list_del_init(&op->list); - op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + + if (kprobe_unused(p)) { + /* Enqueue if it is unused */ + list_add(&op->list, &freeing_list); + /* + * Remove unused probes from the hash list. After waiting + * for synchronization, this probe is reclaimed. + * (reclaiming is done by do_free_cleaned_kprobes().) + */ + hlist_del_rcu(&op->kp.hlist); + } + /* Don't touch the code, because it is already freed. */ arch_remove_optimized_kprobe(op); } -- cgit v1.2.3 From 26cb63ad11e04047a64309362674bcbbd6a6f246 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 28 May 2013 10:55:48 +0200 Subject: perf: Fix perf mmap bugs Vince reported a problem found by his perf specific trinity fuzzer. Al noticed 2 problems with perf's mmap(): - it has issues against fork() since we use vma->vm_mm for accounting. - it has an rb refcount leak on double mmap(). We fix the issues against fork() by using VM_DONTCOPY; I don't think there's code out there that uses this; we didn't hear about weird accounting problems/crashes. If we do need this to work, the previously proposed VM_PINNED could make this work. Aside from the rb reference leak spotted by Al, Vince's example prog was indeed doing a double mmap() through the use of perf_event_set_output(). This exposes another problem, since we now have 2 events with one buffer, the accounting gets screwy because we account per event. Fix this by making the buffer responsible for its own accounting. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: Al Viro Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20130528085548.GA12193@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 37 ++++++++++++++++++++----------------- kernel/events/internal.h | 3 +++ 2 files changed, 23 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9dc297faf7c0..ae752cd4a086 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2917,7 +2917,7 @@ static void free_event_rcu(struct rcu_head *head) kfree(event); } -static void ring_buffer_put(struct ring_buffer *rb); +static bool ring_buffer_put(struct ring_buffer *rb); static void free_event(struct perf_event *event) { @@ -3582,13 +3582,13 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) return rb; } -static void ring_buffer_put(struct ring_buffer *rb) +static bool ring_buffer_put(struct ring_buffer *rb) { struct perf_event *event, *n; unsigned long flags; if (!atomic_dec_and_test(&rb->refcount)) - return; + return false; spin_lock_irqsave(&rb->event_lock, flags); list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { @@ -3598,6 +3598,7 @@ static void ring_buffer_put(struct ring_buffer *rb) spin_unlock_irqrestore(&rb->event_lock, flags); call_rcu(&rb->rcu_head, rb_free_rcu); + return true; } static void perf_mmap_open(struct vm_area_struct *vma) @@ -3612,18 +3613,20 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->rb); - struct user_struct *user = event->mmap_user; struct ring_buffer *rb = event->rb; + struct user_struct *mmap_user = rb->mmap_user; + int mmap_locked = rb->mmap_locked; + unsigned long size = perf_data_size(rb); - atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->pinned_vm -= event->mmap_locked; rcu_assign_pointer(event->rb, NULL); ring_buffer_detach(event, rb); mutex_unlock(&event->mmap_mutex); - ring_buffer_put(rb); - free_uid(user); + if (ring_buffer_put(rb)) { + atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); + vma->vm_mm->pinned_vm -= mmap_locked; + free_uid(mmap_user); + } } } @@ -3676,9 +3679,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages == nr_pages) - atomic_inc(&event->rb->refcount); - else + if (event->rb->nr_pages != nr_pages) ret = -EINVAL; goto unlock; } @@ -3720,12 +3721,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; goto unlock; } - rcu_assign_pointer(event->rb, rb); + + rb->mmap_locked = extra; + rb->mmap_user = get_current_user(); atomic_long_add(user_extra, &user->locked_vm); - event->mmap_locked = extra; - event->mmap_user = get_current_user(); - vma->vm_mm->pinned_vm += event->mmap_locked; + vma->vm_mm->pinned_vm += extra; + + rcu_assign_pointer(event->rb, rb); perf_event_update_userpage(event); @@ -3734,7 +3737,7 @@ unlock: atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; return ret; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index eb675c4d59df..5bc6c8e9b851 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -31,6 +31,9 @@ struct ring_buffer { spinlock_t event_lock; struct list_head event_list; + int mmap_locked; + struct user_struct *mmap_user; + struct perf_event_mmap_page *user_page; void *data_pages[0]; }; -- cgit v1.2.3 From 6721cb60022629ae76365551f05d9658b8d14c55 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 23 May 2013 14:21:36 -0400 Subject: ring-buffer: Do not poll non allocated cpu buffers The tracing infrastructure sets up for possible CPUs, but it uses the ring buffer polling, it is possible to call the ring buffer polling code with a CPU that hasn't been allocated. This will cause a kernel oops when it access a ring buffer cpu buffer that is part of the possible cpus but hasn't been allocated yet as the CPU has never been online. Reported-by: Mauro Carvalho Chehab Tested-by: Mauro Carvalho Chehab Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b59aea2c48c2..e444ff88f0a4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -620,6 +620,9 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, if (cpu == RING_BUFFER_ALL_CPUS) work = &buffer->irq_work; else { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + cpu_buffer = buffer->buffers[cpu]; work = &cpu_buffer->irq_work; } -- cgit v1.2.3 From aa848233f740abbabfa7669daca0ab94aaa37bcd Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 3 May 2013 23:27:07 +0200 Subject: ntp: Remove unused variable flags in __hardpps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kernel/time/ntp.c: In function ‘__hardpps’: kernel/time/ntp.c:877: warning: unused variable ‘flags’ commit a076b2146fabb0894cae5e0189a8ba3f1502d737 ("ntp: Remove ntp_lock, using the timekeeping locks to protect ntp state") removed its users, but not the actual variable. Signed-off-by: Geert Uytterhoeven Signed-off-by: John Stultz --- kernel/time/ntp.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 12ff13a838c6..8f5b3b98577b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -874,7 +874,6 @@ static void hardpps_update_phase(long error) void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) { struct pps_normtime pts_norm, freq_norm; - unsigned long flags; pts_norm = pps_normalize_ts(*phase_ts); -- cgit v1.2.3 From 0d6bd9953f739dad96d9a0de65383e479ab4e10d Mon Sep 17 00:00:00 2001 From: Zoran Markovic Date: Fri, 17 May 2013 11:24:05 -0700 Subject: timekeeping: Correct run-time detection of persistent_clock. Since commit 31ade30692dc9680bfc95700d794818fa3f754ac, timekeeping_init() checks for presence of persistent clock by attempting to read a non-zero time value. This is an issue on platforms where persistent_clock (instead is implemented as a free-running counter (instead of an RTC) starting from zero on each boot and running during suspend. Examples are some ARM platforms (e.g. PandaBoard). An attempt to read such a clock during timekeeping_init() may return zero value and falsely declare persistent clock as missing. Additionally, in the above case suspend times may be accounted twice (once from timekeeping_resume() and once from rtc_resume()), resulting in a gradual drift of system time. This patch does a run-time correction of the issue by doing the same check during timekeeping_suspend(). A better long-term solution would have to return error when trying to read non-existing clock and zero when trying to read an uninitialized clock, but that would require changing all persistent_clock implementations. This patch addresses the immediate breakage, for now. Cc: John Stultz Cc: Thomas Gleixner Cc: Feng Tang Cc: stable@vger.kernel.org Signed-off-by: Zoran Markovic [jstultz: Tweaked commit message and subject] Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 98cd470bbe49..baeeb5c87cf1 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -975,6 +975,14 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); + /* + * On some systems the persistent_clock can not be detected at + * timekeeping_init by its return value, so if we see a valid + * value returned, update the persistent_clock_exists flag. + */ + if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) + persistent_clock_exist = true; + raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); -- cgit v1.2.3 From 2a0ff3fbe39bc93f719ff857e5a359d9780579ff Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Sun, 26 May 2013 21:33:09 +0800 Subject: cgroup: warn about mismatching options of a new mount of an existing hierarchy With the new __DEVEL__sane_behavior mount option was introduced, if the root cgroup is alive with no xattr function, to mount a new cgroup with xattr will be rejected in terms of design which just fine. However, if the root cgroup does not mounted with __DEVEL__sane_hehavior, to create a new cgroup with xattr option will succeed although after that the EA function does not works as expected but will get ENOTSUPP for setting up attributes under either cgroup. e.g. setfattr: /cgroup2/test: Operation not supported Instead of keeping silence in this case, it's better to drop a log entry in warning level. That would be helpful to understand the reason behind the scene from the user's perspective, and this is essentially an improvement does not break the backward compatibilities. With this fix, above mount attemption will keep up works as usual but the following line cound be found at the system log: [ ...] cgroup: new mount options do not match the existing superblock tj: minor formatting / message updates. Signed-off-by: Jie Liu Reported-by: Alexey Kodanev Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cgroup.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 31e9ef319070..a7c9e6ddb979 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1686,11 +1686,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, */ cgroup_drop_root(opts.new_root); - if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) && - root->flags != opts.flags) { - pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); - ret = -EINVAL; - goto drop_new_super; + if (root->flags != opts.flags) { + if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { + pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); + ret = -EINVAL; + goto drop_new_super; + } else { + pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); + } } /* no subsys rebinding, so refcounts don't change */ -- cgit v1.2.3 From 1bb539ca36e21c2f4fce0865e11df384bc7b7656 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 May 2013 14:38:43 -0400 Subject: ftrace: Use the rcu _notrace variants for rcu_dereference_raw() and friends As rcu_dereference_raw() under RCU debug config options can add quite a bit of checks, and that tracing uses rcu_dereference_raw(), these checks happen with the function tracer. The function tracer also happens to trace these debug checks too. This added overhead can livelock the system. Have the function tracer use the new RCU _notrace equivalents that do not do the debug checks for RCU. Link: http://lkml.kernel.org/r/20130528184209.467603904@goodmis.org Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b549b0f5b977..6c508ff33c62 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -120,22 +120,22 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); /* * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw() is that elements removed from this list + * can use rcu_dereference_raw_notrace() is that elements removed from this list * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw() calls are needed to handle + * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle * concurrent insertions into the ftrace_global_list. * * Silly Alpha and silly pointer-speculation compiler optimizations! */ #define do_for_each_ftrace_op(op, list) \ - op = rcu_dereference_raw(list); \ + op = rcu_dereference_raw_notrace(list); \ do /* * Optimized for just a single item in the list (as that is the normal case). */ #define while_for_each_ftrace_op(op) \ - while (likely(op = rcu_dereference_raw((op)->next)) && \ + while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \ unlikely((op) != &ftrace_list_end)) static inline void ftrace_ops_init(struct ftrace_ops *ops) @@ -779,7 +779,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) if (hlist_empty(hhd)) return NULL; - hlist_for_each_entry_rcu(rec, hhd, node) { + hlist_for_each_entry_rcu_notrace(rec, hhd, node) { if (rec->ip == ip) return rec; } @@ -1165,7 +1165,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) hhd = &hash->buckets[key]; - hlist_for_each_entry_rcu(entry, hhd, hlist) { + hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) { if (entry->ip == ip) return entry; } @@ -1422,8 +1422,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) struct ftrace_hash *notrace_hash; int ret; - filter_hash = rcu_dereference_raw(ops->filter_hash); - notrace_hash = rcu_dereference_raw(ops->notrace_hash); + filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); + notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); if ((ftrace_hash_empty(filter_hash) || ftrace_lookup_ip(filter_hash, ip)) && @@ -2920,7 +2920,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, * on the hash. rcu_read_lock is too dangerous here. */ preempt_disable_notrace(); - hlist_for_each_entry_rcu(entry, hhd, node) { + hlist_for_each_entry_rcu_notrace(entry, hhd, node) { if (entry->ip == ip) entry->ops->func(ip, parent_ip, &entry->data); } -- cgit v1.2.3 From 0184d50f9fd17658c232d6ee6d465a87f989d706 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 29 May 2013 15:56:49 -0400 Subject: tracing: Fix bad parameter passed in branch selftest The branch selftest calls trace_test_buffer(), but with the new code it expects the first parameter to be a pointer to a struct trace_buffer. All self tests were changed but the branch selftest was missed. This caused either a crash or failed test when the branch selftest was enabled. Link: http://lkml.kernel.org/r/20130529141333.GA24064@localhost Reported-by: Fengguang Wu Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 55e2cf66967b..2901e3b88590 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1159,7 +1159,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); -- cgit v1.2.3 From 45eacc692771bd2b1ea3d384e6345cab3da10861 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 15 May 2013 22:16:32 +0200 Subject: vtime: Use consistent clocks among nohz accounting While computing the cputime delta of dynticks CPUs, we are mixing up clocks of differents natures: * local_clock() which takes care of unstable clock sources and fix these if needed. * sched_clock() which is the weaker version of local_clock(). It doesn't compute any fixup in case of unstable source. If the clock source is stable, those two clocks are the same and we can safely compute the difference against two random points. Otherwise it results in random deltas as sched_clock() can randomly drift away, back or forward, from local_clock(). As a consequence, some strange behaviour with unstable tsc has been observed such as non progressing constant zero cputime. (The 'top' command showing no load). Fix this by only using local_clock(), or its irq safe/remote equivalent, in vtime code. Reported-by: Mike Galbraith Suggested-by: Mike Galbraith Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/cputime.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58453b8272fd..e1a27f918723 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4745,7 +4745,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); - vtime_init_idle(idle); + vtime_init_idle(idle, cpu); #if defined(CONFIG_SMP) sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index cc2dc3eea8a3..b5ccba22603b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -747,17 +747,17 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqlock(¤t->vtime_seqlock); current->vtime_snap_whence = VTIME_SYS; - current->vtime_snap = sched_clock(); + current->vtime_snap = sched_clock_cpu(smp_processor_id()); write_sequnlock(¤t->vtime_seqlock); } -void vtime_init_idle(struct task_struct *t) +void vtime_init_idle(struct task_struct *t, int cpu) { unsigned long flags; write_seqlock_irqsave(&t->vtime_seqlock, flags); t->vtime_snap_whence = VTIME_SYS; - t->vtime_snap = sched_clock(); + t->vtime_snap = sched_clock_cpu(cpu); write_sequnlock_irqrestore(&t->vtime_seqlock, flags); } -- cgit v1.2.3 From 521921bad1192fb1b8f9b6a5aa673635848b8b5f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 May 2013 01:21:38 +0200 Subject: kvm: Move guest entry/exit APIs to context_tracking The kvm_host.h header file doesn't handle well inclusion when archs don't support KVM. This results in build crashes for such archs when they want to implement context tracking because this subsystem includes kvm_host.h in order to implement the guest_enter/exit APIs but it doesn't handle KVM off case. To fix this, move the guest_enter()/guest_exit() declarations and generic implementation to the context tracking headers. These generic APIs actually belong to this subsystem, besides other domains boundary tracking like user_enter() et al. KVM now properly becomes a user of this library, not the other buggy way around. Reported-by: Kevin Hilman Reviewed-by: Kevin Hilman Tested-by: Kevin Hilman Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Kevin Hilman Cc: Marcelo Tosatti Cc: Gleb Natapov Signed-off-by: Ingo Molnar --- kernel/context_tracking.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 65349f07b878..85bdde1137eb 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -15,7 +15,6 @@ */ #include -#include #include #include #include -- cgit v1.2.3 From 1a7f829f094dd7951e7d46c571a18080e455a436 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 17 May 2013 16:44:04 +0800 Subject: nohz: Fix notifier return val that enforce timekeeping In tick_nohz_cpu_down_callback() if the cpu is the one handling timekeeping, we must return something that stops the CPU_DOWN_PREPARE notifiers and then start notify CPU_DOWN_FAILED on the already called notifier call backs. However traditional errno values are not handled by the notifier unless these are encapsulated using errno_to_notifier(). Hence the current -EINVAL is misinterpreted and converted to junk after notifier_to_errno(), leaving the notifier subsystem to random behaviour such as eventually allowing the cpu to go down. Fix this by using the standard NOTIFY_BAD instead. Signed-off-by: Li Zhong Reviewed-by: Srivatsa S. Bhat Acked-by: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f4208138fbf4..0cf1c1453181 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -306,7 +306,7 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, * we can't safely shutdown that CPU. */ if (have_nohz_full_mask && tick_do_timer_cpu == cpu) - return -EINVAL; + return NOTIFY_BAD; break; } return NOTIFY_OK; -- cgit v1.2.3 From f5d00c1f9adb350c24c5301600f7bf2da99b66de Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Tue, 28 May 2013 15:29:03 +0200 Subject: tick: Remove useless timekeeping duty attribution to broadcast source Since 7300711e ("clockevents: broadcast fixup possible waiters"), the timekeeping duty is assigned to the CPU that handles the tick broadcast clock device by the time it is set in one shot mode. This is an issue in full dynticks mode where the timekeeping duty must stay handled by the boot CPU for now. Otherwise it prevents secondary CPUs from offlining and this breaks suspend/shutdown/reboot/... As it appears there is no reason for this timekeeping duty to be moved to the broadcast CPU, besides nothing prevent it from being later re-assigned to another target, let's simply remove it. Signed-off-by: Jiri Bohac Reported-by: Steven Rostedt Acked-by: Thomas Gleixner Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Borislav Petkov Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 0c739423b0f9..b4c245580b79 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -698,10 +698,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) bc->event_handler = tick_handle_oneshot_broadcast; - /* Take the do_timer update */ - if (!tick_nohz_full_cpu(cpu)) - tick_do_timer_cpu = cpu; - /* * We must be careful here. There might be other CPUs * waiting for periodic broadcast. We need to set the -- cgit v1.2.3 From f17a5194859a82afe4164e938b92035b86c55794 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 30 May 2013 21:10:37 -0400 Subject: tracing: Use current_uid() for critical time tracing The irqsoff tracer records the max time that interrupts are disabled. There are hooks in the assembly code that calls back into the tracer when interrupts are disabled or enabled. When they are enabled, the tracer checks if the amount of time they were disabled is larger than the previous recorded max interrupts off time. If it is, it creates a snapshot of the currently running trace to store where the last largest interrupts off time was held and how it happened. During testing, this RCU lockdep dump appeared: [ 1257.829021] =============================== [ 1257.829021] [ INFO: suspicious RCU usage. ] [ 1257.829021] 3.10.0-rc1-test+ #171 Tainted: G W [ 1257.829021] ------------------------------- [ 1257.829021] /home/rostedt/work/git/linux-trace.git/include/linux/rcupdate.h:780 rcu_read_lock() used illegally while idle! [ 1257.829021] [ 1257.829021] other info that might help us debug this: [ 1257.829021] [ 1257.829021] [ 1257.829021] RCU used illegally from idle CPU! [ 1257.829021] rcu_scheduler_active = 1, debug_locks = 0 [ 1257.829021] RCU used illegally from extended quiescent state! [ 1257.829021] 2 locks held by trace-cmd/4831: [ 1257.829021] #0: (max_trace_lock){......}, at: [] stop_critical_timing+0x1a3/0x209 [ 1257.829021] #1: (rcu_read_lock){.+.+..}, at: [] __update_max_tr+0x88/0x1ee [ 1257.829021] [ 1257.829021] stack backtrace: [ 1257.829021] CPU: 3 PID: 4831 Comm: trace-cmd Tainted: G W 3.10.0-rc1-test+ #171 [ 1257.829021] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS SDBLI944.86P 05/08/2007 [ 1257.829021] 0000000000000001 ffff880065f49da8 ffffffff8153dd2b ffff880065f49dd8 [ 1257.829021] ffffffff81092a00 ffff88006bd78680 ffff88007add7500 0000000000000003 [ 1257.829021] ffff88006bd78680 ffff880065f49e18 ffffffff810daebf ffffffff810dae5a [ 1257.829021] Call Trace: [ 1257.829021] [] dump_stack+0x19/0x1b [ 1257.829021] [] lockdep_rcu_suspicious+0x109/0x112 [ 1257.829021] [] __update_max_tr+0xed/0x1ee [ 1257.829021] [] ? __update_max_tr+0x88/0x1ee [ 1257.829021] [] ? user_enter+0xfd/0x107 [ 1257.829021] [] update_max_tr_single+0x11d/0x12d [ 1257.829021] [] ? user_enter+0xfd/0x107 [ 1257.829021] [] stop_critical_timing+0x141/0x209 [ 1257.829021] [] ? trace_hardirqs_on+0xd/0xf [ 1257.829021] [] ? user_enter+0xfd/0x107 [ 1257.829021] [] time_hardirqs_on+0x2a/0x2f [ 1257.829021] [] ? user_enter+0xfd/0x107 [ 1257.829021] [] trace_hardirqs_on_caller+0x16/0x197 [ 1257.829021] [] trace_hardirqs_on+0xd/0xf [ 1257.829021] [] user_enter+0xfd/0x107 [ 1257.829021] [] do_notify_resume+0x92/0x97 [ 1257.829021] [] int_signal+0x12/0x17 What happened was entering into the user code, the interrupts were enabled and a max interrupts off was recorded. The trace buffer was saved along with various information about the task: comm, pid, uid, priority, etc. The uid is recorded with task_uid(tsk). But this is a macro that uses rcu_read_lock() to retrieve the data, and this happened to happen where RCU is blind (user_enter). As only the preempt and irqs off tracers can have this happen, and they both only have the tsk == current, if tsk == current, use current_uid() instead of task_uid(), as current_uid() does not use RCU as only current can change its uid. This fixes the RCU suspicious splat. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4d79485b3237..1a41023a1f88 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -843,7 +843,15 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); max_data->pid = tsk->pid; - max_data->uid = task_uid(tsk); + /* + * If tsk == current, then use current_uid(), as that does not use + * RCU. The irq tracer can be called out of RCU scope. + */ + if (tsk == current) + max_data->uid = current_uid(); + else + max_data->uid = task_uid(tsk); + max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; max_data->policy = tsk->policy; max_data->rt_priority = tsk->rt_priority; -- cgit v1.2.3 From 346dbb79ea0118ebb0df372b35cab9d5805216cd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 25 Apr 2013 19:28:54 +0200 Subject: irqdomain: export irq_domain_add_simple All other irq_domain_add_* functions are exported already, and apparently this one got left out by mistake, which causes build errors for ARM allmodconfig kernels: ERROR: "irq_domain_add_simple" [drivers/gpio/gpio-rcar.ko] undefined! ERROR: "irq_domain_add_simple" [drivers/gpio/gpio-em.ko] undefined! Signed-off-by: Arnd Bergmann Acked-by: Simon Horman Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5a83dde8ca0c..d1adaedb435f 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -191,6 +191,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, /* A linear domain is the default */ return irq_domain_add_linear(of_node, size, ops, host_data); } +EXPORT_SYMBOL_GPL(irq_domain_add_simple); /** * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. -- cgit v1.2.3 From 275e31b10ce20613aedceaa5160129c64b260a98 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Tue, 14 May 2013 19:02:45 +0800 Subject: kernel/irq/irqdomain.c: before use 'irq_data', need check it whether valid. Since irq_data may be NULL, if so, we WARN_ON(), and continue, 'hwirq' which related with 'irq_data' has to initialize later, or it will cause issue. Signed-off-by: Chen Gang Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index d1adaedb435f..8c4c8ea6a205 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -398,11 +398,12 @@ static void irq_domain_disassociate_many(struct irq_domain *domain, while (count--) { int irq = irq_base + count; struct irq_data *irq_data = irq_get_irq_data(irq); - irq_hw_number_t hwirq = irq_data->hwirq; + irq_hw_number_t hwirq; if (WARN_ON(!irq_data || irq_data->domain != domain)) continue; + hwirq = irq_data->hwirq; irq_set_status_flags(irq, IRQ_NOREQUEST); /* remove chip and handler */ -- cgit v1.2.3 From 94a63da0ac1a67bfb8b30aec1086523c5031ea5a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 6 Jun 2013 12:10:23 +0100 Subject: irqdomain: document the simple domain first_irq The first_irq needs to be zero to get a linear domain and that comes with special semantics. We want to simplify this going forward but some documentation never hurts. Signed-off-by: Linus Walleij Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8c4c8ea6a205..54a4d5223238 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -143,7 +143,10 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, * irq_domain_add_simple() - Allocate and register a simple irq_domain. * @of_node: pointer to interrupt controller's device tree node. * @size: total number of irqs in mapping - * @first_irq: first number of irq block assigned to the domain + * @first_irq: first number of irq block assigned to the domain, + * pass zero to assign irqs on-the-fly. This will result in a + * linear IRQ domain so it is important to use irq_create_mapping() + * for each used IRQ, especially when SPARSE_IRQ is enabled. * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * -- cgit v1.2.3 From 016a8d5be6ddcc72ef0432d82d9f6fa34f61b907 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 May 2013 17:32:53 -0400 Subject: rcu: Don't call wakeup() with rcu_node structure ->lock held This commit fixes a lockdep-detected deadlock by moving a wake_up() call out from a rnp->lock critical section. Please see below for the long version of this story. On Tue, 2013-05-28 at 16:13 -0400, Dave Jones wrote: > [12572.705832] ====================================================== > [12572.750317] [ INFO: possible circular locking dependency detected ] > [12572.796978] 3.10.0-rc3+ #39 Not tainted > [12572.833381] ------------------------------------------------------- > [12572.862233] trinity-child17/31341 is trying to acquire lock: > [12572.870390] (rcu_node_0){..-.-.}, at: [] rcu_read_unlock_special+0x9f/0x4c0 > [12572.878859] > but task is already holding lock: > [12572.894894] (&ctx->lock){-.-...}, at: [] perf_lock_task_context+0x7d/0x2d0 > [12572.903381] > which lock already depends on the new lock. > > [12572.927541] > the existing dependency chain (in reverse order) is: > [12572.943736] > -> #4 (&ctx->lock){-.-...}: > [12572.960032] [] lock_acquire+0x91/0x1f0 > [12572.968337] [] _raw_spin_lock+0x40/0x80 > [12572.976633] [] __perf_event_task_sched_out+0x2e7/0x5e0 > [12572.984969] [] perf_event_task_sched_out+0x93/0xa0 > [12572.993326] [] __schedule+0x2cf/0x9c0 > [12573.001652] [] schedule_user+0x2e/0x70 > [12573.009998] [] retint_careful+0x12/0x2e > [12573.018321] > -> #3 (&rq->lock){-.-.-.}: > [12573.034628] [] lock_acquire+0x91/0x1f0 > [12573.042930] [] _raw_spin_lock+0x40/0x80 > [12573.051248] [] wake_up_new_task+0xb7/0x260 > [12573.059579] [] do_fork+0x105/0x470 > [12573.067880] [] kernel_thread+0x26/0x30 > [12573.076202] [] rest_init+0x23/0x140 > [12573.084508] [] start_kernel+0x3f1/0x3fe > [12573.092852] [] x86_64_start_reservations+0x2a/0x2c > [12573.101233] [] x86_64_start_kernel+0xcc/0xcf > [12573.109528] > -> #2 (&p->pi_lock){-.-.-.}: > [12573.125675] [] lock_acquire+0x91/0x1f0 > [12573.133829] [] _raw_spin_lock_irqsave+0x4b/0x90 > [12573.141964] [] try_to_wake_up+0x31/0x320 > [12573.150065] [] default_wake_function+0x12/0x20 > [12573.158151] [] autoremove_wake_function+0x18/0x40 > [12573.166195] [] __wake_up_common+0x58/0x90 > [12573.174215] [] __wake_up+0x39/0x50 > [12573.182146] [] rcu_start_gp_advanced.isra.11+0x4a/0x50 > [12573.190119] [] rcu_start_future_gp+0x1c9/0x1f0 > [12573.198023] [] rcu_nocb_kthread+0x114/0x930 > [12573.205860] [] kthread+0xed/0x100 > [12573.213656] [] ret_from_fork+0x7c/0xb0 > [12573.221379] > -> #1 (&rsp->gp_wq){..-.-.}: > [12573.236329] [] lock_acquire+0x91/0x1f0 > [12573.243783] [] _raw_spin_lock_irqsave+0x4b/0x90 > [12573.251178] [] __wake_up+0x23/0x50 > [12573.258505] [] rcu_start_gp_advanced.isra.11+0x4a/0x50 > [12573.265891] [] rcu_start_future_gp+0x1c9/0x1f0 > [12573.273248] [] rcu_nocb_kthread+0x114/0x930 > [12573.280564] [] kthread+0xed/0x100 > [12573.287807] [] ret_from_fork+0x7c/0xb0 Notice the above call chain. rcu_start_future_gp() is called with the rnp->lock held. Then it calls rcu_start_gp_advance, which does a wakeup. You can't do wakeups while holding the rnp->lock, as that would mean that you could not do a rcu_read_unlock() while holding the rq lock, or any lock that was taken while holding the rq lock. This is because... (See below). > [12573.295067] > -> #0 (rcu_node_0){..-.-.}: > [12573.309293] [] __lock_acquire+0x1786/0x1af0 > [12573.316568] [] lock_acquire+0x91/0x1f0 > [12573.323825] [] _raw_spin_lock+0x40/0x80 > [12573.331081] [] rcu_read_unlock_special+0x9f/0x4c0 > [12573.338377] [] __rcu_read_unlock+0x96/0xa0 > [12573.345648] [] perf_lock_task_context+0x143/0x2d0 > [12573.352942] [] find_get_context+0x4e/0x1f0 > [12573.360211] [] SYSC_perf_event_open+0x514/0xbd0 > [12573.367514] [] SyS_perf_event_open+0x9/0x10 > [12573.374816] [] tracesys+0xdd/0xe2 Notice the above trace. perf took its own ctx->lock, which can be taken while holding the rq lock. While holding this lock, it did a rcu_read_unlock(). The perf_lock_task_context() basically looks like: rcu_read_lock(); raw_spin_lock(ctx->lock); rcu_read_unlock(); Now, what looks to have happened, is that we scheduled after taking that first rcu_read_lock() but before taking the spin lock. When we scheduled back in and took the ctx->lock, the following rcu_read_unlock() triggered the "special" code. The rcu_read_unlock_special() takes the rnp->lock, which gives us a possible deadlock scenario. CPU0 CPU1 CPU2 ---- ---- ---- rcu_nocb_kthread() lock(rq->lock); lock(ctx->lock); lock(rnp->lock); wake_up(); lock(rq->lock); rcu_read_unlock(); rcu_read_unlock_special(); lock(rnp->lock); lock(ctx->lock); **** DEADLOCK **** > [12573.382068] > other info that might help us debug this: > > [12573.403229] Chain exists of: > rcu_node_0 --> &rq->lock --> &ctx->lock > > [12573.424471] Possible unsafe locking scenario: > > [12573.438499] CPU0 CPU1 > [12573.445599] ---- ---- > [12573.452691] lock(&ctx->lock); > [12573.459799] lock(&rq->lock); > [12573.467010] lock(&ctx->lock); > [12573.474192] lock(rcu_node_0); > [12573.481262] > *** DEADLOCK *** > > [12573.501931] 1 lock held by trinity-child17/31341: > [12573.508990] #0: (&ctx->lock){-.-...}, at: [] perf_lock_task_context+0x7d/0x2d0 > [12573.516475] > stack backtrace: > [12573.530395] CPU: 1 PID: 31341 Comm: trinity-child17 Not tainted 3.10.0-rc3+ #39 > [12573.545357] ffffffff825b4f90 ffff880219f1dbc0 ffffffff816e375b ffff880219f1dc00 > [12573.552868] ffffffff816dfa5d ffff880219f1dc50 ffff88023ce4d1f8 ffff88023ce4ca40 > [12573.560353] 0000000000000001 0000000000000001 ffff88023ce4d1f8 ffff880219f1dcc0 > [12573.567856] Call Trace: > [12573.575011] [] dump_stack+0x19/0x1b > [12573.582284] [] print_circular_bug+0x200/0x20f > [12573.589637] [] __lock_acquire+0x1786/0x1af0 > [12573.596982] [] ? sched_clock_cpu+0xb5/0x100 > [12573.604344] [] lock_acquire+0x91/0x1f0 > [12573.611652] [] ? rcu_read_unlock_special+0x9f/0x4c0 > [12573.619030] [] _raw_spin_lock+0x40/0x80 > [12573.626331] [] ? rcu_read_unlock_special+0x9f/0x4c0 > [12573.633671] [] rcu_read_unlock_special+0x9f/0x4c0 > [12573.640992] [] ? perf_lock_task_context+0x7d/0x2d0 > [12573.648330] [] ? put_lock_stats.isra.29+0xe/0x40 > [12573.655662] [] ? delay_tsc+0x90/0xe0 > [12573.662964] [] __rcu_read_unlock+0x96/0xa0 > [12573.670276] [] perf_lock_task_context+0x143/0x2d0 > [12573.677622] [] ? __perf_event_enable+0x370/0x370 > [12573.684981] [] find_get_context+0x4e/0x1f0 > [12573.692358] [] SYSC_perf_event_open+0x514/0xbd0 > [12573.699753] [] ? get_parent_ip+0xd/0x50 > [12573.707135] [] ? trace_hardirqs_on_caller+0xfd/0x1c0 > [12573.714599] [] SyS_perf_event_open+0x9/0x10 > [12573.721996] [] tracesys+0xdd/0xe2 This commit delays the wakeup via irq_work(), which is what perf and ftrace use to perform wakeups in critical sections. Reported-by: Dave Jones Signed-off-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 17 +++++++++++++++-- kernel/rcutree.h | 2 ++ 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 16ea67925015..b61d20c5ee7b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1613,6 +1613,14 @@ static int __noreturn rcu_gp_kthread(void *arg) } } +static void rsp_wakeup(struct irq_work *work) +{ + struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work); + + /* Wake up rcu_gp_kthread() to start the grace period. */ + wake_up(&rsp->gp_wq); +} + /* * Start a new RCU grace period if warranted, re-initializing the hierarchy * in preparation for detecting the next grace period. The caller must hold @@ -1637,8 +1645,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, } rsp->gp_flags = RCU_GP_FLAG_INIT; - /* Wake up rcu_gp_kthread() to start the grace period. */ - wake_up(&rsp->gp_wq); + /* + * We can't do wakeups while holding the rnp->lock, as that + * could cause possible deadlocks with the rq->lock. Deter + * the wakeup to interrupt context. + */ + irq_work_queue(&rsp->wakeup_work); } /* @@ -3235,6 +3247,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, rsp->rda = rda; init_waitqueue_head(&rsp->gp_wq); + init_irq_work(&rsp->wakeup_work, rsp_wakeup); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index da77a8f57ff9..4df503470e42 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -27,6 +27,7 @@ #include #include #include +#include /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and @@ -442,6 +443,7 @@ struct rcu_state { char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ struct list_head flavors; /* List of RCU flavors. */ + struct irq_work wakeup_work; /* Postponed wakeups */ }; /* Values for rcu_state structure's gp_flags field. */ -- cgit v1.2.3 From 971394f389992f8462c4e5ae0e3b49a10a9534a3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 2 Jun 2013 07:13:57 -0700 Subject: rcu: Fix deadlock with CPU hotplug, RCU GP init, and timer migration In Steven Rostedt's words: > I've been debugging the last couple of days why my tests have been > locking up. One of my tracing tests, runs all available tracers. The > lockup always happened with the mmiotrace, which is used to trace > interactions between priority drivers and the kernel. But to do this > easily, when the tracer gets registered, it disables all but the boot > CPUs. The lockup always happened after it got done disabling the CPUs. > > Then I decided to try this: > > while :; do > for i in 1 2 3; do > echo 0 > /sys/devices/system/cpu/cpu$i/online > done > for i in 1 2 3; do > echo 1 > /sys/devices/system/cpu/cpu$i/online > done > done > > Well, sure enough, that locked up too, with the same users. Doing a > sysrq-w (showing all blocked tasks): > > [ 2991.344562] task PC stack pid father > [ 2991.344562] rcu_preempt D ffff88007986fdf8 0 10 2 0x00000000 > [ 2991.344562] ffff88007986fc98 0000000000000002 ffff88007986fc48 0000000000000908 > [ 2991.344562] ffff88007986c280 ffff88007986ffd8 ffff88007986ffd8 00000000001d3c80 > [ 2991.344562] ffff880079248a40 ffff88007986c280 0000000000000000 00000000fffd4295 > [ 2991.344562] Call Trace: > [ 2991.344562] [] schedule+0x64/0x66 > [ 2991.344562] [] schedule_timeout+0xbc/0xf9 > [ 2991.344562] [] ? ftrace_call+0x5/0x2f > [ 2991.344562] [] ? cascade+0xa8/0xa8 > [ 2991.344562] [] schedule_timeout_uninterruptible+0x1e/0x20 > [ 2991.344562] [] rcu_gp_kthread+0x502/0x94b > [ 2991.344562] [] ? __init_waitqueue_head+0x50/0x50 > [ 2991.344562] [] ? rcu_gp_fqs+0x64/0x64 > [ 2991.344562] [] kthread+0xb1/0xb9 > [ 2991.344562] [] ? lock_release_holdtime.part.23+0x4e/0x55 > [ 2991.344562] [] ? __init_kthread_worker+0x58/0x58 > [ 2991.344562] [] ret_from_fork+0x7c/0xb0 > [ 2991.344562] [] ? __init_kthread_worker+0x58/0x58 > [ 2991.344562] kworker/0:1 D ffffffff81a30680 0 47 2 0x00000000 > [ 2991.344562] Workqueue: events cpuset_hotplug_workfn > [ 2991.344562] ffff880078dbbb58 0000000000000002 0000000000000006 00000000000000d8 > [ 2991.344562] ffff880078db8100 ffff880078dbbfd8 ffff880078dbbfd8 00000000001d3c80 > [ 2991.344562] ffff8800779ca5c0 ffff880078db8100 ffffffff81541fcf 0000000000000000 > [ 2991.344562] Call Trace: > [ 2991.344562] [] ? __mutex_lock_common+0x3d4/0x609 > [ 2991.344562] [] schedule+0x64/0x66 > [ 2991.344562] [] schedule_preempt_disabled+0x18/0x24 > [ 2991.344562] [] __mutex_lock_common+0x3d4/0x609 > [ 2991.344562] [] ? get_online_cpus+0x3c/0x50 > [ 2991.344562] [] ? get_online_cpus+0x3c/0x50 > [ 2991.344562] [] mutex_lock_nested+0x3b/0x40 > [ 2991.344562] [] get_online_cpus+0x3c/0x50 > [ 2991.344562] [] rebuild_sched_domains_locked+0x6e/0x3a8 > [ 2991.344562] [] rebuild_sched_domains+0x1c/0x2a > [ 2991.344562] [] cpuset_hotplug_workfn+0x1c7/0x1d3 > [ 2991.344562] [] ? cpuset_hotplug_workfn+0x5/0x1d3 > [ 2991.344562] [] process_one_work+0x2d4/0x4d1 > [ 2991.344562] [] ? process_one_work+0x207/0x4d1 > [ 2991.344562] [] worker_thread+0x2e7/0x3b5 > [ 2991.344562] [] ? rescuer_thread+0x332/0x332 > [ 2991.344562] [] kthread+0xb1/0xb9 > [ 2991.344562] [] ? __init_kthread_worker+0x58/0x58 > [ 2991.344562] [] ret_from_fork+0x7c/0xb0 > [ 2991.344562] [] ? __init_kthread_worker+0x58/0x58 > [ 2991.344562] bash D ffffffff81a4aa80 0 2618 2612 0x10000000 > [ 2991.344562] ffff8800379abb58 0000000000000002 0000000000000006 0000000000000c2c > [ 2991.344562] ffff880077fea140 ffff8800379abfd8 ffff8800379abfd8 00000000001d3c80 > [ 2991.344562] ffff8800779ca5c0 ffff880077fea140 ffffffff81541fcf 0000000000000000 > [ 2991.344562] Call Trace: > [ 2991.344562] [] ? __mutex_lock_common+0x3d4/0x609 > [ 2991.344562] [] schedule+0x64/0x66 > [ 2991.344562] [] schedule_preempt_disabled+0x18/0x24 > [ 2991.344562] [] __mutex_lock_common+0x3d4/0x609 > [ 2991.344562] [] ? rcu_cpu_notify+0x2f5/0x86e > [ 2991.344562] [] ? rcu_cpu_notify+0x2f5/0x86e > [ 2991.344562] [] mutex_lock_nested+0x3b/0x40 > [ 2991.344562] [] rcu_cpu_notify+0x2f5/0x86e > [ 2991.344562] [] ? __lock_is_held+0x32/0x53 > [ 2991.344562] [] notifier_call_chain+0x6b/0x98 > [ 2991.344562] [] __raw_notifier_call_chain+0xe/0x10 > [ 2991.344562] [] __cpu_notify+0x20/0x32 > [ 2991.344562] [] cpu_notify_nofail+0x17/0x36 > [ 2991.344562] [] _cpu_down+0x154/0x259 > [ 2991.344562] [] cpu_down+0x2d/0x3a > [ 2991.344562] [] store_online+0x4e/0xe7 > [ 2991.344562] [] dev_attr_store+0x20/0x22 > [ 2991.344562] [] sysfs_write_file+0x108/0x144 > [ 2991.344562] [] vfs_write+0xfd/0x158 > [ 2991.344562] [] SyS_write+0x5c/0x83 > [ 2991.344562] [] tracesys+0xdd/0xe2 > > As well as held locks: > > [ 3034.728033] Showing all locks held in the system: > [ 3034.728033] 1 lock held by rcu_preempt/10: > [ 3034.728033] #0: (rcu_preempt_state.onoff_mutex){+.+...}, at: [] rcu_gp_kthread+0x167/0x94b > [ 3034.728033] 4 locks held by kworker/0:1/47: > [ 3034.728033] #0: (events){.+.+.+}, at: [] process_one_work+0x207/0x4d1 > [ 3034.728033] #1: (cpuset_hotplug_work){+.+.+.}, at: [] process_one_work+0x207/0x4d1 > [ 3034.728033] #2: (cpuset_mutex){+.+.+.}, at: [] rebuild_sched_domains+0x17/0x2a > [ 3034.728033] #3: (cpu_hotplug.lock){+.+.+.}, at: [] get_online_cpus+0x3c/0x50 > [ 3034.728033] 1 lock held by mingetty/2563: > [ 3034.728033] #0: (&ldata->atomic_read_lock){+.+...}, at: [] n_tty_read+0x252/0x7e8 > [ 3034.728033] 1 lock held by mingetty/2565: > [ 3034.728033] #0: (&ldata->atomic_read_lock){+.+...}, at: [] n_tty_read+0x252/0x7e8 > [ 3034.728033] 1 lock held by mingetty/2569: > [ 3034.728033] #0: (&ldata->atomic_read_lock){+.+...}, at: [] n_tty_read+0x252/0x7e8 > [ 3034.728033] 1 lock held by mingetty/2572: > [ 3034.728033] #0: (&ldata->atomic_read_lock){+.+...}, at: [] n_tty_read+0x252/0x7e8 > [ 3034.728033] 1 lock held by mingetty/2575: > [ 3034.728033] #0: (&ldata->atomic_read_lock){+.+...}, at: [] n_tty_read+0x252/0x7e8 > [ 3034.728033] 7 locks held by bash/2618: > [ 3034.728033] #0: (sb_writers#5){.+.+.+}, at: [] file_start_write+0x2a/0x2c > [ 3034.728033] #1: (&buffer->mutex#2){+.+.+.}, at: [] sysfs_write_file+0x3c/0x144 > [ 3034.728033] #2: (s_active#54){.+.+.+}, at: [] sysfs_write_file+0xe7/0x144 > [ 3034.728033] #3: (x86_cpu_hotplug_driver_mutex){+.+.+.}, at: [] cpu_hotplug_driver_lock+0x17/0x19 > [ 3034.728033] #4: (cpu_add_remove_lock){+.+.+.}, at: [] cpu_maps_update_begin+0x17/0x19 > [ 3034.728033] #5: (cpu_hotplug.lock){+.+.+.}, at: [] cpu_hotplug_begin+0x2c/0x6d > [ 3034.728033] #6: (rcu_preempt_state.onoff_mutex){+.+...}, at: [] rcu_cpu_notify+0x2f5/0x86e > [ 3034.728033] 1 lock held by bash/2980: > [ 3034.728033] #0: (&ldata->atomic_read_lock){+.+...}, at: [] n_tty_read+0x252/0x7e8 > > Things looked a little weird. Also, this is a deadlock that lockdep did > not catch. But what we have here does not look like a circular lock > issue: > > Bash is blocked in rcu_cpu_notify(): > > 1961 /* Exclude any attempts to start a new grace period. */ > 1962 mutex_lock(&rsp->onoff_mutex); > > > kworker is blocked in get_online_cpus(), which makes sense as we are > currently taking down a CPU. > > But rcu_preempt is not blocked on anything. It is simply sleeping in > rcu_gp_kthread (really rcu_gp_init) here: > > 1453 #ifdef CONFIG_PROVE_RCU_DELAY > 1454 if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 && > 1455 system_state == SYSTEM_RUNNING) > 1456 schedule_timeout_uninterruptible(2); > 1457 #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ > > And it does this while holding the onoff_mutex that bash is waiting for. > > Doing a function trace, it showed me where it happened: > > [ 125.940066] rcu_pree-10 3.... 28384115273: schedule_timeout_uninterruptible <-rcu_gp_kthread > [...] > [ 125.940066] rcu_pree-10 3d..3 28384202439: sched_switch: prev_comm=rcu_preempt prev_pid=10 prev_prio=120 prev_state=D ==> next_comm=watchdog/3 next_pid=38 next_prio=120 > > The watchdog ran, and then: > > [ 125.940066] watchdog-38 3d..3 28384692863: sched_switch: prev_comm=watchdog/3 prev_pid=38 prev_prio=120 prev_state=P ==> next_comm=modprobe next_pid=2848 next_prio=118 > > Not sure what modprobe was doing, but shortly after that: > > [ 125.940066] modprobe-2848 3d..3 28385041749: sched_switch: prev_comm=modprobe prev_pid=2848 prev_prio=118 prev_state=R+ ==> next_comm=migration/3 next_pid=40 next_prio=0 > > Where the migration thread took down the CPU: > > [ 125.940066] migratio-40 3d..3 28389148276: sched_switch: prev_comm=migration/3 prev_pid=40 prev_prio=0 prev_state=P ==> next_comm=swapper/3 next_pid=0 next_prio=120 > > which finally did: > > [ 125.940066] -0 3...1 28389282142: arch_cpu_idle_dead <-cpu_startup_entry > [ 125.940066] -0 3...1 28389282548: native_play_dead <-arch_cpu_idle_dead > [ 125.940066] -0 3...1 28389282924: play_dead_common <-native_play_dead > [ 125.940066] -0 3...1 28389283468: idle_task_exit <-play_dead_common > [ 125.940066] -0 3...1 28389284644: amd_e400_remove_cpu <-play_dead_common > > > CPU 3 is now offline, the rcu_preempt thread that ran on CPU 3 is still > doing a schedule_timeout_uninterruptible() and it registered it's > timeout to the timer base for CPU 3. You would think that it would get > migrated right? The issue here is that the timer migration happens at > the CPU notifier for CPU_DEAD. The problem is that the rcu notifier for > CPU_DOWN is blocked waiting for the onoff_mutex to be released, which is > held by the thread that just put itself into a uninterruptible sleep, > that wont wake up until the CPU_DEAD notifier of the timer > infrastructure is called, which wont happen until the rcu notifier > finishes. Here's our deadlock! This commit breaks this deadlock cycle by substituting a shorter udelay() for the previous schedule_timeout_uninterruptible(), while at the same time increasing the probability of the delay. This maintains the intensity of the testing. Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney Tested-by: Steven Rostedt --- kernel/rcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b61d20c5ee7b..35380019f0fc 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1451,9 +1451,9 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); #ifdef CONFIG_PROVE_RCU_DELAY - if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 && + if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 && system_state == SYSTEM_RUNNING) - schedule_timeout_uninterruptible(2); + udelay(200); #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ cond_resched(); } -- cgit v1.2.3 From 34376a50fb1fa095b9d0636fa41ed2e73125f214 Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Thu, 6 Jun 2013 14:29:49 -0700 Subject: Fix lockup related to stop_machine being stuck in __do_softirq. The stop machine logic can lock up if all but one of the migration threads make it through the disable-irq step and the one remaining thread gets stuck in __do_softirq. The reason __do_softirq can hang is that it has a bail-out based on jiffies timeout, but in the lockup case, jiffies itself is not incremented. To work around this, re-add the max_restart counter in __do_irq and stop processing irqs after 10 restarts. Thanks to Tejun Heo and Rusty Russell and others for helping me track this down. This was introduced in 3.9 by commit c10d73671ad3 ("softirq: reduce latencies"). It may be worth looking into ath9k to see if it has issues with its irq handler at a later date. The hang stack traces look something like this: ------------[ cut here ]------------ WARNING: at kernel/watchdog.c:245 watchdog_overflow_callback+0x9c/0xa7() Watchdog detected hard LOCKUP on cpu 2 Modules linked in: ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 nfsv4 auth_rpcgss nfs fscache nf_nat_ipv4 nf_nat veth 8021q garp stp mrp llc pktgen lockd sunrpc] Pid: 23, comm: migration/2 Tainted: G C 3.9.4+ #11 Call Trace: warn_slowpath_common+0x85/0x9f warn_slowpath_fmt+0x46/0x48 watchdog_overflow_callback+0x9c/0xa7 __perf_event_overflow+0x137/0x1cb perf_event_overflow+0x14/0x16 intel_pmu_handle_irq+0x2dc/0x359 perf_event_nmi_handler+0x19/0x1b nmi_handle+0x7f/0xc2 do_nmi+0xbc/0x304 end_repeat_nmi+0x1e/0x2e <> cpu_stopper_thread+0xae/0x162 smpboot_thread_fn+0x258/0x260 kthread+0xc7/0xcf ret_from_fork+0x7c/0xb0 ---[ end trace 4947dfa9b0a4cec3 ]--- BUG: soft lockup - CPU#1 stuck for 22s! [migration/1:17] Modules linked in: ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 nfsv4 auth_rpcgss nfs fscache nf_nat_ipv4 nf_nat veth 8021q garp stp mrp llc pktgen lockd sunrpc] irq event stamp: 835637905 hardirqs last enabled at (835637904): __do_softirq+0x9f/0x257 hardirqs last disabled at (835637905): apic_timer_interrupt+0x6d/0x80 softirqs last enabled at (5654720): __do_softirq+0x1ff/0x257 softirqs last disabled at (5654725): irq_exit+0x5f/0xbb CPU 1 Pid: 17, comm: migration/1 Tainted: G WC 3.9.4+ #11 To be filled by O.E.M. To be filled by O.E.M./To be filled by O.E.M. RIP: tasklet_hi_action+0xf0/0xf0 Process migration/1 Call Trace: __do_softirq+0x117/0x257 irq_exit+0x5f/0xbb smp_apic_timer_interrupt+0x8a/0x98 apic_timer_interrupt+0x72/0x80 printk+0x4d/0x4f stop_machine_cpu_stop+0x22c/0x274 cpu_stopper_thread+0xae/0x162 smpboot_thread_fn+0x258/0x260 kthread+0xc7/0xcf ret_from_fork+0x7c/0xb0 Signed-off-by: Ben Greear Acked-by: Tejun Heo Acked-by: Pekka Riikonen Cc: Eric Dumazet Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- kernel/softirq.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index b5197dcb0dad..3d6833f125d3 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -195,8 +195,12 @@ void local_bh_enable_ip(unsigned long ip) EXPORT_SYMBOL(local_bh_enable_ip); /* - * We restart softirq processing for at most 2 ms, - * and if need_resched() is not set. + * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, + * but break the loop if need_resched() is set or after 2 ms. + * The MAX_SOFTIRQ_TIME provides a nice upper bound in most cases, but in + * certain cases, such as stop_machine(), jiffies may cease to + * increment and so we need the MAX_SOFTIRQ_RESTART limit as + * well to make sure we eventually return from this method. * * These limits have been established via experimentation. * The two things to balance is latency against fairness - @@ -204,6 +208,7 @@ EXPORT_SYMBOL(local_bh_enable_ip); * should not be able to lock up the box. */ #define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) +#define MAX_SOFTIRQ_RESTART 10 asmlinkage void __do_softirq(void) { @@ -212,6 +217,7 @@ asmlinkage void __do_softirq(void) unsigned long end = jiffies + MAX_SOFTIRQ_TIME; int cpu; unsigned long old_flags = current->flags; + int max_restart = MAX_SOFTIRQ_RESTART; /* * Mask out PF_MEMALLOC s current task context is borrowed for the @@ -265,7 +271,8 @@ restart: pending = local_softirq_pending(); if (pending) { - if (time_before(jiffies, end) && !need_resched()) + if (time_before(jiffies, end) && !need_resched() && + --max_restart) goto restart; wakeup_softirqd(); -- cgit v1.2.3 From 58e8eedf18577c7eac722d5d1f190507ea263d1b Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Tue, 23 Apr 2013 10:32:39 +0900 Subject: tracing: Fix outputting formats of x86-tsc and counter when use trace_clock Outputting formats of x86-tsc and counter should be a raw format, but after applying the patch(2b6080f28c7cc3efc8625ab71495aae89aeb63a0), the format was changed to nanosec. This is because the global variable trace_clock_id was used. When we use multiple buffers, clock_id of each sub-buffer should be used. Then, this patch uses tr->clock_id instead of the global variable trace_clock_id. [ Basically, this fixes a regression where the multibuffer code changed the trace_clock file to update tr->clock_id but the traces still use the old global trace_clock_id variable, negating the file's effect. The global trace_clock_id variable is obsolete and removed. - SR ] Link: http://lkml.kernel.org/r/20130423013239.22334.7394.stgit@yunodevel Signed-off-by: Yoshihiro YUNOMAE Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 +++----- kernel/trace/trace.h | 2 -- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1a41023a1f88..e71a8be4a6ee 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -652,8 +652,6 @@ static struct { ARCH_TRACE_CLOCKS }; -int trace_clock_id; - /* * trace_parser_get_init - gets the buffer for trace parser */ @@ -2826,7 +2824,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) iter->iter_flags |= TRACE_FILE_ANNOTATE; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ - if (trace_clocks[trace_clock_id].in_ns) + if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; /* stop the trace while dumping if we are not opening "snapshot" */ @@ -3825,7 +3823,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->iter_flags |= TRACE_FILE_LAT_FMT; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ - if (trace_clocks[trace_clock_id].in_ns) + if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; iter->cpu_file = tc->cpu; @@ -5095,7 +5093,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "bytes: %ld\n", cnt); - if (trace_clocks[trace_clock_id].in_ns) { + if (trace_clocks[tr->clock_id].in_ns) { /* local or global for trace_clock */ t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); usec_rem = do_div(t, USEC_PER_SEC); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 711ca7d3e7f1..20572ed88c5c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -700,8 +700,6 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); extern unsigned long trace_flags; -extern int trace_clock_id; - /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -- cgit v1.2.3 From d7880812b3594d3c6dcbe3cfd71dabb17347d082 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jun 2013 16:52:03 +0200 Subject: idle: Add the stack canary init to cpu_startup_entry() Moving x86 to the generic idle implementation (commit 7d1a9417 "x86: Use generic idle loop") wreckaged the stack protector. I stupidly missed that boot_init_stack_canary() must be inlined from a function which never returns, but I put that call into arch_cpu_idle_prepare() which of course returns. I pondered to play tricks with arch_cpu_idle_prepare() first, but then I noticed, that the other archs which have implemented the stackprotector (ARM and SH) do not initialize the canary for the non-boot cpus. So I decided to move the boot_init_stack_canary() call into cpu_startup_entry() ifdeffed with an CONFIG_X86 for now. This #ifdef is just a temporary measure as I don't want to inflict the boot_init_stack_canary() call on ARM and SH that late in the cycle. I'll queue a patch for 3.11 which removes the #ifdef if the ARM/SH maintainers have no objection. Reported-by: Wouter van Kesteren Cc: x86@kernel.org Cc: Russell King Cc: Paul Mundt Signed-off-by: Thomas Gleixner --- kernel/cpu/idle.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index d5585f5e038e..bf2ee1aafa0e 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -112,6 +113,21 @@ static void cpu_idle_loop(void) void cpu_startup_entry(enum cpuhp_state state) { + /* + * This #ifdef needs to die, but it's too late in the cycle to + * make this generic (arm and sh have never invoked the canary + * init for the non boot cpus!). Will be fixed in 3.11 + */ +#ifdef CONFIG_X86 + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. The boot CPU already has it initialized but no harm + * in doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); +#endif current_set_polling(); arch_cpu_idle_prepare(); cpu_idle_loop(); -- cgit v1.2.3 From 16e53dbf10a2d7e228709a7286310e629ede5e45 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 12 Jun 2013 14:04:36 -0700 Subject: CPU hotplug: provide a generic helper to disable/enable CPU hotplug There are instances in the kernel where we would like to disable CPU hotplug (from sysfs) during some important operation. Today the freezer code depends on this and the code to do it was kinda tailor-made for that. Restructure the code and make it generic enough to be useful for other usecases too. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Robin Holt Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Shawn Guo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 55 +++++++++++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index b5e4ab2d427e..198a38883e64 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -133,6 +133,27 @@ static void cpu_hotplug_done(void) mutex_unlock(&cpu_hotplug.lock); } +/* + * Wait for currently running CPU hotplug operations to complete (if any) and + * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects + * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the + * hotplug path before performing hotplug operations. So acquiring that lock + * guarantees mutual exclusion from any currently running hotplug operations. + */ +void cpu_hotplug_disable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 1; + cpu_maps_update_done(); +} + +void cpu_hotplug_enable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 0; + cpu_maps_update_done(); +} + #else /* #if CONFIG_HOTPLUG_CPU */ static void cpu_hotplug_begin(void) {} static void cpu_hotplug_done(void) {} @@ -540,36 +561,6 @@ static int __init alloc_frozen_cpus(void) } core_initcall(alloc_frozen_cpus); -/* - * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU - * hotplug when tasks are about to be frozen. Also, don't allow the freezer - * to continue until any currently running CPU hotplug operation gets - * completed. - * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the - * 'cpu_add_remove_lock'. And this same lock is also taken by the regular - * CPU hotplug path and released only after it is complete. Thus, we - * (and hence the freezer) will block here until any currently running CPU - * hotplug operation gets completed. - */ -void cpu_hotplug_disable_before_freeze(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 1; - cpu_maps_update_done(); -} - - -/* - * When tasks have been thawed, re-enable regular CPU hotplug (which had been - * disabled while beginning to freeze tasks). - */ -void cpu_hotplug_enable_after_thaw(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; - cpu_maps_update_done(); -} - /* * When callbacks for CPU hotplug notifications are being executed, we must * ensure that the state of the system with respect to the tasks being frozen @@ -589,12 +580,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: - cpu_hotplug_disable_before_freeze(); + cpu_hotplug_disable(); break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: - cpu_hotplug_enable_after_thaw(); + cpu_hotplug_enable(); break; default: -- cgit v1.2.3 From cf7df378aa4ff7da3a44769b7ff6e9eef1a9f3db Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Wed, 12 Jun 2013 14:04:37 -0700 Subject: reboot: rigrate shutdown/reboot to boot cpu We recently noticed that reboot of a 1024 cpu machine takes approx 16 minutes of just stopping the cpus. The slowdown was tracked to commit f96972f2dc63 ("kernel/sys.c: call disable_nonboot_cpus() in kernel_restart()"). The current implementation does all the work of hot removing the cpus before halting the system. We are switching to just migrating to the boot cpu and then continuing with shutdown/reboot. This also has the effect of not breaking x86's command line parameter for specifying the reboot cpu. Note, this code was shamelessly copied from arch/x86/kernel/reboot.c with bits removed pertaining to the reboot_cpu command line parameter. Signed-off-by: Robin Holt Tested-by: Shawn Guo Cc: "Srivatsa S. Bhat" Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index b95d3c72ba21..2bbd9a73b54c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -362,6 +362,29 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +/* Add backwards compatibility for stable trees. */ +#ifndef PF_NO_SETAFFINITY +#define PF_NO_SETAFFINITY PF_THREAD_BOUND +#endif + +static void migrate_to_reboot_cpu(void) +{ + /* The boot cpu is always logical cpu 0 */ + int cpu = 0; + + cpu_hotplug_disable(); + + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_online(cpu)) + cpu = cpumask_first(cpu_online_mask); + + /* Prevent races with other tasks migrating this task */ + current->flags |= PF_NO_SETAFFINITY; + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +} + /** * kernel_restart - reboot the system * @cmd: pointer to buffer containing command to execute for restart @@ -373,7 +396,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); if (!cmd) printk(KERN_EMERG "Restarting system.\n"); @@ -400,7 +423,7 @@ static void kernel_shutdown_prepare(enum system_states state) void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); printk(KERN_EMERG "System halted.\n"); kmsg_dump(KMSG_DUMP_HALT); @@ -419,7 +442,7 @@ void kernel_power_off(void) kernel_shutdown_prepare(SYSTEM_POWER_OFF); if (pm_power_off_prepare) pm_power_off_prepare(); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); printk(KERN_EMERG "Power down.\n"); kmsg_dump(KMSG_DUMP_POWEROFF); -- cgit v1.2.3 From 637241a900cbd982f744d44646b48a273d609b34 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 Jun 2013 14:04:39 -0700 Subject: kmsg: honor dmesg_restrict sysctl on /dev/kmsg The dmesg_restrict sysctl currently covers the syslog method for access dmesg, however /dev/kmsg isn't covered by the same protections. Most people haven't noticed because util-linux dmesg(1) defaults to using the syslog method for access in older versions. With util-linux dmesg(1) defaults to reading directly from /dev/kmsg. To fix /dev/kmsg, let's compare the existing interfaces and what they allow: - /proc/kmsg allows: - open (SYSLOG_ACTION_OPEN) if CAP_SYSLOG since it uses a destructive single-reader interface (SYSLOG_ACTION_READ). - everything, after an open. - syslog syscall allows: - anything, if CAP_SYSLOG. - SYSLOG_ACTION_READ_ALL and SYSLOG_ACTION_SIZE_BUFFER, if dmesg_restrict==0. - nothing else (EPERM). The use-cases were: - dmesg(1) needs to do non-destructive SYSLOG_ACTION_READ_ALLs. - sysklog(1) needs to open /proc/kmsg, drop privs, and still issue the destructive SYSLOG_ACTION_READs. AIUI, dmesg(1) is moving to /dev/kmsg, and systemd-journald doesn't clear the ring buffer. Based on the comments in devkmsg_llseek, it sounds like actions besides reading aren't going to be supported by /dev/kmsg (i.e. SYSLOG_ACTION_CLEAR), so we have a strict subset of the non-destructive syslog syscall actions. To this end, move the check as Josh had done, but also rename the constants to reflect their new uses (SYSLOG_FROM_CALL becomes SYSLOG_FROM_READER, and SYSLOG_FROM_FILE becomes SYSLOG_FROM_PROC). SYSLOG_FROM_READER allows non-destructive actions, and SYSLOG_FROM_PROC allows destructive actions after a capabilities-constrained SYSLOG_ACTION_OPEN check. - /dev/kmsg allows: - open if CAP_SYSLOG or dmesg_restrict==0 - reading/polling, after open Addresses https://bugzilla.redhat.com/show_bug.cgi?id=903192 [akpm@linux-foundation.org: use pr_warn_once()] Signed-off-by: Kees Cook Reported-by: Christian Kujau Tested-by: Josh Boyer Cc: Kay Sievers Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 91 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index fa36e1494420..8212c1aef125 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -363,6 +363,53 @@ static void log_store(int facility, int level, log_next_seq++; } +#ifdef CONFIG_SECURITY_DMESG_RESTRICT +int dmesg_restrict = 1; +#else +int dmesg_restrict; +#endif + +static int syslog_action_restricted(int type) +{ + if (dmesg_restrict) + return 1; + /* + * Unless restricted, we allow "read all" and "get buffer size" + * for everybody. + */ + return type != SYSLOG_ACTION_READ_ALL && + type != SYSLOG_ACTION_SIZE_BUFFER; +} + +static int check_syslog_permissions(int type, bool from_file) +{ + /* + * If this is from /proc/kmsg and we've already opened it, then we've + * already done the capabilities checks at open time. + */ + if (from_file && type != SYSLOG_ACTION_OPEN) + return 0; + + if (syslog_action_restricted(type)) { + if (capable(CAP_SYSLOG)) + return 0; + /* + * For historical reasons, accept CAP_SYS_ADMIN too, with + * a warning. + */ + if (capable(CAP_SYS_ADMIN)) { + pr_warn_once("%s (%d): Attempt to access syslog with " + "CAP_SYS_ADMIN but no CAP_SYSLOG " + "(deprecated).\n", + current->comm, task_pid_nr(current)); + return 0; + } + return -EPERM; + } + return security_syslog(type); +} + + /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { u64 seq; @@ -620,7 +667,8 @@ static int devkmsg_open(struct inode *inode, struct file *file) if ((file->f_flags & O_ACCMODE) == O_WRONLY) return 0; - err = security_syslog(SYSLOG_ACTION_READ_ALL); + err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, + SYSLOG_FROM_READER); if (err) return err; @@ -813,45 +861,6 @@ static inline void boot_delay_msec(int level) } #endif -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif - -static int syslog_action_restricted(int type) -{ - if (dmesg_restrict) - return 1; - /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ - return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; -} - -static int check_syslog_permissions(int type, bool from_file) -{ - /* - * If this is from /proc/kmsg and we've already opened it, then we've - * already done the capabilities checks at open time. - */ - if (from_file && type != SYSLOG_ACTION_OPEN) - return 0; - - if (syslog_action_restricted(type)) { - if (capable(CAP_SYSLOG)) - return 0; - /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ - if (capable(CAP_SYS_ADMIN)) { - printk_once(KERN_WARNING "%s (%d): " - "Attempt to access syslog with CAP_SYS_ADMIN " - "but no CAP_SYSLOG (deprecated).\n", - current->comm, task_pid_nr(current)); - return 0; - } - return -EPERM; - } - return 0; -} - #if defined(CONFIG_PRINTK_TIME) static bool printk_time = 1; #else @@ -1249,7 +1258,7 @@ out: SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { - return do_syslog(type, buf, len, SYSLOG_FROM_CALL); + return do_syslog(type, buf, len, SYSLOG_FROM_READER); } /* -- cgit v1.2.3 From f000cfdde5de4fc15dead5ccf524359c07eadf2b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 12 Jun 2013 14:04:46 -0700 Subject: audit: wait_for_auditd() should use TASK_UNINTERRUPTIBLE audit_log_start() does wait_for_auditd() in a loop until audit_backlog_wait_time passes or audit_skb_queue has a room. If signal_pending() is true this becomes a busy-wait loop, schedule() in TASK_INTERRUPTIBLE won't block. Thanks to Guy for fully investigating and explaining the problem. (akpm: that'll cause the system to lock up on a non-preemptible uniprocessor kernel) (Guy: "Our customer was in fact running a uniprocessor machine, and they reported a system hang.") Signed-off-by: Oleg Nesterov Reported-by: Guy Streeter Cc: Eric Paris Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 21c7fa615bd3..91e53d04b6a9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1056,7 +1056,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, static void wait_for_auditd(unsigned long sleep_time) { DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(&audit_backlog_wait, &wait); if (audit_backlog_limit && -- cgit v1.2.3 From 736f3203a06eafd0944103775a98584082744c6b Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Wed, 12 Jun 2013 14:05:07 -0700 Subject: kernel/audit_tree.c:audit_add_tree_rule(): protect `rule' from kill_rules() audit_add_tree_rule() must set 'rule->tree = NULL;' firstly, to protect the rule itself freed in kill_rules(). The reason is when it is killed, the 'rule' itself may have already released, we should not access it. one example: we add a rule to an inode, just at the same time the other task is deleting this inode. The work flow for adding a rule: audit_receive() -> (need audit_cmd_mutex lock) audit_receive_skb() -> audit_receive_msg() -> audit_receive_filter() -> audit_add_rule() -> audit_add_tree_rule() -> (need audit_filter_mutex lock) ... unlock audit_filter_mutex get_tree() ... iterate_mounts() -> (iterate all related inodes) tag_mount() -> tag_trunk() -> create_trunk() -> (assume it is 1st rule) fsnotify_add_mark() -> fsnotify_add_inode_mark() -> (add mark to inode->i_fsnotify_marks) ... get_tree(); (each inode will get one) ... lock audit_filter_mutex The work flow for deleting an inode: __destroy_inode() -> fsnotify_inode_delete() -> __fsnotify_inode_delete() -> fsnotify_clear_marks_by_inode() -> (get mark from inode->i_fsnotify_marks) fsnotify_destroy_mark() -> fsnotify_destroy_mark_locked() -> audit_tree_freeing_mark() -> evict_chunk() -> ... tree->goner = 1 ... kill_rules() -> (assume current->audit_context == NULL) call_rcu() -> (rule->tree != NULL) audit_free_rule_rcu() -> audit_free_rule() ... audit_schedule_prune() -> (assume current->audit_context == NULL) kthread_run() -> (need audit_cmd_mutex and audit_filter_mutex lock) prune_one() -> (delete it from prue_list) put_tree(); (match the original get_tree above) Signed-off-by: Chen Gang Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit_tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index a291aa23fb3f..43c307dc9453 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -658,6 +658,7 @@ int audit_add_tree_rule(struct audit_krule *rule) struct vfsmount *mnt; int err; + rule->tree = NULL; list_for_each_entry(tree, &tree_list, list) { if (!strcmp(seed->pathname, tree->pathname)) { put_tree(seed); -- cgit v1.2.3 From 29ce3785b22da47c49f4ef6e14b9014fa5dee261 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Wed, 8 May 2013 14:05:34 -0700 Subject: idle: Enable interrupts in the weak arch_cpu_idle() implementation PARISC bootup triggers the warning at kernel/cpu/idle.c:96. That's caused by the weak arch_cpu_idle() implementation, which is provided to avoid that architectures implement idle_poll over and over. The switchover to polling mode happens in the first call of the weak arch_cpu_idle() implementation, but that code fails to reenable interrupts and therefor triggers the warning. Fix this by enabling interrupts in the weak arch_cpu_idle() code. [ tglx: Made the changelog match the patch ] Signed-off-by: James Bottomley Reviewed-by: Srivatsa S. Bhat Link: http://lkml.kernel.org/r/1371236142.2726.43.camel@dabdike Signed-off-by: Thomas Gleixner --- kernel/cpu/idle.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index bf2ee1aafa0e..e695c0a0bcb5 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -59,6 +59,7 @@ void __weak arch_cpu_idle_dead(void) { } void __weak arch_cpu_idle(void) { cpu_idle_force_poll = 1; + local_irq_enable(); } /* -- cgit v1.2.3 From 8aac62706adaaf0fab02c4327761561c8bda9448 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 14 Jun 2013 21:09:49 +0200 Subject: move exit_task_namespaces() outside of exit_notify() exit_notify() does exit_task_namespaces() after forget_original_parent(). This was needed to ensure that ->nsproxy can't be cleared prematurely, an exiting child we are going to reparent can do do_notify_parent() and use the parent's (ours) pid_ns. However, after 32084504 "pidns: use task_active_pid_ns in do_notify_parent" ->nsproxy != NULL is no longer needed, we rely on task_active_pid_ns(). Move exit_task_namespaces() from exit_notify() to do_exit(), after exit_fs() and before exit_task_work(). This solves the problem reported by Andrey, free_ipc_ns()->shm_destroy() does fput() which needs task_work_add(). Note: this particular problem can be fixed if we change fput(), and that change makes sense anyway. But there is another reason to move the callsite. The original reason for exit_task_namespaces() from the middle of exit_notify() was subtle and it has already gone away, now this looks confusing. And this allows us do simplify exit_notify(), we can avoid unlock/lock(tasklist) and we can use ->exit_state instead of PF_EXITING in forget_original_parent(). Reported-by: Andrey Vagin Signed-off-by: Oleg Nesterov Acked-by: "Eric W. Biederman" Acked-by: Andrey Vagin Signed-off-by: Al Viro --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index af2eb3cbd499..7bb73f9d09db 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -649,7 +649,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ forget_original_parent(tsk); - exit_task_namespaces(tsk); write_lock_irq(&tasklist_lock); if (group_dead) @@ -795,6 +794,7 @@ void do_exit(long code) exit_shm(tsk); exit_files(tsk); exit_fs(tsk); + exit_task_namespaces(tsk); exit_task_work(tsk); check_stack_usage(); exit_thread(); -- cgit v1.2.3 From 0541881502a1276149889fe468662ff6a8fc8f6d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 13 Jun 2013 13:17:02 -0700 Subject: range: Do not add new blank slot with add_range_with_merge Joshua reported: Commit cd7b304dfaf1 (x86, range: fix missing merge during add range) broke mtrr cleanup on his setup in 3.9.5. corresponding commit in upstream is fbe06b7bae7c. The reason is add_range_with_merge could generate blank spot. We could avoid that by searching new expanded start/end, that new range should include all connected ranges in range array. At last add the new expanded start/end to the range array. Also move up left array so do not add new blank slot in the range array. -v2: move left array to avoid enhance add_range() -v3: include fix from Joshua about memmove declaring when DYN_DEBUG is used. Reported-by: Joshua Covington Tested-by: Joshua Covington Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1371154622-8929-3-git-send-email-yinghai@kernel.org Cc: v3.9 Signed-off-by: H. Peter Anvin --- kernel/range.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/range.c b/kernel/range.c index eb911dbce267..322ea8e93e4b 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -4,7 +4,7 @@ #include #include #include - +#include #include int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) @@ -32,9 +32,8 @@ int add_range_with_merge(struct range *range, int az, int nr_range, if (start >= end) return nr_range; - /* Try to merge it with old one: */ + /* get new start/end: */ for (i = 0; i < nr_range; i++) { - u64 final_start, final_end; u64 common_start, common_end; if (!range[i].end) @@ -45,14 +44,16 @@ int add_range_with_merge(struct range *range, int az, int nr_range, if (common_start > common_end) continue; - final_start = min(range[i].start, start); - final_end = max(range[i].end, end); + /* new start/end, will add it back at last */ + start = min(range[i].start, start); + end = max(range[i].end, end); - /* clear it and add it back for further merge */ - range[i].start = 0; - range[i].end = 0; - return add_range_with_merge(range, az, nr_range, - final_start, final_end); + memmove(&range[i], &range[i + 1], + (nr_range - (i + 1)) * sizeof(range[i])); + range[nr_range - 1].start = 0; + range[nr_range - 1].end = 0; + nr_range--; + i--; } /* Need to add it: */ -- cgit v1.2.3 From 9bb5d40cd93c9dd4be74834b1dcb1ba03629716b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Jun 2013 10:44:21 +0200 Subject: perf: Fix mmap() accounting hole Vince's fuzzer once again found holes. This time it spotted a leak in the locked page accounting. When an event had redirected output and its close() was the last reference to the buffer we didn't have a vm context to undo accounting. Change the code to destroy the buffer on the last munmap() and detach all redirected events at that time. This provides us the right context to undo the vm accounting. Reported-and-tested-by: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130604084421.GI8923@twins.programming.kicks-ass.net Cc: Signed-off-by: Ingo Molnar --- kernel/events/core.c | 228 ++++++++++++++++++++++++++++++++--------------- kernel/events/internal.h | 3 +- 2 files changed, 159 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ae752cd4a086..b391907d5352 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -196,9 +196,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); -static void ring_buffer_attach(struct perf_event *event, - struct ring_buffer *rb); - void __weak perf_event_print_debug(void) { } extern __weak const char *perf_pmu_name(void) @@ -2917,7 +2914,8 @@ static void free_event_rcu(struct rcu_head *head) kfree(event); } -static bool ring_buffer_put(struct ring_buffer *rb); +static void ring_buffer_put(struct ring_buffer *rb); +static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); static void free_event(struct perf_event *event) { @@ -2942,15 +2940,30 @@ static void free_event(struct perf_event *event) if (has_branch_stack(event)) { static_key_slow_dec_deferred(&perf_sched_events); /* is system-wide event */ - if (!(event->attach_state & PERF_ATTACH_TASK)) + if (!(event->attach_state & PERF_ATTACH_TASK)) { atomic_dec(&per_cpu(perf_branch_stack_events, event->cpu)); + } } } if (event->rb) { - ring_buffer_put(event->rb); - event->rb = NULL; + struct ring_buffer *rb; + + /* + * Can happen when we close an event with re-directed output. + * + * Since we have a 0 refcount, perf_mmap_close() will skip + * over us; possibly making our ring_buffer_put() the last. + */ + mutex_lock(&event->mmap_mutex); + rb = event->rb; + if (rb) { + rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); + ring_buffer_put(rb); /* could be last */ + } + mutex_unlock(&event->mmap_mutex); } if (is_cgroup_event(event)) @@ -3188,30 +3201,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) unsigned int events = POLL_HUP; /* - * Race between perf_event_set_output() and perf_poll(): perf_poll() - * grabs the rb reference but perf_event_set_output() overrides it. - * Here is the timeline for two threads T1, T2: - * t0: T1, rb = rcu_dereference(event->rb) - * t1: T2, old_rb = event->rb - * t2: T2, event->rb = new rb - * t3: T2, ring_buffer_detach(old_rb) - * t4: T1, ring_buffer_attach(rb1) - * t5: T1, poll_wait(event->waitq) - * - * To avoid this problem, we grab mmap_mutex in perf_poll() - * thereby ensuring that the assignment of the new ring buffer - * and the detachment of the old buffer appear atomic to perf_poll() + * Pin the event->rb by taking event->mmap_mutex; otherwise + * perf_event_set_output() can swizzle our rb and make us miss wakeups. */ mutex_lock(&event->mmap_mutex); - - rcu_read_lock(); - rb = rcu_dereference(event->rb); - if (rb) { - ring_buffer_attach(event, rb); + rb = event->rb; + if (rb) events = atomic_xchg(&rb->poll, 0); - } - rcu_read_unlock(); - mutex_unlock(&event->mmap_mutex); poll_wait(file, &event->waitq, wait); @@ -3521,16 +3517,12 @@ static void ring_buffer_attach(struct perf_event *event, return; spin_lock_irqsave(&rb->event_lock, flags); - if (!list_empty(&event->rb_entry)) - goto unlock; - - list_add(&event->rb_entry, &rb->event_list); -unlock: + if (list_empty(&event->rb_entry)) + list_add(&event->rb_entry, &rb->event_list); spin_unlock_irqrestore(&rb->event_lock, flags); } -static void ring_buffer_detach(struct perf_event *event, - struct ring_buffer *rb) +static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) { unsigned long flags; @@ -3549,13 +3541,10 @@ static void ring_buffer_wakeup(struct perf_event *event) rcu_read_lock(); rb = rcu_dereference(event->rb); - if (!rb) - goto unlock; - - list_for_each_entry_rcu(event, &rb->event_list, rb_entry) - wake_up_all(&event->waitq); - -unlock: + if (rb) { + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) + wake_up_all(&event->waitq); + } rcu_read_unlock(); } @@ -3582,23 +3571,14 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) return rb; } -static bool ring_buffer_put(struct ring_buffer *rb) +static void ring_buffer_put(struct ring_buffer *rb) { - struct perf_event *event, *n; - unsigned long flags; - if (!atomic_dec_and_test(&rb->refcount)) - return false; + return; - spin_lock_irqsave(&rb->event_lock, flags); - list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { - list_del_init(&event->rb_entry); - wake_up_all(&event->waitq); - } - spin_unlock_irqrestore(&rb->event_lock, flags); + WARN_ON_ONCE(!list_empty(&rb->event_list)); call_rcu(&rb->rcu_head, rb_free_rcu); - return true; } static void perf_mmap_open(struct vm_area_struct *vma) @@ -3606,28 +3586,100 @@ static void perf_mmap_open(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; atomic_inc(&event->mmap_count); + atomic_inc(&event->rb->mmap_count); } +/* + * A buffer can be mmap()ed multiple times; either directly through the same + * event, or through other events by use of perf_event_set_output(). + * + * In order to undo the VM accounting done by perf_mmap() we need to destroy + * the buffer here, where we still have a VM context. This means we need + * to detach all events redirecting to us. + */ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - struct ring_buffer *rb = event->rb; - struct user_struct *mmap_user = rb->mmap_user; - int mmap_locked = rb->mmap_locked; - unsigned long size = perf_data_size(rb); + struct ring_buffer *rb = event->rb; + struct user_struct *mmap_user = rb->mmap_user; + int mmap_locked = rb->mmap_locked; + unsigned long size = perf_data_size(rb); - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); - mutex_unlock(&event->mmap_mutex); + atomic_dec(&rb->mmap_count); + + if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) + return; + + /* Detach current event from the buffer. */ + rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); + mutex_unlock(&event->mmap_mutex); + + /* If there's still other mmap()s of this buffer, we're done. */ + if (atomic_read(&rb->mmap_count)) { + ring_buffer_put(rb); /* can't be last */ + return; + } - if (ring_buffer_put(rb)) { - atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= mmap_locked; - free_uid(mmap_user); + /* + * No other mmap()s, detach from all other events that might redirect + * into the now unreachable buffer. Somewhat complicated by the + * fact that rb::event_lock otherwise nests inside mmap_mutex. + */ +again: + rcu_read_lock(); + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { + if (!atomic_long_inc_not_zero(&event->refcount)) { + /* + * This event is en-route to free_event() which will + * detach it and remove it from the list. + */ + continue; } + rcu_read_unlock(); + + mutex_lock(&event->mmap_mutex); + /* + * Check we didn't race with perf_event_set_output() which can + * swizzle the rb from under us while we were waiting to + * acquire mmap_mutex. + * + * If we find a different rb; ignore this event, a next + * iteration will no longer find it on the list. We have to + * still restart the iteration to make sure we're not now + * iterating the wrong list. + */ + if (event->rb == rb) { + rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); + ring_buffer_put(rb); /* can't be last, we still have one */ + } + mutex_unlock(&event->mmap_mutex); + put_event(event); + + /* + * Restart the iteration; either we're on the wrong list or + * destroyed its integrity by doing a deletion. + */ + goto again; } + rcu_read_unlock(); + + /* + * It could be there's still a few 0-ref events on the list; they'll + * get cleaned up by free_event() -- they'll also still have their + * ref on the rb and will free it whenever they are done with it. + * + * Aside from that, this buffer is 'fully' detached and unmapped, + * undo the VM accounting. + */ + + atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); + vma->vm_mm->pinned_vm -= mmap_locked; + free_uid(mmap_user); + + ring_buffer_put(rb); /* could be last */ } static const struct vm_operations_struct perf_mmap_vmops = { @@ -3677,10 +3729,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; WARN_ON_ONCE(event->ctx->parent_ctx); +again: mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages != nr_pages) + if (event->rb->nr_pages != nr_pages) { ret = -EINVAL; + goto unlock; + } + + if (!atomic_inc_not_zero(&event->rb->mmap_count)) { + /* + * Raced against perf_mmap_close() through + * perf_event_set_output(). Try again, hope for better + * luck. + */ + mutex_unlock(&event->mmap_mutex); + goto again; + } + goto unlock; } @@ -3722,12 +3788,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } + atomic_set(&rb->mmap_count, 1); rb->mmap_locked = extra; rb->mmap_user = get_current_user(); atomic_long_add(user_extra, &user->locked_vm); vma->vm_mm->pinned_vm += extra; + ring_buffer_attach(event, rb); rcu_assign_pointer(event->rb, rb); perf_event_update_userpage(event); @@ -3737,6 +3805,10 @@ unlock: atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); + /* + * Since pinned accounting is per vm we cannot allow fork() to copy our + * vma. + */ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; @@ -6415,6 +6487,8 @@ set: if (atomic_read(&event->mmap_count)) goto unlock; + old_rb = event->rb; + if (output_event) { /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); @@ -6422,16 +6496,28 @@ set: goto unlock; } - old_rb = event->rb; - rcu_assign_pointer(event->rb, rb); if (old_rb) ring_buffer_detach(event, old_rb); + + if (rb) + ring_buffer_attach(event, rb); + + rcu_assign_pointer(event->rb, rb); + + if (old_rb) { + ring_buffer_put(old_rb); + /* + * Since we detached before setting the new rb, so that we + * could attach the new rb, we could have missed a wakeup. + * Provide it now. + */ + wake_up_all(&event->waitq); + } + ret = 0; unlock: mutex_unlock(&event->mmap_mutex); - if (old_rb) - ring_buffer_put(old_rb); out: return ret; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 5bc6c8e9b851..ca6599723be5 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -31,7 +31,8 @@ struct ring_buffer { spinlock_t event_lock; struct list_head event_list; - int mmap_locked; + atomic_t mmap_count; + unsigned long mmap_locked; struct user_struct *mmap_user; struct perf_event_mmap_page *user_page; -- cgit v1.2.3 From 873b4c65b519fd769940eb281f77848227d4e5c1 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 5 Jun 2013 10:13:11 +0200 Subject: sched: Fix clear NOHZ_BALANCE_KICK I have faced a sequence where the Idle Load Balance was sometime not triggered for a while on my platform, in the following scenario: CPU 0 and CPU 1 are running tasks and CPU 2 is idle CPU 1 kicks the Idle Load Balance CPU 1 selects CPU 2 as the new Idle Load Balancer CPU 2 sets NOHZ_BALANCE_KICK for CPU 2 CPU 2 sends a reschedule IPI to CPU 2 While CPU 3 wakes up, CPU 0 or CPU 1 migrates a waking up task A on CPU 2 CPU 2 finally wakes up, runs task A and discards the Idle Load Balance task A quickly goes back to sleep (before a tick occurs on CPU 2) CPU 2 goes back to idle with NOHZ_BALANCE_KICK set Whenever CPU 2 will be selected as the ILB, no reschedule IPI will be sent because NOHZ_BALANCE_KICK is already set and no Idle Load Balance will be performed. We must wait for the sched softirq to be raised on CPU 2 thanks to another part the kernel to come back to clear NOHZ_BALANCE_KICK. The proposed solution clears NOHZ_BALANCE_KICK in schedule_ipi if we can't raise the sched_softirq for the Idle Load Balance. Change since V1: - move the clear of NOHZ_BALANCE_KICK in got_nohz_idle_kick if the ILB can't run on this CPU (as suggested by Peter) Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1370419991-13870-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58453b8272fd..919bee68032b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -633,7 +633,19 @@ void wake_up_nohz_cpu(int cpu) static inline bool got_nohz_idle_kick(void) { int cpu = smp_processor_id(); - return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + + if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) + return false; + + if (idle_cpu(cpu) && !need_resched()) + return true; + + /* + * We can't run Idle Load Balance on this CPU for this time so we + * cancel it and clear NOHZ_BALANCE_KICK + */ + clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + return false; } #else /* CONFIG_NO_HZ_COMMON */ @@ -1393,8 +1405,9 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() - && !tick_nohz_full_cpu(smp_processor_id())) + if (llist_empty(&this_rq()->wake_list) + && !tick_nohz_full_cpu(smp_processor_id()) + && !got_nohz_idle_kick()) return; /* @@ -1417,7 +1430,7 @@ void scheduler_ipi(void) /* * Check if someone kicked us for doing the nohz idle load balance. */ - if (unlikely(got_nohz_idle_kick() && !need_resched())) { + if (unlikely(got_nohz_idle_kick())) { this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); } -- cgit v1.2.3 From 29bb9e5a75684106a37593ad75ec75ff8312731b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 May 2013 15:23:40 -0400 Subject: tracing/context-tracking: Add preempt_schedule_context() for tracing Dave Jones hit the following bug report: =============================== [ INFO: suspicious RCU usage. ] 3.10.0-rc2+ #1 Not tainted ------------------------------- include/linux/rcupdate.h:771 rcu_read_lock() used illegally while idle! other info that might help us debug this: RCU used illegally from idle CPU! rcu_scheduler_active = 1, debug_locks = 0 RCU used illegally from extended quiescent state! 2 locks held by cc1/63645: #0: (&rq->lock){-.-.-.}, at: [] __schedule+0xed/0x9b0 #1: (rcu_read_lock){.+.+..}, at: [] cpuacct_charge+0x5/0x1f0 CPU: 1 PID: 63645 Comm: cc1 Not tainted 3.10.0-rc2+ #1 [loadavg: 40.57 27.55 13.39 25/277 64369] Hardware name: Gigabyte Technology Co., Ltd. GA-MA78GM-S2H/GA-MA78GM-S2H, BIOS F12a 04/23/2010 0000000000000000 ffff88010f78fcf8 ffffffff816ae383 ffff88010f78fd28 ffffffff810b698d ffff88011c092548 000000000023d073 ffff88011c092500 0000000000000001 ffff88010f78fd60 ffffffff8109d7c5 ffffffff8109d645 Call Trace: [] dump_stack+0x19/0x1b [] lockdep_rcu_suspicious+0xfd/0x130 [] cpuacct_charge+0x185/0x1f0 [] ? cpuacct_charge+0x5/0x1f0 [] update_curr+0xec/0x240 [] put_prev_task_fair+0x228/0x480 [] __schedule+0x161/0x9b0 [] preempt_schedule+0x51/0x80 [] ? __cond_resched_softirq+0x60/0x60 [] ? retint_careful+0x12/0x2e [] ftrace_ops_control_func+0x1dc/0x210 [] ftrace_call+0x5/0x2f [] ? retint_careful+0xb/0x2e [] ? schedule_user+0x5/0x70 [] ? schedule_user+0x5/0x70 [] ? retint_careful+0x12/0x2e ------------[ cut here ]------------ What happened was that the function tracer traced the schedule_user() code that tells RCU that the system is coming back from userspace, and to add the CPU back to the RCU monitoring. Because the function tracer does a preempt_disable/enable_notrace() calls the preempt_enable_notrace() checks the NEED_RESCHED flag. If it is set, then preempt_schedule() is called. But this is called before the user_exit() function can inform the kernel that the CPU is no longer in user mode and needs to be accounted for by RCU. The fix is to create a new preempt_schedule_context() that checks if the kernel is still in user mode and if so to switch it to kernel mode before calling schedule. It also switches back to user mode coming back from schedule in need be. The only user of this currently is the preempt_enable_notrace(), which is only used by the tracing subsystem. Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1369423420.6828.226.camel@gandalf.local.home Signed-off-by: Ingo Molnar --- kernel/context_tracking.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 65349f07b878..66677003e223 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -71,6 +71,46 @@ void user_enter(void) local_irq_restore(flags); } +#ifdef CONFIG_PREEMPT +/** + * preempt_schedule_context - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +void __sched notrace preempt_schedule_context(void) +{ + struct thread_info *ti = current_thread_info(); + enum ctx_state prev_ctx; + + if (likely(ti->preempt_count || irqs_disabled())) + return; + + /* + * Need to disable preemption in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. + */ + preempt_disable_notrace(); + prev_ctx = exception_enter(); + preempt_enable_no_resched_notrace(); + + preempt_schedule(); + + preempt_disable_notrace(); + exception_exit(prev_ctx); + preempt_enable_notrace(); +} +EXPORT_SYMBOL_GPL(preempt_schedule_context); +#endif /* CONFIG_PREEMPT */ /** * user_exit - Inform the context tracking that the CPU is -- cgit v1.2.3