From fbd44a607a1a5019bc32c3615cead8c5ee8f89c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 May 2013 20:22:36 +0200 Subject: tick: Use zalloc_cpumask_var for allocating offstack cpumasks commit b352bc1cbc (tick: Convert broadcast cpu bitmaps to cpumask_var_t) broke CONFIG_CPUMASK_OFFSTACK in a very subtle way. Instead of allocating the cpumasks with zalloc_cpumask_var it uses alloc_cpumask_var, so we can get random data there, which of course confuses the logic completely and causes random failures. Reported-and-tested-by: Dave Jones Reported-and-tested-by: Yinghai Lu Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305032015060.2990@ionos Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 61d00a8cdf2f..d70cdc42c829 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -785,11 +785,11 @@ bool tick_broadcast_oneshot_available(void) void __init tick_broadcast_init(void) { - alloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); - alloc_cpumask_var(&tmpmask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); #ifdef CONFIG_TICK_ONESHOT - alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); - alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); - alloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); #endif } -- cgit v1.2.3 From 524eff183f51d080a83b348d0ea97c08b3607b9a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 6 May 2013 18:27:17 +0200 Subject: perf: Fix EXIT event notification The perf_event_task_ctx() function needs to be called with preemption disabled, since it's checking for currently scheduled cpu against event cpu. We disable preemption for task related perf event context if there's one defined, leaving up to the chance which cpu it gets scheduled in. Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Stephane Eranian Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1367857638-27631-2-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6b41c1899a8b..38b68a05c3c6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4474,7 +4474,7 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, static void perf_event_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; + struct perf_event_context *ctx, *task_ctx = task_event->task_ctx; struct pmu *pmu; int ctxn; @@ -4485,20 +4485,22 @@ static void perf_event_task_event(struct perf_task_event *task_event) goto next; perf_event_task_ctx(&cpuctx->ctx, task_event); - ctx = task_event->task_ctx; - if (!ctx) { - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_event_task_ctx(ctx, task_event); - } + if (task_ctx) + goto next; + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_task_ctx(ctx, task_event); next: put_cpu_ptr(pmu->pmu_cpu_context); } - if (task_event->task_ctx) - perf_event_task_ctx(task_event->task_ctx, task_event); + if (task_ctx) { + preempt_disable(); + perf_event_task_ctx(task_ctx, task_event); + preempt_enable(); + } rcu_read_unlock(); } -- cgit v1.2.3 From 52d857a8784a09576215c71cebf368d61c12a754 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 6 May 2013 18:27:18 +0200 Subject: perf: Factor out auxiliary events notification Add perf_event_aux() function to send out all types of auxiliary events - mmap, task, comm events. For each type there's match and output functions defined and used as callbacks during perf_event_aux processing. This way we can centralize the pmu/context iterating and event matching logic. Also since lot of the code was duplicated, this patch reduces the .text size about 2kB on my setup: snipped output from 'objdump -x kernel/events/core.o' before: Idx Name Size 0 .text 0000d313 after: Idx Name Size 0 .text 0000cad3 Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1367857638-27631-3-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 242 +++++++++++++++++++-------------------------------- 1 file changed, 89 insertions(+), 153 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 38b68a05c3c6..9dc297faf7c0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4394,6 +4394,64 @@ perf_event_read_event(struct perf_event *event, perf_output_end(&handle); } +typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data); +typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); + +static void +perf_event_aux_ctx(struct perf_event_context *ctx, + perf_event_aux_match_cb match, + perf_event_aux_output_cb output, + void *data) +{ + struct perf_event *event; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + continue; + if (!event_filter_match(event)) + continue; + if (match(event, data)) + output(event, data); + } +} + +static void +perf_event_aux(perf_event_aux_match_cb match, + perf_event_aux_output_cb output, + void *data, + struct perf_event_context *task_ctx) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct pmu *pmu; + int ctxn; + + rcu_read_lock(); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->unique_pmu != pmu) + goto next; + perf_event_aux_ctx(&cpuctx->ctx, match, output, data); + if (task_ctx) + goto next; + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_aux_ctx(ctx, match, output, data); +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } + + if (task_ctx) { + preempt_disable(); + perf_event_aux_ctx(task_ctx, match, output, data); + preempt_enable(); + } + rcu_read_unlock(); +} + /* * task tracking -- fork/exit * @@ -4416,8 +4474,9 @@ struct perf_task_event { }; static void perf_event_task_output(struct perf_event *event, - struct perf_task_event *task_event) + void *data) { + struct perf_task_event *task_event = data; struct perf_output_handle handle; struct perf_sample_data sample; struct task_struct *task = task_event->task; @@ -4445,64 +4504,11 @@ out: task_event->event_id.header.size = size; } -static int perf_event_task_match(struct perf_event *event) -{ - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if (event->attr.comm || event->attr.mmap || - event->attr.mmap_data || event->attr.task) - return 1; - - return 0; -} - -static void perf_event_task_ctx(struct perf_event_context *ctx, - struct perf_task_event *task_event) -{ - struct perf_event *event; - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_task_match(event)) - perf_event_task_output(event, task_event); - } -} - -static void perf_event_task_event(struct perf_task_event *task_event) +static int perf_event_task_match(struct perf_event *event, + void *data __maybe_unused) { - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx, *task_ctx = task_event->task_ctx; - struct pmu *pmu; - int ctxn; - - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_task_ctx(&cpuctx->ctx, task_event); - - if (task_ctx) - goto next; - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_event_task_ctx(ctx, task_event); -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - if (task_ctx) { - preempt_disable(); - perf_event_task_ctx(task_ctx, task_event); - preempt_enable(); - } - - rcu_read_unlock(); + return event->attr.comm || event->attr.mmap || + event->attr.mmap_data || event->attr.task; } static void perf_event_task(struct task_struct *task, @@ -4533,7 +4539,10 @@ static void perf_event_task(struct task_struct *task, }, }; - perf_event_task_event(&task_event); + perf_event_aux(perf_event_task_match, + perf_event_task_output, + &task_event, + task_ctx); } void perf_event_fork(struct task_struct *task) @@ -4559,8 +4568,9 @@ struct perf_comm_event { }; static void perf_event_comm_output(struct perf_event *event, - struct perf_comm_event *comm_event) + void *data) { + struct perf_comm_event *comm_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = comm_event->event_id.header.size; @@ -4587,39 +4597,16 @@ out: comm_event->event_id.header.size = size; } -static int perf_event_comm_match(struct perf_event *event) -{ - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if (event->attr.comm) - return 1; - - return 0; -} - -static void perf_event_comm_ctx(struct perf_event_context *ctx, - struct perf_comm_event *comm_event) +static int perf_event_comm_match(struct perf_event *event, + void *data __maybe_unused) { - struct perf_event *event; - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_comm_match(event)) - perf_event_comm_output(event, comm_event); - } + return event->attr.comm; } static void perf_event_comm_event(struct perf_comm_event *comm_event) { - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; char comm[TASK_COMM_LEN]; unsigned int size; - struct pmu *pmu; - int ctxn; memset(comm, 0, sizeof(comm)); strlcpy(comm, comm_event->task->comm, sizeof(comm)); @@ -4629,24 +4616,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->comm_size = size; comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_comm_ctx(&cpuctx->ctx, comm_event); - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_event_comm_ctx(ctx, comm_event); -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - rcu_read_unlock(); + perf_event_aux(perf_event_comm_match, + perf_event_comm_output, + comm_event, + NULL); } void perf_event_comm(struct task_struct *task) @@ -4708,8 +4682,9 @@ struct perf_mmap_event { }; static void perf_event_mmap_output(struct perf_event *event, - struct perf_mmap_event *mmap_event) + void *data) { + struct perf_mmap_event *mmap_event = data; struct perf_output_handle handle; struct perf_sample_data sample; int size = mmap_event->event_id.header.size; @@ -4736,46 +4711,24 @@ out: } static int perf_event_mmap_match(struct perf_event *event, - struct perf_mmap_event *mmap_event, - int executable) + void *data) { - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if ((!executable && event->attr.mmap_data) || - (executable && event->attr.mmap)) - return 1; - - return 0; -} - -static void perf_event_mmap_ctx(struct perf_event_context *ctx, - struct perf_mmap_event *mmap_event, - int executable) -{ - struct perf_event *event; + struct perf_mmap_event *mmap_event = data; + struct vm_area_struct *vma = mmap_event->vma; + int executable = vma->vm_flags & VM_EXEC; - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_mmap_match(event, mmap_event, executable)) - perf_event_mmap_output(event, mmap_event); - } + return (!executable && event->attr.mmap_data) || + (executable && event->attr.mmap); } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) { - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; struct vm_area_struct *vma = mmap_event->vma; struct file *file = vma->vm_file; unsigned int size; char tmp[16]; char *buf = NULL; const char *name; - struct pmu *pmu; - int ctxn; memset(tmp, 0, sizeof(tmp)); @@ -4831,27 +4784,10 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, - vma->vm_flags & VM_EXEC); - - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) { - perf_event_mmap_ctx(ctx, mmap_event, - vma->vm_flags & VM_EXEC); - } -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - rcu_read_unlock(); + perf_event_aux(perf_event_mmap_match, + perf_event_mmap_output, + mmap_event, + NULL); kfree(buf); } -- cgit v1.2.3 From d3251859168b0b12841e1b90d6d768ab478dc23d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 May 2013 11:10:17 -0700 Subject: workqueue: workqueue_congested() shouldn't translate WORK_CPU_UNBOUND into node number df2d5ae499 ("workqueue: map an unbound workqueues to multiple per-node pool_workqueues") made unbound workqueues to map to multiple per-node pool_workqueues and accordingly updated workqueue_contested() so that, for unbound workqueues, it maps the specified @cpu to the NUMA node number to obtain the matching pool_workqueue to query the congested state. Before this change, workqueue_congested() ignored @cpu for unbound workqueues as there was only one pool_workqueue and some users (fscache) called it with WORK_CPU_UNBOUND. After the commit, this causes the following oops as WORK_CPU_UNBOUND gets translated to garbage by cpu_to_node(). BUG: unable to handle kernel paging request at ffff8803598d98b8 IP: [] unbound_pwq_by_node+0xa1/0xfa PGD 2421067 PUD 0 Oops: 0000 [#1] SMP CPU: 1 PID: 2689 Comm: cat Tainted: GF 3.9.0-fsdevel+ #4 task: ffff88003d801040 ti: ffff880025806000 task.ti: ffff880025806000 RIP: 0010:[] [] unbound_pwq_by_node+0xa1/0xfa RSP: 0018:ffff880025807ad8 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffff8800388a2400 RCX: 0000000000000003 RDX: ffff880025807fd8 RSI: ffffffff81a31420 RDI: ffff88003d8016e0 RBP: ffff880025807ae8 R08: ffff88003d801730 R09: ffffffffa00b4898 R10: ffffffff81044217 R11: ffff88003d801040 R12: 0000000064206e97 R13: ffff880036059d98 R14: ffff880038cc8080 R15: ffff880038cc82d0 FS: 00007f21afd9c740(0000) GS:ffff88003d100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffff8803598d98b8 CR3: 000000003df49000 CR4: 00000000000007e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: ffff8800388a2400 0000000000000002 ffff880025807b18 ffffffff810442ce ffffffff81044217 ffff880000000002 ffff8800371b4080 ffff88003d112ec0 ffff880025807b38 ffffffffa00810b0 ffff880036059d88 ffff880036059be8 Call Trace: [] workqueue_congested+0xb7/0x12c [] fscache_enqueue_object+0xb2/0xe8 [fscache] [] __fscache_acquire_cookie+0x3b9/0x56c [fscache] [] nfs_fscache_set_inode_cookie+0xee/0x132 [nfs] [] do_open+0x9/0xd [nfs] [] do_dentry_open+0x175/0x24b [] finish_open+0x41/0x51 Fix it by using smp_processor_id() if @cpu is WORK_CPU_UNBOUND. Signed-off-by: Tejun Heo Reported-by: David Howells Tested-and-Acked-by: David Howells --- kernel/workqueue.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4aa9f5bc6b2d..1ae602809efb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4311,6 +4311,12 @@ bool current_is_workqueue_rescuer(void) * no synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. * + * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU. + * Note that both per-cpu and unbound workqueues may be associated with + * multiple pool_workqueues which have separate congested states. A + * workqueue being congested on one CPU doesn't mean the workqueue is also + * contested on other CPUs / NUMA nodes. + * * RETURNS: * %true if congested, %false otherwise. */ @@ -4321,6 +4327,9 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) rcu_read_lock_sched(); + if (cpu == WORK_CPU_UNBOUND) + cpu = smp_processor_id(); + if (!(wq->flags & WQ_UNBOUND)) pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); else -- cgit v1.2.3 From 4b0c0f294f60abcdd20994a8341a95c8ac5eeb96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 May 2013 15:02:50 +0200 Subject: tick: Cleanup NOHZ per cpu data on cpu down Prarit reported a crash on CPU offline/online. The reason is that on CPU down the NOHZ related per cpu data of the dead cpu is not cleaned up. If at cpu online an interrupt happens before the per cpu tick device is registered the irq_enter() check potentially sees stale data and dereferences a NULL pointer. Cleanup the data after the cpu is dead. Reported-by: Prarit Bhargava Cc: stable@vger.kernel.org Cc: Mike Galbraith Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305031451561.2886@ionos Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 225f8bf19095..0eed1db2d792 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -904,7 +904,7 @@ void tick_cancel_sched_timer(int cpu) hrtimer_cancel(&ts->sched_timer); # endif - ts->nohz_mode = NOHZ_MODE_INACTIVE; + memset(ts, 0, sizeof(*ts)); } #endif -- cgit v1.2.3 From f7ea0fd639c2c48d3c61b6eec75362be290c6874 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 May 2013 21:40:27 +0200 Subject: tick: Don't invoke tick_nohz_stop_sched_tick() if the cpu is offline commit 5b39939a4 (nohz: Move ts->idle_calls incrementation into strict idle logic) moved code out of tick_nohz_stop_sched_tick() and missed to bail out when the cpu is offline. That's causing subsequent failures as an offline CPU is supposed to die and not to fiddle with nohz magic. Return false in can_stop_idle_tick() if the cpu is offline. Reported-and-tested-by: Jiri Kosina Reported-and-tested-by: Prarit Bhargava Cc: Frederic Weisbecker Cc: Borislav Petkov Cc: Tony Luck Cc: x86@kernel.org Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305132138160.2863@ionos Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 0eed1db2d792..012142187db9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -469,6 +469,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) tick_do_timer_cpu = TICK_DO_TIMER_NONE; + return false; } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) -- cgit v1.2.3 From b47430d3adbedbfdb5979ba4874f5dadf94f16b1 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 14 May 2013 04:01:27 +0530 Subject: rcu/idle: Wrap cpu-idle poll mode within rcu_idle_enter/exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bjørn Mork reported the following warning when running powertop. [ 49.289034] ------------[ cut here ]------------ [ 49.289055] WARNING: at kernel/rcutree.c:502 rcu_eqs_exit_common.isra.48+0x3d/0x125() [ 49.289244] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.10.0-bisect-rcu-warn+ #107 [ 49.289251] ffffffff8157d8c8 ffffffff81801e28 ffffffff8137e4e3 ffffffff81801e68 [ 49.289260] ffffffff8103094f ffffffff81801e68 0000000000000000 ffff88023afcd9b0 [ 49.289268] 0000000000000000 0140000000000000 ffff88023bee7700 ffffffff81801e78 [ 49.289276] Call Trace: [ 49.289285] [] dump_stack+0x19/0x1b [ 49.289293] [] warn_slowpath_common+0x62/0x7b [ 49.289300] [] warn_slowpath_null+0x15/0x17 [ 49.289306] [] rcu_eqs_exit_common.isra.48+0x3d/0x125 [ 49.289314] [] ? trace_hardirqs_off_caller+0x37/0xa6 [ 49.289320] [] rcu_idle_exit+0x85/0xa8 [ 49.289327] [] trace_cpu_idle_rcuidle+0xae/0xff [ 49.289334] [] cpu_startup_entry+0x72/0x115 [ 49.289341] [] rest_init+0x149/0x150 [ 49.289347] [] ? csum_partial_copy_generic+0x16c/0x16c [ 49.289355] [] start_kernel+0x3f0/0x3fd [ 49.289362] [] ? repair_env_string+0x5a/0x5a [ 49.289368] [] x86_64_start_reservations+0x2a/0x2c [ 49.289375] [] x86_64_start_kernel+0xcd/0xd1 [ 49.289379] ---[ end trace 07a1cc95e29e9036 ]--- The warning is that 'rdtp->dynticks' has an unexpected value, which roughly translates to - the calls to rcu_idle_enter() and rcu_idle_exit() were not made in the correct order, or otherwise messed up. And Bjørn's painstaking debugging indicated that this happens when the idle loop enters the poll mode. Looking at the poll function cpu_idle_poll(), and the implementation of trace_cpu_idle_rcuidle(), the problem becomes very clear: cpu_idle_poll() lacks calls to rcu_idle_enter/exit(), and trace_cpu_idle_rcuidle() calls them in the reverse order - first rcu_idle_exit(), and then rcu_idle_enter(). Hence the even/odd alternative sequencing of rdtp->dynticks goes for a toss. And powertop readily triggers this because powertop uses the idle-tracing infrastructure extensively. So, to fix this, wrap the code in cpu_idle_poll() within rcu_idle_enter/exit(), so that it blends properly with the calls inside trace_cpu_idle_rcuidle() and thus get the function ordering right. Reported-and-tested-by: Bjørn Mork Cc: Paul McKenney Cc: Steven Rostedt Cc: Dipankar Sarma Signed-off-by: Srivatsa S. Bhat Link: http://lkml.kernel.org/r/519169BF.4080208@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner --- kernel/cpu/idle.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index 8b86c0c68edf..d5585f5e038e 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -40,11 +40,13 @@ __setup("hlt", cpu_idle_nopoll_setup); static inline int cpu_idle_poll(void) { + rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + rcu_idle_exit(); return 1; } -- cgit v1.2.3 From 42a5cf46cd56f46267d2a9fcf2655f4078cd3042 Mon Sep 17 00:00:00 2001 From: Tirupathi Reddy Date: Tue, 14 May 2013 13:59:02 +0530 Subject: timer: Don't reinitialize the cpu base lock during CPU_UP_PREPARE An inactive timer's base can refer to a offline cpu's base. In the current code, cpu_base's lock is blindly reinitialized each time a CPU is brought up. If a CPU is brought online during the period that another thread is trying to modify an inactive timer on that CPU with holding its timer base lock, then the lock will be reinitialized under its feet. This leads to following SPIN_BUG(). <0> BUG: spinlock already unlocked on CPU#3, kworker/u:3/1466 <0> lock: 0xe3ebe000, .magic: dead4ead, .owner: kworker/u:3/1466, .owner_cpu: 1 <4> [] (unwind_backtrace+0x0/0x11c) from [] (do_raw_spin_unlock+0x40/0xcc) <4> [] (do_raw_spin_unlock+0x40/0xcc) from [] (_raw_spin_unlock+0x8/0x30) <4> [] (_raw_spin_unlock+0x8/0x30) from [] (mod_timer+0x294/0x310) <4> [] (mod_timer+0x294/0x310) from [] (queue_delayed_work_on+0x104/0x120) <4> [] (queue_delayed_work_on+0x104/0x120) from [] (sdhci_msm_bus_voting+0x88/0x9c) <4> [] (sdhci_msm_bus_voting+0x88/0x9c) from [] (sdhci_disable+0x40/0x48) <4> [] (sdhci_disable+0x40/0x48) from [] (mmc_release_host+0x4c/0xb0) <4> [] (mmc_release_host+0x4c/0xb0) from [] (mmc_sd_detect+0x90/0xfc) <4> [] (mmc_sd_detect+0x90/0xfc) from [] (mmc_rescan+0x7c/0x2c4) <4> [] (mmc_rescan+0x7c/0x2c4) from [] (process_one_work+0x27c/0x484) <4> [] (process_one_work+0x27c/0x484) from [] (worker_thread+0x210/0x3b0) <4> [] (worker_thread+0x210/0x3b0) from [] (kthread+0x80/0x8c) <4> [] (kthread+0x80/0x8c) from [] (kernel_thread_exit+0x0/0x8) As an example, this particular crash occurred when CPU #3 is executing mod_timer() on an inactive timer whose base is refered to offlined CPU #2. The code locked the timer_base corresponding to CPU #2. Before it could proceed, CPU #2 came online and reinitialized the spinlock corresponding to its base. Thus now CPU #3 held a lock which was reinitialized. When CPU #3 finally ended up unlocking the old cpu_base corresponding to CPU #2, we hit the above SPIN_BUG(). CPU #0 CPU #3 CPU #2 ------ ------- ------- ..... ...... mod_timer() lock_timer_base spin_lock_irqsave(&base->lock) cpu_up(2) ..... ...... init_timers_cpu() .... ..... spin_lock_init(&base->lock) ..... spin_unlock_irqrestore(&base->lock) ...... Allocation of per_cpu timer vector bases is done only once under "tvec_base_done[]" check. In the current code, spinlock_initialization of base->lock isn't under this check. When a CPU is up each time the base lock is reinitialized. Move base spinlock initialization under the check. Signed-off-by: Tirupathi Reddy Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1368520142-4136-1-git-send-email-tirupath@codeaurora.org Signed-off-by: Thomas Gleixner --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 09bca8ce9771..7376589adc28 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1539,12 +1539,12 @@ static int __cpuinit init_timers_cpu(int cpu) boot_done = 1; base = &boot_tvec_bases; } + spin_lock_init(&base->lock); tvec_base_done[cpu] = 1; } else { base = per_cpu(tvec_bases, cpu); } - spin_lock_init(&base->lock); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); -- cgit v1.2.3 From b4f711ee03d28f776fd2324fd0bd999cc428e4d2 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 24 Apr 2013 11:32:56 -0700 Subject: time: Revert ALWAYS_USE_PERSISTENT_CLOCK compile time optimizaitons Kay Sievers noted that the ALWAYS_USE_PERSISTENT_CLOCK config, which enables some minor compile time optimization to avoid uncessary code in mostly the suspend/resume path could cause problems for userland. In particular, the dependency for RTC_HCTOSYS on !ALWAYS_USE_PERSISTENT_CLOCK, which avoids setting the time twice and simplifies suspend/resume, has the side effect of causing the /sys/class/rtc/rtcN/hctosys flag to always be zero, and this flag is commonly used by udev to setup the /dev/rtc symlink to /dev/rtcN, which can cause pain for older applications. While the udev rules could use some work to be less fragile, breaking userland should strongly be avoided. Additionally the compile time optimizations are fairly minor, and the code being optimized is likely to be reworked in the future, so lets revert this change. Reported-by: Kay Sievers Signed-off-by: John Stultz Cc: stable #3.9 Cc: Feng Tang Cc: Jason Gunthorpe Link: http://lkml.kernel.org/r/1366828376-18124-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/Kconfig | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 24510d84efd7..b69692250af4 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -12,11 +12,6 @@ config CLOCKSOURCE_WATCHDOG config ARCH_CLOCKSOURCE_DATA bool -# Platforms has a persistent clock -config ALWAYS_USE_PERSISTENT_CLOCK - bool - default n - # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool -- cgit v1.2.3 From 60705c89460fdc7227f2d153b68b3f34814738a4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 14 May 2013 15:40:48 -0400 Subject: tracing: Fix leaks of filter preds Special preds are created when folding a series of preds that can be done in serial. These are allocated in an ops field of the pred structure. But they were never freed, causing memory leaks. This was discovered using the kmemleak checker: unreferenced object 0xffff8800797fd5e0 (size 32): comm "swapper/0", pid 1, jiffies 4294690605 (age 104.608s) hex dump (first 32 bytes): 00 00 01 00 03 00 05 00 07 00 09 00 0b 00 0d 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x73/0x98 [] kmemleak_alloc_recursive.constprop.42+0x16/0x18 [] __kmalloc+0xd7/0x125 [] kcalloc.constprop.24+0x2d/0x2f [] fold_pred_tree_cb+0xa9/0xf4 [] walk_pred_tree+0x47/0xcc [] replace_preds.isra.20+0x6f8/0x72f [] create_filter+0x4e/0x8b [] ftrace_test_event_filter+0x5a/0x155 [] do_one_initcall+0xa0/0x137 [] kernel_init_freeable+0x14d/0x1dc [] kernel_init+0xe/0xdb [] ret_from_fork+0x7c/0xb0 [] 0xffffffffffffffff Cc: Tom Zanussi Cc: stable@vger.kernel.org # 2.6.39+ Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index a6361178de5a..e1b653f7e1ca 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -750,7 +750,11 @@ static int filter_set_pred(struct event_filter *filter, static void __free_preds(struct event_filter *filter) { + int i; + if (filter->preds) { + for (i = 0; i < filter->n_preds; i++) + kfree(filter->preds[i].ops); kfree(filter->preds); filter->preds = NULL; } -- cgit v1.2.3 From c02c7e65d9b13670e34bc523744cf4f6e99c198a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 13 May 2013 20:58:34 +0900 Subject: tracing/kprobes: Use rcu_dereference_raw for tp->files Use rcu_dereference_raw() for accessing tp->files. Because the write-side uses rcu_assign_pointer() for memory barrier, the read-side also has to use rcu_dereference_raw() with read memory barrier. Link: http://lkml.kernel.org/r/20130513115834.6545.17022.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Tom Zanussi Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 47 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 636d45fe69b3..0a3d8d5c483d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -185,9 +185,14 @@ static struct trace_probe *find_trace_probe(const char *event, static int trace_probe_nr_files(struct trace_probe *tp) { - struct ftrace_event_file **file = tp->files; + struct ftrace_event_file **file; int ret = 0; + /* + * Since all tp->files updater is protected by probe_enable_lock, + * we don't need to lock an rcu_read_lock. + */ + file = rcu_dereference_raw(tp->files); if (file) while (*(file++)) ret++; @@ -209,9 +214,10 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) mutex_lock(&probe_enable_lock); if (file) { - struct ftrace_event_file **new, **old = tp->files; + struct ftrace_event_file **new, **old; int n = trace_probe_nr_files(tp); + old = rcu_dereference_raw(tp->files); /* 1 is for new one and 1 is for stopper */ new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *), GFP_KERNEL); @@ -251,11 +257,17 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) static int trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file) { + struct ftrace_event_file **files; int i; - if (tp->files) { - for (i = 0; tp->files[i]; i++) - if (tp->files[i] == file) + /* + * Since all tp->files updater is protected by probe_enable_lock, + * we don't need to lock an rcu_read_lock. + */ + files = rcu_dereference_raw(tp->files); + if (files) { + for (i = 0; files[i]; i++) + if (files[i] == file) return i; } @@ -274,10 +286,11 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) mutex_lock(&probe_enable_lock); if (file) { - struct ftrace_event_file **new, **old = tp->files; + struct ftrace_event_file **new, **old; int n = trace_probe_nr_files(tp); int i, j; + old = rcu_dereference_raw(tp->files); if (n == 0 || trace_probe_file_index(tp, file) < 0) { ret = -EINVAL; goto out_unlock; @@ -872,9 +885,16 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, static __kprobes void kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) { - struct ftrace_event_file **file = tp->files; + /* + * Note: preempt is already disabled around the kprobe handler. + * However, we still need an smp_read_barrier_depends() corresponding + * to smp_wmb() in rcu_assign_pointer() to access the pointer. + */ + struct ftrace_event_file **file = rcu_dereference_raw(tp->files); + + if (unlikely(!file)) + return; - /* Note: preempt is already disabled around the kprobe handler */ while (*file) { __kprobe_trace_func(tp, regs, *file); file++; @@ -925,9 +945,16 @@ static __kprobes void kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, struct pt_regs *regs) { - struct ftrace_event_file **file = tp->files; + /* + * Note: preempt is already disabled around the kprobe handler. + * However, we still need an smp_read_barrier_depends() corresponding + * to smp_wmb() in rcu_assign_pointer() to access the pointer. + */ + struct ftrace_event_file **file = rcu_dereference_raw(tp->files); + + if (unlikely(!file)) + return; - /* Note: preempt is already disabled around the kprobe handler */ while (*file) { __kretprobe_trace_func(tp, ri, regs, *file); file++; -- cgit v1.2.3 From 3d1fc7b0880c4db612a3d3211a808659e28af588 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 13 May 2013 20:58:37 +0900 Subject: tracing/kprobes: Fix a sparse warning for incorrect type in assignment Fix a sparse warning about the rcu operated pointer is defined without __rcu address space. Link: http://lkml.kernel.org/r/20130513115837.6545.23322.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Tom Zanussi Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 0a3d8d5c483d..81c5109b9d00 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -35,7 +35,7 @@ struct trace_probe { const char *symbol; /* symbol name */ struct ftrace_event_class class; struct ftrace_event_call call; - struct ftrace_event_file **files; + struct ftrace_event_file * __rcu *files; ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; -- cgit v1.2.3 From b62fdd97fcae17e483b005bafd13fadbd9840672 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 13 May 2013 20:58:39 +0900 Subject: tracing/kprobes: Make print_*probe_event static According to sparse warning, print_*probe_event static because those functions are not directly called from outside. Link: http://lkml.kernel.org/r/20130513115839.6545.83067.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Tom Zanussi Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 81c5109b9d00..9f46e98ba8f2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -962,7 +962,7 @@ kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, } /* Event entry printers */ -enum print_line_t +static enum print_line_t print_kprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) { @@ -998,7 +998,7 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } -enum print_line_t +static enum print_line_t print_kretprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) { -- cgit v1.2.3 From 5d33b883aed81c6fbcd09c6f7c3619eee850a7e2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:43 +0000 Subject: clocksource: Always verify highres capability If a clocksource has a (wrong) high rating, but can't be used as a timebase for oneshot tick mode, it is unconditionally selected even when the system is already in oneshot tick mode. This causes full system failure. Verify the clocksource selection against the oneshot mode. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.635040849@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c9583382141a..dda5c7130d93 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -553,6 +553,26 @@ static u64 clocksource_max_deferment(struct clocksource *cs) #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET +static struct clocksource *clocksource_find_best(bool oneshot) +{ + struct clocksource *cs; + + if (!finished_booting || list_empty(&clocksource_list)) + return NULL; + + /* + * We pick the clocksource with the highest rating. If oneshot + * mode is active, we pick the highres valid clocksource with + * the best rating. + */ + list_for_each_entry(cs, &clocksource_list, list) { + if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) + continue; + return cs; + } + return NULL; +} + /** * clocksource_select - Select the best clocksource available * @@ -563,12 +583,14 @@ static u64 clocksource_max_deferment(struct clocksource *cs) */ static void clocksource_select(void) { + bool oneshot = tick_oneshot_mode_active(); struct clocksource *best, *cs; - if (!finished_booting || list_empty(&clocksource_list)) + /* Find the best suitable clocksource */ + best = clocksource_find_best(oneshot); + if (!best) return; - /* First clocksource on the list has the best rating. */ - best = list_first_entry(&clocksource_list, struct clocksource, list); + /* Check for the override clocksource. */ list_for_each_entry(cs, &clocksource_list, list) { if (strcmp(cs->name, override_name) != 0) @@ -578,8 +600,7 @@ static void clocksource_select(void) * capable clocksource if the tick code is in oneshot * mode (highres or nohz) */ - if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && - tick_oneshot_mode_active()) { + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { /* Override clocksource cannot be used. */ printk(KERN_WARNING "Override clocksource %s is not " "HRT compatible. Cannot switch while in " -- cgit v1.2.3 From ba919d1caa2e624eb8c6cae1f2ce0a253e697d45 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:44 +0000 Subject: clocksource: Let timekeeping_notify return success/error timekeeping_notify() can fail due cs->enable() failure. Though the caller does not notice and happily keeps the wrong clocksource as the current one. Let the caller know about failure, so the current clocksource will be shown correctly in sysfs. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.696321912@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 6 +++--- kernel/time/timekeeping.c | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index dda5c7130d93..1923a340bd91 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -611,10 +611,10 @@ static void clocksource_select(void) best = cs; break; } - if (curr_clocksource != best) { - printk(KERN_INFO "Switching to clocksource %s\n", best->name); + + if (curr_clocksource != best && !timekeeping_notify(best)) { + pr_info("Switched to clocksource %s\n", best->name); curr_clocksource = best; - timekeeping_notify(curr_clocksource); } } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 98cd470bbe49..da6e10c7a378 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -648,14 +648,15 @@ static int change_clocksource(void *data) * This function is called from clocksource.c after a new, better clock * source has been registered. The caller holds the clocksource_mutex. */ -void timekeeping_notify(struct clocksource *clock) +int timekeeping_notify(struct clocksource *clock) { struct timekeeper *tk = &timekeeper; if (tk->clock == clock) - return; + return 0; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); + return tk->clock == clock ? 0 : -1; } /** -- cgit v1.2.3 From 09ac369c825d9d593404306d59062d854b321e9b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:44 +0000 Subject: clocksource: Add module refcount Add a module refcount, so the current clocksource cannot be removed unconditionally. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.762417789@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index da6e10c7a378..933efa4071c3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -627,11 +627,20 @@ static int change_clocksource(void *data) write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); - if (!new->enable || new->enable(new) == 0) { - old = tk->clock; - tk_setup_internals(tk, new); - if (old->disable) - old->disable(old); + /* + * If the cs is in module, get a module reference. Succeeds + * for built-in code (owner == NULL) as well. + */ + if (try_module_get(new->owner)) { + if (!new->enable || new->enable(new) == 0) { + old = tk->clock; + tk_setup_internals(tk, new); + if (old->disable) + old->disable(old); + module_put(old->owner); + } else { + module_put(new->owner); + } } timekeeping_update(tk, true, true); -- cgit v1.2.3 From f5a2e34375a5e2b711aea488ac3ae50eeba6d57c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:45 +0000 Subject: clocksource: Allow clocksource select to skip current clocksource Preparatory patch for clocksource unbind support. Split out code from clocksource_select and modify it, so it skips the current clocksource on request and tries to find a fallback clocksource. Convert all existing users. No functional change. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.834965397@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1923a340bd91..9782997cb6cf 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -553,7 +553,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET -static struct clocksource *clocksource_find_best(bool oneshot) +static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) { struct clocksource *cs; @@ -566,6 +566,8 @@ static struct clocksource *clocksource_find_best(bool oneshot) * the best rating. */ list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) continue; return cs; @@ -573,26 +575,20 @@ static struct clocksource *clocksource_find_best(bool oneshot) return NULL; } -/** - * clocksource_select - Select the best clocksource available - * - * Private function. Must hold clocksource_mutex when called. - * - * Select the clocksource with the best rating, or the clocksource, - * which is selected by userspace override. - */ -static void clocksource_select(void) +static void __clocksource_select(bool skipcur) { bool oneshot = tick_oneshot_mode_active(); struct clocksource *best, *cs; /* Find the best suitable clocksource */ - best = clocksource_find_best(oneshot); + best = clocksource_find_best(oneshot, skipcur); if (!best) return; /* Check for the override clocksource. */ list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; if (strcmp(cs->name, override_name) != 0) continue; /* @@ -618,6 +614,19 @@ static void clocksource_select(void) } } +/** + * clocksource_select - Select the best clocksource available + * + * Private function. Must hold clocksource_mutex when called. + * + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. + */ +static void clocksource_select(void) +{ + return __clocksource_select(false); +} + #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } -- cgit v1.2.3 From 29b5407819f59731c9423238fae03b756822708c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:45 +0000 Subject: clocksource: Split out user string input Split out the user string input for clocksource override. Preparatory patch for unbind. [ jstultz: Fix an off by one error ] Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.895851338@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 9782997cb6cf..d7f1a45c2fa5 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -174,7 +174,8 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) static struct clocksource *curr_clocksource; static LIST_HEAD(clocksource_list); static DEFINE_MUTEX(clocksource_mutex); -static char override_name[32]; +#define CS_NAME_LEN 32 +static char override_name[CS_NAME_LEN]; static int finished_booting; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG @@ -838,6 +839,23 @@ sysfs_show_current_clocksources(struct device *dev, return count; } +static size_t clocksource_get_uname(const char *buf, char *dst, size_t cnt) +{ + size_t ret = cnt; + + /* strings from sysfs write are not 0 terminated! */ + if (!cnt || cnt >= CS_NAME_LEN) + return -EINVAL; + + /* strip of \n: */ + if (buf[cnt-1] == '\n') + cnt--; + if (cnt > 0) + memcpy(dst, buf, cnt); + dst[cnt] = 0; + return ret; +} + /** * sysfs_override_clocksource - interface for manually overriding clocksource * @dev: unused @@ -852,22 +870,13 @@ static ssize_t sysfs_override_clocksource(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - size_t ret = count; - - /* strings from sysfs write are not 0 terminated! */ - if (count >= sizeof(override_name)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; + size_t ret; mutex_lock(&clocksource_mutex); - if (count > 0) - memcpy(override_name, buf, count); - override_name[count] = 0; - clocksource_select(); + ret = clocksource_get_uname(buf, override_name, count); + if (ret >= 0) + clocksource_select(); mutex_unlock(&clocksource_mutex); -- cgit v1.2.3 From 7eaeb34305dee26634f7c98ae62646da5cebe91d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:46 +0000 Subject: clocksource: Provide unbind interface in sysfs With the module refcount held for the current clocksource there is no way to unload the module. Provide a sysfs interface which allows to unbind the clocksource. One could argue that the clocksource override could be (ab)used to do so, but the clocksource override cannot be used from the kernel itself, while an unbind function can be used to programmatically check whether a clocksource can be shutdown or not. The unbind functionality uses the new skip current feature of clocksource_select and verifies that a fallback clocksource has been installed. If the clocksource which should be unbound is the current clocksource and no fallback can be found, unbind returns -EBUSY. This does not support the unbinding of a clocksource which is used as the watchdog clocksource. No point in fostering crappy hardware. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.964218245@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d7f1a45c2fa5..791d1aeb17ac 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -440,6 +440,11 @@ static int clocksource_watchdog_kthread(void *data) return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) +{ + return cs == watchdog; +} + #else /* CONFIG_CLOCKSOURCE_WATCHDOG */ static void clocksource_enqueue_watchdog(struct clocksource *cs) @@ -451,6 +456,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } static inline int clocksource_watchdog_kthread(void *data) { return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ @@ -628,6 +634,11 @@ static void clocksource_select(void) return __clocksource_select(false); } +static void clocksource_select_fallback(void) +{ + return __clocksource_select(true); +} + #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } @@ -803,6 +814,29 @@ void clocksource_change_rating(struct clocksource *cs, int rating) } EXPORT_SYMBOL(clocksource_change_rating); +/* + * Unbind clocksource @cs. Called with clocksource_mutex held + */ +static int clocksource_unbind(struct clocksource *cs) +{ + /* + * I really can't convince myself to support this on hardware + * designed by lobotomized monkeys. + */ + if (clocksource_is_watchdog(cs)) + return -EBUSY; + + if (cs == curr_clocksource) { + /* Select and try to install a replacement clock source */ + clocksource_select_fallback(); + if (curr_clocksource == cs) + return -EBUSY; + } + clocksource_dequeue_watchdog(cs); + list_del_init(&cs->list); + return 0; +} + /** * clocksource_unregister - remove a registered clocksource * @cs: clocksource to be unregistered @@ -883,6 +917,40 @@ static ssize_t sysfs_override_clocksource(struct device *dev, return ret; } +/** + * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource + * @dev: unused + * @attr: unused + * @buf: unused + * @count: length of buffer + * + * Takes input from sysfs interface for manually unbinding a clocksource. + */ +static ssize_t sysfs_unbind_clocksource(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct clocksource *cs; + char name[CS_NAME_LEN]; + size_t ret; + + ret = clocksource_get_uname(buf, name, count); + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clocksource_mutex); + list_for_each_entry(cs, &clocksource_list, list) { + if (strcmp(cs->name, name)) + continue; + ret = clocksource_unbind(cs); + break; + } + mutex_unlock(&clocksource_mutex); + + return ret ? ret : count; +} + /** * sysfs_show_available_clocksources - sysfs interface for listing clocksource * @dev: unused @@ -925,6 +993,8 @@ sysfs_show_available_clocksources(struct device *dev, static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, sysfs_override_clocksource); +static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource); + static DEVICE_ATTR(available_clocksource, 0444, sysfs_show_available_clocksources, NULL); @@ -948,6 +1018,9 @@ static int __init init_clocksource_sysfs(void) error = device_create_file( &device_clocksource, &dev_attr_current_clocksource); + if (!error) + error = device_create_file(&device_clocksource, + &dev_attr_unbind_clocksource); if (!error) error = device_create_file( &device_clocksource, -- cgit v1.2.3 From a89c7edbe7d7aa80f507915f3dd801211b116b79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:46 +0000 Subject: clocksource: Let clocksource_unregister() return success/error The unregister call can fail, if the clocksource is the current one and there is no replacement clocksource available. It can also fail, if the clocksource is the watchdog clocksource and I'm not going to provide support for this. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.029915527@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 791d1aeb17ac..31b90332f47b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -389,28 +389,17 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static void clocksource_dequeue_watchdog(struct clocksource *cs) { - struct clocksource *tmp; unsigned long flags; spin_lock_irqsave(&watchdog_lock, flags); - if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { - /* cs is a watched clocksource. */ - list_del_init(&cs->wd_list); - } else if (cs == watchdog) { - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - /* Current watchdog is removed. Find an alternative. */ - watchdog = NULL; - list_for_each_entry(tmp, &clocksource_list, list) { - if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY) - continue; - if (!watchdog || tmp->rating > watchdog->rating) - watchdog = tmp; + if (cs != watchdog) { + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + /* cs is a watched clocksource. */ + list_del_init(&cs->wd_list); + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); } } - cs->flags &= ~CLOCK_SOURCE_WATCHDOG; - /* Check if the watchdog timer needs to be stopped. */ - clocksource_stop_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); } @@ -841,13 +830,15 @@ static int clocksource_unbind(struct clocksource *cs) * clocksource_unregister - remove a registered clocksource * @cs: clocksource to be unregistered */ -void clocksource_unregister(struct clocksource *cs) +int clocksource_unregister(struct clocksource *cs) { + int ret = 0; + mutex_lock(&clocksource_mutex); - clocksource_dequeue_watchdog(cs); - list_del(&cs->list); - clocksource_select(); + if (!list_empty(&cs->list)) + ret = clocksource_unbind(cs); mutex_unlock(&clocksource_mutex); + return ret; } EXPORT_SYMBOL(clocksource_unregister); -- cgit v1.2.3 From 7172a286ced0c1f4f239a0fa09db54ed37d3ead2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:47 +0000 Subject: clockevents: Get rid of the notifier chain 7+ years and still a single user. Kill it. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.098520211@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 35 +++-------------------------------- kernel/time/tick-broadcast.c | 5 ++--- kernel/time/tick-common.c | 30 +++++------------------------- kernel/time/tick-internal.h | 7 ++++--- 4 files changed, 14 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index c6d6400ee137..dd70b4842c62 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include "tick-internal.h" @@ -23,10 +22,6 @@ /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); - -/* Notification for clock events */ -static RAW_NOTIFIER_HEAD(clockevents_chain); - /* Protection for the above */ static DEFINE_RAW_SPINLOCK(clockevents_lock); @@ -232,30 +227,6 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, return (rc && force) ? clockevents_program_min_delta(dev) : rc; } -/** - * clockevents_register_notifier - register a clock events change listener - */ -int clockevents_register_notifier(struct notifier_block *nb) -{ - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&clockevents_lock, flags); - ret = raw_notifier_chain_register(&clockevents_chain, nb); - raw_spin_unlock_irqrestore(&clockevents_lock, flags); - - return ret; -} - -/* - * Notify about a clock event change. Called with clockevents_lock - * held. - */ -static void clockevents_do_notify(unsigned long reason, void *dev) -{ - raw_notifier_call_chain(&clockevents_chain, reason, dev); -} - /* * Called after a notify add to make devices available which were * released from the notifier call. @@ -269,7 +240,7 @@ static void clockevents_notify_released(void) struct clock_event_device, list); list_del(&dev->list); list_add(&dev->list, &clockevent_devices); - clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + tick_check_new_device(dev); } } @@ -290,7 +261,7 @@ void clockevents_register_device(struct clock_event_device *dev) raw_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); - clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + tick_check_new_device(dev); clockevents_notify_released(); raw_spin_unlock_irqrestore(&clockevents_lock, flags); @@ -433,7 +404,7 @@ void clockevents_notify(unsigned long reason, void *arg) int cpu; raw_spin_lock_irqsave(&clockevents_lock, flags); - clockevents_do_notify(reason, arg); + tick_notify(reason, arg); switch (reason) { case CLOCK_EVT_NOTIFY_CPU_DEAD: diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 24938d577669..3500caaa0bfd 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -64,7 +64,7 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) /* * Check, if the device can be utilized as broadcast device: */ -int tick_check_broadcast_device(struct clock_event_device *dev) +void tick_install_broadcast_device(struct clock_event_device *dev) { struct clock_event_device *cur = tick_broadcast_device.evtdev; @@ -72,7 +72,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev) (tick_broadcast_device.evtdev && tick_broadcast_device.evtdev->rating >= dev->rating) || (dev->features & CLOCK_EVT_FEAT_C3STOP)) - return 0; + return; clockevents_exchange_device(tick_broadcast_device.evtdev, dev); if (cur) @@ -90,7 +90,6 @@ int tick_check_broadcast_device(struct clock_event_device *dev) */ if (dev->features & CLOCK_EVT_FEAT_ONESHOT) tick_clock_notify(); - return 1; } /* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 5d3fb100bc06..dbf4e18d5101 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -208,11 +208,11 @@ static void tick_setup_device(struct tick_device *td, /* * Check, if the new registered device should be used. */ -static int tick_check_new_device(struct clock_event_device *newdev) +void tick_check_new_device(struct clock_event_device *newdev) { struct clock_event_device *curdev; struct tick_device *td; - int cpu, ret = NOTIFY_OK; + int cpu; unsigned long flags; raw_spin_lock_irqsave(&tick_device_lock, flags); @@ -275,18 +275,14 @@ static int tick_check_new_device(struct clock_event_device *newdev) tick_oneshot_notify(); raw_spin_unlock_irqrestore(&tick_device_lock, flags); - return NOTIFY_STOP; + return; out_bc: /* * Can the new device be used as a broadcast device ? */ - if (tick_check_broadcast_device(newdev)) - ret = NOTIFY_STOP; - + tick_install_broadcast_device(newdev); raw_spin_unlock_irqrestore(&tick_device_lock, flags); - - return ret; } /* @@ -360,17 +356,10 @@ static void tick_resume(void) raw_spin_unlock_irqrestore(&tick_device_lock, flags); } -/* - * Notification about clock event devices - */ -static int tick_notify(struct notifier_block *nb, unsigned long reason, - void *dev) +void tick_notify(unsigned long reason, void *dev) { switch (reason) { - case CLOCK_EVT_NOTIFY_ADD: - return tick_check_new_device(dev); - case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_OFF: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: @@ -404,21 +393,12 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason, default: break; } - - return NOTIFY_OK; } -static struct notifier_block tick_notifier = { - .notifier_call = tick_notify, -}; - /** * tick_init - initialize the tick control - * - * Register the notifier with the clockevents framework */ void __init tick_init(void) { - clockevents_register_notifier(&tick_notifier); tick_broadcast_init(); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f0299eae4602..60742fe6f63d 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -18,6 +18,8 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); +extern void tick_notify(unsigned long reason, void *dev); +extern void tick_check_new_device(struct clock_event_device *dev); extern void clockevents_shutdown(struct clock_event_device *dev); @@ -90,7 +92,7 @@ static inline bool tick_broadcast_oneshot_available(void) { return false; } */ #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); -extern int tick_check_broadcast_device(struct clock_event_device *dev); +extern void tick_install_broadcast_device(struct clock_event_device *dev); extern int tick_is_broadcast_device(struct clock_event_device *dev); extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); extern void tick_shutdown_broadcast(unsigned int *cpup); @@ -102,9 +104,8 @@ tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); #else /* !BROADCAST */ -static inline int tick_check_broadcast_device(struct clock_event_device *dev) +static inline void tick_install_broadcast_device(struct clock_event_device *dev) { - return 0; } static inline int tick_is_broadcast_device(struct clock_event_device *dev) -- cgit v1.2.3 From 7126cac426137633e470167524e7bcb590fd49b3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:48 +0000 Subject: clockevents: Simplify locking Now that the notifier chain is gone there are no other users and it's pointless to nest tick_device_lock inside of clockevents_lock because there is no other use case. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.162888472@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index dbf4e18d5101..170a4bdfa99e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -33,7 +33,6 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); ktime_t tick_next_period; ktime_t tick_period; int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; -static DEFINE_RAW_SPINLOCK(tick_device_lock); /* * Debugging: see timer_list.c @@ -206,16 +205,14 @@ static void tick_setup_device(struct tick_device *td, } /* - * Check, if the new registered device should be used. + * Check, if the new registered device should be used. Called with + * clockevents_lock held and interrupts disabled. */ void tick_check_new_device(struct clock_event_device *newdev) { struct clock_event_device *curdev; struct tick_device *td; int cpu; - unsigned long flags; - - raw_spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); if (!cpumask_test_cpu(cpu, newdev->cpumask)) @@ -273,8 +270,6 @@ void tick_check_new_device(struct clock_event_device *newdev) tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); - - raw_spin_unlock_irqrestore(&tick_device_lock, flags); return; out_bc: @@ -282,7 +277,6 @@ out_bc: * Can the new device be used as a broadcast device ? */ tick_install_broadcast_device(newdev); - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } /* @@ -311,9 +305,7 @@ static void tick_shutdown(unsigned int *cpup) { struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); struct clock_event_device *dev = td->evtdev; - unsigned long flags; - raw_spin_lock_irqsave(&tick_device_lock, flags); td->mode = TICKDEV_MODE_PERIODIC; if (dev) { /* @@ -325,26 +317,20 @@ static void tick_shutdown(unsigned int *cpup) dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; } - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } static void tick_suspend(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); - unsigned long flags; - raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_shutdown(td->evtdev); - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } static void tick_resume(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); - unsigned long flags; int broadcast = tick_resume_broadcast(); - raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); if (!broadcast) { @@ -353,9 +339,11 @@ static void tick_resume(void) else tick_resume_oneshot(); } - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } +/* + * Called with clockevents_lock held and interrupts disabled + */ void tick_notify(unsigned long reason, void *dev) { switch (reason) { -- cgit v1.2.3 From 8c53daf63f56791ed47fc585206ef3049489612f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:48 +0000 Subject: clockevents: Move the tick_notify() switch case to clockevents_notify() No need to call another function and have duplicated cases. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.235746557@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 28 ++++++++++++++++++++++++- kernel/time/tick-common.c | 50 ++++----------------------------------------- kernel/time/tick-internal.h | 5 ++++- 3 files changed, 35 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index dd70b4842c62..0e3a8448e115 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -404,10 +404,36 @@ void clockevents_notify(unsigned long reason, void *arg) int cpu; raw_spin_lock_irqsave(&clockevents_lock, flags); - tick_notify(reason, arg); switch (reason) { + case CLOCK_EVT_NOTIFY_BROADCAST_ON: + case CLOCK_EVT_NOTIFY_BROADCAST_OFF: + case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: + tick_broadcast_on_off(reason, arg); + break; + + case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: + case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: + tick_broadcast_oneshot_control(reason); + break; + + case CLOCK_EVT_NOTIFY_CPU_DYING: + tick_handover_do_timer(arg); + break; + + case CLOCK_EVT_NOTIFY_SUSPEND: + tick_suspend(); + tick_suspend_broadcast(); + break; + + case CLOCK_EVT_NOTIFY_RESUME: + tick_resume(); + break; + case CLOCK_EVT_NOTIFY_CPU_DEAD: + tick_shutdown_broadcast_oneshot(arg); + tick_shutdown_broadcast(arg); + tick_shutdown(arg); /* * Unregister the clock event devices which were * released from the users in the notify chain. diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 170a4bdfa99e..84c7cfca4d7d 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -284,7 +284,7 @@ out_bc: * * Called with interrupts disabled. */ -static void tick_handover_do_timer(int *cpup) +void tick_handover_do_timer(int *cpup) { if (*cpup == tick_do_timer_cpu) { int cpu = cpumask_first(cpu_online_mask); @@ -301,7 +301,7 @@ static void tick_handover_do_timer(int *cpup) * access the hardware device itself. * We just set the mode and remove it from the lists. */ -static void tick_shutdown(unsigned int *cpup) +void tick_shutdown(unsigned int *cpup) { struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); struct clock_event_device *dev = td->evtdev; @@ -319,14 +319,14 @@ static void tick_shutdown(unsigned int *cpup) } } -static void tick_suspend(void) +void tick_suspend(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); clockevents_shutdown(td->evtdev); } -static void tick_resume(void) +void tick_resume(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); int broadcast = tick_resume_broadcast(); @@ -341,48 +341,6 @@ static void tick_resume(void) } } -/* - * Called with clockevents_lock held and interrupts disabled - */ -void tick_notify(unsigned long reason, void *dev) -{ - switch (reason) { - - case CLOCK_EVT_NOTIFY_BROADCAST_ON: - case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - tick_broadcast_on_off(reason, dev); - break; - - case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: - case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - tick_broadcast_oneshot_control(reason); - break; - - case CLOCK_EVT_NOTIFY_CPU_DYING: - tick_handover_do_timer(dev); - break; - - case CLOCK_EVT_NOTIFY_CPU_DEAD: - tick_shutdown_broadcast_oneshot(dev); - tick_shutdown_broadcast(dev); - tick_shutdown(dev); - break; - - case CLOCK_EVT_NOTIFY_SUSPEND: - tick_suspend(); - tick_suspend_broadcast(); - break; - - case CLOCK_EVT_NOTIFY_RESUME: - tick_resume(); - break; - - default: - break; - } -} - /** * tick_init - initialize the tick control */ diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 60742fe6f63d..06bfc8802dfb 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -18,8 +18,11 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); -extern void tick_notify(unsigned long reason, void *dev); extern void tick_check_new_device(struct clock_event_device *dev); +extern void tick_handover_do_timer(int *cpup); +extern void tick_shutdown(unsigned int *cpup); +extern void tick_suspend(void); +extern void tick_resume(void); extern void clockevents_shutdown(struct clock_event_device *dev); -- cgit v1.2.3 From ccf33d6880f39a35158fff66db13000ae4943fac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:49 +0000 Subject: clockevents: Add module refcount We want to be able to remove clockevent modules as well. Add a refcount so we don't remove a module with an active clock event device. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.307435149@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 1 + kernel/time/tick-broadcast.c | 3 +++ kernel/time/tick-common.c | 4 ++++ 3 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 0e3a8448e115..89e394caa769 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -357,6 +357,7 @@ void clockevents_exchange_device(struct clock_event_device *old, * released list and do a notify add later. */ if (old) { + module_put(old->owner); clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); list_del(&old->list); list_add(&old->list, &clockevents_released); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 3500caaa0bfd..0e374cd2e0ef 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "tick-internal.h" @@ -73,6 +74,8 @@ void tick_install_broadcast_device(struct clock_event_device *dev) tick_broadcast_device.evtdev->rating >= dev->rating) || (dev->features & CLOCK_EVT_FEAT_C3STOP)) return; + if (!try_module_get(dev->owner)) + return; clockevents_exchange_device(tick_broadcast_device.evtdev, dev); if (cur) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 84c7cfca4d7d..433a1e11d13b 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -257,6 +258,9 @@ void tick_check_new_device(struct clock_event_device *newdev) goto out_bc; } + if (!try_module_get(newdev->owner)) + return; + /* * Replace the eventually existing device by the new * device. If the current device is the broadcast device, do -- cgit v1.2.3 From 501f867064e95f9a6f540e60705be0937280e7ec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:49 +0000 Subject: clockevents: Provide sysfs interface Provide a simple sysfs interface for the clockevent devices. Show the current active clockevent device. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.371634778@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 89e394caa769..0a23f4f29934 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "tick-internal.h" @@ -460,4 +461,89 @@ void clockevents_notify(unsigned long reason, void *arg) raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_notify); + +#ifdef CONFIG_SYSFS +struct bus_type clockevents_subsys = { + .name = "clockevents", + .dev_name = "clockevent", +}; + +static DEFINE_PER_CPU(struct device, tick_percpu_dev); +static struct tick_device *tick_get_tick_dev(struct device *dev); + +static ssize_t sysfs_show_current_tick_dev(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct tick_device *td; + ssize_t count = 0; + + raw_spin_lock_irq(&clockevents_lock); + td = tick_get_tick_dev(dev); + if (td && td->evtdev) + count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name); + raw_spin_unlock_irq(&clockevents_lock); + return count; +} +static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +static struct device tick_bc_dev = { + .init_name = "broadcast", + .id = 0, + .bus = &clockevents_subsys, +}; + +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return dev == &tick_bc_dev ? tick_get_broadcast_device() : + &per_cpu(tick_cpu_device, dev->id); +} + +static __init int tick_broadcast_init_sysfs(void) +{ + int err = device_register(&tick_bc_dev); + + if (!err) + err = device_create_file(&tick_bc_dev, &dev_attr_current_device); + return err; +} +#else +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return &per_cpu(tick_cpu_device, dev->id); +} +static inline int tick_broadcast_init_sysfs(void) { return 0; } #endif + +static int __init tick_init_sysfs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct device *dev = &per_cpu(tick_percpu_dev, cpu); + int err; + + dev->id = cpu; + dev->bus = &clockevents_subsys; + err = device_register(dev); + if (!err) + err = device_create_file(dev, &dev_attr_current_device); + if (err) + return err; + } + return tick_broadcast_init_sysfs(); +} + +static int __init clockevents_init_sysfs(void) +{ + int err = subsys_system_register(&clockevents_subsys, NULL); + + if (!err) + err = tick_init_sysfs(); + return err; +} +device_initcall(clockevents_init_sysfs); +#endif /* SYSFS */ + +#endif /* GENERIC_CLOCK_EVENTS */ -- cgit v1.2.3 From 45cb8e01b2ecef1c2afb18333e95793fa1a90281 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:50 +0000 Subject: clockevents: Split out selection logic Split out the clockevent device selection logic. Preparatory patch to allow unbinding active clockevent devices. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.431796247@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 25 ++++++++++++---- kernel/time/tick-common.c | 69 +++++++++++++++++++++++--------------------- 2 files changed, 56 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 0e374cd2e0ef..d067c01586f5 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -65,19 +65,34 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) /* * Check, if the device can be utilized as broadcast device: */ +static bool tick_check_broadcast_device(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || + (newdev->features & CLOCK_EVT_FEAT_C3STOP)) + return false; + + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT && + !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + + return !curdev || newdev->rating > curdev->rating; +} + +/* + * Conditionally install/replace broadcast device + */ void tick_install_broadcast_device(struct clock_event_device *dev) { struct clock_event_device *cur = tick_broadcast_device.evtdev; - if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || - (tick_broadcast_device.evtdev && - tick_broadcast_device.evtdev->rating >= dev->rating) || - (dev->features & CLOCK_EVT_FEAT_C3STOP)) + if (!tick_check_broadcast_device(cur, dev)) return; + if (!try_module_get(dev->owner)) return; - clockevents_exchange_device(tick_broadcast_device.evtdev, dev); + clockevents_exchange_device(cur, dev); if (cur) cur->event_handler = clockevents_handle_noop; tick_broadcast_device.evtdev = dev; diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 433a1e11d13b..c34021650348 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -205,6 +205,37 @@ static void tick_setup_device(struct tick_device *td, tick_setup_oneshot(newdev, handler, next_event); } +static bool tick_check_percpu(struct clock_event_device *curdev, + struct clock_event_device *newdev, int cpu) +{ + if (!cpumask_test_cpu(cpu, newdev->cpumask)) + return false; + if (cpumask_equal(newdev->cpumask, cpumask_of(cpu))) + return true; + /* Check if irq affinity can be set */ + if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq)) + return false; + /* Prefer an existing cpu local device */ + if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) + return false; + return true; +} + +static bool tick_check_preferred(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + /* Prefer oneshot capable device */ + if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) { + if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + if (tick_oneshot_mode_active()) + return false; + } + + /* Use the higher rated one */ + return !curdev || newdev->rating > curdev->rating; +} + /* * Check, if the new registered device should be used. Called with * clockevents_lock held and interrupts disabled. @@ -223,40 +254,12 @@ void tick_check_new_device(struct clock_event_device *newdev) curdev = td->evtdev; /* cpu local device ? */ - if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { - - /* - * If the cpu affinity of the device interrupt can not - * be set, ignore it. - */ - if (!irq_can_set_affinity(newdev->irq)) - goto out_bc; - - /* - * If we have a cpu local device already, do not replace it - * by a non cpu local device - */ - if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) - goto out_bc; - } + if (!tick_check_percpu(curdev, newdev, cpu)) + goto out_bc; - /* - * If we have an active device, then check the rating and the oneshot - * feature. - */ - if (curdev) { - /* - * Prefer one shot capable devices ! - */ - if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && - !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) - goto out_bc; - /* - * Check the rating - */ - if (curdev->rating >= newdev->rating) - goto out_bc; - } + /* Preference decision */ + if (!tick_check_preferred(curdev, newdev)) + goto out_bc; if (!try_module_get(newdev->owner)) return; -- cgit v1.2.3 From 03e13cf5ee60584fe0c831682c67212effb7fca4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:50 +0000 Subject: clockevents: Implement unbind functionality Provide a sysfs interface to allow unbinding of clockevent devices. The device is unbound if it is unused or if there is a replacement device available. Unbinding of broadcast devices is not supported as we don't want to foster that nonsense. If no replacement device is available the unbind returns -EBUSY. Unbind is available from the kernel and through sysfs, which is necessary to drop the module refcount. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.499216659@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 125 ++++++++++++++++++++++++++++++++++++++++++++ kernel/time/clocksource.c | 9 ++-- kernel/time/tick-common.c | 24 +++++++++ kernel/time/tick-internal.h | 7 +++ 4 files changed, 161 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 0a23f4f29934..38959c866789 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -25,6 +25,13 @@ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); /* Protection for the above */ static DEFINE_RAW_SPINLOCK(clockevents_lock); +/* Protection for unbind operations */ +static DEFINE_MUTEX(clockevents_mutex); + +struct ce_unbind { + struct clock_event_device *ce; + int res; +}; /** * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds @@ -245,6 +252,90 @@ static void clockevents_notify_released(void) } } +/* + * Try to install a replacement clock event device + */ +static int clockevents_replace(struct clock_event_device *ced) +{ + struct clock_event_device *dev, *newdev = NULL; + + list_for_each_entry(dev, &clockevent_devices, list) { + if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) + continue; + + if (!tick_check_replacement(newdev, dev)) + continue; + + if (!try_module_get(dev->owner)) + continue; + + if (newdev) + module_put(newdev->owner); + newdev = dev; + } + if (newdev) { + tick_install_replacement(newdev); + list_del_init(&ced->list); + } + return newdev ? 0 : -EBUSY; +} + +/* + * Called with clockevents_mutex and clockevents_lock held + */ +static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) +{ + /* Fast track. Device is unused */ + if (ced->mode == CLOCK_EVT_MODE_UNUSED) { + list_del_init(&ced->list); + return 0; + } + + return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY; +} + +/* + * SMP function call to unbind a device + */ +static void __clockevents_unbind(void *arg) +{ + struct ce_unbind *cu = arg; + int res; + + raw_spin_lock(&clockevents_lock); + res = __clockevents_try_unbind(cu->ce, smp_processor_id()); + if (res == -EAGAIN) + res = clockevents_replace(cu->ce); + cu->res = res; + raw_spin_unlock(&clockevents_lock); +} + +/* + * Issues smp function call to unbind a per cpu device. Called with + * clockevents_mutex held. + */ +static int clockevents_unbind(struct clock_event_device *ced, int cpu) +{ + struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; + + smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); + return cu.res; +} + +/* + * Unbind a clockevents device. + */ +int clockevents_unbind_device(struct clock_event_device *ced, int cpu) +{ + int ret; + + mutex_lock(&clockevents_mutex); + ret = clockevents_unbind(ced, cpu); + mutex_unlock(&clockevents_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(clockevents_unbind); + /** * clockevents_register_device - register a clock event device * @dev: device to register @@ -487,6 +578,38 @@ static ssize_t sysfs_show_current_tick_dev(struct device *dev, } static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL); +/* We don't support the abomination of removable broadcast devices */ +static ssize_t sysfs_unbind_tick_dev(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + char name[CS_NAME_LEN]; + size_t ret = sysfs_get_uname(buf, name, count); + struct clock_event_device *ce; + + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clockevents_mutex); + raw_spin_lock_irq(&clockevents_lock); + list_for_each_entry(ce, &clockevent_devices, list) { + if (!strcmp(ce->name, name)) { + ret = __clockevents_try_unbind(ce, dev->id); + break; + } + } + raw_spin_unlock_irq(&clockevents_lock); + /* + * We hold clockevents_mutex, so ce can't go away + */ + if (ret == -EAGAIN) + ret = clockevents_unbind(ce, dev->id); + mutex_unlock(&clockevents_mutex); + return ret ? ret : count; +} +static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev); + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static struct device tick_bc_dev = { .init_name = "broadcast", @@ -529,6 +652,8 @@ static int __init tick_init_sysfs(void) err = device_register(dev); if (!err) err = device_create_file(dev, &dev_attr_current_device); + if (!err) + err = device_create_file(dev, &dev_attr_unbind_device); if (err) return err; } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 31b90332f47b..6d05b00410cc 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -31,6 +31,8 @@ #include #include +#include "tick-internal.h" + void timecounter_init(struct timecounter *tc, const struct cyclecounter *cc, u64 start_tstamp) @@ -174,7 +176,6 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) static struct clocksource *curr_clocksource; static LIST_HEAD(clocksource_list); static DEFINE_MUTEX(clocksource_mutex); -#define CS_NAME_LEN 32 static char override_name[CS_NAME_LEN]; static int finished_booting; @@ -864,7 +865,7 @@ sysfs_show_current_clocksources(struct device *dev, return count; } -static size_t clocksource_get_uname(const char *buf, char *dst, size_t cnt) +size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) { size_t ret = cnt; @@ -899,7 +900,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev, mutex_lock(&clocksource_mutex); - ret = clocksource_get_uname(buf, override_name, count); + ret = sysfs_get_uname(buf, override_name, count); if (ret >= 0) clocksource_select(); @@ -925,7 +926,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev, char name[CS_NAME_LEN]; size_t ret; - ret = clocksource_get_uname(buf, name, count); + ret = sysfs_get_uname(buf, name, count); if (ret < 0) return ret; diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index c34021650348..5edfb4806032 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -205,6 +205,17 @@ static void tick_setup_device(struct tick_device *td, tick_setup_oneshot(newdev, handler, next_event); } +void tick_install_replacement(struct clock_event_device *newdev) +{ + struct tick_device *td = &__get_cpu_var(tick_cpu_device); + int cpu = smp_processor_id(); + + clockevents_exchange_device(td->evtdev, newdev); + tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); + if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_oneshot_notify(); +} + static bool tick_check_percpu(struct clock_event_device *curdev, struct clock_event_device *newdev, int cpu) { @@ -236,6 +247,19 @@ static bool tick_check_preferred(struct clock_event_device *curdev, return !curdev || newdev->rating > curdev->rating; } +/* + * Check whether the new device is a better fit than curdev. curdev + * can be NULL ! + */ +bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if (tick_check_percpu(curdev, newdev, smp_processor_id())) + return false; + + return tick_check_preferred(curdev, newdev); +} + /* * Check, if the new registered device should be used. Called with * clockevents_lock held and interrupts disabled. diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 06bfc8802dfb..be1690eaecff 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -11,6 +11,8 @@ extern seqlock_t jiffies_lock; #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 +#define CS_NAME_LEN 32 + DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern ktime_t tick_next_period; extern ktime_t tick_period; @@ -23,9 +25,14 @@ extern void tick_handover_do_timer(int *cpup); extern void tick_shutdown(unsigned int *cpup); extern void tick_suspend(void); extern void tick_resume(void); +extern bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev); +extern void tick_install_replacement(struct clock_event_device *dev); extern void clockevents_shutdown(struct clock_event_device *dev); +extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); + /* * NO_HZ / high resolution timer shared code */ -- cgit v1.2.3 From c7e99fc75de8882bc4104455ace366d9d3599a96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 May 2013 09:28:02 +0200 Subject: clockevents: Define CS_NAME_LEN unconditionally Unbreak architectures which do not use clockevents, but require to build some of the core timekeeping infrastructure Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/time/tick-internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index be1690eaecff..bc906cad709b 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -6,13 +6,13 @@ extern seqlock_t jiffies_lock; +#define CS_NAME_LEN 32 + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 -#define CS_NAME_LEN 32 - DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern ktime_t tick_next_period; extern ktime_t tick_period; -- cgit v1.2.3 From 1eaff67266b6b6c97bbd33cf2c20577822836413 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 May 2013 09:48:46 +0200 Subject: clocksource: Implement clocksource_select_fallback() for CONFIG_ARCH_USES_GETTIMEOFFSET=y commit 7eaeb34305 (clocksource: Provide unbind interface in sysfs) implemented clocksource_select_fallback() which is not defined for CONFIG_ARCH_USES_GETTIMEOFFSET=y. Add an empty inline function for that. Reported-by: Ingo Molnar Reported-by: fengguang.wu@intel.com Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6d05b00410cc..e713ef7d19a7 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -632,6 +632,7 @@ static void clocksource_select_fallback(void) #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } +static inline void clocksource_select_fallback(void) { } #endif -- cgit v1.2.3 From 6cffe00f7d4e24679eae6b7aae4caaf915288256 Mon Sep 17 00:00:00 2001 From: Todd Poynor Date: Wed, 15 May 2013 14:38:11 -0700 Subject: alarmtimer: Add functions for timerfd support Add functions needed for hooking up alarmtimer to timerfd: * alarm_restart: Similar to hrtimer_restart, restart an alarmtimer after the expires time has already been updated (as with alarm_forward). * alarm_forward_now: Similar to hrtimer_forward_now, move the expires time forward to an interval from the current time of the associated clock. * alarm_start_relative: Start an alarmtimer with an expires time relative to the current time of the associated clock. * alarm_expires_remaining: Similar to hrtimer_expires_remaining, return the amount of time remaining until alarm expiry. Signed-off-by: Todd Poynor Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index f11d83b12949..3e5cba274475 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -199,6 +199,12 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) } +ktime_t alarm_expires_remaining(const struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + return ktime_sub(alarm->node.expires, base->gettime()); +} + #ifdef CONFIG_RTC_CLASS /** * alarmtimer_suspend - Suspend time callback @@ -305,7 +311,7 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, } /** - * alarm_start - Sets an alarm to fire + * alarm_start - Sets an absolute alarm to fire * @alarm: ptr to alarm to set * @start: time to run the alarm */ @@ -324,6 +330,31 @@ int alarm_start(struct alarm *alarm, ktime_t start) return ret; } +/** + * alarm_start_relative - Sets a relative alarm to fire + * @alarm: ptr to alarm to set + * @start: time relative to now to run the alarm + */ +int alarm_start_relative(struct alarm *alarm, ktime_t start) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + + start = ktime_add(start, base->gettime()); + return alarm_start(alarm, start); +} + +void alarm_restart(struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + + spin_lock_irqsave(&base->lock, flags); + hrtimer_set_expires(&alarm->timer, alarm->node.expires); + hrtimer_restart(&alarm->timer); + alarmtimer_enqueue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); +} + /** * alarm_try_to_cancel - Tries to cancel an alarm timer * @alarm: ptr to alarm to be canceled @@ -394,6 +425,12 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) return overrun; } +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + + return alarm_forward(alarm, base->gettime(), interval); +} -- cgit v1.2.3 From 5c83545f24ab3dd67e0ae0e2b795fea750f08c34 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Tue, 21 May 2013 22:32:14 -0700 Subject: power: Add option to log time spent in suspend Below is a patch from android kernel that maintains a histogram of suspend times. Please review and provide feedback. Statistices on the time spent in suspend are kept in /sys/kernel/debug/sleep_time. Cc: Android Kernel Team Cc: Colin Cross Cc: Todd Poynor Cc: San Mehat Cc: Benoit Goby Cc: John Stultz Cc: Thomas Gleixner Signed-off-by: Colin Cross Signed-off-by: Todd Poynor [zoran.markovic@linaro.org: Re-formatted suspend time table to better fit expected values. Moved accounting of suspend time into timekeeping core. Removed CONFIG_SUSPEND_TIME flag and made the feature conditional on CONFIG_DEBUG_FS. Changed the file name to sleep_time to better fit terminology in timekeeping core. Changed seq_printf to seq_puts. Tweaked commit message] Signed-off-by: Zoran Markovic Signed-off-by: John Stultz --- kernel/time/Makefile | 1 + kernel/time/timekeeping.c | 2 ++ kernel/time/timekeeping_debug.c | 72 ++++++++++++++++++++++++++++++++++++++ kernel/time/timekeeping_internal.h | 14 ++++++++ 4 files changed, 89 insertions(+) create mode 100644 kernel/time/timekeeping_debug.c create mode 100644 kernel/time/timekeeping_internal.h (limited to 'kernel') diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ff7d9d2ab504..d52ac8bf0006 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o +obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 933efa4071c3..838fc0777b68 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -25,6 +25,7 @@ #include "tick-internal.h" #include "ntp_internal.h" +#include "timekeeping_internal.h" static struct timekeeper timekeeper; static DEFINE_RAW_SPINLOCK(timekeeper_lock); @@ -851,6 +852,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, tk_xtime_add(tk, delta); tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); + tk_debug_account_sleep_time(delta); } /** diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c new file mode 100644 index 000000000000..802433a4f5eb --- /dev/null +++ b/kernel/time/timekeeping_debug.c @@ -0,0 +1,72 @@ +/* + * debugfs file to track time spent in suspend + * + * Copyright (c) 2011, Google, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include + +static unsigned int sleep_time_bin[32] = {0}; + +static int tk_debug_show_sleep_time(struct seq_file *s, void *data) +{ + unsigned int bin; + seq_puts(s, " time (secs) count\n"); + seq_puts(s, "------------------------------\n"); + for (bin = 0; bin < 32; bin++) { + if (sleep_time_bin[bin] == 0) + continue; + seq_printf(s, "%10u - %-10u %4u\n", + bin ? 1 << (bin - 1) : 0, 1 << bin, + sleep_time_bin[bin]); + } + return 0; +} + +static int tk_debug_sleep_time_open(struct inode *inode, struct file *file) +{ + return single_open(file, tk_debug_show_sleep_time, NULL); +} + +static const struct file_operations tk_debug_sleep_time_fops = { + .open = tk_debug_sleep_time_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init tk_debug_sleep_time_init(void) +{ + struct dentry *d; + + d = debugfs_create_file("sleep_time", 0444, NULL, NULL, + &tk_debug_sleep_time_fops); + if (!d) { + pr_err("Failed to create sleep_time debug file\n"); + return -ENOMEM; + } + + return 0; +} +late_initcall(tk_debug_sleep_time_init); + +void tk_debug_account_sleep_time(struct timespec *t) +{ + sleep_time_bin[fls(t->tv_sec)]++; +} + diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h new file mode 100644 index 000000000000..13323ea08ffa --- /dev/null +++ b/kernel/time/timekeeping_internal.h @@ -0,0 +1,14 @@ +#ifndef _TIMEKEEPING_INTERNAL_H +#define _TIMEKEEPING_INTERNAL_H +/* + * timekeeping debug functions + */ +#include + +#ifdef CONFIG_DEBUG_FS +extern void tk_debug_account_sleep_time(struct timespec *t); +#else +#define tk_debug_account_sleep_time(x) +#endif + +#endif /* _TIMEKEEPING_INTERNAL_H */ -- cgit v1.2.3 From 11682a41618f8094cb7a9330b4b6a12ffaef5774 Mon Sep 17 00:00:00 2001 From: Marcus Gelderie Date: Tue, 4 Jun 2013 09:32:09 +0200 Subject: alarmtimer: Export symbols of functions declared in linux/alarmtimer.h Export symbols so they can be used by drivers/staging/android/alarm-dev.c if it is built as a module. So far alarm-dev is built-in but module support is planned (see drivers/staging/android/TODO). Signed-off-by: Marcus Gelderie [jstultz: tweaked commit message, also export newly added functions] Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 3e5cba274475..eec50fcef9e4 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -204,6 +204,7 @@ ktime_t alarm_expires_remaining(const struct alarm *alarm) struct alarm_base *base = &alarm_bases[alarm->type]; return ktime_sub(alarm->node.expires, base->gettime()); } +EXPORT_SYMBOL_GPL(alarm_expires_remaining); #ifdef CONFIG_RTC_CLASS /** @@ -309,6 +310,7 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, alarm->type = type; alarm->state = ALARMTIMER_STATE_INACTIVE; } +EXPORT_SYMBOL_GPL(alarm_init); /** * alarm_start - Sets an absolute alarm to fire @@ -329,6 +331,7 @@ int alarm_start(struct alarm *alarm, ktime_t start) spin_unlock_irqrestore(&base->lock, flags); return ret; } +EXPORT_SYMBOL_GPL(alarm_start); /** * alarm_start_relative - Sets a relative alarm to fire @@ -342,6 +345,7 @@ int alarm_start_relative(struct alarm *alarm, ktime_t start) start = ktime_add(start, base->gettime()); return alarm_start(alarm, start); } +EXPORT_SYMBOL_GPL(alarm_start_relative); void alarm_restart(struct alarm *alarm) { @@ -354,6 +358,7 @@ void alarm_restart(struct alarm *alarm) alarmtimer_enqueue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); } +EXPORT_SYMBOL_GPL(alarm_restart); /** * alarm_try_to_cancel - Tries to cancel an alarm timer @@ -375,6 +380,7 @@ int alarm_try_to_cancel(struct alarm *alarm) spin_unlock_irqrestore(&base->lock, flags); return ret; } +EXPORT_SYMBOL_GPL(alarm_try_to_cancel); /** @@ -392,6 +398,7 @@ int alarm_cancel(struct alarm *alarm) cpu_relax(); } } +EXPORT_SYMBOL_GPL(alarm_cancel); u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) @@ -424,6 +431,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) alarm->node.expires = ktime_add(alarm->node.expires, interval); return overrun; } +EXPORT_SYMBOL_GPL(alarm_forward); u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) { @@ -431,7 +439,7 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) return alarm_forward(alarm, base->gettime(), interval); } - +EXPORT_SYMBOL_GPL(alarm_forward_now); /** -- cgit v1.2.3 From 38ff87f77af0b5a93fc8581cff1d6e5692ab8970 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Sat, 1 Jun 2013 23:39:40 -0700 Subject: sched_clock: Make ARM's sched_clock generic for all architectures Nothing about the sched_clock implementation in the ARM port is specific to the architecture. Generalize the code so that other architectures can use it by selecting GENERIC_SCHED_CLOCK. Signed-off-by: Stephen Boyd [jstultz: Merge minor collisions with other patches in my tree] Signed-off-by: John Stultz --- kernel/time/Makefile | 1 + kernel/time/sched_clock.c | 215 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 216 insertions(+) create mode 100644 kernel/time/sched_clock.c (limited to 'kernel') diff --git a/kernel/time/Makefile b/kernel/time/Makefile index d52ac8bf0006..9250130646f5 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -4,6 +4,7 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o +obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c new file mode 100644 index 000000000000..aad1ae6077ef --- /dev/null +++ b/kernel/time/sched_clock.c @@ -0,0 +1,215 @@ +/* + * sched_clock.c: support for extending counters to full 64-bit ns counter + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct clock_data { + u64 epoch_ns; + u32 epoch_cyc; + u32 epoch_cyc_copy; + unsigned long rate; + u32 mult; + u32 shift; + bool suspended; +}; + +static void sched_clock_poll(unsigned long wrap_ticks); +static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0); +static int irqtime = -1; + +core_param(irqtime, irqtime, int, 0400); + +static struct clock_data cd = { + .mult = NSEC_PER_SEC / HZ, +}; + +static u32 __read_mostly sched_clock_mask = 0xffffffff; + +static u32 notrace jiffy_sched_clock_read(void) +{ + return (u32)(jiffies - INITIAL_JIFFIES); +} + +static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; + +static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) +{ + return (cyc * mult) >> shift; +} + +static unsigned long long notrace cyc_to_sched_clock(u32 cyc, u32 mask) +{ + u64 epoch_ns; + u32 epoch_cyc; + + /* + * Load the epoch_cyc and epoch_ns atomically. We do this by + * ensuring that we always write epoch_cyc, epoch_ns and + * epoch_cyc_copy in strict order, and read them in strict order. + * If epoch_cyc and epoch_cyc_copy are not equal, then we're in + * the middle of an update, and we should repeat the load. + */ + do { + epoch_cyc = cd.epoch_cyc; + smp_rmb(); + epoch_ns = cd.epoch_ns; + smp_rmb(); + } while (epoch_cyc != cd.epoch_cyc_copy); + + return epoch_ns + cyc_to_ns((cyc - epoch_cyc) & mask, cd.mult, cd.shift); +} + +/* + * Atomically update the sched_clock epoch. + */ +static void notrace update_sched_clock(void) +{ + unsigned long flags; + u32 cyc; + u64 ns; + + cyc = read_sched_clock(); + ns = cd.epoch_ns + + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, + cd.mult, cd.shift); + /* + * Write epoch_cyc and epoch_ns in a way that the update is + * detectable in cyc_to_fixed_sched_clock(). + */ + raw_local_irq_save(flags); + cd.epoch_cyc_copy = cyc; + smp_wmb(); + cd.epoch_ns = ns; + smp_wmb(); + cd.epoch_cyc = cyc; + raw_local_irq_restore(flags); +} + +static void sched_clock_poll(unsigned long wrap_ticks) +{ + mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks)); + update_sched_clock(); +} + +void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) +{ + unsigned long r, w; + u64 res, wrap; + char r_unit; + + if (cd.rate > rate) + return; + + BUG_ON(bits > 32); + WARN_ON(!irqs_disabled()); + read_sched_clock = read; + sched_clock_mask = (1 << bits) - 1; + cd.rate = rate; + + /* calculate the mult/shift to convert counter ticks to ns. */ + clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); + + r = rate; + if (r >= 4000000) { + r /= 1000000; + r_unit = 'M'; + } else if (r >= 1000) { + r /= 1000; + r_unit = 'k'; + } else + r_unit = ' '; + + /* calculate how many ns until we wrap */ + wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); + do_div(wrap, NSEC_PER_MSEC); + w = wrap; + + /* calculate the ns resolution of this counter */ + res = cyc_to_ns(1ULL, cd.mult, cd.shift); + pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", + bits, r, r_unit, res, w); + + /* + * Start the timer to keep sched_clock() properly updated and + * sets the initial epoch. + */ + sched_clock_timer.data = msecs_to_jiffies(w - (w / 10)); + update_sched_clock(); + + /* + * Ensure that sched_clock() starts off at 0ns + */ + cd.epoch_ns = 0; + + /* Enable IRQ time accounting if we have a fast enough sched_clock */ + if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) + enable_sched_clock_irqtime(); + + pr_debug("Registered %pF as sched_clock source\n", read); +} + +static unsigned long long notrace sched_clock_32(void) +{ + u32 cyc = read_sched_clock(); + return cyc_to_sched_clock(cyc, sched_clock_mask); +} + +unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; + +unsigned long long notrace sched_clock(void) +{ + if (cd.suspended) + return cd.epoch_ns; + + return sched_clock_func(); +} + +void __init sched_clock_postinit(void) +{ + /* + * If no sched_clock function has been provided at that point, + * make it the final one one. + */ + if (read_sched_clock == jiffy_sched_clock_read) + setup_sched_clock(jiffy_sched_clock_read, 32, HZ); + + sched_clock_poll(sched_clock_timer.data); +} + +static int sched_clock_suspend(void) +{ + sched_clock_poll(sched_clock_timer.data); + cd.suspended = true; + return 0; +} + +static void sched_clock_resume(void) +{ + cd.epoch_cyc = read_sched_clock(); + cd.epoch_cyc_copy = cd.epoch_cyc; + cd.suspended = false; +} + +static struct syscore_ops sched_clock_ops = { + .suspend = sched_clock_suspend, + .resume = sched_clock_resume, +}; + +static int __init sched_clock_syscore_init(void) +{ + register_syscore_ops(&sched_clock_ops); + return 0; +} +device_initcall(sched_clock_syscore_init); -- cgit v1.2.3 From 336ae1180df5f69b9e0fb6561bec01c5f64361cf Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 17 Jun 2013 15:40:58 -0700 Subject: ARM: sched_clock: Load cycle count after epoch stabilizes There is a small race between when the cycle count is read from the hardware and when the epoch stabilizes. Consider this scenario: CPU0 CPU1 ---- ---- cyc = read_sched_clock() cyc_to_sched_clock() update_sched_clock() ... cd.epoch_cyc = cyc; epoch_cyc = cd.epoch_cyc; ... epoch_ns + cyc_to_ns((cyc - epoch_cyc) The cyc on cpu0 was read before the epoch changed. But we calculate the nanoseconds based on the new epoch by subtracting the new epoch from the old cycle count. Since epoch is most likely larger than the old cycle count we calculate a large number that will be converted to nanoseconds and added to epoch_ns, causing time to jump forward too much. Fix this problem by reading the hardware after the epoch has stabilized. Cc: Russell King Signed-off-by: Stephen Boyd Signed-off-by: John Stultz --- kernel/time/sched_clock.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index aad1ae6077ef..a326f27d7f09 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -49,10 +49,14 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) return (cyc * mult) >> shift; } -static unsigned long long notrace cyc_to_sched_clock(u32 cyc, u32 mask) +static unsigned long long notrace sched_clock_32(void) { u64 epoch_ns; u32 epoch_cyc; + u32 cyc; + + if (cd.suspended) + return cd.epoch_ns; /* * Load the epoch_cyc and epoch_ns atomically. We do this by @@ -68,7 +72,9 @@ static unsigned long long notrace cyc_to_sched_clock(u32 cyc, u32 mask) smp_rmb(); } while (epoch_cyc != cd.epoch_cyc_copy); - return epoch_ns + cyc_to_ns((cyc - epoch_cyc) & mask, cd.mult, cd.shift); + cyc = read_sched_clock(); + cyc = (cyc - epoch_cyc) & sched_clock_mask; + return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); } /* @@ -160,19 +166,10 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pF as sched_clock source\n", read); } -static unsigned long long notrace sched_clock_32(void) -{ - u32 cyc = read_sched_clock(); - return cyc_to_sched_clock(cyc, sched_clock_mask); -} - unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; unsigned long long notrace sched_clock(void) { - if (cd.suspended) - return cd.epoch_ns; - return sched_clock_func(); } -- cgit v1.2.3 From 70e5975d3a04be5479a28eec4a2fb10f98ad2785 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 13 Jun 2013 11:39:50 -0700 Subject: clockevents: Prefer CPU local devices over global devices On an SMP system with only one global clockevent and a dummy clockevent per CPU we run into problems. We want the dummy clockevents to be registered as the per CPU tick devices, but we can only achieve that if we register the dummy clockevents before the global clockevent or if we artificially inflate the rating of the dummy clockevents to be higher than the rating of the global clockevent. Failure to do so leads to boot hangs when the dummy timers are registered on all other CPUs besides the CPU that accepted the global clockevent as its tick device and there is no broadcast timer to poke the dummy devices. If we're registering multiple clockevents and one clockevent is global and the other is local to a particular CPU we should choose to use the local clockevent regardless of the rating of the device. This way, if the clockevent is a dummy it will take the tick device duty as long as there isn't a higher rated tick device and any global clockevent will be bumped out into broadcast mode, fixing the problem described above. Reported-and-tested-by: Mark Rutland Signed-off-by: Stephen Boyd Tested-by: soren.brinkmann@xilinx.com Cc: John Stultz Cc: Daniel Lezcano Cc: linux-arm-kernel@lists.infradead.org Cc: John Stultz Link: http://lkml.kernel.org/r/20130613183950.GA32061@codeaurora.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 5edfb4806032..edd45f64162f 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -243,8 +243,13 @@ static bool tick_check_preferred(struct clock_event_device *curdev, return false; } - /* Use the higher rated one */ - return !curdev || newdev->rating > curdev->rating; + /* + * Use the higher rated one, but prefer a CPU local device with a lower + * rating than a non-CPU local device + */ + return !curdev || + newdev->rating > curdev->rating || + !cpumask_equal(curdev->cpumask, newdev->cpumask); } /* -- cgit v1.2.3