summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/sched.h6
-rw-r--r--kernel/sched.c43
-rw-r--r--kernel/timer.c10
3 files changed, 58 insertions, 1 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fed07d03364e..6a1e7afb099b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1541,6 +1541,12 @@ static inline void idle_task_exit(void) {}
extern void sched_idle_next(void);
+#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
+extern void wake_up_idle_cpu(int cpu);
+#else
+static inline void wake_up_idle_cpu(int cpu) { }
+#endif
+
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
diff --git a/kernel/sched.c b/kernel/sched.c
index 28c73f07efb2..8dcdec6fe0fe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1052,6 +1052,49 @@ static void resched_cpu(int cpu)
resched_task(cpu_curr(cpu));
spin_unlock_irqrestore(&rq->lock, flags);
}
+
+#ifdef CONFIG_NO_HZ
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU. In case of a completely
+ * idle system the next event might even be infinite time into the
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
+ * leaves the inner idle loop so the newly added timer is taken into
+ * account when the CPU goes back to idle and evaluates the timer
+ * wheel for the next timer event.
+ */
+void wake_up_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (cpu == smp_processor_id())
+ return;
+
+ /*
+ * This is safe, as this function is called with the timer
+ * wheel base lock of (cpu) held. When the CPU is on the way
+ * to idle and has not yet set rq->curr to idle then it will
+ * be serialized on the timer wheel base lock and take the new
+ * timer into account automatically.
+ */
+ if (rq->curr != rq->idle)
+ return;
+
+ /*
+ * We can set TIF_RESCHED on the idle task of the other CPU
+ * lockless. The worst case is that the other CPU runs the
+ * idle task through an additional NOOP schedule()
+ */
+ set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+
+ /* NEED_RESCHED must be visible before we test polling */
+ smp_mb();
+ if (!tsk_is_polling(rq->idle))
+ smp_send_reschedule(cpu);
+}
+#endif
+
#else
static void __resched_task(struct task_struct *p, int tif_bit)
{
diff --git a/kernel/timer.c b/kernel/timer.c
index 99b00a25f88b..b024106daa70 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -451,10 +451,18 @@ void add_timer_on(struct timer_list *timer, int cpu)
spin_lock_irqsave(&base->lock, flags);
timer_set_base(timer, base);
internal_add_timer(base, timer);
+ /*
+ * Check whether the other CPU is idle and needs to be
+ * triggered to reevaluate the timer wheel when nohz is
+ * active. We are protected against the other CPU fiddling
+ * with the timer by holding the timer base lock. This also
+ * makes sure that a CPU on the way to idle can not evaluate
+ * the timer wheel.
+ */
+ wake_up_idle_cpu(cpu);
spin_unlock_irqrestore(&base->lock, flags);
}
-
/**
* mod_timer - modify a timer's timeout
* @timer: the timer to be modified