Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 3120
1 file changed, 1436 insertions, 1684 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 6d24b2e8d82d..fd18f395a1bf 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
-#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
@@ -75,9 +74,11 @@
#include <asm/tlb.h>
#include <asm/irq_regs.h>
+#include <asm/mutex.h>
#include "sched_cpupri.h"
#include "workqueue_sched.h"
+#include "sched_autogroup.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -230,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
#endif
/*
- * sched_domains_mutex serializes calls to arch_init_sched_domains,
+ * sched_domains_mutex serializes calls to init_sched_domains,
* detach_destroy_domains and partition_sched_domains.
*/
static DEFINE_MUTEX(sched_domains_mutex);
@@ -253,6 +254,8 @@ struct task_group {
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
unsigned long shares;
+
+ atomic_t load_weight;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -268,25 +271,18 @@ struct task_group {
struct task_group *parent;
struct list_head siblings;
struct list_head children;
-};
-#define root_task_group init_task_group
+#ifdef CONFIG_SCHED_AUTOGROUP
+ struct autogroup *autogroup;
+#endif
+};
-/* task_group_lock serializes add/remove of task groups and also changes to
- * a task group's cpu shares.
- */
+/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
-static int root_task_group_empty(void)
-{
- return list_empty(&root_task_group.children);
-}
-#endif
-
-# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
+# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
/*
* A weight of 0 or 1 can cause arithmetics problems.
@@ -297,15 +293,15 @@ static int root_task_group_empty(void)
* limitation from this.)
*/
#define MIN_SHARES 2
-#define MAX_SHARES (1UL << 18)
+#define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION))
-static int init_task_group_load = INIT_TASK_GROUP_LOAD;
+static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
#endif
/* Default task group.
* Every task in system belong to this group at bootup.
*/
-struct task_group init_task_group;
+struct task_group root_task_group;
#endif /* CONFIG_CGROUP_SCHED */
@@ -316,6 +312,9 @@ struct cfs_rq {
u64 exec_clock;
u64 min_vruntime;
+#ifndef CONFIG_64BIT
+ u64 min_vruntime_copy;
+#endif
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
@@ -327,9 +326,11 @@ struct cfs_rq {
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
- struct sched_entity *curr, *next, *last;
+ struct sched_entity *curr, *next, *last, *skip;
+#ifdef CONFIG_SCHED_DEBUG
unsigned int nr_spread_over;
+#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -342,6 +343,7 @@ struct cfs_rq {
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
* list is used during load balance.
*/
+ int on_list;
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
@@ -360,14 +362,17 @@ struct cfs_rq {
unsigned long h_load;
/*
- * this cpu's part of tg->shares
+ * Maintaining per-cpu shares distribution for group scheduling
+ *
+ * load_stamp is the last time we updated the load average
+ * load_last is the last time we updated the load average and saw load
+ * load_unacc_exec_time is currently unaccounted execution time
*/
- unsigned long shares;
+ u64 load_avg;
+ u64 load_period;
+ u64 load_stamp, load_last, load_unacc_exec_time;
- /*
- * load.weight at the time we set shares
- */
- unsigned long rq_weight;
+ unsigned long load_contribution;
#endif
#endif
};
@@ -417,6 +422,7 @@ struct rt_rq {
*/
struct root_domain {
atomic_t refcount;
+ struct rcu_head rcu;
cpumask_var_t span;
cpumask_var_t online;
@@ -460,7 +466,7 @@ struct rq {
u64 nohz_stamp;
unsigned char nohz_balance_kick;
#endif
- unsigned int skip_clock_update;
+ int skip_clock_update;
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
@@ -552,9 +558,10 @@ struct rq {
/* try_to_wake_up() stats */
unsigned int ttwu_count;
unsigned int ttwu_local;
+#endif
- /* BKL stats */
- unsigned int bkl_count;
+#ifdef CONFIG_SMP
+ struct task_struct *wake_list;
#endif
};
@@ -574,7 +581,7 @@ static inline int cpu_of(struct rq *rq)
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
- rcu_read_lock_sched_held() || \
+ rcu_read_lock_held() || \
lockdep_is_held(&sched_domains_mutex))
/*
@@ -599,17 +606,20 @@ static inline int cpu_of(struct rq *rq)
* Return the group to which this tasks belongs.
*
* We use task_subsys_state_check() and extend the RCU verification
- * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
* holds that lock for each task it moves into the cgroup. Therefore
* by holding that lock, we pin the task to the current cgroup.
*/
static inline struct task_group *task_group(struct task_struct *p)
{
+ struct task_group *tg;
struct cgroup_subsys_state *css;
css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&task_rq(p)->lock));
- return container_of(css, struct task_group, css);
+ lockdep_is_held(&p->pi_lock));
+ tg = container_of(css, struct task_group, css);
+
+ return autogroup_task_group(p, tg);
}
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -636,22 +646,18 @@ static inline struct task_group *task_group(struct task_struct *p)
#endif /* CONFIG_CGROUP_SCHED */
-static u64 irq_time_cpu(int cpu);
-static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+static void update_rq_clock_task(struct rq *rq, s64 delta);
-inline void update_rq_clock(struct rq *rq)
+static void update_rq_clock(struct rq *rq)
{
- if (!rq->skip_clock_update) {
- int cpu = cpu_of(rq);
- u64 irq_time;
+ s64 delta;
- rq->clock = sched_clock_cpu(cpu);
- irq_time = irq_time_cpu(cpu);
- if (rq->clock - irq_time > rq->clock_task)
- rq->clock_task = rq->clock - irq_time;
+ if (rq->skip_clock_update > 0)
+ return;
- sched_irq_time_avg_update(rq, irq_time);
- }
+ delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ rq->clock += delta;
+ update_rq_clock_task(rq, delta);
}
/*
@@ -664,10 +670,9 @@ inline void update_rq_clock(struct rq *rq)
#endif
/**
- * runqueue_is_locked
+ * runqueue_is_locked - Returns true if the current cpu runqueue is locked
* @cpu: the processor in question.
*
- * Returns true if the current cpu runqueue is locked.
* This interface allows printk to be called with the runqueue lock
* held and know whether or not it is OK to wake up the klogd.
*/
@@ -741,7 +746,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
buf[cnt] = 0;
cmp = strstrip(buf);
- if (strncmp(buf, "NO_", 3) == 0) {
+ if (strncmp(cmp, "NO_", 3) == 0) {
neg = 1;
cmp += 3;
}
@@ -797,20 +802,6 @@ late_initcall(sched_init_debug);
const_debug unsigned int sysctl_sched_nr_migrate = 32;
/*
- * ratelimit for updating the group shares.
- * default: 0.25ms
- */
-unsigned int sysctl_sched_shares_ratelimit = 250000;
-unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
-
-/*
- * Inject some fuzzyness into changing the per-cpu group shares
- * this avoids remote rq-locks at the expense of fairness.
- * default: 4
- */
-unsigned int sysctl_sched_shares_thresh = 4;
-
-/*
* period over which we average the RT time consumption, measured
* in ms.
*
@@ -857,18 +848,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
return rq->curr == p;
}
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
+#ifdef CONFIG_SMP
+ return p->on_cpu;
+#else
return task_current(rq, p);
+#endif
}
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
+#ifdef CONFIG_SMP
+ /*
+ * We can optimise this out completely for !SMP, because the
+ * SMP rebalancing from interrupt is the only thing that cares
+ * here.
+ */
+ next->on_cpu = 1;
+#endif
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
+#ifdef CONFIG_SMP
+ /*
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ */
+ smp_wmb();
+ prev->on_cpu = 0;
+#endif
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
rq->lock.owner = current;
@@ -884,15 +896,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
}
#else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline int task_running(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
- return p->oncpu;
-#else
- return task_current(rq, p);
-#endif
-}
-
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
@@ -901,7 +904,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
* SMP rebalancing from interrupt is the only thing that cares
* here.
*/
- next->oncpu = 1;
+ next->on_cpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
raw_spin_unlock_irq(&rq->lock);
@@ -914,12 +917,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
/*
- * After ->oncpu is cleared, the task can be moved to a different CPU.
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
* We must ensure this doesn't happen until the switch is completely
* finished.
*/
smp_wmb();
- prev->oncpu = 0;
+ prev->on_cpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
local_irq_enable();
@@ -928,23 +931,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
- * Check whether the task is waking, we use this to synchronize ->cpus_allowed
- * against ttwu().
- */
-static inline int task_is_waking(struct task_struct *p)
-{
- return unlikely(p->state == TASK_WAKING);
-}
-
-/*
- * __task_rq_lock - lock the runqueue a given task resides on.
- * Must be called interrupts disabled.
+ * __task_rq_lock - lock the rq @p resides on.
*/
static inline struct rq *__task_rq_lock(struct task_struct *p)
__acquires(rq->lock)
{
struct rq *rq;
+ lockdep_assert_held(&p->pi_lock);
+
for (;;) {
rq = task_rq(p);
raw_spin_lock(&rq->lock);
@@ -955,22 +950,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
}
/*
- * task_rq_lock - lock the runqueue a given task resides on and disable
- * interrupts. Note the ordering: we can safely lookup the task_rq without
- * explicitly disabling preemption.
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
*/
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+ __acquires(p->pi_lock)
__acquires(rq->lock)
{
struct rq *rq;
for (;;) {
- local_irq_save(*flags);
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
rq = task_rq(p);
raw_spin_lock(&rq->lock);
if (likely(rq == task_rq(p)))
return rq;
- raw_spin_unlock_irqrestore(&rq->lock, *flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
}
@@ -980,10 +975,13 @@ static void __task_rq_unlock(struct rq *rq)
raw_spin_unlock(&rq->lock);
}
-static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
__releases(rq->lock)
+ __releases(p->pi_lock)
{
- raw_spin_unlock_irqrestore(&rq->lock, *flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
/*
@@ -1212,11 +1210,17 @@ int get_nohz_timer_target(void)
int i;
struct sched_domain *sd;
+ rcu_read_lock();
for_each_domain(cpu, sd) {
- for_each_cpu(i, sched_domain_span(sd))
- if (!idle_cpu(i))
- return i;
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (!idle_cpu(i)) {
+ cpu = i;
+ goto unlock;
+ }
+ }
}
+unlock:
+ rcu_read_unlock();
return cpu;
}
/*
@@ -1326,15 +1330,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
{
u64 tmp;
+ /*
+ * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
+ * entities since MIN_SHARES = 2. Treat weight as 1 if less than
+ * 2^SCHED_LOAD_RESOLUTION.
+ */
+ if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
+ tmp = (u64)delta_exec * scale_load_down(weight);
+ else
+ tmp = (u64)delta_exec;
+
if (!lw->inv_weight) {
- if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
+ unsigned long w = scale_load_down(lw->weight);
+
+ if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
lw->inv_weight = 1;
+ else if (unlikely(!w))
+ lw->inv_weight = WMULT_CONST;
else
- lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
- / (lw->weight+1);
+ lw->inv_weight = WMULT_CONST / w;
}
- tmp = (u64)delta_exec * weight;
/*
* Check whether we'd overflow the 64-bit multiplication:
*/
@@ -1359,6 +1375,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
lw->inv_weight = 0;
}
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+ lw->weight = w;
+ lw->inv_weight = 0;
+}
+
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
* of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1547,101 +1569,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
#ifdef CONFIG_FAIR_GROUP_SCHED
-static __read_mostly unsigned long __percpu *update_shares_data;
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void update_group_shares_cpu(struct task_group *tg, int cpu,
- unsigned long sd_shares,
- unsigned long sd_rq_weight,
- unsigned long *usd_rq_weight)
-{
- unsigned long shares, rq_weight;
- int boost = 0;
-
- rq_weight = usd_rq_weight[cpu];
- if (!rq_weight) {
- boost = 1;
- rq_weight = NICE_0_LOAD;
- }
-
- /*
- * \Sum_j shares_j * rq_weight_i
- * shares_i = -----------------------------
- * \Sum_j rq_weight_j
- */
- shares = (sd_shares * rq_weight) / sd_rq_weight;
- shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-
- if (abs(shares - tg->se[cpu]->load.weight) >
- sysctl_sched_shares_thresh) {
- struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
- tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
- __set_se_shares(tg->se[cpu], shares);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
-}
-
-/*
- * Re-compute the task group their per cpu shares over the given domain.
- * This needs to be done in a bottom-up fashion because the rq weight of a
- * parent group depends on the shares of its child groups.
- */
-static int tg_shares_up(struct task_group *tg, void *data)
-{
- unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
- unsigned long *usd_rq_weight;
- struct sched_domain *sd = data;
- unsigned long flags;
- int i;
-
- if (!tg->se[0])
- return 0;
-
- local_irq_save(flags);
- usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
-
- for_each_cpu(i, sched_domain_span(sd)) {
- weight = tg->cfs_rq[i]->load.weight;
- usd_rq_weight[i] = weight;
-
- rq_weight += weight;
- /*
- * If there are currently no tasks on the cpu pretend there
- * is one of average load so that when a new task gets to
- * run here it will not get delayed by group starvation.
- */
- if (!weight)
- weight = NICE_0_LOAD;
-
- sum_weight += weight;
- shares += tg->cfs_rq[i]->shares;
- }
-
- if (!rq_weight)
- rq_weight = sum_weight;
-
- if ((!shares && rq_weight) || shares > tg->shares)
- shares = tg->shares;
-
- if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
- shares = tg->shares;
-
- for_each_cpu(i, sched_domain_span(sd))
- update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
-
- local_irq_restore(flags);
-
- return 0;
-}
-
/*
* Compute the cpu's hierarchical load factor for each task group.
* This needs to be done in a top-down fashion because the load of a child
@@ -1656,7 +1583,7 @@ static int tg_load_down(struct task_group *tg, void *data)
load = cpu_rq(cpu)->load.weight;
} else {
load = tg->parent->cfs_rq[cpu]->h_load;
- load *= tg->cfs_rq[cpu]->shares;
+ load *= tg->se[cpu]->load.weight;
load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
}
@@ -1665,34 +1592,11 @@ static int tg_load_down(struct task_group *tg, void *data)
return 0;
}
-static void update_shares(struct sched_domain *sd)
-{
- s64 elapsed;
- u64 now;
-
- if (root_task_group_empty())
- return;
-
- now = local_clock();
- elapsed = now - sd->last_update;
-
- if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
- sd->last_update = now;
- walk_tg_tree(tg_nop, tg_shares_up, sd);
- }
-}
-
static void update_h_load(long cpu)
{
walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
-#else
-
-static inline void update_shares(struct sched_domain *sd)
-{
-}
-
#endif
#ifdef CONFIG_PREEMPT
@@ -1812,15 +1716,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__release(rq2->lock);
}
-#endif
+#else /* CONFIG_SMP */
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
{
-#ifdef CONFIG_SMP
- cfs_rq->shares = shares;
-#endif
+ BUG_ON(!irqs_disabled());
+ BUG_ON(rq1 != rq2);
+ raw_spin_lock(&rq1->lock);
+ __acquire(rq2->lock); /* Fake it out ;) */
}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __releases(rq2->lock)
+{
+ BUG_ON(rq1 != rq2);
+ raw_spin_unlock(&rq1->lock);
+ __release(rq2->lock);
+}
+
#endif
static void calc_load_account_idle(struct rq *this_rq);
@@ -1862,17 +1790,20 @@ static void dec_nr_running(struct rq *rq)
static void set_load_weight(struct task_struct *p)
{
+ int prio = p->static_prio - MAX_RT_PRIO;
+ struct load_weight *load = &p->se.load;
+
/*
* SCHED_IDLE tasks get minimal weight:
*/
if (p->policy == SCHED_IDLE) {
- p->se.load.weight = WEIGHT_IDLEPRIO;
- p->se.load.inv_weight = WMULT_IDLEPRIO;
+ load->weight = scale_load(WEIGHT_IDLEPRIO);
+ load->inv_weight = WMULT_IDLEPRIO;
return;
}
- p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
- p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
+ load->weight = scale_load(prio_to_weight[prio]);
+ load->inv_weight = prio_to_wmult[prio];
}
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1880,7 +1811,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
update_rq_clock(rq);
sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, flags);
- p->se.on_rq = 1;
}
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1888,7 +1818,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
update_rq_clock(rq);
sched_info_dequeued(p);
p->sched_class->dequeue_task(rq, p, flags);
- p->se.on_rq = 0;
}
/*
@@ -1924,10 +1853,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
* They are read and saved off onto struct rq in update_rq_clock().
* This may result in other CPU reading this CPU's irq time and can
* race with irq/account_system_vtime on this CPU. We would either get old
- * or new value (or semi updated value on 32 bit) with a side effect of
- * accounting a slice of irq time to wrong task when irq is in progress
- * while we read rq->clock. That is a worthy compromise in place of having
- * locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
*/
static DEFINE_PER_CPU(u64, cpu_hardirq_time);
static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1945,19 +1873,58 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
}
-static u64 irq_time_cpu(int cpu)
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
+{
+ __this_cpu_inc(irq_time_seq.sequence);
+ smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
{
- if (!sched_clock_irqtime)
- return 0;
+ smp_wmb();
+ __this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+ u64 irq_time;
+ unsigned seq;
+ do {
+ seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+ irq_time = per_cpu(cpu_softirq_time, cpu) +
+ per_cpu(cpu_hardirq_time, cpu);
+ } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+ return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
}
+#endif /* CONFIG_64BIT */
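In the 32-bit case above, readers already use the generic read_seqcount_begin()/read_seqcount_retry() helpers, while the write side open-codes the sequence bump with __this_cpu_inc() plus smp_wmb(), presumably because the writer only ever touches its own CPU's counter with interrupts disabled. For illustration only, a rough sketch of how the write side would look with the matching <linux/seqlock.h> helpers (the _alt names are invented for the example):

static inline void irq_time_write_begin_alt(void)
{
	/* sequence becomes odd: readers retry until the write completes */
	write_seqcount_begin(this_cpu_ptr(&irq_time_seq));
}

static inline void irq_time_write_end_alt(void)
{
	/* sequence becomes even again: the 64-bit pair is stable */
	write_seqcount_end(this_cpu_ptr(&irq_time_seq));
}

Either form makes the pair of 64-bit counters appear atomic to irq_time_read() on 32-bit, where a single 64-bit load could otherwise observe a half-updated value.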
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
void account_system_vtime(struct task_struct *curr)
{
unsigned long flags;
+ s64 delta;
int cpu;
- u64 now, delta;
if (!sched_clock_irqtime)
return;
@@ -1965,9 +1932,10 @@ void account_system_vtime(struct task_struct *curr)
local_irq_save(flags);
cpu = smp_processor_id();
- now = sched_clock_cpu(cpu);
- delta = now - per_cpu(irq_start_time, cpu);
- per_cpu(irq_start_time, cpu) = now;
+ delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+ __this_cpu_add(irq_start_time, delta);
+
+ irq_time_write_begin();
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
@@ -1975,37 +1943,92 @@ void account_system_vtime(struct task_struct *curr)
* that do not consume any time, but still wants to run.
*/
if (hardirq_count())
- per_cpu(cpu_hardirq_time, cpu) += delta;
- else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
- per_cpu(cpu_softirq_time, cpu) += delta;
+ __this_cpu_add(cpu_hardirq_time, delta);
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ __this_cpu_add(cpu_softirq_time, delta);
+ irq_time_write_end();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(account_system_vtime);
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
{
- if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
- u64 delta_irq = curr_irq_time - rq->prev_irq_time;
- rq->prev_irq_time = curr_irq_time;
- sched_rt_avg_update(rq, delta_irq);
- }
+ s64 irq_delta;
+
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}irq region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}irq
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
+
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+ rq->clock_task += delta;
+
+ if (irq_delta && sched_feat(NONIRQ_POWER))
+ sched_rt_avg_update(rq, irq_delta);
}
-#else
+static int irqtime_account_hi_update(void)
+{
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ unsigned long flags;
+ u64 latest_ns;
+ int ret = 0;
+
+ local_irq_save(flags);
+ latest_ns = this_cpu_read(cpu_hardirq_time);
+ if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+ ret = 1;
+ local_irq_restore(flags);
+ return ret;
+}
-static u64 irq_time_cpu(int cpu)
+static int irqtime_account_si_update(void)
{
- return 0;
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ unsigned long flags;
+ u64 latest_ns;
+ int ret = 0;
+
+ local_irq_save(flags);
+ latest_ns = this_cpu_read(cpu_softirq_time);
+ if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+ ret = 1;
+ local_irq_restore(flags);
+ return ret;
}
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-#endif
+#define sched_clock_irqtime (0)
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+ rq->clock_task += delta;
+}
+
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
#include "sched_idletask.c"
#include "sched_fair.c"
#include "sched_rt.c"
+#include "sched_autogroup.c"
#include "sched_stoptask.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
@@ -2098,14 +2121,14 @@ inline int task_curr(const struct task_struct *p)
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
- int oldprio, int running)
+ int oldprio)
{
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
- prev_class->switched_from(rq, p, running);
- p->sched_class->switched_to(rq, p, running);
- } else
- p->sched_class->prio_changed(rq, p, oldprio, running);
+ prev_class->switched_from(rq, p);
+ p->sched_class->switched_to(rq, p);
+ } else if (oldprio != p->prio)
+ p->sched_class->prio_changed(rq, p, oldprio);
}
static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
@@ -2129,7 +2152,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back clock update.
*/
- if (test_tsk_need_resched(rq->curr))
+ if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
rq->skip_clock_update = 1;
}
@@ -2175,6 +2198,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
*/
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+
+#ifdef CONFIG_LOCKDEP
+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(&task_rq(p)->lock)));
+#endif
#endif
trace_sched_migrate_task(p, new_cpu);
@@ -2195,21 +2223,6 @@ struct migration_arg {
static int migration_cpu_stop(void *data);
/*
- * The task's runqueue lock must be held.
- * Returns true if you have to wait for migration thread.
- */
-static bool migrate_task(struct task_struct *p, int dest_cpu)
-{
- struct rq *rq = task_rq(p);
-
- /*
- * If the task is not on a runqueue (and not running), then
- * the next wake-up will properly place the task.
- */
- return p->se.on_rq || task_running(rq, p);
-}
-
-/*
* wait_task_inactive - wait for a thread to unschedule.
*
* If @match_state is nonzero, it's the @p->state value just checked and
@@ -2266,11 +2279,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
rq = task_rq_lock(p, &flags);
trace_sched_wait_task(p);
running = task_running(rq, p);
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
/*
* If it changed from the expected state, bail out now.
@@ -2299,7 +2312,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* yield - it could be a while.
*/
if (unlikely(on_rq)) {
- schedule_timeout_uninterruptible(1);
+ ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL);
continue;
}
@@ -2321,7 +2337,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* Cause a process which is running on another CPU to enter
* kernel-mode, without any delay. (to get signals handled.)
*
- * NOTE: this function doesnt have to take the runqueue lock,
+ * NOTE: this function doesn't have to take the runqueue lock,
* because all it wants to ensure is that the remote task enters
* the kernel. If the IPI races and the task has been migrated
* to another CPU then no harm is done and the purpose has been
@@ -2340,30 +2356,9 @@ void kick_process(struct task_struct *p)
EXPORT_SYMBOL_GPL(kick_process);
#endif /* CONFIG_SMP */
-/**
- * task_oncpu_function_call - call a function on the cpu on which a task runs
- * @p: the task to evaluate
- * @func: the function to be called
- * @info: the function call argument
- *
- * Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
- */
-void task_oncpu_function_call(struct task_struct *p,
- void (*func) (void *info), void *info)
-{
- int cpu;
-
- preempt_disable();
- cpu = task_cpu(p);
- if (task_curr(p))
- smp_call_function_single(cpu, func, info, 1);
- preempt_enable();
-}
-
#ifdef CONFIG_SMP
/*
- * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ * ->cpus_allowed is protected by both rq->lock and p->pi_lock
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
@@ -2381,30 +2376,27 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
return dest_cpu;
/* No more Mr. Nice Guy. */
- if (unlikely(dest_cpu >= nr_cpu_ids)) {
- dest_cpu = cpuset_cpus_allowed_fallback(p);
- /*
- * Don't tell them about moving exiting tasks or
- * kernel threads (both mm NULL), since they never
- * leave kernel.
- */
- if (p->mm && printk_ratelimit()) {
- printk(KERN_INFO "process %d (%s) no "
- "longer affine to cpu%d\n",
- task_pid_nr(p), p->comm, cpu);
- }
+ dest_cpu = cpuset_cpus_allowed_fallback(p);
+ /*
+ * Don't tell them about moving exiting tasks or
+ * kernel threads (both mm NULL), since they never
+ * leave kernel.
+ */
+ if (p->mm && printk_ratelimit()) {
+ printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
+ task_pid_nr(p), p->comm, cpu);
}
return dest_cpu;
}
/*
- * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
*/
static inline
-int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
{
- int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
+ int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -2430,27 +2422,63 @@ static void update_avg(u64 *avg, u64 sample)
}
#endif
-static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
- bool is_sync, bool is_migrate, bool is_local,
- unsigned long en_flags)
+static void
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
- schedstat_inc(p, se.statistics.nr_wakeups);
- if (is_sync)
- schedstat_inc(p, se.statistics.nr_wakeups_sync);
- if (is_migrate)
- schedstat_inc(p, se.statistics.nr_wakeups_migrate);
- if (is_local)
+#ifdef CONFIG_SCHEDSTATS
+ struct rq *rq = this_rq();
+
+#ifdef CONFIG_SMP
+ int this_cpu = smp_processor_id();
+
+ if (cpu == this_cpu) {
+ schedstat_inc(rq, ttwu_local);
schedstat_inc(p, se.statistics.nr_wakeups_local);
- else
+ } else {
+ struct sched_domain *sd;
+
schedstat_inc(p, se.statistics.nr_wakeups_remote);
+ rcu_read_lock();
+ for_each_domain(this_cpu, sd) {
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+ schedstat_inc(sd, ttwu_wake_remote);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ if (wake_flags & WF_MIGRATED)
+ schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+
+#endif /* CONFIG_SMP */
+ schedstat_inc(rq, ttwu_count);
+ schedstat_inc(p, se.statistics.nr_wakeups);
+
+ if (wake_flags & WF_SYNC)
+ schedstat_inc(p, se.statistics.nr_wakeups_sync);
+
+#endif /* CONFIG_SCHEDSTATS */
+}
+
+static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+{
activate_task(rq, p, en_flags);
+ p->on_rq = 1;
+
+ /* if a worker is waking up, notify workqueue */
+ if (p->flags & PF_WQ_WORKER)
+ wq_worker_waking_up(p, cpu_of(rq));
}
-static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
- int wake_flags, bool success)
+/*
+ * Mark the task runnable and perform wakeup-preemption.
+ */
+static void
+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
- trace_sched_wakeup(p, success);
+ trace_sched_wakeup(p, true);
check_preempt_curr(rq, p, wake_flags);
p->state = TASK_RUNNING;
@@ -2469,9 +2497,119 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
rq->idle_stamp = 0;
}
#endif
- /* if a worker is waking up, notify workqueue */
- if ((p->flags & PF_WQ_WORKER) && success)
- wq_worker_waking_up(p, cpu_of(rq));
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+#ifdef CONFIG_SMP
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible--;
+#endif
+
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+ ttwu_do_wakeup(rq, p, wake_flags);
+}
+
+/*
+ * Called in case the task @p isn't fully descheduled from its runqueue,
+ * in this case we must do a remote wakeup. It's a 'light' wakeup, though:
+ * all we need to do is flip p->state to TASK_RUNNING, since the task is
+ * still ->on_rq.
+ */
+static int ttwu_remote(struct task_struct *p, int wake_flags)
+{
+ struct rq *rq;
+ int ret = 0;
+
+ rq = __task_rq_lock(p);
+ if (p->on_rq) {
+ ttwu_do_wakeup(rq, p, wake_flags);
+ ret = 1;
+ }
+ __task_rq_unlock(rq);
+
+ return ret;
+}
+
+#ifdef CONFIG_SMP
+static void sched_ttwu_pending(void)
+{
+ struct rq *rq = this_rq();
+ struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+ if (!list)
+ return;
+
+ raw_spin_lock(&rq->lock);
+
+ while (list) {
+ struct task_struct *p = list;
+ list = list->wake_entry;
+ ttwu_do_activate(rq, p, 0);
+ }
+
+ raw_spin_unlock(&rq->lock);
+}
+
+void scheduler_ipi(void)
+{
+ sched_ttwu_pending();
+}
+
+static void ttwu_queue_remote(struct task_struct *p, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *next = rq->wake_list;
+
+ for (;;) {
+ struct task_struct *old = next;
+
+ p->wake_entry = next;
+ next = cmpxchg(&rq->wake_list, old, p);
+ if (next == old)
+ break;
+ }
+
+ if (!next)
+ smp_send_reschedule(cpu);
+}
+
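rq->wake_list is a lock-free multiple-producer, single-consumer list: remote CPUs push a task with the cmpxchg() loop in ttwu_queue_remote(), and the target CPU drains the whole list with a single xchg() in sched_ttwu_pending(), called from the new scheduler_ipi(). For illustration only, a minimal self-contained C11 userspace sketch of the same push/drain pattern (struct node, push(), drain() and the other names are made up for the example):

#include <stdatomic.h>
#include <stddef.h>

struct node {
	struct node *next;		/* plays the role of p->wake_entry */
};

static _Atomic(struct node *) wake_list = NULL;

/* producer side: analogous to ttwu_queue_remote() */
static void push(struct node *n)
{
	struct node *old = atomic_load(&wake_list);

	do {
		n->next = old;		/* link in front of the current head */
	} while (!atomic_compare_exchange_weak(&wake_list, &old, n));

	if (old == NULL) {
		/* list was empty: this is the point where the kernel
		 * sends the rescheduling IPI to the target CPU */
	}
}

/* consumer side: analogous to sched_ttwu_pending() */
static void drain(void (*handle)(struct node *))
{
	struct node *head = atomic_exchange(&wake_list, NULL);

	while (head) {
		struct node *next = head->next;
		handle(head);		/* the kernel calls ttwu_do_activate() here */
		head = next;
	}
}

As in the patch, a producer only needs to send the IPI when it pushed onto an empty list; a non-empty list means an interrupt is already on its way, so piggybacking on it is enough.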
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
+{
+ struct rq *rq;
+ int ret = 0;
+
+ rq = __task_rq_lock(p);
+ if (p->on_cpu) {
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ ttwu_do_wakeup(rq, p, wake_flags);
+ ret = 1;
+ }
+ __task_rq_unlock(rq);
+
+ return ret;
+
+}
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+#endif /* CONFIG_SMP */
+
+static void ttwu_queue(struct task_struct *p, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+#if defined(CONFIG_SMP)
+ if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+ sched_clock_cpu(cpu); /* sync clocks x-cpu */
+ ttwu_queue_remote(p, cpu);
+ return;
+ }
+#endif
+
+ raw_spin_lock(&rq->lock);
+ ttwu_do_activate(rq, p, 0);
+ raw_spin_unlock(&rq->lock);
}
/**
@@ -2489,92 +2627,66 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
* Returns %true if @p was woken up, %false if it was already running
* or @state didn't match @p's state.
*/
-static int try_to_wake_up(struct task_struct *p, unsigned int state,
- int wake_flags)
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
- int cpu, orig_cpu, this_cpu, success = 0;
unsigned long flags;
- unsigned long en_flags = ENQUEUE_WAKEUP;
- struct rq *rq;
-
- this_cpu = get_cpu();
+ int cpu, success = 0;
smp_wmb();
- rq = task_rq_lock(p, &flags);
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
if (!(p->state & state))
goto out;
- if (p->se.on_rq)
- goto out_running;
-
+ success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
- orig_cpu = cpu;
-#ifdef CONFIG_SMP
- if (unlikely(task_running(rq, p)))
- goto out_activate;
+ if (p->on_rq && ttwu_remote(p, wake_flags))
+ goto stat;
+#ifdef CONFIG_SMP
/*
- * In order to handle concurrent wakeups and release the rq->lock
- * we put the task in TASK_WAKING state.
- *
- * First fix up the nr_uninterruptible count:
+ * If the owning (remote) cpu is still in the middle of schedule() with
+ * this task as prev, wait until it's done referencing the task.
*/
- if (task_contributes_to_load(p)) {
- if (likely(cpu_online(orig_cpu)))
- rq->nr_uninterruptible--;
- else
- this_rq()->nr_uninterruptible--;
+ while (p->on_cpu) {
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ /*
+ * In case the architecture enables interrupts in
+ * context_switch(), we cannot busy wait, since that
+ * would lead to deadlocks when an interrupt hits and
+ * tries to wake up @prev. So bail and do a complete
+ * remote wakeup.
+ */
+ if (ttwu_activate_remote(p, wake_flags))
+ goto stat;
+#else
+ cpu_relax();
+#endif
}
+ /*
+ * Pairs with the smp_wmb() in finish_lock_switch().
+ */
+ smp_rmb();
+
+ p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
- if (p->sched_class->task_waking) {
- p->sched_class->task_waking(rq, p);
- en_flags |= ENQUEUE_WAKING;
- }
+ if (p->sched_class->task_waking)
+ p->sched_class->task_waking(p);
- cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
- if (cpu != orig_cpu)
+ cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+ if (task_cpu(p) != cpu) {
+ wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
- __task_rq_unlock(rq);
-
- rq = cpu_rq(cpu);
- raw_spin_lock(&rq->lock);
-
- /*
- * We migrated the task without holding either rq->lock, however
- * since the task is not on the task list itself, nobody else
- * will try and migrate the task, hence the rq should match the
- * cpu we just moved it to.
- */
- WARN_ON(task_cpu(p) != cpu);
- WARN_ON(p->state != TASK_WAKING);
-
-#ifdef CONFIG_SCHEDSTATS
- schedstat_inc(rq, ttwu_count);
- if (cpu == this_cpu)
- schedstat_inc(rq, ttwu_local);
- else {
- struct sched_domain *sd;
- for_each_domain(this_cpu, sd) {
- if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- schedstat_inc(sd, ttwu_wake_remote);
- break;
- }
- }
}
-#endif /* CONFIG_SCHEDSTATS */
-
-out_activate:
#endif /* CONFIG_SMP */
- ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
- cpu == this_cpu, en_flags);
- success = 1;
-out_running:
- ttwu_post_activation(p, rq, wake_flags, success);
+
+ ttwu_queue(p, cpu);
+stat:
+ ttwu_stat(p, cpu, wake_flags);
out:
- task_rq_unlock(rq, &flags);
- put_cpu();
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
return success;
}
@@ -2583,31 +2695,34 @@ out:
* try_to_wake_up_local - try to wake up a local task with rq lock held
* @p: the thread to be awakened
*
- * Put @p on the run-queue if it's not alredy there. The caller must
+ * Put @p on the run-queue if it's not already there. The caller must
* ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task. this_rq() stays locked over invocation.
+ * the current task.
*/
static void try_to_wake_up_local(struct task_struct *p)
{
struct rq *rq = task_rq(p);
- bool success = false;
BUG_ON(rq != this_rq());
BUG_ON(p == current);
lockdep_assert_held(&rq->lock);
+ if (!raw_spin_trylock(&p->pi_lock)) {
+ raw_spin_unlock(&rq->lock);
+ raw_spin_lock(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+ }
+
if (!(p->state & TASK_NORMAL))
- return;
+ goto out;
- if (!p->se.on_rq) {
- if (likely(!task_running(rq, p))) {
- schedstat_inc(rq, ttwu_count);
- schedstat_inc(rq, ttwu_local);
- }
- ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
- success = true;
- }
- ttwu_post_activation(p, rq, 0, success);
+ if (!p->on_rq)
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+
+ ttwu_do_wakeup(rq, p, 0);
+ ttwu_stat(p, smp_processor_id(), 0);
+out:
+ raw_spin_unlock(&p->pi_lock);
}
/**
@@ -2640,18 +2755,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
*/
static void __sched_fork(struct task_struct *p)
{
+ p->on_rq = 0;
+
+ p->se.on_rq = 0;
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
+ p->se.vruntime = 0;
+ INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
INIT_LIST_HEAD(&p->rt.run_list);
- p->se.on_rq = 0;
- INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2661,8 +2779,9 @@ static void __sched_fork(struct task_struct *p)
/*
* fork()/clone()-time setup:
*/
-void sched_fork(struct task_struct *p, int clone_flags)
+void sched_fork(struct task_struct *p)
{
+ unsigned long flags;
int cpu = get_cpu();
__sched_fork(p);
@@ -2713,22 +2832,24 @@ void sched_fork(struct task_struct *p, int clone_flags)
*
* Silence PROVE_RCU.
*/
- rcu_read_lock();
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
set_task_cpu(p, cpu);
- rcu_read_unlock();
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
- p->oncpu = 0;
+#if defined(CONFIG_SMP)
+ p->on_cpu = 0;
#endif
#ifdef CONFIG_PREEMPT
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
+#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
+#endif
put_cpu();
}
@@ -2740,41 +2861,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
* that must be done for every newly created context, then puts the task
* on the runqueue and wakes it.
*/
-void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
+void wake_up_new_task(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
- int cpu __maybe_unused = get_cpu();
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_SMP
- rq = task_rq_lock(p, &flags);
- p->state = TASK_WAKING;
-
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
- *
- * We set TASK_WAKING so that select_task_rq() can drop rq->lock
- * without people poking at ->cpus_allowed.
*/
- cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
- set_task_cpu(p, cpu);
-
- p->state = TASK_RUNNING;
- task_rq_unlock(rq, &flags);
+ set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
#endif
- rq = task_rq_lock(p, &flags);
+ rq = __task_rq_lock(p);
activate_task(rq, p, 0);
- trace_sched_wakeup_new(p, 1);
+ p->on_rq = 1;
+ trace_sched_wakeup_new(p, true);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken)
p->sched_class->task_woken(rq, p);
#endif
- task_rq_unlock(rq, &flags);
- put_cpu();
+ task_rq_unlock(rq, p, &flags);
}
#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2852,9 +2963,12 @@ static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
+ sched_info_switch(prev, next);
+ perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
+ trace_sched_switch(prev, next);
}
/**
@@ -2987,7 +3101,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
struct mm_struct *mm, *oldmm;
prepare_task_switch(rq, prev, next);
- trace_sched_switch(prev, next);
+
mm = next->mm;
oldmm = prev->active_mm;
/*
@@ -3119,6 +3233,15 @@ static long calc_load_fold_active(struct rq *this_rq)
return delta;
}
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+ load *= exp;
+ load += active * (FIXED_1 - exp);
+ load += 1UL << (FSHIFT - 1);
+ return load >> FSHIFT;
+}
+
#ifdef CONFIG_NO_HZ
/*
* For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3148,6 +3271,128 @@ static long calc_load_fold_idle(void)
return delta;
}
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+ unsigned long result = 1UL << frac_bits;
+
+ if (n) for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
+ }
+
+ return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ * = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ * ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ * = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ * n 1 - x^(n+1)
+ * S_n := \Sum x^i = -------------
+ * i=0 1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+ unsigned long active, unsigned int n)
+{
+
+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
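calc_load() above is a fixed-point step of the load-average recurrence a_n = a_{n-1}*e + active*(1-e), and calc_load_n() folds n missed 5-second periods into one step by raising e to the n-th power with fixed_power_int(). For illustration only, a small self-contained userspace harness (FSHIFT, FIXED_1 and EXP_1 carry their values from <linux/sched.h>; main() and the sample numbers are invented) showing that one folded step closely tracks n individual steps, up to fixed-point rounding:

#include <stdio.h>

#define FSHIFT	11			/* bits of fixed-point precision */
#define FIXED_1	(1UL << FSHIFT)		/* 1.0 in fixed-point */
#define EXP_1	1884			/* 1/exp(5sec/1min) in fixed-point */

/* same arithmetic as the kernel helpers quoted in the hunk above */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);
	return load >> FSHIFT;
}

static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) for (;;) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}
	return result;
}

int main(void)
{
	unsigned long stepwise = 2 * FIXED_1;	/* pretend a 1-minute loadavg of 2.00 */
	unsigned long folded;
	unsigned int n = 12;			/* one minute of missed 5-second periods */
	unsigned int i;

	/* n individual decay steps with no runnable tasks ... */
	for (i = 0; i < n; i++)
		stepwise = calc_load(stepwise, EXP_1, 0);

	/* ... versus one calc_load_n()-style step using e^n */
	folded = calc_load(2 * FIXED_1, fixed_power_int(EXP_1, FSHIFT, n), 0);

	printf("stepwise %lu.%02lu  folded %lu.%02lu\n",
	       stepwise >> FSHIFT, (stepwise & (FIXED_1 - 1)) * 100 / FIXED_1,
	       folded >> FSHIFT, (folded & (FIXED_1 - 1)) * 100 / FIXED_1);
	return 0;
}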
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(unsigned long ticks)
+{
+ long delta, active, n;
+
+ if (time_before(jiffies, calc_load_update))
+ return;
+
+ /*
+ * If we crossed a calc_load_update boundary, make sure to fold
+ * any pending idle changes, the respective CPUs might have
+ * missed the tick driven calc_load_account_active() update
+ * due to NO_HZ.
+ */
+ delta = calc_load_fold_idle();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
+ /*
+ * If we were idle for multiple load cycles, apply them.
+ */
+ if (ticks >= LOAD_FREQ) {
+ n = ticks / LOAD_FREQ;
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+ calc_load_update += n * LOAD_FREQ;
+ }
+
+ /*
+ * It's possible the remainder of the above division also crosses
+ * a LOAD_FREQ period; the regular check in calc_global_load(),
+ * which comes after this, will take care of that.
+ *
+ * Consider us being 11 ticks before a cycle completion, and us
+ * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
+ * age us 4 cycles, and the test in calc_global_load() will
+ * pick up the final one.
+ */
+}
#else
static void calc_load_account_idle(struct rq *this_rq)
{
@@ -3157,6 +3402,10 @@ static inline long calc_load_fold_idle(void)
{
return 0;
}
+
+static void calc_global_nohz(unsigned long ticks)
+{
+}
#endif
/**
@@ -3174,24 +3423,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
loads[2] = (avenrun[2] + offset) << shift;
}
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
- load *= exp;
- load += active * (FIXED_1 - exp);
- return load >> FSHIFT;
-}
-
/*
* calc_load - update the avenrun load estimates 10 ticks after the
* CPUs have updated calc_load_tasks.
*/
-void calc_global_load(void)
+void calc_global_load(unsigned long ticks)
{
- unsigned long upd = calc_load_update + 10;
long active;
- if (time_before(jiffies, upd))
+ calc_global_nohz(ticks);
+
+ if (time_before(jiffies, calc_load_update + 10))
return;
active = atomic_long_read(&calc_load_tasks);
@@ -3352,27 +3594,22 @@ void sched_exec(void)
{
struct task_struct *p = current;
unsigned long flags;
- struct rq *rq;
int dest_cpu;
- rq = task_rq_lock(p, &flags);
- dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
if (dest_cpu == smp_processor_id())
goto unlock;
- /*
- * select_task_rq() can race against ->cpus_allowed
- */
- if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
- likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
+ if (likely(cpu_active(dest_cpu))) {
struct migration_arg arg = { p, dest_cpu };
- task_rq_unlock(rq, &flags);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
return;
}
unlock:
- task_rq_unlock(rq, &flags);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
#endif
@@ -3409,7 +3646,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
rq = task_rq_lock(p, &flags);
ns = do_task_delta_exec(p, rq);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
return ns;
}
@@ -3427,7 +3664,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
rq = task_rq_lock(p, &flags);
ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
return ns;
}
@@ -3451,7 +3688,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
rq = task_rq_lock(p, &flags);
thread_group_cputime(p, &totals);
ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
return ns;
}
@@ -3516,6 +3753,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
}
/*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+ cputime64_t tmp = cputime_to_cputime64(cputime);
+
+ /* Add system time to process. */
+ p->stime = cputime_add(p->stime, cputime);
+ p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+ account_group_system_time(p, cputime);
+
+ /* Add system time to cpustat. */
+ *target_cputime64 = cputime64_add(*target_cputime64, tmp);
+ cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
+ /* Account for system time used */
+ acct_update_integrals(p);
+}
+
+/*
* Account system cpu time to a process.
* @p: the process that the cpu time gets accounted to
* @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3526,36 +3789,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
cputime_t cputime, cputime_t cputime_scaled)
{
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- cputime64_t tmp;
+ cputime64_t *target_cputime64;
if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
account_guest_time(p, cputime, cputime_scaled);
return;
}
- /* Add system time to process. */
- p->stime = cputime_add(p->stime, cputime);
- p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
- account_group_system_time(p, cputime);
-
- /* Add system time to cpustat. */
- tmp = cputime_to_cputime64(cputime);
if (hardirq_count() - hardirq_offset)
- cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ target_cputime64 = &cpustat->irq;
else if (in_serving_softirq())
- cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+ target_cputime64 = &cpustat->softirq;
else
- cpustat->system = cputime64_add(cpustat->system, tmp);
+ target_cputime64 = &cpustat->system;
- cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
-
- /* Account for system time used */
- acct_update_integrals(p);
+ __account_system_time(p, cputime, cputime_scaled, target_cputime64);
}
/*
* Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
+ * @cputime: the cpu time spent in involuntary wait
*/
void account_steal_time(cputime_t cputime)
{
@@ -3583,6 +3836,73 @@ void account_idle_time(cputime_t cputime)
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ * - check for guest_time
+ * - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq)
+{
+ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+ if (irqtime_account_hi_update()) {
+ cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ } else if (irqtime_account_si_update()) {
+ cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+ } else if (this_cpu_ksoftirqd() == p) {
+ /*
+ * ksoftirqd time does not get accounted in cpu_softirq_time.
+ * So, we have to handle it separately here.
+ * Also, p->stime needs to be updated for ksoftirqd.
+ */
+ __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+ &cpustat->softirq);
+ } else if (user_tick) {
+ account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ } else if (p == rq->idle) {
+ account_idle_time(cputime_one_jiffy);
+ } else if (p->flags & PF_VCPU) { /* System time or guest time */
+ account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ } else {
+ __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+ &cpustat->system);
+ }
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+ int i;
+ struct rq *rq = this_rq();
+
+ for (i = 0; i < ticks; i++)
+ irqtime_account_process_tick(current, 0, rq);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
/*
* Account a single tick of cpu time.
* @p: the process that the cpu time gets accounted to
@@ -3593,6 +3913,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
struct rq *rq = this_rq();
+ if (sched_clock_irqtime) {
+ irqtime_account_process_tick(p, user_tick, rq);
+ return;
+ }
+
if (user_tick)
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3618,6 +3943,12 @@ void account_steal_ticks(unsigned long ticks)
*/
void account_idle_ticks(unsigned long ticks)
{
+
+ if (sched_clock_irqtime) {
+ irqtime_account_idle_ticks(ticks);
+ return;
+ }
+
account_idle_time(jiffies_to_cputime(ticks));
}
@@ -3711,9 +4042,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
- *
- * It also gets called by the fork code, when changing the parent's
- * timeslices.
*/
void scheduler_tick(void)
{
@@ -3833,19 +4161,12 @@ static inline void schedule_debug(struct task_struct *prev)
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
schedstat_inc(this_rq(), sched_count);
-#ifdef CONFIG_SCHEDSTATS
- if (unlikely(prev->lock_depth >= 0)) {
- schedstat_inc(this_rq(), bkl_count);
- schedstat_inc(prev, sched_info.bkl_count);
- }
-#endif
}
static void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- if (prev->se.on_rq)
+ if (prev->on_rq || rq->skip_clock_update < 0)
update_rq_clock(rq);
- rq->skip_clock_update = 0;
prev->sched_class->put_prev_task(rq, prev);
}
@@ -3894,27 +4215,25 @@ need_resched:
rcu_note_context_switch(cpu);
prev = rq->curr;
- release_kernel_lock(prev);
-need_resched_nonpreemptible:
-
schedule_debug(prev);
if (sched_feat(HRTICK))
hrtick_clear(rq);
raw_spin_lock_irq(&rq->lock);
- clear_tsk_need_resched(prev);
switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
if (unlikely(signal_pending_state(prev->state, prev))) {
prev->state = TASK_RUNNING;
} else {
+ deactivate_task(rq, prev, DEQUEUE_SLEEP);
+ prev->on_rq = 0;
+
/*
- * If a worker is going to sleep, notify and
- * ask workqueue whether it wants to wake up a
- * task to maintain concurrency. If so, wake
- * up the task.
+ * If a worker went to sleep, notify and ask workqueue
+ * whether it wants to wake up a task to maintain
+ * concurrency.
*/
if (prev->flags & PF_WQ_WORKER) {
struct task_struct *to_wakeup;
@@ -3923,7 +4242,16 @@ need_resched_nonpreemptible:
if (to_wakeup)
try_to_wake_up_local(to_wakeup);
}
- deactivate_task(rq, prev, DEQUEUE_SLEEP);
+
+ /*
+ * If we are going to sleep and we have plugged IO
+ * queued, make sure to submit it to avoid deadlocks.
+ */
+ if (blk_needs_flush_plug(prev)) {
+ raw_spin_unlock(&rq->lock);
+ blk_schedule_flush_plug(prev);
+ raw_spin_lock(&rq->lock);
+ }
}
switch_count = &prev->nvcsw;
}
@@ -3935,11 +4263,10 @@ need_resched_nonpreemptible:
put_prev_task(rq, prev);
next = pick_next_task(rq);
+ clear_tsk_need_resched(prev);
+ rq->skip_clock_update = 0;
if (likely(prev != next)) {
- sched_info_switch(prev, next);
- perf_event_task_sched_out(prev, next);
-
rq->nr_switches++;
rq->curr = next;
++*switch_count;
@@ -3958,9 +4285,6 @@ need_resched_nonpreemptible:
post_schedule(rq);
- if (unlikely(reacquire_kernel_lock(prev)))
- goto need_resched_nonpreemptible;
-
preempt_enable_no_resched();
if (need_resched())
goto need_resched;
@@ -3968,70 +4292,53 @@ need_resched_nonpreemptible:
EXPORT_SYMBOL(schedule);
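
The blk_needs_flush_plug()/blk_schedule_flush_plug() calls added to schedule() above tie into the per-task block plugging API. A rough usage sketch of that API, under the assumption of the struct blk_plug interface this series pairs with (submit_batched_io() is a hypothetical helper issuing several bios):

struct blk_plug plug;

blk_start_plug(&plug);
/*
 * Requests submitted here are queued on the per-task plug list rather
 * than dispatched immediately, so they can be merged and sent to the
 * driver as a batch.
 */
submit_batched_io();
blk_finish_plug(&plug);	/* explicit flush; schedule() also flushes if the task sleeps */
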
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+
+static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+{
+ bool ret = false;
+
+ rcu_read_lock();
+ if (lock->owner != owner)
+ goto fail;
+
+ /*
+	 * Ensure we emit the owner->on_cpu dereference _after_ checking that
+	 * lock->owner still matches owner. If that check fails, owner might
+	 * point to free()d memory; if it still matches, the rcu_read_lock()
+	 * ensures the memory stays valid.
+ */
+ barrier();
+
+ ret = owner->on_cpu;
+fail:
+ rcu_read_unlock();
+
+ return ret;
+}
+
/*
* Look out! "owner" is an entirely speculative pointer
* access and not reliable.
*/
-int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
+int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
{
- unsigned int cpu;
- struct rq *rq;
-
if (!sched_feat(OWNER_SPIN))
return 0;
-#ifdef CONFIG_DEBUG_PAGEALLOC
- /*
- * Need to access the cpu field knowing that
- * DEBUG_PAGEALLOC could have unmapped it if
- * the mutex owner just released it and exited.
- */
- if (probe_kernel_address(&owner->cpu, cpu))
- return 0;
-#else
- cpu = owner->cpu;
-#endif
+ while (owner_running(lock, owner)) {
+ if (need_resched())
+ return 0;
- /*
- * Even if the access succeeded (likely case),
- * the cpu field may no longer be valid.
- */
- if (cpu >= nr_cpumask_bits)
- return 0;
+ arch_mutex_cpu_relax();
+ }
/*
- * We need to validate that we can do a
- * get_cpu() and that we have the percpu area.
+	 * If the owner changed to another task, there is likely
+	 * heavy contention; stop spinning.
*/
- if (!cpu_online(cpu))
+ if (lock->owner)
return 0;
- rq = cpu_rq(cpu);
-
- for (;;) {
- /*
- * Owner changed, break to re-assess state.
- */
- if (lock->owner != owner) {
- /*
- * If the lock has switched to a different owner,
- * we likely have heavy contention. Return 0 to quit
- * optimistic spinning and not contend further:
- */
- if (lock->owner)
- return 0;
- break;
- }
-
- /*
- * Is that owner really running on that cpu?
- */
- if (task_thread_info(rq->curr) != owner || need_resched())
- return 0;
-
- cpu_relax();
- }
-
return 1;
}
#endif
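
For context, mutex_spin_on_owner() is driven from the adaptive-spin path of the mutex slowpath. A simplified sketch of that caller, assuming the usual __mutex_lock_common() structure with details abridged:

for (;;) {
	struct task_struct *owner;

	/* If there is an owner, spin only while it keeps running. */
	owner = ACCESS_ONCE(lock->owner);
	if (owner && !mutex_spin_on_owner(lock, owner))
		break;

	/* Lock looks free: try to grab it and leave the slowpath. */
	if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
		mutex_set_owner(lock);
		preempt_enable();
		return 0;
	}

	/*
	 * When there is no owner and we ought to reschedule, blocking
	 * is likely cheaper than spinning.
	 */
	if (!owner && (need_resched() || rt_task(current)))
		break;

	arch_mutex_cpu_relax();
}
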
@@ -4161,6 +4468,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
{
__wake_up_common(q, mode, 1, 0, key);
}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@ -4341,7 +4649,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. It is interruptible. The timeout is in jiffies.
*/
-unsigned long __sched
+long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
unsigned long timeout)
{
@@ -4374,7 +4682,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
* signaled or for a specified timeout to expire. It can be
* interrupted by a kill signal. The timeout is in jiffies.
*/
-unsigned long __sched
+long __sched
wait_for_completion_killable_timeout(struct completion *x,
unsigned long timeout)
{
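
The switch from unsigned long to long matters because these helpers can return a negative errno. A typical caller pattern, assuming a struct completion named done and the usual three-way convention (negative when interrupted, 0 on timeout, remaining jiffies otherwise):

long ret;

ret = wait_for_completion_interruptible_timeout(&done, msecs_to_jiffies(500));
if (ret < 0)		/* interrupted, e.g. -ERESTARTSYS */
	return ret;
if (ret == 0)		/* timed out without a completion */
	return -ETIMEDOUT;
/* ret > 0: completed with 'ret' jiffies of the timeout left */
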
@@ -4490,19 +4798,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- unsigned long flags;
int oldprio, on_rq, running;
struct rq *rq;
const struct sched_class *prev_class;
BUG_ON(prio < 0 || prio > MAX_PRIO);
- rq = task_rq_lock(p, &flags);
+ rq = __task_rq_lock(p);
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
prev_class = p->sched_class;
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
dequeue_task(rq, p, 0);
@@ -4518,12 +4825,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq) {
+ if (on_rq)
enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
- check_class_changed(rq, p, prev_class, oldprio, running);
- }
- task_rq_unlock(rq, &flags);
+ check_class_changed(rq, p, prev_class, oldprio);
+ __task_rq_unlock(rq);
}
#endif
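
rt_mutex_setprio() is the entry point the rt-mutex code uses to apply priority inheritance. A sketch of the kind of caller involved, assuming the usual rtmutex adjust-prio helper shape:

static void __rt_mutex_adjust_prio(struct task_struct *task)
{
	/* Highest of the task's own priority and any boosting PI waiters. */
	int prio = rt_mutex_getprio(task);

	if (task->prio != prio)
		rt_mutex_setprio(task, prio);
}
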
@@ -4551,7 +4857,7 @@ void set_user_nice(struct task_struct *p, long nice)
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
if (on_rq)
dequeue_task(rq, p, 0);
@@ -4571,7 +4877,7 @@ void set_user_nice(struct task_struct *p, long nice)
resched_task(rq->curr);
}
out_unlock:
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
}
EXPORT_SYMBOL(set_user_nice);
@@ -4685,8 +4991,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
static void
__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
{
- BUG_ON(p->se.on_rq);
-
p->policy = policy;
p->rt_priority = prio;
p->normal_prio = normal_prio(p);
@@ -4709,14 +5013,17 @@ static bool check_same_owner(struct task_struct *p)
rcu_read_lock();
pcred = __task_cred(p);
- match = (cred->euid == pcred->euid ||
- cred->euid == pcred->uid);
+ if (cred->user->user_ns == pcred->user->user_ns)
+ match = (cred->euid == pcred->euid ||
+ cred->euid == pcred->uid);
+ else
+ match = false;
rcu_read_unlock();
return match;
}
static int __sched_setscheduler(struct task_struct *p, int policy,
- struct sched_param *param, bool user)
+ const struct sched_param *param, bool user)
{
int retval, oldprio, oldpolicy = -1, on_rq, running;
unsigned long flags;
@@ -4770,12 +5077,15 @@ recheck:
param->sched_priority > rlim_rtprio)
return -EPERM;
}
+
/*
- * Like positive nice levels, dont allow tasks to
- * move out of SCHED_IDLE either:
+ * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
*/
- if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
- return -EPERM;
+ if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+ if (!can_nice(p, TASK_NICE(p)))
+ return -EPERM;
+ }
/* can't change other user's priorities */
if (!check_same_owner(p))
@@ -4795,21 +5105,29 @@ recheck:
/*
* make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
- */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- /*
- * To be able to change p->policy safely, the apropriate
+ *
+ * To be able to change p->policy safely, the appropriate
* runqueue lock must be held.
*/
- rq = __task_rq_lock(p);
+ rq = task_rq_lock(p, &flags);
/*
* Changing the policy of the stop threads its a very bad idea
*/
if (p == rq->stop) {
+ task_rq_unlock(rq, p, &flags);
+ return -EINVAL;
+ }
+
+ /*
+ * If not changing anything there's no need to proceed further:
+ */
+ if (unlikely(policy == p->policy && (!rt_policy(policy) ||
+ param->sched_priority == p->rt_priority))) {
+
__task_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- return -EINVAL;
+ return 0;
}
#ifdef CONFIG_RT_GROUP_SCHED
@@ -4819,9 +5137,9 @@ recheck:
* assigned.
*/
if (rt_bandwidth_enabled() && rt_policy(policy) &&
- task_group(p)->rt_bandwidth.rt_runtime == 0) {
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+ !task_group_is_autogroup(task_group(p))) {
+ task_rq_unlock(rq, p, &flags);
return -EPERM;
}
}
@@ -4830,11 +5148,10 @@ recheck:
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);
goto recheck;
}
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
deactivate_task(rq, p, 0);
@@ -4849,13 +5166,11 @@ recheck:
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq) {
+ if (on_rq)
activate_task(rq, p, 0);
- check_class_changed(rq, p, prev_class, oldprio, running);
- }
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ check_class_changed(rq, p, prev_class, oldprio);
+ task_rq_unlock(rq, p, &flags);
rt_mutex_adjust_pi(p);
@@ -4871,7 +5186,7 @@ recheck:
* NOTE that the task may be already dead.
*/
int sched_setscheduler(struct task_struct *p, int policy,
- struct sched_param *param)
+ const struct sched_param *param)
{
return __sched_setscheduler(p, policy, param, true);
}
@@ -4889,7 +5204,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
* but our caller might not have that capability.
*/
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
- struct sched_param *param)
+ const struct sched_param *param)
{
return __sched_setscheduler(p, policy, param, false);
}
@@ -5035,7 +5350,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
goto out_free_cpus_allowed;
}
retval = -EPERM;
- if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
+ if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
goto out_unlock;
retval = security_task_setscheduler(p);
@@ -5106,7 +5421,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
struct task_struct *p;
unsigned long flags;
- struct rq *rq;
int retval;
get_online_cpus();
@@ -5121,9 +5435,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
if (retval)
goto out_unlock;
- rq = task_rq_lock(p, &flags);
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
- task_rq_unlock(rq, &flags);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
rcu_read_unlock();
@@ -5270,6 +5584,67 @@ void __sched yield(void)
}
EXPORT_SYMBOL(yield);
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Returns true if we indeed boosted the target task.
+ */
+bool __sched yield_to(struct task_struct *p, bool preempt)
+{
+ struct task_struct *curr = current;
+ struct rq *rq, *p_rq;
+ unsigned long flags;
+ bool yielded = 0;
+
+ local_irq_save(flags);
+ rq = this_rq();
+
+again:
+ p_rq = task_rq(p);
+ double_rq_lock(rq, p_rq);
+ while (task_rq(p) != p_rq) {
+ double_rq_unlock(rq, p_rq);
+ goto again;
+ }
+
+ if (!curr->sched_class->yield_to_task)
+ goto out;
+
+ if (curr->sched_class != p->sched_class)
+ goto out;
+
+ if (task_running(p_rq, p) || p->state)
+ goto out;
+
+ yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+ if (yielded) {
+ schedstat_inc(rq, yld_count);
+ /*
+ * Make p's CPU reschedule; pick_next_entity takes care of
+ * fairness.
+ */
+ if (preempt && rq != p_rq)
+ resched_task(p_rq->curr);
+ }
+
+out:
+ double_rq_unlock(rq, p_rq);
+ local_irq_restore(flags);
+
+ if (yielded)
+ schedule();
+
+ return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
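
yield_to() exists mainly so that code which can guess who holds a contended resource (a hypervisor noticing a spinning vCPU, for instance) can push the CPU toward that task. A hedged caller sketch; the candidates array is hypothetical, only the get_task_struct()/yield_to() pattern reflects the API above:

static bool boost_likely_holder(struct task_struct **candidates, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		struct task_struct *p = candidates[i];
		bool boosted;

		if (!p || p == current)
			continue;

		get_task_struct(p);	/* keep the target from going away */
		boosted = yield_to(p, true);
		put_task_struct(p);

		if (boosted)
			return true;
	}
	return false;
}
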
/*
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
* that process accounting knows that this is a task in IO wait state.
@@ -5280,6 +5655,7 @@ void __sched io_schedule(void)
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ blk_flush_plug(current);
current->in_iowait = 1;
schedule();
current->in_iowait = 0;
@@ -5295,6 +5671,7 @@ long __sched io_schedule_timeout(long timeout)
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ blk_flush_plug(current);
current->in_iowait = 1;
ret = schedule_timeout(timeout);
current->in_iowait = 0;
@@ -5385,7 +5762,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
rq = task_rq_lock(p, &flags);
time_slice = p->sched_class->get_rr_interval(rq, p);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
rcu_read_unlock();
jiffies_to_timespec(time_slice, &t);
@@ -5405,7 +5782,7 @@ void sched_show_task(struct task_struct *p)
unsigned state;
state = p->state ? __ffs(p->state) + 1 : 0;
- printk(KERN_INFO "%-13.13s %c", p->comm,
+ printk(KERN_INFO "%-15.15s %c", p->comm,
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if BITS_PER_LONG == 32
if (state == TASK_RUNNING)
@@ -5443,7 +5820,7 @@ void show_state_filter(unsigned long state_filter)
do_each_thread(g, p) {
/*
* reset the NMI-timeout, listing all files on a slow
- * console might take alot of time:
+ * console might take a lot of time:
*/
touch_nmi_watchdog();
if (!state_filter || (p->state & state_filter))
@@ -5487,7 +5864,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
- cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+ do_set_cpus_allowed(idle, cpumask_of(cpu));
/*
* We're having a chicken and egg problem, even though we are
* holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -5503,22 +5880,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
rcu_read_unlock();
rq->curr = rq->idle = idle;
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
- idle->oncpu = 1;
+#if defined(CONFIG_SMP)
+ idle->on_cpu = 1;
#endif
raw_spin_unlock_irqrestore(&rq->lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
-#if defined(CONFIG_PREEMPT)
- task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
-#else
task_thread_info(idle)->preempt_count = 0;
-#endif
+
/*
* The idle tasks have their own, simple scheduling class:
*/
idle->sched_class = &idle_sched_class;
- ftrace_graph_init_task(idle);
+ ftrace_graph_init_idle_task(idle, cpu);
}
/*
@@ -5569,7 +5943,6 @@ static void update_sysctl(void)
SET_SYSCTL(sched_min_granularity);
SET_SYSCTL(sched_latency);
SET_SYSCTL(sched_wakeup_granularity);
- SET_SYSCTL(sched_shares_ratelimit);
#undef SET_SYSCTL
}
@@ -5579,6 +5952,16 @@ static inline void sched_init_granularity(void)
}
#ifdef CONFIG_SMP
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ if (p->sched_class && p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, new_mask);
+ else {
+ cpumask_copy(&p->cpus_allowed, new_mask);
+ p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+ }
+}
+
/*
* This is how migration works:
*
@@ -5609,52 +5992,38 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
unsigned int dest_cpu;
int ret = 0;
- /*
- * Serialize against TASK_WAKING so that ttwu() and wunt() can
- * drop the rq->lock and still rely on ->cpus_allowed.
- */
-again:
- while (task_is_waking(p))
- cpu_relax();
rq = task_rq_lock(p, &flags);
- if (task_is_waking(p)) {
- task_rq_unlock(rq, &flags);
- goto again;
- }
+
+ if (cpumask_equal(&p->cpus_allowed, new_mask))
+ goto out;
if (!cpumask_intersects(new_mask, cpu_active_mask)) {
ret = -EINVAL;
goto out;
}
- if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
- !cpumask_equal(&p->cpus_allowed, new_mask))) {
+ if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
ret = -EINVAL;
goto out;
}
- if (p->sched_class->set_cpus_allowed)
- p->sched_class->set_cpus_allowed(p, new_mask);
- else {
- cpumask_copy(&p->cpus_allowed, new_mask);
- p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
- }
+ do_set_cpus_allowed(p, new_mask);
/* Can the task run on the task's current CPU? If so, we're done */
if (cpumask_test_cpu(task_cpu(p), new_mask))
goto out;
dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (migrate_task(p, dest_cpu)) {
+ if (p->on_rq) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
tlb_migrate_finish(p->mm);
return 0;
}
out:
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
return ret;
}
@@ -5682,6 +6051,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
rq_src = cpu_rq(src_cpu);
rq_dest = cpu_rq(dest_cpu);
+ raw_spin_lock(&p->pi_lock);
double_rq_lock(rq_src, rq_dest);
/* Already moved. */
if (task_cpu(p) != src_cpu)
@@ -5694,7 +6064,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
* If we're not on a rq, the next wake-up will ensure we're
* placed properly.
*/
- if (p->se.on_rq) {
+ if (p->on_rq) {
deactivate_task(rq_src, p, 0);
set_task_cpu(p, dest_cpu);
activate_task(rq_dest, p, 0);
@@ -5704,6 +6074,7 @@ done:
ret = 1;
fail:
double_rq_unlock(rq_src, rq_dest);
+ raw_spin_unlock(&p->pi_lock);
return ret;
}
@@ -5727,29 +6098,20 @@ static int migration_cpu_stop(void *data)
}
#ifdef CONFIG_HOTPLUG_CPU
+
/*
- * Figure out where task on dead CPU should go, use force if necessary.
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
*/
-void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void idle_task_exit(void)
{
- struct rq *rq = cpu_rq(dead_cpu);
- int needs_cpu, uninitialized_var(dest_cpu);
- unsigned long flags;
+ struct mm_struct *mm = current->active_mm;
- local_irq_save(flags);
+ BUG_ON(cpu_online(smp_processor_id()));
- raw_spin_lock(&rq->lock);
- needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
- if (needs_cpu)
- dest_cpu = select_fallback_rq(dead_cpu, p);
- raw_spin_unlock(&rq->lock);
- /*
- * It can only fail if we race with set_cpus_allowed(),
- * in the racer should migrate the task anyway.
- */
- if (needs_cpu)
- __migrate_task(p, dead_cpu, dest_cpu);
- local_irq_restore(flags);
+ if (mm != &init_mm)
+ switch_mm(mm, &init_mm, current);
+ mmdrop(mm);
}
/*
@@ -5762,128 +6124,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
static void migrate_nr_uninterruptible(struct rq *rq_src)
{
struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
- unsigned long flags;
- local_irq_save(flags);
- double_rq_lock(rq_src, rq_dest);
rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
rq_src->nr_uninterruptible = 0;
- double_rq_unlock(rq_src, rq_dest);
- local_irq_restore(flags);
-}
-
-/* Run through task list and migrate tasks from the dead cpu. */
-static void migrate_live_tasks(int src_cpu)
-{
- struct task_struct *p, *t;
-
- read_lock(&tasklist_lock);
-
- do_each_thread(t, p) {
- if (p == current)
- continue;
-
- if (task_cpu(p) == src_cpu)
- move_task_off_dead_cpu(src_cpu, p);
- } while_each_thread(t, p);
-
- read_unlock(&tasklist_lock);
}
/*
- * Schedules idle task to be the next runnable task on current CPU.
- * It does so by boosting its priority to highest possible.
- * Used by CPU offline code.
+ * Remove the tasks which were accounted by this rq from calc_load_tasks.
*/
-void sched_idle_next(void)
+static void calc_global_load_remove(struct rq *rq)
{
- int this_cpu = smp_processor_id();
- struct rq *rq = cpu_rq(this_cpu);
- struct task_struct *p = rq->idle;
- unsigned long flags;
-
- /* cpu has to be offline */
- BUG_ON(cpu_online(this_cpu));
-
- /*
- * Strictly not necessary since rest of the CPUs are stopped by now
- * and interrupts disabled on the current cpu.
- */
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
-
- activate_task(rq, p, 0);
-
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+ rq->calc_load_active = 0;
}
/*
- * Ensures that the idle task is using init_mm right before its cpu goes
- * offline.
+ * Migrate all tasks from the rq, sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
+ *
+ * Called with rq->lock held even though we're in stop_machine() and
+ * there's no concurrency possible; we hold the required locks anyway
+ * because of lock validation efforts.
*/
-void idle_task_exit(void)
-{
- struct mm_struct *mm = current->active_mm;
-
- BUG_ON(cpu_online(smp_processor_id()));
-
- if (mm != &init_mm)
- switch_mm(mm, &init_mm, current);
- mmdrop(mm);
-}
-
-/* called under rq->lock with disabled interrupts */
-static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
+static void migrate_tasks(unsigned int dead_cpu)
{
struct rq *rq = cpu_rq(dead_cpu);
-
- /* Must be exiting, otherwise would be on tasklist. */
- BUG_ON(!p->exit_state);
-
- /* Cannot have done final schedule yet: would have vanished. */
- BUG_ON(p->state == TASK_DEAD);
-
- get_task_struct(p);
+ struct task_struct *next, *stop = rq->stop;
+ int dest_cpu;
/*
- * Drop lock around migration; if someone else moves it,
- * that's OK. No task can be added to this CPU, so iteration is
- * fine.
+ * Fudge the rq selection such that the below task selection loop
+ * doesn't get stuck on the currently eligible stop task.
+ *
+ * We're currently inside stop_machine() and the rq is either stuck
+	 * in the stop_machine_cpu_stop() loop, or we're executing this code.
+	 * Either way we should never end up calling schedule() until we're
+	 * done here.
*/
- raw_spin_unlock_irq(&rq->lock);
- move_task_off_dead_cpu(dead_cpu, p);
- raw_spin_lock_irq(&rq->lock);
-
- put_task_struct(p);
-}
-
-/* release_task() removes task from tasklist, so we won't find dead tasks. */
-static void migrate_dead_tasks(unsigned int dead_cpu)
-{
- struct rq *rq = cpu_rq(dead_cpu);
- struct task_struct *next;
+ rq->stop = NULL;
for ( ; ; ) {
- if (!rq->nr_running)
+ /*
+		 * This thread is running; bail when it is the only
+		 * remaining thread.
+ */
+ if (rq->nr_running == 1)
break;
+
next = pick_next_task(rq);
- if (!next)
- break;
+ BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
- migrate_dead(dead_cpu, next);
+ /* Find suitable destination for @next, with force if needed. */
+ dest_cpu = select_fallback_rq(dead_cpu, next);
+ raw_spin_unlock(&rq->lock);
+
+ __migrate_task(next, dead_cpu, dest_cpu);
+
+ raw_spin_lock(&rq->lock);
}
-}
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
- */
-static void calc_global_load_remove(struct rq *rq)
-{
- atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
- rq->calc_load_active = 0;
+ rq->stop = stop;
}
+
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6093,15 +6396,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
unsigned long flags;
struct rq *rq = cpu_rq(cpu);
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
rq->calc_load_update = calc_load_update;
break;
case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
@@ -6113,33 +6414,26 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- migrate_live_tasks(cpu);
- /* Idle task back to normal (off runqueue, low prio) */
- raw_spin_lock_irq(&rq->lock);
- deactivate_task(rq, rq->idle, 0);
- __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
- rq->idle->sched_class = &idle_sched_class;
- migrate_dead_tasks(cpu);
- raw_spin_unlock_irq(&rq->lock);
- migrate_nr_uninterruptible(rq);
- BUG_ON(rq->nr_running != 0);
- calc_global_load_remove(rq);
- break;
-
case CPU_DYING:
- case CPU_DYING_FROZEN:
+ sched_ttwu_pending();
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
+ migrate_tasks(cpu);
+ BUG_ON(rq->nr_running != 1); /* the migration thread */
raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ migrate_nr_uninterruptible(rq);
+ calc_global_load_remove(rq);
break;
#endif
}
+
+ update_max_interval();
+
return NOTIFY_OK;
}
@@ -6200,6 +6494,8 @@ early_initcall(migration_init);
#ifdef CONFIG_SMP
+static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
+
#ifdef CONFIG_SCHED_DEBUG
static __read_mostly int sched_domain_debug_enabled;
@@ -6274,7 +6570,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
- if (group->cpu_power != SCHED_LOAD_SCALE) {
+ if (group->cpu_power != SCHED_POWER_SCALE) {
printk(KERN_CONT " (cpu_power = %d)",
group->cpu_power);
}
@@ -6295,7 +6591,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
- cpumask_var_t groupmask;
int level = 0;
if (!sched_domain_debug_enabled)
@@ -6308,20 +6603,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
- if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
- printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
- return;
- }
-
for (;;) {
- if (sched_domain_debug_one(sd, cpu, level, groupmask))
+ if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
break;
level++;
sd = sd->parent;
if (!sd)
break;
}
- free_cpumask_var(groupmask);
}
#else /* !CONFIG_SCHED_DEBUG */
# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6378,12 +6667,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
return 1;
}
-static void free_rootdomain(struct root_domain *rd)
+static void free_rootdomain(struct rcu_head *rcu)
{
- synchronize_sched();
+ struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
cpupri_cleanup(&rd->cpupri);
-
free_cpumask_var(rd->rto_mask);
free_cpumask_var(rd->online);
free_cpumask_var(rd->span);
@@ -6424,7 +6712,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
raw_spin_unlock_irqrestore(&rq->lock, flags);
if (old_rd)
- free_rootdomain(old_rd);
+ call_rcu_sched(&old_rd->rcu, free_rootdomain);
}
static int init_rootdomain(struct root_domain *rd)
@@ -6475,6 +6763,25 @@ static struct root_domain *alloc_rootdomain(void)
return rd;
}
+static void free_sched_domain(struct rcu_head *rcu)
+{
+ struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+ if (atomic_dec_and_test(&sd->groups->ref))
+ kfree(sd->groups);
+ kfree(sd);
+}
+
+static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+{
+ call_rcu(&sd->rcu, free_sched_domain);
+}
+
+static void destroy_sched_domains(struct sched_domain *sd, int cpu)
+{
+ for (; sd; sd = sd->parent)
+ destroy_sched_domain(sd, cpu);
+}
+
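
Freeing domains via call_rcu() means lockless readers that walk rq->sd must do so inside an RCU read-side critical section. A minimal reader-side sketch, assuming the for_each_domain() iterator used elsewhere in this file and a valid cpu:

struct sched_domain *sd;

rcu_read_lock();
for_each_domain(cpu, sd) {
	/*
	 * sd, sd->parent and sd->groups cannot be freed under us here;
	 * a concurrent rebuild detaches them but only reclaims the
	 * memory after a grace period.
	 */
	if (sd->flags & SD_LOAD_BALANCE)
		break;
}
rcu_read_unlock();
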
/*
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
@@ -6485,9 +6792,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
- for (tmp = sd; tmp; tmp = tmp->parent)
- tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
-
/* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) {
struct sched_domain *parent = tmp->parent;
@@ -6498,12 +6802,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
tmp->parent = parent->parent;
if (parent->parent)
parent->parent->child = tmp;
+ destroy_sched_domain(parent, cpu);
} else
tmp = tmp->parent;
}
if (sd && sd_degenerate(sd)) {
+ tmp = sd;
sd = sd->parent;
+ destroy_sched_domain(tmp, cpu);
if (sd)
sd->child = NULL;
}
@@ -6511,7 +6818,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
sched_domain_debug(sd, cpu);
rq_attach_root(rq, rd);
+ tmp = rq->sd;
rcu_assign_pointer(rq->sd, sd);
+ destroy_sched_domains(tmp, cpu);
}
/* cpus with isolated domains */
@@ -6527,56 +6836,6 @@ static int __init isolated_cpu_setup(char *str)
__setup("isolcpus=", isolated_cpu_setup);
-/*
- * init_sched_build_groups takes the cpumask we wish to span, and a pointer
- * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- * (due to the fact that we keep track of groups covered with a struct cpumask).
- *
- * init_sched_build_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_power to 0.
- */
-static void
-init_sched_build_groups(const struct cpumask *span,
- const struct cpumask *cpu_map,
- int (*group_fn)(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg,
- struct cpumask *tmpmask),
- struct cpumask *covered, struct cpumask *tmpmask)
-{
- struct sched_group *first = NULL, *last = NULL;
- int i;
-
- cpumask_clear(covered);
-
- for_each_cpu(i, span) {
- struct sched_group *sg;
- int group = group_fn(i, cpu_map, &sg, tmpmask);
- int j;
-
- if (cpumask_test_cpu(i, covered))
- continue;
-
- cpumask_clear(sched_group_cpus(sg));
- sg->cpu_power = 0;
-
- for_each_cpu(j, span) {
- if (group_fn(j, cpu_map, NULL, tmpmask) != group)
- continue;
-
- cpumask_set_cpu(j, covered);
- cpumask_set_cpu(j, sched_group_cpus(sg));
- }
- if (!first)
- first = sg;
- if (last)
- last->next = sg;
- last = sg;
- }
- last->next = first;
-}
-
#define SD_NODES_PER_DOMAIN 16
#ifdef CONFIG_NUMA
@@ -6593,7 +6852,7 @@ init_sched_build_groups(const struct cpumask *span,
*/
static int find_next_best_node(int node, nodemask_t *used_nodes)
{
- int i, n, val, min_val, best_node = 0;
+ int i, n, val, min_val, best_node = -1;
min_val = INT_MAX;
@@ -6617,7 +6876,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
}
}
- node_set(best_node, *used_nodes);
+ if (best_node != -1)
+ node_set(best_node, *used_nodes);
return best_node;
}
@@ -6643,315 +6903,130 @@ static void sched_domain_node_span(int node, struct cpumask *span)
for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
int next_node = find_next_best_node(node, &used_nodes);
-
+ if (next_node < 0)
+ break;
cpumask_or(span, span, cpumask_of_node(next_node));
}
}
+
+static const struct cpumask *cpu_node_mask(int cpu)
+{
+ lockdep_assert_held(&sched_domains_mutex);
+
+ sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
+
+ return sched_domains_tmpmask;
+}
+
+static const struct cpumask *cpu_allnodes_mask(int cpu)
+{
+ return cpu_possible_mask;
+}
#endif /* CONFIG_NUMA */
-int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
+static const struct cpumask *cpu_cpu_mask(int cpu)
+{
+ return cpumask_of_node(cpu_to_node(cpu));
+}
-/*
- * The cpus mask in sched_group and sched_domain hangs off the end.
- *
- * ( See the the comments in include/linux/sched.h:struct sched_group
- * and struct sched_domain. )
- */
-struct static_sched_group {
- struct sched_group sg;
- DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
-};
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
-struct static_sched_domain {
- struct sched_domain sd;
- DECLARE_BITMAP(span, CONFIG_NR_CPUS);
+struct sd_data {
+ struct sched_domain **__percpu sd;
+ struct sched_group **__percpu sg;
};
struct s_data {
-#ifdef CONFIG_NUMA
- int sd_allnodes;
- cpumask_var_t domainspan;
- cpumask_var_t covered;
- cpumask_var_t notcovered;
-#endif
- cpumask_var_t nodemask;
- cpumask_var_t this_sibling_map;
- cpumask_var_t this_core_map;
- cpumask_var_t this_book_map;
- cpumask_var_t send_covered;
- cpumask_var_t tmpmask;
- struct sched_group **sched_group_nodes;
+ struct sched_domain ** __percpu sd;
struct root_domain *rd;
};
enum s_alloc {
- sa_sched_groups = 0,
sa_rootdomain,
- sa_tmpmask,
- sa_send_covered,
- sa_this_book_map,
- sa_this_core_map,
- sa_this_sibling_map,
- sa_nodemask,
- sa_sched_group_nodes,
-#ifdef CONFIG_NUMA
- sa_notcovered,
- sa_covered,
- sa_domainspan,
-#endif
+ sa_sd,
+ sa_sd_storage,
sa_none,
};
-/*
- * SMT sched-domains:
- */
-#ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
+struct sched_domain_topology_level;
-static int
-cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *unused)
-{
- if (sg)
- *sg = &per_cpu(sched_groups, cpu).sg;
- return cpu;
-}
-#endif /* CONFIG_SCHED_SMT */
+typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
+typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-/*
- * multi-core sched-domains:
- */
-#ifdef CONFIG_SCHED_MC
-static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
-
-static int
-cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *mask)
-{
- int group;
-#ifdef CONFIG_SCHED_SMT
- cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- group = cpumask_first(mask);
-#else
- group = cpu;
-#endif
- if (sg)
- *sg = &per_cpu(sched_group_core, group).sg;
- return group;
-}
-#endif /* CONFIG_SCHED_MC */
+struct sched_domain_topology_level {
+ sched_domain_init_f init;
+ sched_domain_mask_f mask;
+ struct sd_data data;
+};
/*
- * book sched-domains:
+ * Assumes the sched_domain tree is fully constructed
*/
-#ifdef CONFIG_SCHED_BOOK
-static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
-
-static int
-cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *mask)
+static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
{
- int group = cpu;
-#ifdef CONFIG_SCHED_MC
- cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
- group = cpumask_first(mask);
-#elif defined(CONFIG_SCHED_SMT)
- cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- group = cpumask_first(mask);
-#endif
- if (sg)
- *sg = &per_cpu(sched_group_book, group).sg;
- return group;
-}
-#endif /* CONFIG_SCHED_BOOK */
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+ struct sched_domain *child = sd->child;
-static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
+ if (child)
+ cpu = cpumask_first(sched_domain_span(child));
-static int
-cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *mask)
-{
- int group;
-#ifdef CONFIG_SCHED_BOOK
- cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
- group = cpumask_first(mask);
-#elif defined(CONFIG_SCHED_MC)
- cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
- group = cpumask_first(mask);
-#elif defined(CONFIG_SCHED_SMT)
- cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- group = cpumask_first(mask);
-#else
- group = cpu;
-#endif
if (sg)
- *sg = &per_cpu(sched_group_phys, group).sg;
- return group;
+ *sg = *per_cpu_ptr(sdd->sg, cpu);
+
+ return cpu;
}
-#ifdef CONFIG_NUMA
/*
- * The init_sched_build_groups can't handle what we want to do with node
- * groups, so roll our own. Now each node has its own list of groups which
- * gets dynamically allocated.
+ * build_sched_groups takes the cpumask we wish to span and uses get_group()
+ * to identify what group (along with its sched group) each CPU belongs to.
+ * The value returned must be >= 0 and < nr_cpu_ids (because we keep track of
+ * the groups covered with a struct cpumask).
+ *
+ * build_sched_groups will build a circular linked list of the groups
+ * covered by the given span, set each group's ->cpumask correctly, and
+ * initialize ->cpu_power to 0.
*/
-static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
-static struct sched_group ***sched_group_nodes_bycpu;
-
-static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
-
-static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg,
- struct cpumask *nodemask)
-{
- int group;
-
- cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
- group = cpumask_first(nodemask);
-
- if (sg)
- *sg = &per_cpu(sched_group_allnodes, group).sg;
- return group;
-}
-
-static void init_numa_sched_groups_power(struct sched_group *group_head)
-{
- struct sched_group *sg = group_head;
- int j;
-
- if (!sg)
- return;
- do {
- for_each_cpu(j, sched_group_cpus(sg)) {
- struct sched_domain *sd;
-
- sd = &per_cpu(phys_domains, j).sd;
- if (j != group_first_cpu(sd->groups)) {
- /*
- * Only add "power" once for each
- * physical package.
- */
- continue;
- }
-
- sg->cpu_power += sd->groups->cpu_power;
- }
- sg = sg->next;
- } while (sg != group_head);
-}
-
-static int build_numa_sched_groups(struct s_data *d,
- const struct cpumask *cpu_map, int num)
+static void
+build_sched_groups(struct sched_domain *sd)
{
- struct sched_domain *sd;
- struct sched_group *sg, *prev;
- int n, j;
-
- cpumask_clear(d->covered);
- cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
- if (cpumask_empty(d->nodemask)) {
- d->sched_group_nodes[num] = NULL;
- goto out;
- }
-
- sched_domain_node_span(num, d->domainspan);
- cpumask_and(d->domainspan, d->domainspan, cpu_map);
-
- sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, num);
- if (!sg) {
- printk(KERN_WARNING "Can not alloc domain group for node %d\n",
- num);
- return -ENOMEM;
- }
- d->sched_group_nodes[num] = sg;
-
- for_each_cpu(j, d->nodemask) {
- sd = &per_cpu(node_domains, j).sd;
- sd->groups = sg;
- }
-
- sg->cpu_power = 0;
- cpumask_copy(sched_group_cpus(sg), d->nodemask);
- sg->next = sg;
- cpumask_or(d->covered, d->covered, d->nodemask);
+ struct sched_group *first = NULL, *last = NULL;
+ struct sd_data *sdd = sd->private;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered;
+ int i;
- prev = sg;
- for (j = 0; j < nr_node_ids; j++) {
- n = (num + j) % nr_node_ids;
- cpumask_complement(d->notcovered, d->covered);
- cpumask_and(d->tmpmask, d->notcovered, cpu_map);
- cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
- if (cpumask_empty(d->tmpmask))
- break;
- cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
- if (cpumask_empty(d->tmpmask))
- continue;
- sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, num);
- if (!sg) {
- printk(KERN_WARNING
- "Can not alloc domain group for node %d\n", j);
- return -ENOMEM;
- }
- sg->cpu_power = 0;
- cpumask_copy(sched_group_cpus(sg), d->tmpmask);
- sg->next = prev->next;
- cpumask_or(d->covered, d->covered, d->tmpmask);
- prev->next = sg;
- prev = sg;
- }
-out:
- return 0;
-}
-#endif /* CONFIG_NUMA */
+ lockdep_assert_held(&sched_domains_mutex);
+ covered = sched_domains_tmpmask;
-#ifdef CONFIG_NUMA
-/* Free memory allocated for various sched_group structures */
-static void free_sched_groups(const struct cpumask *cpu_map,
- struct cpumask *nodemask)
-{
- int cpu, i;
+ cpumask_clear(covered);
- for_each_cpu(cpu, cpu_map) {
- struct sched_group **sched_group_nodes
- = sched_group_nodes_bycpu[cpu];
+ for_each_cpu(i, span) {
+ struct sched_group *sg;
+ int group = get_group(i, sdd, &sg);
+ int j;
- if (!sched_group_nodes)
+ if (cpumask_test_cpu(i, covered))
continue;
- for (i = 0; i < nr_node_ids; i++) {
- struct sched_group *oldsg, *sg = sched_group_nodes[i];
+ cpumask_clear(sched_group_cpus(sg));
+ sg->cpu_power = 0;
- cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
- if (cpumask_empty(nodemask))
+ for_each_cpu(j, span) {
+ if (get_group(j, sdd, NULL) != group)
continue;
- if (sg == NULL)
- continue;
- sg = sg->next;
-next_sg:
- oldsg = sg;
- sg = sg->next;
- kfree(oldsg);
- if (oldsg != sched_group_nodes[i])
- goto next_sg;
+ cpumask_set_cpu(j, covered);
+ cpumask_set_cpu(j, sched_group_cpus(sg));
}
- kfree(sched_group_nodes);
- sched_group_nodes_bycpu[cpu] = NULL;
+
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
}
+ last->next = first;
}
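
A worked example under an assumed topology (illustration only, not taken from the patch): for the MC-level domain of one package with two cores of two SMT threads each, the span is {0,1,2,3} and the SMT children span {0,1} and {2,3}, so:

/*
 * get_group(0) == get_group(1) == 0  ->  sg0 spanning {0,1}
 * get_group(2) == get_group(3) == 2  ->  sg2 spanning {2,3}
 *
 * The loop then links sg0->next = sg2 and sg2->next = sg0 (circular),
 * leaving each sg->cpu_power at 0 to be filled in later by
 * init_sched_groups_power()/update_group_power().
 */
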
-#else /* !CONFIG_NUMA */
-static void free_sched_groups(const struct cpumask *cpu_map,
- struct cpumask *nodemask)
-{
-}
-#endif /* CONFIG_NUMA */
/*
* Initialize sched groups cpu_power.
@@ -6965,11 +7040,6 @@ static void free_sched_groups(const struct cpumask *cpu_map,
*/
static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{
- struct sched_domain *child;
- struct sched_group *group;
- long power;
- int weight;
-
WARN_ON(!sd || !sd->groups);
if (cpu != group_first_cpu(sd->groups))
@@ -6977,36 +7047,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
- child = sd->child;
-
- sd->groups->cpu_power = 0;
-
- if (!child) {
- power = SCHED_LOAD_SCALE;
- weight = cpumask_weight(sched_domain_span(sd));
- /*
- * SMT siblings share the power of a single core.
- * Usually multiple threads get a better yield out of
- * that one core than a single thread would have,
- * reflect that in sd->smt_gain.
- */
- if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
- power *= sd->smt_gain;
- power /= weight;
- power >>= SCHED_LOAD_SHIFT;
- }
- sd->groups->cpu_power += power;
- return;
- }
-
- /*
- * Add cpu_power of each child group to this groups cpu_power.
- */
- group = child->groups;
- do {
- sd->groups->cpu_power += group->cpu_power;
- group = group->next;
- } while (group != child->groups);
+ update_group_power(sd, cpu);
}
/*
@@ -7020,15 +7061,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
# define SD_INIT_NAME(sd, type) do { } while (0)
#endif
-#define SD_INIT(sd, type) sd_init_##type(sd)
-
-#define SD_INIT_FUNC(type) \
-static noinline void sd_init_##type(struct sched_domain *sd) \
-{ \
- memset(sd, 0, sizeof(*sd)); \
- *sd = SD_##type##_INIT; \
- sd->level = SD_LV_##type; \
- SD_INIT_NAME(sd, type); \
+#define SD_INIT_FUNC(type) \
+static noinline struct sched_domain * \
+sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
+{ \
+ struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
+ *sd = SD_##type##_INIT; \
+ SD_INIT_NAME(sd, type); \
+ sd->private = &tl->data; \
+ return sd; \
}
SD_INIT_FUNC(CPU)
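
For reference, the generator above expands each topology level into a tiny constructor; the SD_INIT_FUNC(CPU) expansion looks like this (SD_INIT_NAME() is a no-op unless CONFIG_SCHED_DEBUG is set):

static noinline struct sched_domain *
sd_init_CPU(struct sched_domain_topology_level *tl, int cpu)
{
	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);

	*sd = SD_CPU_INIT;		/* per-level default flags and tunables */
	SD_INIT_NAME(sd, CPU);		/* sets sd->name = "CPU" when debugging */
	sd->private = &tl->data;
	return sd;
}
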
@@ -7047,13 +7088,14 @@ SD_INIT_FUNC(CPU)
#endif
static int default_relax_domain_level = -1;
+int sched_domain_level_max;
static int __init setup_relax_domain_level(char *str)
{
unsigned long val;
val = simple_strtoul(str, NULL, 0);
- if (val < SD_LV_MAX)
+ if (val < sched_domain_level_max)
default_relax_domain_level = val;
return 1;
@@ -7081,37 +7123,20 @@ static void set_domain_attribute(struct sched_domain *sd,
}
}
+static void __sdt_free(const struct cpumask *cpu_map);
+static int __sdt_alloc(const struct cpumask *cpu_map);
+
static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
const struct cpumask *cpu_map)
{
switch (what) {
- case sa_sched_groups:
- free_sched_groups(cpu_map, d->tmpmask); /* fall through */
- d->sched_group_nodes = NULL;
case sa_rootdomain:
- free_rootdomain(d->rd); /* fall through */
- case sa_tmpmask:
- free_cpumask_var(d->tmpmask); /* fall through */
- case sa_send_covered:
- free_cpumask_var(d->send_covered); /* fall through */
- case sa_this_book_map:
- free_cpumask_var(d->this_book_map); /* fall through */
- case sa_this_core_map:
- free_cpumask_var(d->this_core_map); /* fall through */
- case sa_this_sibling_map:
- free_cpumask_var(d->this_sibling_map); /* fall through */
- case sa_nodemask:
- free_cpumask_var(d->nodemask); /* fall through */
- case sa_sched_group_nodes:
-#ifdef CONFIG_NUMA
- kfree(d->sched_group_nodes); /* fall through */
- case sa_notcovered:
- free_cpumask_var(d->notcovered); /* fall through */
- case sa_covered:
- free_cpumask_var(d->covered); /* fall through */
- case sa_domainspan:
- free_cpumask_var(d->domainspan); /* fall through */
-#endif
+ if (!atomic_read(&d->rd->refcount))
+ free_rootdomain(&d->rd->rcu); /* fall through */
+ case sa_sd:
+ free_percpu(d->sd); /* fall through */
+ case sa_sd_storage:
+ __sdt_free(cpu_map); /* fall through */
case sa_none:
break;
}
@@ -7120,308 +7145,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
const struct cpumask *cpu_map)
{
-#ifdef CONFIG_NUMA
- if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
- return sa_none;
- if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
- return sa_domainspan;
- if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
- return sa_covered;
- /* Allocate the per-node list of sched groups */
- d->sched_group_nodes = kcalloc(nr_node_ids,
- sizeof(struct sched_group *), GFP_KERNEL);
- if (!d->sched_group_nodes) {
- printk(KERN_WARNING "Can not alloc sched group node list\n");
- return sa_notcovered;
- }
- sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
-#endif
- if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
- return sa_sched_group_nodes;
- if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
- return sa_nodemask;
- if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
- return sa_this_sibling_map;
- if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
- return sa_this_core_map;
- if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
- return sa_this_book_map;
- if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
- return sa_send_covered;
+ memset(d, 0, sizeof(*d));
+
+ if (__sdt_alloc(cpu_map))
+ return sa_sd_storage;
+ d->sd = alloc_percpu(struct sched_domain *);
+ if (!d->sd)
+ return sa_sd_storage;
d->rd = alloc_rootdomain();
- if (!d->rd) {
- printk(KERN_WARNING "Cannot alloc root domain\n");
- return sa_tmpmask;
- }
+ if (!d->rd)
+ return sa_sd;
return sa_rootdomain;
}
-static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
+/*
+ * NULL the sd_data elements we've used to build the sched_domain and
+ * sched_group structure so that the subsequent __free_domain_allocs()
+ * will not free the data we're using.
+ */
+static void claim_allocations(int cpu, struct sched_domain *sd)
{
- struct sched_domain *sd = NULL;
-#ifdef CONFIG_NUMA
- struct sched_domain *parent;
-
- d->sd_allnodes = 0;
- if (cpumask_weight(cpu_map) >
- SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
- sd = &per_cpu(allnodes_domains, i).sd;
- SD_INIT(sd, ALLNODES);
- set_domain_attribute(sd, attr);
- cpumask_copy(sched_domain_span(sd), cpu_map);
- cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
- d->sd_allnodes = 1;
- }
- parent = sd;
-
- sd = &per_cpu(node_domains, i).sd;
- SD_INIT(sd, NODE);
- set_domain_attribute(sd, attr);
- sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
- sd->parent = parent;
- if (parent)
- parent->child = sd;
- cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
-#endif
- return sd;
-}
+ struct sd_data *sdd = sd->private;
+ struct sched_group *sg = sd->groups;
-static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *parent, int i)
-{
- struct sched_domain *sd;
- sd = &per_cpu(phys_domains, i).sd;
- SD_INIT(sd, CPU);
- set_domain_attribute(sd, attr);
- cpumask_copy(sched_domain_span(sd), d->nodemask);
- sd->parent = parent;
- if (parent)
- parent->child = sd;
- cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
- return sd;
-}
+ WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+ *per_cpu_ptr(sdd->sd, cpu) = NULL;
-static struct sched_domain *__build_book_sched_domain(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *parent, int i)
-{
- struct sched_domain *sd = parent;
-#ifdef CONFIG_SCHED_BOOK
- sd = &per_cpu(book_domains, i).sd;
- SD_INIT(sd, BOOK);
- set_domain_attribute(sd, attr);
- cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
- sd->parent = parent;
- parent->child = sd;
- cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
-#endif
- return sd;
+ if (cpu == cpumask_first(sched_group_cpus(sg))) {
+ WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
+ *per_cpu_ptr(sdd->sg, cpu) = NULL;
+ }
}
-static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *parent, int i)
+#ifdef CONFIG_SCHED_SMT
+static const struct cpumask *cpu_smt_mask(int cpu)
{
- struct sched_domain *sd = parent;
-#ifdef CONFIG_SCHED_MC
- sd = &per_cpu(core_domains, i).sd;
- SD_INIT(sd, MC);
- set_domain_attribute(sd, attr);
- cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
- sd->parent = parent;
- parent->child = sd;
- cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
-#endif
- return sd;
+ return topology_thread_cpumask(cpu);
}
-
-static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *parent, int i)
-{
- struct sched_domain *sd = parent;
-#ifdef CONFIG_SCHED_SMT
- sd = &per_cpu(cpu_domains, i).sd;
- SD_INIT(sd, SIBLING);
- set_domain_attribute(sd, attr);
- cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
- sd->parent = parent;
- parent->child = sd;
- cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
#endif
- return sd;
-}
-static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
- const struct cpumask *cpu_map, int cpu)
-{
- switch (l) {
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
- case SD_LV_SIBLING: /* set up CPU (sibling) groups */
- cpumask_and(d->this_sibling_map, cpu_map,
- topology_thread_cpumask(cpu));
- if (cpu == cpumask_first(d->this_sibling_map))
- init_sched_build_groups(d->this_sibling_map, cpu_map,
- &cpu_to_cpu_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_SIBLING, cpu_smt_mask, },
#endif
#ifdef CONFIG_SCHED_MC
- case SD_LV_MC: /* set up multi-core groups */
- cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
- if (cpu == cpumask_first(d->this_core_map))
- init_sched_build_groups(d->this_core_map, cpu_map,
- &cpu_to_core_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_MC, cpu_coregroup_mask, },
#endif
#ifdef CONFIG_SCHED_BOOK
- case SD_LV_BOOK: /* set up book groups */
- cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
- if (cpu == cpumask_first(d->this_book_map))
- init_sched_build_groups(d->this_book_map, cpu_map,
- &cpu_to_book_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_BOOK, cpu_book_mask, },
#endif
- case SD_LV_CPU: /* set up physical groups */
- cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
- if (!cpumask_empty(d->nodemask))
- init_sched_build_groups(d->nodemask, cpu_map,
- &cpu_to_phys_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_CPU, cpu_cpu_mask, },
#ifdef CONFIG_NUMA
- case SD_LV_ALLNODES:
- init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_NODE, cpu_node_mask, },
+ { sd_init_ALLNODES, cpu_allnodes_mask, },
#endif
- default:
- break;
+ { NULL, },
+};
+
+static struct sched_domain_topology_level *sched_domain_topology = default_topology;
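
This table is the single description of the domain hierarchy, which is what makes it easy to vary per configuration. A hypothetical alternative table (illustration only; sched_domain_topology is static in this file, so actually switching to it would need extra plumbing) keeps the same init/mask pairing and NULL terminator:

static struct sched_domain_topology_level small_box_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ sd_init_SIBLING, cpu_smt_mask, },	/* hardware threads */
#endif
	{ sd_init_CPU, cpu_cpu_mask, },		/* all CPUs of the node */
	{ NULL, },				/* terminator: tl->init == NULL */
};
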
+
+static int __sdt_alloc(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for (tl = sched_domain_topology; tl->init; tl++) {
+ struct sd_data *sdd = &tl->data;
+
+ sdd->sd = alloc_percpu(struct sched_domain *);
+ if (!sdd->sd)
+ return -ENOMEM;
+
+ sdd->sg = alloc_percpu(struct sched_group *);
+ if (!sdd->sg)
+ return -ENOMEM;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sd)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sd, j) = sd;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sg)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sg, j) = sg;
+ }
+ }
+
+ return 0;
+}
+
+static void __sdt_free(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for (tl = sched_domain_topology; tl->init; tl++) {
+ struct sd_data *sdd = &tl->data;
+
+ for_each_cpu(j, cpu_map) {
+ kfree(*per_cpu_ptr(sdd->sd, j));
+ kfree(*per_cpu_ptr(sdd->sg, j));
+ }
+ free_percpu(sdd->sd);
+ free_percpu(sdd->sg);
}
}
+struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+ struct s_data *d, const struct cpumask *cpu_map,
+ struct sched_domain_attr *attr, struct sched_domain *child,
+ int cpu)
+{
+ struct sched_domain *sd = tl->init(tl, cpu);
+ if (!sd)
+ return child;
+
+ set_domain_attribute(sd, attr);
+ cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+ if (child) {
+ sd->level = child->level + 1;
+ sched_domain_level_max = max(sched_domain_level_max, sd->level);
+ child->parent = sd;
+ }
+ sd->child = child;
+
+ return sd;
+}
+
/*
* Build sched domains for a given set of cpus and attach the sched domains
* to the individual cpus
*/
-static int __build_sched_domains(const struct cpumask *cpu_map,
- struct sched_domain_attr *attr)
+static int build_sched_domains(const struct cpumask *cpu_map,
+ struct sched_domain_attr *attr)
{
enum s_alloc alloc_state = sa_none;
- struct s_data d;
struct sched_domain *sd;
- int i;
-#ifdef CONFIG_NUMA
- d.sd_allnodes = 0;
-#endif
+ struct s_data d;
+ int i, ret = -ENOMEM;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
if (alloc_state != sa_rootdomain)
goto error;
- alloc_state = sa_sched_groups;
- /*
- * Set up domains for cpus specified by the cpu_map.
- */
+ /* Set up domains for cpus specified by the cpu_map. */
for_each_cpu(i, cpu_map) {
- cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
- cpu_map);
+ struct sched_domain_topology_level *tl;
- sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
- sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
- sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
- sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
- sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
- }
+ sd = NULL;
+ for (tl = sched_domain_topology; tl->init; tl++)
+ sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
- for_each_cpu(i, cpu_map) {
- build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
- build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
- build_sched_groups(&d, SD_LV_MC, cpu_map, i);
- }
-
- /* Set up physical groups */
- for (i = 0; i < nr_node_ids; i++)
- build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
+ while (sd->child)
+ sd = sd->child;
-#ifdef CONFIG_NUMA
- /* Set up node groups */
- if (d.sd_allnodes)
- build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
-
- for (i = 0; i < nr_node_ids; i++)
- if (build_numa_sched_groups(&d, cpu_map, i))
- goto error;
-#endif
-
- /* Calculate CPU power for physical packages and nodes */
-#ifdef CONFIG_SCHED_SMT
- for_each_cpu(i, cpu_map) {
- sd = &per_cpu(cpu_domains, i).sd;
- init_sched_groups_power(i, sd);
- }
-#endif
-#ifdef CONFIG_SCHED_MC
- for_each_cpu(i, cpu_map) {
- sd = &per_cpu(core_domains, i).sd;
- init_sched_groups_power(i, sd);
+ *per_cpu_ptr(d.sd, i) = sd;
}
-#endif
-#ifdef CONFIG_SCHED_BOOK
- for_each_cpu(i, cpu_map) {
- sd = &per_cpu(book_domains, i).sd;
- init_sched_groups_power(i, sd);
- }
-#endif
+ /* Build the groups for the domains */
for_each_cpu(i, cpu_map) {
- sd = &per_cpu(phys_domains, i).sd;
- init_sched_groups_power(i, sd);
- }
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ sd->span_weight = cpumask_weight(sched_domain_span(sd));
+ get_group(i, sd->private, &sd->groups);
+ atomic_inc(&sd->groups->ref);
-#ifdef CONFIG_NUMA
- for (i = 0; i < nr_node_ids; i++)
- init_numa_sched_groups_power(d.sched_group_nodes[i]);
+ if (i != cpumask_first(sched_domain_span(sd)))
+ continue;
- if (d.sd_allnodes) {
- struct sched_group *sg;
+ build_sched_groups(sd);
+ }
+ }
- cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
- d.tmpmask);
- init_numa_sched_groups_power(sg);
+ /* Calculate CPU power for physical packages and nodes */
+ for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ if (!cpumask_test_cpu(i, cpu_map))
+ continue;
+
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ claim_allocations(i, sd);
+ init_sched_groups_power(i, sd);
+ }
}
-#endif
/* Attach the domains */
+ rcu_read_lock();
for_each_cpu(i, cpu_map) {
-#ifdef CONFIG_SCHED_SMT
- sd = &per_cpu(cpu_domains, i).sd;
-#elif defined(CONFIG_SCHED_MC)
- sd = &per_cpu(core_domains, i).sd;
-#elif defined(CONFIG_SCHED_BOOK)
- sd = &per_cpu(book_domains, i).sd;
-#else
- sd = &per_cpu(phys_domains, i).sd;
-#endif
+ sd = *per_cpu_ptr(d.sd, i);
cpu_attach_domain(sd, d.rd, i);
}
+ rcu_read_unlock();
- d.sched_group_nodes = NULL; /* don't free this we still need it */
- __free_domain_allocs(&d, sa_tmpmask, cpu_map);
- return 0;
-
+ ret = 0;
error:
__free_domain_allocs(&d, alloc_state, cpu_map);
- return -ENOMEM;
-}
-
-static int build_sched_domains(const struct cpumask *cpu_map)
-{
- return __build_sched_domains(cpu_map, NULL);
+ return ret;
}
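The rewritten build_sched_domains() funnels failures through the error label and lets __free_domain_allocs() unwind based on how far allocation got. Below is a minimal userspace sketch of that allocation-state idiom with made-up state names; unlike the kernel, the sketch simply returns early on success instead of reusing the label (the kernel can reuse it because claim_allocations() has already taken ownership of the pieces that must survive).

#include <stdio.h>
#include <stdlib.h>

enum alloc_state { SA_NONE, SA_BUF_A, SA_ALL };

struct ctx {
	int *a;
	int *b;
};

/* unwind exactly as much as was set up, newest allocation first */
static void free_allocs(struct ctx *c, enum alloc_state state)
{
	switch (state) {
	case SA_ALL:
		free(c->b);	/* fall through */
	case SA_BUF_A:
		free(c->a);	/* fall through */
	case SA_NONE:
		break;
	}
}

static int setup(struct ctx *c)
{
	enum alloc_state state = SA_NONE;

	c->a = malloc(64);
	if (!c->a)
		goto error;
	state = SA_BUF_A;

	c->b = malloc(64);
	if (!c->b)
		goto error;
	state = SA_ALL;

	return 0;
error:
	free_allocs(c, state);
	return -1;
}

int main(void)
{
	struct ctx c = { 0 };

	if (setup(&c) == 0) {
		printf("setup ok\n");
		free_allocs(&c, SA_ALL);	/* normal teardown */
	}
	return 0;
}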
static cpumask_var_t *doms_cur; /* current sched domains */
@@ -7476,7 +7405,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
* For now this just excludes isolated cpus, but could be used to
* exclude other special cases in the future.
*/
-static int arch_init_sched_domains(const struct cpumask *cpu_map)
+static int init_sched_domains(const struct cpumask *cpu_map)
{
int err;
@@ -7487,32 +7416,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
doms_cur = &fallback_doms;
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
dattr_cur = NULL;
- err = build_sched_domains(doms_cur[0]);
+ err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();
return err;
}
-static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
- struct cpumask *tmpmask)
-{
- free_sched_groups(cpu_map, tmpmask);
-}
-
/*
* Detach sched domains from a group of cpus specified in cpu_map
* These cpus will now be attached to the NULL domain
*/
static void detach_destroy_domains(const struct cpumask *cpu_map)
{
- /* Save because hotplug lock held. */
- static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
int i;
+ rcu_read_lock();
for_each_cpu(i, cpu_map)
cpu_attach_domain(NULL, &def_root_domain, i);
- synchronize_sched();
- arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
+ rcu_read_unlock();
}
/* handle null as "default" */
@@ -7601,8 +7522,7 @@ match1:
goto match2;
}
/* no match - add a new doms_new */
- __build_sched_domains(doms_new[i],
- dattr_new ? dattr_new + i : NULL);
+ build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
match2:
;
}
@@ -7621,7 +7541,7 @@ match2:
}
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static void arch_reinit_sched_domains(void)
+static void reinit_sched_domains(void)
{
get_online_cpus();
@@ -7654,7 +7574,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
else
sched_mc_power_savings = level;
- arch_reinit_sched_domains();
+ reinit_sched_domains();
return count;
}
@@ -7773,14 +7693,9 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
-#if defined(CONFIG_NUMA)
- sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
- GFP_KERNEL);
- BUG_ON(sched_group_nodes_bycpu == NULL);
-#endif
get_online_cpus();
mutex_lock(&sched_domains_mutex);
- arch_init_sched_domains(cpu_active_mask);
+ init_sched_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -7825,6 +7740,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
INIT_LIST_HEAD(&cfs_rq->tasks);
#ifdef CONFIG_FAIR_GROUP_SCHED
cfs_rq->rq = rq;
+ /* allow initial update_cfs_load() to truncate */
+#ifdef CONFIG_SMP
+ cfs_rq->load_stamp = 1;
+#endif
#endif
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
}
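The initial value (u64)(-(1LL << 20)) parks min_vruntime just below the 64-bit wrap point, presumably so that wraparound problems surface early; ordering stays correct because vruntime comparisons are done on the signed difference. A tiny standalone illustration (not kernel code):

#include <stdio.h>
#include <stdint.h>

/* ordering is decided on the signed difference, so it survives u64 wrap */
static int vruntime_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t min_vruntime = (uint64_t)(-(1LL << 20));	/* just below the wrap */
	uint64_t later = min_vruntime + (1ULL << 21);		/* past the wrap */

	printf("%d\n", vruntime_before(min_vruntime, later));	/* prints 1 */
	return 0;
}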
@@ -7867,18 +7786,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
#ifdef CONFIG_FAIR_GROUP_SCHED
static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
- struct sched_entity *se, int cpu, int add,
+ struct sched_entity *se, int cpu,
struct sched_entity *parent)
{
struct rq *rq = cpu_rq(cpu);
tg->cfs_rq[cpu] = cfs_rq;
init_cfs_rq(cfs_rq, rq);
cfs_rq->tg = tg;
- if (add)
- list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
tg->se[cpu] = se;
- /* se could be NULL for init_task_group */
+ /* se could be NULL for root_task_group */
if (!se)
return;
@@ -7888,15 +7805,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
se->cfs_rq = parent->my_q;
se->my_q = cfs_rq;
- se->load.weight = tg->shares;
- se->load.inv_weight = 0;
+ update_load_set(&se->load, 0);
se->parent = parent;
}
#endif
#ifdef CONFIG_RT_GROUP_SCHED
static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
- struct sched_rt_entity *rt_se, int cpu, int add,
+ struct sched_rt_entity *rt_se, int cpu,
struct sched_rt_entity *parent)
{
struct rq *rq = cpu_rq(cpu);
@@ -7905,8 +7821,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
init_rt_rq(rt_rq, rq);
rt_rq->tg = tg;
rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
- if (add)
- list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
tg->rt_se[cpu] = rt_se;
if (!rt_se)
@@ -7941,18 +7855,18 @@ void __init sched_init(void)
ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
#ifdef CONFIG_FAIR_GROUP_SCHED
- init_task_group.se = (struct sched_entity **)ptr;
+ root_task_group.se = (struct sched_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
- init_task_group.cfs_rq = (struct cfs_rq **)ptr;
+ root_task_group.cfs_rq = (struct cfs_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
- init_task_group.rt_se = (struct sched_rt_entity **)ptr;
+ root_task_group.rt_se = (struct sched_rt_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
- init_task_group.rt_rq = (struct rt_rq **)ptr;
+ root_task_group.rt_rq = (struct rt_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_RT_GROUP_SCHED */
@@ -7972,20 +7886,16 @@ void __init sched_init(void)
global_rt_period(), global_rt_runtime());
#ifdef CONFIG_RT_GROUP_SCHED
- init_rt_bandwidth(&init_task_group.rt_bandwidth,
+ init_rt_bandwidth(&root_task_group.rt_bandwidth,
global_rt_period(), global_rt_runtime());
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
- list_add(&init_task_group.list, &task_groups);
- INIT_LIST_HEAD(&init_task_group.children);
-
+ list_add(&root_task_group.list, &task_groups);
+ INIT_LIST_HEAD(&root_task_group.children);
+ autogroup_init(&init_task);
#endif /* CONFIG_CGROUP_SCHED */
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
- update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
- __alignof__(unsigned long));
-#endif
for_each_possible_cpu(i) {
struct rq *rq;
@@ -7997,38 +7907,34 @@ void __init sched_init(void)
init_cfs_rq(&rq->cfs, rq);
init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
- init_task_group.shares = init_task_group_load;
+ root_task_group.shares = root_task_group_load;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
/*
- * How much cpu bandwidth does init_task_group get?
+ * How much cpu bandwidth does root_task_group get?
*
* In case of task-groups formed through the cgroup filesystem, it
* gets 100% of the cpu resources in the system. This overall
* system cpu resource is divided among the tasks of
- * init_task_group and its child task-groups in a fair manner,
+ * root_task_group and its child task-groups in a fair manner,
* based on each entity's (task or task-group's) weight
* (se->load.weight).
*
- * In other words, if init_task_group has 10 tasks of weight
+ * In other words, if root_task_group has 10 tasks (each of weight
* 1024) and two child groups A0 and A1 (of weight 1024 each),
* then A0's share of the cpu resource is:
*
* A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
*
- * We achieve this by letting init_task_group's tasks sit
- * directly in rq->cfs (i.e init_task_group->se[] = NULL).
+ * We achieve this by letting root_task_group's tasks sit
+ * directly in rq->cfs (i.e root_task_group->se[] = NULL).
*/
- init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
-#endif
+ init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
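The 8.33% in the comment above is simply A0's weight divided by the total weight at that level; a quick arithmetic check (plain C, not kernel code):

#include <stdio.h>

int main(void)
{
	/* 10 tasks of weight 1024 each, plus groups A0 and A1 of weight 1024 */
	double total = 10 * 1024 + 1024 + 1024;

	printf("A0 bandwidth = %.2f%%\n", 100.0 * 1024 / total);	/* 8.33% */
	return 0;
}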
rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
- init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
-#endif
+ init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8039,7 +7945,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_power = SCHED_LOAD_SCALE;
+ rq->cpu_power = SCHED_POWER_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
@@ -8096,6 +8002,7 @@ void __init sched_init(void)
/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
+ zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
#ifdef CONFIG_NO_HZ
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -8108,8 +8015,6 @@ void __init sched_init(void)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
#endif /* SMP */
- perf_event_init();
-
scheduler_running = 1;
}
@@ -8118,7 +8023,7 @@ static inline int preempt_count_equals(int preempt_offset)
{
int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
- return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+ return (nested == preempt_offset);
}
void __might_sleep(const char *file, int line, int preempt_offset)
@@ -8153,9 +8058,11 @@ EXPORT_SYMBOL(__might_sleep);
#ifdef CONFIG_MAGIC_SYSRQ
static void normalize_task(struct rq *rq, struct task_struct *p)
{
+ const struct sched_class *prev_class = p->sched_class;
+ int old_prio = p->prio;
int on_rq;
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
if (on_rq)
deactivate_task(rq, p, 0);
__setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8163,6 +8070,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
activate_task(rq, p, 0);
resched_task(rq->curr);
}
+
+ check_class_changed(rq, p, prev_class, old_prio);
}
void normalize_rt_tasks(void)
@@ -8278,7 +8187,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se;
- struct rq *rq;
int i;
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8291,8 +8199,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
tg->shares = NICE_0_LOAD;
for_each_possible_cpu(i) {
- rq = cpu_rq(i);
-
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
GFP_KERNEL, cpu_to_node(i));
if (!cfs_rq)
@@ -8303,7 +8209,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
if (!se)
goto err_free_rq;
- init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
+ init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
}
return 1;
@@ -8314,15 +8220,21 @@ err:
return 0;
}
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
- list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
- &cpu_rq(cpu)->leaf_cfs_rq_list);
-}
-
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
- list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ /*
+ * Only empty task groups can be destroyed, so we can speculatively
+ * check on_list without danger of it being re-added.
+ */
+ if (!tg->cfs_rq[cpu]->on_list)
+ return;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
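The unregister path above peeks at on_list without rq->lock (safe here because only empty groups are destroyed, so the flag cannot flip back to set) and only takes the lock when there is really something to unlink. A userspace sketch of that unlocked-check-then-locked-removal shape, with hypothetical names and a pthread mutex standing in for rq->lock:

#include <pthread.h>
#include <stdio.h>

struct node {
	int on_list;			/* set while linked */
	struct node *prev, *next;
};

static struct node head = { 0, &head, &head };
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void node_del(struct node *n)
{
	/*
	 * Cheap unlocked check: once the node has been removed it is never
	 * re-added, so a clear flag is stable and we can skip the lock.
	 */
	if (!n->on_list)
		return;

	pthread_mutex_lock(&list_lock);
	n->prev->next = n->next;
	n->next->prev = n->prev;
	n->on_list = 0;
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	struct node n = { 1, &head, &head };

	head.next = head.prev = &n;	/* link n into the list */
	node_del(&n);
	node_del(&n);			/* second call is a cheap no-op */
	printf("on_list = %d, list empty = %d\n", n.on_list, head.next == &head);
	return 0;
}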
#else /* !CONFIG_FAIR_GROUP_SCHED */
static inline void free_fair_sched_group(struct task_group *tg)
@@ -8335,10 +8247,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
return 1;
}
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-}
-
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
}
@@ -8367,7 +8275,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
struct rt_rq *rt_rq;
struct sched_rt_entity *rt_se;
- struct rq *rq;
int i;
tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8381,8 +8288,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
ktime_to_ns(def_rt_bandwidth.rt_period), 0);
for_each_possible_cpu(i) {
- rq = cpu_rq(i);
-
rt_rq = kzalloc_node(sizeof(struct rt_rq),
GFP_KERNEL, cpu_to_node(i));
if (!rt_rq)
@@ -8393,7 +8298,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
if (!rt_se)
goto err_free_rq;
- init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
+ init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
}
return 1;
@@ -8403,17 +8308,6 @@ err_free_rq:
err:
return 0;
}
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
- list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
- &cpu_rq(cpu)->leaf_rt_rq_list);
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
- list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
-}
#else /* !CONFIG_RT_GROUP_SCHED */
static inline void free_rt_sched_group(struct task_group *tg)
{
@@ -8424,14 +8318,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
return 1;
}
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
@@ -8439,6 +8325,7 @@ static void free_sched_group(struct task_group *tg)
{
free_fair_sched_group(tg);
free_rt_sched_group(tg);
+ autogroup_free(tg);
kfree(tg);
}
@@ -8447,7 +8334,6 @@ struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
unsigned long flags;
- int i;
tg = kzalloc(sizeof(*tg), GFP_KERNEL);
if (!tg)
@@ -8460,10 +8346,6 @@ struct task_group *sched_create_group(struct task_group *parent)
goto err;
spin_lock_irqsave(&task_group_lock, flags);
- for_each_possible_cpu(i) {
- register_fair_sched_group(tg, i);
- register_rt_sched_group(tg, i);
- }
list_add_rcu(&tg->list, &task_groups);
WARN_ON(!parent); /* root should already exist */
@@ -8493,11 +8375,11 @@ void sched_destroy_group(struct task_group *tg)
unsigned long flags;
int i;
- spin_lock_irqsave(&task_group_lock, flags);
- for_each_possible_cpu(i) {
+ /* end participation in shares distribution */
+ for_each_possible_cpu(i)
unregister_fair_sched_group(tg, i);
- unregister_rt_sched_group(tg, i);
- }
+
+ spin_lock_irqsave(&task_group_lock, flags);
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8520,7 +8402,7 @@ void sched_move_task(struct task_struct *tsk)
rq = task_rq_lock(tsk, &flags);
running = task_current(rq, tsk);
- on_rq = tsk->se.on_rq;
+ on_rq = tsk->on_rq;
if (on_rq)
dequeue_task(rq, tsk, 0);
@@ -8539,38 +8421,11 @@ void sched_move_task(struct task_struct *tsk)
if (on_rq)
enqueue_task(rq, tsk, 0);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, tsk, &flags);
}
#endif /* CONFIG_CGROUP_SCHED */
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
-{
- struct cfs_rq *cfs_rq = se->cfs_rq;
- int on_rq;
-
- on_rq = se->on_rq;
- if (on_rq)
- dequeue_entity(cfs_rq, se, 0);
-
- se->load.weight = shares;
- se->load.inv_weight = 0;
-
- if (on_rq)
- enqueue_entity(cfs_rq, se, 0);
-}
-
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
- struct cfs_rq *cfs_rq = se->cfs_rq;
- struct rq *rq = cfs_rq->rq;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- __set_se_shares(se, shares);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
static DEFINE_MUTEX(shares_mutex);
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8593,37 +8448,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
if (tg->shares == shares)
goto done;
- spin_lock_irqsave(&task_group_lock, flags);
- for_each_possible_cpu(i)
- unregister_fair_sched_group(tg, i);
- list_del_rcu(&tg->siblings);
- spin_unlock_irqrestore(&task_group_lock, flags);
-
- /* wait for any ongoing reference to this group to finish */
- synchronize_sched();
-
- /*
- * Now we are free to modify the group's share on each cpu
- * w/o tripping rebalance_share or load_balance_fair.
- */
tg->shares = shares;
for_each_possible_cpu(i) {
- /*
- * force a rebalance
- */
- cfs_rq_set_shares(tg->cfs_rq[i], 0);
- set_se_shares(tg->se[i], shares);
+ struct rq *rq = cpu_rq(i);
+ struct sched_entity *se;
+
+ se = tg->se[i];
+ /* Propagate contribution to hierarchy */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ for_each_sched_entity(se)
+ update_cfs_shares(group_cfs_rq(se));
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
- /*
- * Enable load balance activity on this group, by inserting it back on
- * each cpu's rq->leaf_cfs_rq_list.
- */
- spin_lock_irqsave(&task_group_lock, flags);
- for_each_possible_cpu(i)
- register_fair_sched_group(tg, i);
- list_add_rcu(&tg->siblings, &tg->parent->children);
- spin_unlock_irqrestore(&task_group_lock, flags);
done:
mutex_unlock(&shares_mutex);
return 0;
@@ -8922,7 +8759,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
if (!cgrp->parent) {
/* This is early initialization for the top cgroup */
- return &init_task_group.css;
+ return &root_task_group.css;
}
parent = cgroup_tg(cgrp->parent);
@@ -8955,56 +8792,39 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
return 0;
}
-static int
-cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *tsk, bool threadgroup)
+static void
+cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
- int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
- if (retval)
- return retval;
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- retval = cpu_cgroup_can_attach_task(cgrp, c);
- if (retval) {
- rcu_read_unlock();
- return retval;
- }
- }
- rcu_read_unlock();
- }
- return 0;
+ sched_move_task(tsk);
}
static void
-cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cont, struct task_struct *tsk,
- bool threadgroup)
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup *old_cgrp, struct task_struct *task)
{
- sched_move_task(tsk);
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- sched_move_task(c);
- }
- rcu_read_unlock();
- }
+ /*
+ * cgroup_exit() is called in the copy_process() failure path.
+ * Ignore this case since the task hasn't run yet; this avoids
+ * trying to poke at half-freed task state from generic code.
+ */
+ if (!(task->flags & PF_EXITING))
+ return;
+
+ sched_move_task(task);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
u64 shareval)
{
- return sched_group_set_shares(cgroup_tg(cgrp), shareval);
+ return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
}
static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
struct task_group *tg = cgroup_tg(cgrp);
- return (u64) tg->shares;
+ return (u64) scale_load_down(tg->shares);
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
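The read/write pair above converts between the user-visible cpu.shares value and the higher-resolution internal load: the write path scales the value up and the read path scales it back down, via shifts by SCHED_LOAD_RESOLUTION. A standalone sketch of that round trip; the 10-bit resolution is an assumption matching the 64-bit configuration rather than something read from the kernel headers, and the helper names are placeholders for scale_load()/scale_load_down().

#include <stdio.h>

/* assumed value: 10 on 64-bit with group scheduling, 0 otherwise */
#define LOAD_RESOLUTION	10

static unsigned long scale_up(unsigned long w)   { return w << LOAD_RESOLUTION; }
static unsigned long scale_down(unsigned long w) { return w >> LOAD_RESOLUTION; }

int main(void)
{
	unsigned long user_shares = 1024;		/* value written to cpu.shares */
	unsigned long internal = scale_up(user_shares);	/* what tg->shares holds */

	printf("internal = %lu, read back = %lu\n", internal, scale_down(internal));
	return 0;
}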
@@ -9063,8 +8883,9 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.name = "cpu",
.create = cpu_cgroup_create,
.destroy = cpu_cgroup_destroy,
- .can_attach = cpu_cgroup_can_attach,
- .attach = cpu_cgroup_attach,
+ .can_attach_task = cpu_cgroup_can_attach_task,
+ .attach_task = cpu_cgroup_attach_task,
+ .exit = cpu_cgroup_exit,
.populate = cpu_cgroup_populate,
.subsys_id = cpu_cgroup_subsys_id,
.early_init = 1,
@@ -9349,72 +9170,3 @@ struct cgroup_subsys cpuacct_subsys = {
};
#endif /* CONFIG_CGROUP_CPUACCT */
-#ifndef CONFIG_SMP
-
-void synchronize_sched_expedited(void)
-{
- barrier();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#else /* #ifndef CONFIG_SMP */
-
-static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
-
-static int synchronize_sched_expedited_cpu_stop(void *data)
-{
- /*
- * There must be a full memory barrier on each affected CPU
- * between the time that try_stop_cpus() is called and the
- * time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
- */
- smp_mb(); /* See above comment block. */
- return 0;
-}
-
-/*
- * Wait for an rcu-sched grace period to elapse, but use "big hammer"
- * approach to force grace period to end quickly. This consumes
- * significant time on all CPUs, and is thus not recommended for
- * any sort of common-case code.
- *
- * Note that it is illegal to call this function while holding any
- * lock that is acquired by a CPU-hotplug notifier. Failing to
- * observe this restriction will result in deadlock.
- */
-void synchronize_sched_expedited(void)
-{
- int snap, trycount = 0;
-
- smp_mb(); /* ensure prior mod happens before capturing snap. */
- snap = atomic_read(&synchronize_sched_expedited_count) + 1;
- get_online_cpus();
- while (try_stop_cpus(cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- if (trycount++ < 10)
- udelay(trycount * num_online_cpus());
- else {
- synchronize_sched();
- return;
- }
- if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
- smp_mb(); /* ensure test happens before caller kfree */
- return;
- }
- get_online_cpus();
- }
- atomic_inc(&synchronize_sched_expedited_count);
- smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
- put_online_cpus();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#endif /* #else #ifndef CONFIG_SMP */