diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/irq/chip.c | 3 | ||||
-rw-r--r-- | kernel/rcupdate.c | 19 | ||||
-rw-r--r-- | kernel/sched.c | 51 | ||||
-rw-r--r-- | kernel/sched_fair.c | 62 | ||||
-rw-r--r-- | kernel/sched_features.h | 2 | ||||
-rw-r--r-- | kernel/sched_stats.h | 2 | ||||
-rw-r--r-- | kernel/sysctl.c | 10 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 12 |
8 files changed, 101 insertions, 60 deletions
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 4895fde4eb93..10b5092e9bfe 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -76,6 +76,7 @@ void dynamic_irq_cleanup(unsigned int irq) desc->chip_data = NULL; desc->handle_irq = handle_bad_irq; desc->chip = &no_irq_chip; + desc->name = NULL; spin_unlock_irqrestore(&desc->lock, flags); } @@ -127,7 +128,7 @@ int set_irq_type(unsigned int irq, unsigned int type) return 0; spin_lock_irqsave(&desc->lock, flags); - ret = __irq_set_trigger(desc, irq, flags); + ret = __irq_set_trigger(desc, irq, type); spin_unlock_irqrestore(&desc->lock, flags); return ret; } diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 467d5940f624..ad63af8b2521 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -119,18 +119,19 @@ static void _rcu_barrier(enum rcu_barrier type) /* Take cpucontrol mutex to protect against CPU hotplug */ mutex_lock(&rcu_barrier_mutex); init_completion(&rcu_barrier_completion); - atomic_set(&rcu_barrier_cpu_count, 0); /* - * The queueing of callbacks in all CPUs must be atomic with - * respect to RCU, otherwise one CPU may queue a callback, - * wait for a grace period, decrement barrier count and call - * complete(), while other CPUs have not yet queued anything. - * So, we need to make sure that grace periods cannot complete - * until all the callbacks are queued. + * Initialize rcu_barrier_cpu_count to 1, then invoke + * rcu_barrier_func() on each CPU, so that each CPU also has + * incremented rcu_barrier_cpu_count. Only then is it safe to + * decrement rcu_barrier_cpu_count -- otherwise the first CPU + * might complete its grace period before all of the other CPUs + * did their increment, causing this function to return too + * early. */ - rcu_read_lock(); + atomic_set(&rcu_barrier_cpu_count, 1); on_each_cpu(rcu_barrier_func, (void *)type, 1); - rcu_read_unlock(); + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); } diff --git a/kernel/sched.c b/kernel/sched.c index d906f72b42d2..945a97b9600d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -819,6 +819,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; unsigned int sysctl_sched_shares_ratelimit = 250000; /* + * Inject some fuzzyness into changing the per-cpu group shares + * this avoids remote rq-locks at the expense of fairness. + * default: 4 + */ +unsigned int sysctl_sched_shares_thresh = 4; + +/* * period over which we measure -rt task cpu usage in us. * default: 1s */ @@ -1454,8 +1461,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); * Calculate and set the cpu's group shares. */ static void -__update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, unsigned long sd_rq_weight) +update_group_shares_cpu(struct task_group *tg, int cpu, + unsigned long sd_shares, unsigned long sd_rq_weight) { int boost = 0; unsigned long shares; @@ -1486,19 +1493,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * */ shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); + shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); - /* - * record the actual number of shares, not the boosted amount. - */ - tg->cfs_rq[cpu]->shares = boost ? 0 : shares; - tg->cfs_rq[cpu]->rq_weight = rq_weight; + if (abs(shares - tg->se[cpu]->load.weight) > + sysctl_sched_shares_thresh) { + struct rq *rq = cpu_rq(cpu); + unsigned long flags; - if (shares < MIN_SHARES) - shares = MIN_SHARES; - else if (shares > MAX_SHARES) - shares = MAX_SHARES; + spin_lock_irqsave(&rq->lock, flags); + /* + * record the actual number of shares, not the boosted amount. + */ + tg->cfs_rq[cpu]->shares = boost ? 0 : shares; + tg->cfs_rq[cpu]->rq_weight = rq_weight; - __set_se_shares(tg->se[cpu], shares); + __set_se_shares(tg->se[cpu], shares); + spin_unlock_irqrestore(&rq->lock, flags); + } } /* @@ -1527,14 +1538,8 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!rq_weight) rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; - for_each_cpu_mask(i, sd->span) { - struct rq *rq = cpu_rq(i); - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - __update_group_shares_cpu(tg, i, shares, rq_weight); - spin_unlock_irqrestore(&rq->lock, flags); - } + for_each_cpu_mask(i, sd->span) + update_group_shares_cpu(tg, i, shares, rq_weight); return 0; } @@ -4443,12 +4448,8 @@ need_resched_nonpreemptible: if (sched_feat(HRTICK)) hrtick_clear(rq); - /* - * Do the rq-clock update outside the rq lock: - */ - local_irq_disable(); + spin_lock_irq(&rq->lock); update_rq_clock(rq); - spin_lock(&rq->lock); clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f604dae71316..9573c33688b8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -73,6 +73,8 @@ unsigned int sysctl_sched_wakeup_granularity = 5000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +static const struct sched_class fair_sched_class; + /************************************************************** * CFS operations on generic schedulable entities: */ @@ -334,7 +336,7 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, #endif /* - * delta *= w / rw + * delta *= P[w / rw] */ static inline unsigned long calc_delta_weight(unsigned long delta, struct sched_entity *se) @@ -348,15 +350,13 @@ calc_delta_weight(unsigned long delta, struct sched_entity *se) } /* - * delta *= rw / w + * delta /= w */ static inline unsigned long calc_delta_fair(unsigned long delta, struct sched_entity *se) { - for_each_sched_entity(se) { - delta = calc_delta_mine(delta, - cfs_rq_of(se)->load.weight, &se->load); - } + if (unlikely(se->load.weight != NICE_0_LOAD)) + delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); return delta; } @@ -386,26 +386,26 @@ static u64 __sched_period(unsigned long nr_running) * We calculate the wall-time slice from the period by taking a part * proportional to the weight. * - * s = p*w/rw + * s = p*P[w/rw] */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); + unsigned long nr_running = cfs_rq->nr_running; + + if (unlikely(!se->on_rq)) + nr_running++; + + return calc_delta_weight(__sched_period(nr_running), se); } /* * We calculate the vruntime slice of a to be inserted task * - * vs = s*rw/w = p + * vs = s/w */ -static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long nr_running = cfs_rq->nr_running; - - if (!se->on_rq) - nr_running++; - - return __sched_period(nr_running); + return calc_delta_fair(sched_slice(cfs_rq, se), se); } /* @@ -628,7 +628,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) * stays open at the end. */ if (initial && sched_feat(START_DEBIT)) - vruntime += sched_vslice_add(cfs_rq, se); + vruntime += sched_vslice(cfs_rq, se); if (!initial) { /* sleeps upto a single latency don't count. */ @@ -748,7 +748,7 @@ pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) struct rq *rq = rq_of(cfs_rq); u64 pair_slice = rq->clock - cfs_rq->pair_start; - if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) { + if (!cfs_rq->next || pair_slice > sysctl_sched_min_granularity) { cfs_rq->pair_start = rq->clock; return se; } @@ -849,11 +849,31 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) hrtick_start(rq, delta); } } + +/* + * called from enqueue/dequeue and updates the hrtick when the + * current task is from our class and nr_running is low enough + * to matter. + */ +static void hrtick_update(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + + if (curr->sched_class != &fair_sched_class) + return; + + if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) + hrtick_start_fair(rq, curr); +} #else /* !CONFIG_SCHED_HRTICK */ static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { } + +static inline void hrtick_update(struct rq *rq) +{ +} #endif /* @@ -874,7 +894,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) wakeup = 1; } - hrtick_start_fair(rq, rq->curr); + hrtick_update(rq); } /* @@ -896,7 +916,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) sleep = 1; } - hrtick_start_fair(rq, rq->curr); + hrtick_update(rq); } /* @@ -1002,8 +1022,6 @@ static inline int wake_idle(int cpu, struct task_struct *p) #ifdef CONFIG_SMP -static const struct sched_class fair_sched_class; - #ifdef CONFIG_FAIR_GROUP_SCHED /* * effective_load() calculates the load change as seen from the root_task_group diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 7c9e8f4a049f..fda016218296 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -5,7 +5,7 @@ SCHED_FEAT(START_DEBIT, 1) SCHED_FEAT(AFFINE_WAKEUPS, 1) SCHED_FEAT(CACHE_HOT_BUDDY, 1) SCHED_FEAT(SYNC_WAKEUPS, 1) -SCHED_FEAT(HRTICK, 1) +SCHED_FEAT(HRTICK, 0) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index b8c156979cf2..2df9d297d292 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -9,7 +9,7 @@ static int show_schedstat(struct seq_file *seq, void *v) { int cpu; - int mask_len = NR_CPUS/32 * 9; + int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; char *mask_str = kmalloc(mask_len, GFP_KERNEL); if (mask_str == NULL) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b3cc73931d1f..a13bd4dfaeb1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -276,6 +276,16 @@ static struct ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, + .procname = "sched_shares_thresh", + .data = &sysctl_sched_shares_thresh, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, + { + .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, .maxlen = sizeof(unsigned int), diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 0581c11fe6c6..727c1ae0517a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -567,11 +567,21 @@ static void tick_nohz_switch_to_nohz(void) static void tick_nohz_kick_tick(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t delta, now; if (!ts->tick_stopped) return; - tick_nohz_restart(ts, ktime_get()); + /* + * Do not touch the tick device, when the next expiry is either + * already reached or less/equal than the tick period. + */ + now = ktime_get(); + delta = ktime_sub(ts->sched_timer.expires, now); + if (delta.tv64 <= tick_period.tv64) + return; + + tick_nohz_restart(ts, now); } #else |