Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--	kernel/sched/fair.c	244
1 file changed, 148 insertions, 96 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fea7d3335e1f..bfa3c86d0d68 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
 	if (!cpus)
 		return;
 
-	ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
 	ns->task_capacity =
 		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
 	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
@@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env,
 	env->best_cpu = env->dst_cpu;
 }
 
-static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
-				long src_load, long dst_load,
+static bool load_too_imbalanced(long src_load, long dst_load,
 				struct task_numa_env *env)
 {
 	long imb, old_imb;
+	long orig_src_load, orig_dst_load;
+	long src_capacity, dst_capacity;
+
+	/*
+	 * The load is corrected for the CPU capacity available on each node.
+	 *
+	 * src_load        dst_load
+	 * ------------ vs ---------
+	 * src_capacity    dst_capacity
+	 */
+	src_capacity = env->src_stats.compute_capacity;
+	dst_capacity = env->dst_stats.compute_capacity;
 
 	/* We care about the slope of the imbalance, not the direction. */
 	if (dst_load < src_load)
 		swap(dst_load, src_load);
 
 	/* Is the difference below the threshold? */
-	imb = dst_load * 100 - src_load * env->imbalance_pct;
+	imb = dst_load * src_capacity * 100 -
+	      src_load * dst_capacity * env->imbalance_pct;
 	if (imb <= 0)
 		return false;
 
@@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
 	 * The imbalance is above the allowed threshold.
 	 * Compare it with the old imbalance.
 	 */
+	orig_src_load = env->src_stats.load;
+	orig_dst_load = env->dst_stats.load;
+
 	if (orig_dst_load < orig_src_load)
 		swap(orig_dst_load, orig_src_load);
 
-	old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
+	old_imb = orig_dst_load * src_capacity * 100 -
+		  orig_src_load * dst_capacity * env->imbalance_pct;
 
 	/* Would this change make things worse? */
 	return (imb > old_imb);
@@ -1136,10 +1151,10 @@ static void task_numa_compare(struct task_numa_env *env,
 	struct rq *src_rq = cpu_rq(env->src_cpu);
 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
 	struct task_struct *cur;
-	long orig_src_load, src_load;
-	long orig_dst_load, dst_load;
+	long src_load, dst_load;
 	long load;
-	long imp = (groupimp > 0) ? groupimp : taskimp;
+	long imp = env->p->numa_group ? groupimp : taskimp;
+	long moveimp = imp;
 
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
@@ -1177,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env,
 			 * itself (not part of a group), use the task weight
 			 * instead.
 			 */
-			if (env->p->numa_group)
-				imp = groupimp;
-			else
-				imp = taskimp;
-
 			if (cur->numa_group)
 				imp += group_weight(cur, env->src_nid) -
 				       group_weight(cur, env->dst_nid);
@@ -1191,7 +1201,7 @@ static void task_numa_compare(struct task_numa_env *env,
 		}
 	}
 
-	if (imp < env->best_imp)
+	if (imp <= env->best_imp && moveimp <= env->best_imp)
 		goto unlock;
 
 	if (!cur) {
@@ -1204,20 +1214,34 @@ static void task_numa_compare(struct task_numa_env *env,
 	}
 
 	/* Balance doesn't matter much if we're running a task per cpu */
-	if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+	if (imp > env->best_imp && src_rq->nr_running == 1 &&
+			dst_rq->nr_running == 1)
 		goto assign;
 
 	/*
 	 * In the overloaded case, try and keep the load balanced.
 	 */
balance:
-	orig_dst_load = env->dst_stats.load;
-	orig_src_load = env->src_stats.load;
-
-	/* XXX missing capacity terms */
 	load = task_h_load(env->p);
-	dst_load = orig_dst_load + load;
-	src_load = orig_src_load - load;
+	dst_load = env->dst_stats.load + load;
+	src_load = env->src_stats.load - load;
+
+	if (moveimp > imp && moveimp > env->best_imp) {
+		/*
+		 * If the improvement from just moving env->p direction is
+		 * better than swapping tasks around, check if a move is
+		 * possible. Store a slightly smaller score than moveimp,
+		 * so an actually idle CPU will win.
+		 */
+		if (!load_too_imbalanced(src_load, dst_load, env)) {
+			imp = moveimp - 1;
+			cur = NULL;
+			goto assign;
+		}
+	}
+
+	if (imp <= env->best_imp)
+		goto unlock;
 
 	if (cur) {
 		load = task_h_load(cur);
@@ -1225,8 +1249,7 @@ balance:
 		src_load += load;
 	}
 
-	if (load_too_imbalanced(orig_src_load, orig_dst_load,
-				src_load, dst_load, env))
+	if (load_too_imbalanced(src_load, dst_load, env))
 		goto unlock;
 
 assign:
@@ -1302,9 +1325,8 @@ static int task_numa_migrate(struct task_struct *p)
 	groupimp = group_weight(p, env.dst_nid) - groupweight;
 	update_numa_stats(&env.dst_stats, env.dst_nid);
 
-	/* If the preferred nid has free capacity, try to use it. */
-	if (env.dst_stats.has_free_capacity)
-		task_numa_find_cpu(&env, taskimp, groupimp);
+	/* Try to find a spot on the preferred nid. */
+	task_numa_find_cpu(&env, taskimp, groupimp);
 
 	/* No space available on the preferred nid. Look elsewhere. */
 	if (env.best_cpu == -1) {
@@ -1324,10 +1346,6 @@ static int task_numa_migrate(struct task_struct *p)
 		}
 	}
 
-	/* No better CPU than the current one was found. */
-	if (env.best_cpu == -1)
-		return -EAGAIN;
-
 	/*
 	 * If the task is part of a workload that spans multiple NUMA nodes,
 	 * and is migrating into one of the workload's active nodes, remember
@@ -1336,8 +1354,19 @@ static int task_numa_migrate(struct task_struct *p)
 	 * A task that migrated to a second choice node will be better off
 	 * trying for a better one later. Do not set the preferred node here.
 	 */
-	if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
-		sched_setnuma(p, env.dst_nid);
+	if (p->numa_group) {
+		if (env.best_cpu == -1)
+			nid = env.src_nid;
+		else
+			nid = env.dst_nid;
+
+		if (node_isset(nid, p->numa_group->active_nodes))
+			sched_setnuma(p, env.dst_nid);
+	}
+
+	/* No better CPU than the current one was found. */
+	if (env.best_cpu == -1)
+		return -EAGAIN;
 
 	/*
 	 * Reset the scan period if the task is being rescheduled on an
@@ -1415,12 +1444,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
 /*
  * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
  * increments. The more local the fault statistics are, the higher the scan
- * period will be for the next scan window. If local/remote ratio is below
- * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
- * scan period will decrease
+ * period will be for the next scan window. If local/(local+remote) ratio is
+ * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
+ * the scan period will decrease. Aim for 70% local accesses.
  */
 #define NUMA_PERIOD_SLOTS 10
-#define NUMA_PERIOD_THRESHOLD 3
+#define NUMA_PERIOD_THRESHOLD 7
 
 /*
  * Increase the scan period (slow down scanning) if the majority of
@@ -1595,30 +1624,17 @@ static void task_numa_placement(struct task_struct *p)
 
 	if (p->numa_group) {
 		update_numa_active_node_mask(p->numa_group);
-		/*
-		 * If the preferred task and group nids are different,
-		 * iterate over the nodes again to find the best place.
-		 */
-		if (max_nid != max_group_nid) {
-			unsigned long weight, max_weight = 0;
-
-			for_each_online_node(nid) {
-				weight = task_weight(p, nid) + group_weight(p, nid);
-				if (weight > max_weight) {
-					max_weight = weight;
-					max_nid = nid;
-				}
-			}
-		}
-
 		spin_unlock_irq(group_lock);
+		max_nid = max_group_nid;
 	}
 
-	/* Preferred node as the node with the most faults */
-	if (max_faults && max_nid != p->numa_preferred_nid) {
-		/* Update the preferred nid and migrate task if possible */
-		sched_setnuma(p, max_nid);
-		numa_migrate_preferred(p);
+	if (max_faults) {
+		/* Set the new preferred node */
+		if (max_nid != p->numa_preferred_nid)
+			sched_setnuma(p, max_nid);
+
+		if (task_node(p) != p->numa_preferred_nid)
+			numa_migrate_preferred(p);
 	}
 }
 
@@ -2899,7 +2915,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	ideal_runtime = sched_slice(cfs_rq, curr);
 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
 	if (delta_exec > ideal_runtime) {
-		resched_task(rq_of(cfs_rq)->curr);
+		resched_curr(rq_of(cfs_rq));
 		/*
 		 * The current task ran long enough, ensure it doesn't get
 		 * re-elected due to buddy favours.
@@ -2923,7 +2939,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		return;
 
 	if (delta > ideal_runtime)
-		resched_task(rq_of(cfs_rq)->curr);
+		resched_curr(rq_of(cfs_rq));
 }
 
 static void
@@ -3063,7 +3079,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * validating it and just reschedule.
 	 */
 	if (queued) {
-		resched_task(rq_of(cfs_rq)->curr);
+		resched_curr(rq_of(cfs_rq));
 		return;
 	}
 	/*
@@ -3254,7 +3270,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 	 * hierarchy can be throttled
 	 */
 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
-		resched_task(rq_of(cfs_rq)->curr);
+		resched_curr(rq_of(cfs_rq));
 }
 
 static __always_inline
@@ -3360,7 +3376,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	cfs_rq->throttled = 1;
 	cfs_rq->throttled_clock = rq_clock(rq);
 	raw_spin_lock(&cfs_b->lock);
-	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	/*
+	 * Add to the _head_ of the list, so that an already-started
+	 * distribute_cfs_runtime will not see us
+	 */
+	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 	if (!cfs_b->timer_active)
 		__start_cfs_bandwidth(cfs_b, false);
 	raw_spin_unlock(&cfs_b->lock);
@@ -3410,14 +3430,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	/* determine whether we need to wake up potentially idle cpu */
 	if (rq->curr == rq->idle && rq->cfs.nr_running)
-		resched_task(rq->curr);
+		resched_curr(rq);
 }
 
 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
 		u64 remaining, u64 expires)
 {
 	struct cfs_rq *cfs_rq;
-	u64 runtime = remaining;
+	u64 runtime;
+	u64 starting_runtime = remaining;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -3448,7 +3469,7 @@ next:
 	}
 	rcu_read_unlock();
 
-	return remaining;
+	return starting_runtime - remaining;
 }
 
 /*
@@ -3494,22 +3515,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 	/* account preceding periods in which throttling occurred */
 	cfs_b->nr_throttled += overrun;
 
-	/*
-	 * There are throttled entities so we must first use the new bandwidth
-	 * to unthrottle them before making it generally available.  This
-	 * ensures that all existing debts will be paid before a new cfs_rq is
-	 * allowed to run.
-	 */
-	runtime = cfs_b->runtime;
 	runtime_expires = cfs_b->runtime_expires;
-	cfs_b->runtime = 0;
 
 	/*
-	 * This check is repeated as we are holding onto the new bandwidth
-	 * while we unthrottle.  This can potentially race with an unthrottled
-	 * group trying to acquire new bandwidth from the global pool.
+	 * This check is repeated as we are holding onto the new bandwidth while
+	 * we unthrottle. This can potentially race with an unthrottled group
+	 * trying to acquire new bandwidth from the global pool. This can result
+	 * in us over-using our runtime if it is all used during this loop, but
+	 * only by limited amounts in that extreme case.
 	 */
-	while (throttled && runtime > 0) {
+	while (throttled && cfs_b->runtime > 0) {
+		runtime = cfs_b->runtime;
 		raw_spin_unlock(&cfs_b->lock);
 		/* we can't nest cfs_b->lock while distributing bandwidth */
 		runtime = distribute_cfs_runtime(cfs_b, runtime,
@@ -3517,10 +3533,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 		raw_spin_lock(&cfs_b->lock);
 
 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+
+		cfs_b->runtime -= min(runtime, cfs_b->runtime);
 	}
 
-	/* return (any) remaining runtime */
-	cfs_b->runtime = runtime;
 	/*
 	 * While we are ensured activity in the period following an
	 * unthrottle, this also covers the case in which the new bandwidth is
@@ -3631,10 +3647,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 		return;
 	}
 
-	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
 		runtime = cfs_b->runtime;
-		cfs_b->runtime = 0;
-	}
+
 	expires = cfs_b->runtime_expires;
 	raw_spin_unlock(&cfs_b->lock);
 
@@ -3645,7 +3660,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 
 	raw_spin_lock(&cfs_b->lock);
 	if (expires == cfs_b->runtime_expires)
-		cfs_b->runtime = runtime;
+		cfs_b->runtime -= min(runtime, cfs_b->runtime);
 	raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -3775,6 +3790,19 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	hrtimer_cancel(&cfs_b->slack_timer);
 }
 
+static void __maybe_unused update_runtime_enabled(struct rq *rq)
+{
+	struct cfs_rq *cfs_rq;
+
+	for_each_leaf_cfs_rq(rq, cfs_rq) {
+		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+
+		raw_spin_lock(&cfs_b->lock);
+		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
+		raw_spin_unlock(&cfs_b->lock);
+	}
+}
+
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
 	struct cfs_rq *cfs_rq;
@@ -3788,6 +3816,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 		 * there's some valid quota amount
 		 */
 		cfs_rq->runtime_remaining = 1;
+		/*
+		 * Offline rq is schedulable till cpu is completely disabled
+		 * in take_cpu_down(), so we prevent new cfs throttling here.
+		 */
+		cfs_rq->runtime_enabled = 0;
+
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
 	}
@@ -3831,6 +3865,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 	return NULL;
 }
 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static inline void update_runtime_enabled(struct rq *rq) {}
 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
 
 #endif /* CONFIG_CFS_BANDWIDTH */
@@ -3854,7 +3889,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 
 		if (delta < 0) {
 			if (rq->curr == p)
-				resched_task(p);
+				resched_curr(rq);
 			return;
 		}
 
@@ -4723,7 +4758,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	return;
 
 preempt:
-	resched_task(curr);
+	resched_curr(rq);
 	/*
 	 * Only set the backward buddy when the current task is still
 	 * on the rq. This can happen when a wakeup gets interleaved
@@ -5094,8 +5129,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)
 /*
  * Is this task likely cache-hot:
 */
-static int
-task_hot(struct task_struct *p, u64 now)
+static int task_hot(struct task_struct *p, struct lb_env *env)
 {
 	s64 delta;
 
@@ -5108,7 +5142,7 @@ task_hot(struct task_struct *p, u64 now)
 	/*
 	 * Buddy candidates are cache hot:
 	 */
-	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
+	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
 			(&p->se == cfs_rq_of(&p->se)->next ||
 			 &p->se == cfs_rq_of(&p->se)->last))
 		return 1;
@@ -5118,7 +5152,7 @@ task_hot(struct task_struct *p, u64 now)
 	if (sysctl_sched_migration_cost == 0)
 		return 0;
 
-	delta = now - p->se.exec_start;
+	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
 
 	return delta < (s64)sysctl_sched_migration_cost;
 }
@@ -5272,7 +5306,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	 * 2) task is cache cold, or
 	 * 3) too many balance attempts have failed.
 	 */
-	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));
+	tsk_cache_hot = task_hot(p, env);
 	if (!tsk_cache_hot)
 		tsk_cache_hot = migrate_degrades_locality(p, env);
 
@@ -5864,10 +5898,12 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
  * @load_idx: Load index of sched_domain of this_cpu for load calc.
  * @local_group: Does group contain this_cpu.
  * @sgs: variable to hold the statistics for this group.
+ * @overload: Indicate more than one runnable task for any CPU.
  */
 static inline void update_sg_lb_stats(struct lb_env *env,
 			struct sched_group *group, int load_idx,
-			int local_group, struct sg_lb_stats *sgs)
+			int local_group, struct sg_lb_stats *sgs,
+			bool *overload)
 {
 	unsigned long load;
 	int i;
@@ -5885,6 +5921,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 		sgs->group_load += load;
 		sgs->sum_nr_running += rq->nr_running;
+
+		if (rq->nr_running > 1)
+			*overload = true;
+
 #ifdef CONFIG_NUMA_BALANCING
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -5995,6 +6035,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats tmp_sgs;
 	int load_idx, prefer_sibling = 0;
+	bool overload = false;
 
 	if (child && child->flags & SD_PREFER_SIBLING)
 		prefer_sibling = 1;
@@ -6015,7 +6056,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 				update_group_capacity(env->sd, env->dst_cpu);
 		}
 
-		update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
+		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
+						&overload);
 
 		if (local_group)
 			goto next_group;
@@ -6049,6 +6091,13 @@ next_group:
 
 	if (env->sd->flags & SD_NUMA)
 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
+
+	if (!env->sd->parent) {
+		/* update overload indicator if we are at root domain */
+		if (env->dst_rq->rd->overload != overload)
+			env->dst_rq->rd->overload = overload;
+	}
+
 }
 
 /**
@@ -6767,7 +6816,8 @@ static int idle_balance(struct rq *this_rq)
 	 */
 	this_rq->idle_stamp = rq_clock(this_rq);
 
-	if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
+	    !this_rq->rd->overload) {
 		rcu_read_lock();
 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
 		if (sd)
@@ -7325,6 +7375,8 @@ void trigger_load_balance(struct rq *rq)
 static void rq_online_fair(struct rq *rq)
 {
 	update_sysctl();
+
+	update_runtime_enabled(rq);
 }
 
 static void rq_offline_fair(struct rq *rq)
@@ -7398,7 +7450,7 @@ static void task_fork_fair(struct task_struct *p)
 		 * 'current' within the tree based on its new key value.
 		 */
 		swap(curr->vruntime, se->vruntime);
-		resched_task(rq->curr);
+		resched_curr(rq);
 	}
 
 	se->vruntime -= cfs_rq->min_vruntime;
@@ -7423,7 +7475,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 	 */
 	if (rq->curr == p) {
 		if (p->prio > oldprio)
-			resched_task(rq->curr);
+			resched_curr(rq);
 	} else
 		check_preempt_curr(rq, p, 0);
 }
@@ -7486,7 +7538,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
 	 * if we can still preempt the current task.
 	 */
 	if (rq->curr == p)
-		resched_task(rq->curr);
+		resched_curr(rq);
 	else
 		check_preempt_curr(rq, p, 0);
 }
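The main functional change in the NUMA hunks above is that load_too_imbalanced() now weighs each node's load by its compute capacity (src_load/src_capacity vs. dst_load/dst_capacity, compared by cross-multiplying to stay in integer arithmetic). The following standalone sketch reproduces that comparison outside the kernel; struct node_stats, struct numa_env and swap_long() are simplified stand-ins invented for the illustration, not kernel types.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's numa_stats / task_numa_env. */
struct node_stats { long load, compute_capacity; };
struct numa_env   { struct node_stats src, dst; long imbalance_pct; };

static void swap_long(long *a, long *b) { long t = *a; *a = *b; *b = t; }

/*
 * Same arithmetic as the patched load_too_imbalanced(): compare
 * src_load/src_capacity against dst_load/dst_capacity by cross
 * multiplying, so a node with less compute capacity counts as
 * proportionally busier.
 */
static bool load_too_imbalanced(long src_load, long dst_load, struct numa_env *env)
{
	long src_capacity = env->src.compute_capacity;
	long dst_capacity = env->dst.compute_capacity;
	long orig_src_load, orig_dst_load;
	long imb, old_imb;

	/* We care about the slope of the imbalance, not the direction. */
	if (dst_load < src_load)
		swap_long(&dst_load, &src_load);

	/* Is the difference below the imbalance_pct threshold? */
	imb = dst_load * src_capacity * 100 -
	      src_load * dst_capacity * env->imbalance_pct;
	if (imb <= 0)
		return false;

	/* Above the threshold: refuse only if the move makes things worse. */
	orig_src_load = env->src.load;
	orig_dst_load = env->dst.load;
	if (orig_dst_load < orig_src_load)
		swap_long(&orig_dst_load, &orig_src_load);

	old_imb = orig_dst_load * src_capacity * 100 -
		  orig_src_load * dst_capacity * env->imbalance_pct;

	return imb > old_imb;
}

int main(void)
{
	/* Equal raw load, but the destination node has half the capacity. */
	struct numa_env env = {
		.src = { .load = 2048, .compute_capacity = 2048 },
		.dst = { .load = 2048, .compute_capacity = 1024 },
		.imbalance_pct = 125,
	};
	long task_load = 512;

	printf("too imbalanced: %d\n",
	       load_too_imbalanced(env.src.load - task_load,
				   env.dst.load + task_load, &env));
	return 0;
}

Compiled as-is this prints "too imbalanced: 1": the destination node already runs at twice its relative capacity, so the capacity-aware check rejects a move that would make the existing imbalance worse.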

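The CFS bandwidth hunks rework how do_sched_cfs_period_timer() accounts the quota it hands out: cfs_b->runtime is no longer zeroed up front and written back afterwards; each loop iteration snapshots the pool, drops cfs_b->lock while distribute_cfs_runtime() runs (which now returns the amount consumed rather than the remainder), and then charges only what was actually used, clamped so the pool cannot go negative. Below is a toy, single-threaded model of that accounting pattern; the pthread mutex and the names pool_runtime, throttled_demand and distribute() are stand-ins for illustration only, not the kernel API.

#include <stdio.h>
#include <pthread.h>

/*
 * Toy model of the reworked period-timer accounting. pool_runtime and
 * pool_lock stand in for cfs_b->runtime and cfs_b->lock; distribute()
 * stands in for distribute_cfs_runtime() and returns what it consumed.
 */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long pool_runtime = 100;     /* global quota pool */
static unsigned long long throttled_demand = 70;  /* backlog of throttled groups */

static unsigned long long distribute(unsigned long long available)
{
	unsigned long long used = available < throttled_demand ? available : throttled_demand;

	throttled_demand -= used;
	return used;
}

int main(void)
{
	unsigned long long runtime, used;

	pthread_mutex_lock(&pool_lock);
	while (throttled_demand && pool_runtime > 0) {
		runtime = pool_runtime;          /* snapshot, don't zero */
		pthread_mutex_unlock(&pool_lock);

		/* the pool lock cannot be held while handing runtime out */
		used = distribute(runtime);

		pthread_mutex_lock(&pool_lock);
		/* charge only what was consumed, never more than is left */
		pool_runtime -= used < pool_runtime ? used : pool_runtime;
	}
	pthread_mutex_unlock(&pool_lock);

	printf("left in pool: %llu, unmet demand: %llu\n",
	       pool_runtime, throttled_demand);   /* prints 30, 0 */
	return 0;
}

The final clamped subtraction mirrors the new cfs_b->runtime -= min(runtime, cfs_b->runtime) line: because the lock is dropped during distribution, other paths may consume runtime concurrently, so the charge must never exceed what is still in the pool.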