Diffstat (limited to 'kernel/sched/topology.c')
-rw-r--r--  kernel/sched/topology.c | 238
1 file changed, 169 insertions(+), 69 deletions(-)
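
For readers of the hunks below: the patch makes the NUMA distance used for sched-domain construction overridable by declaring arch_sched_node_distance() as a __weak alias of a default helper that simply returns node_distance(), so an architecture can feed sched_init_numa() a coarser distance table than the firmware SLIT when it wants fewer NUMA sched-domain levels. The following is a minimal, hypothetical sketch of such an override in arch code; the bucket threshold and return values are invented for illustration and are not part of this patch.

#include <linux/numa.h>      /* LOCAL_DISTANCE, REMOTE_DISTANCE */
#include <linux/topology.h>  /* node_distance() */

/*
 * Hypothetical arch override of the __weak default (which returns
 * node_distance(from, to) unchanged).  Distances are collapsed into
 * three buckets so that sched_init_numa() builds fewer NUMA levels,
 * while the unmodified SLIT values remain visible through
 * sched_numa_node_distance / sched_max_numa_distance.
 */
int arch_sched_node_distance(int from, int to)
{
	int dist = node_distance(from, to);

	if (dist <= LOCAL_DISTANCE)	/* same node */
		return LOCAL_DISTANCE;

	if (dist < 30)			/* "near" remote nodes; threshold is illustrative */
		return REMOTE_DISTANCE;

	return 2 * REMOTE_DISTANCE;	/* everything farther away */
}

All returned values must stay within the range checked by sched_record_numa_dist() (at least LOCAL_DISTANCE and below NR_DISTANCE_VALUES), otherwise the level recording bails out with -EINVAL.
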
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b958fe48e020..cf643a5ddedd 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -3,7 +3,9 @@
* Scheduler topology setup/handling methods
*/
+#include <linux/sched/isolation.h>
#include <linux/bsearch.h>
+#include "sched.h"
DEFINE_MUTEX(sched_domains_mutex);
void sched_domains_mutex_lock(void)
@@ -87,7 +89,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!(sd->flags & SD_OVERLAP) &&
+ if (!(sd->flags & SD_NUMA) &&
cpumask_intersects(groupmask, sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: repeated CPUs\n");
@@ -100,7 +102,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
group->sgc->id,
cpumask_pr_args(sched_group_span(group)));
- if ((sd->flags & SD_OVERLAP) &&
+ if ((sd->flags & SD_NUMA) &&
!cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
printk(KERN_CONT " mask=%*pbl",
cpumask_pr_args(group_balance_mask(group)));
@@ -313,7 +315,7 @@ static int __init sched_energy_aware_sysctl_init(void)
}
late_initcall(sched_energy_aware_sysctl_init);
-#endif
+#endif /* CONFIG_PROC_SYSCTL */
static void free_pd(struct perf_domain *pd)
{
@@ -449,9 +451,9 @@ free:
return false;
}
-#else
+#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */
static void free_pd(struct perf_domain *pd) { }
-#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
+#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
static void free_rootdomain(struct rcu_head *rcu)
{
@@ -1318,8 +1320,6 @@ next:
update_group_capacity(sd, cpu);
}
-#ifdef CONFIG_SMP
-
/* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */
void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
{
@@ -1344,7 +1344,7 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
* "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu"
* which is shared by all the overlapping groups.
*/
- WARN_ON_ONCE(sd->flags & SD_OVERLAP);
+ WARN_ON_ONCE(sd->flags & SD_NUMA);
sg = sd->groups;
if (cpu != sg->asym_prefer_cpu) {
@@ -1374,8 +1374,6 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
}
}
-#endif /* CONFIG_SMP */
-
/*
* Set of available CPUs grouped by their corresponding capacities
* Each list entry contains a CPU mask reflecting CPUs that share the same
@@ -1592,13 +1590,19 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
#ifdef CONFIG_NUMA
enum numa_topology_type sched_numa_topology_type;
+/*
+ * sched_domains_numa_distance is derived from sched_numa_node_distance
+ * and provides a simplified view of NUMA distances used specifically
+ * for building NUMA scheduling domains.
+ */
static int sched_domains_numa_levels;
-static int sched_domains_curr_level;
+static int sched_numa_node_levels;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
+static int *sched_numa_node_distance;
static struct cpumask ***sched_domains_numa_masks;
-#endif
+#endif /* CONFIG_NUMA */
/*
* SD_flags allowed in topology descriptions.
@@ -1634,14 +1638,7 @@ sd_init(struct sched_domain_topology_level *tl,
int sd_id, sd_weight, sd_flags = 0;
struct cpumask *sd_span;
-#ifdef CONFIG_NUMA
- /*
- * Ugly hack to pass state to sd_numa_mask()...
- */
- sched_domains_curr_level = tl->numa_level;
-#endif
-
- sd_weight = cpumask_weight(tl->mask(cpu));
+ sd_weight = cpumask_weight(tl->mask(tl, cpu));
if (tl->sd_flags)
sd_flags = (*tl->sd_flags)();
@@ -1672,6 +1669,12 @@ sd_init(struct sched_domain_topology_level *tl,
.last_balance = jiffies,
.balance_interval = sd_weight,
+
+ /* 50% success rate */
+ .newidle_call = 512,
+ .newidle_success = 256,
+ .newidle_ratio = 512,
+
.max_newidle_lb_cost = 0,
.last_decay_max_lb_cost = jiffies,
.child = child,
@@ -1679,7 +1682,7 @@ sd_init(struct sched_domain_topology_level *tl,
};
sd_span = sched_domain_span(sd);
- cpumask_and(sd_span, cpu_map, tl->mask(cpu));
+ cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
sd_id = cpumask_first(sd_span);
sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
@@ -1714,7 +1717,7 @@ sd_init(struct sched_domain_topology_level *tl,
SD_WAKE_AFFINE);
}
-#endif
+#endif /* CONFIG_NUMA */
} else {
sd->cache_nice_tries = 1;
}
@@ -1734,22 +1737,63 @@ sd_init(struct sched_domain_topology_level *tl,
return sd;
}
+#ifdef CONFIG_SCHED_SMT
+int cpu_smt_flags(void)
+{
+ return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_smt_mask(cpu);
+}
+#endif
+
+#ifdef CONFIG_SCHED_CLUSTER
+int cpu_cluster_flags(void)
+{
+ return SD_CLUSTER | SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_clustergroup_mask(cpu);
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+int cpu_core_flags(void)
+{
+ return SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_coregroup_mask(cpu);
+}
+#endif
+
+const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_node_mask(cpu);
+}
+
/*
* Topology list, bottom-up.
*/
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+ SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
#endif
#ifdef CONFIG_SCHED_CLUSTER
- { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
+ SDTL_INIT(tl_cls_mask, cpu_cluster_flags, CLS),
#endif
#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ SDTL_INIT(tl_mc_mask, cpu_core_flags, MC),
#endif
- { cpu_cpu_mask, SD_INIT_NAME(PKG) },
+ SDTL_INIT(tl_pkg_mask, NULL, PKG),
{ NULL, },
};
@@ -1770,10 +1814,14 @@ void __init set_sched_topology(struct sched_domain_topology_level *tl)
}
#ifdef CONFIG_NUMA
+static int cpu_numa_flags(void)
+{
+ return SD_NUMA;
+}
-static const struct cpumask *sd_numa_mask(int cpu)
+static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu)
{
- return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+ return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)];
}
static void sched_numa_warn(const char *str)
@@ -1810,10 +1858,10 @@ bool find_numa_distance(int distance)
return true;
rcu_read_lock();
- distances = rcu_dereference(sched_domains_numa_distance);
+ distances = rcu_dereference(sched_numa_node_distance);
if (!distances)
goto unlock;
- for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (i = 0; i < sched_numa_node_levels; i++) {
if (distances[i] == distance) {
found = true;
break;
@@ -1889,14 +1937,34 @@ static void init_numa_topology_type(int offline_node)
#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
-void sched_init_numa(int offline_node)
+/*
+ * An architecture can modify its NUMA distances to change the
+ * grouping of NUMA nodes and the number of NUMA levels used when
+ * creating NUMA-level sched domains.
+ *
+ * A NUMA level is created for each unique value returned by
+ * arch_sched_node_distance().
+ */
+static int numa_node_dist(int i, int j)
{
- struct sched_domain_topology_level *tl;
- unsigned long *distance_map;
+ return node_distance(i, j);
+}
+
+int arch_sched_node_distance(int from, int to)
+ __weak __alias(numa_node_dist);
+
+static bool modified_sched_node_distance(void)
+{
+ return numa_node_dist != arch_sched_node_distance;
+}
+
+static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int),
+ int **dist, int *levels)
+{
+ unsigned long *distance_map __free(bitmap) = NULL;
int nr_levels = 0;
int i, j;
int *distances;
- struct cpumask ***masks;
/*
* O(nr_nodes^2) de-duplicating selection sort -- in order to find the
@@ -1904,17 +1972,16 @@ void sched_init_numa(int offline_node)
*/
distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
if (!distance_map)
- return;
+ return -ENOMEM;
bitmap_zero(distance_map, NR_DISTANCE_VALUES);
for_each_cpu_node_but(i, offline_node) {
for_each_cpu_node_but(j, offline_node) {
- int distance = node_distance(i, j);
+ int distance = n_dist(i, j);
if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
sched_numa_warn("Invalid distance value range");
- bitmap_free(distance_map);
- return;
+ return -EINVAL;
}
bitmap_set(distance_map, distance, 1);
@@ -1927,18 +1994,46 @@ void sched_init_numa(int offline_node)
nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
- if (!distances) {
- bitmap_free(distance_map);
- return;
- }
+ if (!distances)
+ return -ENOMEM;
for (i = 0, j = 0; i < nr_levels; i++, j++) {
j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
distances[i] = j;
}
- rcu_assign_pointer(sched_domains_numa_distance, distances);
+ *dist = distances;
+ *levels = nr_levels;
+
+ return 0;
+}
+
+void sched_init_numa(int offline_node)
+{
+ struct sched_domain_topology_level *tl;
+ int nr_levels, nr_node_levels;
+ int i, j;
+ int *distances, *domain_distances;
+ struct cpumask ***masks;
- bitmap_free(distance_map);
+ /* Record the NUMA distances from the SLIT table */
+ if (sched_record_numa_dist(offline_node, numa_node_dist, &distances,
+ &nr_node_levels))
+ return;
+
+ /* Record modified NUMA distances for building sched domains */
+ if (modified_sched_node_distance()) {
+ if (sched_record_numa_dist(offline_node, arch_sched_node_distance,
+ &domain_distances, &nr_levels)) {
+ kfree(distances);
+ return;
+ }
+ } else {
+ domain_distances = distances;
+ nr_levels = nr_node_levels;
+ }
+ rcu_assign_pointer(sched_numa_node_distance, distances);
+ WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]);
+ WRITE_ONCE(sched_numa_node_levels, nr_node_levels);
/*
* 'nr_levels' contains the number of unique distances
@@ -1956,6 +2051,8 @@ void sched_init_numa(int offline_node)
*
* We reset it to 'nr_levels' at the end of this function.
*/
+ rcu_assign_pointer(sched_domains_numa_distance, domain_distances);
+
sched_domains_numa_levels = 0;
masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
@@ -1981,10 +2078,13 @@ void sched_init_numa(int offline_node)
masks[i][j] = mask;
for_each_cpu_node_but(k, offline_node) {
- if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+ if (sched_debug() &&
+ (arch_sched_node_distance(j, k) !=
+ arch_sched_node_distance(k, j)))
sched_numa_warn("Node-distance not symmetric");
- if (node_distance(j, k) > sched_domains_numa_distance[i])
+ if (arch_sched_node_distance(j, k) >
+ sched_domains_numa_distance[i])
continue;
cpumask_or(mask, mask, cpumask_of_node(k));
@@ -2010,30 +2110,20 @@ void sched_init_numa(int offline_node)
/*
* Add the NUMA identity distance, aka single NODE.
*/
- tl[i++] = (struct sched_domain_topology_level){
- .mask = sd_numa_mask,
- .numa_level = 0,
- SD_INIT_NAME(NODE)
- };
+ tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE);
/*
* .. and append 'j' levels of NUMA goodness.
*/
for (j = 1; j < nr_levels; i++, j++) {
- tl[i] = (struct sched_domain_topology_level){
- .mask = sd_numa_mask,
- .sd_flags = cpu_numa_flags,
- .flags = SDTL_OVERLAP,
- .numa_level = j,
- SD_INIT_NAME(NUMA)
- };
+ tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
+ tl[i].numa_level = j;
}
sched_domain_topology_saved = sched_domain_topology;
sched_domain_topology = tl;
sched_domains_numa_levels = nr_levels;
- WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);
init_numa_topology_type(offline_node);
}
@@ -2041,14 +2131,18 @@ void sched_init_numa(int offline_node)
static void sched_reset_numa(void)
{
- int nr_levels, *distances;
+ int nr_levels, *distances, *dom_distances = NULL;
struct cpumask ***masks;
nr_levels = sched_domains_numa_levels;
+ sched_numa_node_levels = 0;
sched_domains_numa_levels = 0;
sched_max_numa_distance = 0;
sched_numa_topology_type = NUMA_DIRECT;
- distances = sched_domains_numa_distance;
+ distances = sched_numa_node_distance;
+ if (sched_numa_node_distance != sched_domains_numa_distance)
+ dom_distances = sched_domains_numa_distance;
+ rcu_assign_pointer(sched_numa_node_distance, NULL);
rcu_assign_pointer(sched_domains_numa_distance, NULL);
masks = sched_domains_numa_masks;
rcu_assign_pointer(sched_domains_numa_masks, NULL);
@@ -2057,6 +2151,7 @@ static void sched_reset_numa(void)
synchronize_rcu();
kfree(distances);
+ kfree(dom_distances);
for (i = 0; i < nr_levels && masks; i++) {
if (!masks[i])
continue;
@@ -2103,7 +2198,8 @@ void sched_domains_numa_masks_set(unsigned int cpu)
continue;
/* Set ourselves in the remote node's masks */
- if (node_distance(j, node) <= sched_domains_numa_distance[i])
+ if (arch_sched_node_distance(j, node) <=
+ sched_domains_numa_distance[i])
cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
}
}
@@ -2212,6 +2308,8 @@ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
goto unlock;
hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
+ if (!hop_masks)
+ goto unlock;
hop = hop_masks - k.masks;
ret = hop ?
@@ -2337,7 +2435,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
if (sdd->sd) {
sd = *per_cpu_ptr(sdd->sd, j);
- if (sd && (sd->flags & SD_OVERLAP))
+ if (sd && (sd->flags & SD_NUMA))
free_sched_groups(sd->groups, 0);
kfree(*per_cpu_ptr(sdd->sd, j));
}
@@ -2403,9 +2501,13 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
id_seen = sched_domains_tmpmask2;
for_each_sd_topology(tl) {
+ int tl_common_flags = 0;
+
+ if (tl->sd_flags)
+ tl_common_flags = (*tl->sd_flags)();
/* NUMA levels are allowed to overlap */
- if (tl->flags & SDTL_OVERLAP)
+ if (tl_common_flags & SD_NUMA)
continue;
cpumask_clear(covered);
@@ -2418,7 +2520,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
* breaks the linking done for an earlier span.
*/
for_each_cpu(cpu, cpu_map) {
- const struct cpumask *tl_cpu_mask = tl->mask(cpu);
+ const struct cpumask *tl_cpu_mask = tl->mask(tl, cpu);
int id;
/* lowest bit set in this mask is used as a unique id */
@@ -2426,7 +2528,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
if (cpumask_test_cpu(id, id_seen)) {
/* First CPU has already been seen, ensure identical spans */
- if (!cpumask_equal(tl->mask(id), tl_cpu_mask))
+ if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask))
return false;
} else {
/* First CPU hasn't been seen before, ensure it's a completely new span */
@@ -2476,8 +2578,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
- if (tl->flags & SDTL_OVERLAP)
- sd->flags |= SD_OVERLAP;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
}
@@ -2490,7 +2590,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
for_each_cpu(i, cpu_map) {
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
sd->span_weight = cpumask_weight(sched_domain_span(sd));
- if (sd->flags & SD_OVERLAP) {
+ if (sd->flags & SD_NUMA) {
if (build_overlap_sched_groups(sd, i))
goto error;
} else {