-rw-r--r--	Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave |  35
-rw-r--r--	drivers/base/node.c                                                      |   9
-rw-r--r--	include/linux/mempolicy.h                                                |   4
-rw-r--r--	mm/mempolicy.c                                                           | 326
4 files changed, 311 insertions(+), 63 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
index 0b7972de04e9..649c0e9b895c 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
@@ -20,6 +20,35 @@ Description:	Weight configuration interface for nodeN
 		Minimum weight: 1
 		Maximum weight: 255
 
-		Writing an empty string or `0` will reset the weight to the
-		system default. The system default may be set by the kernel
-		or drivers at boot or during hotplug events.
+		Writing an invalid value (i.e. any value not in [1, 255],
+		an empty string, ...) will return -EINVAL.
+
+		Changing the weight to a valid value will also automatically
+		switch the system to manual mode.
+
+What:		/sys/kernel/mm/mempolicy/weighted_interleave/auto
+Date:		May 2025
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Auto-weighting configuration interface
+
+		Configuration mode for weighted interleave. 'true' indicates
+		that the system is in auto mode, and 'false' indicates that
+		the system is in manual mode.
+
+		In auto mode, all node weights are re-calculated and overwritten
+		(visible via the nodeN interfaces) whenever new bandwidth data
+		is made available during either boot or hotplug events.
+
+		In manual mode, node weights can only be updated by the user.
+		Note that nodes that are onlined with previously set weights
+		will reuse those weights. If they were not previously set, or
+		are onlined with missing bandwidth data, the weights default
+		to 1.
+
+		Writing any true value string (e.g. Y or 1) will enable auto
+		mode, while writing any false value string (e.g. N or 0) will
+		enable manual mode. Any other string is rejected with -EINVAL.
+
+		Writing a new weight to a node directly via the nodeN interface
+		will also automatically switch the system to manual mode.
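The ABI above is ordinary sysfs file I/O, so it can be exercised with a few open()/write() calls. A minimal user-space sketch (not part of the patch; the paths come from the entries above, and the weight value 4 is arbitrary):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	static int sysfs_write(const char *path, const char *val)
	{
		int fd = open(path, O_WRONLY);

		if (fd < 0)
			return -1;
		/* -EINVAL from the kernel surfaces here as write() < 0 */
		if (write(fd, val, strlen(val)) < 0) {
			close(fd);
			return -1;
		}
		return close(fd);
	}

	int main(void)
	{
		/* Writing a weight to nodeN implicitly switches to manual mode */
		if (sysfs_write("/sys/kernel/mm/mempolicy/weighted_interleave/node0", "4"))
			perror("node0");

		/* Writing a true value string (Y/1) switches back to auto mode */
		if (sysfs_write("/sys/kernel/mm/mempolicy/weighted_interleave/auto", "Y"))
			perror("auto");

		return 0;
	}

Reading the auto file back after the first write would return 'false', since setting a node weight switches the system to manual mode.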
diff --git a/drivers/base/node.c b/drivers/base/node.c
index cd13ef287011..25ab9ec14eb8 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -7,6 +7,7 @@
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/memory.h>
+#include <linux/mempolicy.h>
 #include <linux/vmstat.h>
 #include <linux/notifier.h>
 #include <linux/node.h>
@@ -214,6 +215,14 @@
 			break;
 		}
 	}
+
+	/* When setting CPU access coordinates, update mempolicy */
+	if (access == ACCESS_COORDINATE_CPU) {
+		if (mempolicy_set_node_perf(nid, coord)) {
+			pr_info("failed to set mempolicy attrs for node %d\n",
+				nid);
+		}
+	}
 }
 EXPORT_SYMBOL_GPL(node_set_perf_attrs);
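For context, node_set_perf_attrs() is what the HMAT/CXL reporting code calls once node performance data is known. A hypothetical caller could look like the sketch below; example_report_node_perf() and its bandwidth/latency numbers are invented for illustration, while struct access_coordinate and ACCESS_COORDINATE_CPU are the real types from include/linux/node.h:

	#include <linux/node.h>

	/* Hypothetical driver-side reporting path; the numbers are made up. */
	static void example_report_node_perf(unsigned int nid)
	{
		struct access_coordinate coord = {
			.read_bandwidth		= 30000,	/* MB/s */
			.write_bandwidth	= 20000,	/* MB/s */
			.read_latency		= 300,		/* ns */
			.write_latency		= 300,		/* ns */
		};

		/*
		 * Only CPU-relative coordinates feed the mempolicy auto-tuning
		 * path added above; mempolicy_set_node_perf() records
		 * min(read_bandwidth, write_bandwidth) for this node.
		 */
		node_set_perf_attrs(nid, &coord, ACCESS_COORDINATE_CPU);
	}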
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index ce9885e0178a..0fe96f3ab3ef 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -11,6 +11,7 @@
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
+#include <linux/node.h>
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
 #include <uapi/linux/mempolicy.h>
@@ -178,6 +179,9 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol)
 
 extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);
 
+extern int mempolicy_set_node_perf(unsigned int node,
+				   struct access_coordinate *coords);
+
 #else
 
 struct mempolicy {};
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9a2b4b36f558..72fd72e156b1 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -109,6 +109,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/printk.h>
 #include <linux/swapops.h>
+#include <linux/gcd.h>
 
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
@@ -140,31 +141,138 @@ static struct mempolicy default_policy = {
 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 
 /*
- * iw_table is the sysfs-set interleave weight table, a value of 0 denotes
- * system-default value should be used. A NULL iw_table also denotes that
- * system-default values should be used. Until the system-default table
- * is implemented, the system-default is always 1.
- *
- * iw_table is RCU protected
+ * weightiness balances the tradeoff between small weights (which cycle
+ * through nodes faster, giving a fairer/more even distribution) and large
+ * weights (which give smaller errors between actual bandwidth ratios and
+ * weight ratios). 32 has been found to be a reasonable compromise between
+ * the two goals.
+ */
+static const int weightiness = 32;
+
+/*
+ * A NULL weighted_interleave_state is interpreted as having
+ * .mode_auto = true, and .iw_table is interpreted as an array of 1s
+ * with length nr_node_ids.
+ */
+struct weighted_interleave_state {
+	bool mode_auto;
+	u8 iw_table[];
+};
+static struct weighted_interleave_state __rcu *wi_state;
+static unsigned int *node_bw_table;
+
+/*
+ * wi_state_lock protects both wi_state and node_bw_table.
+ * node_bw_table is only used by writers to update wi_state.
 */
-static u8 __rcu *iw_table;
-static DEFINE_MUTEX(iw_table_lock);
+static DEFINE_MUTEX(wi_state_lock);
 
 static u8 get_il_weight(int node)
 {
-	u8 *table;
-	u8 weight;
+	struct weighted_interleave_state *state;
+	u8 weight = 1;
 
 	rcu_read_lock();
-	table = rcu_dereference(iw_table);
-	/* if no iw_table, use system default */
-	weight = table ? table[node] : 1;
-	/* if value in iw_table is 0, use system default */
-	weight = weight ? weight : 1;
+	state = rcu_dereference(wi_state);
+	if (state)
+		weight = state->iw_table[node];
 	rcu_read_unlock();
 	return weight;
 }
 
+/*
+ * Convert bandwidth values into weighted interleave weights.
+ * Call with wi_state_lock held.
+ */
+static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
+{
+	u64 sum_bw = 0;
+	unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
+	int nid;
+
+	for_each_node_state(nid, N_MEMORY)
+		sum_bw += bw[nid];
+
+	/* Scale bandwidths to whole numbers in the range [1, weightiness] */
+	for_each_node_state(nid, N_MEMORY) {
+		/*
+		 * Try not to perform 64-bit division.
+		 * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
+		 * If sum_bw >= scaling_factor, the quotient would round
+		 * down to 0, so use the minimum weight of 1.
+		 */
+		scaling_factor = weightiness * bw[nid];
+		if (bw[nid] && sum_bw < scaling_factor) {
+			cast_sum_bw = (unsigned int)sum_bw;
+			new_iw[nid] = scaling_factor / cast_sum_bw;
+		} else {
+			new_iw[nid] = 1;
+		}
+		if (!iw_gcd)
+			iw_gcd = new_iw[nid];
+		iw_gcd = gcd(iw_gcd, new_iw[nid]);
+	}
+
+	/* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
+	for_each_node_state(nid, N_MEMORY)
+		new_iw[nid] /= iw_gcd;
+}
+
+int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
+{
+	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+	unsigned int *old_bw, *new_bw;
+	unsigned int bw_val;
+	int i;
+
+	bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
+	new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
+	if (!new_bw)
+		return -ENOMEM;
+
+	new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+			       GFP_KERNEL);
+	if (!new_wi_state) {
+		kfree(new_bw);
+		return -ENOMEM;
+	}
+	new_wi_state->mode_auto = true;
+	for (i = 0; i < nr_node_ids; i++)
+		new_wi_state->iw_table[i] = 1;
+
+	/*
+	 * Update bandwidth info, even in manual mode. That way, when switching
+	 * to auto mode in the future, iw_table can be overwritten using
+	 * accurate bw data.
+	 */
+	mutex_lock(&wi_state_lock);
+
+	old_bw = node_bw_table;
+	if (old_bw)
+		memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
+	new_bw[node] = bw_val;
+	node_bw_table = new_bw;
+
+	old_wi_state = rcu_dereference_protected(wi_state,
+			lockdep_is_held(&wi_state_lock));
+	if (old_wi_state && !old_wi_state->mode_auto) {
+		/* Manual mode; skip reducing weights and updating wi_state */
+		mutex_unlock(&wi_state_lock);
+		kfree(new_wi_state);
+		goto out;
+	}
+
+	/* NULL wi_state assumes auto=true; reduce weights and update wi_state */
+	reduce_interleave_weights(new_bw, new_wi_state->iw_table);
+	rcu_assign_pointer(wi_state, new_wi_state);
+
+	mutex_unlock(&wi_state_lock);
+	if (old_wi_state) {
+		synchronize_rcu();
+		kfree(old_wi_state);
+	}
+out:
+	kfree(old_bw);
+	return 0;
+}
+
 /**
  * numa_nearest_node - Find nearest node by state
  * @node: Node id to start the search
@@ -2023,26 +2131,28 @@ static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
 
 static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
 {
+	struct weighted_interleave_state *state;
 	nodemask_t nodemask;
 	unsigned int target, nr_nodes;
-	u8 *table;
+	u8 *table = NULL;
 	unsigned int weight_total = 0;
 	u8 weight;
-	int nid;
+	int nid = 0;
 
 	nr_nodes = read_once_policy_nodemask(pol, &nodemask);
 	if (!nr_nodes)
 		return numa_node_id();
 
 	rcu_read_lock();
-	table = rcu_dereference(iw_table);
+
+	state = rcu_dereference(wi_state);
+	/* Uninitialized wi_state means we should assume all weights are 1 */
+	if (state)
+		table = state->iw_table;
+
 	/* calculate the total weight */
-	for_each_node_mask(nid, nodemask) {
-		/* detect system default usage */
-		weight = table ? table[nid] : 1;
-		weight = weight ? weight : 1;
-		weight_total += weight;
-	}
+	for_each_node_mask(nid, nodemask)
+		weight_total += table ? table[nid] : 1;
 
 	/* Calculate the node offset based on totals */
 	target = ilx % weight_total;
@@ -2050,7 +2160,6 @@ static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
 	while (target) {
 		/* detect system default usage */
 		weight = table ? table[nid] : 1;
-		weight = weight ? weight : 1;
 		if (target < weight)
 			break;
 		target -= weight;
@@ -2451,13 +2560,14 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
 		struct mempolicy *pol, unsigned long nr_pages,
 		struct page **page_array)
 {
+	struct weighted_interleave_state *state;
 	struct task_struct *me = current;
 	unsigned int cpuset_mems_cookie;
 	unsigned long total_allocated = 0;
 	unsigned long nr_allocated = 0;
 	unsigned long rounds;
 	unsigned long node_pages, delta;
-	u8 *table, *weights, weight;
+	u8 *weights, weight;
 	unsigned int weight_total = 0;
 	unsigned long rem_pages = nr_pages;
 	nodemask_t nodes;
@@ -2507,17 +2617,19 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
 		return total_allocated;
 
 	rcu_read_lock();
-	table = rcu_dereference(iw_table);
-	if (table)
-		memcpy(weights, table, nr_node_ids);
-	rcu_read_unlock();
+	state = rcu_dereference(wi_state);
+	if (state) {
+		memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
+		rcu_read_unlock();
+	} else {
+		rcu_read_unlock();
+		for (i = 0; i < nr_node_ids; i++)
+			weights[i] = 1;
+	}
 
-	/* calculate total, detect system default usage */
-	for_each_node_mask(node, nodes) {
-		if (!weights[node])
-			weights[node] = 1;
+	/* calculate total weight */
+	for_each_node_mask(node, nodes)
 		weight_total += weights[node];
-	}
 
 	/*
 	 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
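To see what reduce_interleave_weights() actually produces, the same arithmetic can be run in user space. The sketch below assumes a three-node system with made-up bandwidths of 60/30/30 GB/s; gcd() is open-coded since this is not kernel code:

	#include <stdio.h>

	static unsigned int gcd(unsigned int a, unsigned int b)
	{
		while (b) {
			unsigned int t = a % b;

			a = b;
			b = t;
		}
		return a;
	}

	int main(void)
	{
		unsigned int bw[] = { 60000, 30000, 30000 };	/* MB/s, made up */
		unsigned int iw[3], iw_gcd = 0;
		unsigned long long sum_bw = 0;
		int i;

		for (i = 0; i < 3; i++)
			sum_bw += bw[i];

		/* Same scaling as the kernel: weight ~ 32 * bw[i] / sum_bw, minimum 1 */
		for (i = 0; i < 3; i++) {
			unsigned long long sf = 32ULL * bw[i];	/* weightiness = 32 */

			iw[i] = (bw[i] && sum_bw < sf) ? (unsigned int)(sf / sum_bw) : 1;
			iw_gcd = iw_gcd ? gcd(iw_gcd, iw[i]) : iw[i];
		}

		/* Reduce 16:8:8 to 2:1:1 */
		for (i = 0; i < 3; i++)
			printf("node%d weight = %u\n", i, iw[i] / iw_gcd);
		return 0;
	}

Scaling by weightiness = 32 yields 16:8:8, and the GCD pass reduces that to 2:1:1, so node 0 receives two pages for every one placed on nodes 1 and 2.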
@@ -3450,31 +3562,109 @@ static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
 static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
 			  const char *buf, size_t count)
 {
+	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
 	struct iw_node_attr *node_attr;
-	u8 *new;
-	u8 *old;
 	u8 weight = 0;
+	int i;
 
 	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
-	if (count == 0 || sysfs_streq(buf, ""))
-		weight = 0;
-	else if (kstrtou8(buf, 0, &weight))
+	if (count == 0 || sysfs_streq(buf, "") ||
+	    kstrtou8(buf, 0, &weight) || weight == 0)
 		return -EINVAL;
 
-	new = kzalloc(nr_node_ids, GFP_KERNEL);
-	if (!new)
+	new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+			       GFP_KERNEL);
+	if (!new_wi_state)
 		return -ENOMEM;
 
-	mutex_lock(&iw_table_lock);
-	old = rcu_dereference_protected(iw_table,
-					lockdep_is_held(&iw_table_lock));
-	if (old)
-		memcpy(new, old, nr_node_ids);
-	new[node_attr->nid] = weight;
-	rcu_assign_pointer(iw_table, new);
-	mutex_unlock(&iw_table_lock);
-	synchronize_rcu();
-	kfree(old);
+	mutex_lock(&wi_state_lock);
+	old_wi_state = rcu_dereference_protected(wi_state,
+			lockdep_is_held(&wi_state_lock));
+	if (old_wi_state) {
+		memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+		       nr_node_ids * sizeof(u8));
+	} else {
+		for (i = 0; i < nr_node_ids; i++)
+			new_wi_state->iw_table[i] = 1;
+	}
+	new_wi_state->iw_table[node_attr->nid] = weight;
+	new_wi_state->mode_auto = false;
+
+	rcu_assign_pointer(wi_state, new_wi_state);
+	mutex_unlock(&wi_state_lock);
+	if (old_wi_state) {
+		synchronize_rcu();
+		kfree(old_wi_state);
+	}
+	return count;
+}
+
+static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	struct weighted_interleave_state *state;
+	bool wi_auto = true;
+
+	rcu_read_lock();
+	state = rcu_dereference(wi_state);
+	if (state)
+		wi_auto = state->mode_auto;
+	rcu_read_unlock();
+
+	return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
+}
+
+static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+	unsigned int *bw;
+	bool input;
+	int i;
+
+	if (kstrtobool(buf, &input))
+		return -EINVAL;
+
+	new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+			       GFP_KERNEL);
+	if (!new_wi_state)
+		return -ENOMEM;
+	for (i = 0; i < nr_node_ids; i++)
+		new_wi_state->iw_table[i] = 1;
+
+	mutex_lock(&wi_state_lock);
+	old_wi_state = rcu_dereference_protected(wi_state,
+			lockdep_is_held(&wi_state_lock));
+	if (!input) {
+		if (!old_wi_state)
+			goto update_wi_state;
+		if (input == old_wi_state->mode_auto) {
+			mutex_unlock(&wi_state_lock);
+			kfree(new_wi_state);
+			return count;
+		}
+
+		memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+		       nr_node_ids * sizeof(u8));
+		goto update_wi_state;
+	}
+
+	bw = node_bw_table;
+	if (!bw) {
+		mutex_unlock(&wi_state_lock);
+		kfree(new_wi_state);
+		return -ENODEV;
+	}
+
+	new_wi_state->mode_auto = true;
+	reduce_interleave_weights(bw, new_wi_state->iw_table);
+
+update_wi_state:
+	rcu_assign_pointer(wi_state, new_wi_state);
+	mutex_unlock(&wi_state_lock);
+	if (old_wi_state) {
+		synchronize_rcu();
+		kfree(old_wi_state);
+	}
 	return count;
 }
 
@@ -3508,23 +3698,35 @@ static void sysfs_wi_node_delete_all(void)
 		sysfs_wi_node_delete(nid);
 }
 
-static void iw_table_free(void)
+static void wi_state_free(void)
 {
-	u8 *old;
+	struct weighted_interleave_state *old_wi_state;
 
-	mutex_lock(&iw_table_lock);
-	old = rcu_dereference_protected(iw_table,
-					lockdep_is_held(&iw_table_lock));
-	rcu_assign_pointer(iw_table, NULL);
-	mutex_unlock(&iw_table_lock);
+	mutex_lock(&wi_state_lock);
+
+	old_wi_state = rcu_dereference_protected(wi_state,
+			lockdep_is_held(&wi_state_lock));
+	if (!old_wi_state) {
+		mutex_unlock(&wi_state_lock);
+		return;
+	}
+
+	rcu_assign_pointer(wi_state, NULL);
+	mutex_unlock(&wi_state_lock);
 	synchronize_rcu();
-	kfree(old);
+	kfree(old_wi_state);
 }
 
+static struct kobj_attribute wi_auto_attr =
+	__ATTR(auto, 0664, weighted_interleave_auto_show,
+	       weighted_interleave_auto_store);
+
 static void wi_cleanup(void)
 {
+	sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
 	sysfs_wi_node_delete_all();
-	iw_table_free();
+	wi_state_free();
 }
 
 static void wi_kobj_release(struct kobject *wi_kobj)
@@ -3627,6 +3829,10 @@ static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
 	if (err)
 		goto err_put_kobj;
 
+	err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
+	if (err)
+		goto err_put_kobj;
+
 	for_each_online_node(nid) {
 		if (!node_state(nid, N_MEMORY))
 			continue;
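Finally, the node selection in weighted_interleave_nid() boils down to a weighted round-robin walk: the interleave index is taken modulo the total weight, then weights are consumed node by node until the target falls within one. A stand-alone sketch of that walk (not kernel code), using the 2:1:1 weights from the previous example:

	#include <stdio.h>

	int main(void)
	{
		const unsigned int iw[3] = { 2, 1, 1 };	/* weights from above */
		unsigned int weight_total = 4, ilx;

		for (ilx = 0; ilx < 8; ilx++) {
			unsigned int target = ilx % weight_total;
			int nid = 0;

			/* Consume one weight unit per page until target fits */
			while (target >= iw[nid]) {
				target -= iw[nid];
				nid++;
			}
			printf("page %u -> node %d\n", ilx, nid);
		}
		return 0;
	}

Pages 0 and 1 land on node 0, page 2 on node 1, page 3 on node 2, and the pattern repeats every weight_total = 4 pages.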