From fad9c80e6371ee04a3fa5728efe20b88d8e4cccd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 23 May 2023 22:51:01 +0200 Subject: maple_tree: fix a few documentation issues The documentation of mt_next() claims that it starts the search at the provided index. That's incorrect as it starts the search after the provided index. The documentation of mt_find() is slightly confusing. "Handles locking" is not really helpful as it does not explain how the "locking" works. Also the documentation of index talks about a range, while in reality the index is updated on a succesful search to the index of the found entry plus one. Fix similar issues for mt_find_after() and mt_prev(). Reword the confusing "Note: Will not return the zero entry." comment on mt_for_each() and document @__index correctly. Link: https://lkml.kernel.org/r/87ttw2n556.ffs@tglx Signed-off-by: Thomas Gleixner Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox Cc: Shanker Donthineni Signed-off-by: Andrew Morton --- lib/maple_tree.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4dd73cf936a6..f512bb9766aa 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5750,7 +5750,11 @@ EXPORT_SYMBOL_GPL(mas_next_range); * @index: The start index * @max: The maximum index to check * - * Return: The entry at @index or higher, or %NULL if nothing is found. + * Takes RCU read lock internally to protect the search, which does not + * protect the returned pointer after dropping RCU read lock. + * See also: Documentation/core-api/maple_tree.rst + * + * Return: The entry higher than @index or %NULL if nothing is found. */ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max) { @@ -5856,7 +5860,11 @@ EXPORT_SYMBOL_GPL(mas_prev_range); * @index: The start index * @min: The minimum index to check * - * Return: The entry at @index or lower, or %NULL if nothing is found. + * Takes RCU read lock internally to protect the search, which does not + * protect the returned pointer after dropping RCU read lock. + * See also: Documentation/core-api/maple_tree.rst + * + * Return: The entry before @index or %NULL if nothing is found. */ void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min) { @@ -6468,9 +6476,15 @@ EXPORT_SYMBOL(mtree_destroy); * mt_find() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search - * @max: The maximum value to check + * @max: The maximum value of the search range + * + * Takes RCU read lock internally to protect the search, which does not + * protect the returned pointer after dropping RCU read lock. + * See also: Documentation/core-api/maple_tree.rst * - * Handles locking. @index will be incremented to one beyond the range. + * In case that an entry is found @index is updated to point to the next + * possible entry independent whether the found entry is occupying a + * single index or a range if indices. * * Return: The entry at or after the @index or %NULL */ @@ -6528,7 +6542,9 @@ EXPORT_SYMBOL(mt_find); * @index: Pointer which contains the start location of the search * @max: The maximum value to check * - * Handles locking, detects wrapping on index == 0 + * Same as mt_find() except that it checks @index for 0 before + * searching. If @index == 0, the search is aborted. This covers a wrap + * around of @index to 0 in an iterator loop. * * Return: The entry at or after the @index or %NULL */ -- cgit v1.2.3 From d6e8d0dc19a3ebea185cd8e99f2e960d81b153ad Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Wed, 28 Jun 2023 15:36:54 +0800 Subject: maple_tree: add test for mas_wr_modify() fast path Patch series "Optimize the fast path of mas_store()", v4. Add fast paths for mas_wr_append() and mas_wr_slot_store() respectively. The newly added fast path of mas_wr_append() is used in fork() and how much it benefits fork() depends on how many VMAs are duplicated. Thanks Liam for the review. This patch (of 4): Add tests for all cases of mas_wr_append() and mas_wr_slot_store(). Link: https://lkml.kernel.org/r/20230628073657.75314-1-zhangpeng.00@bytedance.com Link: https://lkml.kernel.org/r/20230628073657.75314-2-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'lib') diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 8d4c92cbdd0c..3207c2107918 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -1157,6 +1157,71 @@ static noinline void __init check_ranges(struct maple_tree *mt) MT_BUG_ON(mt, !mt_height(mt)); mtree_destroy(mt); + /* Check in-place modifications */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + /* Append to the start of last range */ + mt_set_non_kernel(50); + for (i = 0; i <= 500; i++) { + val = i * 5 + 1; + val2 = val + 4; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + + /* Append to the last range without touching any boundaries */ + for (i = 0; i < 10; i++) { + val = val2 + 5; + val2 = val + 4; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + + /* Append to the end of last range */ + val = val2; + for (i = 0; i < 10; i++) { + val += 5; + MT_BUG_ON(mt, mtree_test_store_range(mt, val, ULONG_MAX, + xa_mk_value(val)) != 0); + } + + /* Overwriting the range and over a part of the next range */ + for (i = 10; i < 30; i += 2) { + val = i * 5 + 1; + val2 = val + 5; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + + /* Overwriting a part of the range and over the next range */ + for (i = 50; i < 70; i += 2) { + val2 = i * 5; + val = val2 - 5; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + + /* + * Expand the range, only partially overwriting the previous and + * next ranges + */ + for (i = 100; i < 130; i += 3) { + val = i * 5 - 5; + val2 = i * 5 + 1; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + + /* + * Expand the range, only partially overwriting the previous and + * next ranges, in RCU mode + */ + mt_set_in_rcu(mt); + for (i = 150; i < 180; i += 3) { + val = i * 5 - 5; + val2 = i * 5 + 1; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + + MT_BUG_ON(mt, !mt_height(mt)); + mt_validate(mt); + mt_set_non_kernel(0); + mtree_destroy(mt); + /* Test rebalance gaps */ mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); mt_set_non_kernel(50); -- cgit v1.2.3 From 23e9dde0b246d47e4a1942ea50bf7fef63e2d41a Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Wed, 28 Jun 2023 15:36:56 +0800 Subject: maple_tree: optimize mas_wr_append(), also improve duplicating VMAs When the new range can be completely covered by the original last range without touching the boundaries on both sides, two new entries can be appended to the end as a fast path. We update the original last pivot at the end, and the newly appended two entries will not be accessed before this, so it is also safe in RCU mode. This is useful for sequential insertion, which is what we do in dup_mmap(). Enabling BENCH_FORK in test_maple_tree and just running bench_forking() gives the following time-consuming numbers: before: after: 17,874.83 msec 15,738.38 msec It shows about a 12% performance improvement for duplicating VMAs. Link: https://lkml.kernel.org/r/20230628073657.75314-4-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index f512bb9766aa..3e0c91c01781 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4267,10 +4267,10 @@ static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) * * Return: True if appended, false otherwise */ -static inline bool mas_wr_append(struct ma_wr_state *wr_mas) +static inline bool mas_wr_append(struct ma_wr_state *wr_mas, + unsigned char new_end) { unsigned char end = wr_mas->node_end; - unsigned char new_end = end + 1; struct ma_state *mas = wr_mas->mas; unsigned char node_pivots = mt_pivots[wr_mas->type]; @@ -4282,16 +4282,27 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas) ma_set_meta(wr_mas->node, maple_leaf_64, 0, new_end); } - if (mas->last == wr_mas->r_max) { - /* Append to end of range */ - rcu_assign_pointer(wr_mas->slots[new_end], wr_mas->entry); - wr_mas->pivots[end] = mas->index - 1; - mas->offset = new_end; + if (new_end == wr_mas->node_end + 1) { + if (mas->last == wr_mas->r_max) { + /* Append to end of range */ + rcu_assign_pointer(wr_mas->slots[new_end], + wr_mas->entry); + wr_mas->pivots[end] = mas->index - 1; + mas->offset = new_end; + } else { + /* Append to start of range */ + rcu_assign_pointer(wr_mas->slots[new_end], + wr_mas->content); + wr_mas->pivots[end] = mas->last; + rcu_assign_pointer(wr_mas->slots[end], wr_mas->entry); + } } else { - /* Append to start of range */ + /* Append to the range without touching any boundaries. */ rcu_assign_pointer(wr_mas->slots[new_end], wr_mas->content); - wr_mas->pivots[end] = mas->last; - rcu_assign_pointer(wr_mas->slots[end], wr_mas->entry); + wr_mas->pivots[end + 1] = mas->last; + rcu_assign_pointer(wr_mas->slots[end + 1], wr_mas->entry); + wr_mas->pivots[end] = mas->index - 1; + mas->offset = end + 1; } if (!wr_mas->content || !wr_mas->entry) @@ -4338,7 +4349,7 @@ static inline void mas_wr_modify(struct ma_wr_state *wr_mas) goto slow_path; /* Attempt to append */ - if (new_end == wr_mas->node_end + 1 && mas_wr_append(wr_mas)) + if (mas_wr_append(wr_mas, new_end)) return; if (new_end == wr_mas->node_end && mas_wr_slot_store(wr_mas)) -- cgit v1.2.3 From 64891ba3e51fb841b0af70db029038eb93bd5a43 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Wed, 28 Jun 2023 15:36:57 +0800 Subject: maple_tree: add a fast path case in mas_wr_slot_store() When expanding a range in two directions, only partially overwriting the previous and next ranges, the number of entries will not be increased, so we can just update the pivots as a fast path. However, it may introduce potential risks in RCU mode, because it updates two pivots. We only enable it in non-RCU mode. Link: https://lkml.kernel.org/r/20230628073657.75314-5-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 3e0c91c01781..7fad4a7a0b05 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4168,23 +4168,35 @@ static inline bool mas_wr_slot_store(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char offset = mas->offset; + void __rcu **slots = wr_mas->slots; bool gap = false; - if (wr_mas->offset_end - offset != 1) - return false; - - gap |= !mt_slot_locked(mas->tree, wr_mas->slots, offset); - gap |= !mt_slot_locked(mas->tree, wr_mas->slots, offset + 1); + gap |= !mt_slot_locked(mas->tree, slots, offset); + gap |= !mt_slot_locked(mas->tree, slots, offset + 1); - if (mas->index == wr_mas->r_min) { - /* Overwriting the range and over a part of the next range. */ - rcu_assign_pointer(wr_mas->slots[offset], wr_mas->entry); - wr_mas->pivots[offset] = mas->last; - } else { - /* Overwriting a part of the range and over the next range */ - rcu_assign_pointer(wr_mas->slots[offset + 1], wr_mas->entry); + if (wr_mas->offset_end - offset == 1) { + if (mas->index == wr_mas->r_min) { + /* Overwriting the range and a part of the next one */ + rcu_assign_pointer(slots[offset], wr_mas->entry); + wr_mas->pivots[offset] = mas->last; + } else { + /* Overwriting a part of the range and the next one */ + rcu_assign_pointer(slots[offset + 1], wr_mas->entry); + wr_mas->pivots[offset] = mas->index - 1; + mas->offset++; /* Keep mas accurate. */ + } + } else if (!mt_in_rcu(mas->tree)) { + /* + * Expand the range, only partially overwriting the previous and + * next ranges + */ + gap |= !mt_slot_locked(mas->tree, slots, offset + 2); + rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; + wr_mas->pivots[offset + 1] = mas->last; mas->offset++; /* Keep mas accurate. */ + } else { + return false; } trace_ma_write(__func__, mas, 0, wr_mas->entry); -- cgit v1.2.3 From d695c30a8ca07ac7e2138435b461b36289d5656e Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:38 +0800 Subject: maple_tree: don't use MAPLE_ARANGE64_META_MAX to indicate no gap Patch series "Improve the validation for maple tree and some cleanup", v2. This patch (of 7): Do not use a special offset to indicate that there is no gap. When there is no gap, offset can point to any valid slots because its gap is 0. Link: https://lkml.kernel.org/r/20230711035444.526-1-zhangpeng.00@bytedance.com Link: https://lkml.kernel.org/r/20230711035444.526-3-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Tested-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- lib/maple_tree.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 7fad4a7a0b05..da2c8542ad2f 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1610,8 +1610,6 @@ ma_max_gap(struct maple_node *node, unsigned long *gaps, enum maple_type mt, * mas_max_gap() - find the largest gap in a non-leaf node and set the slot. * @mas: The maple state. * - * If the metadata gap is set to MAPLE_ARANGE64_META_MAX, there is no gap. - * * Return: The gap value. */ static inline unsigned long mas_max_gap(struct ma_state *mas) @@ -1628,9 +1626,6 @@ static inline unsigned long mas_max_gap(struct ma_state *mas) node = mas_mn(mas); MAS_BUG_ON(mas, mt != maple_arange_64); offset = ma_meta_gap(node, mt); - if (offset == MAPLE_ARANGE64_META_MAX) - return 0; - gaps = ma_gaps(node, mt); return gaps[offset]; } @@ -1662,10 +1657,7 @@ static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset, ascend: MAS_BUG_ON(mas, pmt != maple_arange_64); meta_offset = ma_meta_gap(pnode, pmt); - if (meta_offset == MAPLE_ARANGE64_META_MAX) - meta_gap = 0; - else - meta_gap = pgaps[meta_offset]; + meta_gap = pgaps[meta_offset]; pgaps[offset] = new; @@ -1678,7 +1670,6 @@ ascend: ma_set_meta_gap(pnode, pmt, offset); } else if (new < meta_gap) { - meta_offset = 15; new = ma_max_gap(pnode, pgaps, pmt, &meta_offset); ma_set_meta_gap(pnode, pmt, meta_offset); } @@ -2076,7 +2067,7 @@ static inline void mab_mas_cp(struct maple_big_node *b_node, end = j - 1; if (likely(!ma_is_leaf(mt) && mt_is_alloc(mas->tree))) { unsigned long max_gap = 0; - unsigned char offset = 15; + unsigned char offset = 0; gaps = ma_gaps(node, mt); do { -- cgit v1.2.3 From f8e5eac8abe3d26106e5470c735058f04f60f61e Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:39 +0800 Subject: maple_tree: make mas_validate_gaps() to check metadata Make mas_validate_gaps() check whether the offset in the metadata points to the largest gap. By the way, simplify this function. Add the verification that gaps beyond the node limit are zero. Link: https://lkml.kernel.org/r/20230711035444.526-4-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Tested-by: Geert Uytterhoeven Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 78 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 36 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index da2c8542ad2f..9ce78e5e6084 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6957,15 +6957,16 @@ EXPORT_SYMBOL_GPL(mt_dump); static void mas_validate_gaps(struct ma_state *mas) { struct maple_enode *mte = mas->node; - struct maple_node *p_mn; + struct maple_node *p_mn, *node = mte_to_node(mte); + enum maple_type mt = mte_node_type(mas->node); unsigned long gap = 0, max_gap = 0; unsigned long p_end, p_start = mas->min; - unsigned char p_slot; + unsigned char p_slot, offset; unsigned long *gaps = NULL; - unsigned long *pivots = ma_pivots(mte_to_node(mte), mte_node_type(mte)); - int i; + unsigned long *pivots = ma_pivots(node, mt); + unsigned int i; - if (ma_is_dense(mte_node_type(mte))) { + if (ma_is_dense(mt)) { for (i = 0; i < mt_slot_count(mte); i++) { if (mas_get_slot(mas, i)) { if (gap > max_gap) @@ -6978,52 +6979,59 @@ static void mas_validate_gaps(struct ma_state *mas) goto counted; } - gaps = ma_gaps(mte_to_node(mte), mte_node_type(mte)); + gaps = ma_gaps(node, mt); for (i = 0; i < mt_slot_count(mte); i++) { - p_end = mas_logical_pivot(mas, pivots, i, mte_node_type(mte)); + p_end = mas_logical_pivot(mas, pivots, i, mt); if (!gaps) { - if (mas_get_slot(mas, i)) { - gap = 0; - goto not_empty; - } - - gap += p_end - p_start + 1; + if (!mas_get_slot(mas, i)) + gap = p_end - p_start + 1; } else { void *entry = mas_get_slot(mas, i); gap = gaps[i]; - if (!entry) { - if (gap != p_end - p_start + 1) { - pr_err("%p[%u] -> %p %lu != %lu - %lu + 1\n", - mas_mn(mas), i, - mas_get_slot(mas, i), gap, - p_end, p_start); - mt_dump(mas->tree, mt_dump_hex); - - MT_BUG_ON(mas->tree, - gap != p_end - p_start + 1); - } - } else { - if (gap > p_end - p_start + 1) { - pr_err("%p[%u] %lu >= %lu - %lu + 1 (%lu)\n", - mas_mn(mas), i, gap, p_end, p_start, - p_end - p_start + 1); - MT_BUG_ON(mas->tree, - gap > p_end - p_start + 1); - } + MT_BUG_ON(mas->tree, !entry); + + if (gap > p_end - p_start + 1) { + pr_err("%p[%u] %lu >= %lu - %lu + 1 (%lu)\n", + mas_mn(mas), i, gap, p_end, p_start, + p_end - p_start + 1); + MT_BUG_ON(mas->tree, gap > p_end - p_start + 1); } } if (gap > max_gap) max_gap = gap; -not_empty: + p_start = p_end + 1; if (p_end >= mas->max) break; } counted: + if (mt == maple_arange_64) { + offset = ma_meta_gap(node, mt); + if (offset > i) { + pr_err("gap offset %p[%u] is invalid\n", node, offset); + MT_BUG_ON(mas->tree, 1); + } + + if (gaps[offset] != max_gap) { + pr_err("gap %p[%u] is not the largest gap %lu\n", + node, offset, max_gap); + MT_BUG_ON(mas->tree, 1); + } + + MT_BUG_ON(mas->tree, !gaps); + for (i++ ; i < mt_slot_count(mte); i++) { + if (gaps[i] != 0) { + pr_err("gap %p[%u] beyond node limit != 0\n", + node, i); + MT_BUG_ON(mas->tree, 1); + } + } + } + if (mte_is_root(mte)) return; @@ -7033,10 +7041,8 @@ counted: if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) { pr_err("gap %p[%u] != %lu\n", p_mn, p_slot, max_gap); mt_dump(mas->tree, mt_dump_hex); + MT_BUG_ON(mas->tree, 1); } - - MT_BUG_ON(mas->tree, - ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap); } static void mas_validate_parent_slot(struct ma_state *mas) -- cgit v1.2.3 From e93fda5a1ab7a0c6143ae8a6f231c9f5f3c417b1 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:40 +0800 Subject: maple_tree: fix mas_validate_child_slot() to check last missed slot Don't break the loop before checking the last slot. Also here check if non-leaf nodes are missing children. Link: https://lkml.kernel.org/r/20230711035444.526-5-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Tested-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- lib/maple_tree.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 9ce78e5e6084..af8fb75ad688 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7093,11 +7093,12 @@ static void mas_validate_child_slot(struct ma_state *mas) for (i = 0; i < mt_slots[type]; i++) { child = mas_slot(mas, slots, i); - if (!pivots[i] || pivots[i] == mas->max) - break; - if (!child) - break; + if (!child) { + pr_err("Non-leaf node lacks child at %p[%u]\n", + mas_mn(mas), i); + MT_BUG_ON(mas->tree, 1); + } if (mte_parent_slot(child) != i) { pr_err("Slot error at %p[%u]: child %p has pslot %u\n", @@ -7112,6 +7113,9 @@ static void mas_validate_child_slot(struct ma_state *mas) mte_to_node(mas->node)); MT_BUG_ON(mas->tree, 1); } + + if (i < mt_pivots[type] && pivots[i] == mas->max) + break; } } -- cgit v1.2.3 From 33af39d0244ce4944ab16728f7b04df9dfc6d365 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:41 +0800 Subject: maple_tree: make mas_validate_limits() check root node and node limit Update mas_validate_limits() to check root node, check node limit pivot if there is enough room for it to exist and check data_end. Remove the check for child existence as it is done in mas_validate_child_slot(). Link: https://lkml.kernel.org/r/20230711035444.526-6-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Tested-by: Geert Uytterhoeven Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index af8fb75ad688..31ac4f2c4426 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7120,7 +7120,9 @@ static void mas_validate_child_slot(struct ma_state *mas) } /* - * Validate all pivots are within mas->min and mas->max. + * Validate all pivots are within mas->min and mas->max, check metadata ends + * where the maximum ends and ensure there is no slots or pivots set outside of + * the end of the data. */ static void mas_validate_limits(struct ma_state *mas) { @@ -7130,26 +7132,15 @@ static void mas_validate_limits(struct ma_state *mas) void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mas_mn(mas), type); - /* all limits are fine here. */ - if (mte_is_root(mas->node)) - return; - for (i = 0; i < mt_slots[type]; i++) { unsigned long piv; piv = mas_safe_pivot(mas, pivots, i, type); - if (!piv && (i != 0)) - break; - - if (!mte_is_leaf(mas->node)) { - void *entry = mas_slot(mas, slots, i); - - if (!entry) - pr_err("%p[%u] cannot be null\n", - mas_mn(mas), i); - - MT_BUG_ON(mas->tree, !entry); + if (!piv && (i != 0)) { + pr_err("Missing node limit pivot at %p[%u]", + mas_mn(mas), i); + MAS_WARN_ON(mas, 1); } if (prev_piv > piv) { @@ -7172,6 +7163,13 @@ static void mas_validate_limits(struct ma_state *mas) if (piv == mas->max) break; } + + if (mas_data_end(mas) != i) { + pr_err("node%p: data_end %u != the last slot offset %u\n", + mas_mn(mas), mas_data_end(mas), i); + MT_BUG_ON(mas->tree, 1); + } + for (i += 1; i < mt_slots[type]; i++) { void *entry = mas_slot(mas, slots, i); -- cgit v1.2.3 From a489539e33c29b469bcd023a32c99078c2597c7c Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:42 +0800 Subject: maple_tree: update mt_validate() Instead of using mas_first_entry() to find the leftmost leaf, use a simple loop instead. Remove an unneeded check for root node. To make the error message more accurate, check pivots first and then slots, because checking slots depend on the node limit pivot to break the loop. Link: https://lkml.kernel.org/r/20230711035444.526-7-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Tested-by: Geert Uytterhoeven Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 31ac4f2c4426..e08ef44926c6 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7244,21 +7244,20 @@ void mt_validate(struct maple_tree *mt) if (!mas_searchable(&mas)) goto done; - mas_first_entry(&mas, mas_mn(&mas), ULONG_MAX, mte_node_type(mas.node)); + while (!mte_is_leaf(mas.node)) + mas_descend(&mas); + while (!mas_is_none(&mas)) { MAS_WARN_ON(&mas, mte_dead_node(mas.node)); - if (!mte_is_root(mas.node)) { - end = mas_data_end(&mas); - if (MAS_WARN_ON(&mas, - (end < mt_min_slot_count(mas.node)) && - (mas.max != ULONG_MAX))) { - pr_err("Invalid size %u of %p\n", end, - mas_mn(&mas)); - } + end = mas_data_end(&mas); + if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) && + (mas.max != ULONG_MAX))) { + pr_err("Invalid size %u of %p\n", end, mas_mn(&mas)); } + mas_validate_parent_slot(&mas); - mas_validate_child_slot(&mas); mas_validate_limits(&mas); + mas_validate_child_slot(&mas); if (mt_is_alloc(mt)) mas_validate_gaps(&mas); mas_dfs_postorder(&mas, ULONG_MAX); -- cgit v1.2.3 From 29b2681f1aa95cff6ec0afdeac0b2cab659a5564 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:43 +0800 Subject: maple_tree: replace mas_logical_pivot() with mas_safe_pivot() Replace mas_logical_pivot() with mas_safe_pivot() and drop mas_logical_pivot() since it won't be used anymore. We can do this since now all nodes will have node limit pivot (if it is not full node). Link: https://lkml.kernel.org/r/20230711035444.526-8-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Tested-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- lib/maple_tree.c | 33 +++------------------------------ 1 file changed, 3 insertions(+), 30 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index e08ef44926c6..4f3209ca0e3b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -728,33 +728,6 @@ mas_safe_min(struct ma_state *mas, unsigned long *pivots, unsigned char offset) return mas->min; } -/* - * mas_logical_pivot() - Get the logical pivot of a given offset. - * @mas: The maple state - * @pivots: The pointer to the maple node pivots - * @offset: The offset into the pivot array - * @type: The maple node type - * - * When there is no value at a pivot (beyond the end of the data), then the - * pivot is actually @mas->max. - * - * Return: the logical pivot of a given @offset. - */ -static inline unsigned long -mas_logical_pivot(struct ma_state *mas, unsigned long *pivots, - unsigned char offset, enum maple_type type) -{ - unsigned long lpiv = mas_safe_pivot(mas, pivots, offset, type); - - if (likely(lpiv)) - return lpiv; - - if (likely(offset)) - return mas->max; - - return lpiv; -} - /* * mte_set_pivot() - Set a pivot to a value in an encoded maple node. * @mn: The encoded maple node @@ -2202,7 +2175,7 @@ static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, goto b_end; /* Handle new range ending before old range ends */ - piv = mas_logical_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type); + piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type); if (piv > mas->last) { if (piv == ULONG_MAX) mas_bulk_rebalance(mas, b_node->b_end, wr_mas->type); @@ -4935,7 +4908,7 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) min = mas_safe_min(mas, pivots, offset); data_end = ma_data_end(node, type, pivots, mas->max); for (; offset <= data_end; offset++) { - pivot = mas_logical_pivot(mas, pivots, offset, type); + pivot = mas_safe_pivot(mas, pivots, offset, type); /* Not within lower bounds */ if (mas->index > pivot) @@ -6981,7 +6954,7 @@ static void mas_validate_gaps(struct ma_state *mas) gaps = ma_gaps(node, mt); for (i = 0; i < mt_slot_count(mte); i++) { - p_end = mas_logical_pivot(mas, pivots, i, mt); + p_end = mas_safe_pivot(mas, pivots, i, mt); if (!gaps) { if (!mas_get_slot(mas, i)) -- cgit v1.2.3 From 6783bd4b5f72b483cf492dc09500548b495670b5 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:44 +0800 Subject: maple_tree: drop mas_first_entry() The internal function mas_first_entry() is no longer used, so drop it. Link: https://lkml.kernel.org/r/20230711035444.526-9-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Tested-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- lib/maple_tree.c | 72 -------------------------------------------------------- 1 file changed, 72 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4f3209ca0e3b..cef47ce8eddd 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6636,78 +6636,6 @@ static inline struct maple_enode *mas_get_slot(struct ma_state *mas, offset); } - -/* - * mas_first_entry() - Go the first leaf and find the first entry. - * @mas: the maple state. - * @limit: the maximum index to check. - * @*r_start: Pointer to set to the range start. - * - * Sets mas->offset to the offset of the entry, r_start to the range minimum. - * - * Return: The first entry or MAS_NONE. - */ -static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn, - unsigned long limit, enum maple_type mt) - -{ - unsigned long max; - unsigned long *pivots; - void __rcu **slots; - void *entry = NULL; - - mas->index = mas->min; - if (mas->index > limit) - goto none; - - max = mas->max; - mas->offset = 0; - while (likely(!ma_is_leaf(mt))) { - MAS_WARN_ON(mas, mte_dead_node(mas->node)); - slots = ma_slots(mn, mt); - entry = mas_slot(mas, slots, 0); - pivots = ma_pivots(mn, mt); - if (unlikely(ma_dead_node(mn))) - return NULL; - max = pivots[0]; - mas->node = entry; - mn = mas_mn(mas); - mt = mte_node_type(mas->node); - } - MAS_WARN_ON(mas, mte_dead_node(mas->node)); - - mas->max = max; - slots = ma_slots(mn, mt); - entry = mas_slot(mas, slots, 0); - if (unlikely(ma_dead_node(mn))) - return NULL; - - /* Slot 0 or 1 must be set */ - if (mas->index > limit) - goto none; - - if (likely(entry)) - return entry; - - mas->offset = 1; - entry = mas_slot(mas, slots, 1); - pivots = ma_pivots(mn, mt); - if (unlikely(ma_dead_node(mn))) - return NULL; - - mas->index = pivots[0] + 1; - if (mas->index > limit) - goto none; - - if (likely(entry)) - return entry; - -none: - if (likely(!ma_dead_node(mn))) - mas->node = MAS_NONE; - return NULL; -} - /* Depth first search, post-order */ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) { -- cgit v1.2.3 From efb78fa86e95832b78ca0ba60f3706788a818938 Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Fri, 14 Jul 2023 11:52:38 +1000 Subject: lib/test_meminit: allocate pages up to order MAX_ORDER test_pages() tests the page allocator by calling alloc_pages() with different orders up to order 10. However, different architectures and platforms support different maximum contiguous allocation sizes. The default maximum allocation order (MAX_ORDER) is 10, but architectures can use CONFIG_ARCH_FORCE_MAX_ORDER to override this. On platforms where this is less than 10, test_meminit() will blow up with a WARN(). This is expected, so let's not do that. Replace the hardcoded "10" with the MAX_ORDER macro so that we test allocations up to the expected platform limit. Link: https://lkml.kernel.org/r/20230714015238.47931-1-ajd@linux.ibm.com Fixes: 5015a300a522 ("lib: introduce test_meminit module") Signed-off-by: Andrew Donnellan Reviewed-by: Alexander Potapenko Cc: Xiaoke Wang Cc: Signed-off-by: Andrew Morton --- lib/test_meminit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/test_meminit.c b/lib/test_meminit.c index 60e1984c060f..0ae35223d773 100644 --- a/lib/test_meminit.c +++ b/lib/test_meminit.c @@ -93,7 +93,7 @@ static int __init test_pages(int *total_failures) int failures = 0, num_tests = 0; int i; - for (i = 0; i < 10; i++) + for (i = 0; i <= MAX_ORDER; i++) num_tests += do_alloc_pages_order(i, &failures); REPORT_FAILURES_IN_FN(); -- cgit v1.2.3 From 4445e58264aea8ec6bb1287add79606f0e3f3988 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sat, 15 Jul 2023 17:39:20 +0300 Subject: maple_tree: mtree_insert*: fix typo in kernel-doc description Replace "Insert and entry at a give index" with "Insert an entry at a given index" Link: https://lkml.kernel.org/r/20230715143920.994812-1-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index cef47ce8eddd..616ec7f3be81 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6274,7 +6274,7 @@ int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, EXPORT_SYMBOL(mtree_store); /** - * mtree_insert_range() - Insert an entry at a give range if there is no value. + * mtree_insert_range() - Insert an entry at a given range if there is no value. * @mt: The maple tree * @first: The start of the range * @last: The end of the range @@ -6310,7 +6310,7 @@ retry: EXPORT_SYMBOL(mtree_insert_range); /** - * mtree_insert() - Insert an entry at a give index if there is no value. + * mtree_insert() - Insert an entry at a given index if there is no value. * @mt: The maple tree * @index : The index to store the value * @entry: The entry to store -- cgit v1.2.3 From 4ae6944d15727c50ff1c0bb3fe38b9b412520d85 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sat, 15 Jul 2023 11:40:38 +0300 Subject: maple_tree: mtree_insert: fix typo in kernel-doc description of GFP flags Replace FGP_FLAGS with GFP_FLAGS Link: https://lkml.kernel.org/r/20230715084038.987955-1-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 616ec7f3be81..b6b3973897f0 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6314,7 +6314,7 @@ EXPORT_SYMBOL(mtree_insert_range); * @mt: The maple tree * @index : The index to store the value * @entry: The entry to store - * @gfp: The FGP_FLAGS to use for allocations. + * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. -- cgit v1.2.3 From 19a462f06eb5a78e0c3ebe4fd4fbdc71620b8788 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 14 Jul 2023 15:55:51 -0400 Subject: maple_tree: Be more strict about locking Use lockdep to check the write path in the maple tree holds the lock in write mode. Introduce mt_write_lock_is_held() to check if the lock is held for writing. Update the necessary checks for rcu_dereference_protected() to use the new write lock check. Link: https://lkml.kernel.org/r/20230714195551.894800-5-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Linus Torvalds Cc: Oliver Sang Signed-off-by: Andrew Morton --- lib/maple_tree.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b6b3973897f0..3b6f8c8dac65 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -777,6 +777,12 @@ static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt) } } +static inline bool mt_write_locked(const struct maple_tree *mt) +{ + return mt_external_lock(mt) ? mt_write_lock_is_held(mt) : + lockdep_is_held(&mt->ma_lock); +} + static inline bool mt_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? mt_lock_is_held(mt) : @@ -792,7 +798,7 @@ static inline void *mt_slot(const struct maple_tree *mt, static inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots, unsigned char offset) { - return rcu_dereference_protected(slots[offset], mt_locked(mt)); + return rcu_dereference_protected(slots[offset], mt_write_locked(mt)); } /* * mas_slot_locked() - Get the slot value when holding the maple tree lock. @@ -835,7 +841,7 @@ static inline void *mas_root(struct ma_state *mas) static inline void *mt_root_locked(struct maple_tree *mt) { - return rcu_dereference_protected(mt->ma_root, mt_locked(mt)); + return rcu_dereference_protected(mt->ma_root, mt_write_locked(mt)); } /* -- cgit v1.2.3 From 361c678be709f67a5b609ec3666ff5fec7eb8baf Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:43 -0400 Subject: maple_tree: add benchmarking for mas_for_each Patch series "Reduce preallocations for maple tree", v3. Initial work on preallocations showed no regression in performance during testing, but recently some users (both on [1] and off [android] list) have reported that preallocating the worst-case number of nodes has caused some slow down. This patch set addresses the number of allocations in a few ways. During munmap() most munmap() operations will remove a single VMA, so leverage the fact that the maple tree can place a single pointer at range 0 - 0 without allocating. This is done by changing the index of the VMAs to be indexed by the count, starting at 0. Re-introduce the entry argument to mas_preallocate() so that a more intelligent guess of the node count can be made. Implement the more intelligent guess of the node count, although there is more work to be done. During development of v2 of this patch set, I also noticed that the number of nodes being allocated for a rebalance was beyond what could possibly be needed. This is addressed in patch 0008. This patch (of 15): Add a way to test the speed of mas_for_each() to the testing code. Link: https://lkml.kernel.org/r/20230724183157.3939892-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230724183157.3939892-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'lib') diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 3207c2107918..9c4cf5fb2b7f 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -44,6 +44,7 @@ atomic_t maple_tree_tests_passed; /* #define BENCH_WALK */ /* #define BENCH_MT_FOR_EACH */ /* #define BENCH_FORK */ +/* #define BENCH_MAS_FOR_EACH */ #ifdef __KERNEL__ #define mt_set_non_kernel(x) do {} while (0) @@ -1770,6 +1771,37 @@ static noinline void __init bench_mt_for_each(struct maple_tree *mt) } #endif +#if defined(BENCH_MAS_FOR_EACH) +static noinline void __init bench_mas_for_each(struct maple_tree *mt) +{ + int i, count = 1000000; + unsigned long max = 2500; + void *entry; + MA_STATE(mas, mt, 0, 0); + + for (i = 0; i < max; i += 5) { + int gap = 4; + + if (i % 30 == 0) + gap = 3; + mtree_store_range(mt, i, i + gap, xa_mk_value(i), GFP_KERNEL); + } + + rcu_read_lock(); + for (i = 0; i < count; i++) { + unsigned long j = 0; + + mas_for_each(&mas, entry, max) { + MT_BUG_ON(mt, entry != xa_mk_value(j)); + j += 5; + } + mas_set(&mas, 0); + } + rcu_read_unlock(); + +} +#endif + /* check_forking - simulate the kernel forking sequence with the tree. */ static noinline void __init check_forking(struct maple_tree *mt) { @@ -3498,6 +3530,13 @@ static int __init maple_tree_seed(void) mtree_destroy(&tree); goto skip; #endif +#if defined(BENCH_MAS_FOR_EACH) +#define BENCH + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + bench_mas_for_each(&tree); + mtree_destroy(&tree); + goto skip; +#endif mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_iteration(&tree); -- cgit v1.2.3 From 8c314f3b55fbc42284ea1367bb2807f2accad8ae Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:44 -0400 Subject: maple_tree: add benchmarking for mas_prev() Add some benchmarking functions in testing for mas_prev(). This is useful to ensure there are no regressions added during modifications. Link: https://lkml.kernel.org/r/20230724183157.3939892-3-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'lib') diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 9c4cf5fb2b7f..0674aebd4423 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -45,6 +45,7 @@ atomic_t maple_tree_tests_passed; /* #define BENCH_MT_FOR_EACH */ /* #define BENCH_FORK */ /* #define BENCH_MAS_FOR_EACH */ +/* #define BENCH_MAS_PREV */ #ifdef __KERNEL__ #define mt_set_non_kernel(x) do {} while (0) @@ -1801,7 +1802,36 @@ static noinline void __init bench_mas_for_each(struct maple_tree *mt) } #endif +#if defined(BENCH_MAS_PREV) +static noinline void __init bench_mas_prev(struct maple_tree *mt) +{ + int i, count = 1000000; + unsigned long max = 2500; + void *entry; + MA_STATE(mas, mt, 0, 0); + + for (i = 0; i < max; i += 5) { + int gap = 4; + + if (i % 30 == 0) + gap = 3; + mtree_store_range(mt, i, i + gap, xa_mk_value(i), GFP_KERNEL); + } + + rcu_read_lock(); + for (i = 0; i < count; i++) { + unsigned long j = 2495; + + mas_set(&mas, ULONG_MAX); + while ((entry = mas_prev(&mas, 0)) != NULL) { + MT_BUG_ON(mt, entry != xa_mk_value(j)); + j -= 5; + } + } + rcu_read_unlock(); +} +#endif /* check_forking - simulate the kernel forking sequence with the tree. */ static noinline void __init check_forking(struct maple_tree *mt) { @@ -3537,6 +3567,13 @@ static int __init maple_tree_seed(void) mtree_destroy(&tree); goto skip; #endif +#if defined(BENCH_MAS_PREV) +#define BENCH + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + bench_mas_prev(&tree); + mtree_destroy(&tree); + goto skip; +#endif mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_iteration(&tree); -- cgit v1.2.3 From da0892547b101df6e13255b378380d077975368d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:49 -0400 Subject: maple_tree: re-introduce entry to mas_preallocate() arguments The current preallocation strategy is to preallocate the absolute worst-case allocation for a tree modification. The entry (or NULL) is needed to know how many nodes are needed to write to the tree. Start by adding the argument to the mas_preallocate() definition. Link: https://lkml.kernel.org/r/20230724183157.3939892-8-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 3b6f8c8dac65..0d7e30c7d999 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5529,11 +5529,12 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc); /** * mas_preallocate() - Preallocate enough nodes for a store operation * @mas: The maple state + * @entry: The entry that will be stored * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ -int mas_preallocate(struct ma_state *mas, gfp_t gfp) +int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { int ret; -- cgit v1.2.3 From c108df767fb786586274b2435473885151d6f360 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:50 -0400 Subject: maple_tree: adjust node allocation on mas_rebalance() mas_rebalance() is called to rebalance an insufficient node into a single node or two sufficient nodes. The preallocation estimate is always too many in this case as the height of the tree will never grow and there is no possibility to have a three way split in this case, so revise the node allocation count. Link: https://lkml.kernel.org/r/20230724183157.3939892-9-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 0d7e30c7d999..494f884ef17f 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3136,7 +3136,7 @@ static inline int mas_rebalance(struct ma_state *mas, * tries to combine the data in the same way. If one node contains the * entire range of the tree, then that node is used as a new root node. */ - mas_node_count(mas, 1 + empty_count * 3); + mas_node_count(mas, empty_count * 2 - 1); if (mas_is_err(mas)) return 0; -- cgit v1.2.3 From a7496ad529dfd96e37219849bddcda121d133536 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:53 -0400 Subject: maple_tree: move mas_wr_end_piv() below mas_wr_extend_null() Relocate it and call mas_wr_extend_null() from within mas_wr_end_piv(). Extending the NULL may affect the end pivot value so call mas_wr_endtend_null() from within mas_wr_end_piv() to keep it all together. Link: https://lkml.kernel.org/r/20230724183157.3939892-12-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 494f884ef17f..db61cdd8a649 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4180,18 +4180,6 @@ static inline bool mas_wr_slot_store(struct ma_wr_state *wr_mas) return true; } -static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) -{ - while ((wr_mas->offset_end < wr_mas->node_end) && - (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end])) - wr_mas->offset_end++; - - if (wr_mas->offset_end < wr_mas->node_end) - wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; - else - wr_mas->end_piv = wr_mas->mas->max; -} - static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; @@ -4228,6 +4216,21 @@ static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) } } +static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) +{ + while ((wr_mas->offset_end < wr_mas->node_end) && + (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end])) + wr_mas->offset_end++; + + if (wr_mas->offset_end < wr_mas->node_end) + wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; + else + wr_mas->end_piv = wr_mas->mas->max; + + if (!wr_mas->entry) + mas_wr_extend_null(wr_mas); +} + static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; @@ -4371,10 +4374,6 @@ static inline void *mas_wr_store_entry(struct ma_wr_state *wr_mas) /* At this point, we are at the leaf node that needs to be altered. */ mas_wr_end_piv(wr_mas); - - if (!wr_mas->entry) - mas_wr_extend_null(wr_mas); - /* New root for a single pointer */ if (unlikely(!mas->index && mas->last == ULONG_MAX)) { mas_new_root(mas, wr_mas->entry); -- cgit v1.2.3 From 17983dc617837a588a52848ab4034d8efa6c1fa6 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:55 -0400 Subject: maple_tree: refine mas_preallocate() node calculations Calculate the number of nodes based on the pending write action instead of assuming the worst case. This addresses a performance regression introduced in platforms that have longer allocation timing. Link: https://lkml.kernel.org/r/20230724183157.3939892-14-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index db61cdd8a649..4a111785360f 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5535,9 +5535,51 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc); */ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { + MA_WR_STATE(wr_mas, mas, entry); + unsigned char node_size; + int request = 1; int ret; - mas_node_count_gfp(mas, 1 + mas_mt_height(mas) * 3, gfp); + + if (unlikely(!mas->index && mas->last == ULONG_MAX)) + goto ask_now; + + mas_wr_store_setup(&wr_mas); + wr_mas.content = mas_start(mas); + /* Root expand */ + if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) + goto ask_now; + + if (unlikely(!mas_wr_walk(&wr_mas))) { + /* Spanning store, use worst case for now */ + request = 1 + mas_mt_height(mas) * 3; + goto ask_now; + } + + /* At this point, we are at the leaf node that needs to be altered. */ + /* Exact fit, no nodes needed. */ + if (wr_mas.r_min == mas->index && wr_mas.r_max == mas->last) + return 0; + + mas_wr_end_piv(&wr_mas); + node_size = mas_wr_new_end(&wr_mas); + if (node_size >= mt_slots[wr_mas.type]) { + /* Split, worst case for now. */ + request = 1 + mas_mt_height(mas) * 2; + goto ask_now; + } + + /* New root needs a singe node */ + if (unlikely(mte_is_root(mas->node))) + goto ask_now; + + /* Potential spanning rebalance collapsing a node, use worst-case */ + if (node_size - 1 <= mt_min_slots[wr_mas.type]) + request = mas_mt_height(mas) * 2 - 1; + + /* node store, slot store needs one node */ +ask_now: + mas_node_count_gfp(mas, request, gfp); mas->mas_flags |= MA_STATE_PREALLOC; if (likely(!mas_is_err(mas))) return 0; -- cgit v1.2.3 From fec29364348fec535c55708b1f4025b321aba572 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:56 -0400 Subject: maple_tree: reduce resets during store setup mas_prealloc() may walk partially down the tree before finding that a split or spanning store is needed. When the write occurs, relax the logic on resetting the walk so that partial walks will not restart, but walks that have gone too far (a store that affects beyond the current node) should be restarted. Link: https://lkml.kernel.org/r/20230724183157.3939892-15-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4a111785360f..a3d602cfd030 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5424,19 +5424,34 @@ static inline void mte_destroy_walk(struct maple_enode *enode, static void mas_wr_store_setup(struct ma_wr_state *wr_mas) { + if (mas_is_start(wr_mas->mas)) + return; + if (unlikely(mas_is_paused(wr_mas->mas))) - mas_reset(wr_mas->mas); + goto reset; - if (!mas_is_start(wr_mas->mas)) { - if (mas_is_none(wr_mas->mas)) { - mas_reset(wr_mas->mas); - } else { - wr_mas->r_max = wr_mas->mas->max; - wr_mas->type = mte_node_type(wr_mas->mas->node); - if (mas_is_span_wr(wr_mas)) - mas_reset(wr_mas->mas); - } - } + if (unlikely(mas_is_none(wr_mas->mas))) + goto reset; + + /* + * A less strict version of mas_is_span_wr() where we allow spanning + * writes within this node. This is to stop partial walks in + * mas_prealloc() from being reset. + */ + if (wr_mas->mas->last > wr_mas->mas->max) + goto reset; + + if (wr_mas->entry) + return; + + if (mte_is_leaf(wr_mas->mas->node) && + wr_mas->mas->last == wr_mas->mas->max) + goto reset; + + return; + +reset: + mas_reset(wr_mas->mas); } /* Interface */ -- cgit v1.2.3 From 83d97f620f611ab3fbf2de585bf34bd9dab513c2 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 4 Aug 2023 12:59:46 -0400 Subject: maple_tree: add hex output to maple_arange64 dump Patch series "maple_tree: Change replacement strategy". The maple tree marks nodes dead as soon as they are going to be replaced. This could be problematic when used in the RCU context since the writer may be starved of CPU time by the readers. This patch set addresses the issue by switching the data replacement strategy to one that will only mark data as dead once the new data is available. This series changes the ordering of the node replacement so that the new data is live before the old data is marked 'dead'. When readers hit 'dead' nodes, they will restart from the top of the tree and end up in the new data. In more complex scenarios, the replacement strategy means a subtree is built and graphed into the tree leaving some nodes to point to the old parent. The view of tasks into the old data will either remain with the old data, or see the new data once the old data is marked 'dead'. Iterators will see the 'dead' node and restart on their own and switch to the new data. There is no risk of the reader seeing old data in these cases. The 'dead' subtree of data is then fully marked dead, but reused nodes will still point to the dead nodes until the parent pointer is updated. Walking up to a 'dead' node will cause a re-walk from the top of the tree and enter the new data area where old data is not reachable. Once the parent pointers are fully up to date in the active data, the 'dead' subtree is iterated to collect entirely 'dead' subtrees, and dead nodes (nodes that partially contained reused data). This patch (of 6): When dumping the tree, honour formatting request to output hex for the maple node type arange64. Link: https://lkml.kernel.org/r/20230804165951.2661157-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230804165951.2661157-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Paul E. McKenney Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index a3d602cfd030..880ce0fcdcac 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6833,11 +6833,27 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry, int i; pr_cont(" contents: "); - for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) - pr_cont("%lu ", node->gap[i]); + for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { + switch (format) { + case mt_dump_hex: + pr_cont("%lx ", node->gap[i]); + break; + default: + case mt_dump_dec: + pr_cont("%lu ", node->gap[i]); + } + } pr_cont("| %02X %02X| ", node->meta.end, node->meta.gap); - for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) - pr_cont("%p %lu ", node->slot[i], node->pivot[i]); + for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) { + switch (format) { + case mt_dump_hex: + pr_cont("%p %lX ", node->slot[i], node->pivot[i]); + break; + default: + case mt_dump_dec: + pr_cont("%p %lu ", node->slot[i], node->pivot[i]); + } + } pr_cont("%p\n", node->slot[i]); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { unsigned long last = max; -- cgit v1.2.3 From 72bcf4aa86ece2b49fbdc7fe83d3a05c7ebcfc97 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 4 Aug 2023 12:59:47 -0400 Subject: maple_tree: reorder replacement of nodes to avoid live lock Replacing nodes may cause a live lock-up if CPU resources are saturated by write operations on the tree by continuously retrying on dead nodes. To avoid the continuous retry scenario, ensure the new node is inserted into the tree prior to marking the old data as dead. This will define a window where old and new data is swapped. When reusing lower level nodes, ensure the parent pointer is updated after the parent is marked dead. This ensures that the child is still reachable from the top of the tree, but walking up to a dead node will result in a single retry that will start a fresh walk from the top down through the new node. Link: https://lkml.kernel.org/r/20230804165951.2661157-3-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Paul E. McKenney Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 880ce0fcdcac..0d4573a8d134 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1756,6 +1756,36 @@ static inline void mas_replace(struct ma_state *mas, bool advanced) } } +/* + * mas_replace_node() - Replace a node by putting it in the tree, marking it + * dead, and freeing it. + * the parent encoding to locate the maple node in the tree. + * @mas - the ma_state with @mas->node pointing to the new node. + * @old_enode - The old maple encoded node. + */ +static inline void mas_replace_node(struct ma_state *mas, + struct maple_enode *old_enode) + __must_hold(mas->tree->ma_lock) +{ + if (mte_is_root(mas->node)) { + mas_mn(mas)->parent = ma_parent_ptr( + ((unsigned long)mas->tree | MA_ROOT_PARENT)); + rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); + mas_set_height(mas); + } else { + unsigned char offset = 0; + void __rcu **slots = NULL; + + offset = mte_parent_slot(mas->node); + slots = ma_slots(mte_parent(mas->node), + mas_parent_type(mas, mas->node)); + rcu_assign_pointer(slots[offset], mas->node); + } + + mte_set_node_dead(old_enode); + mas_free(mas, old_enode); +} + /* * mas_new_child() - Find the new child of a node. * @mas: the maple state @@ -3176,7 +3206,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end { enum maple_type mt = mte_node_type(mas->node); struct maple_node reuse, *newnode, *parent, *new_left, *left, *node; - struct maple_enode *eparent; + struct maple_enode *eparent, *old_eparent; unsigned char offset, tmp, split = mt_slots[mt] / 2; void __rcu **l_slots, **slots; unsigned long *l_pivs, *pivs, gap; @@ -3218,7 +3248,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end l_mas.max = l_pivs[split]; mas->min = l_mas.max + 1; - eparent = mt_mk_node(mte_parent(l_mas.node), + old_eparent = mt_mk_node(mte_parent(l_mas.node), mas_parent_type(&l_mas, l_mas.node)); tmp += end; if (!in_rcu) { @@ -3234,7 +3264,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end memcpy(node, newnode, sizeof(struct maple_node)); ma_set_meta(node, mt, 0, tmp - 1); - mte_set_pivot(eparent, mte_parent_slot(l_mas.node), + mte_set_pivot(old_eparent, mte_parent_slot(l_mas.node), l_pivs[split]); /* Remove data from l_pivs. */ @@ -3242,6 +3272,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end memset(l_pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp)); memset(l_slots + tmp, 0, sizeof(void *) * (max_s - tmp)); ma_set_meta(left, mt, 0, split); + eparent = old_eparent; goto done; } @@ -3266,7 +3297,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end parent = mas_pop_node(mas); slots = ma_slots(parent, mt); pivs = ma_pivots(parent, mt); - memcpy(parent, mte_to_node(eparent), sizeof(struct maple_node)); + memcpy(parent, mte_to_node(old_eparent), sizeof(struct maple_node)); rcu_assign_pointer(slots[offset], mas->node); rcu_assign_pointer(slots[offset - 1], l_mas.node); pivs[offset - 1] = l_mas.max; @@ -3278,8 +3309,10 @@ done: mte_set_gap(eparent, mte_parent_slot(l_mas.node), gap); mas_ascend(mas); - if (in_rcu) - mas_replace(mas, false); + if (in_rcu) { + mas_replace_node(mas, old_eparent); + mas_adopt_children(mas, mas->node); + } mas_update_gap(mas); } @@ -3596,11 +3629,13 @@ static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char end) { struct maple_node *node; + struct maple_enode *old_enode; unsigned char b_end = b_node->b_end; enum maple_type b_type = b_node->type; + old_enode = wr_mas->mas->node; if ((b_end < mt_min_slots[b_type]) && - (!mte_is_root(wr_mas->mas->node)) && + (!mte_is_root(old_enode)) && (mas_mt_height(wr_mas->mas) > 1)) return mas_rebalance(wr_mas->mas, b_node); @@ -3618,7 +3653,7 @@ static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, node->parent = mas_mn(wr_mas->mas)->parent; wr_mas->mas->node = mt_mk_node(node, b_type); mab_mas_cp(b_node, 0, b_end, wr_mas->mas, false); - mas_replace(wr_mas->mas, false); + mas_replace_node(wr_mas->mas, old_enode); reuse_node: mas_update_gap(wr_mas->mas); return 1; @@ -4117,9 +4152,10 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, done: mas_leaf_set_meta(mas, newnode, dst_pivots, maple_leaf_64, new_end); if (in_rcu) { - mte_set_node_dead(mas->node); + struct maple_enode *old_enode = mas->node; + mas->node = mt_mk_node(newnode, wr_mas->type); - mas_replace(mas, false); + mas_replace_node(mas, old_enode); } else { memcpy(wr_mas->node, newnode, sizeof(struct maple_node)); } -- cgit v1.2.3 From 1238f6a226dc27ec34d229b71b02f0d6c46bbf11 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 4 Aug 2023 12:59:48 -0400 Subject: maple_tree: introduce mas_put_in_tree() mas_replace() has a single user that takes a flag which is now always true. Replace this function with mas_put_in_tree() to better align with mas_replace_node(). Inline the remaining logic into the only caller; mas_wmb_replace(). Link: https://lkml.kernel.org/r/20230804165951.2661157-4-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Paul E. McKenney Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 73 +++++++++++++++++++++----------------------------------- 1 file changed, 27 insertions(+), 46 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 0d4573a8d134..c01b1be1480c 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1715,45 +1715,32 @@ static inline void mas_adopt_children(struct ma_state *mas, } /* - * mas_replace() - Replace a maple node in the tree with mas->node. Uses the - * parent encoding to locate the maple node in the tree. - * @mas - the ma_state to use for operations. - * @advanced - boolean to adopt the child nodes and free the old node (false) or - * leave the node (true) and handle the adoption and free elsewhere. + * mas_put_in_tree() - Put a new node in the tree, smp_wmb(), and mark the old + * node as dead. + * @mas - the maple state with the new node + * @old_enode - The old maple encoded node to replace. */ -static inline void mas_replace(struct ma_state *mas, bool advanced) +static inline void mas_put_in_tree(struct ma_state *mas, + struct maple_enode *old_enode) __must_hold(mas->tree->ma_lock) { - struct maple_node *mn = mas_mn(mas); - struct maple_enode *old_enode; - unsigned char offset = 0; - void __rcu **slots = NULL; - - if (ma_is_root(mn)) { - old_enode = mas_root_locked(mas); - } else { - offset = mte_parent_slot(mas->node); - slots = ma_slots(mte_parent(mas->node), - mas_parent_type(mas, mas->node)); - old_enode = mas_slot_locked(mas, slots, offset); - } - - if (!advanced && !mte_is_leaf(mas->node)) - mas_adopt_children(mas, mas->node); + unsigned char offset; + void __rcu **slots; if (mte_is_root(mas->node)) { - mn->parent = ma_parent_ptr( + mas_mn(mas)->parent = ma_parent_ptr( ((unsigned long)mas->tree | MA_ROOT_PARENT)); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); mas_set_height(mas); } else { + + offset = mte_parent_slot(mas->node); + slots = ma_slots(mte_parent(mas->node), + mas_parent_type(mas, mas->node)); rcu_assign_pointer(slots[offset], mas->node); } - if (!advanced) { - mte_set_node_dead(old_enode); - mas_free(mas, old_enode); - } + mte_set_node_dead(old_enode); } /* @@ -1767,22 +1754,7 @@ static inline void mas_replace_node(struct ma_state *mas, struct maple_enode *old_enode) __must_hold(mas->tree->ma_lock) { - if (mte_is_root(mas->node)) { - mas_mn(mas)->parent = ma_parent_ptr( - ((unsigned long)mas->tree | MA_ROOT_PARENT)); - rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); - mas_set_height(mas); - } else { - unsigned char offset = 0; - void __rcu **slots = NULL; - - offset = mte_parent_slot(mas->node); - slots = ma_slots(mte_parent(mas->node), - mas_parent_type(mas, mas->node)); - rcu_assign_pointer(slots[offset], mas->node); - } - - mte_set_node_dead(old_enode); + mas_put_in_tree(mas, old_enode); mas_free(mas, old_enode); } @@ -2789,11 +2761,20 @@ static inline void mas_wmb_replace(struct ma_state *mas, struct ma_topiary *free, struct ma_topiary *destroy) { - /* All nodes must see old data as dead prior to replacing that data */ - smp_wmb(); /* Needed for RCU */ + struct maple_enode *old_enode; + + if (mte_is_root(mas->node)) { + old_enode = mas_root_locked(mas); + } else { + unsigned char offset = mte_parent_slot(mas->node); + void __rcu **slots = ma_slots(mte_parent(mas->node), + mas_parent_type(mas, mas->node)); + + old_enode = mas_slot_locked(mas, slots, offset); + } /* Insert the new data in the tree */ - mas_replace(mas, true); + mas_put_in_tree(mas, old_enode); if (!mte_is_leaf(mas->node)) mas_descend_adopt(mas); -- cgit v1.2.3 From 4ffc2ee2cf01f3d03977fbeb1b43da2dc22a95f4 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 4 Aug 2023 12:59:49 -0400 Subject: maple_tree: introduce mas_tree_parent() definition Add a definition to shorten long code lines and clarify what the code is doing. Use the new definition to get the maple tree parent pointer from the maple state where possible. Link: https://lkml.kernel.org/r/20230804165951.2661157-5-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Paul E. McKenney Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index c01b1be1480c..cf41e0dbb87b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -75,6 +75,7 @@ #define MA_STATE_PREALLOC 4 #define ma_parent_ptr(x) ((struct maple_pnode *)(x)) +#define mas_tree_parent(x) ((unsigned long)(x->tree) | MA_ROOT_PARENT) #define ma_mnode_ptr(x) ((struct maple_node *)(x)) #define ma_enode_ptr(x) ((struct maple_enode *)(x)) static struct kmem_cache *maple_node_cache; @@ -1728,8 +1729,7 @@ static inline void mas_put_in_tree(struct ma_state *mas, void __rcu **slots; if (mte_is_root(mas->node)) { - mas_mn(mas)->parent = ma_parent_ptr( - ((unsigned long)mas->tree | MA_ROOT_PARENT)); + mas_mn(mas)->parent = ma_parent_ptr(mas_tree_parent(mas)); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); mas_set_height(mas); } else { @@ -2798,8 +2798,7 @@ static inline void mas_wmb_replace(struct ma_state *mas, static inline void mast_new_root(struct maple_subtree_state *mast, struct ma_state *mas) { - mas_mn(mast->l)->parent = - ma_parent_ptr(((unsigned long)mas->tree | MA_ROOT_PARENT)); + mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas)); if (!mte_dead_node(mast->orig_l->node) && !mte_is_root(mast->orig_l->node)) { do { @@ -3661,8 +3660,7 @@ static inline int mas_root_expand(struct ma_state *mas, void *entry) node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); - node->parent = ma_parent_ptr( - ((unsigned long)mas->tree | MA_ROOT_PARENT)); + node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); if (mas->index) { @@ -3938,8 +3936,7 @@ static inline int mas_new_root(struct ma_state *mas, void *entry) node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); - node->parent = ma_parent_ptr( - ((unsigned long)mas->tree | MA_ROOT_PARENT)); + node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); rcu_assign_pointer(slots[0], entry); pivots[0] = mas->last; -- cgit v1.2.3 From 068bafcac0b89ee5b1616793231eb4b3dd41e3f0 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 4 Aug 2023 12:59:50 -0400 Subject: maple_tree: change mas_adopt_children() parent usage All calls to mas_adopt_children() currently pass the parent as the node in the maple state. Allow for the parent pointer that is passed in to be used instead. Link: https://lkml.kernel.org/r/20230804165951.2661157-6-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Paul E. McKenney Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index cf41e0dbb87b..8e94f5495a97 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1702,7 +1702,7 @@ static inline void mas_adopt_children(struct ma_state *mas, struct maple_enode *parent) { enum maple_type type = mte_node_type(parent); - struct maple_node *node = mas_mn(mas); + struct maple_node *node = mte_to_node(parent); void __rcu **slots = ma_slots(node, type); unsigned long *pivots = ma_pivots(node, type); struct maple_enode *child; -- cgit v1.2.3 From 530f745c7620af288b71b3d667cb90f10df3defe Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 4 Aug 2023 12:59:51 -0400 Subject: maple_tree: replace data before marking dead in split and spanning store Reorder the operations for split and spanning stores so that new data is placed in the tree prior to marking the old data as dead. This will limit re-walks on dead data to just once instead of a retry loop. The order of operations is as follows: Create the new data, put the new data in place, mark the top node of the old data as dead. Then repair parent links in the reused nodes through all levels of the tree, following the new nodes downwards. Finally walk the top dead node looking for nodes that are no longer used, or subtrees that should be destroyed (marked dead throughout then freed), follow the partially used nodes downwards to discover other dead nodes and subtrees. Link: https://lkml.kernel.org/r/20230804165951.2661157-7-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Paul E. McKenney Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 493 +++++++++++++++++++------------------------------------ 1 file changed, 168 insertions(+), 325 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 8e94f5495a97..ffb9d15bd815 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -982,27 +982,9 @@ static inline void mat_add(struct ma_topiary *mat, mat->tail = dead_enode; } -static void mte_destroy_walk(struct maple_enode *, struct maple_tree *); -static inline void mas_free(struct ma_state *mas, struct maple_enode *used); - -/* - * mas_mat_free() - Free all nodes in a dead list. - * @mas - the maple state - * @mat - the ma_topiary linked list of dead nodes to free. - * - * Free walk a dead list. - */ -static void mas_mat_free(struct ma_state *mas, struct ma_topiary *mat) -{ - struct maple_enode *next; - - while (mat->head) { - next = mte_to_mat(mat->head)->next; - mas_free(mas, mat->head); - mat->head = next; - } -} - +static void mt_free_walk(struct rcu_head *head); +static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, + bool free); /* * mas_mat_destroy() - Free all nodes and subtrees in a dead list. * @mas - the maple state @@ -1013,10 +995,15 @@ static void mas_mat_free(struct ma_state *mas, struct ma_topiary *mat) static void mas_mat_destroy(struct ma_state *mas, struct ma_topiary *mat) { struct maple_enode *next; + struct maple_node *node; + bool in_rcu = mt_in_rcu(mas->tree); while (mat->head) { next = mte_to_mat(mat->head)->next; - mte_destroy_walk(mat->head, mat->mtree); + node = mte_to_node(mat->head); + mt_destroy_walk(mat->head, mas->tree, !in_rcu); + if (in_rcu) + call_rcu(&node->rcu, mt_free_walk); mat->head = next; } } @@ -1759,11 +1746,11 @@ static inline void mas_replace_node(struct ma_state *mas, } /* - * mas_new_child() - Find the new child of a node. - * @mas: the maple state + * mas_find_child() - Find a child who has the parent @mas->node. + * @mas: the maple state with the parent. * @child: the maple state to store the child. */ -static inline bool mas_new_child(struct ma_state *mas, struct ma_state *child) +static inline bool mas_find_child(struct ma_state *mas, struct ma_state *child) __must_hold(mas->tree->ma_lock) { enum maple_type mt; @@ -2065,56 +2052,6 @@ static inline void mab_mas_cp(struct maple_big_node *b_node, } } -/* - * mas_descend_adopt() - Descend through a sub-tree and adopt children. - * @mas: the maple state with the maple encoded node of the sub-tree. - * - * Descend through a sub-tree and adopt children who do not have the correct - * parents set. Follow the parents which have the correct parents as they are - * the new entries which need to be followed to find other incorrectly set - * parents. - */ -static inline void mas_descend_adopt(struct ma_state *mas) -{ - struct ma_state list[3], next[3]; - int i, n; - - /* - * At each level there may be up to 3 correct parent pointers which indicates - * the new nodes which need to be walked to find any new nodes at a lower level. - */ - - for (i = 0; i < 3; i++) { - list[i] = *mas; - list[i].offset = 0; - next[i].offset = 0; - } - next[0] = *mas; - - while (!mte_is_leaf(list[0].node)) { - n = 0; - for (i = 0; i < 3; i++) { - if (mas_is_none(&list[i])) - continue; - - if (i && list[i-1].node == list[i].node) - continue; - - while ((n < 3) && (mas_new_child(&list[i], &next[n]))) - n++; - - mas_adopt_children(&list[i], list[i].node); - } - - while (n < 3) - next[n++].node = MAS_NONE; - - /* descend by setting the list to the children */ - for (i = 0; i < 3; i++) - list[i] = next[i]; - } -} - /* * mas_bulk_rebalance() - Rebalance the end of a tree after a bulk insert. * @mas: The maple state @@ -2304,98 +2241,6 @@ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) wr_mas->offset_end = mas->offset = offset; } -/* - * mas_topiary_range() - Add a range of slots to the topiary. - * @mas: The maple state - * @destroy: The topiary to add the slots (usually destroy) - * @start: The starting slot inclusively - * @end: The end slot inclusively - */ -static inline void mas_topiary_range(struct ma_state *mas, - struct ma_topiary *destroy, unsigned char start, unsigned char end) -{ - void __rcu **slots; - unsigned char offset; - - MAS_BUG_ON(mas, mte_is_leaf(mas->node)); - - slots = ma_slots(mas_mn(mas), mte_node_type(mas->node)); - for (offset = start; offset <= end; offset++) { - struct maple_enode *enode = mas_slot_locked(mas, slots, offset); - - if (mte_dead_node(enode)) - continue; - - mat_add(destroy, enode); - } -} - -/* - * mast_topiary() - Add the portions of the tree to the removal list; either to - * be freed or discarded (destroy walk). - * @mast: The maple_subtree_state. - */ -static inline void mast_topiary(struct maple_subtree_state *mast) -{ - MA_WR_STATE(wr_mas, mast->orig_l, NULL); - unsigned char r_start, r_end; - unsigned char l_start, l_end; - void __rcu **l_slots, **r_slots; - - wr_mas.type = mte_node_type(mast->orig_l->node); - mast->orig_l->index = mast->orig_l->last; - mas_wr_node_walk(&wr_mas); - l_start = mast->orig_l->offset + 1; - l_end = mas_data_end(mast->orig_l); - r_start = 0; - r_end = mast->orig_r->offset; - - if (r_end) - r_end--; - - l_slots = ma_slots(mas_mn(mast->orig_l), - mte_node_type(mast->orig_l->node)); - - r_slots = ma_slots(mas_mn(mast->orig_r), - mte_node_type(mast->orig_r->node)); - - if ((l_start < l_end) && - mte_dead_node(mas_slot_locked(mast->orig_l, l_slots, l_start))) { - l_start++; - } - - if (mte_dead_node(mas_slot_locked(mast->orig_r, r_slots, r_end))) { - if (r_end) - r_end--; - } - - if ((l_start > r_end) && (mast->orig_l->node == mast->orig_r->node)) - return; - - /* At the node where left and right sides meet, add the parts between */ - if (mast->orig_l->node == mast->orig_r->node) { - return mas_topiary_range(mast->orig_l, mast->destroy, - l_start, r_end); - } - - /* mast->orig_r is different and consumed. */ - if (mte_is_leaf(mast->orig_r->node)) - return; - - if (mte_dead_node(mas_slot_locked(mast->orig_l, l_slots, l_end))) - l_end--; - - - if (l_start <= l_end) - mas_topiary_range(mast->orig_l, mast->destroy, l_start, l_end); - - if (mte_dead_node(mas_slot_locked(mast->orig_r, r_slots, r_start))) - r_start++; - - if (r_start <= r_end) - mas_topiary_range(mast->orig_r, mast->destroy, 0, r_end); -} - /* * mast_rebalance_next() - Rebalance against the next node * @mast: The maple subtree state @@ -2431,7 +2276,7 @@ static inline void mast_rebalance_prev(struct maple_subtree_state *mast) /* * mast_spanning_rebalance() - Rebalance nodes with nearest neighbour favouring * the node to the right. Checking the nodes to the right then the left at each - * level upwards until root is reached. Free and destroy as needed. + * level upwards until root is reached. * Data is copied into the @mast->bn. * @mast: The maple_subtree_state. */ @@ -2440,8 +2285,6 @@ bool mast_spanning_rebalance(struct maple_subtree_state *mast) { struct ma_state r_tmp = *mast->orig_r; struct ma_state l_tmp = *mast->orig_l; - struct maple_enode *ancestor = NULL; - unsigned char start, end; unsigned char depth = 0; r_tmp = *mast->orig_r; @@ -2450,87 +2293,25 @@ bool mast_spanning_rebalance(struct maple_subtree_state *mast) mas_ascend(mast->orig_r); mas_ascend(mast->orig_l); depth++; - if (!ancestor && - (mast->orig_r->node == mast->orig_l->node)) { - ancestor = mast->orig_r->node; - end = mast->orig_r->offset - 1; - start = mast->orig_l->offset + 1; - } - if (mast->orig_r->offset < mas_data_end(mast->orig_r)) { - if (!ancestor) { - ancestor = mast->orig_r->node; - start = 0; - } - mast->orig_r->offset++; do { mas_descend(mast->orig_r); mast->orig_r->offset = 0; - depth--; - } while (depth); + } while (--depth); mast_rebalance_next(mast); - do { - unsigned char l_off = 0; - struct maple_enode *child = r_tmp.node; - - mas_ascend(&r_tmp); - if (ancestor == r_tmp.node) - l_off = start; - - if (r_tmp.offset) - r_tmp.offset--; - - if (l_off < r_tmp.offset) - mas_topiary_range(&r_tmp, mast->destroy, - l_off, r_tmp.offset); - - if (l_tmp.node != child) - mat_add(mast->free, child); - - } while (r_tmp.node != ancestor); - *mast->orig_l = l_tmp; return true; - } else if (mast->orig_l->offset != 0) { - if (!ancestor) { - ancestor = mast->orig_l->node; - end = mas_data_end(mast->orig_l); - } - mast->orig_l->offset--; do { mas_descend(mast->orig_l); mast->orig_l->offset = mas_data_end(mast->orig_l); - depth--; - } while (depth); + } while (--depth); mast_rebalance_prev(mast); - do { - unsigned char r_off; - struct maple_enode *child = l_tmp.node; - - mas_ascend(&l_tmp); - if (ancestor == l_tmp.node) - r_off = end; - else - r_off = mas_data_end(&l_tmp); - - if (l_tmp.offset < r_off) - l_tmp.offset++; - - if (l_tmp.offset < r_off) - mas_topiary_range(&l_tmp, mast->destroy, - l_tmp.offset, r_off); - - if (r_tmp.node != child) - mat_add(mast->free, child); - - } while (l_tmp.node != ancestor); - *mast->orig_r = r_tmp; return true; } @@ -2542,36 +2323,24 @@ bool mast_spanning_rebalance(struct maple_subtree_state *mast) } /* - * mast_ascend_free() - Add current original maple state nodes to the free list - * and ascend. + * mast_ascend() - Ascend the original left and right maple states. * @mast: the maple subtree state. * - * Ascend the original left and right sides and add the previous nodes to the - * free list. Set the slots to point to the correct location in the new nodes. + * Ascend the original left and right sides. Set the offsets to point to the + * data already in the new tree (@mast->l and @mast->r). */ -static inline void -mast_ascend_free(struct maple_subtree_state *mast) +static inline void mast_ascend(struct maple_subtree_state *mast) { MA_WR_STATE(wr_mas, mast->orig_r, NULL); - struct maple_enode *left = mast->orig_l->node; - struct maple_enode *right = mast->orig_r->node; - mas_ascend(mast->orig_l); mas_ascend(mast->orig_r); - mat_add(mast->free, left); - - if (left != right) - mat_add(mast->free, right); mast->orig_r->offset = 0; mast->orig_r->index = mast->r->max; /* last should be larger than or equal to index */ if (mast->orig_r->last < mast->orig_r->index) mast->orig_r->last = mast->orig_r->index; - /* - * The node may not contain the value so set slot to ensure all - * of the nodes contents are freed or destroyed. - */ + wr_mas.type = mte_node_type(mast->orig_r->node); mas_wr_node_walk(&wr_mas); /* Set up the left side of things */ @@ -2750,66 +2519,152 @@ static inline void mast_set_split_parents(struct maple_subtree_state *mast, } /* - * mas_wmb_replace() - Write memory barrier and replace - * @mas: The maple state - * @free: the maple topiary list of nodes to free - * @destroy: The maple topiary list of nodes to destroy (walk and free) + * mas_topiary_node() - Dispose of a singe node + * @mas: The maple state for pushing nodes + * @enode: The encoded maple node + * @in_rcu: If the tree is in rcu mode * - * Updates gap as necessary. + * The node will either be RCU freed or pushed back on the maple state. */ -static inline void mas_wmb_replace(struct ma_state *mas, - struct ma_topiary *free, - struct ma_topiary *destroy) +static inline void mas_topiary_node(struct ma_state *mas, + struct maple_enode *enode, bool in_rcu) { - struct maple_enode *old_enode; + struct maple_node *tmp; - if (mte_is_root(mas->node)) { - old_enode = mas_root_locked(mas); - } else { - unsigned char offset = mte_parent_slot(mas->node); - void __rcu **slots = ma_slots(mte_parent(mas->node), - mas_parent_type(mas, mas->node)); + if (enode == MAS_NONE) + return; - old_enode = mas_slot_locked(mas, slots, offset); - } + tmp = mte_to_node(enode); + mte_set_node_dead(enode); + if (in_rcu) + ma_free_rcu(tmp); + else + mas_push_node(mas, tmp); +} - /* Insert the new data in the tree */ +/* + * mas_topiary_replace() - Replace the data with new data, then repair the + * parent links within the new tree. Iterate over the dead sub-tree and collect + * the dead subtrees and topiary the nodes that are no longer of use. + * + * The new tree will have up to three children with the correct parent. Keep + * track of the new entries as they need to be followed to find the next level + * of new entries. + * + * The old tree will have up to three children with the old parent. Keep track + * of the old entries as they may have more nodes below replaced. Nodes within + * [index, last] are dead subtrees, others need to be freed and followed. + * + * @mas: The maple state pointing at the new data + * @old_enode: The maple encoded node being replaced + * + */ +static inline void mas_topiary_replace(struct ma_state *mas, + struct maple_enode *old_enode) +{ + struct ma_state tmp[3], tmp_next[3]; + MA_TOPIARY(subtrees, mas->tree); + bool in_rcu; + int i, n; + + /* Place data in tree & then mark node as old */ mas_put_in_tree(mas, old_enode); - if (!mte_is_leaf(mas->node)) - mas_descend_adopt(mas); + /* Update the parent pointers in the tree */ + tmp[0] = *mas; + tmp[0].offset = 0; + tmp[1].node = MAS_NONE; + tmp[2].node = MAS_NONE; + while (!mte_is_leaf(tmp[0].node)) { + n = 0; + for (i = 0; i < 3; i++) { + if (mas_is_none(&tmp[i])) + continue; + + while (n < 3) { + if (!mas_find_child(&tmp[i], &tmp_next[n])) + break; + n++; + } + + mas_adopt_children(&tmp[i], tmp[i].node); + } - mas_mat_free(mas, free); + if (MAS_WARN_ON(mas, n == 0)) + break; - if (destroy) - mas_mat_destroy(mas, destroy); + while (n < 3) + tmp_next[n++].node = MAS_NONE; - if (mte_is_leaf(mas->node)) - return; + for (i = 0; i < 3; i++) + tmp[i] = tmp_next[i]; + } - mas_update_gap(mas); + /* Collect the old nodes that need to be discarded */ + if (mte_is_leaf(old_enode)) + return mas_free(mas, old_enode); + + tmp[0] = *mas; + tmp[0].offset = 0; + tmp[0].node = old_enode; + tmp[1].node = MAS_NONE; + tmp[2].node = MAS_NONE; + in_rcu = mt_in_rcu(mas->tree); + do { + n = 0; + for (i = 0; i < 3; i++) { + if (mas_is_none(&tmp[i])) + continue; + + while (n < 3) { + if (!mas_find_child(&tmp[i], &tmp_next[n])) + break; + + if ((tmp_next[n].min >= tmp_next->index) && + (tmp_next[n].max <= tmp_next->last)) { + mat_add(&subtrees, tmp_next[n].node); + tmp_next[n].node = MAS_NONE; + } else { + n++; + } + } + } + + if (MAS_WARN_ON(mas, n == 0)) + break; + + while (n < 3) + tmp_next[n++].node = MAS_NONE; + + for (i = 0; i < 3; i++) { + mas_topiary_node(mas, tmp[i].node, in_rcu); + tmp[i] = tmp_next[i]; + } + } while (!mte_is_leaf(tmp[0].node)); + + for (i = 0; i < 3; i++) + mas_topiary_node(mas, tmp[i].node, in_rcu); + + mas_mat_destroy(mas, &subtrees); } /* - * mast_new_root() - Set a new tree root during subtree creation - * @mast: The maple subtree state + * mas_wmb_replace() - Write memory barrier and replace * @mas: The maple state + * @old: The old maple encoded node that is being replaced. + * + * Updates gap as necessary. */ -static inline void mast_new_root(struct maple_subtree_state *mast, - struct ma_state *mas) +static inline void mas_wmb_replace(struct ma_state *mas, + struct maple_enode *old_enode) { - mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas)); - if (!mte_dead_node(mast->orig_l->node) && - !mte_is_root(mast->orig_l->node)) { - do { - mast_ascend_free(mast); - mast_topiary(mast); - } while (!mte_is_root(mast->orig_l->node)); - } - if ((mast->orig_l->node != mas->node) && - (mast->l->depth > mas_mt_height(mas))) { - mat_add(mast->free, mas->node); - } + /* Insert the new data in the tree */ + mas_topiary_replace(mas, old_enode); + + if (mte_is_leaf(mas->node)) + return; + + mas_update_gap(mas); } /* @@ -2995,12 +2850,11 @@ static int mas_spanning_rebalance(struct ma_state *mas, unsigned char split, mid_split; unsigned char slot = 0; struct maple_enode *left = NULL, *middle = NULL, *right = NULL; + struct maple_enode *old_enode; MA_STATE(l_mas, mas->tree, mas->index, mas->index); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(m_mas, mas->tree, mas->index, mas->index); - MA_TOPIARY(free, mas->tree); - MA_TOPIARY(destroy, mas->tree); /* * The tree needs to be rebalanced and leaves need to be kept at the same level. @@ -3009,8 +2863,6 @@ static int mas_spanning_rebalance(struct ma_state *mas, mast->l = &l_mas; mast->m = &m_mas; mast->r = &r_mas; - mast->free = &free; - mast->destroy = &destroy; l_mas.node = r_mas.node = m_mas.node = MAS_NONE; /* Check if this is not root and has sufficient data. */ @@ -3018,7 +2870,7 @@ static int mas_spanning_rebalance(struct ma_state *mas, unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) mast_spanning_rebalance(mast); - mast->orig_l->depth = 0; + l_mas.depth = 0; /* * Each level of the tree is examined and balanced, pushing data to the left or @@ -3029,7 +2881,7 @@ static int mas_spanning_rebalance(struct ma_state *mas, * original tree and the partially new tree. To remedy the parent pointers in * the old tree, the new data is swapped into the active tree and a walk down * the tree is performed and the parent pointers are updated. - * See mas_descend_adopt() for more information.. + * See mas_topiary_replace() for more information. */ while (count--) { mast->bn->b_end--; @@ -3046,13 +2898,13 @@ static int mas_spanning_rebalance(struct ma_state *mas, */ memset(mast->bn, 0, sizeof(struct maple_big_node)); mast->bn->type = mte_node_type(left); - mast->orig_l->depth++; + l_mas.depth++; /* Root already stored in l->node. */ if (mas_is_root_limits(mast->l)) goto new_root; - mast_ascend_free(mast); + mast_ascend(mast); mast_combine_cp_left(mast); l_mas.offset = mast->bn->b_end; mab_set_b_end(mast->bn, &l_mas, left); @@ -3061,7 +2913,6 @@ static int mas_spanning_rebalance(struct ma_state *mas, /* Copy anything necessary out of the right node. */ mast_combine_cp_right(mast); - mast_topiary(mast); mast->orig_l->last = mast->orig_l->max; if (mast_sufficient(mast)) @@ -3083,7 +2934,7 @@ static int mas_spanning_rebalance(struct ma_state *mas, l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), mte_node_type(mast->orig_l->node)); - mast->orig_l->depth++; + l_mas.depth++; mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true); mas_set_parent(mas, left, l_mas.node, slot); if (middle) @@ -3094,23 +2945,20 @@ static int mas_spanning_rebalance(struct ma_state *mas, if (mas_is_root_limits(mast->l)) { new_root: - mast_new_root(mast, mas); + mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas)); + while (!mte_is_root(mast->orig_l->node)) + mast_ascend(mast); } else { mas_mn(&l_mas)->parent = mas_mn(mast->orig_l)->parent; } - if (!mte_dead_node(mast->orig_l->node)) - mat_add(&free, mast->orig_l->node); - - mas->depth = mast->orig_l->depth; - *mast->orig_l = l_mas; - mte_set_node_dead(mas->node); - - /* Set up mas for insertion. */ - mast->orig_l->depth = mas->depth; - mast->orig_l->alloc = mas->alloc; - *mas = *mast->orig_l; - mas_wmb_replace(mas, &free, &destroy); + old_enode = mast->orig_l->node; + mas->depth = l_mas.depth; + mas->node = l_mas.node; + mas->min = l_mas.min; + mas->max = l_mas.max; + mas->offset = l_mas.offset; + mas_wmb_replace(mas, old_enode); mtree_range_walk(mas); return mast->bn->b_end; } @@ -3341,7 +3189,6 @@ static inline void mast_fill_bnode(struct maple_subtree_state *mast, unsigned char skip) { bool cp = true; - struct maple_enode *old = mas->node; unsigned char split; memset(mast->bn->gap, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->gap)); @@ -3353,7 +3200,6 @@ static inline void mast_fill_bnode(struct maple_subtree_state *mast, cp = false; } else { mas_ascend(mas); - mat_add(mast->free, old); mas->offset = mte_parent_slot(mas->node); } @@ -3457,13 +3303,11 @@ static inline bool mas_push_data(struct ma_state *mas, int height, split = mt_slots[mast->bn->type] - 2; if (left) { /* Switch mas to prev node */ - mat_add(mast->free, mas->node); *mas = tmp_mas; /* Start using mast->l for the left side. */ tmp_mas.node = mast->l->node; *mast->l = tmp_mas; } else { - mat_add(mast->free, tmp_mas.node); tmp_mas.node = mast->r->node; *mast->r = tmp_mas; split = slot_total - split; @@ -3490,6 +3334,7 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) struct maple_subtree_state mast; int height = 0; unsigned char mid_split, split = 0; + struct maple_enode *old; /* * Splitting is handled differently from any other B-tree; the Maple @@ -3512,7 +3357,6 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last); - MA_TOPIARY(mat, mas->tree); trace_ma_op(__func__, mas); mas->depth = mas_mt_height(mas); @@ -3525,7 +3369,6 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) mast.r = &r_mas; mast.orig_l = &prev_l_mas; mast.orig_r = &prev_r_mas; - mast.free = &mat; mast.bn = b_node; while (height++ <= mas->depth) { @@ -3565,9 +3408,9 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) } /* Set the original node as dead */ - mat_add(mast.free, mas->node); + old = mas->node; mas->node = l_mas.node; - mas_wmb_replace(mas, mast.free, NULL); + mas_wmb_replace(mas, old); mtree_range_walk(mas); return 1; } @@ -3903,6 +3746,7 @@ dead_node: return NULL; } +static void mte_destroy_walk(struct maple_enode *, struct maple_tree *); /* * mas_new_root() - Create a new root node that only contains the entry passed * in. @@ -3969,7 +3813,6 @@ static inline int mas_wr_spanning_store(struct ma_wr_state *wr_mas) /* Left and Right side of spanning store */ MA_STATE(l_mas, NULL, 0, 0); MA_STATE(r_mas, NULL, 0, 0); - MA_WR_STATE(r_wr_mas, &r_mas, wr_mas->entry); MA_WR_STATE(l_wr_mas, &l_mas, wr_mas->entry); -- cgit v1.2.3 From f9bff0e31881d03badf191d3b0005839391f5f2b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:29 +0100 Subject: minmax: add in_range() macro Patch series "New page table range API", v6. This patchset changes the API used by the MM to set up page table entries. The four APIs are: set_ptes(mm, addr, ptep, pte, nr) update_mmu_cache_range(vma, addr, ptep, nr) flush_dcache_folio(folio) flush_icache_pages(vma, page, nr) flush_dcache_folio() isn't technically new, but no architecture implemented it, so I've done that for them. The old APIs remain around but are mostly implemented by calling the new interfaces. The new APIs are based around setting up N page table entries at once. The N entries belong to the same PMD, the same folio and the same VMA, so ptep++ is a legitimate operation, and locking is taken care of for you. Some architectures can do a better job of it than just a loop, but I have hesitated to make too deep a change to architectures I don't understand well. One thing I have changed in every architecture is that PG_arch_1 is now a per-folio bit instead of a per-page bit when used for dcache clean/dirty tracking. This was something that would have to happen eventually, and it makes sense to do it now rather than iterate over every page involved in a cache flush and figure out if it needs to happen. The point of all this is better performance, and Fengwei Yin has measured improvement on x86. I suspect you'll see improvement on your architecture too. Try the new will-it-scale test mentioned here: https://lore.kernel.org/linux-mm/20230206140639.538867-5-fengwei.yin@intel.com/ You'll need to run it on an XFS filesystem and have CONFIG_TRANSPARENT_HUGEPAGE set. This patchset is the basis for much of the anonymous large folio work being done by Ryan, so it's received quite a lot of testing over the last few months. This patch (of 38): Determine if a value lies within a range more efficiently (subtraction + comparison vs two comparisons and an AND). It also has useful (under some circumstances) behaviour if the range exceeds the maximum value of the type. Convert all the conflicting definitions of in_range() within the kernel; some can use the generic definition while others need their own definition. Link: https://lkml.kernel.org/r/20230802151406.3735276-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230802151406.3735276-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- lib/logic_pio.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'lib') diff --git a/lib/logic_pio.c b/lib/logic_pio.c index 07b4b9a1f54b..2ea564a40064 100644 --- a/lib/logic_pio.c +++ b/lib/logic_pio.c @@ -20,9 +20,6 @@ static LIST_HEAD(io_range_list); static DEFINE_MUTEX(io_range_mutex); -/* Consider a kernel general helper for this */ -#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) - /** * logic_pio_register_range - register logical PIO range for a host * @new_range: pointer to the IO range to be registered. -- cgit v1.2.3 From 432af5c966667f12c7af38fb3b2cd52eef0c47b4 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 18 Aug 2023 20:43:56 -0400 Subject: maple_tree: clean up mas_wr_append() Avoid setting the variables until necessary, and actually use the variables where applicable. Introducing a variable for the slots array avoids spanning multiple lines. Add the missing argument to the documentation. Use the node type when setting the metadata instead of blindly assuming the type. Finally, add a trace point to the function for successful store. Link: https://lkml.kernel.org/r/20230819004356.1454718-3-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 05d5db255c39..ee1ff0c59fd7 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4106,6 +4106,7 @@ static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) /* * mas_wr_append: Attempt to append * @wr_mas: the maple write state + * @new_end: The end of the node after the modification * * This is currently unsafe in rcu mode since the end of the node may be cached * by readers while the node contents may be updated which could result in @@ -4114,42 +4115,46 @@ static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) * Return: True if appended, false otherwise */ static inline bool mas_wr_append(struct ma_wr_state *wr_mas, - unsigned char new_end) + unsigned char new_end) { - unsigned char end = wr_mas->node_end; - struct ma_state *mas = wr_mas->mas; - unsigned char node_pivots = mt_pivots[wr_mas->type]; + struct ma_state *mas; + void __rcu **slots; + unsigned char end; + mas = wr_mas->mas; if (mt_in_rcu(mas->tree)) return false; if (mas->offset != wr_mas->node_end) return false; - if (new_end < node_pivots) { + end = wr_mas->node_end; + if (mas->offset != end) + return false; + + if (new_end < mt_pivots[wr_mas->type]) { wr_mas->pivots[new_end] = wr_mas->pivots[end]; - ma_set_meta(wr_mas->node, maple_leaf_64, 0, new_end); + ma_set_meta(wr_mas->node, wr_mas->type, 0, new_end); } - if (new_end == wr_mas->node_end + 1) { + slots = wr_mas->slots; + if (new_end == end + 1) { if (mas->last == wr_mas->r_max) { /* Append to end of range */ - rcu_assign_pointer(wr_mas->slots[new_end], - wr_mas->entry); + rcu_assign_pointer(slots[new_end], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = new_end; } else { /* Append to start of range */ - rcu_assign_pointer(wr_mas->slots[new_end], - wr_mas->content); + rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end] = mas->last; - rcu_assign_pointer(wr_mas->slots[end], wr_mas->entry); + rcu_assign_pointer(slots[end], wr_mas->entry); } } else { /* Append to the range without touching any boundaries. */ - rcu_assign_pointer(wr_mas->slots[new_end], wr_mas->content); + rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end + 1] = mas->last; - rcu_assign_pointer(wr_mas->slots[end + 1], wr_mas->entry); + rcu_assign_pointer(slots[end + 1], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = end + 1; } @@ -4157,6 +4162,7 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas, if (!wr_mas->content || !wr_mas->entry) mas_update_gap(mas); + trace_ma_write(__func__, mas, new_end, wr_mas->entry); return true; } -- cgit v1.2.3