Greg,
Here are the backports of the patches for the maple tree fixes from 6.3-rc6 for 6.1 and 6.2.
The patches differ with a few extra patches for the maple tree, and changes to the mm code patch to work without the vma iterator change.
Liam R. Howlett (14): maple_tree: remove GFP_ZERO from kmem_cache_alloc() and kmem_cache_alloc_bulk() maple_tree: fix potential rcu issue maple_tree: reduce user error potential maple_tree: fix handle of invalidated state in mas_wr_store_setup() maple_tree: fix mas_prev() and mas_find() state handling maple_tree: fix mas_skip_node() end slot detection maple_tree: be more cautious about dead nodes maple_tree: refine ma_state init from mas_start() maple_tree: detect dead nodes in mas_start() maple_tree: fix freeing of nodes in rcu mode maple_tree: remove extra smp_wmb() from mas_dead_leaves() maple_tree: add smp_rmb() to dead node detection maple_tree: add RCU lock checking to rcu callback functions mm: enable maple tree RCU mode by default.
include/linux/mm_types.h | 3 +- kernel/fork.c | 3 + lib/maple_tree.c | 389 ++++++++++++++++++++----------- mm/mmap.c | 3 +- tools/testing/radix-tree/maple.c | 18 +- 5 files changed, 263 insertions(+), 153 deletions(-)
From: "Liam R. Howlett" Liam.Howlett@Oracle.com
commit 541e06b772c1aaffb3b6a245ccface36d7107af2 upstream.
Preallocations are common in the VMA code to avoid allocating under certain locking conditions. The preallocations must also cover the worst-case scenario. Removing the GFP_ZERO flag from the kmem_cache_alloc() (and bulk variant) calls will reduce the amount of time spent zeroing memory that may not be used. Only zero out the necessary area to keep track of the allocations in the maple state. Zero the entire node prior to using it in the tree.
This required internal changes to node counting on allocation, so the test code is also updated.
This restores some micro-benchmark performance: up to +9% in mmtests mmap1 by my testing +10% to +20% in mmap, mmapaddr, mmapmany tests reported by Red Hat
Link: https://bugzilla.redhat.com/show_bug.cgi?id=2149636 Link: https://lkml.kernel.org/r/20230105160427.2988454-1-Liam.Howlett@oracle.com Cc: stable@vger.kernel.org Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam Howlett Liam.Howlett@oracle.com Reported-by: Jirka Hladky jhladky@redhat.com Suggested-by: Matthew Wilcox (Oracle) willy@infradead.org --- lib/maple_tree.c | 80 +++++++++++++++++--------------- tools/testing/radix-tree/maple.c | 18 +++---- 2 files changed, 52 insertions(+), 46 deletions(-)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 69cb44b035ec..fd824b065ace 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -149,13 +149,12 @@ struct maple_subtree_state { /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { - return kmem_cache_alloc(maple_node_cache, gfp | __GFP_ZERO); + return kmem_cache_alloc(maple_node_cache, gfp); }
static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) { - return kmem_cache_alloc_bulk(maple_node_cache, gfp | __GFP_ZERO, size, - nodes); + return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); }
static inline void mt_free_bulk(size_t size, void __rcu **nodes) @@ -1123,9 +1122,10 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) { struct maple_alloc *ret, *node = mas->alloc; unsigned long total = mas_allocated(mas); + unsigned int req = mas_alloc_req(mas);
/* nothing or a request pending. */ - if (unlikely(!total)) + if (WARN_ON(!total)) return NULL;
if (total == 1) { @@ -1135,27 +1135,25 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) goto single_node; }
- if (!node->node_count) { + if (node->node_count == 1) { /* Single allocation in this node. */ mas->alloc = node->slot[0]; - node->slot[0] = NULL; mas->alloc->total = node->total - 1; ret = node; goto new_head; } - node->total--; - ret = node->slot[node->node_count]; - node->slot[node->node_count--] = NULL; + ret = node->slot[--node->node_count]; + node->slot[node->node_count] = NULL;
single_node: new_head: - ret->total = 0; - ret->node_count = 0; - if (ret->request_count) { - mas_set_alloc_req(mas, ret->request_count + 1); - ret->request_count = 0; + if (req) { + req++; + mas_set_alloc_req(mas, req); } + + memset(ret, 0, sizeof(*ret)); return (struct maple_node *)ret; }
@@ -1174,21 +1172,20 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) unsigned long count; unsigned int requested = mas_alloc_req(mas);
- memset(reuse, 0, sizeof(*reuse)); count = mas_allocated(mas);
- if (count && (head->node_count < MAPLE_ALLOC_SLOTS - 1)) { - if (head->slot[0]) - head->node_count++; - head->slot[head->node_count] = reuse; + reuse->request_count = 0; + reuse->node_count = 0; + if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) { + head->slot[head->node_count++] = reuse; head->total++; goto done; }
reuse->total = 1; if ((head) && !((unsigned long)head & 0x1)) { - head->request_count = 0; reuse->slot[0] = head; + reuse->node_count = 1; reuse->total += head->total; }
@@ -1207,7 +1204,6 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { struct maple_alloc *node; unsigned long allocated = mas_allocated(mas); - unsigned long success = allocated; unsigned int requested = mas_alloc_req(mas); unsigned int count; void **slots = NULL; @@ -1223,24 +1219,29 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) WARN_ON(!allocated); }
- if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS - 1) { + if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { node = (struct maple_alloc *)mt_alloc_one(gfp); if (!node) goto nomem_one;
- if (allocated) + if (allocated) { node->slot[0] = mas->alloc; + node->node_count = 1; + } else { + node->node_count = 0; + }
- success++; mas->alloc = node; + node->total = ++allocated; requested--; }
node = mas->alloc; + node->request_count = 0; while (requested) { max_req = MAPLE_ALLOC_SLOTS; - if (node->slot[0]) { - unsigned int offset = node->node_count + 1; + if (node->node_count) { + unsigned int offset = node->node_count;
slots = (void **)&node->slot[offset]; max_req -= offset; @@ -1254,15 +1255,13 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) goto nomem_bulk;
node->node_count += count; - /* zero indexed. */ - if (slots == (void **)&node->slot) - node->node_count--; - - success += count; + allocated += count; node = node->slot[0]; + node->node_count = 0; + node->request_count = 0; requested -= count; } - mas->alloc->total = success; + mas->alloc->total = allocated; return;
nomem_bulk: @@ -1271,7 +1270,7 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) nomem_one: mas_set_alloc_req(mas, requested); if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) - mas->alloc->total = success; + mas->alloc->total = allocated; mas_set_err(mas, -ENOMEM); return;
@@ -5740,6 +5739,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) void mas_destroy(struct ma_state *mas) { struct maple_alloc *node; + unsigned long total;
/* * When using mas_for_each() to insert an expected number of elements, @@ -5762,14 +5762,20 @@ void mas_destroy(struct ma_state *mas) } mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC);
- while (mas->alloc && !((unsigned long)mas->alloc & 0x1)) { + total = mas_allocated(mas); + while (total) { node = mas->alloc; mas->alloc = node->slot[0]; - if (node->node_count > 0) - mt_free_bulk(node->node_count, - (void __rcu **)&node->slot[1]); + if (node->node_count > 1) { + size_t count = node->node_count - 1; + + mt_free_bulk(count, (void __rcu **)&node->slot[1]); + total -= count; + } kmem_cache_free(maple_node_cache, node); + total--; } + mas->alloc = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 2e91973fbaa6..aceb6011315c 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -172,11 +172,11 @@ static noinline void check_new_node(struct maple_tree *mt)
if (!MAPLE_32BIT) { if (i >= 35) - e = i - 35; + e = i - 34; else if (i >= 5) - e = i - 5; + e = i - 4; else if (i >= 2) - e = i - 2; + e = i - 1; } else { if (i >= 4) e = i - 4; @@ -304,17 +304,17 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS);
mn = mas_pop_node(&mas); /* get the next node. */ MT_BUG_ON(mt, mn == NULL); MT_BUG_ON(mt, not_empty(mn)); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 2); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1);
mas_push_node(&mas, mn); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS);
/* Check the limit of pop/push/pop */ mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 2); /* Request */ @@ -322,14 +322,14 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); MT_BUG_ON(mt, mas_alloc_req(&mas)); - MT_BUG_ON(mt, mas.alloc->node_count); + MT_BUG_ON(mt, mas.alloc->node_count != 1); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); mn = mas_pop_node(&mas); MT_BUG_ON(mt, not_empty(mn)); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas.alloc->node_count); + MT_BUG_ON(mt, mas.alloc->node_count != 1); MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); mn = mas_pop_node(&mas); MT_BUG_ON(mt, not_empty(mn));
From: "Liam R. Howlett" Liam.Howlett@Oracle.com
commit 65be6f058b0eba98dc6c6f197ea9f62c9b6a519f upstream.
Ensure the node isn't dead after reading the node end.
Link: https://lkml.kernel.org/r/20230120162650.984577-3-Liam.Howlett@oracle.com Cc: Stable@vger.kernel.org Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. Howlett Liam.Howlett@oracle.com --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c index fd824b065ace..1ade7748cc9b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4656,13 +4656,13 @@ static inline void *mas_next_nentry(struct ma_state *mas, pivots = ma_pivots(node, type); slots = ma_slots(node, type); mas->index = mas_safe_min(mas, pivots, mas->offset); + count = ma_data_end(node, type, pivots, mas->max); if (ma_dead_node(node)) return NULL;
if (mas->index > max) return NULL;
- count = ma_data_end(node, type, pivots, mas->max); if (mas->offset > count) return NULL;
From: "Liam R. Howlett" Liam.Howlett@Oracle.com
commit 50e81c82ad947045c7ed26ddc9acb17276b653b6 upstream.
When iterating, a user may operate on the tree and cause the maple state to be altered and left in an unintuitive state. Detect this scenario and correct it by setting to the limit and invalidating the state.
Link: https://lkml.kernel.org/r/20230120162650.984577-4-Liam.Howlett@oracle.com Cc: Stable@vger.kernel.org Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. Howlett Liam.Howlett@oracle.com --- lib/maple_tree.c | 10 ++++++++++ 1 file changed, 10 insertions(+)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 1ade7748cc9b..819ba692940e 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4737,6 +4737,11 @@ static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit) unsigned long last; enum maple_type mt;
+ if (mas->index > limit) { + mas->index = mas->last = limit; + mas_pause(mas); + return NULL; + } last = mas->last; retry: offset = mas->offset; @@ -4843,6 +4848,11 @@ static inline void *mas_prev_entry(struct ma_state *mas, unsigned long min) { void *entry;
+ if (mas->index < min) { + mas->index = mas->last = min; + mas_pause(mas); + return NULL; + } retry: while (likely(!mas_is_none(mas))) { entry = mas_prev_nentry(mas, min, mas->index);
From: "Liam R. Howlett" Liam.Howlett@Oracle.com
commit 1202700c3f8cc5f7e4646c3cf05ee6f7c8bc6ccf upstream.
If an invalidated maple state is encountered during write, reset the maple state to MAS_START. This will result in a re-walk of the tree to the correct location for the write.
Link: https://lore.kernel.org/all/20230107020126.1627-1-sj@kernel.org/ Link: https://lkml.kernel.org/r/20230120162650.984577-6-Liam.Howlett@oracle.com Cc: Stable@vger.kernel.org Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. Howlett Liam.Howlett@oracle.com Reported-by: SeongJae Park sj@kernel.org --- lib/maple_tree.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 819ba692940e..50604fecd476 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5614,6 +5614,9 @@ static inline void mte_destroy_walk(struct maple_enode *enode,
static void mas_wr_store_setup(struct ma_wr_state *wr_mas) { + if (unlikely(mas_is_paused(wr_mas->mas))) + mas_reset(wr_mas->mas); + if (!mas_is_start(wr_mas->mas)) { if (mas_is_none(wr_mas->mas)) { mas_reset(wr_mas->mas);
From: "Liam R. Howlett" Liam.Howlett@Oracle.com
commit 17dc622c7b0f94e49bed030726df4db12ecaa6b5 upstream.
When mas_prev() does not find anything, set the state to MAS_NONE.
Handle the MAS_NONE in mas_find() like a MAS_START.
Link: https://lkml.kernel.org/r/20230120162650.984577-7-Liam.Howlett@oracle.com Cc: Stable@vger.kernel.org Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. Howlett Liam.Howlett@oracle.com Reported-by: syzbot+502859d610c661e56545@syzkaller.appspotmail.com --- lib/maple_tree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 50604fecd476..fc3e22cff642 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4850,7 +4850,7 @@ static inline void *mas_prev_entry(struct ma_state *mas, unsigned long min)
if (mas->index < min) { mas->index = mas->last = min; - mas_pause(mas); + mas->node = MAS_NONE; return NULL; } retry: @@ -5926,6 +5926,7 @@ void *mas_prev(struct ma_state *mas, unsigned long min) if (!mas->index) { /* Nothing comes before 0 */ mas->last = 0; + mas->node = MAS_NONE; return NULL; }
@@ -6016,6 +6017,9 @@ void *mas_find(struct ma_state *mas, unsigned long max) mas->index = ++mas->last; }
+ if (unlikely(mas_is_none(mas))) + mas->node = MAS_START; + if (unlikely(mas_is_start(mas))) { /* First run or continue */ void *entry;
From: "Liam R. Howlett" Liam.Howlett@Oracle.com
commit 0fa99fdfe1b38da396d0b2d1496a823bcd0ebea0 upstream.
mas_skip_node() is used to move the maple state to the node with a higher limit. It does this by walking up the tree and increasing the slot count. Since slot count may not be able to be increased, it may need to walk up multiple times to find room to walk right to a higher limit node. The limit of slots that was being used was the node limit and not the last location of data in the node. This would cause the maple state to be shifted outside actual data and enter an error state, thus returning -EBUSY.
The result of the incorrect error state means that mas_awalk() would return an error instead of finding the allocation space.
The fix is to use mas_data_end() in mas_skip_node() to detect the nodes data end point and continue walking the tree up until it is safe to move to a node with a higher limit.
The walk up the tree also sets the maple state limits so remove the buggy code from mas_skip_node(). Setting the limits had the unfortunate side effect of triggering another bug if the parent node was full and the there was no suitable gap in the second last child, but room in the next child.
mas_skip_node() may also be passed a maple state in an error state from mas_anode_descend() when no allocations are available. Return on such an error state immediately.
Link: https://lkml.kernel.org/r/20230307180247.2220303-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230307180247.2220303-2-Liam.Howlett@oracle.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. Howlett Liam.Howlett@oracle.com Reported-by: Snild Dolkow snild@sony.com Link: https://lore.kernel.org/linux-mm/cb8dc31a-fef2-1d09-f133-e9f7b9f9e77a@sony.c... Tested-by: Snild Dolkow snild@sony.com Cc: Peng Zhang zhangpeng.00@bytedance.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org --- lib/maple_tree.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c index fc3e22cff642..c50646fcb8ca 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5097,35 +5097,21 @@ static inline bool mas_rewind_node(struct ma_state *mas) */ static inline bool mas_skip_node(struct ma_state *mas) { - unsigned char slot, slot_count; - unsigned long *pivots; - enum maple_type mt; + if (mas_is_err(mas)) + return false;
- mt = mte_node_type(mas->node); - slot_count = mt_slots[mt] - 1; do { if (mte_is_root(mas->node)) { - slot = mas->offset; - if (slot > slot_count) { + if (mas->offset >= mas_data_end(mas)) { mas_set_err(mas, -EBUSY); return false; } } else { mas_ascend(mas); - slot = mas->offset; - mt = mte_node_type(mas->node); - slot_count = mt_slots[mt] - 1; } - } while (slot > slot_count); - - mas->offset = ++slot; - pivots = ma_pivots(mas_mn(mas), mt); - if (slot > 0) - mas->min = pivots[slot - 1] + 1; - - if (slot <= slot_count) - mas->max = pivots[slot]; + } while (mas->offset >= mas_data_end(mas));
+ mas->offset++; return true; }
From: "Liam R. Howlett" Liam.Howlett@Oracle.com
commit 39d0bd86c499ecd6abae42a9b7112056c5560691 upstream.
ma_pivots() and ma_data_end() may be called with a dead node. Ensure to that the node isn't dead before using the returned values.
This is necessary for RCU mode of the maple tree.
Link: https://lkml.kernel.org/r/20230227173632.3292573-1-surenb@google.com Link: https://lkml.kernel.org/r/20230227173632.3292573-2-surenb@google.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Cc: Stable@vger.kernel.org Signed-off-by: Liam Howlett Liam.Howlett@oracle.com --- lib/maple_tree.c | 52 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 9 deletions(-)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c index c50646fcb8ca..7c8225e7df13 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -534,6 +534,7 @@ static inline bool ma_dead_node(const struct maple_node *node)
return (parent == node); } + /* * mte_dead_node() - check if the @enode is dead. * @enode: The encoded maple node @@ -615,6 +616,8 @@ static inline unsigned int mas_alloc_req(const struct ma_state *mas) * @node - the maple node * @type - the node type * + * In the event of a dead node, this array may be %NULL + * * Return: A pointer to the maple node pivots */ static inline unsigned long *ma_pivots(struct maple_node *node, @@ -1086,8 +1089,11 @@ static int mas_ascend(struct ma_state *mas) a_type = mas_parent_enum(mas, p_enode); a_node = mte_parent(p_enode); a_slot = mte_parent_slot(p_enode); - pivots = ma_pivots(a_node, a_type); a_enode = mt_mk_node(a_node, a_type); + pivots = ma_pivots(a_node, a_type); + + if (unlikely(ma_dead_node(a_node))) + return 1;
if (!set_min && a_slot) { set_min = true; @@ -1393,6 +1399,9 @@ static inline unsigned char ma_data_end(struct maple_node *node, { unsigned char offset;
+ if (!pivots) + return 0; + if (type == maple_arange_64) return ma_meta_end(node, type);
@@ -1428,6 +1437,9 @@ static inline unsigned char mas_data_end(struct ma_state *mas) return ma_meta_end(node, type);
pivots = ma_pivots(node, type); + if (unlikely(ma_dead_node(node))) + return 0; + offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); @@ -4499,6 +4511,9 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min) node = mas_mn(mas); slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); + if (unlikely(ma_dead_node(node))) + return 1; + mas->max = pivots[offset]; if (offset) mas->min = pivots[offset - 1] + 1; @@ -4520,6 +4535,9 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min) slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); offset = ma_data_end(node, mt, pivots, mas->max); + if (unlikely(ma_dead_node(node))) + return 1; + if (offset) mas->min = pivots[offset - 1] + 1;
@@ -4568,6 +4586,7 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, struct maple_enode *enode; int level = 0; unsigned char offset; + unsigned char node_end; enum maple_type mt; void __rcu **slots;
@@ -4591,7 +4610,11 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); - } while (unlikely(offset == ma_data_end(node, mt, pivots, mas->max))); + node_end = ma_data_end(node, mt, pivots, mas->max); + if (unlikely(ma_dead_node(node))) + return 1; + + } while (unlikely(offset == node_end));
slots = ma_slots(node, mt); pivot = mas_safe_pivot(mas, pivots, ++offset, mt); @@ -4607,6 +4630,9 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, mt = mte_node_type(mas->node); slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); + if (unlikely(ma_dead_node(node))) + return 1; + offset = 0; pivot = pivots[0]; } @@ -4653,11 +4679,14 @@ static inline void *mas_next_nentry(struct ma_state *mas, return NULL; }
- pivots = ma_pivots(node, type); slots = ma_slots(node, type); - mas->index = mas_safe_min(mas, pivots, mas->offset); + pivots = ma_pivots(node, type); count = ma_data_end(node, type, pivots, mas->max); - if (ma_dead_node(node)) + if (unlikely(ma_dead_node(node))) + return NULL; + + mas->index = mas_safe_min(mas, pivots, mas->offset); + if (unlikely(ma_dead_node(node))) return NULL;
if (mas->index > max) @@ -4815,6 +4844,11 @@ static inline void *mas_prev_nentry(struct ma_state *mas, unsigned long limit,
slots = ma_slots(mn, mt); pivots = ma_pivots(mn, mt); + if (unlikely(ma_dead_node(mn))) { + mas_rewalk(mas, index); + goto retry; + } + if (offset == mt_pivots[mt]) pivot = mas->max; else @@ -6617,11 +6651,11 @@ static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn, while (likely(!ma_is_leaf(mt))) { MT_BUG_ON(mas->tree, mte_dead_node(mas->node)); slots = ma_slots(mn, mt); - pivots = ma_pivots(mn, mt); - max = pivots[0]; entry = mas_slot(mas, slots, 0); + pivots = ma_pivots(mn, mt); if (unlikely(ma_dead_node(mn))) return NULL; + max = pivots[0]; mas->node = entry; mn = mas_mn(mas); mt = mte_node_type(mas->node); @@ -6641,13 +6675,13 @@ static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn, if (likely(entry)) return entry;
- pivots = ma_pivots(mn, mt); - mas->index = pivots[0] + 1; mas->offset = 1; entry = mas_slot(mas, slots, 1); + pivots = ma_pivots(mn, mt); if (unlikely(ma_dead_node(mn))) return NULL;
+ mas->index = pivots[0] + 1; if (mas->index > limit) goto none;
From: "Liam R. Howlett" Liam.Howlett@Oracle.com
commit 46b345848261009477552d654cb2f65000c30e4d upstream.
If mas->node is an MAS_START, there are three cases, and they all assign different values to mas->node and mas->offset. So there is no need to set them to a default value before updating.
Update them directly to make them easier to understand and for better readability.
Link: https://lkml.kernel.org/r/20221221060058.609003-7-vernon2gm@gmail.com Cc: Stable@vger.kernel.org Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Vernon Yang vernon2gm@gmail.com Signed-off-by: Liam R. Howlett Liam.Howlett@oracle.com --- lib/maple_tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 7c8225e7df13..194963149c2d 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1334,7 +1334,7 @@ static void mas_node_count(struct ma_state *mas, int count) * mas_start() - Sets up maple state for operations. * @mas: The maple state. * - * If mas->node == MAS_START, then set the min, max, depth, and offset to + * If mas->node == MAS_START, then set the min, max and depth to * defaults. * * Return: @@ -1348,22 +1348,22 @@ static inline struct maple_enode *mas_start(struct ma_state *mas) if (likely(mas_is_start(mas))) { struct maple_enode *root;
- mas->node = MAS_NONE; mas->min = 0; mas->max = ULONG_MAX; mas->depth = 0; - mas->offset = 0;
root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 1; mas->node = mte_safe_root(root); + mas->offset = 0; return NULL; }
/* empty tree */ if (unlikely(!root)) { + mas->node = MAS_NONE; mas->offset = MAPLE_NODE_SLOTS; return NULL; }
From: "Liam R. Howlett" Liam.Howlett@Oracle.com
commit a7b92d59c885018cb7bb88539892278e4fd64b29 upstream.
When initially starting a search, the root node may already be in the process of being replaced in RCU mode. Detect and restart the walk if this is the case. This is necessary for RCU mode of the maple tree.
Link: https://lkml.kernel.org/r/20230227173632.3292573-3-surenb@google.com Cc: Stable@vger.kernel.org Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam Howlett Liam.Howlett@oracle.com --- lib/maple_tree.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 194963149c2d..6fcf08dbdbf9 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1352,12 +1352,16 @@ static inline struct maple_enode *mas_start(struct ma_state *mas) mas->max = ULONG_MAX; mas->depth = 0;
+retry: root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 1; mas->node = mte_safe_root(root); mas->offset = 0; + if (mte_dead_node(mas->node)) + goto retry; + return NULL; }
From: "Liam R. Howlett" Liam.Howlett@Oracle.com
commit 2e5b4921f8efc9e845f4f04741797d16f36847eb upstream.
The walk to destroy the nodes was not always setting the node type and would result in a destroy method potentially using the values as nodes. Avoid this by setting the correct node types. This is necessary for the RCU mode of the maple tree.
Link: https://lkml.kernel.org/r/20230227173632.3292573-4-surenb@google.com Cc: Stable@vger.kernel.org Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam Howlett Liam.Howlett@oracle.com --- lib/maple_tree.c | 73 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 11 deletions(-)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 6fcf08dbdbf9..0f0a2d4850e8 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -892,6 +892,44 @@ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt, meta->end = end; }
+/* + * mas_clear_meta() - clear the metadata information of a node, if it exists + * @mas: The maple state + * @mn: The maple node + * @mt: The maple node type + * @offset: The offset of the highest sub-gap in this node. + * @end: The end of the data in this node. + */ +static inline void mas_clear_meta(struct ma_state *mas, struct maple_node *mn, + enum maple_type mt) +{ + struct maple_metadata *meta; + unsigned long *pivots; + void __rcu **slots; + void *next; + + switch (mt) { + case maple_range_64: + pivots = mn->mr64.pivot; + if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) { + slots = mn->mr64.slot; + next = mas_slot_locked(mas, slots, + MAPLE_RANGE64_SLOTS - 1); + if (unlikely((mte_to_node(next) && mte_node_type(next)))) + return; /* The last slot is a node, no metadata */ + } + fallthrough; + case maple_arange_64: + meta = ma_meta(mn, mt); + break; + default: + return; + } + + meta->gap = 0; + meta->end = 0; +} + /* * ma_meta_end() - Get the data end of a node from the metadata * @mn: The maple node @@ -5439,20 +5477,22 @@ static inline int mas_rev_alloc(struct ma_state *mas, unsigned long min, * mas_dead_leaves() - Mark all leaves of a node as dead. * @mas: The maple state * @slots: Pointer to the slot array + * @type: The maple node type * * Must hold the write lock. * * Return: The number of leaves marked as dead. */ static inline -unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots) +unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots, + enum maple_type mt) { struct maple_node *node; enum maple_type type; void *entry; int offset;
- for (offset = 0; offset < mt_slot_count(mas->node); offset++) { + for (offset = 0; offset < mt_slots[mt]; offset++) { entry = mas_slot_locked(mas, slots, offset); type = mte_node_type(entry); node = mte_to_node(entry); @@ -5471,14 +5511,13 @@ unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots)
static void __rcu **mas_dead_walk(struct ma_state *mas, unsigned char offset) { - struct maple_node *node, *next; + struct maple_node *next; void __rcu **slots = NULL;
next = mas_mn(mas); do { - mas->node = ma_enode_ptr(next); - node = mas_mn(mas); - slots = ma_slots(node, node->type); + mas->node = mt_mk_node(next, next->type); + slots = ma_slots(next, next->type); next = mas_slot_locked(mas, slots, offset); offset = 0; } while (!ma_is_leaf(next->type)); @@ -5542,11 +5581,14 @@ static inline void __rcu **mas_destroy_descend(struct ma_state *mas, node = mas_mn(mas); slots = ma_slots(node, mte_node_type(mas->node)); next = mas_slot_locked(mas, slots, 0); - if ((mte_dead_node(next))) + if ((mte_dead_node(next))) { + mte_to_node(next)->type = mte_node_type(next); next = mas_slot_locked(mas, slots, 1); + }
mte_set_node_dead(mas->node); node->type = mte_node_type(mas->node); + mas_clear_meta(mas, node, node->type); node->piv_parent = prev; node->parent_slot = offset; offset = 0; @@ -5566,13 +5608,18 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags,
MA_STATE(mas, &mt, 0, 0);
- if (mte_is_leaf(enode)) + mas.node = enode; + if (mte_is_leaf(enode)) { + node->type = mte_node_type(enode); goto free_leaf; + }
+ ma_flags &= ~MT_FLAGS_LOCK_MASK; mt_init_flags(&mt, ma_flags); mas_lock(&mas);
- mas.node = start = enode; + mte_to_node(enode)->ma_flags = ma_flags; + start = enode; slots = mas_destroy_descend(&mas, start, 0); node = mas_mn(&mas); do { @@ -5580,7 +5627,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, unsigned char offset; struct maple_enode *parent, *tmp;
- node->slot_len = mas_dead_leaves(&mas, slots); + node->type = mte_node_type(mas.node); + node->slot_len = mas_dead_leaves(&mas, slots, node->type); if (free) mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; @@ -5604,7 +5652,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, } while (start != mas.node);
node = mas_mn(&mas); - node->slot_len = mas_dead_leaves(&mas, slots); + node->type = mte_node_type(mas.node); + node->slot_len = mas_dead_leaves(&mas, slots, node->type); if (free) mt_free_bulk(node->slot_len, slots);
@@ -5614,6 +5663,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, free_leaf: if (free) mt_free_rcu(&node->rcu); + else + mas_clear_meta(&mas, node, node->type); }
/*
From: "Liam R. Howlett" Liam.Howlett@Oracle.com
commit 8372f4d83f96f35915106093cde4565836587123 upstream.
The call to mte_set_dead_node() before the smp_wmb() already calls smp_wmb() so this is not needed. This is an optimization for the RCU mode of the maple tree.
Link: https://lkml.kernel.org/r/20230227173632.3292573-5-surenb@google.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Cc: stable@vger.kernel.org Signed-off-by: Liam Howlett Liam.Howlett@oracle.com --- lib/maple_tree.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 0f0a2d4850e8..281be0997e55 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5501,7 +5501,6 @@ unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots, break;
mte_set_node_dead(entry); - smp_wmb(); /* Needed for RCU */ node->type = type; rcu_assign_pointer(slots[offset], node); }
From: "Liam R. Howlett" Liam.Howlett@Oracle.com
commit 0a2b18d948838e16912b3b627b504ab062b7d02a upstream.
Add an smp_rmb() before reading the parent pointer to ensure that anything read from the node prior to the parent pointer hasn't been reordered ahead of this check.
The is necessary for RCU mode.
Link: https://lkml.kernel.org/r/20230227173632.3292573-7-surenb@google.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Cc: stable@vger.kernel.org Signed-off-by: Liam R. Howlett Liam.Howlett@oracle.com --- lib/maple_tree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 281be0997e55..2f9af64edad9 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -529,9 +529,11 @@ static inline struct maple_node *mte_parent(const struct maple_enode *enode) */ static inline bool ma_dead_node(const struct maple_node *node) { - struct maple_node *parent = (void *)((unsigned long) - node->parent & ~MAPLE_NODE_MASK); + struct maple_node *parent;
+ /* Do not reorder reads from the node prior to the parent check */ + smp_rmb(); + parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK); return (parent == node); }
@@ -546,6 +548,8 @@ static inline bool mte_dead_node(const struct maple_enode *enode) struct maple_node *parent, *node;
node = mte_to_node(enode); + /* Do not reorder reads from the node prior to the parent check */ + smp_rmb(); parent = mte_parent(enode); return (parent == node); }
From: "Liam R. Howlett" Liam.Howlett@Oracle.com
commit 790e1fa86b340c2bd4a327e01c161f7a1ad885f6 upstream.
Dereferencing RCU objects within the RCU callback without the RCU check has caused lockdep to complain. Fix the RCU dereferencing by using the RCU callback lock to ensure the operation is safe.
Also stop creating a new lock to use for dereferencing during destruction of the tree or subtree. Instead, pass through a pointer to the tree that has the lock that is held for RCU dereferencing checking. It also does not make sense to use the maple state in the freeing scenario as the tree walk is a special case where the tree no longer has the normal encodings and parent pointers.
Link: https://lkml.kernel.org/r/20230227173632.3292573-8-surenb@google.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Cc: stable@vger.kernel.org Reported-by: Suren Baghdasaryan surenb@google.com Signed-off-by: Liam R. Howlett Liam.Howlett@oracle.com --- lib/maple_tree.c | 188 ++++++++++++++++++++++++----------------------- 1 file changed, 96 insertions(+), 92 deletions(-)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 2f9af64edad9..b6e29081d2cc 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -814,6 +814,11 @@ static inline void *mt_slot(const struct maple_tree *mt, return rcu_dereference_check(slots[offset], mt_locked(mt)); }
+static inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots, + unsigned char offset) +{ + return rcu_dereference_protected(slots[offset], mt_locked(mt)); +} /* * mas_slot_locked() - Get the slot value when holding the maple tree lock. * @mas: The maple state @@ -825,7 +830,7 @@ static inline void *mt_slot(const struct maple_tree *mt, static inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots, unsigned char offset) { - return rcu_dereference_protected(slots[offset], mt_locked(mas->tree)); + return mt_slot_locked(mas->tree, slots, offset); }
/* @@ -897,34 +902,35 @@ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt, }
/* - * mas_clear_meta() - clear the metadata information of a node, if it exists - * @mas: The maple state + * mt_clear_meta() - clear the metadata information of a node, if it exists + * @mt: The maple tree * @mn: The maple node - * @mt: The maple node type + * @type: The maple node type * @offset: The offset of the highest sub-gap in this node. * @end: The end of the data in this node. */ -static inline void mas_clear_meta(struct ma_state *mas, struct maple_node *mn, - enum maple_type mt) +static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn, + enum maple_type type) { struct maple_metadata *meta; unsigned long *pivots; void __rcu **slots; void *next;
- switch (mt) { + switch (type) { case maple_range_64: pivots = mn->mr64.pivot; if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) { slots = mn->mr64.slot; - next = mas_slot_locked(mas, slots, - MAPLE_RANGE64_SLOTS - 1); - if (unlikely((mte_to_node(next) && mte_node_type(next)))) - return; /* The last slot is a node, no metadata */ + next = mt_slot_locked(mt, slots, + MAPLE_RANGE64_SLOTS - 1); + if (unlikely((mte_to_node(next) && + mte_node_type(next)))) + return; /* no metadata, could be node */ } fallthrough; case maple_arange_64: - meta = ma_meta(mn, mt); + meta = ma_meta(mn, type); break; default: return; @@ -5478,7 +5484,7 @@ static inline int mas_rev_alloc(struct ma_state *mas, unsigned long min, }
/* - * mas_dead_leaves() - Mark all leaves of a node as dead. + * mte_dead_leaves() - Mark all leaves of a node as dead. * @mas: The maple state * @slots: Pointer to the slot array * @type: The maple node type @@ -5488,16 +5494,16 @@ static inline int mas_rev_alloc(struct ma_state *mas, unsigned long min, * Return: The number of leaves marked as dead. */ static inline -unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots, - enum maple_type mt) +unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt, + void __rcu **slots) { struct maple_node *node; enum maple_type type; void *entry; int offset;
- for (offset = 0; offset < mt_slots[mt]; offset++) { - entry = mas_slot_locked(mas, slots, offset); + for (offset = 0; offset < mt_slot_count(enode); offset++) { + entry = mt_slot(mt, slots, offset); type = mte_node_type(entry); node = mte_to_node(entry); /* Use both node and type to catch LE & BE metadata */ @@ -5512,162 +5518,160 @@ unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots, return offset; }
-static void __rcu **mas_dead_walk(struct ma_state *mas, unsigned char offset) +/** + * mte_dead_walk() - Walk down a dead tree to just before the leaves + * @enode: The maple encoded node + * @offset: The starting offset + * + * Note: This can only be used from the RCU callback context. + */ +static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset) { - struct maple_node *next; + struct maple_node *node, *next; void __rcu **slots = NULL;
- next = mas_mn(mas); + next = mte_to_node(*enode); do { - mas->node = mt_mk_node(next, next->type); - slots = ma_slots(next, next->type); - next = mas_slot_locked(mas, slots, offset); + *enode = ma_enode_ptr(next); + node = mte_to_node(*enode); + slots = ma_slots(node, node->type); + next = rcu_dereference_protected(slots[offset], + lock_is_held(&rcu_callback_map)); offset = 0; } while (!ma_is_leaf(next->type));
return slots; }
+/** + * mt_free_walk() - Walk & free a tree in the RCU callback context + * @head: The RCU head that's within the node. + * + * Note: This can only be used from the RCU callback context. + */ static void mt_free_walk(struct rcu_head *head) { void __rcu **slots; struct maple_node *node, *start; - struct maple_tree mt; + struct maple_enode *enode; unsigned char offset; enum maple_type type; - MA_STATE(mas, &mt, 0, 0);
node = container_of(head, struct maple_node, rcu);
if (ma_is_leaf(node->type)) goto free_leaf;
- mt_init_flags(&mt, node->ma_flags); - mas_lock(&mas); start = node; - mas.node = mt_mk_node(node, node->type); - slots = mas_dead_walk(&mas, 0); - node = mas_mn(&mas); + enode = mt_mk_node(node, node->type); + slots = mte_dead_walk(&enode, 0); + node = mte_to_node(enode); do { mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; - mas.node = node->piv_parent; - if (mas_mn(&mas) == node) - goto start_slots_free; - - type = mte_node_type(mas.node); - slots = ma_slots(mte_to_node(mas.node), type); - if ((offset < mt_slots[type]) && (slots[offset])) - slots = mas_dead_walk(&mas, offset); - - node = mas_mn(&mas); + enode = node->piv_parent; + if (mte_to_node(enode) == node) + goto free_leaf; + + type = mte_node_type(enode); + slots = ma_slots(mte_to_node(enode), type); + if ((offset < mt_slots[type]) && + rcu_dereference_protected(slots[offset], + lock_is_held(&rcu_callback_map))) + slots = mte_dead_walk(&enode, offset); + node = mte_to_node(enode); } while ((node != start) || (node->slot_len < offset));
slots = ma_slots(node, node->type); mt_free_bulk(node->slot_len, slots);
-start_slots_free: - mas_unlock(&mas); free_leaf: mt_free_rcu(&node->rcu); }
-static inline void __rcu **mas_destroy_descend(struct ma_state *mas, - struct maple_enode *prev, unsigned char offset) +static inline void __rcu **mte_destroy_descend(struct maple_enode **enode, + struct maple_tree *mt, struct maple_enode *prev, unsigned char offset) { struct maple_node *node; - struct maple_enode *next = mas->node; + struct maple_enode *next = *enode; void __rcu **slots = NULL; + enum maple_type type; + unsigned char next_offset = 0;
do { - mas->node = next; - node = mas_mn(mas); - slots = ma_slots(node, mte_node_type(mas->node)); - next = mas_slot_locked(mas, slots, 0); - if ((mte_dead_node(next))) { - mte_to_node(next)->type = mte_node_type(next); - next = mas_slot_locked(mas, slots, 1); - } + *enode = next; + node = mte_to_node(*enode); + type = mte_node_type(*enode); + slots = ma_slots(node, type); + next = mt_slot_locked(mt, slots, next_offset); + if ((mte_dead_node(next))) + next = mt_slot_locked(mt, slots, ++next_offset);
- mte_set_node_dead(mas->node); - node->type = mte_node_type(mas->node); - mas_clear_meta(mas, node, node->type); + mte_set_node_dead(*enode); + node->type = type; node->piv_parent = prev; node->parent_slot = offset; - offset = 0; - prev = mas->node; + offset = next_offset; + next_offset = 0; + prev = *enode; } while (!mte_is_leaf(next));
return slots; }
-static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, +static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free) { void __rcu **slots; struct maple_node *node = mte_to_node(enode); struct maple_enode *start; - struct maple_tree mt; - - MA_STATE(mas, &mt, 0, 0);
- mas.node = enode; if (mte_is_leaf(enode)) { node->type = mte_node_type(enode); goto free_leaf; }
- ma_flags &= ~MT_FLAGS_LOCK_MASK; - mt_init_flags(&mt, ma_flags); - mas_lock(&mas); - - mte_to_node(enode)->ma_flags = ma_flags; start = enode; - slots = mas_destroy_descend(&mas, start, 0); - node = mas_mn(&mas); + slots = mte_destroy_descend(&enode, mt, start, 0); + node = mte_to_node(enode); // Updated in the above call. do { enum maple_type type; unsigned char offset; struct maple_enode *parent, *tmp;
- node->type = mte_node_type(mas.node); - node->slot_len = mas_dead_leaves(&mas, slots, node->type); + node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; - mas.node = node->piv_parent; - if (mas_mn(&mas) == node) - goto start_slots_free; + enode = node->piv_parent; + if (mte_to_node(enode) == node) + goto free_leaf;
- type = mte_node_type(mas.node); - slots = ma_slots(mte_to_node(mas.node), type); + type = mte_node_type(enode); + slots = ma_slots(mte_to_node(enode), type); if (offset >= mt_slots[type]) goto next;
- tmp = mas_slot_locked(&mas, slots, offset); + tmp = mt_slot_locked(mt, slots, offset); if (mte_node_type(tmp) && mte_to_node(tmp)) { - parent = mas.node; - mas.node = tmp; - slots = mas_destroy_descend(&mas, parent, offset); + parent = enode; + enode = tmp; + slots = mte_destroy_descend(&enode, mt, parent, offset); } next: - node = mas_mn(&mas); - } while (start != mas.node); + node = mte_to_node(enode); + } while (start != enode);
- node = mas_mn(&mas); - node->type = mte_node_type(mas.node); - node->slot_len = mas_dead_leaves(&mas, slots, node->type); + node = mte_to_node(enode); + node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots);
-start_slots_free: - mas_unlock(&mas); - free_leaf: if (free) mt_free_rcu(&node->rcu); else - mas_clear_meta(&mas, node, node->type); + mt_clear_meta(mt, node, node->type); }
/* @@ -5683,10 +5687,10 @@ static inline void mte_destroy_walk(struct maple_enode *enode, struct maple_node *node = mte_to_node(enode);
if (mt_in_rcu(mt)) { - mt_destroy_walk(enode, mt->ma_flags, false); + mt_destroy_walk(enode, mt, false); call_rcu(&node->rcu, mt_free_walk); } else { - mt_destroy_walk(enode, mt->ma_flags, true); + mt_destroy_walk(enode, mt, true); } }
From: "Liam R. Howlett" Liam.Howlett@Oracle.com
commit 3dd4432549415f3c65dd52d5c687629efbf4ece1 upstream.
Use the maple tree in RCU mode for VMA tracking.
The maple tree tracks the stack and is able to update the pivot (lower/upper boundary) in-place to allow the page fault handler to write to the tree while holding just the mmap read lock. This is safe as the writes to the stack have a guard VMA which ensures there will always be a NULL in the direction of the growth and thus will only update a pivot.
It is possible, but not recommended, to have VMAs that grow up/down without guard VMAs. syzbot has constructed a testcase which sets up a VMA to grow and consume the empty space. Overwriting the entire NULL entry causes the tree to be altered in a way that is not safe for concurrent readers; the readers may see a node being rewritten or one that does not match the maple state they are using.
Enabling RCU mode allows the concurrent readers to see a stable node and will return the expected result.
Link: https://lkml.kernel.org/r/20230227173632.3292573-9-surenb@google.com Cc: stable@vger.kernel.org Fixes: d4af56c5c7c6 ("mm: start tracking VMAs with maple tree") Signed-off-by: Liam R. Howlett Liam.Howlett@oracle.com Reported-by: syzbot+8d95422d3537159ca390@syzkaller.appspotmail.com --- include/linux/mm_types.h | 3 ++- kernel/fork.c | 3 +++ mm/mmap.c | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 500e536796ca..247aedb18d5c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -725,7 +725,8 @@ struct mm_struct { unsigned long cpu_bitmap[]; };
-#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN) +#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \ + MT_FLAGS_USE_RCU) extern struct mm_struct init_mm;
/* Pointer magic because the dynamic array size confuses some compilers. */ diff --git a/kernel/fork.c b/kernel/fork.c index a6d243a50be3..ec913b13c5ed 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -617,6 +617,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (retval) goto out;
+ mt_clear_in_rcu(mas.tree); mas_for_each(&old_mas, mpnt, ULONG_MAX) { struct file *file;
@@ -703,6 +704,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, retval = arch_dup_mmap(oldmm, mm); loop_out: mas_destroy(&mas); + if (!retval) + mt_set_in_rcu(mas.tree); out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); diff --git a/mm/mmap.c b/mm/mmap.c index 177714886849..fe1db604dc49 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2308,7 +2308,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, int count = 0; int error = -ENOMEM; MA_STATE(mas_detach, &mt_detach, 0, 0); - mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN); + mt_init_flags(&mt_detach, mas->tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_set_external_lock(&mt_detach, &mm->mmap_lock);
if (mas_preallocate(mas, vma, GFP_KERNEL)) @@ -3095,6 +3095,7 @@ void exit_mmap(struct mm_struct *mm) */ set_bit(MMF_OOM_SKIP, &mm->flags); mmap_write_lock(mm); + mt_clear_in_rcu(&mm->mm_mt); free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb);
On Tue, Apr 11, 2023 at 11:10:41AM -0400, Liam R. Howlett wrote:
Greg,
Here are the backports of the patches for the maple tree fixes from 6.3-rc6 for 6.1 and 6.2.
The patches differ with a few extra patches for the maple tree, and changes to the mm code patch to work without the vma iterator change.
Liam R. Howlett (14): maple_tree: remove GFP_ZERO from kmem_cache_alloc() and kmem_cache_alloc_bulk() maple_tree: fix potential rcu issue maple_tree: reduce user error potential maple_tree: fix handle of invalidated state in mas_wr_store_setup() maple_tree: fix mas_prev() and mas_find() state handling maple_tree: fix mas_skip_node() end slot detection maple_tree: be more cautious about dead nodes maple_tree: refine ma_state init from mas_start() maple_tree: detect dead nodes in mas_start() maple_tree: fix freeing of nodes in rcu mode maple_tree: remove extra smp_wmb() from mas_dead_leaves() maple_tree: add smp_rmb() to dead node detection maple_tree: add RCU lock checking to rcu callback functions mm: enable maple tree RCU mode by default.
include/linux/mm_types.h | 3 +- kernel/fork.c | 3 + lib/maple_tree.c | 389 ++++++++++++++++++++----------- mm/mmap.c | 3 +- tools/testing/radix-tree/maple.c | 18 +- 5 files changed, 263 insertions(+), 153 deletions(-)
-- 2.39.2
Thanks for these. One of them was already in the 6.2.y and 6.1.y releases, so I just skipped it.
greg k-h
linux-stable-mirror@lists.linaro.org