This unintended LRU eviction issue was observed while developing the selftest for "[PATCH bpf-next v10 0/8] bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags for percpu maps" [1].
When updating an existing element in lru_hash or lru_percpu_hash maps, the current implementation calls prealloc_lru_pop() to get a new node before checking if the key already exists. If the map is full, this triggers LRU eviction and removes an existing element, even though the update operation only needs to modify the value in-place.
In the selftest of the aforementioned patch, this was worked around by reserving an extra entry to avoid triggering eviction in __htab_lru_percpu_map_update_elem(). However, the underlying issue remains problematic because:
1. Users may unexpectedly lose entries when updating existing keys in a full map.
2. The eviction overhead is unnecessary for existing key updates.
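For illustration, here is a minimal userspace reproducer sketch (not part of this series; the libbpf calls mirror those used in the selftest, but the map size and map name are arbitrary) showing how the pre-fix behaviour can surface:

	/* Fill an LRU hash map, update an existing key, then check whether
	 * any other key was evicted as a side effect of the update.
	 */
	#include <stdio.h>
	#include <bpf/bpf.h>

	int main(void)
	{
		int max_entries = 128, map_fd, key;
		long long val = 1;

		map_fd = bpf_map_create(BPF_MAP_TYPE_LRU_HASH, "lru_demo",
					sizeof(int), sizeof(long long),
					max_entries, NULL);
		if (map_fd < 0)
			return 1;

		/* Populate every slot. */
		for (key = 0; key < max_entries; key++)
			bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);

		/* Update an existing key in the full map. */
		key = 0;
		val = 2;
		bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);

		/* Before this series, the update above may have evicted some
		 * other key; afterwards, all keys should still be present.
		 */
		for (key = 0; key < max_entries; key++)
			if (bpf_map_lookup_elem(map_fd, &key, &val))
				printf("key %d was evicted\n", key);

		return 0;
	}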
This patchset fixes the issue by first checking whether the key exists before allocating a new node. If the key is found, the value is updated using the extra LRU node without triggering any eviction; a new node is allocated only if the key does not exist.
Links:
[1] https://lore.kernel.org/bpf/20251117162033.6296-1-leon.hwang@linux.dev/
Changes:
v1 -> v2:
  * Tidy hash handling in LRU code.
  * Factor out bpf_lru_node_reset_state helper.
  * Factor out bpf_lru_move_next_inactive_rotation helper.
  * Update element using preallocated extra elements in order to avoid breaking the update atomicity (per Alexei).
  * Check values on other CPUs in tests (per bot).
  * v1: https://lore.kernel.org/bpf/20251202153032.10118-1-leon.hwang@linux.dev/
Leon Hwang (5):
  bpf: lru: Tidy hash handling in LRU code
  bpf: lru: Factor out bpf_lru_node_reset_state helper
  bpf: lru: Factor out bpf_lru_move_next_inactive_rotation helper
  bpf: lru: Fix unintended eviction when updating lru hash maps
  selftests/bpf: Add tests to verify no unintended eviction when updating lru_[percpu_,]hash maps
 kernel/bpf/bpf_lru_list.c                      | 228 ++++++++++++++----
 kernel/bpf/bpf_lru_list.h                      |  10 +-
 kernel/bpf/hashtab.c                           |  90 ++++++-
 .../selftests/bpf/prog_tests/htab_update.c     | 129 ++++++++++
 4 files changed, 402 insertions(+), 55 deletions(-)
--
2.52.0
The hash field is not used by the LRU list itself.
Setting hash while manipulating the LRU list also obscures the intent of the code and makes it harder to follow.
Tidy this up by moving the hash assignment to prealloc_lru_pop(), where the element is prepared for insertion into the hash table.
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
 kernel/bpf/bpf_lru_list.c | 24 +++++++++---------------
 kernel/bpf/bpf_lru_list.h |  5 ++---
 kernel/bpf/hashtab.c      |  5 ++---
 3 files changed, 13 insertions(+), 21 deletions(-)
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index e7a2fc60523f..f4e183a9c28f 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -344,10 +344,8 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, static void __local_list_add_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l, int cpu, - struct bpf_lru_node *node, - u32 hash) + struct bpf_lru_node *node) { - *(u32 *)((void *)node + lru->hash_offset) = hash; node->cpu = cpu; node->type = BPF_LRU_LOCAL_LIST_T_PENDING; bpf_lru_node_clear_ref(node); @@ -393,8 +391,7 @@ __local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) return NULL; }
-static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, - u32 hash) +static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru) { struct list_head *free_list; struct bpf_lru_node *node = NULL; @@ -415,7 +412,6 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
if (!list_empty(free_list)) { node = list_first_entry(free_list, struct bpf_lru_node, list); - *(u32 *)((void *)node + lru->hash_offset) = hash; bpf_lru_node_clear_ref(node); __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); } @@ -425,8 +421,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, return node; }
-static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, - u32 hash) +static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru) { struct bpf_lru_locallist *loc_l, *steal_loc_l; struct bpf_common_lru *clru = &lru->common_lru; @@ -446,7 +441,7 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, }
if (node) - __local_list_add_pending(lru, loc_l, cpu, node, hash); + __local_list_add_pending(lru, loc_l, cpu, node);
raw_spin_unlock_irqrestore(&loc_l->lock, flags);
@@ -481,19 +476,19 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
if (node) { raw_spin_lock_irqsave(&loc_l->lock, flags); - __local_list_add_pending(lru, loc_l, cpu, node, hash); + __local_list_add_pending(lru, loc_l, cpu, node); raw_spin_unlock_irqrestore(&loc_l->lock, flags); }
return node; }
-struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) +struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru) { if (lru->percpu) - return bpf_percpu_lru_pop_free(lru, hash); + return bpf_percpu_lru_pop_free(lru); else - return bpf_common_lru_pop_free(lru, hash); + return bpf_common_lru_pop_free(lru); }
static void bpf_common_lru_push_free(struct bpf_lru *lru, @@ -643,7 +638,7 @@ static void bpf_lru_list_init(struct bpf_lru_list *l) raw_spin_lock_init(&l->lock); }
-int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, +int bpf_lru_init(struct bpf_lru *lru, bool percpu, del_from_htab_func del_from_htab, void *del_arg) { int cpu; @@ -681,7 +676,6 @@ int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, lru->percpu = percpu; lru->del_from_htab = del_from_htab; lru->del_arg = del_arg; - lru->hash_offset = hash_offset;
return 0; } diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index fe2661a58ea9..29e8300e0fd1 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -57,7 +57,6 @@ struct bpf_lru { }; del_from_htab_func del_from_htab; void *del_arg; - unsigned int hash_offset; unsigned int target_free; unsigned int nr_scans; bool percpu; @@ -69,12 +68,12 @@ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) WRITE_ONCE(node->ref, 1); }
-int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, +int bpf_lru_init(struct bpf_lru *lru, bool percpu, del_from_htab_func del_from_htab, void *delete_arg); void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems); void bpf_lru_destroy(struct bpf_lru *lru); -struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash); +struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru); void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node);
#endif diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c8a9b27f8663..d029690246f8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -296,12 +296,13 @@ static void htab_free_elems(struct bpf_htab *htab) static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, u32 hash) { - struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash); + struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru); struct htab_elem *l;
if (node) { bpf_map_inc_elem_count(&htab->map); l = container_of(node, struct htab_elem, lru_node); + l->hash = hash; memcpy(l->key, key, htab->map.key_size); return l; } @@ -342,8 +343,6 @@ static int prealloc_init(struct bpf_htab *htab) if (htab_is_lru(htab)) err = bpf_lru_init(&htab->lru, htab->map.map_flags & BPF_F_NO_COMMON_LRU, - offsetof(struct htab_elem, hash) - - offsetof(struct htab_elem, lru_node), htab_lru_map_delete_node, htab); else
Introduce the helper bpf_lru_node_reset_state to set type and clear ref.
No functional change intended.
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
 kernel/bpf/bpf_lru_list.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index f4e183a9c28f..b17b05f41900 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -41,6 +41,12 @@ static void bpf_lru_node_clear_ref(struct bpf_lru_node *node)
 	WRITE_ONCE(node->ref, 0);
 }
 
+static void bpf_lru_node_reset_state(struct bpf_lru_node *node, enum bpf_lru_list_type type)
+{
+	node->type = type;
+	bpf_lru_node_clear_ref(node);
+}
+
 static void bpf_lru_list_count_inc(struct bpf_lru_list *l,
 				   enum bpf_lru_list_type type)
 {
@@ -85,8 +91,7 @@ static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
 		return;
 
 	bpf_lru_list_count_inc(l, tgt_type);
-	node->type = tgt_type;
-	bpf_lru_node_clear_ref(node);
+	bpf_lru_node_reset_state(node, tgt_type);
 	list_move(&node->list, &l->lists[tgt_type]);
 }
 
@@ -347,8 +352,7 @@ static void __local_list_add_pending(struct bpf_lru *lru,
 				     struct bpf_lru_node *node)
 {
 	node->cpu = cpu;
-	node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
-	bpf_lru_node_clear_ref(node);
+	bpf_lru_node_reset_state(node, BPF_LRU_LOCAL_LIST_T_PENDING);
 	list_add(&node->list, local_pending_list(loc_l));
 }
 
@@ -513,8 +517,7 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
 		goto check_lru_list;
 	}
 
-	node->type = BPF_LRU_LOCAL_LIST_T_FREE;
-	bpf_lru_node_clear_ref(node);
+	bpf_lru_node_reset_state(node, BPF_LRU_LOCAL_LIST_T_FREE);
 	list_move(&node->list, local_free_list(loc_l));
 
 	raw_spin_unlock_irqrestore(&loc_l->lock, flags);
@@ -559,8 +562,7 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
 		struct bpf_lru_node *node;
 
 		node = (struct bpf_lru_node *)(buf + node_offset);
-		node->type = BPF_LRU_LIST_T_FREE;
-		bpf_lru_node_clear_ref(node);
+		bpf_lru_node_reset_state(node, BPF_LRU_LIST_T_FREE);
 		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
 		buf += elem_size;
 	}
@@ -588,8 +590,7 @@ static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
 again:
 		node = (struct bpf_lru_node *)(buf + node_offset);
 		node->cpu = cpu;
-		node->type = BPF_LRU_LIST_T_FREE;
-		bpf_lru_node_clear_ref(node);
+		bpf_lru_node_reset_state(node, BPF_LRU_LIST_T_FREE);
 		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
 		i++;
 		buf += elem_size;
Factor out a bpf_lru_move_next_inactive_rotation() helper that adjusts next_inactive_rotation when a node is removed or moved, so it can be reused when handling the extra-node case in a later patch.
No functional change intended.
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
 kernel/bpf/bpf_lru_list.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index b17b05f41900..563707af8035 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -61,6 +61,15 @@ static void bpf_lru_list_count_dec(struct bpf_lru_list *l,
 	l->counts[type]--;
 }
 
+static void bpf_lru_move_next_inactive_rotation(struct bpf_lru_list *l, struct bpf_lru_node *node)
+{
+	/* If the removing node is the next_inactive_rotation candidate,
+	 * move the next_inactive_rotation pointer also.
+	 */
+	if (&node->list == l->next_inactive_rotation)
+		l->next_inactive_rotation = l->next_inactive_rotation->prev;
+}
+
 static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l,
 					struct bpf_lru_node *node,
 					struct list_head *free_list,
@@ -69,11 +78,7 @@ static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l,
 	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
 		return;
 
-	/* If the removing node is the next_inactive_rotation candidate,
-	 * move the next_inactive_rotation pointer also.
-	 */
-	if (&node->list == l->next_inactive_rotation)
-		l->next_inactive_rotation = l->next_inactive_rotation->prev;
+	bpf_lru_move_next_inactive_rotation(l, node);
 
 	bpf_lru_list_count_dec(l, node->type);
 
@@ -114,11 +119,7 @@ static void __bpf_lru_node_move(struct bpf_lru_list *l,
 	}
 	bpf_lru_node_clear_ref(node);
 
-	/* If the moving node is the next_inactive_rotation candidate,
-	 * move the next_inactive_rotation pointer also.
-	 */
-	if (&node->list == l->next_inactive_rotation)
-		l->next_inactive_rotation = l->next_inactive_rotation->prev;
+	bpf_lru_move_next_inactive_rotation(l, node);
 
 	list_move(&node->list, &l->lists[tgt_type]);
 }
When updating an existing element in lru_[percpu_,]hash maps, the current implementation always calls prealloc_lru_pop() to get a new node before checking if the key already exists. If the map is full, this triggers LRU eviction and removes an existing element, even though the update operation only needs to modify the value of an existing key in-place.
This is problematic because:
1. Users may unexpectedly lose entries when doing simple value updates
2. The eviction overhead is unnecessary for existing key updates
Fix this by first checking whether the key exists before allocating a new node. If the key is found, update the value using the extra LRU node without triggering any eviction.
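To make the mechanism concrete, here is a small userspace toy model (not kernel code; the names and structure are invented for the sketch) of the spare-element swap used for the non-percpu case: the replacement is built in a reserved spare element, the spare is linked in place of the old element, and the old element becomes the new spare, so readers see either the complete old or complete new value and no eviction is needed.

	#include <stdio.h>
	#include <string.h>

	struct elem {
		int key;
		char value[16];
	};

	static struct elem slot_a = { .key = 1, .value = "old" };
	static struct elem slot_b;		/* the reserved spare element */
	static struct elem *live  = &slot_a;	/* what the "hash table" points at */
	static struct elem *spare = &slot_b;

	/* Build the replacement in the spare element, then swap pointers. */
	static void update_in_place(int key, const char *value)
	{
		struct elem *new_elem = spare;

		new_elem->key = key;
		strncpy(new_elem->value, value, sizeof(new_elem->value) - 1);
		new_elem->value[sizeof(new_elem->value) - 1] = '\0';

		spare = live;		/* old element is recycled as the next spare */
		live = new_elem;	/* replacement becomes visible */
	}

	int main(void)
	{
		printf("before: key=%d value=%s\n", live->key, live->value);
		update_in_place(1, "new");
		printf("after:  key=%d value=%s\n", live->key, live->value);
		return 0;
	}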
Fixes: 29ba732acbee ("bpf: Add BPF_MAP_TYPE_LRU_HASH")
Fixes: 8f8449384ec3 ("bpf: Add BPF_MAP_TYPE_LRU_PERCPU_HASH")
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
 kernel/bpf/bpf_lru_list.c | 164 +++++++++++++++++++++++++++++++++++---
 kernel/bpf/bpf_lru_list.h |   5 +-
 kernel/bpf/hashtab.c      |  85 ++++++++++++++++++--
 3 files changed, 239 insertions(+), 15 deletions(-)
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 563707af8035..142b0f10b011 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -124,6 +124,41 @@ static void __bpf_lru_node_move(struct bpf_lru_list *l, list_move(&node->list, &l->lists[tgt_type]); }
+static struct bpf_lru_node *__bpf_lru_node_move_from_extra(struct bpf_lru_list *l, + enum bpf_lru_list_type tgt_type) +{ + struct bpf_lru_node *node; + + node = list_first_entry_or_null(&l->extra, struct bpf_lru_node, list); + if (!node) + return NULL; + + if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) + return NULL; + + bpf_lru_list_count_inc(l, tgt_type); + bpf_lru_node_reset_state(node, tgt_type); + list_move(&node->list, &l->lists[tgt_type]); + return node; +} + +static bool __bpf_lru_node_move_to_extra(struct bpf_lru_list *l, + struct bpf_lru_node *node) +{ + if (!list_empty(&l->extra)) + return false; + + if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) + return false; + + bpf_lru_move_next_inactive_rotation(l, node); + + bpf_lru_list_count_dec(l, node->type); + bpf_lru_node_reset_state(node, BPF_LRU_LIST_T_FREE); + list_move(&node->list, &l->extra); + return true; +} + static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l) { return l->counts[BPF_LRU_LIST_T_INACTIVE] < @@ -305,6 +340,69 @@ static void __local_list_flush(struct bpf_lru_list *l, } }
+static struct bpf_lru_node *bpf_percpu_lru_pop_extra(struct bpf_lru *lru) +{ + int cpu = raw_smp_processor_id(); + struct bpf_lru_node *node; + struct bpf_lru_list *l; + unsigned long flags; + + l = per_cpu_ptr(lru->percpu_lru, cpu); + + raw_spin_lock_irqsave(&l->lock, flags); + + node = __bpf_lru_node_move_from_extra(l, BPF_LRU_LIST_T_ACTIVE); + + raw_spin_unlock_irqrestore(&l->lock, flags); + + return node; +} + +static struct bpf_lru_node *bpf_lru_locallist_extra_pop(struct bpf_lru_locallist *l) +{ + struct bpf_lru_node *node; + + node = list_first_entry_or_null(&l->extra, struct bpf_lru_node, list); + if (node) + list_del(&node->list); + + return node; +} + +static void __local_list_add_pending(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l, + int cpu, + struct bpf_lru_node *node); + +static struct bpf_lru_node *bpf_common_lru_pop_extra(struct bpf_lru *lru) +{ + struct bpf_common_lru *clru = &lru->common_lru; + int cpu = raw_smp_processor_id(); + struct bpf_lru_locallist *loc_l; + struct bpf_lru_node *node; + unsigned long flags; + + loc_l = per_cpu_ptr(clru->local_list, cpu); + + raw_spin_lock_irqsave(&loc_l->lock, flags); + + node = bpf_lru_locallist_extra_pop(loc_l); + if (node) + __local_list_add_pending(lru, loc_l, cpu, node); + + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + + return node; +} + +struct bpf_lru_node *bpf_lru_pop_extra(struct bpf_lru *lru) +{ + if (lru->percpu) + return bpf_percpu_lru_pop_extra(lru); + else + return bpf_common_lru_pop_extra(lru); +} + static void bpf_lru_list_push_free(struct bpf_lru_list *l, struct bpf_lru_node *node) { @@ -496,6 +594,16 @@ struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru) return bpf_common_lru_pop_free(lru); }
+static bool bpf_lru_locallist_extra_push(struct bpf_lru_locallist *loc_l, struct bpf_lru_node *node) +{ + if (!list_empty(&loc_l->extra)) + return false; + + bpf_lru_node_reset_state(node, BPF_LRU_LIST_T_FREE); + list_move(&node->list, &loc_l->extra); + return true; +} + static void bpf_common_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { @@ -518,8 +626,10 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru, goto check_lru_list; }
- bpf_lru_node_reset_state(node, BPF_LRU_LOCAL_LIST_T_FREE); - list_move(&node->list, local_free_list(loc_l)); + if (!bpf_lru_locallist_extra_push(loc_l, node)) { + bpf_lru_node_reset_state(node, BPF_LRU_LOCAL_LIST_T_FREE); + list_move(&node->list, local_free_list(loc_l)); + }
raw_spin_unlock_irqrestore(&loc_l->lock, flags); return; @@ -539,7 +649,8 @@ static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
raw_spin_lock_irqsave(&l->lock, flags);
- __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); + if (!__bpf_lru_node_move_to_extra(l, node)) + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
raw_spin_unlock_irqrestore(&l->lock, flags); } @@ -554,9 +665,11 @@ void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, - u32 nr_elems) + u32 nr_elems, u32 nr_extra_elems) { - struct bpf_lru_list *l = &lru->common_lru.lru_list; + struct bpf_common_lru *clru = &lru->common_lru; + struct bpf_lru_list *l = &clru->lru_list; + int cpu; u32 i;
for (i = 0; i < nr_elems; i++) { @@ -570,11 +683,26 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2, 1, LOCAL_FREE_TARGET); + + if (WARN_ON_ONCE(nr_extra_elems != num_possible_cpus())) + return; + + for_each_possible_cpu(cpu) { + struct bpf_lru_locallist *loc_l; + struct bpf_lru_node *node; + + loc_l = per_cpu_ptr(clru->local_list, cpu); + node = (struct bpf_lru_node *)(buf + node_offset); + node->cpu = cpu; + bpf_lru_node_reset_state(node, BPF_LRU_LIST_T_FREE); + list_add(&node->list, &loc_l->extra); + buf += elem_size; + } }
static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, - u32 nr_elems) + u32 nr_elems, u32 nr_extra_elems) { u32 i, pcpu_entries; int cpu; @@ -600,17 +728,31 @@ static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, if (i % pcpu_entries) goto again; } + + if (WARN_ON_ONCE(nr_extra_elems != num_possible_cpus())) + return; + + for_each_possible_cpu(cpu) { + struct bpf_lru_node *node; + + l = per_cpu_ptr(lru->percpu_lru, cpu); + node = (struct bpf_lru_node *)(buf + node_offset); + node->cpu = cpu; + bpf_lru_node_reset_state(node, BPF_LRU_LIST_T_FREE); + list_add(&node->list, &l->extra); + buf += elem_size; + } }
void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, - u32 elem_size, u32 nr_elems) + u32 elem_size, u32 nr_elems, u32 nr_extra_elems) { if (lru->percpu) bpf_percpu_lru_populate(lru, buf, node_offset, elem_size, - nr_elems); + nr_elems, nr_extra_elems); else bpf_common_lru_populate(lru, buf, node_offset, elem_size, - nr_elems); + nr_elems, nr_extra_elems); }
static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) @@ -620,6 +762,8 @@ static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++) INIT_LIST_HEAD(&loc_l->lists[i]);
+ INIT_LIST_HEAD(&loc_l->extra); + loc_l->next_steal = cpu;
raw_spin_lock_init(&loc_l->lock); @@ -637,6 +781,8 @@ static void bpf_lru_list_init(struct bpf_lru_list *l)
l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];
+ INIT_LIST_HEAD(&l->extra); + raw_spin_lock_init(&l->lock); }
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 29e8300e0fd1..446779341b34 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -33,12 +33,14 @@ struct bpf_lru_list { unsigned int counts[NR_BPF_LRU_LIST_COUNT]; /* The next inactive list rotation starts from here */ struct list_head *next_inactive_rotation; + struct list_head extra; /* for percpu lru */
raw_spinlock_t lock ____cacheline_aligned_in_smp; };
struct bpf_lru_locallist { struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T]; + struct list_head extra; /* for common lru */ u16 next_steal; raw_spinlock_t lock; }; @@ -71,9 +73,10 @@ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) int bpf_lru_init(struct bpf_lru *lru, bool percpu, del_from_htab_func del_from_htab, void *delete_arg); void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, - u32 elem_size, u32 nr_elems); + u32 elem_size, u32 nr_elems, u32 nr_extra_elems); void bpf_lru_destroy(struct bpf_lru *lru); struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru); void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node); +struct bpf_lru_node *bpf_lru_pop_extra(struct bpf_lru *lru);
#endif diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d029690246f8..8665eb6b8a7d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -207,12 +207,12 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) }
/* Both percpu and fd htab support in-place update, so no need for - * extra elem. LRU itself can remove the least used element, so - * there is no need for an extra elem during map_update. + * extra elem. LRU requires extra elems to avoid unintended eviction when + * updating the existing elems. */ static bool htab_has_extra_elems(struct bpf_htab *htab) { - return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab); + return htab_is_lru(htab) || (!htab_is_percpu(htab) && !is_fd_htab(htab)); }
static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) @@ -313,6 +313,7 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, static int prealloc_init(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; + u32 lru_num_entries = num_entries; int err = -ENOMEM, i;
if (htab_has_extra_elems(htab)) @@ -354,7 +355,8 @@ static int prealloc_init(struct bpf_htab *htab) if (htab_is_lru(htab)) bpf_lru_populate(&htab->lru, htab->elems, offsetof(struct htab_elem, lru_node), - htab->elem_size, num_entries); + htab->elem_size, lru_num_entries, + num_entries - lru_num_entries); else pcpu_freelist_populate(&htab->freelist, htab->elems + offsetof(struct htab_elem, fnode), @@ -557,7 +559,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (err) goto free_map_locked;
- if (htab_has_extra_elems(htab)) { + if (htab_has_extra_elems(htab) && !htab_is_lru(htab)) { err = alloc_extra_elems(htab); if (err) goto free_prealloc; @@ -1182,6 +1184,69 @@ static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) bpf_lru_push_free(&htab->lru, &elem->lru_node); }
+static int htab_lru_map_update_elem_in_place(struct bpf_htab *htab, void *key, void *value, + u64 map_flags, struct bucket *b, + struct hlist_nulls_head *head, u32 hash, + bool percpu, bool onallcpus) +{ + struct htab_elem *l_new, *l_old, *l_free; + struct bpf_map *map = &htab->map; + u32 key_size = map->key_size; + struct bpf_lru_node *node; + unsigned long flags; + void *l_val; + int ret; + + node = bpf_lru_pop_extra(&htab->lru); + if (!node) + return -ENOENT; + + l_new = container_of(node, struct htab_elem, lru_node); + l_new->hash = hash; + memcpy(l_new->key, key, key_size); + if (!percpu) { + l_val = htab_elem_value(l_new, map->key_size); + copy_map_value(map, l_val, value); + bpf_obj_free_fields(map->record, l_val); + } + + ret = htab_lock_bucket(b, &flags); + if (ret) + goto err_lock_bucket; + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + if (l_old) { + bpf_lru_node_set_ref(&l_new->lru_node); + if (percpu) { + /* per-cpu hash map can update value in-place. + * Keep the same logic in __htab_lru_percpu_map_update_elem(). + */ + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); + l_free = l_new; + } else { + hlist_nulls_add_head_rcu(&l_new->hash_node, head); + hlist_nulls_del_rcu(&l_old->hash_node); + l_free = l_old; + } + } else { + ret = -ENOENT; + } + +err: + htab_unlock_bucket(b, flags); + +err_lock_bucket: + bpf_lru_push_free(&htab->lru, ret ? node : &l_free->lru_node); + + return ret; +} + static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { @@ -1206,6 +1271,11 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value b = __select_bucket(htab, hash); head = &b->head;
+ ret = htab_lru_map_update_elem_in_place(htab, key, value, map_flags, b, head, hash, false, + false); + if (!ret) + return 0; + /* For LRU, we need to alloc before taking bucket's * spinlock because getting free nodes from LRU may need * to remove older elements from htab and this removal @@ -1336,6 +1406,11 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, b = __select_bucket(htab, hash); head = &b->head;
+ ret = htab_lru_map_update_elem_in_place(htab, key, value, map_flags, b, head, hash, true, + onallcpus); + if (!ret) + return 0; + /* For LRU, we need to alloc before taking bucket's * spinlock because LRU's elem alloc may need * to remove older elem from htab and this removal
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d029690246f8..8665eb6b8a7d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c
[ ... ]
+static int htab_lru_map_update_elem_in_place(struct bpf_htab *htab, void *key, void *value,
+					     u64 map_flags, struct bucket *b,
+					     struct hlist_nulls_head *head, u32 hash,
+					     bool percpu, bool onallcpus)
+{
+	struct htab_elem *l_new, *l_old, *l_free;
+	struct bpf_map *map = &htab->map;
+	u32 key_size = map->key_size;
+	struct bpf_lru_node *node;
+	unsigned long flags;
+	void *l_val;
+	int ret;
+
+	node = bpf_lru_pop_extra(&htab->lru);
+	if (!node)
+		return -ENOENT;
+
+	l_new = container_of(node, struct htab_elem, lru_node);
+	l_new->hash = hash;
+	memcpy(l_new->key, key, key_size);
+	if (!percpu) {
+		l_val = htab_elem_value(l_new, map->key_size);
+		copy_map_value(map, l_val, value);
+		bpf_obj_free_fields(map->record, l_val);
+	}
Should bpf_obj_free_fields() be called here on l_new? The value was just copied into l_new and l_new will be inserted into the hash table when l_old is found. Calling bpf_obj_free_fields() on l_new's value frees the special fields (kptr, bpf_list_head, etc.) that were just copied, then l_new is inserted with freed fields. This appears to be a use-after-free for maps with special fields.
Looking at htab_lru_map_update_elem(), the pattern is to copy the value and NOT call bpf_obj_free_fields() on the new element. The fields are only freed when an element is removed from the hash table.
+	ret = htab_lock_bucket(b, &flags);
+	if (ret)
+		goto err_lock_bucket;
+
+	l_old = lookup_elem_raw(head, hash, key, key_size);
+
+	ret = check_flags(htab, l_old, map_flags);
+	if (ret)
+		goto err;
+
+	if (l_old) {
+		bpf_lru_node_set_ref(&l_new->lru_node);
+		if (percpu) {
+			/* per-cpu hash map can update value in-place.
+			 * Keep the same logic in __htab_lru_percpu_map_update_elem().
+			 */
+			pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
+					value, onallcpus);
+			l_free = l_new;
+		} else {
+			hlist_nulls_add_head_rcu(&l_new->hash_node, head);
+			hlist_nulls_del_rcu(&l_old->hash_node);
+			l_free = l_old;
+		}
+	} else {
+		ret = -ENOENT;
+	}
+
+err:
+	htab_unlock_bucket(b, flags);
+
+err_lock_bucket:
+	bpf_lru_push_free(&htab->lru, ret ? node : &l_free->lru_node);
^^^^^^^^^^^^^^^^^^
When ret == 0 and !percpu, l_free points to l_old which is being removed from the hash table. The code calls bpf_lru_push_free() directly instead of htab_lru_push_free(). Looking at htab_lru_push_free():
    static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
    {
        check_and_free_fields(htab, elem);
        bpf_map_dec_elem_count(&htab->map);
        bpf_lru_push_free(&htab->lru, &elem->lru_node);
    }
It calls check_and_free_fields() to free special fields in the element before pushing to the free list. The code here bypasses that, which would leak special fields (kptr, etc.) in l_old when it has them.
Should this use htab_lru_push_free() instead of bpf_lru_push_free() for the non-error path where l_old is being freed?
+	return ret;
+}
---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/20720201621
On 2026/1/5 23:43, bot+bpf-ci@kernel.org wrote:
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index d029690246f8..8665eb6b8a7d 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
>
> [ ... ]
>
> +static int htab_lru_map_update_elem_in_place(struct bpf_htab *htab, void *key, void *value,
> +					     u64 map_flags, struct bucket *b,
> +					     struct hlist_nulls_head *head, u32 hash,
> +					     bool percpu, bool onallcpus)
> +{
> +	struct htab_elem *l_new, *l_old, *l_free;
> +	struct bpf_map *map = &htab->map;
> +	u32 key_size = map->key_size;
> +	struct bpf_lru_node *node;
> +	unsigned long flags;
> +	void *l_val;
> +	int ret;
> +
> +	node = bpf_lru_pop_extra(&htab->lru);
> +	if (!node)
> +		return -ENOENT;
> +
> +	l_new = container_of(node, struct htab_elem, lru_node);
> +	l_new->hash = hash;
> +	memcpy(l_new->key, key, key_size);
> +	if (!percpu) {
> +		l_val = htab_elem_value(l_new, map->key_size);
> +		copy_map_value(map, l_val, value);
> +		bpf_obj_free_fields(map->record, l_val);
> +	}
>
> Should bpf_obj_free_fields() be called here on l_new? The value was just
> copied into l_new and l_new will be inserted into the hash table when l_old
> is found. Calling bpf_obj_free_fields() on l_new's value frees the special
> fields (kptr, bpf_list_head, etc.) that were just copied, then l_new is
> inserted with freed fields. This appears to be a use-after-free for maps
> with special fields.
>
> Looking at htab_lru_map_update_elem(), the pattern is to copy the value and
> NOT call bpf_obj_free_fields() on the new element. The fields are only
> freed when an element is removed from the hash table.
You're wrong here: the copied value must be cleaned up by bpf_obj_free_fields().
> +	ret = htab_lock_bucket(b, &flags);
> +	if (ret)
> +		goto err_lock_bucket;
> +
> +	l_old = lookup_elem_raw(head, hash, key, key_size);
> +
> +	ret = check_flags(htab, l_old, map_flags);
> +	if (ret)
> +		goto err;
> +
> +	if (l_old) {
> +		bpf_lru_node_set_ref(&l_new->lru_node);
> +		if (percpu) {
> +			/* per-cpu hash map can update value in-place.
> +			 * Keep the same logic in __htab_lru_percpu_map_update_elem().
> +			 */
> +			pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
> +					value, onallcpus);
> +			l_free = l_new;
> +		} else {
> +			hlist_nulls_add_head_rcu(&l_new->hash_node, head);
> +			hlist_nulls_del_rcu(&l_old->hash_node);
> +			l_free = l_old;
> +		}
> +	} else {
> +		ret = -ENOENT;
> +	}
> +
> +err:
> +	htab_unlock_bucket(b, flags);
> +
> +err_lock_bucket:
> +	bpf_lru_push_free(&htab->lru, ret ? node : &l_free->lru_node);
>           ^^^^^^^^^^^^^^^^^^
>
> When ret == 0 and !percpu, l_free points to l_old which is being removed
> from the hash table. The code calls bpf_lru_push_free() directly instead
> of htab_lru_push_free(). Looking at htab_lru_push_free():
>
>     static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
>     {
>         check_and_free_fields(htab, elem);
>         bpf_map_dec_elem_count(&htab->map);
>         bpf_lru_push_free(&htab->lru, &elem->lru_node);
>     }
>
> It calls check_and_free_fields() to free special fields in the element
> before pushing to the free list. The code here bypasses that, which would
> leak special fields (kptr, etc.) in l_old when it has them.
>
> Should this use htab_lru_push_free() instead of bpf_lru_push_free() for the
> non-error path where l_old is being freed?
You're right here: I did not free the special fields of l_old. But htab_lru_push_free() shouldn't be used for the non-error path, because bpf_map_dec_elem_count() is not required here.
I'll fix it in the next revision.
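One possible shape for that fix (an untested sketch, not the actual next revision) is to free l_old's special fields before recycling its node, while leaving the element count alone:

 err_lock_bucket:
-	bpf_lru_push_free(&htab->lru, ret ? node : &l_free->lru_node);
+	if (!ret && l_free == l_old)
+		check_and_free_fields(htab, l_free);
+	bpf_lru_push_free(&htab->lru, ret ? node : &l_free->lru_node);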
Thanks,
Leon
> +	return ret;
> +}
> AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
> See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
>
> CI run summary: https://github.com/kernel-patches/bpf/actions/runs/20720201621
Add four tests to verify that updating an existing element in LRU hash maps does not cause unintended eviction of other elements.
The test creates lru_hash/lru_percpu_hash maps with max_entries slots and populates all of them. It then updates an existing key and verifies that:
1. The update succeeds without error
2. The updated key has the new value
3. All other keys still exist with their original values
This validates the fix that prevents unnecessary LRU eviction when updating existing elements in full LRU hash maps.
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
 .../selftests/bpf/prog_tests/htab_update.c | 129 ++++++++++++++++++
 1 file changed, 129 insertions(+)
diff --git a/tools/testing/selftests/bpf/prog_tests/htab_update.c b/tools/testing/selftests/bpf/prog_tests/htab_update.c index d0b405eb2966..a0c93aae2b99 100644 --- a/tools/testing/selftests/bpf/prog_tests/htab_update.c +++ b/tools/testing/selftests/bpf/prog_tests/htab_update.c @@ -143,3 +143,132 @@ void test_htab_update(void) if (test__start_subtest("concurrent_update")) test_concurrent_update(); } + +static void __setaffinity(cpu_set_t *cpus, int cpu) +{ + CPU_ZERO(cpus); + CPU_SET(cpu, cpus); + pthread_setaffinity_np(pthread_self(), sizeof(*cpus), cpus); +} + +static void test_lru_hash_map_update_elem(enum bpf_map_type map_type, u64 map_flags) +{ + bool percpu = map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; + int err, map_fd, i, key, nr_cpus, max_entries = 128; + u64 *values, value = 0xDEADC0DE; + cpu_set_t cpus; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .map_flags = map_flags, + ); + + nr_cpus = libbpf_num_possible_cpus(); + if (!ASSERT_GT(nr_cpus, 0, "libbpf_num_possible_cpus")) + return; + + values = calloc(nr_cpus, sizeof(u64)); + if (!ASSERT_OK_PTR(values, "calloc values")) + return; + for (i = 0; i < nr_cpus; i++) + values[i] = value; + + map_fd = bpf_map_create(map_type, "test_lru", sizeof(int), sizeof(u64), max_entries, &opts); + if (!ASSERT_GE(map_fd, 0, "bpf_map_create")) { + free(values); + return; + } + + /* populate all slots */ + for (key = 0; key < max_entries; key++) { + __setaffinity(&cpus, key%nr_cpus); + err = bpf_map_update_elem(map_fd, &key, values, 0); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto out; + } + + /* LRU eviction should not happen */ + +#define CHECK_OTHER_CPUS_VALUES(__val) \ + do { \ + if (!percpu) \ + break; \ + for (i = 1; i < nr_cpus; i++) \ + if (!ASSERT_EQ(values[i], __val, "bpf_map_lookup_elem value")) \ + goto out; \ + } while (0) + + __setaffinity(&cpus, 0); + key = 0; + memset(values, 0, nr_cpus * sizeof(u64)); + err = bpf_map_update_elem(map_fd, &key, values, 0); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto out; + + err = bpf_map_lookup_elem(map_fd, &key, values); + if (!ASSERT_OK(err, "bpf_map_lookup_elem")) + goto out; + if (!ASSERT_EQ(*values, 0, "bpf_map_lookup_elem value")) + goto out; + CHECK_OTHER_CPUS_VALUES(0); + + for (key = 1; key < max_entries; key++) { + err = bpf_map_lookup_elem(map_fd, &key, values); + if (!ASSERT_OK(err, "bpf_map_lookup_elem")) + goto out; + if (!ASSERT_EQ(*values, value, "bpf_map_lookup_elem value")) + goto out; + CHECK_OTHER_CPUS_VALUES(value); + } + + for (i = 0; i < nr_cpus; i++) + values[i] = value; + + key = max_entries; + err = bpf_map_update_elem(map_fd, &key, values, 0); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto out; + + err = bpf_map_lookup_elem(map_fd, &key, values); + if (!ASSERT_OK(err, "bpf_map_lookup_elem")) + goto out; + if (!ASSERT_EQ(*values, value, "bpf_map_lookup_elem value")) + goto out; + CHECK_OTHER_CPUS_VALUES(value); + +#undef CHECK_OTHER_CPUS_VALUES + +out: + close(map_fd); + free(values); +} + +static void test_update_lru_hash_map_common_lru(void) +{ + test_lru_hash_map_update_elem(BPF_MAP_TYPE_LRU_HASH, 0); +} + +static void test_update_lru_hash_map_percpu_lru(void) +{ + test_lru_hash_map_update_elem(BPF_MAP_TYPE_LRU_HASH, BPF_F_NO_COMMON_LRU); +} + +static void test_update_lru_percpu_hash_map_common_lru(void) +{ + test_lru_hash_map_update_elem(BPF_MAP_TYPE_LRU_PERCPU_HASH, 0); +} + +static void test_update_lru_percpu_hash_map_percpu_lru(void) +{ + test_lru_hash_map_update_elem(BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_F_NO_COMMON_LRU); +} + +void 
test_update_lru_hash_maps(void)
+{
+	if (test__start_subtest("lru_hash/common_lru"))
+		test_update_lru_hash_map_common_lru();
+	if (test__start_subtest("lru_hash/percpu_lru"))
+		test_update_lru_hash_map_percpu_lru();
+	if (test__start_subtest("lru_percpu_hash/common_lru"))
+		test_update_lru_percpu_hash_map_common_lru();
+	if (test__start_subtest("lru_percpu_hash/percpu_lru"))
+		test_update_lru_percpu_hash_map_percpu_lru();
+}