From: Jarkko Sakkinen <jarkko.sakkinen@opinsys.com>
Add an isolated list of unreferenced keys to be queued for deletion, and try to pin the keys in the garbage collector before processing anything. Skip unpinnable keys.
Use this list for blocking the reaping process during the teardown:
1. First off, the keys added to `key_graveyard` are snapshotted, and the list is flushed. This is the very last step in `key_put()`.
2. `key_put()` reaches zero. This will mark the key as busy for the garbage collector.
3. `key_garbage_collector()` will try to increase the refcount, which won't go above zero. Whenever this happens, the key will be skipped.
Cc: stable@vger.kernel.org # v6.1+
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@opinsys.com>
---
v8:
- One more rebasing error (2x list_splice_init, reported by Marek Szyprowski).
v7:
- Fixed multiple definitions (from rebasing).
v6:
- Rebase went wrong in v5.
v5:
- Rebased on top of v6.15-rc.
- Updated the commit message to explain how the spin lock and refcount isolate the time window in key_put().
v4:
- Pin the key while processing key type teardown. Skip dead keys.
- Revert key_gc_graveyard back to key_gc_unused_keys.
- Rewrote the commit message.
- The "unsigned long flags" declaration somehow did make it to the previous patch (sorry).
v3:
- Using spin_lock() fails since key_put() is executed inside IRQs. Using spin_lock_irqsave() would not work either, given that the lock is acquired for /proc/keys. Therefore, use a separate lock for the graveyard and flush key_graveyard before reaping key_serial_tree.
v2:
- Rename key_gc_unused_keys as key_gc_graveyard, and re-document the function.
---
 include/linux/key.h      |  7 ++-----
 security/keys/gc.c       | 36 ++++++++++++++++++++----------------
 security/keys/internal.h |  5 +++++
 security/keys/key.c      |  7 +++++--
 4 files changed, 32 insertions(+), 23 deletions(-)
diff --git a/include/linux/key.h b/include/linux/key.h
index ba05de8579ec..c50659184bdf 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -195,10 +195,8 @@ enum key_state {
 struct key {
 	refcount_t		usage;		/* number of references */
 	key_serial_t		serial;		/* key serial number */
-	union {
-		struct list_head graveyard_link;
-		struct rb_node	serial_node;
-	};
+	struct list_head	graveyard_link; /* key->usage == 0 */
+	struct rb_node		serial_node;
 #ifdef CONFIG_KEY_NOTIFICATIONS
 	struct watch_list	*watchers;	/* Entities watching this key for changes */
 #endif
@@ -236,7 +234,6 @@ struct key {
 #define KEY_FLAG_ROOT_CAN_INVAL	7	/* set if key can be invalidated by root without permission */
 #define KEY_FLAG_KEEP		8	/* set if key should not be removed */
 #define KEY_FLAG_UID_KEYRING	9	/* set if key is a user or user session keyring */
-#define KEY_FLAG_FINAL_PUT	10	/* set if final put has happened on key */
 
 /* the key type and key description string
  * - the desc is used to match a key against search criteria
diff --git a/security/keys/gc.c b/security/keys/gc.c
index f27223ea4578..9ccd8ee6fcdb 100644
--- a/security/keys/gc.c
+++ b/security/keys/gc.c
@@ -189,6 +189,7 @@ static void key_garbage_collector(struct work_struct *work)
 	struct rb_node *cursor;
 	struct key *key;
 	time64_t new_timer, limit, expiry;
+	unsigned long flags;
 
 	kenter("[%lx,%x]", key_gc_flags, gc_state);
 
@@ -206,21 +207,35 @@ static void key_garbage_collector(struct work_struct *work)
 
 	new_timer = TIME64_MAX;
 
+	spin_lock_irqsave(&key_graveyard_lock, flags);
+	list_splice_init(&key_graveyard, &graveyard);
+	spin_unlock_irqrestore(&key_graveyard_lock, flags);
+
+	list_for_each_entry(key, &graveyard, graveyard_link) {
+		spin_lock(&key_serial_lock);
+		kdebug("unrefd key %d", key->serial);
+		rb_erase(&key->serial_node, &key_serial_tree);
+		spin_unlock(&key_serial_lock);
+	}
+
 	/* As only this function is permitted to remove things from the key
 	 * serial tree, if cursor is non-NULL then it will always point to a
 	 * valid node in the tree - even if lock got dropped.
 	 */
 	spin_lock(&key_serial_lock);
+	key = NULL;
 	cursor = rb_first(&key_serial_tree);
 
 continue_scanning:
+	key_put(key);
 	while (cursor) {
 		key = rb_entry(cursor, struct key, serial_node);
 		cursor = rb_next(cursor);
-
-		if (test_bit(KEY_FLAG_FINAL_PUT, &key->flags)) {
-			smp_mb(); /* Clobber key->user after FINAL_PUT seen. */
-			goto found_unreferenced_key;
+		/* key_get(), unless zero: */
+		if (!refcount_inc_not_zero(&key->usage)) {
+			key = NULL;
+			gc_state |= KEY_GC_REAP_AGAIN;
+			goto skip_dead_key;
 		}
 
 		if (unlikely(gc_state & KEY_GC_REAPING_DEAD_1)) {
@@ -274,6 +289,7 @@ static void key_garbage_collector(struct work_struct *work)
 		spin_lock(&key_serial_lock);
 		goto continue_scanning;
 	}
+	key_put(key);
 
 	/* We've completed the pass. Set the timer if we need to and queue a
 	 * new cycle if necessary. We keep executing cycles until we find one
@@ -328,18 +344,6 @@ static void key_garbage_collector(struct work_struct *work)
 	kleave(" [end %x]", gc_state);
 	return;
 
-	/* We found an unreferenced key - once we've removed it from the tree,
-	 * we can safely drop the lock.
-	 */
-found_unreferenced_key:
-	kdebug("unrefd key %d", key->serial);
-	rb_erase(&key->serial_node, &key_serial_tree);
-	spin_unlock(&key_serial_lock);
-
-	list_add_tail(&key->graveyard_link, &graveyard);
-	gc_state |= KEY_GC_REAP_AGAIN;
-	goto maybe_resched;
-
 	/* We found a restricted keyring and need to update the restriction if
 	 * it is associated with the dead key type.
 	 */
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 2cffa6dc8255..4e3d9b322390 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -63,9 +63,14 @@ struct key_user {
 	int			qnbytes;	/* number of bytes allocated to this user */
 };
 
+extern struct list_head key_graveyard;
+extern spinlock_t key_graveyard_lock;
+
 extern struct rb_root	key_user_tree;
 extern spinlock_t	key_user_lock;
 extern struct key_user	root_key_user;
+extern struct list_head key_graveyard;
+extern spinlock_t key_graveyard_lock;
 
 extern struct key_user *key_user_lookup(kuid_t uid);
 extern void key_user_put(struct key_user *user);
diff --git a/security/keys/key.c b/security/keys/key.c
index 7198cd2ac3a3..7511f2017b6b 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -22,6 +22,8 @@ DEFINE_SPINLOCK(key_serial_lock);
 
 struct rb_root	key_user_tree; /* tree of quota records indexed by UID */
 DEFINE_SPINLOCK(key_user_lock);
+LIST_HEAD(key_graveyard);
+DEFINE_SPINLOCK(key_graveyard_lock);
 
 unsigned int key_quota_root_maxkeys = 1000000;	/* root's key count quota */
 unsigned int key_quota_root_maxbytes = 25000000; /* root's key space quota */
@@ -658,8 +660,9 @@ void key_put(struct key *key)
 			key->user->qnbytes -= key->quotalen;
 			spin_unlock_irqrestore(&key->user->lock, flags);
 		}
-		smp_mb(); /* key->user before FINAL_PUT set. */
-		set_bit(KEY_FLAG_FINAL_PUT, &key->flags);
+		spin_lock_irqsave(&key_graveyard_lock, flags);
+		list_add_tail(&key->graveyard_link, &key_graveyard);
+		spin_unlock_irqrestore(&key_graveyard_lock, flags);
 		schedule_work(&key_gc_work);
 	}
 }
On Mon, Apr 07, 2025 at 03:58:01PM +0300, Jarkko Sakkinen wrote:
From: Jarkko Sakkinen <jarkko.sakkinen@opinsys.com>
Add an isolated list of unreferenced keys to be queued for deletion, and try to pin the keys in the garbage collector before processing anything. Skip unpinnable keys.
Use this list for blocking the reaping process during the teardown:
- First off, the keys added to `key_graveyard` are snapshotted, and the list is flushed. This is the very last step in `key_put()`.
- `key_put()` reaches zero. This will mark the key as busy for the garbage collector.
- `key_garbage_collector()` will try to increase the refcount, which won't go above zero. Whenever this happens, the key will be skipped.
Cc: stable@vger.kernel.org # v6.1+
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@opinsys.com>
This version is in my master branch now:
https://web.git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd.git/l...
For the time being not in next.
BR, Jarkko
On Tue, Apr 08, 2025 at 07:01:47PM +0300, Jarkko Sakkinen wrote:
On Mon, Apr 07, 2025 at 03:58:01PM +0300, Jarkko Sakkinen wrote:
From: Jarkko Sakkinen <jarkko.sakkinen@opinsys.com>
Add an isolated list of unreferenced keys to be queued for deletion, and try to pin the keys in the garbage collector before processing anything. Skip unpinnable keys.
Use this list for blocking the reaping process during the teardown:
- First off, the keys added to `key_graveyard` are snapshotted, and the list is flushed. This is the very last step in `key_put()`.
- `key_put()` reaches zero. This will mark the key as busy for the garbage collector.
- `key_garbage_collector()` will try to increase the refcount, which won't go above zero. Whenever this happens, the key will be skipped.
Cc: stable@vger.kernel.org # v6.1+
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@opinsys.com>
This version is in my master branch now:
https://web.git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd.git/l...
For the time being not in next.
I just updated it to my -next, so it will probably be in linux-next tomorrow.
I believe this is absolutely the right thing to do, but please be aware of it (it is now *knowingly* applied) and ping me about any issues.
Summary: it puts up walls against using a struct key in the middle of destruction (e.g. while key_put() is still accessing it after the refcount has reached zero, the GC should never touch it).
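Condensed from the patch above (simplified; local variable declarations, quota handling and the scan bookkeeping are omitted), the handshake between the two sides looks roughly like this:

	/* key_put() side: the final reference was dropped. */
	if (refcount_dec_and_test(&key->usage)) {
		spin_lock_irqsave(&key_graveyard_lock, flags);
		list_add_tail(&key->graveyard_link, &key_graveyard);
		spin_unlock_irqrestore(&key_graveyard_lock, flags);
		schedule_work(&key_gc_work);
	}

	/* GC side: pin the key before touching it; a zero refcount means
	 * key_put() already owns the teardown, so the key is skipped.
	 */
	if (!refcount_inc_not_zero(&key->usage)) {
		gc_state |= KEY_GC_REAP_AGAIN;
		goto skip_dead_key;
	}
	/* ... process the key ... */
	key_put(key);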
BR, Jarkko
Jarkko Sakkinen <jarkko@kernel.org> wrote:
+	spin_lock_irqsave(&key_graveyard_lock, flags);
+	list_splice_init(&key_graveyard, &graveyard);
+	spin_unlock_irqrestore(&key_graveyard_lock, flags);
I would wrap this bit in a check to see if key_graveyard is empty so that we can avoid disabling irqs and taking the lock if the graveyard is empty.
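A minimal sketch of the suggested check, assuming the splice itself stays as in the patch (the unlocked list_empty() peek is only an optimization; a key queued right after the check is simply picked up by the next GC pass):

	/* Skip the irq-save lock round trip when there is nothing to reap. */
	if (!list_empty(&key_graveyard)) {
		spin_lock_irqsave(&key_graveyard_lock, flags);
		list_splice_init(&key_graveyard, &graveyard);
		spin_unlock_irqrestore(&key_graveyard_lock, flags);
	}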
if (!refcount_inc_not_zero(&key->usage)) {
Sorry, but eww. You're going to wangle the refcount twice on every key on the system every time the gc does a pass. Further, in some cases inc_not_zero is not the fastest op in the world.
+	spin_lock_irqsave(&key_graveyard_lock, flags);
+	list_add_tail(&key->graveyard_link, &key_graveyard);
+	spin_unlock_irqrestore(&key_graveyard_lock, flags);
 	schedule_work(&key_gc_work);
This is going to enable and disable interrupts twice and that can be expensive, depending on the arch. I wonder if it would be better to do:
	local_irq_save(flags);
	spin_lock(&key_graveyard_lock);
	list_add_tail(&key->graveyard_link, &key_graveyard);
	spin_unlock(&key_graveyard_lock);
	schedule_work(&key_gc_work);
	local_irq_restore(flags);
David
On Fri, Apr 11, 2025 at 04:59:11PM +0100, David Howells wrote:
Jarkko Sakkinen <jarkko@kernel.org> wrote:
+	spin_lock_irqsave(&key_graveyard_lock, flags);
+	list_splice_init(&key_graveyard, &graveyard);
+	spin_unlock_irqrestore(&key_graveyard_lock, flags);
I would wrap this bit in a check to see if key_graveyard is empty so that we can avoid disabling irqs and taking the lock if the graveyard is empty.
Can do, and does make sense.
if (!refcount_inc_not_zero(&key->usage)) {
Sorry, but eww. You're going to wangle the refcount twice on every key on the system every time the gc does a pass. Further, in some cases inc_not_zero is not the fastest op in the world.
One could alternatively do "test_bit(KEY_FLAG_FINAL_PUT, &key->flags) && !refcount_inc_not_zero(&key->usage)" without mb() on either side, and set_bit() could be at the beginning of key_put().
At worst the race would cost an extra refcount_inc_not_zero(), but not often.
+	spin_lock_irqsave(&key_graveyard_lock, flags);
+	list_add_tail(&key->graveyard_link, &key_graveyard);
+	spin_unlock_irqrestore(&key_graveyard_lock, flags);
 	schedule_work(&key_gc_work);
This is going to enable and disable interrupts twice and that can be expensive, depending on the arch. I wonder if it would be better to do:
	local_irq_save(flags);
	spin_lock(&key_graveyard_lock);
	list_add_tail(&key->graveyard_link, &key_graveyard);
	spin_unlock(&key_graveyard_lock);
	schedule_work(&key_gc_work);
	local_irq_restore(flags);
I like this, but shouldn't this also cover the quota update before it (just asking for completeness' sake)?
David
BR, Jarkko
On Fri, Apr 11, 2025 at 11:37:25PM +0300, Jarkko Sakkinen wrote:
This is going to enable and disable interrupts twice and that can be expensive, depending on the arch. I wonder if it would be better to do:
	local_irq_save(flags);
	spin_lock(&key_graveyard_lock);
	list_add_tail(&key->graveyard_link, &key_graveyard);
	spin_unlock(&key_graveyard_lock);
	schedule_work(&key_gc_work);
	local_irq_restore(flags);
I like this, but shouldn't this also cover the quota update before it (just asking for completeness' sake)?
"This brings me on to another though: Should key_serial_lock be a seqlock? And should the gc use RCU + read_seqlock() and insertion write_seqlock()?" https://lore.kernel.org/keyrings/797521.1743602083@warthog.procyon.org.uk/
I think that should be done too (because it makes a whole lot of sense), but as a separate patch. I'd just prefer to move slowly and in baby steps for better quality, and keep that as a separate follow-up patch.
It obviously makes sense given how rare writes are.
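Not part of this patch, but as a rough sketch of what the seqlock variant could look like (key_serial_seqlock and look_up_in_serial_tree() are hypothetical names; the read side would additionally need RCU to keep the rbtree nodes stable, which is not shown here):

	static DEFINE_SEQLOCK(key_serial_seqlock);

	/* Reader: retry the lookup if a writer raced with us. */
	static struct key *find_key_by_serial(key_serial_t serial)
	{
		struct key *key;
		unsigned int seq;

		do {
			seq = read_seqbegin(&key_serial_seqlock);
			key = look_up_in_serial_tree(serial);
		} while (read_seqretry(&key_serial_seqlock, seq));

		return key;
	}

	/* Writer: insertions and removals take the exclusive side. */
	static void insert_key_serial(struct key *key)
	{
		write_seqlock(&key_serial_seqlock);
		/* rb_link_node()/rb_insert_color() into key_serial_tree here */
		write_sequnlock(&key_serial_seqlock);
	}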
BR, Jarkko
On Fri, Apr 11, 2025 at 11:37:25PM +0300, Jarkko Sakkinen wrote:
On Fri, Apr 11, 2025 at 04:59:11PM +0100, David Howells wrote:
Jarkko Sakkinen <jarkko@kernel.org> wrote:
+	spin_lock_irqsave(&key_graveyard_lock, flags);
+	list_splice_init(&key_graveyard, &graveyard);
+	spin_unlock_irqrestore(&key_graveyard_lock, flags);
I would wrap this bit in a check to see if key_graveyard is empty so that we can avoid disabling irqs and taking the lock if the graveyard is empty.
Can do, and does make sense.
if (!refcount_inc_not_zero(&key->usage)) {
Sorry, but eww. You're going to wangle the refcount twice on every key on the system every time the gc does a pass. Further, in some cases inc_not_zero is not the fastest op in the world.
One could alternatively do "test_bit(KEY_FLAG_FINAL_PUT, &key->flags) && !refcount_inc_not_zero(&key->usage)" without mb() on either side and
Refactored, the changes to key_put() would be (draft):
void key_put(struct key *key)
{
	unsigned long flags;

	if (!key)
		return;

	key_check(key);

	if (!refcount_dec_and_test(&key->usage))
		return;

	local_irq_save(flags);

	set_bit(KEY_FLAG_FINAL_PUT, &key->flags);

	/* Remove user's key tracking and quota: */
	if (test_bit(KEY_FLAG_IN_QUOTA, &key->flags)) {
		spin_lock(&key->user->lock);
		key->user->qnkeys--;
		key->user->qnbytes -= key->quotalen;
		spin_unlock(&key->user->lock);
	}

	spin_lock(&key_graveyard_lock);
	list_add_tail(&key->graveyard_link, &key_graveyard);
	spin_unlock(&key_graveyard_lock);

	schedule_work(&key_gc_work);

	local_irq_restore(flags);
}
And:
static bool key_get_gc(struct key *key)
{
	return !test_bit(KEY_FLAG_FINAL_PUT, &key->flags) &&	/* fast path */
	       refcount_inc_not_zero(&key->usage);		/* slow path */
}
In the gc-loop:
	if (!key_get_gc(key)) {
		key = NULL;
		gc_state |= KEY_GC_REAP_AGAIN;
		goto skip_dead_key;
	}
[none yet compiled]
BR, Jarkko
On Sat, Apr 12, 2025 at 04:30:24AM +0300, Jarkko Sakkinen wrote:
On Fri, Apr 11, 2025 at 11:37:25PM +0300, Jarkko Sakkinen wrote:
On Fri, Apr 11, 2025 at 04:59:11PM +0100, David Howells wrote:
Jarkko Sakkinen <jarkko@kernel.org> wrote:
+	spin_lock_irqsave(&key_graveyard_lock, flags);
+	list_splice_init(&key_graveyard, &graveyard);
+	spin_unlock_irqrestore(&key_graveyard_lock, flags);
I would wrap this bit in a check to see if key_graveyard is empty so that we can avoid disabling irqs and taking the lock if the graveyard is empty.
Can do, and does make sense.
if (!refcount_inc_not_zero(&key->usage)) {
Sorry, but eww. You're going to wangle the refcount twice on every key on the system every time the gc does a pass. Further, in some cases inc_not_zero is not the fastest op in the world.
One could alternatively do "test_bit(KEY_FLAG_FINAL_PUT, &key->flags) && !refcount_inc_not_zero(&key->usage)" without mb() on either side and
Refactored, the changes to key_put() would be (draft):
I'll post a fresh patch set later :-) I've come to realize that this does not make sense as it is. So yeah, it'll be a patch set.
One change that would IMHO make sense would be
diff --git a/security/keys/key.c b/security/keys/key.c
index 7198cd2ac3a3..aecbd624612d 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -656,10 +656,12 @@ void key_put(struct key *key)
 			spin_lock_irqsave(&key->user->lock, flags);
 			key->user->qnkeys--;
 			key->user->qnbytes -= key->quotalen;
+			set_bit(KEY_FLAG_FINAL_PUT, &key->flags);
 			spin_unlock_irqrestore(&key->user->lock, flags);
+		} else {
+			set_bit(KEY_FLAG_FINAL_PUT, &key->flags);
+			smp_mb(); /* key->user before FINAL_PUT set. */
 		}
-		smp_mb(); /* key->user before FINAL_PUT set. */
-		set_bit(KEY_FLAG_FINAL_PUT, &key->flags);
 		schedule_work(&key_gc_work);
 	}
 }
I did not see anything obvious that this would endanger, and it reduces the number of smp_mb()'s. This is just on top of mainline ...
BR, Jarkko