This set required 4 additional patches to avoid errors.
Peter Zijlstra (4):
  futex,rt_mutex: Provide futex specific rt_mutex API
  futex: Remove rt_mutex_deadlock_account_*()
  futex: Rework inconsistent rt_mutex/futex_q state
  futex: Avoid violating the 10th rule of futex

Thomas Gleixner (6):
  futex: Replace pointless printk in fixup_owner()
  futex: Provide and use pi_state_update_owner()
  rtmutex: Remove unused argument from rt_mutex_proxy_unlock()
  futex: Use pi_state_update_owner() in put_pi_state()
  futex: Simplify fixup_pi_state_owner()
  futex: Handle faults correctly for PI futexes

 kernel/futex.c                  | 278 ++++++++++++++++++--------------
 kernel/locking/rtmutex-debug.c  |   9 --
 kernel/locking/rtmutex-debug.h  |   3 -
 kernel/locking/rtmutex.c        | 127 +++++++++------
 kernel/locking/rtmutex.h        |   2 -
 kernel/locking/rtmutex_common.h |  12 +-
 6 files changed, 244 insertions(+), 187 deletions(-)

Cc: bigeasy@linutronix.de
Cc: bristot@redhat.com
Cc: Darren Hart <dvhart@infradead.org>
Cc: dvhart@infradead.org
Cc: jdesfossez@efficios.com
Cc: juri.lelli@arm.com
Cc: mathieu.desnoyers@efficios.com
Cc: rostedt@goodmis.org
Cc: stable@vger.kernel.org
Cc: xlpang@redhat.com
From: Peter Zijlstra <peterz@infradead.org>
[ Upstream commit 5293c2efda37775346885c7e924d4ef7018ea60b ]
Part of what makes futex_unlock_pi() intricate is that rt_mutex_futex_unlock() -> rt_mutex_slowunlock() can drop rt_mutex::wait_lock.
This means it cannot rely on the atomicity of wait_lock, which would be preferred in order to not rely on hb->lock so much.
The reason rt_mutex_slowunlock() needs to drop wait_lock is that it can race with the rt_mutex fastpath; futexes, however, have their own fastpath.
Since futexes already have a bunch of separate rt_mutex accessors, complete that set and implement an rt_mutex variant without the fastpath for them.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.702962446@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[Lee: Back-ported to solve a dependency]
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 kernel/futex.c                  | 30 +++++++++---------
 kernel/locking/rtmutex.c        | 56 ++++++++++++++++++++++++---------
 kernel/locking/rtmutex_common.h |  8 +++--
 3 files changed, 61 insertions(+), 33 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index f1990e2a51e5a..00b474b4b54e0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -936,7 +936,7 @@ static void exit_pi_state_list(struct task_struct *curr)
 		pi_state->owner = NULL;
 		raw_spin_unlock_irq(&curr->pi_lock);

-		rt_mutex_unlock(&pi_state->pi_mutex);
+		rt_mutex_futex_unlock(&pi_state->pi_mutex);

 		spin_unlock(&hb->lock);
@@ -1436,20 +1436,18 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
 	pi_state->owner = new_owner;
 	raw_spin_unlock_irq(&new_owner->pi_lock);

-	raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
-
-	deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
-
 	/*
-	 * First unlock HB so the waiter does not spin on it once he got woken
-	 * up. Second wake up the waiter before the priority is adjusted. If we
-	 * deboost first (and lose our higher priority), then the task might get
-	 * scheduled away before the wake up can take place.
+	 * We've updated the uservalue, this unlock cannot fail.
 	 */
+	deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+
+	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 	spin_unlock(&hb->lock);
-	wake_up_q(&wake_q);
-	if (deboost)
+
+	if (deboost) {
+		wake_up_q(&wake_q);
 		rt_mutex_adjust_prio(current);
+	}

 	return 0;
 }
@@ -2362,7 +2360,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 		 * task acquired the rt_mutex after we removed ourself from the
 		 * rt_mutex waiters list.
 		 */
-		if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
+		if (rt_mutex_futex_trylock(&q->pi_state->pi_mutex)) {
 			locked = 1;
 			goto out;
 		}
@@ -2686,7 +2684,7 @@ retry_private:
 	if (!trylock) {
 		ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
 	} else {
-		ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+		ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
 		/* Fixup the trylock return value: */
 		ret = ret ? 0 : -EWOULDBLOCK;
 	}
@@ -2709,7 +2707,7 @@ retry_private:
 	 * it and return the fault to userspace.
 	 */
 	if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
-		rt_mutex_unlock(&q.pi_state->pi_mutex);
+		rt_mutex_futex_unlock(&q.pi_state->pi_mutex);

 	/* Unqueue and drop the lock */
 	unqueue_me_pi(&q);
@@ -3016,7 +3014,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 		spin_lock(q.lock_ptr);
 		ret = fixup_pi_state_owner(uaddr2, &q, current);
 		if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
-			rt_mutex_unlock(&q.pi_state->pi_mutex);
+			rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
 		/*
 		 * Drop the reference to the pi state which
 		 * the requeue_pi() code acquired for us.
@@ -3059,7 +3057,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	 * userspace.
 	 */
 	if (ret && rt_mutex_owner(pi_mutex) == current)
-		rt_mutex_unlock(pi_mutex);
+		rt_mutex_futex_unlock(pi_mutex);

 	/* Unqueue and drop the lock. */
 	unqueue_me_pi(&q);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index dd173df9ee5e5..3323ef935372f 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1485,15 +1485,23 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);

 /*
  * Futex variant with full deadlock detection.
+ * Futex variants must not use the fast-path, see __rt_mutex_futex_unlock().
  */
-int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
+int __sched rt_mutex_timed_futex_lock(struct rt_mutex *lock,
 			      struct hrtimer_sleeper *timeout)
 {
 	might_sleep();

-	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
-				       RT_MUTEX_FULL_CHAINWALK,
-				       rt_mutex_slowlock);
+	return rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE,
+				 timeout, RT_MUTEX_FULL_CHAINWALK);
+}
+
+/*
+ * Futex variant, must not use fastpath.
+ */
+int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
+{
+	return rt_mutex_slowtrylock(lock);
 }

 /**
@@ -1552,20 +1560,38 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
 EXPORT_SYMBOL_GPL(rt_mutex_unlock);

 /**
- * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
- * @lock: the rt_mutex to be unlocked
- *
- * Returns: true/false indicating whether priority adjustment is
- * required or not.
+ * Futex variant, that since futex variants do not use the fast-path, can be
+ * simple and will not need to retry.
  */
-bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
-				   struct wake_q_head *wqh)
+bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
+				     struct wake_q_head *wake_q)
 {
-	if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
-		rt_mutex_deadlock_account_unlock(current);
-		return false;
+	lockdep_assert_held(&lock->wait_lock);
+
+	debug_rt_mutex_unlock(lock);
+
+	if (!rt_mutex_has_waiters(lock)) {
+		lock->owner = NULL;
+		return false; /* done */
+	}
+
+	mark_wakeup_next_waiter(wake_q, lock);
+	return true; /* deboost and wakeups */
+}
+
+void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+{
+	WAKE_Q(wake_q);
+	bool deboost;
+
+	raw_spin_lock_irq(&lock->wait_lock);
+	deboost = __rt_mutex_futex_unlock(lock, &wake_q);
+	raw_spin_unlock_irq(&lock->wait_lock);
+
+	if (deboost) {
+		wake_up_q(&wake_q);
+		rt_mutex_adjust_prio(current);
 	}
-	return rt_mutex_slowunlock(lock, wqh);
 }

 /**
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 6f8f68edb700c..cdcaccfb74432 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -112,8 +112,12 @@ extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
 extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
 				 struct rt_mutex_waiter *waiter);
 extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
-extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
-				  struct wake_q_head *wqh);
+extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+
+extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
+extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
+				    struct wake_q_head *wqh);
+
 extern void rt_mutex_adjust_prio(struct task_struct *task);
#ifdef CONFIG_DEBUG_RT_MUTEXES
From: Peter Zijlstra <peterz@infradead.org>
These are unused and clutter up the code.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.652692478@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[Lee: Back-ported to solve a dependency]
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 kernel/locking/rtmutex-debug.c |  9 --------
 kernel/locking/rtmutex-debug.h |  3 ---
 kernel/locking/rtmutex.c       | 42 +++++++++++++---------------
 kernel/locking/rtmutex.h       |  2 --
 4 files changed, 16 insertions(+), 40 deletions(-)
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 62b6cee8ea7f9..0613c4b1d0596 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -173,12 +173,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
 	lock->name = name;
 }

-void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
-{
-}
-
-void rt_mutex_deadlock_account_unlock(struct task_struct *task)
-{
-}
-
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index d0519c3432b67..b585af9a1b508 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -9,9 +9,6 @@
  * This file contains macros used solely by rtmutex.c. Debug version.
  */

-extern void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
-extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
 extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
 extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
 extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 3323ef935372f..e3dd1642423f8 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -937,8 +937,6 @@ takeit:
 	 */
 	rt_mutex_set_owner(lock, task);

-	rt_mutex_deadlock_account_lock(lock, task);
-
 	return 1;
 }
@@ -1331,8 +1329,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,

 	debug_rt_mutex_unlock(lock);

-	rt_mutex_deadlock_account_unlock(current);
-
 	/*
 	 * We must be careful here if the fast path is enabled. If we
 	 * have no waiters queued we cannot set owner to NULL here
@@ -1398,11 +1394,10 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
 				struct hrtimer_sleeper *timeout,
 				enum rtmutex_chainwalk chwalk))
 {
-	if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
-		rt_mutex_deadlock_account_lock(lock, current);
+	if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
 		return 0;
-	} else
-		return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
+
+	return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
 }

 static inline int
@@ -1414,21 +1409,19 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
 				      enum rtmutex_chainwalk chwalk))
 {
 	if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
-	    likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
-		rt_mutex_deadlock_account_lock(lock, current);
+	    likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
 		return 0;
-	} else
-		return slowfn(lock, state, timeout, chwalk);
+
+	return slowfn(lock, state, timeout, chwalk);
 }

 static inline int
 rt_mutex_fasttrylock(struct rt_mutex *lock,
 		     int (*slowfn)(struct rt_mutex *lock))
 {
-	if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
-		rt_mutex_deadlock_account_lock(lock, current);
+	if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
 		return 1;
-	}
+
 	return slowfn(lock);
 }
@@ -1438,19 +1431,18 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
 					   struct wake_q_head *wqh))
 {
 	WAKE_Q(wake_q);
+	bool deboost;

-	if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
-		rt_mutex_deadlock_account_unlock(current);
+	if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+		return;

-	} else {
-		bool deboost = slowfn(lock, &wake_q);
+	deboost = slowfn(lock, &wake_q);

-		wake_up_q(&wake_q);
+	wake_up_q(&wake_q);

-		/* Undo pi boosting if necessary: */
-		if (deboost)
-			rt_mutex_adjust_prio(current);
-	}
+	/* Undo pi boosting if necessary: */
+	if (deboost)
+		rt_mutex_adjust_prio(current);
 }

 /**
@@ -1648,7 +1640,6 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
 	__rt_mutex_init(lock, NULL);
 	debug_rt_mutex_proxy_lock(lock, proxy_owner);
 	rt_mutex_set_owner(lock, proxy_owner);
-	rt_mutex_deadlock_account_lock(lock, proxy_owner);
 }

 /**
@@ -1664,7 +1655,6 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
 {
 	debug_rt_mutex_proxy_unlock(lock);
 	rt_mutex_set_owner(lock, NULL);
-	rt_mutex_deadlock_account_unlock(proxy_owner);
 }

 /**
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index c4060584c4076..6607802efa8bd 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -11,8 +11,6 @@
  */

 #define rt_mutex_deadlock_check(l)			(0)
-#define rt_mutex_deadlock_account_lock(m, t)		do { } while (0)
-#define rt_mutex_deadlock_account_unlock(l)		do { } while (0)
 #define debug_rt_mutex_init_waiter(w)			do { } while (0)
 #define debug_rt_mutex_free_waiter(w)			do { } while (0)
 #define debug_rt_mutex_lock(l)				do { } while (0)
From: Peter Zijlstra <peterz@infradead.org>
[ Upstream commit 73d786bd043ebc855f349c81ea805f6b11cbf2aa ]
There is a weird state in the futex_unlock_pi() path when it interleaves with a concurrent futex_lock_pi() at the point where it drops hb->lock.
In this case, it can happen that the rt_mutex wait_list and the futex_q disagree on pending waiters, in particular rt_mutex will find no pending waiters where futex_q thinks there are. In this case the rt_mutex unlock code cannot assign an owner.
The futex side fixup code has to clean up the inconsistencies with quite a bunch of interesting corner cases.
Simplify all this by changing wake_futex_pi() to return -EAGAIN when this situation occurs. This then gives the futex_lock_pi() code the opportunity to continue and the retried futex_unlock_pi() will now observe a coherent state.
The only problem is that this breaks RT timeliness guarantees. That is, consider the following scenario:
T1 and T2 are both pinned to CPU0. prio(T2) > prio(T1)
CPU0
T1
  lock_pi()
    queue_me()  <- Waiter is visible
preemption
T2
  unlock_pi()
    loops with -EAGAIN forever
Which is undesirable for PI primitives. Future patches will rectify this.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.850383690@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[Lee: Back-ported to solve a dependency]
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 kernel/futex.c | 50 ++++++++++++++------------------------------------
 1 file changed, 14 insertions(+), 36 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index 00b474b4b54e0..a5a91a55c451f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1389,12 +1389,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
 	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);

 	/*
-	 * It is possible that the next waiter (the one that brought
-	 * this owner to the kernel) timed out and is no longer
-	 * waiting on the lock.
+	 * When we interleave with futex_lock_pi() where it does
+	 * rt_mutex_timed_futex_lock(), we might observe @this futex_q waiter,
+	 * but the rt_mutex's wait_list can be empty (either still, or again,
+	 * depending on which side we land).
+	 *
+	 * When this happens, give up our locks and try again, giving the
+	 * futex_lock_pi() instance time to complete, either by waiting on the
+	 * rtmutex or removing itself from the futex queue.
 	 */
-	if (!new_owner)
-		new_owner = this->task;
+	if (!new_owner) {
+		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+		return -EAGAIN;
+	}

 	/*
 	 * We pass it to the next owner. The WAITERS bit is always
@@ -2337,7 +2344,6 @@ static long futex_wait_restart(struct restart_block *restart);
  */
 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 {
-	struct task_struct *owner;
 	int ret = 0;

 	if (locked) {
@@ -2350,44 +2356,16 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 		goto out;
 	}

-	/*
-	 * Catch the rare case, where the lock was released when we were on the
-	 * way back before we locked the hash bucket.
-	 */
-	if (q->pi_state->owner == current) {
-		/*
-		 * Try to get the rt_mutex now. This might fail as some other
-		 * task acquired the rt_mutex after we removed ourself from the
-		 * rt_mutex waiters list.
-		 */
-		if (rt_mutex_futex_trylock(&q->pi_state->pi_mutex)) {
-			locked = 1;
-			goto out;
-		}
-
-		/*
-		 * pi_state is incorrect, some other task did a lock steal and
-		 * we returned due to timeout or signal without taking the
-		 * rt_mutex. Too late.
-		 */
-		raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
-		owner = rt_mutex_owner(&q->pi_state->pi_mutex);
-		if (!owner)
-			owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
-		raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
-		ret = fixup_pi_state_owner(uaddr, q, owner);
-		goto out;
-	}
-
 	/*
 	 * Paranoia check. If we did not take the lock, then we should not be
 	 * the owner of the rt_mutex.
 	 */
-	if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
+	if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
 		printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
 				"pi-state %p\n", ret,
 				q->pi_state->pi_mutex.owner,
 				q->pi_state->owner);
+	}

 out:
 	return ret ? ret : locked;
From: Peter Zijlstra <peterz@infradead.org>
commit c1e2f0eaf015fb7076d51a339011f2383e6dd389 upstream.
Julia reported futex state corruption in the following scenario:
 waiter                          waker                        stealer (prio > waiter)

 futex(WAIT_REQUEUE_PI, uaddr, uaddr2,
       timeout=[N ms])
    futex_wait_requeue_pi()
       futex_wait_queue_me()
          freezable_schedule()
          <scheduled out>
                                 futex(LOCK_PI, uaddr2)
                                 futex(CMP_REQUEUE_PI, uaddr,
                                       uaddr2, 1, 0)
                                    /* requeues waiter to uaddr2 */
                                 futex(UNLOCK_PI, uaddr2)
                                    wake_futex_pi()
                                       cmp_futex_value_locked(uaddr2, waiter)
                                       wake_up_q()
          <woken by waker>
          <hrtimer_wakeup() fires,
           clears sleeper->task>
                                                              futex(LOCK_PI, uaddr2)
                                                                 __rt_mutex_start_proxy_lock()
                                                                    try_to_take_rt_mutex() /* steals lock */
                                                                       rt_mutex_set_owner(lock, stealer)
                                                                 <preempted>
    <scheduled in>
    rt_mutex_wait_proxy_lock()
       __rt_mutex_slowlock()
          try_to_take_rt_mutex() /* fails, lock held by stealer */
          if (timeout && !timeout->task)
             return -ETIMEDOUT;
    fixup_owner()
       /* lock wasn't acquired, so,
          fixup_pi_state_owner skipped */

 return -ETIMEDOUT;

 /* At this point, we've returned -ETIMEDOUT to userspace, but the
  * futex word shows waiter to be the owner, and the pi_mutex has
  * stealer as the owner */

 futex_lock(LOCK_PI, uaddr2)
   -> bails with EDEADLK, futex word says we're owner.
And suggested that what commit:
73d786bd043e ("futex: Rework inconsistent rt_mutex/futex_q state")
removes from fixup_owner() looks to be just what is needed. And indeed it is -- I completely missed that requeue_pi could also result in this case. So we need to restore that, except that subsequent patches, like commit:
16ffa12d7425 ("futex: Pull rt_mutex_futex_unlock() out from under hb->lock")
changed all the locking rules. Even without that, the sequence:
-		if (rt_mutex_futex_trylock(&q->pi_state->pi_mutex)) {
-			locked = 1;
-			goto out;
-		}

-		raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
-		owner = rt_mutex_owner(&q->pi_state->pi_mutex);
-		if (!owner)
-			owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
-		raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
-		ret = fixup_pi_state_owner(uaddr, q, owner);
already suggests there were races; otherwise we'd never have to look at next_owner.
So instead of doing 3 consecutive wait_lock sections with who knows what races, we do it all in a single section. Additionally, the usage of pi_state->owner in fixup_owner() was only safe because only the rt_mutex owner would modify it, which this additional case wrecks.
Luckily the values can only change away from, and not to, the value we're testing; this means we can do a speculative test and double check once we have the wait_lock.
Fixes: 73d786bd043e ("futex: Rework inconsistent rt_mutex/futex_q state")
Reported-by: Julia Cartwright <julia@ni.com>
Reported-by: Gratian Crisan <gratian.crisan@ni.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Julia Cartwright <julia@ni.com>
Tested-by: Gratian Crisan <gratian.crisan@ni.com>
Cc: Darren Hart <dvhart@infradead.org>
Link: https://lkml.kernel.org/r/20171208124939.7livp7no2ov65rrc@hirez.programming....
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
[Lee: Back-ported to solve a dependency]
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 kernel/futex.c                  | 80 +++++++++++++++++++++++++++------
 kernel/locking/rtmutex.c        | 26 ++++++++---
 kernel/locking/rtmutex_common.h |  1 +
 3 files changed, 87 insertions(+), 20 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index a5a91a55c451f..780872ac7d675 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2227,30 +2227,34 @@ static void unqueue_me_pi(struct futex_q *q)
 	spin_unlock(q->lock_ptr);
 }

-/*
- * Fixup the pi_state owner with the new owner.
- *
- * Must be called with hash bucket lock held and mm->sem held for non
- * private futexes.
- */
 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-				struct task_struct *newowner)
+				struct task_struct *argowner)
 {
-	u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
 	struct futex_pi_state *pi_state = q->pi_state;
-	struct task_struct *oldowner = pi_state->owner;
 	u32 uval, uninitialized_var(curval), newval;
+	struct task_struct *oldowner, *newowner;
+	u32 newtid;
 	int ret;

+	lockdep_assert_held(q->lock_ptr);
+
+	oldowner = pi_state->owner;
+
 	/* Owner died? */
 	if (!pi_state->owner)
 		newtid |= FUTEX_OWNER_DIED;

 	/*
-	 * We are here either because we stole the rtmutex from the
-	 * previous highest priority waiter or we are the highest priority
-	 * waiter but failed to get the rtmutex the first time.
-	 * We have to replace the newowner TID in the user space variable.
+	 * We are here because either:
+	 *
+	 *  - we stole the lock and pi_state->owner needs updating to reflect
+	 *    that (@argowner == current),
+	 *
+	 * or:
+	 *
+	 *  - someone stole our lock and we need to fix things to point to the
+	 *    new owner (@argowner == NULL).
+	 *
+	 * Either way, we have to replace the TID in the user space variable.
 	 * This must be atomic as we have to preserve the owner died bit here.
 	 *
 	 * Note: We write the user space value _before_ changing the pi_state
@@ -2264,6 +2268,39 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 	 * in lookup_pi_state.
 	 */
 retry:
+	if (!argowner) {
+		if (oldowner != current) {
+			/*
+			 * We raced against a concurrent self; things are
+			 * already fixed up. Nothing to do.
+			 */
+			return 0;
+		}
+
+		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
+			/* We got the lock after all, nothing to fix. */
+			return 0;
+		}
+
+		/*
+		 * Since we just failed the trylock; there must be an owner.
+		 */
+		newowner = rt_mutex_owner(&pi_state->pi_mutex);
+		BUG_ON(!newowner);
+	} else {
+		WARN_ON_ONCE(argowner != current);
+		if (oldowner == current) {
+			/*
+			 * We raced against a concurrent self; things are
+			 * already fixed up. Nothing to do.
+			 */
+			return 0;
+		}
+		newowner = argowner;
+	}
+
+	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+
 	if (get_futex_value_locked(&uval, uaddr))
 		goto handle_fault;
@@ -2350,12 +2387,29 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 		/*
 		 * Got the lock. We might not be the anticipated owner if we
 		 * did a lock-steal - fix up the PI-state in that case:
+		 *
+		 * Speculative pi_state->owner read (we don't hold wait_lock);
+		 * since we own the lock pi_state->owner == current is the
+		 * stable state, anything else needs more attention.
 		 */
 		if (q->pi_state->owner != current)
 			ret = fixup_pi_state_owner(uaddr, q, current);
 		goto out;
 	}

+	/*
+	 * If we didn't get the lock; check if anybody stole it from us. In
+	 * that case, we need to fix up the uval to point to them instead of
+	 * us, otherwise bad things happen. [10]
+	 *
+	 * Another speculative read; pi_state->owner == current is unstable
+	 * but needs our attention.
+	 */
+	if (q->pi_state->owner == current) {
+		ret = fixup_pi_state_owner(uaddr, q, NULL);
+		goto out;
+	}
+
 	/*
 	 * Paranoia check. If we did not take the lock, then we should not be
 	 * the owner of the rt_mutex.
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index e3dd1642423f8..45d3c9aec8533 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1284,6 +1284,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 	return ret;
 }

+static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
+{
+	int ret = try_to_take_rt_mutex(lock, current, NULL);
+
+	/*
+	 * try_to_take_rt_mutex() sets the lock waiters bit
+	 * unconditionally. Clean this up.
+	 */
+	fixup_rt_mutex_waiters(lock);
+
+	return ret;
+}
+
 /*
  * Slow path try-lock function:
  */
@@ -1305,13 +1318,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
 	 */
 	raw_spin_lock(&lock->wait_lock);

-	ret = try_to_take_rt_mutex(lock, current, NULL);
-
-	/*
-	 * try_to_take_rt_mutex() sets the lock waiters bit
-	 * unconditionally. Clean this up.
-	 */
-	fixup_rt_mutex_waiters(lock);
+	ret = __rt_mutex_slowtrylock(lock);

 	raw_spin_unlock(&lock->wait_lock);
@@ -1496,6 +1503,11 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
 	return rt_mutex_slowtrylock(lock);
 }

+int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
+{
+	return __rt_mutex_slowtrylock(lock);
+}
+
 /**
  * rt_mutex_timed_lock - lock a rt_mutex interruptible
  *			the timeout structure is provided
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index cdcaccfb74432..ea7310b9ce83a 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -113,6 +113,7 @@ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
 				 struct rt_mutex_waiter *waiter);
 extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
 extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+extern int __rt_mutex_futex_trylock(struct rt_mutex *l);

 extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
 extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
From: Thomas Gleixner <tglx@linutronix.de>
[ Upstream commit 04b79c55201f02ffd675e1231d731365e335c307 ]
If that unexpected case of inconsistent arguments ever happens then the futex state is left completely inconsistent and the printk is not really helpful. Replace it with a warning and make the state consistent.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 kernel/futex.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index 780872ac7d675..a247942d69799 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2412,14 +2412,10 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)

 	/*
 	 * Paranoia check. If we did not take the lock, then we should not be
-	 * the owner of the rt_mutex.
+	 * the owner of the rt_mutex. Warn and establish consistent state.
 	 */
-	if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
-		printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
-				"pi-state %p\n", ret,
-				q->pi_state->pi_mutex.owner,
-				q->pi_state->owner);
-	}
+	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
+		return fixup_pi_state_owner(uaddr, q, current);

 out:
 	return ret ? ret : locked;
From: Thomas Gleixner <tglx@linutronix.de>
[ Upstream commit c5cade200ab9a2a3be9e7f32a752c8d86b502ec7 ]
Updating pi_state::owner is done at several places with the same code. Provide a function for it and use that at the obvious places.
This is also a preparation for a bug fix to avoid yet another copy of the same code or alternatively introducing a completely impenetrable mess of gotos.
Originally-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 kernel/futex.c | 64 ++++++++++++++++++++++++++------------------------
 1 file changed, 33 insertions(+), 31 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index a247942d69799..1390ffa874a6b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -835,6 +835,29 @@ static struct futex_pi_state * alloc_pi_state(void)
 	return pi_state;
 }

+static void pi_state_update_owner(struct futex_pi_state *pi_state,
+				  struct task_struct *new_owner)
+{
+	struct task_struct *old_owner = pi_state->owner;
+
+	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
+
+	if (old_owner) {
+		raw_spin_lock(&old_owner->pi_lock);
+		WARN_ON(list_empty(&pi_state->list));
+		list_del_init(&pi_state->list);
+		raw_spin_unlock(&old_owner->pi_lock);
+	}
+
+	if (new_owner) {
+		raw_spin_lock(&new_owner->pi_lock);
+		WARN_ON(!list_empty(&pi_state->list));
+		list_add(&pi_state->list, &new_owner->pi_state_list);
+		pi_state->owner = new_owner;
+		raw_spin_unlock(&new_owner->pi_lock);
+	}
+}
+
 /*
  * Must be called with the hb lock held.
  */
@@ -1427,26 +1450,16 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
 		else
 			ret = -EINVAL;
 	}
-	if (ret) {
-		raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
-		return ret;
-	}
-
-	raw_spin_lock_irq(&pi_state->owner->pi_lock);
-	WARN_ON(list_empty(&pi_state->list));
-	list_del_init(&pi_state->list);
-	raw_spin_unlock_irq(&pi_state->owner->pi_lock);
-
-	raw_spin_lock_irq(&new_owner->pi_lock);
-	WARN_ON(!list_empty(&pi_state->list));
-	list_add(&pi_state->list, &new_owner->pi_state_list);
-	pi_state->owner = new_owner;
-	raw_spin_unlock_irq(&new_owner->pi_lock);
-
-	/*
-	 * We've updated the uservalue, this unlock cannot fail.
-	 */
-	deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+	if (!ret) {
+		/*
+		 * This is a point of no return; once we modified the uval
+		 * there is no going back and subsequent operations must
+		 * not fail.
+		 */
+		pi_state_update_owner(pi_state, new_owner);
+		deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+	}

 	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 	spin_unlock(&hb->lock);
@@ -2318,19 +2331,8 @@ retry:
 	 * We fixed up user space. Now we need to fix the pi_state
 	 * itself.
 	 */
-	if (pi_state->owner != NULL) {
-		raw_spin_lock_irq(&pi_state->owner->pi_lock);
-		WARN_ON(list_empty(&pi_state->list));
-		list_del_init(&pi_state->list);
-		raw_spin_unlock_irq(&pi_state->owner->pi_lock);
-	}
-
-	pi_state->owner = newowner;
+	pi_state_update_owner(pi_state, newowner);

-	raw_spin_lock_irq(&newowner->pi_lock);
-	WARN_ON(!list_empty(&pi_state->list));
-	list_add(&pi_state->list, &newowner->pi_state_list);
-	raw_spin_unlock_irq(&newowner->pi_lock);
 	return 0;
/*
On Thu, 2021-02-04 at 17:28 +0000, Lee Jones wrote:
> From: Thomas Gleixner <tglx@linutronix.de>
>
> [ Upstream commit c5cade200ab9a2a3be9e7f32a752c8d86b502ec7 ]
>
> Updating pi_state::owner is done at several places with the same
> code. Provide a function for it and use that at the obvious places.
>
> This is also a preparation for a bug fix to avoid yet another copy of
> the same code or alternatively introducing a completely impenetrable
> mess of gotos.
>
> Originally-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Cc: stable@vger.kernel.org
> Signed-off-by: Lee Jones <lee.jones@linaro.org>
> ---
>  kernel/futex.c | 64 ++++++++++++++++++++++++++------------------------
>  1 file changed, 33 insertions(+), 31 deletions(-)
>
> diff --git a/kernel/futex.c b/kernel/futex.c
> index a247942d69799..1390ffa874a6b 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -835,6 +835,29 @@ static struct futex_pi_state * alloc_pi_state(void)
>  	return pi_state;
>  }
>
> +static void pi_state_update_owner(struct futex_pi_state *pi_state,
> +				  struct task_struct *new_owner)
> +{
> +	struct task_struct *old_owner = pi_state->owner;
> +
> +	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
But not all callers do hold the wait_lock. That may not be needed, as we don't have commit 734009e96d19 ("futex: Change locking rules").
> +	if (old_owner) {
> +		raw_spin_lock(&old_owner->pi_lock);
(Some of) the callers used to disable interrupts when taking pi_lock, and I think that behaviour needs to be preserved here.
I'm doubtful that this cherry-picking approach is going to work.
Ben.
> +		WARN_ON(list_empty(&pi_state->list));
> +		list_del_init(&pi_state->list);
> +		raw_spin_unlock(&old_owner->pi_lock);
> +	}
> +
> +	if (new_owner) {
> +		raw_spin_lock(&new_owner->pi_lock);
> +		WARN_ON(!list_empty(&pi_state->list));
> +		list_add(&pi_state->list, &new_owner->pi_state_list);
> +		pi_state->owner = new_owner;
> +		raw_spin_unlock(&new_owner->pi_lock);
> +	}
> +}
> +
>  /*
>   * Must be called with the hb lock held.
>   */
> @@ -1427,26 +1450,16 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
>  		else
>  			ret = -EINVAL;
>  	}
> -	if (ret) {
> -		raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
> -		return ret;
> -	}
> -
> -	raw_spin_lock_irq(&pi_state->owner->pi_lock);
> -	WARN_ON(list_empty(&pi_state->list));
> -	list_del_init(&pi_state->list);
> -	raw_spin_unlock_irq(&pi_state->owner->pi_lock);
> -
> -	raw_spin_lock_irq(&new_owner->pi_lock);
> -	WARN_ON(!list_empty(&pi_state->list));
> -	list_add(&pi_state->list, &new_owner->pi_state_list);
> -	pi_state->owner = new_owner;
> -	raw_spin_unlock_irq(&new_owner->pi_lock);
> -
> -	/*
> -	 * We've updated the uservalue, this unlock cannot fail.
> -	 */
> -	deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
> +	if (!ret) {
> +		/*
> +		 * This is a point of no return; once we modified the uval
> +		 * there is no going back and subsequent operations must
> +		 * not fail.
> +		 */
> +		pi_state_update_owner(pi_state, new_owner);
> +		deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
> +	}
>
>  	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
>  	spin_unlock(&hb->lock);
> @@ -2318,19 +2331,8 @@ retry:
>  	 * We fixed up user space. Now we need to fix the pi_state
>  	 * itself.
>  	 */
> -	if (pi_state->owner != NULL) {
> -		raw_spin_lock_irq(&pi_state->owner->pi_lock);
> -		WARN_ON(list_empty(&pi_state->list));
> -		list_del_init(&pi_state->list);
> -		raw_spin_unlock_irq(&pi_state->owner->pi_lock);
> -	}
> -
> -	pi_state->owner = newowner;
> +	pi_state_update_owner(pi_state, newowner);
>
> -	raw_spin_lock_irq(&newowner->pi_lock);
> -	WARN_ON(!list_empty(&pi_state->list));
> -	list_add(&pi_state->list, &newowner->pi_state_list);
> -	raw_spin_unlock_irq(&newowner->pi_lock);
>  	return 0;
>
>  	/*
From: Thomas Gleixner <tglx@linutronix.de>
[ Upstream commit 2156ac1934166d6deb6cd0f6ffc4c1076ec63697 ]

Nothing uses the argument. Remove it as preparation to use pi_state_update_owner().
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 kernel/futex.c                  | 2 +-
 kernel/locking/rtmutex.c        | 3 +--
 kernel/locking/rtmutex_common.h | 3 +--
 3 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index 1390ffa874a6b..bf40921ef1200 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -878,7 +878,7 @@ static void free_pi_state(struct futex_pi_state *pi_state)
 		list_del_init(&pi_state->list);
 		raw_spin_unlock_irq(&pi_state->owner->pi_lock);

-		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
 	}

 	if (current->pi_state_cache)
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 45d3c9aec8533..1c0cb5c3c6ad6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1662,8 +1662,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
  * No locking. Caller has to do serializing itself
  * Special API call for PI-futex support
  */
-void rt_mutex_proxy_unlock(struct rt_mutex *lock,
-			   struct task_struct *proxy_owner)
+void rt_mutex_proxy_unlock(struct rt_mutex *lock)
 {
 	debug_rt_mutex_proxy_unlock(lock);
 	rt_mutex_set_owner(lock, NULL);
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index ea7310b9ce83a..4584db96265d4 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -101,8 +101,7 @@ enum rtmutex_chainwalk {
 extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
 extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
 				       struct task_struct *proxy_owner);
-extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
-				  struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
 extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
 				     struct rt_mutex_waiter *waiter,
 				     struct task_struct *task);
From: Thomas Gleixner <tglx@linutronix.de>
[ Upstream commit 6ccc84f917d33312eb2846bd7b567639f585ad6d ]
No point in open coding it. This way it gains the extra sanity checks.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 kernel/futex.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index bf40921ef1200..d9bec8eb60969 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -874,10 +874,7 @@ static void free_pi_state(struct futex_pi_state *pi_state)
 	 * and has cleaned up the pi_state already
 	 */
 	if (pi_state->owner) {
-		raw_spin_lock_irq(&pi_state->owner->pi_lock);
-		list_del_init(&pi_state->list);
-		raw_spin_unlock_irq(&pi_state->owner->pi_lock);
-
+		pi_state_update_owner(pi_state, NULL);
 		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
 	}
From: Thomas Gleixner <tglx@linutronix.de>
[ Upstream commit f2dac39d93987f7de1e20b3988c8685523247ae2 ]
Too many gotos already and an upcoming fix would make it even more unreadable.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 kernel/futex.c | 41 +++++++++++++++++++++++++++--------------
 1 file changed, 27 insertions(+), 14 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index d9bec8eb60969..8300870666638 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2237,18 +2237,16 @@ static void unqueue_me_pi(struct futex_q *q)
 	spin_unlock(q->lock_ptr);
 }

-static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-				struct task_struct *argowner)
+static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+				  struct task_struct *argowner)
 {
 	struct futex_pi_state *pi_state = q->pi_state;
-	u32 uval, uninitialized_var(curval), newval;
 	struct task_struct *oldowner, *newowner;
-	u32 newtid;
-	int ret;
-
-	lockdep_assert_held(q->lock_ptr);
+	u32 uval, curval, newval, newtid;
+	int err = 0;

 	oldowner = pi_state->owner;
+
 	/* Owner died? */
 	if (!pi_state->owner)
 		newtid |= FUTEX_OWNER_DIED;
@@ -2289,7 +2287,7 @@ retry:

 		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
 			/* We got the lock after all, nothing to fix. */
-			return 0;
+			return 1;
 		}

 		/*
@@ -2304,7 +2302,7 @@ retry:
 			 * We raced against a concurrent self; things are
 			 * already fixed up. Nothing to do.
 			 */
-			return 0;
+			return 1;
 		}
 		newowner = argowner;
 	}
@@ -2345,7 +2343,7 @@ retry:
 handle_fault:
 	spin_unlock(q->lock_ptr);

-	ret = fault_in_user_writeable(uaddr);
+	err = fault_in_user_writeable(uaddr);

 	spin_lock(q->lock_ptr);
@@ -2353,12 +2351,27 @@ handle_fault:
 	 * Check if someone else fixed it for us:
 	 */
 	if (pi_state->owner != oldowner)
-		return 0;
+		return argowner == current;

-	if (ret)
-		return ret;
+	/* Retry if err was -EAGAIN or the fault in succeeded */
+	if (!err)
+		goto retry;

-	goto retry;
+	return err;
+}
+
+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+				struct task_struct *argowner)
+{
+	struct futex_pi_state *pi_state = q->pi_state;
+	int ret;
+
+	lockdep_assert_held(q->lock_ptr);
+
+	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+	ret = __fixup_pi_state_owner(uaddr, q, argowner);
+	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+	return ret;
 }
static long futex_wait_restart(struct restart_block *restart);
From: Thomas Gleixner <tglx@linutronix.de>
fixup_pi_state_owner() tries to ensure that the state of the rtmutex, pi_state and the user space value related to the PI futex are consistent before returning to user space. In case the user space value update faults and the fault cannot be resolved by faulting the page in via fault_in_user_writeable(), the function returns with -EFAULT and leaves the rtmutex and pi_state owner state inconsistent.
A subsequent futex_unlock_pi() operates on the inconsistent pi_state and releases the rtmutex despite not owning it which can corrupt the RB tree of the rtmutex and cause a subsequent kernel stack use after free.
It was suggested to loop forever in fixup_pi_state_owner() if the fault cannot be resolved, but that results in runaway tasks which is especially undesired when the problem happens due to a programming error and not due to malice.
As the user space value cannot be fixed up, the proper solution is to make the rtmutex and the pi_state consistent so both have the same owner. This leaves the user space value out of sync. Any subsequent operation on the futex will fail because the 10th rule of PI futexes (pi_state owner and user space value are consistent) has been violated.
As a consequence this removes the inept attempts of 'fixing' the situation in case that the current task owns the rtmutex when returning with an unresolvable fault by unlocking the rtmutex which left pi_state::owner and rtmutex::owner out of sync in a different and only slightly less dangerous way.
Fixes: 1b7558e457ed ("futexes: fix fault handling in futex_lock_pi")
Reported-by: gzobqq@gmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 kernel/futex.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index 8300870666638..199e63c5b6120 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1012,7 +1012,8 @@ static void exit_pi_state_list(struct task_struct *curr)
  *	FUTEX_OWNER_DIED bit. See [4]
  *
  * [10] There is no transient state which leaves owner and user space
- *	TID out of sync.
+ *	TID out of sync. Except one error case where the kernel is denied
+ *	write access to the user address, see fixup_pi_state_owner().
  */

 /*
@@ -2357,6 +2358,24 @@ handle_fault:
 	if (!err)
 		goto retry;

+	/*
+	 * fault_in_user_writeable() failed so user state is immutable. At
+	 * best we can make the kernel state consistent but user state will
+	 * be most likely hosed and any subsequent unlock operation will be
+	 * rejected due to PI futex rule [10].
+	 *
+	 * Ensure that the rtmutex owner is also the pi_state owner despite
+	 * the user space value claiming something different. There is no
+	 * point in unlocking the rtmutex if current is the owner as it
+	 * would need to wait until the next waiter has taken the rtmutex
+	 * to guarantee consistent state. Keep it simple. Userspace asked
+	 * for this wreckaged state.
+	 *
+	 * The rtmutex has an owner - either current or some other
+	 * task. See the EAGAIN loop above.
+	 */
+	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
+
 	return err;
 }
@@ -2742,13 +2761,6 @@ retry_private:
 	if (res)
 		ret = (res < 0) ? res : 0;

-	/*
-	 * If fixup_owner() faulted and was unable to handle the fault, unlock
-	 * it and return the fault to userspace.
-	 */
-	if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
-		rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
-
 	/* Unqueue and drop the lock */
 	unqueue_me_pi(&q);
@@ -3053,8 +3065,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	if (q.pi_state && (q.pi_state->owner != current)) {
 		spin_lock(q.lock_ptr);
 		ret = fixup_pi_state_owner(uaddr2, &q, current);
-		if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
-			rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
 		/*
 		 * Drop the reference to the pi state which
 		 * the requeue_pi() code acquired for us.
@@ -3091,14 +3101,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	if (res)
 		ret = (res < 0) ? res : 0;

-	/*
-	 * If fixup_pi_state_owner() faulted and was unable to handle
-	 * the fault, unlock the rt_mutex and return the fault to
-	 * userspace.
-	 */
-	if (ret && rt_mutex_owner(pi_mutex) == current)
-		rt_mutex_futex_unlock(pi_mutex);
-
 	/* Unqueue and drop the lock. */
 	unqueue_me_pi(&q);
 }
On Thu, Feb 04, 2021 at 05:28:53PM +0000, Lee Jones wrote:
> This set required 4 additional patches to avoid errors.
>
> Peter Zijlstra (4):
>   futex,rt_mutex: Provide futex specific rt_mutex API
>   futex: Remove rt_mutex_deadlock_account_*()
>   futex: Rework inconsistent rt_mutex/futex_q state
>   futex: Avoid violating the 10th rule of futex
>
> Thomas Gleixner (6):
>   futex: Replace pointless printk in fixup_owner()
>   futex: Provide and use pi_state_update_owner()
>   rtmutex: Remove unused argument from rt_mutex_proxy_unlock()
>   futex: Use pi_state_update_owner() in put_pi_state()
>   futex: Simplify fixup_pi_state_owner()
>   futex: Handle faults correctly for PI futexes
All now queued up, thanks!
greg k-h