 
            On Mon, Jan 04, 2021 at 04:20:55PM +0100, Frederic Weisbecker wrote:
Entering RCU idle mode may cause a deferred wake up of an RCU NOCB_GP kthread (rcuog) to be serviced.
Usually a wake up happening while running the idle task is spotted in one of the need_resched() checks carefully placed within the idle loop that can break to the scheduler.
Urgh, this is horrific and fragile :/ You having had to audit and fix a number of rcu_idle_enter() callers should've made you realize that making rcu_idle_enter() return something would've been saner.
Also, I might hope that when RCU does do that wakeup, it will not have put RCU in idle mode? So it is a natural 'fail' state for rcu_idle_enter(), *sigh* it continues to put RCU to sleep, so that needs fixing too.
I'm thinking that rcu_user_enter() will have the exact same problem? Did you audit that?
Something like the below, combined with a fixup for all callers (which the compiler will help us find thanks to __must_check).
---
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index de0826411311..612f66c16078 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -95,10 +95,10 @@ static inline void rcu_sysrq_end(void) { } #endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */
#ifdef CONFIG_NO_HZ_FULL -void rcu_user_enter(void); +bool __must_check rcu_user_enter(void); void rcu_user_exit(void); #else -static inline void rcu_user_enter(void) { } +static inline bool __must_check rcu_user_enter(void) { return true; } static inline void rcu_user_exit(void) { } #endif /* CONFIG_NO_HZ_FULL */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index df578b73960f..9ba0c5d9e99e 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -43,7 +43,7 @@ bool rcu_gp_might_be_stalled(void); unsigned long get_state_synchronize_rcu(void); void cond_synchronize_rcu(unsigned long oldstate);
-void rcu_idle_enter(void); +bool __must_check rcu_idle_enter(void); void rcu_idle_exit(void); void rcu_irq_enter(void); void rcu_irq_exit(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 40e5e3dd253e..13e19e5db0b8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -625,7 +625,7 @@ EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); * the possibility of usermode upcalls having messed up our count * of interrupt nesting level during the prior busy period. */ -static noinstr void rcu_eqs_enter(bool user) +static noinstr bool rcu_eqs_enter(bool user) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
@@ -636,7 +636,7 @@ static noinstr void rcu_eqs_enter(bool user) if (rdp->dynticks_nesting != 1) { // RCU will still be watching, so just do accounting and leave. rdp->dynticks_nesting--; - return; + return true; }
lockdep_assert_irqs_disabled(); @@ -644,7 +644,14 @@ static noinstr void rcu_eqs_enter(bool user) trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rdp = this_cpu_ptr(&rcu_data); - do_nocb_deferred_wakeup(rdp); + if (do_nocb_deferred_wakeup(rdp)) { + /* + * We did the wakeup, don't enter EQS, we'll need to abort idle + * and schedule. + */ + return false; + } + rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current);
@@ -657,6 +664,8 @@ static noinstr void rcu_eqs_enter(bool user) rcu_dynticks_eqs_enter(); // ... but is no longer watching here. rcu_dynticks_task_enter(); + + return true; }
/** @@ -670,10 +679,10 @@ static noinstr void rcu_eqs_enter(bool user) * If you add or remove a call to rcu_idle_enter(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_idle_enter(void) +bool rcu_idle_enter(void) { lockdep_assert_irqs_disabled(); - rcu_eqs_enter(false); + return rcu_eqs_enter(false); } EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -689,10 +698,10 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); * If you add or remove a call to rcu_user_enter(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -noinstr void rcu_user_enter(void) +noinstr bool rcu_user_enter(void) { lockdep_assert_irqs_disabled(); - rcu_eqs_enter(true); + return rcu_eqs_enter(true); } #endif /* CONFIG_NO_HZ_FULL */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7708ed161f4a..9226f4021a36 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -433,7 +433,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, unsigned long flags); static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); -static void do_nocb_deferred_wakeup(struct rcu_data *rdp); +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_cpu_nocb_kthread(int cpu); static void __init rcu_spawn_nocb_kthreads(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7e291ce0a1d6..8ca41b3fe4f9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1631,7 +1631,7 @@ bool rcu_is_nocb_cpu(int cpu) * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock * and this function releases it. */ -static void wake_nocb_gp(struct rcu_data *rdp, bool force, +static bool wake_nocb_gp(struct rcu_data *rdp, bool force, unsigned long flags) __releases(rdp->nocb_lock) { @@ -1654,8 +1654,11 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force, trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); } raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - if (needwake) + if (needwake) { wake_up_process(rdp_gp->nocb_gp_kthread); + return true; + } + return false; }
/* @@ -2155,17 +2158,19 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) { unsigned long flags; + bool ret; int ndw;
rcu_nocb_lock_irqsave(rdp, flags); if (!rcu_nocb_need_deferred_wakeup(rdp)) { rcu_nocb_unlock_irqrestore(rdp, flags); - return; + return false; } ndw = READ_ONCE(rdp->nocb_defer_wakeup); WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); + ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); + return ret; }
/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ @@ -2181,10 +2186,12 @@ static void do_nocb_deferred_wakeup_timer(struct timer_list *t) * This means we do an inexact common-case check. Note that if * we miss, ->nocb_timer will eventually clean things up. */ -static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) { if (rcu_nocb_need_deferred_wakeup(rdp)) - do_nocb_deferred_wakeup_common(rdp); + return do_nocb_deferred_wakeup_common(rdp); + + return false; }
void __init rcu_init_nohz(void) @@ -2518,8 +2525,9 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) return false; }
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) { + return false }
static void rcu_spawn_cpu_nocb_kthread(int cpu)