Hi,
Please find the upcoming miscellaneous RCU changes. The changes can also be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/rcu/linux.git misc.2025.03.04a
Regards, Boqun
Paul E. McKenney (6): rcu: Split rcu_report_exp_cpu_mult() mask parameter and use for tracing rcu: Fix get_state_synchronize_rcu_full() GP-start detection rcu-tasks: Move RCU Tasks self-tests to core_initcall() rcu/nocb: Print segment lengths in show_rcu_nocb_gp_state() context_tracking: Make RCU watch ct_kernel_exit_state() warning Flush console log from kernel_power_off()
Uladzislau Rezki (Sony) (3): rcutorture: Allow a negative value for nfakewriters rcu: Update TREE05.boot to test normal synchronize_rcu() rcu: Use _full() API to debug synchronize_rcu()
Zilin Guan (1): rcu: Remove READ_ONCE() for rdp->gpwrap access in __note_gp_changes()
include/linux/printk.h | 6 ++++ include/linux/rcupdate.h | 6 ---- include/linux/rcupdate_wait.h | 3 ++ init/main.c | 1 - kernel/context_tracking.c | 9 +++--- kernel/printk/printk.c | 4 +-- kernel/rcu/rcu.h | 2 +- kernel/rcu/rcutorture.c | 22 ++++++++++---- kernel/rcu/tasks.h | 5 +++- kernel/rcu/tree.c | 29 +++++++++++-------- kernel/rcu/tree_exp.h | 6 ++-- kernel/rcu/tree_nocb.h | 20 +++++++++---- kernel/reboot.c | 1 + .../rcutorture/configs/rcu/TREE05.boot | 6 ++++ 14 files changed, 78 insertions(+), 42 deletions(-)
From: "Paul E. McKenney" paulmck@kernel.org
This commit renames the rcu_report_exp_cpu_mult() function's "mask" parameter to "mask_in" and introduces a "mask" local variable to better support upcoming event-tracing additions.
Signed-off-by: Paul E. McKenney paulmck@kernel.org Cc: Frederic Weisbecker frederic@kernel.org Signed-off-by: Boqun Feng boqun.feng@gmail.com --- kernel/rcu/tree_exp.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 77efed89c79e..8d4895c854c5 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -230,17 +230,19 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) * specified leaf rcu_node structure, which is acquired by the caller. */ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long flags, - unsigned long mask, bool wake) + unsigned long mask_in, bool wake) __releases(rnp->lock) { int cpu; + unsigned long mask; struct rcu_data *rdp;
raw_lockdep_assert_held_rcu_node(rnp); - if (!(rnp->expmask & mask)) { + if (!(rnp->expmask & mask_in)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } + mask = mask_in & rnp->expmask; WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); for_each_leaf_node_cpu_mask(rnp, cpu, mask) { rdp = per_cpu_ptr(&rcu_data, cpu);
From: Zilin Guan zilinguan811@gmail.com
There is one access to the per-CPU rdp->gpwrap field in the __note_gp_changes() function that does not use READ_ONCE(), but all other accesses do use READ_ONCE(). When using the 8*TREE03 and CONFIG_NR_CPUS=8 configuration, KCSAN found no data races at that point. This is because all calls to __note_gp_changes() hold rnp->lock, which excludes writes to the rdp->gpwrap fields for all CPUs associated with that same leaf rcu_node structure.
This commit therefore removes READ_ONCE() from rdp->gpwrap accesses within the __note_gp_changes() function.
Signed-off-by: Zilin Guan zilinguan811@gmail.com Signed-off-by: Paul E. McKenney paulmck@kernel.org Signed-off-by: Boqun Feng boqun.feng@gmail.com --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 475f31deed14..e4c0ce600b2b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1254,7 +1254,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
/* Handle the ends of any preceding grace periods first. */ if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { if (!offloaded) ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */ rdp->core_needs_qs = false; @@ -1268,7 +1268,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
/* Now handle the beginnings of any new-to-this-CPU grace periods. */ if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1283,7 +1283,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); - if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap)) + if (IS_ENABLED(CONFIG_PROVE_RCU) && rdp->gpwrap) WRITE_ONCE(rdp->last_sched_clock, jiffies); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp);
From: "Paul E. McKenney" paulmck@kernel.org
The get_state_synchronize_rcu_full() and poll_state_synchronize_rcu_full() functions use the root rcu_node structure's ->gp_seq field to detect the beginnings and ends of grace periods, respectively. This choice is necessary for the poll_state_synchronize_rcu_full() function because (give or take counter wrap), the following sequence is guaranteed not to trigger:
get_state_synchronize_rcu_full(&rgos); synchronize_rcu(); WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&rgos));
The RCU callbacks that awaken synchronize_rcu() instances are guaranteed not to be invoked before the root rcu_node structure's ->gp_seq field is updated to indicate the end of the grace period. However, these callbacks might start being invoked immediately thereafter, in particular, before rcu_state.gp_seq has been updated. Therefore, poll_state_synchronize_rcu_full() must refer to the root rcu_node structure's ->gp_seq field. Because this field is updated under this structure's ->lock, any code following a call to poll_state_synchronize_rcu_full() will be fully ordered after the full grace-period computation, as is required by RCU's memory-ordering semantics.
By symmetry, the get_state_synchronize_rcu_full() function should also use this same root rcu_node structure's ->gp_seq field. But it turns out that symmetry is profoundly (though extremely infrequently) destructive in this case. To see this, consider the following sequence of events:
1. CPU 0 starts a new grace period, and updates rcu_state.gp_seq accordingly.
2. As its first step of grace-period initialization, CPU 0 examines the current CPU hotplug state and decides that it need not wait for CPU 1, which is currently offline.
3. CPU 1 comes online, and updates its state. But this does not affect the current grace period, but rather the one after that. After all, CPU 1 was offline when the current grace period started, so all pre-existing RCU readers on CPU 1 must have completed or been preempted before it last went offline. The current grace period therefore has nothing it needs to wait for on CPU 1.
4. CPU 1 switches to an rcutorture kthread which is running rcutorture's rcu_torture_reader() function, which starts a new RCU reader.
5. CPU 2 is running rcutorture's rcu_torture_writer() function and collects a new polled grace-period "cookie" using get_state_synchronize_rcu_full(). Because the newly started grace period has not completed initialization, the root rcu_node structure's ->gp_seq field has not yet been updated to indicate that this new grace period has already started.
This cookie is therefore set up for the end of the current grace period (rather than the end of the following grace period).
6. CPU 0 finishes grace-period initialization.
7. If CPU 1’s rcutorture reader is preempted, it will be added to the ->blkd_tasks list, but because CPU 1’s ->qsmask bit is not set in CPU 1's leaf rcu_node structure, the ->gp_tasks pointer will not be updated. Thus, this grace period will not wait on it. Which is only fair, given that the CPU did not come online until after the grace period officially started.
8. CPUs 0 and 2 then detect the new grace period and then report a quiescent state to the RCU core.
9. Because CPU 1 was offline at the start of the current grace period, CPUs 0 and 2 are the only CPUs that this grace period needs to wait on. So the grace period ends and post-grace-period cleanup starts. In particular, the root rcu_node structure's ->gp_seq field is updated to indicate that this grace period has now ended.
10. CPU 2 continues running rcu_torture_writer() and sees that, from the viewpoint of the root rcu_node structure consulted by the poll_state_synchronize_rcu_full() function, the grace period has ended. It therefore updates state accordingly.
11. CPU 1 is still running the same RCU reader, which notices this update and thus complains about the too-short grace period.
The fix is for the get_state_synchronize_rcu_full() function to use rcu_state.gp_seq instead of the root rcu_node structure's ->gp_seq field. With this change in place, if step 5's cookie indicates that the grace period has not yet started, then any prior code executed by CPU 2 must have happened before CPU 1 came online. This will in turn prevent CPU 1's code in steps 3 and 11 from spanning CPU 2's grace-period wait, thus preventing CPU 1 from being subjected to a too-short grace period.
This commit therefore makes this change. Note that there is no change to the poll_state_synchronize_rcu_full() function, which as noted above, must continue to use the root rcu_node structure's ->gp_seq field. This is of course an asymmetry between these two functions, but is an asymmetry that is absolutely required for correct operation. It is a common human tendency to greatly value symmetry, and sometimes symmetry is a wonderful thing. Other times, symmetry results in poor performance. But in this case, symmetry is just plain wrong.
Nevertheless, the asymmetry does require an additional adjustment. It is possible for get_state_synchronize_rcu_full() to see a given grace period as having started, but for an immediately following poll_state_synchronize_rcu_full() to see it as having not yet started. Given the current rcu_seq_done_exact() implementation, this will result in a false-positive indication that the grace period is done from poll_state_synchronize_rcu_full(). This is dealt with by making rcu_seq_done_exact() reach back three grace periods rather than just two of them.
However, simply changing get_state_synchronize_rcu_full() function to use rcu_state.gp_seq instead of the root rcu_node structure's ->gp_seq field results in a theoretical bug in kernels booted with rcutree.rcu_normal_wake_from_gp=1 due to the following sequence of events:
o The rcu_gp_init() function invokes rcu_seq_start() to officially start a new grace period.
o A new RCU reader begins, referencing X from some RCU-protected list. The new grace period is not obligated to wait for this reader.
o An updater removes X, then calls synchronize_rcu(), which queues a wait element.
o The grace period ends, awakening the updater, which frees X while the reader is still referencing it.
The reason that this is theoretical is that although the grace period has officially started, none of the CPUs are officially aware of this, and thus will have to assume that the RCU reader pre-dated the start of the grace period. Detailed explanation can be found at [2] and [3].
Except for kernels built with CONFIG_PROVE_RCU=y, which use the polled grace-period APIs, which can and do complain bitterly when this sequence of events occurs. Not only that, there might be some future RCU grace-period mechanism that pulls this sequence of events from theory into practice. This commit therefore also pulls the call to rcu_sr_normal_gp_init() to precede that to rcu_seq_start().
Although this fixes commit 91a967fd6934 ("rcu: Add full-sized polling for get_completed*() and poll_state*()"), it is not clear that it is worth backporting this commit. First, it took me many weeks to convince rcutorture to reproduce this more frequently than once per year. Second, this cannot be reproduced at all without frequent CPU-hotplug operations, as in waiting all of 50 milliseconds from the end of the previous operation until starting the next one. Third, the TREE03.boot settings cause multi-millisecond delays during RCU grace-period initialization, which greatly increase the probability of the above sequence of events. (Don't do this in production workloads!) Fourth, the TREE03 rcutorture scenario was modified to use four-CPU guest OSes, to have a single-rcu_node combining tree, no testing of RCU priority boosting, and no random preemption, and these modifications were necessary to reproduce this issue in a reasonable timeframe. Fifth, extremely heavy use of get_state_synchronize_rcu_full() and/or poll_state_synchronize_rcu_full() is required to reproduce this, and as of v6.12, only kfree_rcu() uses it, and even then not particularly heavily.
[boqun: Apply the fix [1], and add the comment before the moved rcu_sr_normal_gp_init(). Additional links are added for explanation.]
Signed-off-by: Paul E. McKenney paulmck@kernel.org Reviewed-by: Frederic Weisbecker frederic@kernel.org Reviewed-by: Joel Fernandes (Google) joel@joelfernandes.org Tested-by: Uladzislau Rezki (Sony) urezki@gmail.com Link: https://lore.kernel.org/rcu/d90bd6d9-d15c-4b9b-8a69-95336e74e8f4@paulmck-lap... [1] Link: https://lore.kernel.org/rcu/20250303001507.GA3994772@joelnvbox/ [2] Link: https://lore.kernel.org/rcu/Z8bcUsZ9IpRi1QoP@pc636/ [3] Reviewed-by: Joel Fernandes joelagnelf@nvidia.com Signed-off-by: Boqun Feng boqun.feng@gmail.com --- kernel/rcu/rcu.h | 2 +- kernel/rcu/tree.c | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-)
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index feb3ac1dc5d5..f87c9d6d36fc 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -162,7 +162,7 @@ static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s) { unsigned long cur_s = READ_ONCE(*sp);
- return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1)); + return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (3 * RCU_SEQ_STATE_MASK + 1)); }
/* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e4c0ce600b2b..131fb463ba68 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1801,10 +1801,14 @@ static noinline_for_stack bool rcu_gp_init(void)
/* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(); + /* + * A new wait segment must be started before gp_seq advanced, so + * that previous gp waiters won't observe the new gp_seq. + */ + start_new_poll = rcu_sr_normal_gp_init(); /* Record GP times before starting GP, hence rcu_seq_start(). */ rcu_seq_start(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); - start_new_poll = rcu_sr_normal_gp_init(); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); @@ -3357,14 +3361,17 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); */ void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) { - struct rcu_node *rnp = rcu_get_root(); - /* * Any prior manipulation of RCU-protected data must happen * before the loads from ->gp_seq and ->expedited_sequence. */ smp_mb(); /* ^^^ */ - rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq); + + // Yes, rcu_state.gp_seq, not rnp_root->gp_seq, the latter's use + // in poll_state_synchronize_rcu_full() notwithstanding. Use of + // the latter here would result in too-short grace periods due to + // interactions with newly onlined CPUs. + rgosp->rgos_norm = rcu_seq_snap(&rcu_state.gp_seq); rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
From: "Paul E. McKenney" paulmck@kernel.org
The timer and hrtimer softirq processing has moved to dedicated threads for kernels built with CONFIG_IRQ_FORCED_THREADING=y. This results in timers not expiring until later in early boot, which in turn causes the RCU Tasks self-tests to hang in kernels built with CONFIG_PROVE_RCU=y, which further causes the entire kernel to hang. One fix would be to make timers work during this time, but there are no known users of RCU Tasks grace periods during that time, so no justification for the added complexity. Not yet, anyway.
This commit therefore moves the call to rcu_init_tasks_generic() from kernel_init_freeable() to a core_initcall(). This works because the timer and hrtimer kthreads are created at early_initcall() time.
Fixes: 49a17639508c3 ("softirq: Use a dedicated thread for timer wakeups on PREEMPT_RT.") Signed-off-by: Paul E. McKenney paulmck@kernel.org Cc: Sebastian Andrzej Siewior bigeasy@linutronix.de Cc: Frederic Weisbecker frederic@kernel.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Alexei Starovoitov ast@kernel.org Cc: Andrii Nakryiko andrii@kernel.org Cc: Steven Rostedt rostedt@goodmis.org Cc: Mathieu Desnoyers mathieu.desnoyers@efficios.com Cc: Masami Hiramatsu mhiramat@kernel.org Cc: linux-trace-kernel@vger.kernel.org Tested-by: Sebastian Andrzej Siewior bigeasy@linutronix.de Reviewed-by: Sebastian Andrzej Siewior bigeasy@linutronix.de Signed-off-by: Boqun Feng boqun.feng@gmail.com --- include/linux/rcupdate.h | 6 ------ init/main.c | 1 - kernel/rcu/tasks.h | 5 ++++- 3 files changed, 4 insertions(+), 8 deletions(-)
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48e5c03df1dd..36849a4ea141 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -121,12 +121,6 @@ void rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user);
-#ifdef CONFIG_TASKS_RCU_GENERIC -void rcu_init_tasks_generic(void); -#else -static inline void rcu_init_tasks_generic(void) { } -#endif - #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); diff --git a/init/main.c b/init/main.c index 2a1757826397..7f0a2a3dbd29 100644 --- a/init/main.c +++ b/init/main.c @@ -1553,7 +1553,6 @@ static noinline void __init kernel_init_freeable(void)
init_mm_internals();
- rcu_init_tasks_generic(); do_pre_smp_initcalls(); lockup_detector_init();
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 59314da5eb60..466668eb4fad 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -2256,7 +2256,7 @@ void __init tasks_cblist_init_generic(void) #endif }
-void __init rcu_init_tasks_generic(void) +static int __init rcu_init_tasks_generic(void) { #ifdef CONFIG_TASKS_RCU rcu_spawn_tasks_kthread(); @@ -2272,7 +2272,10 @@ void __init rcu_init_tasks_generic(void)
// Run the self-tests. rcu_tasks_initiate_self_tests(); + + return 0; } +core_initcall(rcu_init_tasks_generic);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {}
From: "Paul E. McKenney" paulmck@kernel.org
Analysis of an rcutorture callback-based forward-progress test failure was hampered by the lack of ->cblist segment lengths. This commit therefore adds this information, so that what would have been ".W85620.N." (there are some callbacks waiting for grace period sequence number 85620 and some number more that have not yet been assigned to a grace period) now prints as ".W2(85620).N6." (there are 2 callbacks waiting for grace period 85620 and 6 not yet assigned to a grace period). Note that "D" (done), "N" (next and not yet assigned to a grace period), and "B" (bypass, also not yet assigned to a grace period) have just the number of callbacks without the parenthesized grace-period sequence number.
In contrast, "W" (waiting for the current grace period) and "R" (ready to wait for the next grace period to start) both have parenthesized grace-period sequence numbers.
Signed-off-by: Paul E. McKenney paulmck@kernel.org Signed-off-by: Boqun Feng boqun.feng@gmail.com --- kernel/rcu/tree_nocb.h | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 2605dd234a13..5ff3bc56ff51 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1557,8 +1557,11 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) /* Dump out nocb kthread state for the specified rcu_data structure. */ static void show_rcu_nocb_state(struct rcu_data *rdp) { - char bufw[20]; - char bufr[20]; + char bufd[22]; + char bufw[45]; + char bufr[45]; + char bufn[22]; + char bufb[22]; struct rcu_data *nocb_next_rdp; struct rcu_segcblist *rsclp = &rdp->cblist; bool waslocked; @@ -1572,9 +1575,13 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) typeof(*rdp), nocb_entry_rdp);
- sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); - sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); - pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", + sprintf(bufd, "%ld", rsclp->seglen[RCU_DONE_TAIL]); + sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL], rsclp->gp_seq[RCU_WAIT_TAIL]); + sprintf(bufr, "%ld(%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL], + rsclp->gp_seq[RCU_NEXT_READY_TAIL]); + sprintf(bufn, "%ld", rsclp->seglen[RCU_NEXT_TAIL]); + sprintf(bufb, "%ld", rcu_cblist_n_cbs(&rdp->nocb_bypass)); + pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%s%c%s%c%s%c%s%c%s q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, nocb_next_rdp ? nocb_next_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], @@ -1586,12 +1593,15 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) jiffies - rdp->nocb_nobypass_last, rdp->nocb_nobypass_count, ".D"[rcu_segcblist_ready_cbs(rsclp)], + rcu_segcblist_segempty(rsclp, RCU_DONE_TAIL) ? "" : bufd, ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)], rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw, ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)], rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr, ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL) ? "" : bufn, ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], + !rcu_cblist_n_cbs(&rdp->nocb_bypass) ? "" : bufb, rcu_segcblist_n_cbs(&rdp->cblist), rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1,
From: "Paul E. McKenney" paulmck@kernel.org
The WARN_ON_ONCE() in ct_kernel_exit_state() follows the call to ct_state_inc(), which means that RCU is not watching this WARN_ON_ONCE(). This can (and does) result in extraneous lockdep warnings when this WARN_ON_ONCE() triggers. These extraneous warnings are the opposite of helpful.
Therefore, invert the WARN_ON_ONCE() condition and move it before the call to ct_state_inc(). This does mean that the ct_state_inc() return value can no longer be used in the WARN_ON_ONCE() condition, so discard this return value and instead use a call to rcu_is_watching_curr_cpu(). This call is executed only in CONFIG_RCU_EQS_DEBUG=y kernels, so there is no added overhead in production use.
[Boqun: Add the subsystem tag in the title]
Reported-by: Breno Leitao leitao@debian.org Signed-off-by: Paul E. McKenney paulmck@kernel.org Reviewed-by: Valentin Schneider vschneid@redhat.com Reviewed-by: Frederic Weisbecker frederic@kernel.org Link: https://lore.kernel.org/r/bd911cd9-1fe9-447c-85e0-ea811a1dc896@paulmck-lapto... Signed-off-by: Boqun Feng boqun.feng@gmail.com --- kernel/context_tracking.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 938c48952d26..fb5be6e9b423 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -80,17 +80,16 @@ static __always_inline void rcu_task_trace_heavyweight_exit(void) */ static noinstr void ct_kernel_exit_state(int offset) { - int seq; - /* * CPUs seeing atomic_add_return() must see prior RCU read-side * critical sections, and we also must force ordering with the * next idle sojourn. */ rcu_task_trace_heavyweight_enter(); // Before CT state update! - seq = ct_state_inc(offset); - // RCU is no longer watching. Better be in extended quiescent state! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & CT_RCU_WATCHING)); + // RCU is still watching. Better not be in extended quiescent state! + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !rcu_is_watching_curr_cpu()); + (void)ct_state_inc(offset); + // RCU is no longer watching. }
/*
From: "Paul E. McKenney" paulmck@kernel.org
Kernels built with CONFIG_PREEMPT_RT=y can lose significant console output at shutdown time, which hides shutdown-time RCU issues from rcutorture. Therefore, make pr_flush() public and invoke it after the last print in kernel_power_off().
[ paulmck: Apply John Ogness feedback. ] [ paulmck: Apply Sebastian Andrzej Siewior feedback. ] [ paulmck: Apply kernel test robot feedback. ]
Signed-off-by: Paul E. McKenney paulmck@kernel.org Reviewed-by: John Ogness john.ogness@linutronix.de Reviewed-by: Petr Mladek pmladek@suse.com Cc: Steven Rostedt rostedt@goodmis.org Cc: Sergey Senozhatsky senozhatsky@chromium.org Link: https://lore.kernel.org/r/5f743488-dc2a-4f19-bdda-cf50b9314832@paulmck-lapto... Signed-off-by: Boqun Feng boqun.feng@gmail.com --- include/linux/printk.h | 6 ++++++ kernel/printk/printk.c | 4 +--- kernel/reboot.c | 1 + 3 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/include/linux/printk.h b/include/linux/printk.h index 4217a9f412b2..5b462029d03c 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -207,6 +207,7 @@ void printk_legacy_allow_panic_sync(void); extern bool nbcon_device_try_acquire(struct console *con); extern void nbcon_device_release(struct console *con); void nbcon_atomic_flush_unsafe(void); +bool pr_flush(int timeout_ms, bool reset_on_progress); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -315,6 +316,11 @@ static inline void nbcon_atomic_flush_unsafe(void) { }
+static inline bool pr_flush(int timeout_ms, bool reset_on_progress) +{ + return true; +} + #endif
bool this_cpu_in_panic(void); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 07668433644b..057db78876cd 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2461,7 +2461,6 @@ asmlinkage __visible int _printk(const char *fmt, ...) } EXPORT_SYMBOL(_printk);
-static bool pr_flush(int timeout_ms, bool reset_on_progress); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress);
#else /* CONFIG_PRINTK */ @@ -2474,7 +2473,6 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
static u64 syslog_seq;
-static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
#endif /* CONFIG_PRINTK */ @@ -4466,7 +4464,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre * Context: Process context. May sleep while acquiring console lock. * Return: true if all usable printers are caught up. */ -static bool pr_flush(int timeout_ms, bool reset_on_progress) +bool pr_flush(int timeout_ms, bool reset_on_progress) { return __pr_flush(NULL, timeout_ms, reset_on_progress); } diff --git a/kernel/reboot.c b/kernel/reboot.c index b5a8569e5d81..41ab9e1ba357 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -704,6 +704,7 @@ void kernel_power_off(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("Power down\n"); + pr_flush(1000, true); kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_power_off(); }
From: "Uladzislau Rezki (Sony)" urezki@gmail.com
Currently the "nfakewriters" parameter can be set to any value, but there is no way to have it adjusted automatically based on the number of CPUs in the system on which the test is run.
To address this, if "nfakewriters" is set to a negative value, it will be adjusted based on num_online_cpus() during torture initialization.
Reviewed-by: Paul E. McKenney paulmck@kernel.org Signed-off-by: Uladzislau Rezki (Sony) urezki@gmail.com Link: https://lore.kernel.org/r/20250227131613.52683-1-urezki@gmail.com Signed-off-by: Boqun Feng boqun.feng@gmail.com --- kernel/rcu/rcutorture.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-)
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d26fb1d33ed9..726c2d63ab66 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -147,6 +147,7 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)");
static int nrealnocbers; static int nrealreaders; +static int nrealfakewriters; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; @@ -1728,7 +1729,7 @@ rcu_torture_fakewriter(void *arg) do { torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand); if (cur_ops->cb_barrier != NULL && - torture_random(&rand) % (nfakewriters * 8) == 0) { + torture_random(&rand) % (nrealfakewriters * 8) == 0) { cur_ops->cb_barrier(); } else { switch (synctype[torture_random(&rand) % nsynctypes]) { @@ -2522,7 +2523,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "nocbs_nthreads=%d nocbs_toggle=%d " "test_nmis=%d " "preempt_duration=%d preempt_interval=%d\n", - torture_type, tag, nrealreaders, nfakewriters, + torture_type, tag, nrealreaders, nrealfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, @@ -3597,7 +3598,7 @@ rcu_torture_cleanup(void) rcu_torture_reader_mbchk = NULL;
if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) + for (i = 0; i < nrealfakewriters; i++) torture_stop_kthread(rcu_torture_fakewriter, fakewriter_tasks[i]); kfree(fakewriter_tasks); @@ -3994,6 +3995,14 @@ rcu_torture_init(void)
rcu_torture_init_srcu_lockdep();
+ if (nfakewriters >= 0) { + nrealfakewriters = nfakewriters; + } else { + nrealfakewriters = num_online_cpus() - 2 - nfakewriters; + if (nrealfakewriters <= 0) + nrealfakewriters = 1; + } + if (nreaders >= 0) { nrealreaders = nreaders; } else { @@ -4050,8 +4059,9 @@ rcu_torture_init(void) writer_task); if (torture_init_error(firsterr)) goto unwind; - if (nfakewriters > 0) { - fakewriter_tasks = kcalloc(nfakewriters, + + if (nrealfakewriters > 0) { + fakewriter_tasks = kcalloc(nrealfakewriters, sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -4060,7 +4070,7 @@ rcu_torture_init(void) goto unwind; } } - for (i = 0; i < nfakewriters; i++) { + for (i = 0; i < nrealfakewriters; i++) { firsterr = torture_create_kthread(rcu_torture_fakewriter, NULL, fakewriter_tasks[i]); if (torture_init_error(firsterr))
From: "Uladzislau Rezki (Sony)" urezki@gmail.com
Add extra parameters for the rcutorture module. One is "nfakewriters", which is set to -1. This creates a number of test kthreads corresponding to the number of CPUs in the test system. Those threads randomly invoke synchronize_rcu().
Apart from that, "rcu_normal" is set to 1, because this is specifically for testing a normal synchronize_rcu(). The newly added "rcu_normal_wake_from_gp" parameter is also set to 1, which prevents interaction with other callbacks in the system.
Reviewed-by: Paul E. McKenney paulmck@kernel.org Signed-off-by: Uladzislau Rezki (Sony) urezki@gmail.com Link: https://lore.kernel.org/r/20250227131613.52683-2-urezki@gmail.com Signed-off-by: Boqun Feng boqun.feng@gmail.com --- tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot index c419cac233ee..54f5c9053474 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot @@ -2,3 +2,9 @@ rcutree.gp_preinit_delay=3 rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 rcupdate.rcu_self_test=1 + +# This part is for synchronize_rcu() testing +rcutorture.nfakewriters=-1 +rcutorture.gp_sync=1 +rcupdate.rcu_normal=1 +rcutree.rcu_normal_wake_from_gp=1
From: "Uladzislau Rezki (Sony)" urezki@gmail.com
Switch to using the get_state_synchronize_rcu_full() and poll_state_synchronize_rcu_full() pair to debug a normal synchronize_rcu() call.
Just using the non-"full" APIs to identify whether or not a grace period has passed might lead to a false-positive kernel splat.
This can happen because get_state_synchronize_rcu() compresses both normal and expedited states into one single unsigned long value, so a poll_state_synchronize_rcu() can miss GP-completion when synchronize_rcu() and synchronize_rcu_expedited() run concurrently.
To address this, switch to poll_state_synchronize_rcu_full() and get_state_synchronize_rcu_full() APIs, which use separate variables for expedited and normal states.
Reported-by: cheung wall zzqq0103.hey@gmail.com Closes: https://lore.kernel.org/lkml/Z5ikQeVmVdsWQrdD@pc636/T/ Fixes: 988f569ae041 ("rcu: Reduce synchronize_rcu() latency") Signed-off-by: Uladzislau Rezki (Sony) urezki@gmail.com Reviewed-by: Paul E. McKenney paulmck@kernel.org Link: https://lore.kernel.org/r/20250227131613.52683-3-urezki@gmail.com Signed-off-by: Boqun Feng boqun.feng@gmail.com --- include/linux/rcupdate_wait.h | 3 +++ kernel/rcu/tree.c | 8 +++----- 2 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h index f9bed3d3f78d..4c92d4291cce 100644 --- a/include/linux/rcupdate_wait.h +++ b/include/linux/rcupdate_wait.h @@ -16,6 +16,9 @@ struct rcu_synchronize { struct rcu_head head; struct completion completion; + + /* This is for debugging. */ + struct rcu_gp_oldstate oldstate; }; void wakeme_after_rcu(struct rcu_head *head);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 131fb463ba68..fb98d4d9d9c9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1612,12 +1612,10 @@ static void rcu_sr_normal_complete(struct llist_node *node) { struct rcu_synchronize *rs = container_of( (struct rcu_head *) node, struct rcu_synchronize, head); - unsigned long oldstate = (unsigned long) rs->head.func;
WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && - !poll_state_synchronize_rcu(oldstate), - "A full grace period is not passed yet: %lu", - rcu_seq_diff(get_state_synchronize_rcu(), oldstate)); + !poll_state_synchronize_rcu_full(&rs->oldstate), + "A full grace period is not passed yet!\n");
/* Finally. */ complete(&rs->completion); @@ -3218,7 +3216,7 @@ static void synchronize_rcu_normal(void) * snapshot before adding a request. */ if (IS_ENABLED(CONFIG_PROVE_RCU)) - rs.head.func = (void *) get_state_synchronize_rcu(); + get_state_synchronize_rcu_full(&rs.oldstate);
rcu_sr_normal_add_req(&rs);
linux-kselftest-mirror@lists.linaro.org