Igor Raits and Bagas Sanjaya reported a RQCF_ACT_SKIP leak warning.

Link: https://lore.kernel.org/all/a5dd536d-041a-2ce9-f4b7-64d8d85c86dc@gmail.com
Commit ebb83d84e49b ("sched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle()") added a RQCF_ACT_SKIP leak warning in rq_clock_start_loop_update(). However, this warning is inaccurate and may be triggered incorrectly in the following situation:
    CPU0                                      CPU1

    __schedule()
      *rq->clock_update_flags <<= 1;*         unregister_fair_sched_group()
      pick_next_task_fair+0x4a/0x410            destroy_cfs_bandwidth()
      newidle_balance+0x115/0x3e0                 for_each_possible_cpu(i) *i=0*
      rq_unpin_lock(this_rq, rf)                  __cfsb_csd_unthrottle()
      raw_spin_rq_unlock(this_rq)                   rq_lock(*CPU0_rq*, &rf)
                                                    rq_clock_start_loop_update()
                                                    rq->clock_update_flags & RQCF_ACT_SKIP  <--

      raw_spin_rq_lock(this_rq)
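For context, these are the clock flags involved, as defined in kernel/sched/sched.h (comments lightly condensed here); the `rq->clock_update_flags <<= 1` in __schedule() above is what promotes REQ_SKIP to ACT_SKIP:

	/*
	 * rq::clock_update_flags bits:
	 *
	 * %RQCF_REQ_SKIP - request skipping the clock update on the next
	 *   call to __schedule()
	 * %RQCF_ACT_SKIP - set inside __schedule() while skipping is in
	 *   effect; calls to update_rq_clock() are ignored
	 * %RQCF_UPDATED - debug flag indicating update_rq_clock() has been
	 *   called since the last time rq::lock was pinned
	 */
	#define RQCF_REQ_SKIP	0x01
	#define RQCF_ACT_SKIP	0x02
	#define RQCF_UPDATED	0x04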
So remove this incorrect check. Instead, add assert_clock_updated() to verify that the rq clock has been updated before rq_clock_start_loop_update() is called. And since we cannot unconditionally set rq->clock_update_flags to RQCF_ACT_SKIP in rq_clock_start_loop_update(), use a caller-supplied rq_clock_flags variable there to record the previous state of rq->clock_update_flags, and correspondingly restore rq->clock_update_flags from rq_clock_flags in rq_clock_stop_loop_update(), so that its previous state is not lost.
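For reference, the helpers being reworked, as introduced by ebb83d84e49b (this is the '-' side of the sched.h hunk below):

	static inline void rq_clock_start_loop_update(struct rq *rq)
	{
		lockdep_assert_rq_held(rq);
		SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP);
		rq->clock_update_flags |= RQCF_ACT_SKIP;
	}

	static inline void rq_clock_stop_loop_update(struct rq *rq)
	{
		lockdep_assert_rq_held(rq);
		rq->clock_update_flags &= ~RQCF_ACT_SKIP;
	}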
Fixes: ebb83d84e49b ("sched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle()")
Cc: stable@vger.kernel.org
Reported-by: Igor Raits <igor.raits@gmail.com>
Reported-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Hao Jia <jiahao.os@bytedance.com>
---
 kernel/sched/fair.c  | 10 ++++++----
 kernel/sched/sched.h | 12 +++++++-----
 2 files changed, 13 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8dbff6e7ad4f..a64a002573d9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5679,6 +5679,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_SMP
 static void __cfsb_csd_unthrottle(void *arg)
 {
+	unsigned int rq_clock_flags;
 	struct cfs_rq *cursor, *tmp;
 	struct rq *rq = arg;
 	struct rq_flags rf;
@@ -5691,7 +5692,7 @@ static void __cfsb_csd_unthrottle(void *arg)
 	 * Do it once and skip the potential next ones.
 	 */
 	update_rq_clock(rq);
-	rq_clock_start_loop_update(rq);
+	rq_clock_start_loop_update(rq, &rq_clock_flags);
 
 	/*
 	 * Since we hold rq lock we're safe from concurrent manipulation of
@@ -5712,7 +5713,7 @@ static void __cfsb_csd_unthrottle(void *arg)
 
 	rcu_read_unlock();
 
-	rq_clock_stop_loop_update(rq);
+	rq_clock_stop_loop_update(rq, &rq_clock_flags);
 	rq_unlock(rq, &rf);
 }
 
@@ -6230,6 +6231,7 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq)
 /* cpu offline callback */
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
+	unsigned int rq_clock_flags;
 	struct task_group *tg;
 
 	lockdep_assert_rq_held(rq);
@@ -6239,7 +6241,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 	 * set_rq_offline(), so we should skip updating
 	 * the rq clock again in unthrottle_cfs_rq().
 	 */
-	rq_clock_start_loop_update(rq);
+	rq_clock_start_loop_update(rq, &rq_clock_flags);
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -6264,7 +6266,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 	}
 	rcu_read_unlock();
 
-	rq_clock_stop_loop_update(rq);
+	rq_clock_stop_loop_update(rq, &rq_clock_flags);
 }
 
 bool cfs_task_bw_constrained(struct task_struct *p)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 04846272409c..ff2864f202f5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1558,20 +1558,22 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq)
  *   when using list_for_each_entry_*)
  * rq_clock_start_loop_update() can be called after updating the clock
  * once and before iterating over the list to prevent multiple update.
+ * And use @rq_clock_flags to record the previous state of rq->clock_update_flags.
  * After the iterative traversal, we need to call rq_clock_stop_loop_update()
- * to clear RQCF_ACT_SKIP of rq->clock_update_flags.
+ * to restore rq->clock_update_flags through @rq_clock_flags.
  */
-static inline void rq_clock_start_loop_update(struct rq *rq)
+static inline void rq_clock_start_loop_update(struct rq *rq, unsigned int *rq_clock_flags)
 {
 	lockdep_assert_rq_held(rq);
-	SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP);
+	assert_clock_updated(rq);
+	*rq_clock_flags = rq->clock_update_flags;
 	rq->clock_update_flags |= RQCF_ACT_SKIP;
 }
 
-static inline void rq_clock_stop_loop_update(struct rq *rq)
+static inline void rq_clock_stop_loop_update(struct rq *rq, unsigned int *rq_clock_flags)
 {
 	lockdep_assert_rq_held(rq);
-	rq->clock_update_flags &= ~RQCF_ACT_SKIP;
+	rq->clock_update_flags = *rq_clock_flags;
 }
 
 struct rq_flags {
On Wed, Sep 13, 2023 at 04:24:24PM +0800, Hao Jia wrote:
> Igor Raits and Bagas Sanjaya reported a RQCF_ACT_SKIP leak warning.
> [...]
>
>     CPU0                                      CPU1
>
>     __schedule()
>       *rq->clock_update_flags <<= 1;*         unregister_fair_sched_group()
>       pick_next_task_fair+0x4a/0x410            destroy_cfs_bandwidth()
>       newidle_balance+0x115/0x3e0                 for_each_possible_cpu(i) *i=0*
>       rq_unpin_lock(this_rq, rf)                  __cfsb_csd_unthrottle()

	if (rq->clock_update_flags > RQCF_ACT_SKIP)
		rf->clock_update_flags = RQCF_UPDATED;

so that preserves all flags, but only stores UPDATED.

>       raw_spin_rq_unlock(this_rq)                   rq_lock(*CPU0_rq*, &rf)

	rq_pin_lock()
	  rq->clock_update_flags &= (REQ_SKIP|ACT_SKIP);
	  rf->clock_update_flags = 0;

IOW, we preserve ACT_SKIP from CPU0

>                                                     rq_clock_start_loop_update()
>                                                     rq->clock_update_flags & RQCF_ACT_SKIP  <--

And go SPLAT

>
>       raw_spin_rq_lock(this_rq)

	rq_repin_lock()
	  rq->clock_update_flags |= rf->clock_update_flags;

which restores UPDATED, even though in reality time could have moved on quite significantly.
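Putting those fragments together, the CONFIG_SCHED_DEBUG flag bookkeeping in kernel/sched/sched.h looks roughly like this (a condensed sketch; the lockdep pin/unpin cookie handling is elided):

	static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
	{
		/* Keep only the SKIP bits; start a fresh pin context. */
		rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
		rf->clock_update_flags = 0;
	}

	static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
	{
		/* Stash UPDATED for a later repin; the SKIP bits stay in rq. */
		if (rq->clock_update_flags > RQCF_ACT_SKIP)
			rf->clock_update_flags = RQCF_UPDATED;
	}

	static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
	{
		/* Restore the value we stashed in @rf for this pin context. */
		rq->clock_update_flags |= rf->clock_update_flags;
	}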
Anyway....
the purpose of ACT_SKIP is to skip the update (clue in the name etc.); the update happens very early in __schedule(), but we clear the *_SKIP flags very late, causing them to span the gap above.
Going by the commits that put it there, the thinking was to clear clock_skip_update before unlock, but AFAICT we can clear SKIP flags right after the update_rq_clock() we're wanting to skip, no?
That is, would not something like the below make more sense?
---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8fd29d66b24..bfd2ab4b95da 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5357,8 +5357,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	/* switch_mm_cid() requires the memory barriers above. */
 	switch_mm_cid(rq, prev, next);
 
-	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-
 	prepare_lock_switch(rq, next, rf);
 
 	/* Here we just switch the register state and the stack. */
@@ -6596,6 +6594,8 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 	/* Promote REQ to ACT */
 	rq->clock_update_flags <<= 1;
 	update_rq_clock(rq);
+	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
+
 	switch_count = &prev->nivcsw;
 
@@ -6675,8 +6675,6 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
 	} else {
-		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-
 		rq_unpin_lock(rq, &rf);
 		__balance_callbacks(rq);
 		raw_spin_rq_unlock_irq(rq);
On 2023/9/28 Peter Zijlstra wrote:
> On Wed, Sep 13, 2023 at 04:24:24PM +0800, Hao Jia wrote:
> [...]
>
> Going by the commits that put it there, the thinking was to clear clock_skip_update before unlock, but AFAICT we can clear the SKIP flags right after the update_rq_clock() we're wanting to skip, no?
Thanks for your review, and I am very sorry to reply to you so late. I just came back from a long vacation.
> That is, would not something like the below make more sense?
If we understand correctly, this may not work.
After applying this patch, the following situation will trigger the rq->clock_update_flags < RQCF_ACT_SKIP warning.
If rq_clock_skip_update() is called before __schedule(), RQCF_REQ_SKIP is set in rq->clock_update_flags.
__schedule() {
	rq_lock(rq, &rf);
	[rq->clock_update_flags is RQCF_REQ_SKIP]
	rq->clock_update_flags <<= 1;
	update_rq_clock(rq);
	[rq->clock_update_flags is RQCF_ACT_SKIP]
+	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);

	*At this time, rq->clock_update_flags = 0;*

	pick_next_task_fair
	  set_next_entity
	    update_load_avg
	      assert_clock_updated()  <---
}
assert_clock_updated() warns when rq->clock_update_flags is less than RQCF_ACT_SKIP, so if we clear RQCF_ACT_SKIP prematurely, this assert may trigger later.
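(assert_clock_updated() in kernel/sched/sched.h is a simple threshold check on the flag word:)

	static inline void assert_clock_updated(struct rq *rq)
	{
		/*
		 * The only reason for not seeing a clock update since the
		 * last rq_pin_lock() is the clock having been updated since.
		 */
		SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);
	}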
On Sat, Oct 07, 2023 at 04:44:46PM +0800, Hao Jia wrote:
> > That is, would not something like the below make more sense?
>
> If we understand correctly, this may not work.
>
> After applying this patch, the following situation will trigger the rq->clock_update_flags < RQCF_ACT_SKIP warning.
>
> If rq_clock_skip_update() is called before __schedule(), RQCF_REQ_SKIP is set in rq->clock_update_flags.
>
> __schedule() {
> 	rq_lock(rq, &rf);
> 	[rq->clock_update_flags is RQCF_REQ_SKIP]
> 	rq->clock_update_flags <<= 1;
> 	update_rq_clock(rq);
> 	[rq->clock_update_flags is RQCF_ACT_SKIP]
> +	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
>
> 	*At this time, rq->clock_update_flags = 0;*
Fixed easily enough, just change to:
	rq->clock_update_flags = RQCF_UPDATED;
> 	pick_next_task_fair
> 	  set_next_entity
> 	    update_load_avg
> 	      assert_clock_updated()  <---
> }
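In other words, a brief note on why this single assignment satisfies both checks at once:

	/*
	 * RQCF_REQ_SKIP = 0x01, RQCF_ACT_SKIP = 0x02, RQCF_UPDATED = 0x04
	 *
	 * Assigning RQCF_UPDATED right after update_rq_clock() clears both
	 * SKIP bits (so no stale ACT_SKIP can leak to another CPU), while
	 * keeping the value >= RQCF_ACT_SKIP, so assert_clock_updated() in
	 * the pick path stays quiet. And it is truthful: the clock was
	 * literally just updated.
	 */
	rq->clock_update_flags = RQCF_UPDATED;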
---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a0a582c8cf8c..cf9eb1a26c22 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5357,8 +5357,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	/* switch_mm_cid() requires the memory barriers above. */
 	switch_mm_cid(rq, prev, next);
 
-	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-
 	prepare_lock_switch(rq, next, rf);
 
 	/* Here we just switch the register state and the stack. */
@@ -6596,6 +6594,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 	/* Promote REQ to ACT */
 	rq->clock_update_flags <<= 1;
 	update_rq_clock(rq);
+	rq->clock_update_flags = RQCF_UPDATED;
 
 	switch_count = &prev->nivcsw;
 
@@ -6675,8 +6674,6 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
 	} else {
-		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-
 		rq_unpin_lock(rq, &rf);
 		__balance_callbacks(rq);
 		raw_spin_rq_unlock_irq(rq);
On 2023/10/10 Peter Zijlstra wrote:
> On Sat, Oct 07, 2023 at 04:44:46PM +0800, Hao Jia wrote:
> [...]
>
> Fixed easily enough, just change to:
>
> 	rq->clock_update_flags = RQCF_UPDATED;
Thanks for your suggestions and help. I revised the commit message and sent patch v2:
https://lore.kernel.org/all/20231012090003.11450-1-jiahao.os@bytedance.com/

Please review again.

Thanks,
Hao