Even though there are two separate sets of housekeeping cpumasks, one for access and one for update, it is still possible that the set of cpumasks about to be updated is being used by callers of the housekeeping functions, resulting in the use of an intermediate cpumask that lies somewhere between the old and the new ones.
To reduce the chance of this, we need to introduce a delay between successive housekeeping cpumask updates. One simple way is to make use of the RCU grace period delay: an update of a given housekeeping type waits until the RCU grace period started by the previous update of that type has ended. Callers of the housekeeping APIs can optionally hold rcu_read_lock() to eliminate the chance of using intermediate housekeeping cpumasks.
Signed-off-by: Waiman Long longman@redhat.com --- kernel/sched/isolation.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+)
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index ee396ae13719..f26708667754 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -23,6 +23,9 @@ EXPORT_SYMBOL_GPL(housekeeping_overridden); * The housekeeping cpumasks can now be dynamically updated at run time. * Two set of cpumasks are kept. One set can be used while the other set are * being updated concurrently. + * + * rcu_read_lock() can optionally be held by housekeeping API callers to + * ensure stability of the cpumasks. */ static DEFINE_RAW_SPINLOCK(cpumask_lock); struct housekeeping { @@ -34,6 +37,8 @@ struct housekeeping {
static struct housekeeping housekeeping; static bool sched_tick_offload_inited; +static struct rcu_head rcu_gp[HK_TYPE_MAX]; +static unsigned long update_flags;
bool housekeeping_enabled(enum hk_type type) { @@ -267,6 +272,18 @@ static int __init housekeeping_isolcpus_setup(char *str) } __setup("isolcpus=", housekeeping_isolcpus_setup);
+/* + * Bits in update_flags can only be turned on with cpumask_lock held and + * cleared by this RCU callback function. + */ +static void rcu_gp_end(struct rcu_head *rcu) +{ + int type = rcu - rcu_gp; + + /* Atomically clear the corresponding flag bit */ + clear_bit(type, &update_flags); +} + /** * housekeeping_exclude_cpumask - Update housekeeping cpumasks to exclude only the given cpumask * @cpumask: new cpumask to be excluded from housekeeping cpumasks @@ -306,8 +323,21 @@ int housekeeping_exclude_cpumask(struct cpumask *cpumask, unsigned long hk_flags } #endif
+retry: + /* + * If the RCU grace period for the previous update with conflicting + * flag bits hasn't been completed yet, we have to wait for it. + */ + while (READ_ONCE(update_flags) & hk_flags) + synchronize_rcu(); + raw_spin_lock(&cpumask_lock);
+ if (READ_ONCE(update_flags) & hk_flags) { + raw_spin_unlock(&cpumask_lock); + goto retry; + } + for_each_set_bit(type, &hk_flags, HK_TYPE_MAX) { int idx = ++housekeeping.seq_nrs[type] & 1; struct cpumask *dst_cpumask = housekeeping.cpumasks[type][idx]; @@ -320,8 +350,11 @@ int housekeeping_exclude_cpumask(struct cpumask *cpumask, unsigned long hk_flags housekeeping.flags |= BIT(type); } WRITE_ONCE(housekeeping.cpumask_ptrs[type], dst_cpumask); + set_bit(type, &update_flags); } raw_spin_unlock(&cpumask_lock); + for_each_set_bit(type, &hk_flags, HK_TYPE_MAX) + call_rcu(&rcu_gp[type], rcu_gp_end);
if (!housekeeping.flags && static_key_enabled(&housekeeping_overridden)) static_key_disable(&housekeeping_overridden.key);