On my SMP platform, which is made of 5 cores in 2 clusters, the nr_busy_cpus field of the sched_group_power struct is not null when the platform is fully idle. The root cause seems to be: during the boot sequence, some CPUs reach the idle loop and set their NOHZ_IDLE flag while waiting for the other CPUs to boot. But the nr_busy_cpus field is initialized later, with the assumption that all CPUs are in the busy state, whereas some CPUs have already set their NOHZ_IDLE flag.

During the initialization of the sched_domain, we set the NOHZ_IDLE flag when nr_busy_cpus is initialized to 0, in order to have a coherent configuration. If a CPU enters idle and calls set_cpu_sd_state_idle during the build of the new sched_domain, it will not corrupt the initial state. set_cpu_sd_state_busy is modified to clear NOHZ_IDLE only if a non-NULL sched_domain is attached to the CPU (which is the case during the rebuild).
Change since V3:
 - NOHZ flag is not cleared if a NULL domain is attached to the CPU
 - Remove patch 2/2, which becomes useless with the latest modifications
Change since V2:
 - change the initialization to idle state instead of busy state so a CPU that enters idle during the build of the sched_domain will not corrupt the initialization state
Change since V1:
 - remove the patch for SCHED softirq on an idle core use case as it was a side effect of the other use cases
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 kernel/sched/core.c | 4 +++-
 kernel/sched/fair.c | 9 +++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 26058d0..c730a4e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5884,7 +5884,9 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 		return;
 
 	update_group_power(sd, cpu);
-	atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
+	atomic_set(&sg->sgp->nr_busy_cpus, 0);
+	set_bit(NOHZ_IDLE, nohz_flags(cpu));
+
 }
 
 int __weak arch_sd_sibling_asym_packing(void)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 81fa536..2701a92 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5403,15 +5403,20 @@ static inline void set_cpu_sd_state_busy(void)
 {
 	struct sched_domain *sd;
 	int cpu = smp_processor_id();
+	int clear = 0;
 
 	if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
 		return;
-	clear_bit(NOHZ_IDLE, nohz_flags(cpu));
 
 	rcu_read_lock();
-	for_each_domain(cpu, sd)
+	for_each_domain(cpu, sd) {
 		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+		clear = 1;
+	}
 	rcu_read_unlock();
+
+	if (likely(clear))
+		clear_bit(NOHZ_IDLE, nohz_flags(cpu));
 }
 
 void set_cpu_sd_state_idle(void)
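For readability, this is how set_cpu_sd_state_busy() ends up looking with the fair.c hunk above applied (reconstructed from the diff, nothing beyond it): the NOHZ_IDLE bit is now cleared only after the domain walk has actually run, i.e. only when a non-NULL sched_domain is attached to the CPU.

static inline void set_cpu_sd_state_busy(void)
{
	struct sched_domain *sd;
	int cpu = smp_processor_id();
	int clear = 0;

	if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
		return;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
		clear = 1;
	}
	rcu_read_unlock();

	/* Leave the idle state only if at least one domain was walked. */
	if (likely(clear))
		clear_bit(NOHZ_IDLE, nohz_flags(cpu));
}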
On Thu, Feb 21, 2013 at 09:29:16AM +0100, Vincent Guittot wrote:
I fear there is still a race window:
= CPU 0 =                               = CPU 1 =

                                        // NOHZ_IDLE is set
                                        set_cpu_sd_state_busy() {
                                            dom1 = rcu_dereference(dom1);
                                            inc(dom1->nr_busy_cpus)

rcu_assign_pointer(dom 1, NULL)
// create new domain
init_sched_group_power() {
    atomic_set(&tmp->nr_busy_cpus, 0);
    set_bit(NOHZ_IDLE, nohz_flags(cpu 1));
    rcu_assign_pointer(dom 1, tmp)

                                            clear_bit(NOHZ_IDLE, nohz_flags(cpu));
                                        }
I don't know if there is any sane way to deal with this issue other than having nr_busy_cpus and nohz_flags in the same object sharing the same lifecycle.
On 22 February 2013 13:32, Frederic Weisbecker <fweisbec@gmail.com> wrote:
I don't know if there is any sane way to deal with this issue other than having nr_busy_cpus and nohz_flags in the same object sharing the same lifecycle.
I wanted to avoid having to use the sd pointer for testing the NOHZ_IDLE flag, because that test occurs each time we go into idle, but it seems not easily feasible. Another solution could be to add a synchronization step between rcu_assign_pointer(dom 1, NULL) and the creation of the new domain, to ensure that all pending accesses to the old sd values have finished, but this would imply a potential delay in the rebuild of the sched_domain and I'm not sure that it's acceptable.
Vincent
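As a rough illustration of that second option (a sketch only, with a made-up helper name, not the actual domain detach/attach path), the synchronization step would amount to something like:

/*
 * Hypothetical sketch of the synchronization step described above:
 * publish a NULL domain, wait for every pre-existing RCU reader
 * (e.g. set_cpu_sd_state_busy()/set_cpu_sd_state_idle()) to finish,
 * then attach the newly built domain.
 */
static void attach_domain_synced(struct rq *rq, struct sched_domain *new_sd)
{
	rcu_assign_pointer(rq->sd, NULL);
	synchronize_rcu();	/* all pending accesses to the old sd are done */
	rcu_assign_pointer(rq->sd, new_sd);
}

The cost is exactly the concern raised above: synchronize_rcu() blocks for a full grace period on every domain rebuild.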
2013/2/22 Vincent Guittot <vincent.guittot@linaro.org>:
The other issue is that we'll need to abuse the fact that struct sched_domain is per cpu in order to store a per cpu state there. That's a bit ugly but at least safer.
Also, are struct sched_group and struct sched_group_power shared among several CPUs, or are they allocated per CPU as well? I guess they aren't, otherwise nr_busy_cpus would be pointless.
On 26 February 2013 14:16, Frederic Weisbecker <fweisbec@gmail.com> wrote:
Yes, they are shared between CPUs: the per-CPU sched_domains point to the same sched_group and sched_group_power.
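For readers following the thread, a simplified view of those structures (fields elided, roughly as they appear in kernel/sched/sched.h of that era) shows why nr_busy_cpus acts as a shared per-group counter while NOHZ_IDLE lives in per-CPU flags:

struct sched_group_power {
	atomic_t ref;
	unsigned int power;
	atomic_t nr_busy_cpus;		/* shared by all CPUs of the group */
	/* ... */
};

struct sched_group {
	struct sched_group *next;	/* circular list of the domain's groups */
	struct sched_group_power *sgp;	/* shared with the other CPUs */
	/* ... */
};

Each CPU's rq->sd chain is private to that CPU (and RCU-managed), but sd->groups->sgp is reached from several CPUs, which is why the per-CPU flag and the shared counter can drift apart across a domain rebuild.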
2013/2/26 Vincent Guittot <vincent.guittot@linaro.org>:
Ah, I see what you meant there: doing a synchronize_rcu() after setting the dom to NULL, on top of which we could work on preventing any concurrent nohz_flags modification. But CPU hotplug seems to be becoming a bit of a performance-sensitive path these days.
OK, I don't like having a per-CPU state in struct sched_domain, but for now I can't find anything better. So my suggestion is that we do this, describe the race well, define the issue in the changelog and code comments, and explain how we are solving it. That way, at least the issue is identified and known. Then later, on review or after the patch is upstream, if somebody with good taste comes up with a better idea, we can consider it.
What do you think?
On 26 February 2013 18:43, Frederic Weisbecker <fweisbec@gmail.com> wrote:
But CPU hotplug seems to be becoming a bit of a performance-sensitive path these days.
That was also my concern.
I don't have a better solution than adding this state in the sched_domain if we want to keep exactly the same behavior. It will waste a bit of memory, because we don't need to update all sched_domain levels (the 1st level is enough).
Vincent
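A minimal sketch of what keeping that state only at the first level could look like (hypothetical: the cpu_idle field and this variant of set_cpu_sd_state_idle() are illustrative only, not an actual patch):

/*
 * Hypothetical: keep the idle state in the lowest-level sched_domain so
 * that it shares the domain's RCU lifecycle; parent levels don't need
 * their own copy.
 */
static void set_cpu_sd_state_idle(void)
{
	struct sched_domain *sd;
	int cpu = smp_processor_id();

	rcu_read_lock();
	sd = rcu_dereference(cpu_rq(cpu)->sd);	/* first-level domain */
	if (sd && !sd->cpu_idle) {
		sd->cpu_idle = 1;		/* illustrative new field */
		for (; sd; sd = sd->parent)
			atomic_dec(&sd->groups->sgp->nr_busy_cpus);
	}
	rcu_read_unlock();
}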
On Wed, Feb 27, 2013 at 09:28:26AM +0100, Vincent Guittot wrote:
Or you can try something like the below. Both the flag and the sched_domain share the same object here, and thus the same RCU lifecycle. And there shouldn't be any more overhead, since accessing rq->sd_rq->sd is the same as accessing rq->sd_rq at the asm level: only one pointer to dereference.
Also, rq_idle becomes a value separate from rq->nohz_flags. It's a simple boolean (just making it an int here because bool sizes are a bit opaque; they are supposed to be char-sized, but let's avoid surprises in structures).
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cc03cfd..16c0d55 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -417,7 +417,10 @@ struct rq {
 
 #ifdef CONFIG_SMP
 	struct root_domain *rd;
-	struct sched_domain *sd;
+	struct sched_domain_rq {
+		struct sched_domain sd;
+		int rq_idle;
+	} __rcu *sd_rq;
 
 	unsigned long cpu_power;
 
@@ -505,9 +508,14 @@ DECLARE_PER_CPU(struct rq, runqueues);
 
 #ifdef CONFIG_SMP
 
-#define rcu_dereference_check_sched_domain(p) \
-	rcu_dereference_check((p), \
-			      lockdep_is_held(&sched_domains_mutex))
+#define rcu_dereference_check_sched_domain(p) ({			\
+	struct sched_domain_rq *__sd_rq = rcu_dereference_check((p),	\
+			      lockdep_is_held(&sched_domains_mutex));	\
+	if (!__sd_rq)							\
+		NULL;							\
+	else								\
+		&__sd_rq->sd;						\
+})
 
 /*
  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
@@ -517,7 +525,7 @@ DECLARE_PER_CPU(struct rq, runqueues);
  * preempt-disabled sections.
  */
 #define for_each_domain(cpu, __sd) \
-	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
+	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd_rq); \
 	     __sd; __sd = __sd->parent)
 
 #define for_each_lower_domain(sd) for (; sd; sd = sd->child)
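To make the intent concrete, here is a rough sketch (hypothetical code, not part of the snippet above) of how set_cpu_sd_state_busy() could then test and clear the idle state through the same RCU-protected object, so the flag and nr_busy_cpus can no longer refer to different domain generations:

/*
 * Hypothetical illustration built on the sched_domain_rq proposal above:
 * the idle state is tested and cleared under the same RCU dereference
 * that yields the domain, so a concurrent rebuild is either seen in full
 * (new object, zeroed counters, fresh flag) or not at all.
 */
void set_cpu_sd_state_busy(void)
{
	struct sched_domain_rq *sd_rq;
	struct sched_domain *sd;
	int cpu = smp_processor_id();

	rcu_read_lock();
	sd_rq = rcu_dereference(cpu_rq(cpu)->sd_rq);
	if (!sd_rq || !sd_rq->rq_idle)
		goto unlock;
	sd_rq->rq_idle = 0;

	for (sd = &sd_rq->sd; sd; sd = sd->parent)
		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
unlock:
	rcu_read_unlock();
}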
On 27 February 2013 17:13, Frederic Weisbecker <fweisbec@gmail.com> wrote:
Your proposal solves the memory waste and keeps the flag in sync with nr_busy_cpus. I'm going to try it.
Thanks