This series implements an event-driven cpufreq governor that scales cpu frequency as a function of cfs runqueue utilization. The intent of this RFC is to get some discussion going about how the scheduler can become the policy engine for selecting cpu frequency, what limitations exist, and what design we want to take to get to a solution.
This series depends on having frequency-invariant representations for load. This requires Vincent's recently merged cpu capacity rework patches, as well as a new patch from Morten included here. Morten's patch will likely make an appearance in his energy aware scheduling v4 series.
Thanks to Juri Lelli <juri.lelli@arm.com> for contributing to the development of the governor.
A git branch with these patches can be pulled from here: https://git.linaro.org/people/mike.turquette/linux.git sched-freq
Smoke testing has been done on an OMAP4 Pandaboard and an Exynos 5800 Chromebook2. Extensive benchmarking and regression testing has not yet been done. Before sinking too much time into extensive testing I'd like to get feedback on the general design.
Michael Turquette (3):
  sched: sched feature for cpu frequency selection
  sched: export get_cpu_usage & capacity_orig_of
  sched: cpufreq_sched_cfs: PELT-based cpu frequency scaling

Morten Rasmussen (1):
  arm: Frequency invariant scheduler load-tracking support

 arch/arm/include/asm/topology.h  |   7 +
 arch/arm/kernel/smp.c            |  53 ++++++-
 arch/arm/kernel/topology.c       |  17 +++
 drivers/cpufreq/Kconfig          |  24 +++
 include/linux/cpufreq.h          |   3 +
 kernel/sched/Makefile            |   1 +
 kernel/sched/cpufreq_sched_cfs.c | 314 +++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c              |  20 ++-
 kernel/sched/features.h          |   6 +
 kernel/sched/sched.h             |   9 ++
 10 files changed, 450 insertions(+), 4 deletions(-)
 create mode 100644 kernel/sched/cpufreq_sched_cfs.c
--
1.9.1
From: Morten Rasmussen <Morten.Rasmussen@arm.com>
Implements an arch-specific function to provide the scheduler with a frequency scaling correction factor for more accurate load-tracking. The factor is:
(current_freq(cpu) << SCHED_CAPACITY_SHIFT) / max_freq(cpu)
This implementation only provides frequency invariance. No micro-architecture invariance yet.
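For example, with illustrative numbers: a cpu currently running at 600 MHz whose maximum frequency is 1200 MHz gets a factor of

	(600000 << 10) / 1200000 = 512

i.e. half of SCHED_CAPACITY_SCALE (1024), so load-tracking contributions accrued at that frequency are weighted at half the rate of contributions accrued at full speed.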
Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com>
---
changes since internal v1:
* replaced two commits from eas v3 with this new one from Morten

 arch/arm/include/asm/topology.h |  7 ++++++
 arch/arm/kernel/smp.c           | 53 +++++++++++++++++++++++++++++++++++++++--
 arch/arm/kernel/topology.c      | 17 +++++++++++++
 3 files changed, 75 insertions(+), 2 deletions(-)
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 2fe85ff..4b985dc 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -24,6 +24,13 @@ void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
 const struct cpumask *cpu_coregroup_mask(int cpu);

+#define arch_scale_freq_capacity arm_arch_scale_freq_capacity
+struct sched_domain;
+extern
+unsigned long arm_arch_scale_freq_capacity(struct sched_domain *sd, int cpu);
+
+DECLARE_PER_CPU(atomic_long_t, cpu_freq_capacity);
+
 #else

 static inline void init_cpu_topology(void) { }
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 86ef244..297ce1b 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -672,12 +672,34 @@ static DEFINE_PER_CPU(unsigned long, l_p_j_ref);
 static DEFINE_PER_CPU(unsigned long, l_p_j_ref_freq);
 static unsigned long global_l_p_j_ref;
 static unsigned long global_l_p_j_ref_freq;
+static DEFINE_PER_CPU(atomic_long_t, cpu_max_freq);
+DEFINE_PER_CPU(atomic_long_t, cpu_freq_capacity);
+
+/*
+ * Scheduler load-tracking scale-invariance
+ *
+ * Provides the scheduler with a scale-invariance correction factor that
+ * compensates for frequency scaling through arch_scale_freq_capacity()
+ * (implemented in topology.c).
+ */
+static inline
+void scale_freq_capacity(int cpu, unsigned long curr, unsigned long max)
+{
+	unsigned long capacity;
+
+	if (!max)
+		return;
+
+	capacity = (curr << SCHED_CAPACITY_SHIFT) / max;
+	atomic_long_set(&per_cpu(cpu_freq_capacity, cpu), capacity);
+}

 static int cpufreq_callback(struct notifier_block *nb,
 					unsigned long val, void *data)
 {
 	struct cpufreq_freqs *freq = data;
 	int cpu = freq->cpu;
+	unsigned long max = atomic_long_read(&per_cpu(cpu_max_freq, cpu));

 	if (freq->flags & CPUFREQ_CONST_LOOPS)
 		return NOTIFY_OK;
@@ -702,6 +724,9 @@ static int cpufreq_callback(struct notifier_block *nb,
 			per_cpu(l_p_j_ref_freq, cpu), freq->new);
 	}
+
+	scale_freq_capacity(cpu, freq->new, max);
+
 	return NOTIFY_OK;
 }

@@ -709,11 +734,35 @@ static struct notifier_block cpufreq_notifier = {
 	.notifier_call = cpufreq_callback,
 };

+static int cpufreq_policy_callback(struct notifier_block *nb,
+				   unsigned long val, void *data)
+{
+	struct cpufreq_policy *policy = data;
+	int i;
+
+	for_each_cpu(i, policy->cpus) {
+		scale_freq_capacity(i, policy->cur, policy->max);
+		atomic_long_set(&per_cpu(cpu_max_freq, i), policy->max);
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_policy_notifier = {
+	.notifier_call = cpufreq_policy_callback,
+};
+
 static int __init register_cpufreq_notifier(void)
 {
-	return cpufreq_register_notifier(&cpufreq_notifier,
+	int ret;
+
+	ret = cpufreq_register_notifier(&cpufreq_notifier,
 					CPUFREQ_TRANSITION_NOTIFIER);
+	if (ret)
+		return ret;
+
+	return cpufreq_register_notifier(&cpufreq_policy_notifier,
+					CPUFREQ_POLICY_NOTIFIER);
 }
 core_initcall(register_cpufreq_notifier);
-
 #endif
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 08b7847..9c09e6e 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -169,6 +169,23 @@ static void update_cpu_capacity(unsigned int cpu)
 		cpu, arch_scale_cpu_capacity(NULL, cpu));
 }

+/*
+ * Scheduler load-tracking scale-invariance
+ *
+ * Provides the scheduler with a scale-invariance correction factor that
+ * compensates for frequency scaling (arch_scale_freq_capacity()). The scaling
+ * factor is updated in smp.c
+ */
+unsigned long arm_arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
+{
+	unsigned long curr = atomic_long_read(&per_cpu(cpu_freq_capacity, cpu));
+
+	if (!curr)
+		return SCHED_CAPACITY_SCALE;
+
+	return curr;
+}
+
 #else
 static inline void parse_dt_topology(void) {}
 static inline void update_cpu_capacity(unsigned int cpuid) {}
--
1.9.1
On 04/27/2015 09:46 AM, Michael Turquette wrote:
> From: Morten Rasmussen <Morten.Rasmussen@arm.com>
>
> Implements an arch-specific function to provide the scheduler with a
> frequency scaling correction factor for more accurate load-tracking.
>
> [...]
>
> +#define arch_scale_freq_capacity arm_arch_scale_freq_capacity

What is this macro for?

> +struct sched_domain;
> +extern
> +unsigned long arm_arch_scale_freq_capacity(struct sched_domain *sd, int cpu);
> +
> +DECLARE_PER_CPU(atomic_long_t, cpu_freq_capacity);

IMO cpu_freq_capacity should be statically declared in the core code and
modified/inspected through accessors also in the core code, e.g.:

	sched_cpu_freq_capacity_set(int cpu, unsigned long freq_capacity);
	unsigned long sched_cpu_freq_capacity_get(int cpu);
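A minimal sketch of what such core-code accessors might look like (the per-cpu variable name and its placement are hypothetical; only the accessor names come from the suggestion above):

	/* hypothetical storage in, e.g., kernel/sched/core.c */
	static DEFINE_PER_CPU(atomic_long_t, sched_freq_capacity);

	void sched_cpu_freq_capacity_set(int cpu, unsigned long freq_capacity)
	{
		atomic_long_set(&per_cpu(sched_freq_capacity, cpu), freq_capacity);
	}

	unsigned long sched_cpu_freq_capacity_get(int cpu)
	{
		return atomic_long_read(&per_cpu(sched_freq_capacity, cpu));
	}

The arch notifier code would then call the setter, and arch_scale_freq_capacity() the getter, keeping the per-cpu variable out of the arch headers.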
> +static DEFINE_PER_CPU(atomic_long_t, cpu_max_freq);

In the code, cpu_max_freq is only used to update the frequency-invariance
factor. Wouldn't it be simpler to use cpufreq_quick_get_max(int cpu)
directly instead of declaring another per cpu variable?
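One possible reading of that suggestion, as a sketch (this moves the max lookup inside the helper; cpufreq_quick_get_max() returns 0 when no policy is available, which the existing !max check already covers):

	static inline void scale_freq_capacity(int cpu, unsigned long curr)
	{
		unsigned long max = cpufreq_quick_get_max(cpu);

		if (!max)
			return;

		atomic_long_set(&per_cpu(cpu_freq_capacity, cpu),
				(curr << SCHED_CAPACITY_SHIFT) / max);
	}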
> +	scale_freq_capacity(cpu, freq->new, max);

scale_freq_capacity(cpu, cpufreq_quick_get_max(cpu)) ?

> +		scale_freq_capacity(i, policy->cur, policy->max);

scale_freq_capacity(cpu, cpufreq_quick_get_max(cpu)) ?

> +		atomic_long_set(&per_cpu(cpu_max_freq, i), policy->max);

The atomic_long_set() would then no longer be needed.

> +	if (!curr)
> +		return SCHED_CAPACITY_SCALE;

Why not initialize 'cpu_freq_capacity' with the right value, so that !curr
can't happen?
On 29 April 2015 at 12:34, Daniel Lezcano <daniel.lezcano@linaro.org> wrote:
> On 04/27/2015 09:46 AM, Michael Turquette wrote:
>> +#define arch_scale_freq_capacity arm_arch_scale_freq_capacity
>
> What is this macro for?

This is used so that no useless computation is added to the hot path when
arch_scale_freq_capacity is not used. This was asked for by Peter.

> IMO cpu_freq_capacity should be statically declared in the core code and
> modified/inspected through accessors also in the core code.

Peter asked that arm_arch_scale_freq_capacity not add any additional
instruction when it is not used by an arch, because it's on the very hot
path of the scheduler.

> +unsigned long arm_arch_scale_freq_capacity(struct sched_domain *sd, int cpu)

I wonder if you should rather move arm_arch_scale_freq_capacity into
arch/arm/kernel/smp.c, as all the other functions and variables are there.
This would allow you to remove the
DECLARE_PER_CPU(atomic_long_t, cpu_freq_capacity) from the topology.h file.
On 04/29/2015 01:15 PM, Vincent Guittot wrote:
>>> +#define arch_scale_freq_capacity arm_arch_scale_freq_capacity
>>
>> What is this macro for?
>
> This is used so that no useless computation is added to the hot path when
> arch_scale_freq_capacity is not used. This was asked for by Peter.

What is the difference with having a dummy empty function with a 'weak'
attribute (which is how it is done currently in the kernel)?
On 29 April 2015 at 14:22, Daniel Lezcano <daniel.lezcano@linaro.org> wrote:
>> This is used so that no useless computation is added to the hot path when
>> arch_scale_freq_capacity is not used. This was asked for by Peter.
>
> What is the difference with having a dummy empty function with a 'weak'
> attribute (which is how it is done currently in the kernel)?

You can have a look at the thread for the full discussion:
https://lkml.org/lkml/2015/3/24/113
On 04/29/2015 02:34 PM, Vincent Guittot wrote:
>> What is the difference with having a dummy empty function with a 'weak'
>> attribute (which is how it is done currently in the kernel)?
>
> You can have a look at the thread for the full discussion:
> https://lkml.org/lkml/2015/3/24/113

Thanks for the pointer. The link seems to be down for the moment, but I was
able to dig through the different folders and find the thread in my mailbox
(that would have been easier if I had been cc'ed).

It is still not clear to me why the macro is better than 'weak'. It sounds
like using the 'weak' attribute is the best thing to do, no?
On 29 April 2015 at 15:10, Daniel Lezcano <daniel.lezcano@linaro.org> wrote:
> It is still not clear to me why the macro is better than 'weak'. It sounds
> like using the 'weak' attribute is the best thing to do, no?

The weak function adds the useless sequence:

	value *= 1024;
	value >>= 10;

whereas the macro doesn't add any additional instruction.
On 04/29/2015 03:17 PM, Vincent Guittot wrote:
> The weak function adds the useless sequence:
>
> 	value *= 1024;
> 	value >>= 10;
>
> whereas the macro doesn't add any additional instruction.

So if the macro is not defined for the architecture, the compilation will
fail. I don't see why the function below is not right:

	unsigned long __weak
	arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
	{
		return SCHED_CAPACITY_SCALE;
	}
On 29 April 2015 at 15:27, Daniel Lezcano <daniel.lezcano@linaro.org> wrote:
> So if the macro is not defined for the architecture, the compilation will
> fail.

No, there is a fallback in sched.h:

	#ifndef arch_scale_freq_capacity
	static __always_inline
	unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
	{
		return SCHED_CAPACITY_SCALE;
	}
	#endif

> I don't see why the function below is not right:
>
> 	unsigned long __weak
> 	arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
> 	{
> 		return SCHED_CAPACITY_SCALE;
> 	}

AFAIU, it doesn't ensure that the function will be inlined and optimized by
the compiler. I did several tests during the thread discussion, and only
the chosen solution was able to remove the useless sequence:

	value *= 1024;
	value >>= 10;
On 04/29/2015 03:33 PM, Vincent Guittot wrote:
> No, there is a fallback in sched.h:
>
> 	#ifndef arch_scale_freq_capacity
> 	static __always_inline
> 	unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
> 	{
> 		return SCHED_CAPACITY_SCALE;
> 	}
> 	#endif

Ah, ok. I didn't see this one.
On Wed, Apr 29, 2015 at 03:12:38PM +0100, Daniel Lezcano wrote:
>> I don't see why the function below is not right:
>>
>> 	unsigned long __weak
>> 	arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
>> 	{
>> 		return SCHED_CAPACITY_SCALE;
>> 	}
AFAIU, the __weak attribute guarantees that the function cannot be inlined at all. __weak symbols are resolved at link time, not at compile time, hence they cannot be inlined. So Daniel's example above, which is pretty much what we used to have prior to 4.1, will always lead to a function call. On top of that, the compiler has no chance of figuring out that the function always returns a constant, which is paired with a shift or division at every call site in fair.c and could all have been optimized out.

The __weak trick was fine while we didn't call arch_scale_freq_capacity() very often. Now that we call it all the time, we have to minimize the overhead, and the #define trick Peter came up with seems to be the only feasible way to get zero overhead when the architecture doesn't care about frequency scaling of capacity.
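To make the difference concrete, a minimal sketch (the call site below is illustrative, with made-up variable names, not the exact fair.c code):

	/* sched.h fallback, picked only when the arch does not
	 * #define arch_scale_freq_capacity to its own implementation */
	#ifndef arch_scale_freq_capacity
	static __always_inline
	unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
	{
		return SCHED_CAPACITY_SCALE;	/* 1 << SCHED_CAPACITY_SHIFT */
	}
	#endif

	/* typical hot-path use: with the inline above the compiler folds
	 * (delta * 1024) >> 10 back to plain 'delta'; with a __weak function
	 * the call, the multiply and the shift all survive */
	contrib = (delta * arch_scale_freq_capacity(NULL, cpu))
					>> SCHED_CAPACITY_SHIFT;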
Morten
This patch introduces the SCHED_ENERGY_FREQ sched feature, which is implemented using jump labels when SCHED_DEBUG is defined. It is statically set to false when SCHED_DEBUG is not defined and thus disabled by default.
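With SCHED_DEBUG set, the feature can then be flipped at run time through the standard sched_features debugfs interface, for example:

	echo SCHED_ENERGY_FREQ > /sys/kernel/debug/sched_features
	echo NO_SCHED_ENERGY_FREQ > /sys/kernel/debug/sched_features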
Signed-off-by: Michael Turquette <mturquette@linaro.org>
---
changes since internal v1: none

 kernel/sched/fair.c     | 5 +++++
 kernel/sched/features.h | 6 ++++++
 2 files changed, 11 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 46855d0..75aec8d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4207,6 +4207,11 @@ static inline void hrtick_update(struct rq *rq)
 }
 #endif

+static inline bool sched_energy_freq(void)
+{
+	return sched_feat(SCHED_ENERGY_FREQ);
+}
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 91e33cd..77381cf 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -96,3 +96,9 @@ SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
  */
 SCHED_FEAT(NUMA_RESIST_LOWER, false)
 #endif
+
+/*
+ * Scheduler-driven CPU frequency selection aimed to save energy based on
+ * load tracking
+ */
+SCHED_FEAT(SCHED_ENERGY_FREQ, false)
--
1.9.1
get_cpu_usage and capacity_orig_of are useful for a cpu frequency scaling policy which is based on cfs load tracking and cpu capacity metrics. Expose these calls in sched.h so that they can be used in such a policy.
Signed-off-by: Michael Turquette <mturquette@linaro.org>
---
changes since internal v1:
* exported capacity_orig_of

 kernel/sched/fair.c  | 4 ++--
 kernel/sched/sched.h | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 75aec8d..393fc36 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4366,7 +4366,7 @@ static unsigned long capacity_of(int cpu)
 	return cpu_rq(cpu)->cpu_capacity;
 }

-static unsigned long capacity_orig_of(int cpu)
+unsigned long capacity_orig_of(int cpu)
 {
 	return cpu_rq(cpu)->cpu_capacity_orig;
 }
@@ -4801,7 +4801,7 @@ done:
  * Without capping the usage, a group could be seen as overloaded (CPU0 usage
  * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
  */
-static int get_cpu_usage(int cpu)
+int get_cpu_usage(int cpu)
 {
 	unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
 	unsigned long capacity = capacity_orig_of(cpu);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e1299..63a8be9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1396,6 +1396,9 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 }
 #endif

+int get_cpu_usage(int cpu);
+unsigned long capacity_orig_of(int cpu);
+
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
 	rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
--
1.9.1
On 04/27/2015 09:46 AM, Michael Turquette wrote:
> get_cpu_usage and capacity_orig_of are useful for a cpu frequency scaling
> policy which is based on cfs load tracking and cpu capacity metrics.
> Expose these calls in sched.h so that they can be used in such a policy.
>
> [...]
>
> +int get_cpu_usage(int cpu);
> +unsigned long capacity_orig_of(int cpu);

This one could be changed to a static inline in the header file, no?
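For example, a sketch of the static inline variant (the body is lifted straight from fair.c; this assumes cpu_rq() is visible at that point in sched.h):

	static inline unsigned long capacity_orig_of(int cpu)
	{
		return cpu_rq(cpu)->cpu_capacity_orig;
	}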
Quoting Daniel Lezcano (2015-04-29 05:23:18)
On 04/27/2015 09:46 AM, Michael Turquette wrote:
get_cpu_usage and capacity_orig_of are useful for a cpu frequency scaling policy which is based on cfs load tracking and cpu capacity metrics. Expose these calls in sched.h so that they can be used in such a policy.
Signed-off-by: Michael Turquette mturquette@linaro.org
changes since internal v1:
exported capacity_orig_of
kernel/sched/fair.c | 4 ++-- kernel/sched/sched.h | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 75aec8d..393fc36 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4366,7 +4366,7 @@ static unsigned long capacity_of(int cpu) return cpu_rq(cpu)->cpu_capacity; }
-static unsigned long capacity_orig_of(int cpu) +unsigned long capacity_orig_of(int cpu) { return cpu_rq(cpu)->cpu_capacity_orig; } @@ -4801,7 +4801,7 @@ done:
- Without capping the usage, a group could be seen as overloaded (CPU0 usage
- at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
*/ -static int get_cpu_usage(int cpu) +int get_cpu_usage(int cpu) { unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; unsigned long capacity = capacity_orig_of(cpu); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e0e1299..63a8be9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1396,6 +1396,9 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) } #endif
+int get_cpu_usage(int cpu); +unsigned long capacity_orig_of(int cpu);
This one could be changed to a static inline in the header file, no?
Yes it could. I'll update the patch.
Regards, Mike
- static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
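For reference, Daniel's suggestion would look something like this in sched.h (a sketch only, reusing the one-line body from the diff; get_cpu_usage() could move over the same way, and this is kernel-internal code, not standalone):

/* kernel/sched/sched.h */
static inline unsigned long capacity_orig_of(int cpu)
{
	return cpu_rq(cpu)->cpu_capacity_orig;
}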
Scheduler-driven cpu frequency selection is desirable as part of the on-going effort to make the scheduler better aware of energy consumption. No piece of the Linux kernel has a better view of the factors that affect a cpu frequency selection policy than the scheduler[0], and this patch is an attempt to get that discussion going again.
This patch implements a cpufreq governor, sched_cfs, that directly accesses scheduler statistics, in particular the pelt data from cfs via the get_cpu_usage() function.
Put plainly, sched_cfs selects the lowest cpu frequency that will prevent a runqueue from being over-utilized (until we hit the highest frequency of course). This is done by requesting a frequency which is equivalent to the current capacity utilization, plus a margin.
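As a concrete illustration of that mapping, here is a standalone sketch of the selection arithmetic (the 1024 scale mirrors the scheduler's capacity units and the 125 mirrors the margin used below; the frequency and utilization numbers are made up):

#include <stdio.h>

#define CAPACITY_SCALE 1024	/* scheduler capacity units */
#define MARGIN_PCT 125		/* utilization + 25% headroom */

int main(void)
{
	unsigned long max_freq = 2000000;	/* kHz, hypothetical policy->max */
	unsigned long usage = 410;		/* ~40% of capacity */
	unsigned long target = usage * MARGIN_PCT / 100;	/* 512 */
	unsigned long freq = target * max_freq / CAPACITY_SCALE;

	printf("requested %lu kHz of %lu kHz max\n", freq, max_freq); /* 1000000 */
	return 0;
}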
Unlike the previous posting from 2014[1] this governor implements a "follow the usage" method, where usage is defined as the cpu frequency-invariant product of utilization_load_avg and cpu_capacity_orig.
This governor is event-driven. There is no polling loop to check cpu idle time, or any other method which is unsynchronized with the scheduler. The entry points for this policy are in fair.c: enqueue_task_fair, dequeue_task_fair and task_tick_fair.
This policy is implemented using the cpufreq governor interface for two main reasons:
1) re-using the cpufreq machine drivers without using the governor interface is hard.
2) using the cpufreq interface allows us to switch between the scheduler-driven policy and legacy cpufreq governors such as ondemand at run-time (e.g. by writing the governor name to the policy's scaling_governor attribute in sysfs). This is very useful for comparative testing and tuning.
Finally, it is worth mentioning that this approach neglects all scheduling classes except for cfs. It is possible to add support for deadline and other classes here, but I also wonder if a multi-governor approach would be a more maintainable solution, where the cpufreq core aggregates the constraints set by multiple governors. Supporting such an approach in the cpufreq core would also allow for peripheral devices to place constraints on cpu frequency without having to hack such behavior in at the governor level.
Thanks to Juri Lelli juri.lelli@arm.com for doing a good bit of testing, bug fixing and contributing towards the design.
[0] http://article.gmane.org/gmane.linux.kernel/1499836 [1] https://lkml.org/lkml/2014/10/22/22
Signed-off-by: Michael Turquette mturquette@linaro.org --- changes since internal v1: * renamed everything * fixed possible deadlock between gov_cfs_thread and gov_cfs_stop * replaced direct usage-to-frequency mapping with usage+margin-to-frequency mapping. This functions like an up_threshold and allows us to easily work with non-discretized frequency ranges * usage-to-frequency calculation now uses capacity_orig instead of SCHED_LOAD_SCALE to handle SMT and asymmetric cpu use cases * dropped workqueue method due to instability * kthread is woken up by irq_work handler. This removes the need for cap_gov_kick_thread() from v1
drivers/cpufreq/Kconfig | 24 +++ include/linux/cpufreq.h | 3 + kernel/sched/Makefile | 1 + kernel/sched/cpufreq_sched_cfs.c | 314 +++++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 11 ++ kernel/sched/sched.h | 6 + 6 files changed, 359 insertions(+) create mode 100644 kernel/sched/cpufreq_sched_cfs.c
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index a171fef..35ba9c3 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -102,6 +102,15 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE Be aware that not all cpufreq drivers support the conservative governor. If unsure have a look at the help section of the driver. Fallback governor will be the performance governor. + +config CPU_FREQ_DEFAULT_GOV_SCHED_CFS + bool "sched_cfs" + select CPU_FREQ_GOV_SCHED_CFS + select CPU_FREQ_GOV_PERFORMANCE + help + Use the CPUfreq governor 'sched_cfs' as default. This scales + cpu frequency from the scheduler as per-entity load tracking + statistics are updated. endchoice
config CPU_FREQ_GOV_PERFORMANCE @@ -183,6 +192,21 @@ config CPU_FREQ_GOV_CONSERVATIVE
If in doubt, say N.
+config CPU_FREQ_GOV_SCHED_CFS + tristate "'sched cfs' cpufreq governor" + depends on CPU_FREQ + select CPU_FREQ_GOV_COMMON + help + 'sched_cfs' - this governor scales cpu frequency from the + scheduler as a function of cpu capacity utilization. It does + not evaluate utilization on a periodic basis (as ondemand + does) but instead is invoked from the completely fair + scheduler when updating per-entity load tracking statistics. + Latency to respond to changes in load is improved over polling + governors due to its event-driven design. + + If in doubt, say N. + comment "CPU frequency scaling drivers"
config CPUFREQ_DT diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 2ee4888..62e8152 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -485,6 +485,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE) extern struct cpufreq_governor cpufreq_gov_conservative; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_CFS) +extern struct cpufreq_governor cpufreq_gov_cfs; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_cfs) #endif
/********************************************************************* diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 46be870..003b592 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHED_CFS) += cpufreq_sched_cfs.o diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c new file mode 100644 index 0000000..746b220 --- /dev/null +++ b/kernel/sched/cpufreq_sched_cfs.c @@ -0,0 +1,314 @@ +/* + * Copyright (C) 2015 Michael Turquette mturquette@linaro.org + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/cpufreq.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/percpu.h> +#include <linux/irq_work.h> + +#include "sched.h" + +#define MARGIN_PCT 125 /* taken from imbalance_pct = 125 */ +#define THROTTLE_NSEC 50000000 /* 50ms default */ + +/** + * gov_data - per-policy data internal to the governor + * @throttle: next throttling period expiry. Derived from throttle_nsec + * @throttle_nsec: throttle period length in nanoseconds + * @task: worker thread for dvfs transition that may block/sleep + * @irq_work: callback used to wake up worker thread + * + * struct gov_data is the per-policy gov_cfs-specific data structure. A + * per-policy instance of it is created when the gov_cfs governor receives + * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data + * member of struct cpufreq_policy. + * + * Readers of this data must call down_read(policy->rwsem). Writers must + * call down_write(policy->rwsem). + */ +struct gov_data { + ktime_t throttle; + unsigned int throttle_nsec; + struct task_struct *task; + struct irq_work irq_work; + struct cpufreq_policy *policy; +}; + +/** + * gov_cfs_select_freq - pick the next frequency for a cpu + * @policy: the cpufreq policy whose frequency may be changed + * + * gov_cfs_select_freq selects a frequency based on pelt load statistics + * tracked by cfs. First it finds the most utilized cpu in the policy and then + * maps that utilization value onto a cpu frequency and returns it. + * + * Additionally, gov_cfs_select_freq adds a margin to the cpu utilization value + * before converting it to a frequency. The margin is derived from MARGIN_PCT, + * which itself is inspired by imbalance_pct in cfs. This is needed to + * proactively increase frequency in the case of increasing load. + * + * This approach attempts to maintain headroom of 25% unutilized cpu capacity. + * A traditional way of doing this is to take 75% of the current capacity and + * check if current utilization crosses that threshold. The only problem with + * that approach is determining the next cpu frequency target if that threshold + * is crossed. + * + * Instead of using the 75% threshold, gov_cfs_select_freq adds a 25% + * utilization margin to the utilization and converts that to a frequency. This + * removes conditional logic around checking thresholds and better supports + * drivers that use non-discretized frequency ranges (i.e. no pre-defined + * frequency tables or operating points). + * + * Returns frequency selected. 
+ */ +static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy) +{ + int cpu = 0; + struct gov_data *gd; + unsigned long freq = 0, max_usage = 0, usage = 0; + + if (!policy->governor_data) + goto out; + + gd = policy->governor_data; + + /* + * get_cpu_usage is called without locking the runqueues. This is the + * same behavior used by find_busiest_cpu in load_balance. We are + * willing to accept occasionally stale data here in exchange for + * lockless behavior. + */ + for_each_cpu(cpu, policy->cpus) { + usage = get_cpu_usage(cpu); + if (usage > max_usage) + max_usage = usage; + } + + /* add margin to max_usage based on imbalance_pct */ + max_usage = max_usage * MARGIN_PCT / 100; + + cpu = cpumask_first(policy->cpus); + + /* freq is current utilization + 25% */ + freq = max_usage * policy->max / capacity_orig_of(cpu); + +out: + return freq; +} + +/* + * we pass in struct cpufreq_policy. This is safe because changing out the + * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP), + * which tears down all of the data structures and __cpufreq_governor(policy, + * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the + * new policy pointer + */ +static int gov_cfs_thread(void *data) +{ + struct sched_param param; + struct cpufreq_policy *policy; + struct gov_data *gd; + unsigned long freq; + int ret; + + policy = (struct cpufreq_policy *) data; + if (!policy) { + pr_warn("%s: missing policy\n", __func__); + do_exit(-EINVAL); + } + + gd = policy->governor_data; + if (!gd) { + pr_warn("%s: missing governor data\n", __func__); + do_exit(-EINVAL); + } + + param.sched_priority = 50; + ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, ¶m); + if (ret) { + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + do_exit(-EINVAL); + } else { + pr_debug("%s: kthread (%d) set to SCHED_FIFO\n", + __func__, gd->task->pid); + } + + ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus); + if (ret) { + pr_warn("%s: failed to set allowed ptr\n", __func__); + do_exit(-EINVAL); + } + + /* main loop of the per-policy kthread */ + do { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + if (kthread_should_stop()) + break; + + /* avoid race with gov_cfs_stop */ + if (!down_write_trylock(&policy->rwsem)) + continue; + + freq = gov_cfs_select_freq(policy); + + ret = __cpufreq_driver_target(policy, freq, + CPUFREQ_RELATION_H); + if (ret) + pr_debug("%s: __cpufreq_driver_target returned %d\n", + __func__, ret); + + gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec); + up_write(&policy->rwsem); + } while (!kthread_should_stop()); + + do_exit(0); +} + +static void gov_cfs_irq_work(struct irq_work *irq_work) +{ + struct gov_data *gd; + + gd = container_of(irq_work, struct gov_data, irq_work); + if (!gd) { + return; + } + + wake_up_process(gd->task); +} + +/** + * gov_cfs_update_cpu - interface to scheduler for changing capacity values + * @cpu: cpu whose capacity utilization has recently changed + * + * gov_cfs_udpate_cpu is an interface exposed to the scheduler so that the + * scheduler may inform the governor of updates to capacity utilization and + * make changes to cpu frequency. Currently this interface is designed around + * PELT values in CFS. It can be expanded to other scheduling classes in the + * future if needed. + * + * gov_cfs_update_cpu raises an IPI. The irq_work handler for that IPI wakes up + * the thread that does the actual work, gov_cfs_thread. 
+ */ +void gov_cfs_update_cpu(int cpu) +{ + struct cpufreq_policy *policy; + struct gov_data *gd; + + /* XXX put policy pointer in per-cpu data? */ + policy = cpufreq_cpu_get(cpu); + if (IS_ERR_OR_NULL(policy)) { + return; + } + + if (!policy->governor_data) { + goto out; + } + + gd = policy->governor_data; + + /* bail early if we are throttled */ + if (ktime_before(ktime_get(), gd->throttle)) { + goto out; + } + + irq_work_queue_on(&gd->irq_work, cpu); + +out: + cpufreq_cpu_put(policy); + return; +} + +static void gov_cfs_start(struct cpufreq_policy *policy) +{ + struct gov_data *gd; + + /* prepare per-policy private data */ + gd = kzalloc(sizeof(*gd), GFP_KERNEL); + if (!gd) { + pr_debug("%s: failed to allocate private data\n", __func__); + return; + } + + /* + * Don't ask for freq changes at an higher rate than what + * the driver advertises as transition latency. + */ + gd->throttle_nsec = policy->cpuinfo.transition_latency ? + policy->cpuinfo.transition_latency : + THROTTLE_NSEC; + pr_debug("%s: throttle threshold = %u [ns]\n", + __func__, gd->throttle_nsec); + + /* init per-policy kthread */ + gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task"); + if (IS_ERR_OR_NULL(gd->task)) + pr_err("%s: failed to create kgov_cfs_task thread\n", __func__); + + init_irq_work(&gd->irq_work, gov_cfs_irq_work); + policy->governor_data = gd; + gd->policy = policy; +} + +static void gov_cfs_stop(struct cpufreq_policy *policy) +{ + struct gov_data *gd; + + gd = policy->governor_data; + kthread_stop(gd->task); + + policy->governor_data = NULL; + + /* FIXME replace with devm counterparts? */ + kfree(gd); +} + +static int gov_cfs_setup(struct cpufreq_policy *policy, unsigned int event) +{ + switch (event) { + case CPUFREQ_GOV_START: + /* Start managing the frequency */ + gov_cfs_start(policy); + return 0; + + case CPUFREQ_GOV_STOP: + gov_cfs_stop(policy); + return 0; + + case CPUFREQ_GOV_LIMITS: /* unused */ + case CPUFREQ_GOV_POLICY_INIT: /* unused */ + case CPUFREQ_GOV_POLICY_EXIT: /* unused */ + break; + } + return 0; +} + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_CFS +static +#endif +struct cpufreq_governor cpufreq_gov_cfs = { + .name = "gov_cfs", + .governor = gov_cfs_setup, + .owner = THIS_MODULE, +}; + +static int __init gov_cfs_init(void) +{ + return cpufreq_register_governor(&cpufreq_gov_cfs); +} + +static void __exit gov_cfs_exit(void) +{ + cpufreq_unregister_governor(&cpufreq_gov_cfs); +} + +/* Try to make this the default governor */ +fs_initcall(gov_cfs_init); + +MODULE_LICENSE("GPL"); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 393fc36..a7b97f9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4257,6 +4257,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_rq_runnable_avg(rq, rq->nr_running); add_nr_running(rq, 1); } + + if(sched_energy_freq()) + gov_cfs_update_cpu(cpu_of(rq)); + hrtick_update(rq); }
@@ -4318,6 +4322,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) sub_nr_running(rq, 1); update_rq_runnable_avg(rq, 1); } + + if(sched_energy_freq()) + gov_cfs_update_cpu(cpu_of(rq)); + hrtick_update(rq); }
@@ -7821,6 +7829,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr);
update_rq_runnable_avg(rq, 1); + + if(sched_energy_freq()) + gov_cfs_update_cpu(cpu_of(rq)); }
/* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 63a8be9..ec23523 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1399,6 +1399,12 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) int get_cpu_usage(int cpu); unsigned long capacity_orig_of(int cpu);
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS +void gov_cfs_update_cpu(int cpu); +#else +static inline void gov_cfs_update_cpu(int cpu) {} +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); -- 1.9.1
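To summarize the moving parts for reviewers, here is a rough userspace approximation of the event path (plain function calls stand in for the scheduler hook, the irq_work and the kthread; nothing here is the actual kernel API, and in the real code the throttle window is re-armed by the kthread, not the caller):

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define THROTTLE_NSEC 50000000ULL	/* 50ms default, as in the patch */

static uint64_t throttle_expiry;	/* stands in for gd->throttle */

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* stands in for gov_cfs_update_cpu(), called from the fair.c hooks */
static void update_cpu(int cpu)
{
	if (now_ns() < throttle_expiry)
		return;			/* bail early if we are throttled */

	/* real code: irq_work_queue_on() -> gov_cfs_irq_work() ->
	 * wake_up_process(gd->task); the kthread then computes a
	 * frequency and calls __cpufreq_driver_target() */
	printf("cpu%d: evaluate frequency\n", cpu);

	throttle_expiry = now_ns() + THROTTLE_NSEC;
}

int main(void)
{
	update_cpu(0);	/* first event: frequency evaluation */
	update_cpu(0);	/* inside the 50ms window: throttled, no-op */
	return 0;
}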
Hi Mike,
On 27/04/15 08:46, Michael Turquette wrote:
Scheduler-driven cpu frequency selection is desirable as part of the on-going effort to make the scheduler better aware of energy consumption. No piece of the Linux kernel has a better view of the factors that affect a cpu frequency selection policy than the scheduler[0], and this patch is an attempt to get that discussion going again.
This patch implements a cpufreq governor, sched_cfs, that directly accesses scheduler statistics, in particular the pelt data from cfs via the get_cpu_usage() function.
Put plainly, sched_cfs selects the lowest cpu frequency that will prevent a runqueue from being over-utilized (until we hit the highest frequency of course). This is done by requestiong a frequency which is
requesting ^
equivalent to the current capacity utilization, plus a margin.
Unlike the previous posting from 2014[1] this governor implements a "follow the usage" method, where usage is defined as the cpu frequency-invariant product of utilization_load_avg and cpu_capacity_orig.
This governor is event-driven. There is no polling loop to check cpu idle time, or any other method which is unsynchronized with the scheduler. The entry points for this policy are in fair.c: enqueue_task_fair, dequeue_task_fair and task_tick_fair.
This policy is implemented using the cpufreq governor interface for two main reasons:
- re-using the cpufreq machine drivers without using the governor
interface is hard.
- using the cpufreq interface allows us to switch between the
scheduler-driven policy and legacy cpufreq governors such as ondemand at run-time. This is very useful for comparative testing and tuning.
Finally, it is worth mentioning that this approach neglects all scheduling classes except for cfs. It is possible to add support for deadline and other classes here, but I also wonder if a multi-governor approach would be a more maintainable solution, where the cpufreq core aggregates the constraints set by multiple governors. Supporting such an approach in the cpufreq core would also allow for peripheral devices to place constraints on cpu frequency without having to hack such behavior in at the governor level.
Thanks to Juri Lelli juri.lelli@arm.com for doing a good bit of testing, bug fixing and contributing towards the design.
[0] http://article.gmane.org/gmane.linux.kernel/1499836 [1] https://lkml.org/lkml/2014/10/22/22
Signed-off-by: Michael Turquette mturquette@linaro.org
changes since internal v1:
- renamed everything
- fixed possible deadlock between gov_cfs_thread and gov_cfs_stop
- replaced direct usage-to-frequency mapping with usage+margin-to-frequency mapping. This functions like an up_threshold and allows us to easily work with non-discretized frequency ranges
- usage-to-frequency calculation now uses capacity_orig instead of SCHED_LOAD_SCALE to handle SMT and asymmetric cpu use cases
- dropped workqueue method due to instability
- kthread is woken up by irq_work handler. This removes the need for cap_gov_kick_thread() from v1
drivers/cpufreq/Kconfig | 24 +++ include/linux/cpufreq.h | 3 + kernel/sched/Makefile | 1 + kernel/sched/cpufreq_sched_cfs.c | 314 +++++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 11 ++ kernel/sched/sched.h | 6 + 6 files changed, 359 insertions(+) create mode 100644 kernel/sched/cpufreq_sched_cfs.c
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index a171fef..35ba9c3 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -102,6 +102,15 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE Be aware that not all cpufreq drivers support the conservative governor. If unsure have a look at the help section of the driver. Fallback governor will be the performance governor.
+config CPU_FREQ_DEFAULT_GOV_SCHED_CFS
bool "sched_cfs"
select CPU_FREQ_GOV_SCHED_CFS
select CPU_FREQ_GOV_PERFORMANCE
help
Use the CPUfreq governor 'sched_cfs' as default. This scales
cpu frequency from the scheduler as per-entity load tracking
statistics are updated.
endchoice
config CPU_FREQ_GOV_PERFORMANCE @@ -183,6 +192,21 @@ config CPU_FREQ_GOV_CONSERVATIVE
If in doubt, say N.
+config CPU_FREQ_GOV_SCHED_CFS
tristate "'sched cfs' cpufreq governor"
depends on CPU_FREQ
Also CONFIG_IRQ_WORK is a dependency.
select CPU_FREQ_GOV_COMMON
help
'sched_cfs' - this governor scales cpu frequency from the
scheduler as a function of cpu capacity utilization. It does
not evaluate utilization on a periodic basis (as ondemand
does) but instead is invoked from the completely fair
scheduler when updating per-entity load tracking statistics.
Latency to respond to changes in load is improved over polling
governors due to its event-driven design.
If in doubt, say N.
comment "CPU frequency scaling drivers"
config CPUFREQ_DT diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 2ee4888..62e8152 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -485,6 +485,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE) extern struct cpufreq_governor cpufreq_gov_conservative; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CAP_GOV) +extern struct cpufreq_governor cpufreq_gov_cap_gov; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_cap_gov) #endif
/********************************************************************* diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 46be870..003b592 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHED_CFS) += cpufreq_sched_cfs.o diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c new file mode 100644 index 0000000..746b220 --- /dev/null +++ b/kernel/sched/cpufreq_sched_cfs.c @@ -0,0 +1,314 @@ +/*
- Copyright (C) 2015 Michael Turquette mturquette@linaro.org
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License version 2 as
- published by the Free Software Foundation.
- */
+#include <linux/cpufreq.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/percpu.h>
We don't need this anymore (at least for now), right?
+#include <linux/irq_work.h>
+#include "sched.h"
+#define MARGIN_PCT 125 /* taken from imbalance_pct = 125 */ +#define THROTTLE_NSEC 50000000 /* 50ms default */
+/**
- gov_data - per-policy data internal to the governor
- @throttle: next throttling period expiry. Derived from throttle_nsec
- @throttle_nsec: throttle period length in nanoseconds
- @task: worker thread for dvfs transition that may block/sleep
- @irq_work: callback used to wake up worker thread
- struct gov_data is the per-policy gov_cfs-specific data structure. A
- per-policy instance of it is created when the gov_cfs governor receives
- the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
- member of struct cpufreq_policy.
- Readers of this data must call down_read(policy->rwsem). Writers must
- call down_write(policy->rwsem).
- */
+struct gov_data {
ktime_t throttle;
unsigned int throttle_nsec;
struct task_struct *task;
struct irq_work irq_work;
struct cpufreq_policy *policy;
+};
+/**
- gov_cfs_select_freq - pick the next frequency for a cpu
- @policy: the cpufreq policy whose frequency may be changed
- gov_cfs_select_freq selects a frequency based on pelt load statistics
- tracked by cfs. First it finds the most utilized cpu in the policy and then
- maps that utilization value onto a cpu frequency and returns it.
- Additionally, gov_cfs_select_freq adds a margin to the cpu utilization value
- before converting it to a frequency. The margin is derived from MARGIN_PCT,
- which itself is inspired by imbalance_pct in cfs. This is needed to
- proactively increase frequency in the case of increasing load.
utilization? ^
- This approach attempts to maintain headroom of 25% unutilized cpu capacity.
- A traditional way of doing this is to take 75% of the current capacity and
- check if current utilization crosses that threshold. The only problem with
- that approach is determining the next cpu frequency target if that threshold
- is crossed.
- Instead of using the 75% threshold, gov_cfs_select_freq adds a 25%
- utilization margin to the utilization and converts that to a frequency. This
- removes conditional logic around checking thresholds and better supports
- drivers that use non-discretized frequency ranges (i.e. no pre-defined
- frequency tables or operating points).
- Returns frequency selected.
- */
+static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy) +{
int cpu = 0;
struct gov_data *gd;
unsigned long freq = 0, max_usage = 0, usage = 0;
if (!policy->governor_data)
goto out;
gd = policy->governor_data;
/*
* get_cpu_usage is called without locking the runqueues. This is the
* same behavior used by find_busiest_cpu in load_balance. We are
* willing to accept occasionally stale data here in exchange for
* lockless behavior.
*/
for_each_cpu(cpu, policy->cpus) {
usage = get_cpu_usage(cpu);
if (usage > max_usage)
max_usage = usage;
}
/* add margin to max_usage based on imbalance_pct */
max_usage = max_usage * MARGIN_PCT / 100;
cpu = cpumask_first(policy->cpus);
/* freq is current utilization + 25% */
freq = max_usage * policy->max / capacity_orig_of(cpu);
+out:
return freq;
+}
+/*
- we pass in struct cpufreq_policy. This is safe because changing out the
- policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
- which tears down all of the data structures and __cpufreq_governor(policy,
- CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
- new policy pointer
- */
+static int gov_cfs_thread(void *data) +{
struct sched_param param;
struct cpufreq_policy *policy;
struct gov_data *gd;
unsigned long freq;
int ret;
policy = (struct cpufreq_policy *) data;
if (!policy) {
pr_warn("%s: missing policy\n", __func__);
do_exit(-EINVAL);
}
gd = policy->governor_data;
if (!gd) {
pr_warn("%s: missing governor data\n", __func__);
do_exit(-EINVAL);
}
param.sched_priority = 50;
ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, ¶m);
if (ret) {
pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
do_exit(-EINVAL);
} else {
pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
__func__, gd->task->pid);
}
ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);
if (ret) {
pr_warn("%s: failed to set allowed ptr\n", __func__);
do_exit(-EINVAL);
}
/* main loop of the per-policy kthread */
do {
set_current_state(TASK_INTERRUPTIBLE);
schedule();
if (kthread_should_stop())
break;
/* avoid race with gov_cfs_stop */
if (!down_write_trylock(&policy->rwsem))
continue;
freq = gov_cfs_select_freq(policy);
ret = __cpufreq_driver_target(policy, freq,
CPUFREQ_RELATION_H);
I think we should use CPUFREQ_RELATION_L here. From the comments I read:
#define CPUFREQ_RELATION_L 0 /* lowest frequency at or above target */ #define CPUFREQ_RELATION_H 1 /* highest frequency below or at target */
So we have to tell the driver to select a frequency with enough capacity (above the current one).
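The difference is easy to see with a toy frequency table. This standalone sketch (the table values are hypothetical) picks a frequency both ways for the same target:

#include <stdio.h>

/* hypothetical OPP table in kHz, ascending */
static const unsigned long table[] = { 400000, 800000, 1200000, 1600000 };
#define NFREQ (sizeof(table) / sizeof(table[0]))

/* lowest frequency at or above target (CPUFREQ_RELATION_L) */
static unsigned long pick_l(unsigned long target)
{
	for (unsigned int i = 0; i < NFREQ; i++)
		if (table[i] >= target)
			return table[i];
	return table[NFREQ - 1];
}

/* highest frequency at or below target (CPUFREQ_RELATION_H),
 * falling back to the lowest entry */
static unsigned long pick_h(unsigned long target)
{
	unsigned long best = table[0];

	for (unsigned int i = 0; i < NFREQ; i++)
		if (table[i] <= target)
			best = table[i];
	return best;
}

int main(void)
{
	unsigned long target = 900000;	/* kHz, computed usage + margin */

	/* RELATION_L: 1200000 (enough capacity); RELATION_H: 800000 (too slow) */
	printf("L=%lu H=%lu\n", pick_l(target), pick_h(target));
	return 0;
}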
if (ret)
pr_debug("%s: __cpufreq_driver_target returned %d\n",
__func__, ret);
gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
up_write(&policy->rwsem);
} while (!kthread_should_stop());
do_exit(0);
+}
+static void gov_cfs_irq_work(struct irq_work *irq_work) +{
struct gov_data *gd;
gd = container_of(irq_work, struct gov_data, irq_work);
if (!gd) {
return;
}
No brackets?
wake_up_process(gd->task);
So, we always wake up the kthread, even when we know that we won't need a freq change. This might be, I fear, an almost certain source of reasonable complaint and pushback. I understand that we might not want to start optimizing things, but IMHO this point deserves some more thought before posting. Don't you think we could do some level of aggregation before kicking the kthread? In task_tick_fair(), for example, we could just check if we are beyond the 25% threshold and kick the kthread only in that case.
+}
+/**
- gov_cfs_update_cpu - interface to scheduler for changing capacity values
- @cpu: cpu whose capacity utilization has recently changed
- gov_cfs_update_cpu is an interface exposed to the scheduler so that the
- scheduler may inform the governor of updates to capacity utilization and
- make changes to cpu frequency. Currently this interface is designed around
- PELT values in CFS. It can be expanded to other scheduling classes in the
- future if needed.
- gov_cfs_update_cpu raises an IPI. The irq_work handler for that IPI wakes up
- the thread that does the actual work, gov_cfs_thread.
- */
+void gov_cfs_update_cpu(int cpu) +{
struct cpufreq_policy *policy;
struct gov_data *gd;
/* XXX put policy pointer in per-cpu data? */
policy = cpufreq_cpu_get(cpu);
if (IS_ERR_OR_NULL(policy)) {
return;
}
if (!policy->governor_data) {
goto out;
}
gd = policy->governor_data;
/* bail early if we are throttled */
if (ktime_before(ktime_get(), gd->throttle)) {
goto out;
}
No brackets in the 3 ifs above?
Thanks,
- Juri
irq_work_queue_on(&gd->irq_work, cpu);
+out:
cpufreq_cpu_put(policy);
return;
+}
+static void gov_cfs_start(struct cpufreq_policy *policy) +{
struct gov_data *gd;
/* prepare per-policy private data */
gd = kzalloc(sizeof(*gd), GFP_KERNEL);
if (!gd) {
pr_debug("%s: failed to allocate private data\n", __func__);
return;
}
/*
* Don't ask for freq changes at a higher rate than what
* the driver advertises as transition latency.
*/
gd->throttle_nsec = policy->cpuinfo.transition_latency ?
policy->cpuinfo.transition_latency :
THROTTLE_NSEC;
pr_debug("%s: throttle threshold = %u [ns]\n",
__func__, gd->throttle_nsec);
/* init per-policy kthread */
gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
if (IS_ERR_OR_NULL(gd->task))
pr_err("%s: failed to create kgov_cfs_task thread\n", __func__);
init_irq_work(&gd->irq_work, gov_cfs_irq_work);
policy->governor_data = gd;
gd->policy = policy;
+}
+static void gov_cfs_stop(struct cpufreq_policy *policy) +{
struct gov_data *gd;
gd = policy->governor_data;
kthread_stop(gd->task);
policy->governor_data = NULL;
/* FIXME replace with devm counterparts? */
kfree(gd);
+}
+static int gov_cfs_setup(struct cpufreq_policy *policy, unsigned int event) +{
switch (event) {
case CPUFREQ_GOV_START:
/* Start managing the frequency */
gov_cfs_start(policy);
return 0;
case CPUFREQ_GOV_STOP:
gov_cfs_stop(policy);
return 0;
case CPUFREQ_GOV_LIMITS: /* unused */
case CPUFREQ_GOV_POLICY_INIT: /* unused */
case CPUFREQ_GOV_POLICY_EXIT: /* unused */
break;
}
return 0;
+}
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_CFS +static +#endif +struct cpufreq_governor cpufreq_gov_cfs = {
.name = "gov_cfs",
.governor = gov_cfs_setup,
.owner = THIS_MODULE,
+};
+static int __init gov_cfs_init(void) +{
return cpufreq_register_governor(&cpufreq_gov_cfs);
+}
+static void __exit gov_cfs_exit(void) +{
cpufreq_unregister_governor(&cpufreq_gov_cfs);
+}
+/* Try to make this the default governor */ +fs_initcall(gov_cfs_init);
+MODULE_LICENSE("GPL"); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 393fc36..a7b97f9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4257,6 +4257,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_rq_runnable_avg(rq, rq->nr_running); add_nr_running(rq, 1); }
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
hrtick_update(rq);
}
@@ -4318,6 +4322,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) sub_nr_running(rq, 1); update_rq_runnable_avg(rq, 1); }
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
hrtick_update(rq);
}
@@ -7821,6 +7829,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr);
update_rq_runnable_avg(rq, 1);
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
}
/* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 63a8be9..ec23523 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1399,6 +1399,12 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) int get_cpu_usage(int cpu); unsigned long capacity_orig_of(int cpu);
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS +void gov_cfs_update_cpu(int cpu); +#else +static inline void gov_cfs_update_cpu(int cpu) {} +#endif
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); -- 1.9.1
Quoting Juri Lelli (2015-04-27 10:09:50)
Hi Mike,
On 27/04/15 08:46, Michael Turquette wrote:
Scheduler-driven cpu frequency selection is desirable as part of the on-going effort to make the scheduler better aware of energy consumption. No piece of the Linux kernel has a better view of the factors that affect a cpu frequency selection policy than the scheduler[0], and this patch is an attempt to get that discussion going again.
This patch implements a cpufreq governor, sched_cfs, that directly accesses scheduler statistics, in particular the pelt data from cfs via the get_cpu_usage() function.
Put plainly, sched_cfs selects the lowest cpu frequency that will prevent a runqueue from being over-utilized (until we hit the highest frequency of course). This is done by requestiong a frequency which is
requesting ^
equivalent to the current capacity utilization, plus a margin.
Unlike the previous posting from 2014[1] this governor implements a "follow the usage" method, where usage is defined as the cpu frequency-invariant product of utilization_load_avg and cpu_capacity_orig.
This governor is event-driven. There is no polling loop to check cpu idle time, or any other method which is unsynchronized with the scheduler. The entry points for this policy are in fair.c: enqueue_task_fair, dequeue_task_fair and task_tick_fair.
This policy is implemented using the cpufreq governor interface for two main reasons:
- re-using the cpufreq machine drivers without using the governor
interface is hard.
- using the cpufreq interface allows us to switch between the
scheduler-driven policy and legacy cpufreq governors such as ondemand at run-time. This is very useful for comparative testing and tuning.
Finally, it is worth mentioning that this approach neglects all scheduling classes except for cfs. It is possible to add support for deadline and other classes here, but I also wonder if a multi-governor approach would be a more maintainable solution, where the cpufreq core aggregates the constraints set by multiple governors. Supporting such an approach in the cpufreq core would also allow for peripheral devices to place constraints on cpu frequency without having to hack such behavior in at the governor level.
Thanks to Juri Lelli juri.lelli@arm.com for doing a good bit of testing, bug fixing and contributing towards the design.
[0] http://article.gmane.org/gmane.linux.kernel/1499836 [1] https://lkml.org/lkml/2014/10/22/22
Signed-off-by: Michael Turquette mturquette@linaro.org
changes since internal v1:
- renamed everything
- fixed possible deadlock between gov_cfs_thread and gov_cfs_stop
- replaced direct usage-to-frequency mapping with usage+margin-to-frequency mapping. This functions like an up_threshold and allows us to easily work with non-discretized frequency ranges
- usage-to-frequency calculation now uses capacity_orig instead of SCHED_LOAD_SCALE to handle SMT and asymmetric cpu use cases
- dropped workqueue method due to instability
- kthread is woken up by irq_work handler. This removes the need for cap_gov_kick_thread() from v1
drivers/cpufreq/Kconfig | 24 +++ include/linux/cpufreq.h | 3 + kernel/sched/Makefile | 1 + kernel/sched/cpufreq_sched_cfs.c | 314 +++++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 11 ++ kernel/sched/sched.h | 6 + 6 files changed, 359 insertions(+) create mode 100644 kernel/sched/cpufreq_sched_cfs.c
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index a171fef..35ba9c3 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -102,6 +102,15 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE Be aware that not all cpufreq drivers support the conservative governor. If unsure have a look at the help section of the driver. Fallback governor will be the performance governor.
+config CPU_FREQ_DEFAULT_GOV_SCHED_CFS
bool "sched_cfs"
select CPU_FREQ_GOV_SCHED_CFS
select CPU_FREQ_GOV_PERFORMANCE
help
Use the CPUfreq governor 'sched_cfs' as default. This scales
cpu frequency from the scheduler as per-entity load tracking
statistics are updated.
endchoice
config CPU_FREQ_GOV_PERFORMANCE @@ -183,6 +192,21 @@ config CPU_FREQ_GOV_CONSERVATIVE
If in doubt, say N.
+config CPU_FREQ_GOV_SCHED_CFS
tristate "'sched cfs' cpufreq governor"
depends on CPU_FREQ
Also CONFIG_IRQ_WORK is a dependency.
select CPU_FREQ_GOV_COMMON
help
'sched_cfs' - this governor scales cpu frequency from the
scheduler as a function of cpu capacity utilization. It does
not evaluate utilization on a periodic basis (as ondemand
does) but instead is invoked from the completely fair
scheduler when updating per-entity load tracking statistics.
Latency to respond to changes in load is improved over polling
governors due to its event-driven design.
If in doubt, say N.
comment "CPU frequency scaling drivers"
config CPUFREQ_DT diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 2ee4888..62e8152 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -485,6 +485,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE) extern struct cpufreq_governor cpufreq_gov_conservative; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CAP_GOV) +extern struct cpufreq_governor cpufreq_gov_cap_gov; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_cap_gov) #endif
/********************************************************************* diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 46be870..003b592 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHED_CFS) += cpufreq_sched_cfs.o diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c new file mode 100644 index 0000000..746b220 --- /dev/null +++ b/kernel/sched/cpufreq_sched_cfs.c @@ -0,0 +1,314 @@ +/*
- Copyright (C) 2015 Michael Turquette mturquette@linaro.org
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License version 2 as
- published by the Free Software Foundation.
- */
+#include <linux/cpufreq.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/percpu.h>
We don't need this anymore (at least for now), right?
+#include <linux/irq_work.h>
+#include "sched.h"
+#define MARGIN_PCT 125 /* taken from imbalance_pct = 125 */ +#define THROTTLE_NSEC 50000000 /* 50ms default */
+/**
- gov_data - per-policy data internal to the governor
- @throttle: next throttling period expiry. Derived from throttle_nsec
- @throttle_nsec: throttle period length in nanoseconds
- @task: worker thread for dvfs transition that may block/sleep
- @irq_work: callback used to wake up worker thread
- struct gov_data is the per-policy gov_cfs-specific data structure. A
- per-policy instance of it is created when the gov_cfs governor receives
- the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
- member of struct cpufreq_policy.
- Readers of this data must call down_read(policy->rwsem). Writers must
- call down_write(policy->rwsem).
- */
+struct gov_data {
ktime_t throttle;
unsigned int throttle_nsec;
struct task_struct *task;
struct irq_work irq_work;
struct cpufreq_policy *policy;
+};
+/**
- gov_cfs_select_freq - pick the next frequency for a cpu
- @policy: the cpufreq policy whose frequency may be changed
- gov_cfs_select_freq selects a frequency based on pelt load statistics
- tracked by cfs. First it finds the most utilized cpu in the policy and then
- maps that utilization value onto a cpu frequency and returns it.
- Additionally, gov_cfs_select_freq adds a margin to the cpu utilization value
- before converting it to a frequency. The margin is derived from MARGIN_PCT,
- which itself is inspired by imbalance_pct in cfs. This is needed to
- proactively increase frequency in the case of increasing load.
utilization? ^
- This approach attempts to maintain headroom of 25% unutilized cpu capacity.
- A traditional way of doing this is to take 75% of the current capacity and
- check if current utilization crosses that threshold. The only problem with
- that approach is determining the next cpu frequency target if that threshold
- is crossed.
- Instead of using the 75% threshold, gov_cfs_select_freq adds a 25%
- utilization margin to the utilization and converts that to a frequency. This
- removes conditional logic around checking thresholds and better supports
- drivers that use non-discretized frequency ranges (i.e. no pre-defined
- frequency tables or operating points).
- Returns frequency selected.
- */
+static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy) +{
int cpu = 0;
struct gov_data *gd;
unsigned long freq = 0, max_usage = 0, usage = 0;
if (!policy->governor_data)
goto out;
gd = policy->governor_data;
/*
* get_cpu_usage is called without locking the runqueues. This is the
* same behavior used by find_busiest_cpu in load_balance. We are
* willing to accept occasionally stale data here in exchange for
* lockless behavior.
*/
for_each_cpu(cpu, policy->cpus) {
usage = get_cpu_usage(cpu);
if (usage > max_usage)
max_usage = usage;
}
/* add margin to max_usage based on imbalance_pct */
max_usage = max_usage * MARGIN_PCT / 100;
cpu = cpumask_first(policy->cpus);
/* freq is current utilization + 25% */
freq = max_usage * policy->max / capacity_orig_of(cpu);
+out:
return freq;
+}
+/*
- we pass in struct cpufreq_policy. This is safe because changing out the
- policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
- which tears down all of the data structures and __cpufreq_governor(policy,
- CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
- new policy pointer
- */
+static int gov_cfs_thread(void *data) +{
struct sched_param param;
struct cpufreq_policy *policy;
struct gov_data *gd;
unsigned long freq;
int ret;
policy = (struct cpufreq_policy *) data;
if (!policy) {
pr_warn("%s: missing policy\n", __func__);
do_exit(-EINVAL);
}
gd = policy->governor_data;
if (!gd) {
pr_warn("%s: missing governor data\n", __func__);
do_exit(-EINVAL);
}
param.sched_priority = 50;
ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, ¶m);
if (ret) {
pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
do_exit(-EINVAL);
} else {
pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
__func__, gd->task->pid);
}
ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);
if (ret) {
pr_warn("%s: failed to set allowed ptr\n", __func__);
do_exit(-EINVAL);
}
/* main loop of the per-policy kthread */
do {
set_current_state(TASK_INTERRUPTIBLE);
schedule();
if (kthread_should_stop())
break;
/* avoid race with gov_cfs_stop */
if (!down_write_trylock(&policy->rwsem))
continue;
freq = gov_cfs_select_freq(policy);
ret = __cpufreq_driver_target(policy, freq,
CPUFREQ_RELATION_H);
I think we should use CPUFREQ_RELATION_L here. From the comments I read:
#define CPUFREQ_RELATION_L 0 /* lowest frequency at or above target */ #define CPUFREQ_RELATION_H 1 /* highest frequency below or at target */
So we have to tell the driver to select a frequency with enough capacity (above the current one).
Thanks for all of the above comments. I'll fix all of them. this RELATION_L thing is annoying because I had it in my notes to fix it and forgot :-/
if (ret)
pr_debug("%s: __cpufreq_driver_target returned %d\n",
__func__, ret);
gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
up_write(&policy->rwsem);
} while (!kthread_should_stop());
do_exit(0);
+}
+static void gov_cfs_irq_work(struct irq_work *irq_work) +{
struct gov_data *gd;
gd = container_of(irq_work, struct gov_data, irq_work);
if (!gd) {
return;
}
No brackets?
Will fix.
wake_up_process(gd->task);
So, we always wake up the kthread, even when we know that we won't need a freq change. This might be, I fear, an almost certain source of reasonable complaint and pushback. I understand that we might not want to start optimizing things, but IMHO this point deserves some more thought before posting. Don't you think we could do some level of aggregation before kicking the kthread? In task_tick_fair(), for example, we could just check if we are beyond the 25% threshold and kick the kthread only in that case.
This patch does not check against a threshold. It always requests a rate based on the current utilization plus 25%.
On systems with discretized cpu frequencies (OPPs) we will often target the same OPP, occasionally crossing the boundary into another OPP. On systems with continuous cpu frequencies we will continually give ourselves "room to grow".
So we can't easily check if the cpu frequency needs to change or not in the scheduler hot path using this method.
An alternative is to put the throttle check in the hot path and not kick the thread until we are unthrottled. I need to think on how to do this. I'd like to do it without locking, but mixing 64-bit ktime_t with 32-bit atomic_t is hard. Any ideas?
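One possible direction (an assumption on my part, sketched with C11 atomics rather than kernel primitives): keep the expiry as a single 64-bit word and do a plain atomic load in the hot path. The in-kernel analogue would be atomic64_t, with the caveat that its generic 32-bit implementation may itself take a lock, which is exactly the concern here.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t throttle_ns;	/* absolute expiry, like gd->throttle */

/* hot path: a single 64-bit atomic load, no lock */
static bool throttled(uint64_t now)
{
	return now < atomic_load_explicit(&throttle_ns, memory_order_relaxed);
}

/* slow path (governor thread): re-arm the window after a transition */
static void rearm(uint64_t now, uint64_t window)
{
	atomic_store_explicit(&throttle_ns, now + window, memory_order_relaxed);
}

int main(void)
{
	rearm(100, 50);
	printf("%d %d\n", throttled(120), throttled(200));	/* 1 0 */
	return 0;
}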
+}
+/**
- gov_cfs_update_cpu - interface to scheduler for changing capacity values
- @cpu: cpu whose capacity utilization has recently changed
- gov_cfs_update_cpu is an interface exposed to the scheduler so that the
- scheduler may inform the governor of updates to capacity utilization and
- make changes to cpu frequency. Currently this interface is designed around
- PELT values in CFS. It can be expanded to other scheduling classes in the
- future if needed.
- gov_cfs_update_cpu raises an IPI. The irq_work handler for that IPI wakes up
- the thread that does the actual work, gov_cfs_thread.
- */
+void gov_cfs_update_cpu(int cpu) +{
struct cpufreq_policy *policy;
struct gov_data *gd;
/* XXX put policy pointer in per-cpu data? */
policy = cpufreq_cpu_get(cpu);
if (IS_ERR_OR_NULL(policy)) {
return;
}
if (!policy->governor_data) {
goto out;
}
gd = policy->governor_data;
/* bail early if we are throttled */
if (ktime_before(ktime_get(), gd->throttle)) {
goto out;
}
No brackets in the 3 ifs above?
Will fix.
Thanks, Mike
Thanks,
- Juri
irq_work_queue_on(&gd->irq_work, cpu);
+out:
cpufreq_cpu_put(policy);
return;
+}
+static void gov_cfs_start(struct cpufreq_policy *policy) +{
struct gov_data *gd;
/* prepare per-policy private data */
gd = kzalloc(sizeof(*gd), GFP_KERNEL);
if (!gd) {
pr_debug("%s: failed to allocate private data\n", __func__);
return;
}
/*
* Don't ask for freq changes at an higher rate than what
* the driver advertises as transition latency.
*/
gd->throttle_nsec = policy->cpuinfo.transition_latency ?
policy->cpuinfo.transition_latency :
THROTTLE_NSEC;
pr_debug("%s: throttle threshold = %u [ns]\n",
__func__, gd->throttle_nsec);
/* init per-policy kthread */
gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
if (IS_ERR_OR_NULL(gd->task))
pr_err("%s: failed to create kgov_cfs_task thread\n", __func__);
init_irq_work(&gd->irq_work, gov_cfs_irq_work);
policy->governor_data = gd;
gd->policy = policy;
+}
+static void gov_cfs_stop(struct cpufreq_policy *policy) +{
struct gov_data *gd;
gd = policy->governor_data;
kthread_stop(gd->task);
policy->governor_data = NULL;
/* FIXME replace with devm counterparts? */
kfree(gd);
+}
+static int gov_cfs_setup(struct cpufreq_policy *policy, unsigned int event) +{
switch (event) {
case CPUFREQ_GOV_START:
/* Start managing the frequency */
gov_cfs_start(policy);
return 0;
case CPUFREQ_GOV_STOP:
gov_cfs_stop(policy);
return 0;
case CPUFREQ_GOV_LIMITS: /* unused */
case CPUFREQ_GOV_POLICY_INIT: /* unused */
case CPUFREQ_GOV_POLICY_EXIT: /* unused */
break;
}
return 0;
+}
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_CFS +static +#endif +struct cpufreq_governor cpufreq_gov_cfs = {
.name = "gov_cfs",
.governor = gov_cfs_setup,
.owner = THIS_MODULE,
+};
+static int __init gov_cfs_init(void) +{
return cpufreq_register_governor(&cpufreq_gov_cfs);
+}
+static void __exit gov_cfs_exit(void) +{
cpufreq_unregister_governor(&cpufreq_gov_cfs);
+}
+/* Try to make this the default governor */ +fs_initcall(gov_cfs_init);
+MODULE_LICENSE("GPL"); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 393fc36..a7b97f9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4257,6 +4257,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_rq_runnable_avg(rq, rq->nr_running); add_nr_running(rq, 1); }
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
hrtick_update(rq);
}
@@ -4318,6 +4322,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) sub_nr_running(rq, 1); update_rq_runnable_avg(rq, 1); }
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
hrtick_update(rq);
}
@@ -7821,6 +7829,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr);
update_rq_runnable_avg(rq, 1);
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
}
/* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 63a8be9..ec23523 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1399,6 +1399,12 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) int get_cpu_usage(int cpu); unsigned long capacity_orig_of(int cpu);
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS +void gov_cfs_update_cpu(int cpu); +#else +static inline void gov_cfs_update_cpu(int cpu) {} +#endif
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); -- 1.9.1
Hi Mike,
I apologize in advance for the long email, but I'd still like to share today's thoughts with you :).
On 28/04/15 05:02, Michael Turquette wrote:
Quoting Juri Lelli (2015-04-27 10:09:50)
[snip]
wake_up_process(gd->task);
So, we always wake up the kthread, even when we know that we won't need a freq change. This might be, I fear, an almost certain source of reasonable complaint and pushback. I understand that we might not want to start optimizing things, but IMHO this point deserves some more thought before posting. Don't you think we could do some level of aggregation before kicking the kthread? In task_tick_fair(), for example, we could just check if we are beyond the 25% threshold and kick the kthread only in that case.
This patch does not check against a threshold. It always requests a rate based on the current utilization plus 25%.
On systems with discretized cpu frequencies (OPPs) we will often target the same OPP, occasionally crossing the boundary into another OPP. On systems with continuous cpu frequencies we will continually give ourselves "room to grow".
Can you give an example of such systems?
So we can't easily check if the cpu frequency needs to change or not in the scheduler hot path using this method.
You mean because in this case we don't have any reference to base such a threshold on?
An alternative is to put the throttle check in the hot path and not kick the thread until we are unthrottled. I need to think on how to do this. I'd like to do it without locking, but mixing 64-bit ktime_t with 32-bit atomit_t is hard. Any ideas?
Don't we already bail out in gov_cfs_update_cpu() if we are not yet past the throttling threshold? This is in the hot path.
Anyway, I played a little bit with this version today and I came up with the following patches. The idea is to reduce triggering points, so that we - in theory - reduce the overall overhead of this thing. I ran simple synthetic workloads to test this, mainly tasks with phases and periodic workloads. I attach some plots to which I refer below, time on the x-axis and freqs on the y-axis.
With the first patch I tried to reduce the number of times we kick the kthread from task_tick_fair(). The idea is to extend the governor API so that we can ask for any capacity required (instead of letting it read the usage signal).
Fig1 shows a light/heavy/light task with the current implementation. As you pointed out, in the ramp-up phase we slowly adapt to the new utilization (each step also requires kicking the kthread). Fig2 shows the up-threshold approach, where we kick the kthread and go to max only when needed. Patch follows.
From 9f3d102e3f88d4e1d60c0d9497de709146e7f2ce Mon Sep 17 00:00:00 2001
From: Juri Lelli juri.lelli@arm.com Date: Tue, 28 Apr 2015 14:10:57 +0100 Subject: [PATCH 1/4] sched/cpufreq_sched_cfs: implement direct API
Instead of using get_cpu_usage() we can let each CPU request the capacity it needs. The gov's kthread is responsible for aggregating requests.
A benefit of this new API is shown in task_tick_fair(), where we can request a transition to max opp only when really needed.
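Aggregation here amounts to taking the max of the outstanding per-cpu requests. A toy standalone version of the kthread side under this API (all values made up):

#include <stdio.h>

#define NR_CPUS 4

/* stands in for the per_cpu(new_capacity, cpu) slots */
static unsigned long new_capacity[NR_CPUS] = { 300, 875, 512, 0 };

int main(void)
{
	unsigned long max_usage = 0;

	/* kthread side: aggregate the outstanding requests */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (new_capacity[cpu] > max_usage)
			max_usage = new_capacity[cpu];

	printf("policy request: %lu\n", max_usage);	/* 875 */
	return 0;
}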
Signed-off-by: Juri Lelli juri.lelli@arm.com --- kernel/sched/cpufreq_sched_cfs.c | 18 +++++++++++------- kernel/sched/fair.c | 26 +++++++++++++++++++++----- kernel/sched/sched.h | 5 +++-- 3 files changed, 35 insertions(+), 14 deletions(-)
diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c
index 040469d..c8c6d2e 100644
--- a/kernel/sched/cpufreq_sched_cfs.c
+++ b/kernel/sched/cpufreq_sched_cfs.c
@@ -14,9 +14,10 @@

 #include "sched.h"

-#define MARGIN_PCT		125 /* taken from imbalance_pct = 125 */
 #define THROTTLE_NSEC		50000000 /* 50ms default */

+static DEFINE_PER_CPU(unsigned long, new_capacity);
+
 /**
  * gov_data - per-policy data internal to the governor
  * @throttle: next throttling period expiry. Derived from throttle_nsec
@@ -85,7 +86,7 @@ static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy)
 	 * lockless behavior.
 	 */
 	for_each_cpu(cpu, policy->cpus) {
-		usage = get_cpu_usage(cpu);
+		usage = per_cpu(new_capacity, cpu);
 		if (usage > max_usage)
 			max_usage = usage;
 	}
@@ -93,15 +94,13 @@ static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy)
 	/* add margin to max_usage based on imbalance_pct */
 	max_usage = max_usage * MARGIN_PCT / 100;

-	cpu = cpumask_first(policy->cpus);
-
-	if (max_usage >= capacity_orig_of(cpu)) {
+	if (max_usage >= SCHED_LOAD_SCALE) {
 		freq = policy->max;
 		goto out;
 	}

 	/* freq is current utilization + 25% */
-	freq = max_usage * policy->max / capacity_orig_of(cpu);
+	freq = (max_usage * policy->max) >> SCHED_LOAD_SHIFT;

 out:
 	return freq;
@@ -201,7 +200,7 @@ static void gov_cfs_irq_work(struct irq_work *irq_work)
  * gov_cfs_update_cpu raises an IPI. The irq_work handler for that IPI wakes up
  * the thread that does the actual work, gov_cfs_thread.
  */
-void gov_cfs_update_cpu(int cpu)
+void gov_cfs_update_cpu(int cpu, unsigned long capacity)
 {
 	struct cpufreq_policy *policy;
 	struct gov_data *gd;
@@ -223,6 +222,7 @@ void gov_cfs_update_cpu(int cpu)
 		goto out;
 	}

+	per_cpu(new_capacity, cpu) = capacity;
 	irq_work_queue_on(&gd->irq_work, cpu);

 out:
@@ -233,6 +233,7 @@ out:
 static void gov_cfs_start(struct cpufreq_policy *policy)
 {
 	struct gov_data *gd;
+	int cpu;

 	/* prepare per-policy private data */
 	gd = kzalloc(sizeof(*gd), GFP_KERNEL);
@@ -251,6 +252,9 @@ static void gov_cfs_start(struct cpufreq_policy *policy)
 	pr_debug("%s: throttle threshold = %u [ns]\n",
 		 __func__, gd->throttle_nsec);

+	for_each_cpu(cpu, policy->related_cpus)
+		per_cpu(new_capacity, cpu) = 0;
+
 	/* init per-policy kthread */
 	gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
 	if (IS_ERR_OR_NULL(gd->task))
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 041538e..27e21a1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4267,7 +4267,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	}

 	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq));
+		gov_cfs_update_cpu(cpu_of(rq), rq->cfs.utilization_load_avg);

 	hrtick_update(rq);
 }
@@ -4332,7 +4332,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	}

 	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq));
+		gov_cfs_update_cpu(cpu_of(rq), rq->cfs.utilization_load_avg);

 	hrtick_update(rq);
 }
@@ -4800,6 +4800,12 @@ next:
 done:
 	return target;
 }
+
+unsigned long capacity_curr_of(int cpu)
+{
+	return arch_scale_freq_capacity(NULL, cpu);
+}
+
 /*
  * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
  * tasks. The unit of the return value must be the one of capacity so we can
@@ -4817,7 +4823,7 @@ done:
  * Without capping the usage, a group could be seen as overloaded (CPU0 usage
  * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
  */
-int get_cpu_usage(int cpu)
+static int get_cpu_usage(int cpu)
 {
 	unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
 	unsigned long capacity = capacity_orig_of(cpu);
@@ -7820,6 +7826,11 @@ static void rq_offline_fair(struct rq *rq)

 #endif /* CONFIG_SMP */

+static inline unsigned long task_utilization(struct task_struct *p)
+{
+	return p->se.avg.utilization_avg_contrib;
+}
+
 /*
  * scheduler tick hitting a task of our scheduling class:
  */
@@ -7827,6 +7838,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &curr->se;
+	int cpu = task_cpu(curr);

 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
@@ -7838,8 +7850,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)

 	update_rq_runnable_avg(rq, 1);

-	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq));
+	if (sched_energy_freq() &&
+	    (capacity_curr_of(cpu) < SCHED_LOAD_SCALE) &&
+	    ((capacity_curr_of(cpu) * 100) <
+	     (task_utilization(curr) * MARGIN_PCT))) {
+		gov_cfs_update_cpu(cpu_of(rq), SCHED_LOAD_SCALE);
+	}
 }

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ec23523..3983bd6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1396,11 +1396,12 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 }
 #endif

-int get_cpu_usage(int cpu);
 unsigned long capacity_orig_of(int cpu);
+unsigned long capacity_curr_of(int cpu);

 #ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS
-void gov_cfs_update_cpu(int cpu);
+#define MARGIN_PCT		125 /* taken from imbalance_pct = 125 */
+void gov_cfs_update_cpu(int cpu, unsigned long capacity);
 #else
 static inline void gov_cfs_update_cpu(int cpu) {}
 #endif
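[Editor's note: to make the selection math above concrete, here is a worked example with illustrative numbers, assuming SCHED_LOAD_SCALE = 1024 and SCHED_LOAD_SHIFT = 10:]

	/*
	 * max_usage = 410 -> 410 * 125 / 100 = 512 after the margin,
	 * so freq = (512 * policy->max) >> 10, i.e. half of policy->max.
	 * Any request of 820 or more saturates the margin
	 * (820 * 125 / 100 >= 1024) and pins the policy at policy->max.
	 */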
On 28 April 2015 at 19:48, Juri Lelli juri.lelli@arm.com wrote:
Hi Mike,
I apologize in advance for the long email, but I still wanted to share today's thoughts with you :).
On 28/04/15 05:02, Michael Turquette wrote:
Quoting Juri Lelli (2015-04-27 10:09:50)
[snip]
wake_up_process(gd->task);
So, we always wake up the kthread, even when we know that we won't need a freq change. This might be, I fear, an almost certain source of reasonable complaints and pushback. I understand that we might not want to start optimizing things yet, but IMHO this point deserves some more thought before posting. Don't you think we could do some level of aggregation before kicking the kthread? In task_tick_fair(), for example, we could just check whether we are beyond the 25% threshold and kick the kthread only in that case.
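[Editor's note: a minimal sketch of what the suggested pre-check might look like; the helper name and threshold semantics are assumptions, and capacity_curr_of() is the accessor added in the patch below:]

	/* hypothetical: kick the kthread only once usage plus the 25%
	 * margin no longer fits in the currently provided capacity */
	static inline bool cpu_needs_freq_update(int cpu)
	{
		unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;

		return (usage * MARGIN_PCT / 100) > capacity_curr_of(cpu);
	}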
This patch does not check against a threshold. It always requests a rate based on the current utilization plus 25%.
On systems with discretized cpu frequencies (opps) we will often target the same opp, occasionally crossing the boundary into another opp. On systems with continuous cpu frequencies we will continually give ourselves "room to grow".
Can you make an example of such systems?
So we can't easily check if the cpu frequency needs to change or not in the scheduler hot path using this method.
You mean because in this case we don't have any reference to base such a threshold on?
An alternative is to put the throttle check in the hot path and not kick the thread until we are unthrottled. I need to think about how to do this. I'd like to do it without locking, but mixing 64-bit ktime_t with 32-bit atomic_t is hard. Any ideas?
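[Editor's note: one possible direction, as a sketch of an idea rather than code from this series: keep the throttle deadline as nanoseconds in an atomic64_t, so the hot path can compare against ktime without taking the governor's lock. On 32-bit ARM the generic atomic64_t falls back to hashed spinlocks, so this only avoids the governor's own locking:]

	static atomic64_t throttle_until_ns;	/* hypothetical per-policy field */

	static inline bool gov_throttled(void)
	{
		return ktime_to_ns(ktime_get()) < atomic64_read(&throttle_until_ns);
	}

	static inline void gov_rearm_throttle(u64 throttle_nsec)
	{
		atomic64_set(&throttle_until_ns,
			     ktime_to_ns(ktime_get()) + throttle_nsec);
	}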
Don't we already bail out in gov_cfs_update_cpu() if we are not yet past the throttling threshold? This is in the hot path.
Anyway, I played a little bit with this version today and I came up with the following patches. The idea is to reduce the triggering points, so that we - in theory - reduce the overall overhead of this thing. I ran simple synthetic workloads to test this, mainly tasks with phases and periodic workloads. I attach some plots to which I refer below, time on the x-axis and freqs on the y-axis.
With the first patch I tried to reduce the number of times we kick the kthread from task_tick_fair(). The idea is to extend the governor API so that we can ask for any capacity required (instead of letting it read the usage signal).
[snip]
@@ -4267,7 +4267,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq));
+		gov_cfs_update_cpu(cpu_of(rq), rq->cfs.utilization_load_avg);
Using utilization_load_avg is an interesting way to remove the dependency on the CPU's capacity.
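[Editor's note: in sketch form, reconstructed for comparison and not part of the patch, the difference is:]

	/* old request: utilization rescaled by this cpu's original capacity */
	usage = (cpu_rq(cpu)->cfs.utilization_load_avg *
		 capacity_orig_of(cpu)) >> SCHED_LOAD_SHIFT;

	/* new request: the raw signal, already in the 0..SCHED_LOAD_SCALE
	 * range and directly comparable across cpus in the policy */
	usage = cpu_rq(cpu)->cfs.utilization_load_avg;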
[snip]
@@ -7838,8 +7850,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
-	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq));
+	if (sched_energy_freq() &&
+	    (capacity_curr_of(cpu) < SCHED_LOAD_SCALE) &&
+	    ((capacity_curr_of(cpu) * 100) <
+	     (task_utilization(curr) * MARGIN_PCT))) {
+		gov_cfs_update_cpu(cpu_of(rq), SCHED_LOAD_SCALE);
This looks like a policy (similar to the ondemand one), and I'm not sure that we should have such a policy in the tick.
+	}
 }
[snip]
I then tried to address the tail effect when the task starts behaving as light again (~2 sec in the pictures). The problem is that when we dequeue the task we see no utilization on the rq, so we go to min. When the task is enqueued back we go to max (at least the first time), and then we keep doing this sort of ping-pong until we converge to the actual (light) utilization. The following patch changes this behaviour, as in Fig.3: in the tail we slowly adapt to the new task phase, following the decaying utilization signal.
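[Editor's note: before the full patch below, a condensed sketch of its dequeue path, which is the core of the change:]

	if (sched_energy_freq()) {
		if (rq->cfs.nr_running)
			/* still busy: request capacity for what remains */
			gov_cfs_update_cpu(cpu_of(rq),
					   rq->cfs.utilization_load_avg);
		else
			/* going idle: clear our vote without kicking the
			 * kthread into an immediate transition to min */
			gov_cfs_reset_cpu(cpu_of(rq));
	}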
From 81cefca25fa022913dc2913acf71414925b997eb Mon Sep 17 00:00:00 2001
From: Juri Lelli juri.lelli@arm.com
Date: Tue, 28 Apr 2015 16:08:37 +0100
Subject: [PATCH 2/4] sched/cpufreq_sched_cfs: (re)move triggering points
Remove the trigger in enqueue_task_fair() and move it to select_task_rq_fair(); also consider the pre-decayed task utilization, as we want to stabilize capacity requests.
Modify the dequeue_task_fair() trigger: don't scale down when we are going idle (this change requires a small addition to the governor's API).
Signed-off-by: Juri Lelli juri.lelli@arm.com
---
 kernel/sched/cpufreq_sched_cfs.c |  5 +++++
 kernel/sched/fair.c              | 32 ++++++++++++++++++++++----------
 kernel/sched/sched.h             |  1 +
 3 files changed, 28 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c
index c8c6d2e..2fe1684 100644
--- a/kernel/sched/cpufreq_sched_cfs.c
+++ b/kernel/sched/cpufreq_sched_cfs.c
@@ -187,6 +187,11 @@ static void gov_cfs_irq_work(struct irq_work *irq_work)
 	wake_up_process(gd->task);
 }

+void gov_cfs_reset_cpu(int cpu)
+{
+	per_cpu(new_capacity, cpu) = 0;
+}
+
 /**
  * gov_cfs_update_cpu - interface to scheduler for changing capacity values
  * @cpu: cpu whose capacity utilization has recently changed
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 27e21a1..4e21abf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4266,9 +4266,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		add_nr_running(rq, 1);
 	}

-	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq), rq->cfs.utilization_load_avg);
-
 	hrtick_update(rq);
 }
@@ -4331,8 +4328,18 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		update_rq_runnable_avg(rq, 1);
 	}

-	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq), rq->cfs.utilization_load_avg);
+	if(sched_energy_freq()) {
+		/*
+		 * Ask for an update only if we are not going idle.
+		 * If we are going idle we just need to clear our
+		 * current request.
+		 */
+		if (rq->cfs.nr_running)
+			gov_cfs_update_cpu(cpu_of(rq),
+					   rq->cfs.utilization_load_avg);
+		else
+			gov_cfs_reset_cpu(cpu_of(rq));
+	}

 	hrtick_update(rq);
 }
@@ -4834,6 +4841,11 @@ static int get_cpu_usage(int cpu)
 	return (usage * capacity) >> SCHED_LOAD_SHIFT;
 }

+static inline unsigned long task_utilization(struct task_struct *p)
+{
+	return p->se.avg.utilization_avg_contrib;
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -4922,6 +4934,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 unlock:
 	rcu_read_unlock();

+	/* We want to consider the pre-decayed utilization */
+	if(sched_energy_freq())
+		gov_cfs_update_cpu(new_cpu,
+				   cpu_rq(new_cpu)->cfs.utilization_load_avg +
+				   task_utilization(p));
+
This can be seen as an artifact to boost the frequency, as you use an old value. Then, you cannot be sure that the scheduler will select this cpu in the end (because of cpu affinity, for example); the only safe place is the enqueue function.
 	return new_cpu;
 }
@@ -7826,11 +7843,6 @@ static void rq_offline_fair(struct rq *rq)

 #endif /* CONFIG_SMP */

-static inline unsigned long task_utilization(struct task_struct *p)
-{
-	return p->se.avg.utilization_avg_contrib;
-}
-
 /*
  * scheduler tick hitting a task of our scheduling class:
  */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3983bd6..6dd8f3a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1402,6 +1402,7 @@ unsigned long capacity_curr_of(int cpu);
 #ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS
 #define MARGIN_PCT		125 /* taken from imbalance_pct = 125 */
 void gov_cfs_update_cpu(int cpu, unsigned long capacity);
+void gov_cfs_reset_cpu(int cpu);
 #else
 static inline void gov_cfs_update_cpu(int cpu) {}
 #endif
2.2.2
This approach seems to work also for a light/medium/light task: we go to max and then adapt to the real (medium) utilization (Fig.4).
Finally, a couple more patches (the first one should actually be squashed into 01/04) to cover the load_balancing paths (not really tested yet).
From 3e7226989c21fdd680279f4f8a150597b5833b95 Mon Sep 17 00:00:00 2001
From: Juri Lelli juri.lelli@arm.com
Date: Tue, 28 Apr 2015 16:10:00 +0100
Subject: [PATCH 3/4] sched/cpufreq_sched_cfs: update requested capacity even when throttled
If the kthread is throttled we still need to update requests, or we may end up with stale values.
Signed-off-by: Juri Lelli juri.lelli@arm.com
---
 kernel/sched/cpufreq_sched_cfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c
index 2fe1684..c8d9408 100644
--- a/kernel/sched/cpufreq_sched_cfs.c
+++ b/kernel/sched/cpufreq_sched_cfs.c
@@ -222,12 +222,13 @@ void gov_cfs_update_cpu(int cpu, unsigned long capacity)

 	gd = policy->governor_data;

+	per_cpu(new_capacity, cpu) = capacity;
+
 	/* bail early if we are throttled */
 	if (ktime_before(ktime_get(), gd->throttle)) {
 		goto out;
 	}

-	per_cpu(new_capacity, cpu) = capacity;
 	irq_work_queue_on(&gd->irq_work, cpu);

 out:
2.2.2
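[Editor's note: the essence of the fix is just the ordering; a sketch with the surrounding code elided:]

	per_cpu(new_capacity, cpu) = capacity;	/* always record the request */

	/* bail early if we are throttled: skip only the kick */
	if (ktime_before(ktime_get(), gd->throttle))
		goto out;

	irq_work_queue_on(&gd->irq_work, cpu);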
From 333e1741c7de8dfc21f5bb9f2a9c29d4dc84f2de Mon Sep 17 00:00:00 2001
From: Juri Lelli juri.lelli@arm.com
Date: Tue, 28 Apr 2015 16:55:22 +0100
Subject: [PATCH 4/4] sched/fair: cpufreq_sched_cfs triggers for load_balancing
This should cover load_balance paths (untested).
Signed-off-by: Juri Lelli juri.lelli@arm.com
---
 kernel/sched/fair.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4e21abf..ad1e7cc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7203,6 +7203,14 @@ out_one_pinned:
 	ld_moved = 0;

 out:
+	/* dst_grpmask might be NULL for NEWLY_IDLE. */
+	if (sched_energy_freq() && ld_moved && env.dst_grpmask)
+		/*
+		 * dequeue_task_fair() already took care of src_cpu
+		 */
+		gov_cfs_update_cpu(env.dst_cpu,
+				   cpu_rq(env.dst_cpu)->cfs.utilization_load_avg);
+
dst_cpu is only the last cpu that was used for load balancing, but more cpus can have been involved, so you can miss some task migrations; enqueue/dequeue are the only safe places.
return ld_moved;
}
@@ -7402,8 +7410,12 @@ out_unlock:
 	busiest_rq->active_balance = 0;
 	raw_spin_unlock(&busiest_rq->lock);

-	if (p)
+	if (p) {
 		attach_one_task(target_rq, p);
+		if (sched_energy_freq())
+			gov_cfs_update_cpu(cpu_of(target_rq),
+					   target_rq->cfs.utilization_load_avg);
+	}

 	local_irq_enable();
-- 2.2.2
Comments?
Multiplying the hooks increases the complexity and the risk of regressions each time there is a change in the scheduler. So staying in enqueue/dequeue is safer, as those are the only places that ensure a task will be put on a rq.
Thanks,
- Juri
Hi Vincent,
thanks for your review.
On 29/04/15 08:52, Vincent Guittot wrote:
[snip]
int cpu = task_cpu(curr); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se);
@@ -7838,8 +7850,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
update_rq_runnable_avg(rq, 1);
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
if (sched_energy_freq() &&
(capacity_curr_of(cpu) < SCHED_LOAD_SCALE) &&
((capacity_curr_of(cpu) * 100) <
(task_utilization(curr) * MARGIN_PCT))) {
gov_cfs_update_cpu(cpu_of(rq), SCHED_LOAD_SCALE);
This looks like a policy (similar to the ondemand one), and I'm not sure that we should have such a policy in the tick.
The rationale behind this is that if we base all our decisions on the utilization signal, we only get a "true" signal when we don't saturate the current capacity. Here we realize that we are most likely going to saturate it, so we react by going to max. When the task is subsequently dequeued we probably have a better chance to pick the "true" capacity level (when this task is queued back, for example).
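[Editor's note: illustrative numbers for the check above, assuming SCHED_LOAD_SCALE = 1024: a cpu running at half its top frequency has capacity_curr_of(cpu) = 512, so any task utilization above 512 * 100 / 125 = 409 trips the condition. E.g. for a utilization of 450:]

	/* 450 * 125 = 56250 > 512 * 100 = 51200: the signal is about to
	 * saturate the current capacity, so jump straight to max */
	gov_cfs_update_cpu(cpu_of(rq), SCHED_LOAD_SCALE);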
[snip]
@@ -4922,6 +4934,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 unlock:
 	rcu_read_unlock();

+	/* We want to consider the pre-decayed utilization */
+	if(sched_energy_freq())
+		gov_cfs_update_cpu(new_cpu,
+				   cpu_rq(new_cpu)->cfs.utilization_load_avg +
+				   task_utilization(p));
This can be seen as an artifact to boost the frequency, as you use an old value.
Well, as the old value is the only thing I know about the task (we don't have blocked_utilization here yet), I was trying to avoid kicking the thing in the tick right after the task starts running, and instead trying to predict what will probably happen.
Then, you are not sure that the scheduler will select this cpu in the end (because of cpu affinity, for example); the only safe place is the enqueue function.
So, select_task_rq_fair() is not even called for tasks with a single-cpu affinity mask; we would have to put a check in select_task_rq(), right. But after that we should be pretty sure about where the task will be woken up.
[snip]
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4e21abf..ad1e7cc 100644
@@ -7203,6 +7203,14 @@ out_one_pinned:
 	ld_moved = 0;

 out:
+	/* dst_grpmask might be NULL for NEWLY_IDLE. */
+	if (sched_energy_freq() && ld_moved && env.dst_grpmask)
+		/*
+		 * dequeue_task_fair() already took care of src_cpu
+		 */
+		gov_cfs_update_cpu(env.dst_cpu,
+				   cpu_rq(env.dst_cpu)->cfs.utilization_load_avg);
dst_cpu is only the last cpu that was used for load balancing, but more cpus can have been involved, so you can miss some task migrations; enqueue/dequeue are the only safe places.
Right, for the new_dst_cpu case. I guess we have to move this trigger up (probably inside the more_balance loop).
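[Editor's note: a hypothetical placement, an untested sketch only; cur_ld_moved is the per-iteration count inside load_balance():]

	/* inside the more_balance loop, after tasks were attached: */
	if (sched_energy_freq() && cur_ld_moved)
		gov_cfs_update_cpu(env.dst_cpu,
				   cpu_rq(env.dst_cpu)->cfs.utilization_load_avg);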
[snip]
Comments?
Multiplying the hooks increases the complexity and the risk of regressions each time there is a change in the scheduler. So staying in enqueue/dequeue is safer, as those are the only places that ensure a task will be put on a rq.
So, I actually started out thinking and doing exactly what you are saying, but then Peter's reply came on LKML and I started thinking about how (and if) we can do something finer grained. I still agree that hooks in enqueue/dequeue/tick (maybe even in core) make all this a lot easier, but at the same time we risk adding useless overhead or even missing potentially needed freq changes (think for example of moving a lot of tasks between rqs during a load balance iteration).
Best,
- Juri
Quoting Juri Lelli (2015-04-28 10:48:27)
[snip]
On systems with discretized cpu frequencies (opps) we will often target the same opp, occasionally crossing the boundary into another opp. On systems with continuous cpu frequencies we will continually give ourselves "room to grow".
Can you make an example of such systems?
CPPC-based systems.
I thought a lot about all of the feedback that my v1 patchset got last week on eas-dev. Two comments in particular colored my views on supporting continuous frequency bands and not relying on a threshold.
First is Ashwins' comment here: https://lists.linaro.org/pipermail/eas-dev/2015-April/000093.html
Second is Morten's reply here: https://lists.linaro.org/pipermail/eas-dev/2015-April/000094.html
If we decide that we only care about opps then it is easy to create a threshold for the opp "bucket" that we are currently in. But on a continuous system creating a threshold is more difficult. E.g. if we decide to use an 80% threshold for a continuous system, we can easily determine whether our current utilization exceeds this threshold at our current capacity/frequency. But what is the new frequency target? Without a table to guide us we have to just make something up!
So I decided to transmute the threshold into a margin. Instead of checking to see if we crossed some boundary we always try to maintain a bit of overhead. This works for table-based and table-less systems, and allows us to hit the minimum and maximum frequencies without any weird corner cases.
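[Editor's note: a concrete illustration of the margin, with made-up numbers: with utilization at 600 out of 1024, the request becomes]

	max_usage = 600 * 125 / 100;			/* = 750 */
	freq = (750 * policy->max) >> SCHED_LOAD_SHIFT;	/* ~73% of max */

[On a table-based system cpufreq rounds that up to the next opp; on a table-less system it is the request itself. A fixed 80% threshold would only tell us to go higher, not how much higher.]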
So we can't easily check if the cpu frequency needs to change or not in the scheduler hot path using this method.
You mean because in this case we don't have any reference to base such a threshold on?
Correct.
An alternative is to put the throttle check in the hot path and not kick the thread until we are unthrottled. I need to think about how to do this. I'd like to do it without locking, but mixing 64-bit ktime_t with 32-bit atomic_t is hard. Any ideas?
Don't we already bail out in gov_cfs_update_cpu() if we are not yet past the throttling threshold? This is in the hot path.
Yeah, but it is still hidden behind the rwsem. I'd like to get rid of that too if I can think of a way.
[snip]
Thanks for the patches. I'll review them and get back to you but I wanted to respond to your questions above asap.
Regards, Mike
[snip]
On 29/04/15 09:32, Michael Turquette wrote:
Quoting Juri Lelli (2015-04-28 10:48:27)
[snip]
On systems with discretized cpu frequencies (opps) we will often target the same opp, occasionally crossing the boundary into another opp. On systems with continuous cpu frequencies we will continually give ourselves "room to grow".
Can you make an example of such systems?
CPPC-based systems.
I thought a lot about all of the feedback that my v1 patchset got last week on eas-dev. Two comments in particular colored my views on supporting continuous frequency bands and not relying on a threshold.
First is Ashwins' comment here: https://lists.linaro.org/pipermail/eas-dev/2015-April/000093.html
Second is Morten's reply here: https://lists.linaro.org/pipermail/eas-dev/2015-April/000094.html
If we decide that we only care about opps then it is easy to create a threshold for the opp "bucket" that we are currently in. But on a continuous system creating a threshold is more difficult. E.g. if we decide to use an 80% threshold for a continuous system, we can easily determine whether our current utilization exceeds this threshold at our current capacity/frequency. But what is the new frequency target? Without a table to guide us we have to just make something up!
Right, but I'm still not sure that we want to continuously adapt to the current usage (plus the margin), as we might introduce too much overhead. Also, is it really worthwhile to activate all this just to save a little more power or run a little faster? This is really blue-sky, but maybe a trade-off would be to discretize such systems (if it makes sense to control them from the scheduler at all). Yes, we already have an activation threshold, but I'm not sure it is enough.
So I decided to transmute the threshold into a margin. Instead of checking to see if we crossed some boundary we always try to maintain a bit of overhead. This works for table-based and table-less systems, and allows us to hit the minimum and maximum frequencies without any weird corner cases.
Ok, the margin is fine. I kept it in my deltas below. I think we also need it to be able to stabilize capacity requests.
[snip]
Yeah, but it is still hidden behind the rwsem. I'd like to get rid of that too if I can think of a way.
Agreed, we should move away from that.
[snip]
Thanks for the patches. I'll review them and get back to you but I wanted to respond to your questions above asap.
Thanks!
Best,
- Juri
Regards, Mike
From 9f3d102e3f88d4e1d60c0d9497de709146e7f2ce Mon Sep 17 00:00:00 2001
From: Juri Lelli juri.lelli@arm.com
Date: Tue, 28 Apr 2015 14:10:57 +0100
Subject: [PATCH 1/4] sched/cpufreq_sched_cfs: implement direct API
Instead of using get_cpu_usage() we can let each CPU request the capacity it needs. The gov's kthread is responsible for aggregating requests.
A benefit of this new API is shown in task_tick_fair(), where we can request a transition to max opp only when really needed.
Signed-off-by: Juri Lelli juri.lelli@arm.com
 kernel/sched/cpufreq_sched_cfs.c | 18 +++++++++++-------
 kernel/sched/fair.c              | 26 +++++++++++++++++++++-----
 kernel/sched/sched.h             |  5 +++--
 3 files changed, 35 insertions(+), 14 deletions(-)
diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c
index 040469d..c8c6d2e 100644
--- a/kernel/sched/cpufreq_sched_cfs.c
+++ b/kernel/sched/cpufreq_sched_cfs.c
@@ -14,9 +14,10 @@

 #include "sched.h"

-#define MARGIN_PCT		125 /* taken from imbalance_pct = 125 */
 #define THROTTLE_NSEC		50000000 /* 50ms default */

+static DEFINE_PER_CPU(unsigned long, new_capacity);
+
 /**
  * gov_data - per-policy data internal to the governor
  * @throttle: next throttling period expiry. Derived from throttle_nsec
@@ -85,7 +86,7 @@ static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy)
 	 * lockless behavior.
 	 */
 	for_each_cpu(cpu, policy->cpus) {
-		usage = get_cpu_usage(cpu);
+		usage = per_cpu(new_capacity, cpu);
 		if (usage > max_usage)
 			max_usage = usage;
 	}
@@ -93,15 +94,13 @@ static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy)
 	/* add margin to max_usage based on imbalance_pct */
 	max_usage = max_usage * MARGIN_PCT / 100;

-	cpu = cpumask_first(policy->cpus);
-
-	if (max_usage >= capacity_orig_of(cpu)) {
+	if (max_usage >= SCHED_LOAD_SCALE) {
 		freq = policy->max;
 		goto out;
 	}

 	/* freq is current utilization + 25% */
-	freq = max_usage * policy->max / capacity_orig_of(cpu);
+	freq = (max_usage * policy->max) >> SCHED_LOAD_SHIFT;

 out:
 	return freq;
@@ -201,7 +200,7 @@ static void gov_cfs_irq_work(struct irq_work *irq_work)
  * gov_cfs_update_cpu raises an IPI. The irq_work handler for that IPI wakes up
  * the thread that does the actual work, gov_cfs_thread.
  */
-void gov_cfs_update_cpu(int cpu)
+void gov_cfs_update_cpu(int cpu, unsigned long capacity)
 {
 	struct cpufreq_policy *policy;
 	struct gov_data *gd;
@@ -223,6 +222,7 @@ void gov_cfs_update_cpu(int cpu)
 		goto out;
 	}

+	per_cpu(new_capacity, cpu) = capacity;
 	irq_work_queue_on(&gd->irq_work, cpu);

 out:
@@ -233,6 +233,7 @@ out:
 static void gov_cfs_start(struct cpufreq_policy *policy)
 {
 	struct gov_data *gd;
+	int cpu;

 	/* prepare per-policy private data */
 	gd = kzalloc(sizeof(*gd), GFP_KERNEL);
@@ -251,6 +252,9 @@ static void gov_cfs_start(struct cpufreq_policy *policy)
 	pr_debug("%s: throttle threshold = %u [ns]\n",
 		 __func__, gd->throttle_nsec);

+	for_each_cpu(cpu, policy->related_cpus)
+		per_cpu(new_capacity, cpu) = 0;
+
 	/* init per-policy kthread */
 	gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
 	if (IS_ERR_OR_NULL(gd->task))
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 041538e..27e21a1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4267,7 +4267,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	}

 	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq));
+		gov_cfs_update_cpu(cpu_of(rq), rq->cfs.utilization_load_avg);

 	hrtick_update(rq);
 }
@@ -4332,7 +4332,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	}

 	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq));
+		gov_cfs_update_cpu(cpu_of(rq), rq->cfs.utilization_load_avg);

 	hrtick_update(rq);
 }
@@ -4800,6 +4800,12 @@ next:
 done:
 	return target;
 }

+unsigned long capacity_curr_of(int cpu)
+{
+	return arch_scale_freq_capacity(NULL, cpu);
+}
+
 /*
  * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
  * tasks. The unit of the return value must be the one of capacity so we can
@@ -4817,7 +4823,7 @@ done:
  * Without capping the usage, a group could be seen as overloaded (CPU0 usage
  * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
  */
-int get_cpu_usage(int cpu)
+static int get_cpu_usage(int cpu)
 {
 	unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
 	unsigned long capacity = capacity_orig_of(cpu);
@@ -7820,6 +7826,11 @@ static void rq_offline_fair(struct rq *rq)

 #endif /* CONFIG_SMP */

+static inline unsigned long task_utilization(struct task_struct *p)
+{
+	return p->se.avg.utilization_avg_contrib;
+}
+
 /*
  * scheduler tick hitting a task of our scheduling class:
  */
@@ -7827,6 +7838,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &curr->se;
+	int cpu = task_cpu(curr);

 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
@@ -7838,8 +7850,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)

 	update_rq_runnable_avg(rq, 1);

-	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq));
+	if (sched_energy_freq() &&
+	    (capacity_curr_of(cpu) < SCHED_LOAD_SCALE) &&
+	    ((capacity_curr_of(cpu) * 100) <
+	     (task_utilization(curr) * MARGIN_PCT))) {
+		gov_cfs_update_cpu(cpu_of(rq), SCHED_LOAD_SCALE);
+	}
 }

 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ec23523..3983bd6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1396,11 +1396,12 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 }
 #endif

-int get_cpu_usage(int cpu);
 unsigned long capacity_orig_of(int cpu);
+unsigned long capacity_curr_of(int cpu);

 #ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS
-void gov_cfs_update_cpu(int cpu);
+#define MARGIN_PCT		125 /* taken from imbalance_pct = 125 */
+void gov_cfs_update_cpu(int cpu, unsigned long capacity);
 #else
 static inline void gov_cfs_update_cpu(int cpu) {}
 #endif
2.2.2
I then tried to address the tail effect when the task starts behaving as light again (~2 sec in the plots). The problem is that when we dequeue the task we see no utilization on the rq and we go to min. When the task is enqueued back we go to max (at least the first time) and then we continue this sort of ping-pong until we converge to the actual (light) utilization. The following patch changes this behaviour as in Fig.3: in the tail we slowly adapt to the new task phase, considering the decaying effect of the utilization.
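As a rough worked example of that tail (editorial numbers, assuming the usual PELT half-life of about 32 ms): a task running at utilization ~700 that sleeps for one half-life wakes with its contribution decayed to ~350, and after two half-lives to ~175, while the idle rq's own signal has meanwhile decayed toward zero. Summing the rq utilization and the waking task's (pre-decayed) contribution, as the patch below does in select_task_rq_fair(), keeps the request anchored near the task's recent demand instead of restarting the ramp from the bottom on every wakeup.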
From 81cefca25fa022913dc2913acf71414925b997eb Mon Sep 17 00:00:00 2001
From: Juri Lelli juri.lelli@arm.com
Date: Tue, 28 Apr 2015 16:08:37 +0100
Subject: [PATCH 2/4] sched/cpufreq_sched_cfs: (re)move triggering points
remove the trigger in enqueue_task_fair() and move it into select_task_rq_fair(); also consider the pre-decayed task utilization, as we want to stabilize capacity requests

modify the dequeue_task_fair() trigger; don't scale down when we are going idle (this change requires a small addition to the governor's API)
Signed-off-by: Juri Lelli juri.lelli@arm.com
 kernel/sched/cpufreq_sched_cfs.c |  5 +++++
 kernel/sched/fair.c              | 32 ++++++++++++++++++++++----------
 kernel/sched/sched.h             |  1 +
 3 files changed, 28 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c
index c8c6d2e..2fe1684 100644
--- a/kernel/sched/cpufreq_sched_cfs.c
+++ b/kernel/sched/cpufreq_sched_cfs.c
@@ -187,6 +187,11 @@ static void gov_cfs_irq_work(struct irq_work *irq_work)
 	wake_up_process(gd->task);
 }

+void gov_cfs_reset_cpu(int cpu)
+{
+	per_cpu(new_capacity, cpu) = 0;
+}
+
 /**
  * gov_cfs_update_cpu - interface to scheduler for changing capacity values
  * @cpu: cpu whose capacity utilization has recently changed
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 27e21a1..4e21abf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4266,9 +4266,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		add_nr_running(rq, 1);
 	}

-	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq), rq->cfs.utilization_load_avg);
-
 	hrtick_update(rq);
 }
@@ -4331,8 +4328,18 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		update_rq_runnable_avg(rq, 1);
 	}

-	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq), rq->cfs.utilization_load_avg);
+	if(sched_energy_freq()) {
+		/*
+		 * Ask for an update only if we are not going idle.
+		 * If we are going idle we just need to clear our
+		 * current request.
+		 */
+		if (rq->cfs.nr_running)
+			gov_cfs_update_cpu(cpu_of(rq),
+					   rq->cfs.utilization_load_avg);
+		else
+			gov_cfs_reset_cpu(cpu_of(rq));
+	}

 	hrtick_update(rq);
 }
} @@ -4834,6 +4841,11 @@ static int get_cpu_usage(int cpu) return (usage * capacity) >> SCHED_LOAD_SHIFT; }
+static inline unsigned long task_utilization(struct task_struct *p) +{
return p->se.avg.utilization_avg_contrib;
+}
/*
- select_task_rq_fair: Select target runqueue for the waking task in domains
- that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -4922,6 +4934,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f unlock: rcu_read_unlock();
/* We want to consider the pre-decayed utilization */
if(sched_energy_freq())
gov_cfs_update_cpu(new_cpu,
cpu_rq(new_cpu)->cfs.utilization_load_avg +
task_utilization(p)); return new_cpu;
}
@@ -7826,11 +7843,6 @@ static void rq_offline_fair(struct rq *rq)
#endif /* CONFIG_SMP */
-static inline unsigned long task_utilization(struct task_struct *p) -{
return p->se.avg.utilization_avg_contrib;
-}
/*
- scheduler tick hitting a task of our scheduling class:
*/ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3983bd6..6dd8f3a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1402,6 +1402,7 @@ unsigned long capacity_curr_of(int cpu); #ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS #define MARGIN_PCT 125 /* taken from imbalance_pct = 125 */ void gov_cfs_update_cpu(int cpu, unsigned long capacity); +void gov_cfs_reset_cpu(int cpu); #else static inline void gov_cfs_update_cpu(int cpu) {}
#endif
2.2.2
This approach seems to work also for a light/medium/light task. We go to max and then adapt to the real (medium) utilization (Fig.4).
Finally, a couple more patches (the first should actually be squashed into 01/04) to cover the load-balancing paths (not really tested yet).
From 3e7226989c21fdd680279f4f8a150597b5833b95 Mon Sep 17 00:00:00 2001
From: Juri Lelli juri.lelli@arm.com
Date: Tue, 28 Apr 2015 16:10:00 +0100
Subject: [PATCH 3/4] sched/cpufreq_sched_cfs: update requested capacity even when throttled
If the kthread is throttled we still need to update requests, or we may end up with stale values.
Signed-off-by: Juri Lelli juri.lelli@arm.com
 kernel/sched/cpufreq_sched_cfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c
index 2fe1684..c8d9408 100644
--- a/kernel/sched/cpufreq_sched_cfs.c
+++ b/kernel/sched/cpufreq_sched_cfs.c
@@ -222,12 +222,13 @@ void gov_cfs_update_cpu(int cpu, unsigned long capacity)

 	gd = policy->governor_data;

+	per_cpu(new_capacity, cpu) = capacity;
+
 	/* bail early if we are throttled */
 	if (ktime_before(ktime_get(), gd->throttle)) {
 		goto out;
 	}

-	per_cpu(new_capacity, cpu) = capacity;
 	irq_work_queue_on(&gd->irq_work, cpu);

 out:
2.2.2
From 333e1741c7de8dfc21f5bb9f2a9c29d4dc84f2de Mon Sep 17 00:00:00 2001
From: Juri Lelli juri.lelli@arm.com
Date: Tue, 28 Apr 2015 16:55:22 +0100
Subject: [PATCH 4/4] sched/fair: cpufreq_sched_cfs triggers for load_balancing
This should cover load_balance paths (untested).
Signed-off-by: Juri Lelli juri.lelli@arm.com
 kernel/sched/fair.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4e21abf..ad1e7cc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7203,6 +7203,14 @@ out_one_pinned:
 	ld_moved = 0;

 out:
+	/* dst_grpmask might be NULL for NEWLY_IDLE. */
+	if (sched_energy_freq() && ld_moved && env.dst_grpmask)
+		/*
+		 * dequeue_task_fair() already took care of src_cpu
+		 */
+		gov_cfs_update_cpu(env.dst_cpu,
+				   cpu_rq(env.dst_cpu)->cfs.utilization_load_avg);
+
 	return ld_moved;
 }
@@ -7402,8 +7410,12 @@ out_unlock:
 	busiest_rq->active_balance = 0;
 	raw_spin_unlock(&busiest_rq->lock);

-	if (p)
+	if (p) {
 		attach_one_task(target_rq, p);
+		if (sched_energy_freq())
+			gov_cfs_update_cpu(cpu_of(target_rq),
+					   target_rq->cfs.utilization_load_avg);
+	}

 	local_irq_enable();
-- 2.2.2
Comments?
Thanks,
- Juri
Hi Juri,
On 29 April 2015 at 05:39, Juri Lelli juri.lelli@arm.com wrote:
On 29/04/15 09:32, Michael Turquette wrote:
Quoting Juri Lelli (2015-04-28 10:48:27)
Hi Mike,
I apologize in advance for the long email, but I still wanted to share today's thoughts with you :).
On 28/04/15 05:02, Michael Turquette wrote:
Quoting Juri Lelli (2015-04-27 10:09:50)
[snip]
wake_up_process(gd->task);
So, we always wake up the kthread, even when we know that we won't need a freq change. This might be, I fear, an almost certain source of reasonable complaints and pushback. I understand that we might not want to start optimizing things, but IMHO this point deserves some more thought before posting. Don't you think we could do some level of aggregation before kicking the kthread? In task_tick_fair(), for example, we could just check if we are beyond the 25% threshold and kick the kthread only in that case.
This patch does not check against a threshold. It always requests a rate based on the current utilization plus 25%.
[snip]
IIUC, the optimization you're getting at is to suppress the CPU freq requests when they fall within some range of the current OPP? I think this may hamper certain latency sensitive workloads, since the freq ramp up could potentially be slowed down. So, there's some merit in making the request path as quick as possible and allowing for continuous adaptation. I need to look at your patches in more detail, but eyeballing them it seems like you're trying to achieve that.
From the energy model perspective, can a continuous performance band
be supported at all or is it a hard requirement to have a discretized table?
Regards, Ashwin.
Hi Ashwin,
On 04/05/15 14:41, Ashwin Chaugule wrote:
Hi Juri,
[snip]
IIUC, the optimization you're getting at is to suppress the CPU freq requests when they fall within some range of the current OPP? I think this may hamper certain latency sensitive workloads, since the freq ramp up could potentially be slowed down. So, there's some merit in making the request path as quick as possible and allowing for continuous adaptation. I need to look at your patches in more detail, but eyeballing them it seems like you're trying to achieve that.
So, the energy model (and please mind that the patches on top of Mike's patchset don't have that yet) currently gives you these "capacity bands". The idea is to try to adapt the OPP selection to the usage you see on your CPU/cluster. Since the usage signal is subject to saturation, what I'm trying to do is to avoid this condition by jumping up to the max available OPP when we realize that we are going to saturate a particular OPP. After we run for a small interval of time (say a tick) at that max OPP we can better estimate the real usage and directly select an OPP ("capacity band") that suits it.
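To put editorial numbers on the saturation problem: at an OPP whose current capacity is 410 out of 1024, a cpu-bound task's utilization signal can never climb past ~410, so a reading of 400 is ambiguous between "this task needs 40% of the cpu" and "this task will take everything it can get". Jumping to the max OPP removes the ceiling; if after a tick the signal settles at, say, 620, the governor can drop straight to the band covering 620 * 125 / 100 = 775 instead of ramping up step by step.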
I see your point, though. I think the two approaches differ for how we get to the desired capacity: ramping up from bottom vs. selecting from top.
From the energy model perspective, can a continuous performance band be supported at all or is it a hard requirement to have a discretized table?
I don't think it's a hard requirement (Morten or Dietmar may correct me here), but just an abstraction of the systems we develop on today. I guess we would need to compute some formulas at run time, instead of reading tabular values, if we want to have continuous performance bands. Food for thought :).
Thanks,
- Juri
On Tue, May 5, 2015 at 3:12 AM, Juri Lelli juri.lelli@arm.com wrote:
[snip]
So, the energy model (and please mind that the patches on top of Mike's patchset don't have that yet) currently gives you these "capacity bands". The idea is to try to adapt the OPP selection to the usage you see on your CPU/cluster. Since the usage signal is subject to saturation, what I'm trying to do is to avoid this condition by jumping up to the max available OPP when we realize that we are going to saturate a particular OPP. After we run for a small interval of time (say a tick) at that max OPP we can better estimate the real usage and directly select an OPP ("capacity band") that suits it.
I'm not sure about jumping to the max frequency when we detect that the signal is saturated.
Ondemand has similar behavior to this and many vendors have implemented out-of-tree solutions that do something like setting the frequency to an "intermediate" rate (maybe 2/3 of the total performance band) and then re-evaluate if they need to jump to max performance after another sampling period.
So at some point you might face the same issue, where vendors find this approach too aggressive and too wasteful of power, thus some intermediate level will be introduced. I'm not providing you any solutions here, but I'm saying that designing a policy algorithm that works well for everyone is super hard.
I see your point, though. I think the two approaches differ for how we get to the desired capacity: ramping up from bottom vs. selecting from top.
From the energy model perspective, can a continuous performance band be supported at all or is it a hard requirement to have a discretized table?
I don't think it's a hard requirement (Morten or Dietmar may correct me here), but just an abstraction of the systems we develop on today. I guess we would need to compute some formulas at run time, instead of reading tabular values, if we want to have continuous performance bands. Food for thought :).
We could also tablify continuous frequency domains based on some reasonable factor like 50 MHz or something. I guess that factor could even be supplied by the driver.
Regards, Mike
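As an editorial aside, here is a minimal sketch of the "tablify" idea above (helper and parameter names are made up; the step would come from the driver, e.g. 50000 kHz):

#include <linux/kernel.h>	/* DIV_ROUND_UP, min */

/* quantize a request in a continuous [min_khz, max_khz] domain */
static unsigned int quantize_freq(unsigned int req_khz,
				  unsigned int min_khz,
				  unsigned int max_khz,
				  unsigned int step_khz)
{
	unsigned int steps;

	if (req_khz <= min_khz)
		return min_khz;
	if (req_khz >= max_khz)
		return max_khz;

	/* round up so quantization never eats the requested headroom */
	steps = DIV_ROUND_UP(req_khz - min_khz, step_khz);
	return min(min_khz + steps * step_khz, max_khz);
}

Coarser steps would mean fewer transitions (and fewer kthread wakeups) at the cost of running slightly above the ideal continuous frequency; the 50 MHz figure is just a plausible middle ground.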
Hi Mike,
On 06/05/15 01:58, Mike Turquette wrote:
[snip]
I'm not sure about jumping to the max frequency when we detect that the signal is saturated.
Ondemand has similar behavior to this and many vendors have implemented out-of-tree solutions that do something like setting the frequency to an "intermediate" rate (maybe 2/3 of the total performance band) and then re-evaluate if they need to jump to max performance after another sampling period.
So at some point you might face the same issue, where vendors find this approach too aggressive and too wasteful of power, thus some intermediate level will be introduced. I'm not providing you any solutions here, but I'm saying that designing a policy algorithm that works well for everyone is super hard.
No doubt about this :).
I got your point, but I guess it should be fairly easy to make this freq at which we jump somewhat "configurable". Makes sense to me, considering the variety of shapes power-perf curves can have, for example.
[snip]
We could also tablify continuous frequency domains based on some reasonable factor like 50 MHz or something. I guess that factor could even be supplied by the driver.
Agree. That's what I was thinking with "discretize continuous systems".
Best,
- Juri
On 6 May 2015 at 04:34, Juri Lelli juri.lelli@arm.com wrote:
[snip]
I got your point, but I guess it should be fairly easy to make this freq at which we jump somewhat "configurable". Makes sense to me, considering the variety of shapes power-perf curves can have, for example.
Instead of another knob, perhaps we could make the 25% headroom flexible by adapting it to current vs past utilization? Not something we need to start off with, but a possible future optimization.
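A minimal sketch of what that adaptation might look like (editorial only; nothing like this is in the posted patches, and the per-cpu history is a made-up variable):

static DEFINE_PER_CPU(unsigned long, prev_util);

/* widen the 25% headroom while utilization is rising */
static unsigned long adaptive_margin_pct(int cpu, unsigned long util)
{
	unsigned long prev = per_cpu(prev_util, cpu);
	unsigned long margin = 125;	/* static default */

	if (prev && util > prev)
		margin = min(150UL,
			     125UL + ((util - prev) * 100UL) / prev);

	per_cpu(prev_util, cpu) = util;
	return margin;
}

The cap keeps a sudden spike from requesting more than 50% headroom, and steady or falling utilization degenerates to the existing static margin.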
[snip]
Agree. That's what I was thinking with "discretize continuous systems".
Sounds possible. Probably not a big deal, but there's a chance of losing out on some power optimization depending on how many discrete steps you make. A matter of system profiling, I guess.
Regards, Ashwin.
On 27 April 2015 at 09:46, Michael Turquette mturquette@linaro.org wrote:
Scheduler-driven cpu frequency selection is desirable as part of the on-going effort to make the scheduler better aware of energy consumption. No piece of the Linux kernel has a better view of the factors that affect a cpu frequency selection policy than the scheduler[0], and this patch is an attempt to get that discussion going again.
This patch implements a cpufreq governor, sched_cfs, that directly accesses scheduler statistics, in particular the pelt data from cfs via the get_cpu_usage() function.
Put plainly, sched_cfs selects the lowest cpu frequency that will prevent a runqueue from being over-utilized (until we hit the highest frequency of course). This is done by requesting a frequency which is equivalent to the current capacity utilization, plus a margin.
Unlike the previous posting from 2014[1] this governor implements a "follow the usage" method, where usage is defined as the cpu frequency-invariant product of utilization_load_avg and cpu_capacity_orig.
This governor is event-driven. There is no polling loop to check cpu idle time, or any other method which is unsynchronized with the scheduler. The entry points for this policy are in fair.c: enqueue_task_fair, dequeue_task_fair and task_tick_fair.
This policy is implemented using the cpufreq governor interface for two main reasons:
- re-using the cpufreq machine drivers without using the governor
interface is hard.
- using the cpufreq interface allows us to switch between the
scheduler-driven policy and legacy cpufreq governors such as ondemand at run-time. This is very useful for comparative testing and tuning.
Finally, it is worth mentioning that this approach neglects all scheduling classes except for cfs. It is possible to add support for deadline and other classes here, but I also wonder if a multi-governor approach would be a more maintainable solution, where the cpufreq core aggregates the constraints set by multiple governors. Supporting such an approach in the cpufreq core would also allow peripheral devices to place constraints on cpu frequency without having to hack such behavior in at the governor level.
Thanks to Juri Lelli juri.lelli@arm.com for doing a good bit of testing, bug fixing and contributing towards the design.
[0] http://article.gmane.org/gmane.linux.kernel/1499836
[1] https://lkml.org/lkml/2014/10/22/22
Signed-off-by: Michael Turquette mturquette@linaro.org
changes since internal v1:
- renamed everything
- fixed possible deadlock between gov_cfs_thread and gov_cfs_stop
- replaced direct usage-to-frequency mapping with usage+margin-to-frequency mapping. This functions like an up_threshold and allows us to easily work with non-discretized frequency ranges
- usage-to-frequency calculation now uses capacity_orig instead of SCHED_LOAD_SCALE to handle SMT and asymmetric cpu use cases
- dropped workqueue method due to instability
- kthread is woken up by irq_work handler. This removes the need for cap_gov_kick_thread() from v1
 drivers/cpufreq/Kconfig          |  24 +++
 include/linux/cpufreq.h          |   3 +
 kernel/sched/Makefile            |   1 +
 kernel/sched/cpufreq_sched_cfs.c | 314 +++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c              |  11 ++
 kernel/sched/sched.h             |   6 +
 6 files changed, 359 insertions(+)
 create mode 100644 kernel/sched/cpufreq_sched_cfs.c
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index a171fef..35ba9c3 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -102,6 +102,15 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
 	  Be aware that not all cpufreq drivers support the conservative
 	  governor. If unsure have a look at the help section of the
 	  driver. Fallback governor will be the performance governor.
+
+config CPU_FREQ_DEFAULT_GOV_SCHED_CFS
+	bool "sched_cfs"
+	select CPU_FREQ_GOV_SCHED_CFS
+	select CPU_FREQ_GOV_PERFORMANCE
Hi Mike,
do you really need to select CPU_FREQ_GOV_PERFORMANCE ?
+	help
+	  Use the CPUfreq governor 'sched_cfs' as default. This scales
+	  cpu frequency from the scheduler as per-entity load tracking
+	  statistics are updated.
+
 endchoice
 config CPU_FREQ_GOV_PERFORMANCE
@@ -183,6 +192,21 @@ config CPU_FREQ_GOV_CONSERVATIVE

 	  If in doubt, say N.

+config CPU_FREQ_GOV_SCHED_CFS
+	tristate "'sched cfs' cpufreq governor"
+	depends on CPU_FREQ
+	select CPU_FREQ_GOV_COMMON
+	help
+	  'sched_cfs' - this governor scales cpu frequency from the
+	  scheduler as a function of cpu capacity utilization. It does
+	  not evaluate utilization on a periodic basis (as ondemand
+	  does) but instead is invoked from the completely fair
+	  scheduler when updating per-entity load tracking statistics.
+	  Latency to respond to changes in load is improved over polling
+	  governors due to its event-driven design.
+
+	  If in doubt, say N.
+
 comment "CPU frequency scaling drivers"
 config CPUFREQ_DT
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 2ee4888..62e8152 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -485,6 +485,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand;
 #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
 extern struct cpufreq_governor cpufreq_gov_conservative;
 #define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_conservative)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CAP_GOV)
+extern struct cpufreq_governor cpufreq_gov_cap_gov;
+#define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_cap_gov)
 #endif

 /*********************************************************************
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 46be870..003b592 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_FREQ_GOV_SCHED_CFS) += cpufreq_sched_cfs.o
diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c
new file mode 100644
index 0000000..746b220
--- /dev/null
+++ b/kernel/sched/cpufreq_sched_cfs.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright (C) 2015 Michael Turquette mturquette@linaro.org
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/percpu.h>
+#include <linux/irq_work.h>
+
+#include "sched.h"
+
+#define MARGIN_PCT		125 /* taken from imbalance_pct = 125 */
+#define THROTTLE_NSEC		50000000 /* 50ms default */
+/**
+ * gov_data - per-policy data internal to the governor
+ * @throttle: next throttling period expiry. Derived from throttle_nsec
+ * @throttle_nsec: throttle period length in nanoseconds
+ * @task: worker thread for dvfs transition that may block/sleep
+ * @irq_work: callback used to wake up worker thread
+ *
+ * struct gov_data is the per-policy gov_cfs-specific data structure. A
+ * per-policy instance of it is created when the gov_cfs governor receives
+ * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
+ * member of struct cpufreq_policy.
+ *
+ * Readers of this data must call down_read(policy->rwsem). Writers must
+ * call down_write(policy->rwsem).
+ */
+struct gov_data {
+	ktime_t throttle;
+	unsigned int throttle_nsec;
+	struct task_struct *task;
+	struct irq_work irq_work;
+	struct cpufreq_policy *policy;
+};
+/**
+ * gov_cfs_select_freq - pick the next frequency for a cpu
+ * @policy: the cpufreq policy whose frequency may be changed
+ *
+ * gov_cfs_select_freq selects a frequency based on pelt load statistics
+ * tracked by cfs. First it finds the most utilized cpu in the policy and then
+ * maps that utilization value onto a cpu frequency and returns it.
+ *
+ * Additionally, gov_cfs_select_freq adds a margin to the cpu utilization value
+ * before converting it to a frequency. The margin is derived from MARGIN_PCT,
+ * which itself is inspired by imbalance_pct in cfs. This is needed to
+ * proactively increase frequency in the case of increasing load.
+ *
+ * This approach attempts to maintain headroom of 25% unutilized cpu capacity.
+ * A traditional way of doing this is to take 75% of the current capacity and
+ * check if current utilization crosses that threshold. The only problem with
+ * that approach is determining the next cpu frequency target if that threshold
+ * is crossed.
+ *
+ * Instead of using the 75% threshold, gov_cfs_select_freq adds a 25%
+ * utilization margin to the utilization and converts that to a frequency. This
+ * removes conditional logic around checking thresholds and better supports
+ * drivers that use non-discretized frequency ranges (i.e. no pre-defined
+ * frequency tables or operating points).
+ *
+ * Returns frequency selected.
+ */
+static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy)
+{
+	int cpu = 0;
+	struct gov_data *gd;
+	unsigned long freq = 0, max_usage = 0, usage = 0;
+
+	if (!policy->governor_data)
+		goto out;
+
+	gd = policy->governor_data;
+
+	/*
+	 * get_cpu_usage is called without locking the runqueues. This is the
+	 * same behavior used by find_busiest_cpu in load_balance. We are
+	 * willing to accept occasionally stale data here in exchange for
+	 * lockless behavior.
+	 */
+	for_each_cpu(cpu, policy->cpus) {
+		usage = get_cpu_usage(cpu);
+		if (usage > max_usage)
+			max_usage = usage;
+	}
+
+	/* add margin to max_usage based on imbalance_pct */
+	max_usage = max_usage * MARGIN_PCT / 100;
+
+	cpu = cpumask_first(policy->cpus);
+
+	/* freq is current utilization + 25% */
+	freq = max_usage * policy->max / capacity_orig_of(cpu);
+
+out:
+	return freq;
+}
+/*
+ * we pass in struct cpufreq_policy. This is safe because changing out the
+ * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
+ * which tears down all of the data structures and __cpufreq_governor(policy,
+ * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
+ * new policy pointer
+ */
+static int gov_cfs_thread(void *data)
+{
+	struct sched_param param;
+	struct cpufreq_policy *policy;
+	struct gov_data *gd;
+	unsigned long freq;
+	int ret;
+
+	policy = (struct cpufreq_policy *) data;
+	if (!policy) {
+		pr_warn("%s: missing policy\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	gd = policy->governor_data;
+	if (!gd) {
+		pr_warn("%s: missing governor data\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	param.sched_priority = 50;
+	ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
+	if (ret) {
+		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+		do_exit(-EINVAL);
+	} else {
+		pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
+				__func__, gd->task->pid);
+	}
+
+	ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);
+	if (ret) {
+		pr_warn("%s: failed to set allowed ptr\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	/* main loop of the per-policy kthread */
+	do {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+		if (kthread_should_stop())
+			break;
+
+		/* avoid race with gov_cfs_stop */
+		if (!down_write_trylock(&policy->rwsem))
+			continue;
+
+		freq = gov_cfs_select_freq(policy);
+
+		ret = __cpufreq_driver_target(policy, freq,
+				CPUFREQ_RELATION_H);
+		if (ret)
+			pr_debug("%s: __cpufreq_driver_target returned %d\n",
+					__func__, ret);
+
+		gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
+		up_write(&policy->rwsem);
+	} while (!kthread_should_stop());
+
+	do_exit(0);
+}
+
+static void gov_cfs_irq_work(struct irq_work *irq_work)
+{
+	struct gov_data *gd;
+
+	gd = container_of(irq_work, struct gov_data, irq_work);
+	if (!gd) {
+		return;
+	}
+
+	wake_up_process(gd->task);
+}
+/**
+ * gov_cfs_update_cpu - interface to scheduler for changing capacity values
+ * @cpu: cpu whose capacity utilization has recently changed
+ *
+ * gov_cfs_update_cpu is an interface exposed to the scheduler so that the
+ * scheduler may inform the governor of updates to capacity utilization and
+ * make changes to cpu frequency. Currently this interface is designed around
+ * PELT values in CFS. It can be expanded to other scheduling classes in the
+ * future if needed.
+ *
+ * gov_cfs_update_cpu raises an IPI. The irq_work handler for that IPI wakes up
+ * the thread that does the actual work, gov_cfs_thread.
+ */
+void gov_cfs_update_cpu(int cpu)
+{
+	struct cpufreq_policy *policy;
+	struct gov_data *gd;
+
+	/* XXX put policy pointer in per-cpu data? */
+	policy = cpufreq_cpu_get(cpu);
+	if (IS_ERR_OR_NULL(policy)) {
+		return;
+	}
+
+	if (!policy->governor_data) {
+		goto out;
+	}
+
+	gd = policy->governor_data;
+
+	/* bail early if we are throttled */
+	if (ktime_before(ktime_get(), gd->throttle)) {
+		goto out;
+	}
+
+	irq_work_queue_on(&gd->irq_work, cpu);
+
+out:
+	cpufreq_cpu_put(policy);
+	return;
+}
+static void gov_cfs_start(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd;
+
+	/* prepare per-policy private data */
+	gd = kzalloc(sizeof(*gd), GFP_KERNEL);
+	if (!gd) {
+		pr_debug("%s: failed to allocate private data\n", __func__);
+		return;
+	}
+
+	/*
+	 * Don't ask for freq changes at a higher rate than what
+	 * the driver advertises as transition latency.
+	 */
+	gd->throttle_nsec = policy->cpuinfo.transition_latency ?
+			    policy->cpuinfo.transition_latency :
+			    THROTTLE_NSEC;
+	pr_debug("%s: throttle threshold = %u [ns]\n",
+		 __func__, gd->throttle_nsec);
+
+	/* init per-policy kthread */
+	gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
+	if (IS_ERR_OR_NULL(gd->task))
+		pr_err("%s: failed to create kgov_cfs_task thread\n", __func__);
+
+	init_irq_work(&gd->irq_work, gov_cfs_irq_work);
+	policy->governor_data = gd;
+	gd->policy = policy;
+}
+
+static void gov_cfs_stop(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd;
+
+	gd = policy->governor_data;
+
+	kthread_stop(gd->task);
+
+	policy->governor_data = NULL;
+
+	/* FIXME replace with devm counterparts? */
+	kfree(gd);
+}
+
+static int gov_cfs_setup(struct cpufreq_policy *policy, unsigned int event)
+{
+	switch (event) {
+	case CPUFREQ_GOV_START:
+		/* Start managing the frequency */
+		gov_cfs_start(policy);
+		return 0;
+
+	case CPUFREQ_GOV_STOP:
+		gov_cfs_stop(policy);
+		return 0;
+
+	case CPUFREQ_GOV_LIMITS:	/* unused */
+	case CPUFREQ_GOV_POLICY_INIT:	/* unused */
+	case CPUFREQ_GOV_POLICY_EXIT:	/* unused */
+		break;
+	}
+
+	return 0;
+}
+
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_CFS
+static
+#endif
+struct cpufreq_governor cpufreq_gov_cfs = {
+	.name		= "gov_cfs",
+	.governor	= gov_cfs_setup,
+	.owner		= THIS_MODULE,
+};
+
+static int __init gov_cfs_init(void)
+{
+	return cpufreq_register_governor(&cpufreq_gov_cfs);
+}
+
+static void __exit gov_cfs_exit(void)
+{
+	cpufreq_unregister_governor(&cpufreq_gov_cfs);
+}
+
+/* Try to make this the default governor */
+fs_initcall(gov_cfs_init);
+MODULE_LICENSE("GPL");
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 393fc36..a7b97f9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4257,6 +4257,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		update_rq_runnable_avg(rq, rq->nr_running);
 		add_nr_running(rq, 1);
 	}
+
+	if(sched_energy_freq())
+		gov_cfs_update_cpu(cpu_of(rq));
+
 	hrtick_update(rq);
 }
@@ -4318,6 +4322,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		sub_nr_running(rq, 1);
 		update_rq_runnable_avg(rq, 1);
 	}
+
+	if(sched_energy_freq())
+		gov_cfs_update_cpu(cpu_of(rq));
+
 	hrtick_update(rq);
 }
@@ -7821,6 +7829,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		task_tick_numa(rq, curr);

 	update_rq_runnable_avg(rq, 1);
+
+	if(sched_energy_freq())
+		gov_cfs_update_cpu(cpu_of(rq));
 }

 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 63a8be9..ec23523 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1399,6 +1399,12 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 int get_cpu_usage(int cpu);
 unsigned long capacity_orig_of(int cpu);

+#ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS
+void gov_cfs_update_cpu(int cpu);
+#else
+static inline void gov_cfs_update_cpu(int cpu) {}
+#endif
+
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
 	rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
--
1.9.1
Quoting Vincent Guittot (2015-04-28 07:10:20)
[snip]
+config CPU_FREQ_DEFAULT_GOV_SCHED_CFS
+	bool "sched_cfs"
+	select CPU_FREQ_GOV_SCHED_CFS
+	select CPU_FREQ_GOV_PERFORMANCE
Hi Mike,
do you really need to select CPU_FREQ_GOV_PERFORMANCE ?
Ondemand and conservative governors do this, so I copied that style. I guess the idea is that a production system should always have the option to run flat out, without requiring a kernel recompile or a kernel module.
Regards, Mike
help
Use the CPUfreq governor 'sched_cfs' as default. This scales
cpu frequency from the scheduler as per-entity load tracking
statistics are updated.
endchoice
config CPU_FREQ_GOV_PERFORMANCE @@ -183,6 +192,21 @@ config CPU_FREQ_GOV_CONSERVATIVE
If in doubt, say N.
+config CPU_FREQ_GOV_SCHED_CFS
tristate "'sched cfs' cpufreq governor"
depends on CPU_FREQ
select CPU_FREQ_GOV_COMMON
help
'sched_cfs' - this governor scales cpu frequency from the
scheduler as a function of cpu capacity utilization. It does
not evaluate utilization on a periodic basis (as ondemand
does) but instead is invoked from the completely fair
scheduler when updating per-entity load tracking statistics.
Latency to respond to changes in load is improved over polling
governors due to its event-driven design.
If in doubt, say N.
comment "CPU frequency scaling drivers"
config CPUFREQ_DT diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 2ee4888..62e8152 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -485,6 +485,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE) extern struct cpufreq_governor cpufreq_gov_conservative; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CAP_GOV) +extern struct cpufreq_governor cpufreq_gov_cap_gov; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_cap_gov) #endif
/********************************************************************* diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 46be870..003b592 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHED_CFS) += cpufreq_sched_cfs.o diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c new file mode 100644 index 0000000..746b220 --- /dev/null +++ b/kernel/sched/cpufreq_sched_cfs.c @@ -0,0 +1,314 @@ +/*
- Copyright (C) 2015 Michael Turquette mturquette@linaro.org
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License version 2 as
- published by the Free Software Foundation.
- */
+#include <linux/cpufreq.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/percpu.h> +#include <linux/irq_work.h>
+#include "sched.h"
+#define MARGIN_PCT 125 /* taken from imbalance_pct = 125 */ +#define THROTTLE_NSEC 50000000 /* 50ms default */
+/**
+ * gov_data - per-policy data internal to the governor
+ * @throttle: next throttling period expiry. Derived from throttle_nsec
+ * @throttle_nsec: throttle period length in nanoseconds
+ * @task: worker thread for dvfs transition that may block/sleep
+ * @irq_work: callback used to wake up worker thread
+ *
+ * struct gov_data is the per-policy gov_cfs-specific data structure. A
+ * per-policy instance of it is created when the gov_cfs governor receives
+ * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
+ * member of struct cpufreq_policy.
+ *
+ * Readers of this data must call down_read(policy->rwsem). Writers must
+ * call down_write(policy->rwsem).
+ */
+struct gov_data {
+	ktime_t throttle;
+	unsigned int throttle_nsec;
+	struct task_struct *task;
+	struct irq_work irq_work;
+	struct cpufreq_policy *policy;
+};
+/**
+ * gov_cfs_select_freq - pick the next frequency for a cpu
+ * @policy: the cpufreq policy whose frequency may be changed
+ *
+ * gov_cfs_select_freq selects a frequency based on pelt load statistics
+ * tracked by cfs. First it finds the most utilized cpu in the policy and then
+ * maps that utilization value onto a cpu frequency and returns it.
+ *
+ * Additionally, gov_cfs_select_freq adds a margin to the cpu utilization value
+ * before converting it to a frequency. The margin is derived from MARGIN_PCT,
+ * which itself is inspired by imbalance_pct in cfs. This is needed to
+ * proactively increase frequency in the case of increasing load.
+ *
+ * This approach attempts to maintain headroom of 25% unutilized cpu capacity.
+ * A traditional way of doing this is to take 75% of the current capacity and
+ * check if current utilization crosses that threshold. The only problem with
+ * that approach is determining the next cpu frequency target if that
+ * threshold is crossed.
+ *
+ * Instead of using the 75% threshold, gov_cfs_select_freq adds a 25%
+ * utilization margin to the utilization and converts that to a frequency.
+ * This removes conditional logic around checking thresholds and better
+ * supports drivers that use non-discretized frequency ranges (i.e. no
+ * pre-defined frequency tables or operating points).
+ *
+ * Returns frequency selected.
+ */
+static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy)
+{
+	int cpu = 0;
+	struct gov_data *gd;
+	unsigned long freq = 0, max_usage = 0, usage = 0;
+
+	if (!policy->governor_data)
+		goto out;
+
+	gd = policy->governor_data;
+
+	/*
+	 * get_cpu_usage is called without locking the runqueues. This is the
+	 * same behavior used by find_busiest_cpu in load_balance. We are
+	 * willing to accept occasionally stale data here in exchange for
+	 * lockless behavior.
+	 */
+	for_each_cpu(cpu, policy->cpus) {
+		usage = get_cpu_usage(cpu);
+		if (usage > max_usage)
+			max_usage = usage;
+	}
+
+	/* add margin to max_usage based on imbalance_pct */
+	max_usage = max_usage * MARGIN_PCT / 100;
+
+	cpu = cpumask_first(policy->cpus);
+
+	/* freq is current utilization + 25% */
+	freq = max_usage * policy->max / capacity_orig_of(cpu);
+
+out:
+	return freq;
+}
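As a concrete check of the arithmetic above, a minimal worked example (editorial sketch; the capacity and frequency values below are hypothetical, not taken from the patch):

	/*
	 * Worked example of the margin math in gov_cfs_select_freq.
	 * capacity_orig = 1024 and policy->max = 1500000 kHz are made up.
	 */
	unsigned long capacity_orig = 1024;
	unsigned long policy_max = 1500000;	/* kHz */
	unsigned long max_usage = 614;		/* ~60% of capacity_orig */
	unsigned long freq;

	max_usage = max_usage * 125 / 100;	/* MARGIN_PCT: 614 -> 767 */
	freq = max_usage * policy_max / capacity_orig;	/* ~1123535 kHz */

So 60% utilization plus the 25% margin maps to roughly 75% of fmax, which leaves the intended headroom without any explicit threshold check.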
+/*
+ * We pass in struct cpufreq_policy. This is safe because changing out the
+ * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
+ * which tears down all of the data structures and __cpufreq_governor(policy,
+ * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
+ * new policy pointer.
+ */
+static int gov_cfs_thread(void *data)
+{
+	struct sched_param param;
+	struct cpufreq_policy *policy;
+	struct gov_data *gd;
+	unsigned long freq;
+	int ret;
+
+	policy = (struct cpufreq_policy *) data;
+	if (!policy) {
+		pr_warn("%s: missing policy\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	gd = policy->governor_data;
+	if (!gd) {
+		pr_warn("%s: missing governor data\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	param.sched_priority = 50;
+	ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
+	if (ret) {
+		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+		do_exit(-EINVAL);
+	} else {
+		pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
+			 __func__, gd->task->pid);
+	}
+
+	ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);
+	if (ret) {
+		pr_warn("%s: failed to set allowed ptr\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	/* main loop of the per-policy kthread */
+	do {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+		if (kthread_should_stop())
+			break;
+
+		/* avoid race with gov_cfs_stop */
+		if (!down_write_trylock(&policy->rwsem))
+			continue;
+
+		freq = gov_cfs_select_freq(policy);
+
+		ret = __cpufreq_driver_target(policy, freq,
+					      CPUFREQ_RELATION_H);
+		if (ret)
+			pr_debug("%s: __cpufreq_driver_target returned %d\n",
+				 __func__, ret);
+
+		gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
+		up_write(&policy->rwsem);
+	} while (!kthread_should_stop());
+
+	do_exit(0);
+}
+static void gov_cfs_irq_work(struct irq_work *irq_work)
+{
+	struct gov_data *gd;
+
+	gd = container_of(irq_work, struct gov_data, irq_work);
+	if (!gd)
+		return;
+
+	wake_up_process(gd->task);
+}
+/**
+ * gov_cfs_update_cpu - interface to scheduler for changing capacity values
+ * @cpu: cpu whose capacity utilization has recently changed
+ *
+ * gov_cfs_update_cpu is an interface exposed to the scheduler so that the
+ * scheduler may inform the governor of updates to capacity utilization and
+ * make changes to cpu frequency. Currently this interface is designed around
+ * PELT values in CFS. It can be expanded to other scheduling classes in the
+ * future if needed.
+ *
+ * gov_cfs_update_cpu raises an IPI. The irq_work handler for that IPI wakes
+ * up the thread that does the actual work, gov_cfs_thread.
+ */
+void gov_cfs_update_cpu(int cpu)
+{
+	struct cpufreq_policy *policy;
+	struct gov_data *gd;
+
+	/* XXX put policy pointer in per-cpu data? */
+	policy = cpufreq_cpu_get(cpu);
+	if (IS_ERR_OR_NULL(policy))
+		return;
+
+	if (!policy->governor_data)
+		goto out;
+
+	gd = policy->governor_data;
+
+	/* bail early if we are throttled */
+	if (ktime_before(ktime_get(), gd->throttle))
+		goto out;
+
+	irq_work_queue_on(&gd->irq_work, cpu);
+
+out:
+	cpufreq_cpu_put(policy);
+}
+static void gov_cfs_start(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd;
+
+	/* prepare per-policy private data */
+	gd = kzalloc(sizeof(*gd), GFP_KERNEL);
+	if (!gd) {
+		pr_debug("%s: failed to allocate private data\n", __func__);
+		return;
+	}
+
+	/*
+	 * Don't ask for freq changes at a higher rate than what
+	 * the driver advertises as transition latency.
+	 */
+	gd->throttle_nsec = policy->cpuinfo.transition_latency ?
+			    policy->cpuinfo.transition_latency :
+			    THROTTLE_NSEC;
+	pr_debug("%s: throttle threshold = %u [ns]\n",
+		 __func__, gd->throttle_nsec);
+
+	/* init per-policy kthread */
+	gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
+	if (IS_ERR_OR_NULL(gd->task))
+		pr_err("%s: failed to create kgov_cfs_task thread\n", __func__);
+
+	init_irq_work(&gd->irq_work, gov_cfs_irq_work);
+
+	policy->governor_data = gd;
+	gd->policy = policy;
+}
+static void gov_cfs_stop(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd;
+
+	gd = policy->governor_data;
+	kthread_stop(gd->task);
+
+	policy->governor_data = NULL;
+
+	/* FIXME replace with devm counterparts? */
+	kfree(gd);
+}
+static int gov_cfs_setup(struct cpufreq_policy *policy, unsigned int event)
+{
+	switch (event) {
+	case CPUFREQ_GOV_START:
+		/* Start managing the frequency */
+		gov_cfs_start(policy);
+		return 0;
+
+	case CPUFREQ_GOV_STOP:
+		gov_cfs_stop(policy);
+		return 0;
+
+	case CPUFREQ_GOV_LIMITS:	/* unused */
+	case CPUFREQ_GOV_POLICY_INIT:	/* unused */
+	case CPUFREQ_GOV_POLICY_EXIT:	/* unused */
+		break;
+	}
+	return 0;
+}
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_CFS
+static
+#endif
+struct cpufreq_governor cpufreq_gov_cfs = {
+	.name		= "gov_cfs",
+	.governor	= gov_cfs_setup,
+	.owner		= THIS_MODULE,
+};
+static int __init gov_cfs_init(void)
+{
+	return cpufreq_register_governor(&cpufreq_gov_cfs);
+}
+
+static void __exit gov_cfs_exit(void)
+{
+	cpufreq_unregister_governor(&cpufreq_gov_cfs);
+}
+
+/* Try to make this the default governor */
+fs_initcall(gov_cfs_init);
+MODULE_LICENSE("GPL"); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 393fc36..a7b97f9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4257,6 +4257,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_rq_runnable_avg(rq, rq->nr_running); add_nr_running(rq, 1); }
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
hrtick_update(rq);
}
@@ -4318,6 +4322,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) sub_nr_running(rq, 1); update_rq_runnable_avg(rq, 1); }
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
hrtick_update(rq);
}
@@ -7821,6 +7829,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr);
update_rq_runnable_avg(rq, 1);
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
}
 /*

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 63a8be9..ec23523 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1399,6 +1399,12 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 int get_cpu_usage(int cpu);
 unsigned long capacity_orig_of(int cpu);

+#ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS
+void gov_cfs_update_cpu(int cpu);
+#else
+static inline void gov_cfs_update_cpu(int cpu) {}
+#endif
+
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
 	rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
--
1.9.1
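For reference, the governor can be selected at run time like any other cpufreq governor. A sketch, assuming the standard cpufreq sysfs layout, and noting that the governor registers under the .name "gov_cfs" rather than the "sched_cfs" used in the Kconfig text:

	# cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_governors
	# echo gov_cfs > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor

This run-time switching is what enables the comparative testing against ondemand mentioned in the commit message.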
On 04/27/2015 09:46 AM, Michael Turquette wrote:
> Scheduler-driven cpu frequency selection is desirable as part of the
> on-going effort to make the scheduler better aware of energy
> consumption. No piece of the Linux kernel has a better view of the
> factors that affect a cpu frequency selection policy than the
> scheduler[0], and this patch is an attempt to get that discussion going
> again.
>
> This patch implements a cpufreq governor, sched_cfs, that directly
> accesses scheduler statistics, in particular the pelt data from cfs via
> the get_cpu_usage() function.
>
> Put plainly, sched_cfs selects the lowest cpu frequency that will
> prevent a runqueue from being over-utilized (until we hit the highest
> frequency of course). This is done by requesting a frequency which is
> equivalent to the current capacity utilization, plus a margin.
>
> Unlike the previous posting from 2014[1] this governor implements a
> "follow the usage" method, where usage is defined as the cpu
> frequency-invariant product of utilization_load_avg and
> cpu_capacity_orig.
>
> This governor is event-driven. There is no polling loop to check cpu
> idle time, or any other method which is unsynchronized with the
> scheduler. The entry points for this policy are in fair.c:
> enqueue_task_fair, dequeue_task_fair and task_tick_fair.
>
> This policy is implemented using the cpufreq governor interface for two
> main reasons:
>
> - re-using the cpufreq machine drivers without using the governor
>   interface is hard.
>
> - using the cpufreq interface allows us to switch between the
>   scheduler-driven policy and legacy cpufreq governors such as ondemand
>   at run-time. This is very useful for comparative testing and tuning.
>
> Finally, it is worth mentioning that this approach neglects all
> scheduling classes except for cfs. It is possible to add support for
> deadline and other classes here, but I also wonder if a multi-governor
> approach would be a more maintainable solution, where the cpufreq core
> aggregates the constraints set by multiple governors. Supporting such an
> approach in the cpufreq core would also allow for peripheral devices to
> place constraints on cpu frequency without having to hack such behavior
> in at the governor level.
>
> Thanks to Juri Lelli <juri.lelli@arm.com> for doing a good bit of
> testing, bug fixing and contributing towards the design.
>
> [0] http://article.gmane.org/gmane.linux.kernel/1499836
> [1] https://lkml.org/lkml/2014/10/22/22
>
> Signed-off-by: Michael Turquette <mturquette@linaro.org>
>
> changes since internal v1:
> * renamed everything
> * fixed possible deadlock between gov_cfs_thread and gov_cfs_stop
> * replaced direct usage-to-frequency mapping with
>   usage+margin-to-frequency mapping. This functions like an up_threshold
>   and allows us to easily work with non-discretized frequency ranges
> * usage-to-frequency calculation now uses capacity_orig instead of
>   SCHED_LOAD_SCALE to handle SMT and asymmetric cpu use cases
> * dropped workqueue method due to instability
> * kthread is woken up by irq_work handler. This removes the need for
>   cap_gov_kick_thread() from v1
[snip]
> +static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy)
> +{
> +	int cpu = 0;
> +	struct gov_data *gd;
> +	unsigned long freq = 0, max_usage = 0, usage = 0;
> +
> +	if (!policy->governor_data)
> +		goto out;
s/goto out/return 0/
> +	gd = policy->governor_data;
> +
> +	/*
> +	 * get_cpu_usage is called without locking the runqueues. This is the
> +	 * same behavior used by find_busiest_cpu in load_balance. We are
> +	 * willing to accept occasionally stale data here in exchange for
> +	 * lockless behavior.
> +	 */
> +	for_each_cpu(cpu, policy->cpus) {
> +		usage = get_cpu_usage(cpu);
> +		if (usage > max_usage)
> +			max_usage = usage;
> +	}
> +
> +	/* add margin to max_usage based on imbalance_pct */
> +	max_usage = max_usage * MARGIN_PCT / 100;
> +
> +	cpu = cpumask_first(policy->cpus);
> +
> +	/* freq is current utilization + 25% */
> +	freq = max_usage * policy->max / capacity_orig_of(cpu);
Couldn't this be slightly simplified by using directly cpu_rq(cpu)->cfs.utilization_load_avg instead of calling get_cpu_usage ?
> +out:
> +	return freq;
> +}

[snip]

> +		freq = gov_cfs_select_freq(policy);
> +
> +		ret = __cpufreq_driver_target(policy, freq,
> +					      CPUFREQ_RELATION_H);
Shouldn't the relation be H or L depending on whether we are increasing or decreasing the freq ?
[snip]

> +	/* init per-policy kthread */
> +	gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
> +	if (IS_ERR_OR_NULL(gd->task))
> +		pr_err("%s: failed to create kgov_cfs_task thread\n", __func__);
> +
> +	init_irq_work(&gd->irq_work, gov_cfs_irq_work);
It does not make sense to have a workqueue and a kthread; this is duplicating what the workqueue already does.

I also saw the irq_work + kthread mail you sent and I believe it is the way to go. Did you think about creating a workqueue per clock line instead of using the irq_work ?
Quoting Daniel Lezcano (2015-04-30 08:05:52)
> On 04/27/2015 09:46 AM, Michael Turquette wrote:
[snip]
> > +	if (!policy->governor_data)
> > +		goto out;
>
> s/goto out/return 0/
OK.
[snip]

> > +	/* freq is current utilization + 25% */
> > +	freq = max_usage * policy->max / capacity_orig_of(cpu);
>
> Couldn't this be slightly simplified by using directly
> cpu_rq(cpu)->cfs.utilization_load_avg instead of calling get_cpu_usage ?
The big.LITTLE case here is confusing. Is cfs.utilization_load_avg already normalized against cpu capacity differences? If so then you are right, I could use the value directly. But if not, then get_cpu_usage buys us that normalization by doing:

	cfs.utilization_load_avg * capacity_orig >> SCHED_LOAD_SHIFT;

where capacity_orig may be different across various CPUs.
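To make that normalization concrete, a small editorial sketch (the capacity values are hypothetical, e.g. a LITTLE and a big cluster, and it assumes the default SCHED_LOAD_SHIFT of 10):

	/* usage = utilization_load_avg * capacity_orig >> SCHED_LOAD_SHIFT */
	static unsigned long usage_of(unsigned long util, unsigned long cap_orig)
	{
		return util * cap_orig >> 10;	/* SCHED_LOAD_SHIFT == 10 */
	}

	/*
	 * With util = 512 on both cpus:
	 *   little cpu, cap_orig = 430:  usage = 215
	 *   big cpu,    cap_orig = 1024: usage = 512
	 * The same raw utilization represents less absolute compute capacity
	 * on the little cpu, which is what the scaling captures.
	 */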
[snip]

> > +		ret = __cpufreq_driver_target(policy, freq,
> > +					      CPUFREQ_RELATION_H);
>
> Shouldn't the relation be H or L depending on whether we are increasing
> or decreasing the freq ?
Yes, this was pointed out in another reply. I'll fix it up.
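For the archive, one way the fix could look in gov_cfs_thread's main loop (an editorial sketch of the idea discussed above, not the actual follow-up patch; it assumes policy->cur holds the currently programmed frequency):

	unsigned int relation;

	freq = gov_cfs_select_freq(policy);

	/* round up when raising the frequency, round down when lowering it */
	relation = (freq > policy->cur) ? CPUFREQ_RELATION_L :
					  CPUFREQ_RELATION_H;

	ret = __cpufreq_driver_target(policy, freq, relation);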
[snip]

> > +	init_irq_work(&gd->irq_work, gov_cfs_irq_work);
>
> It does not make sense to have a workqueue and a kthread; this is
> duplicating what the workqueue already does.
There is no traditional wq here. Just irq_work handler + kthread.
> I also saw the irq_work + kthread mail you sent and I believe it is the
> way to go. Did you think about creating a workqueue per clock line
> instead of using the irq_work ?
I am confused by how you phrased the above text. This patch uses irq_work + kthread, which you say is the way to go. But I don't use a traditional workqueue instead of irq_work.

I did implement a method that uses irq_work + wq, and it had some bugs, as well as the fact that it clearly increased cfs load tracking stats.

Thanks a lot for the review,
Mike
On 01/05/15 00:49, Michael Turquette wrote:
> Quoting Daniel Lezcano (2015-04-30 08:05:52)
> > On 04/27/2015 09:46 AM, Michael Turquette wrote:
[snip]
> > > +	/*
> > > +	 * get_cpu_usage is called without locking the runqueues. This is the
> > > +	 * same behavior used by find_busiest_cpu in load_balance. We are
> > > +	 * willing to accept occasionally stale data here in exchange for
> > > +	 * lockless behavior.
> > > +	 */
> > > +	for_each_cpu(cpu, policy->cpus) {
> > > +		usage = get_cpu_usage(cpu);
> > > +		if (usage > max_usage)
> > > +			max_usage = usage;
> > > +	}
> > > +
> > > +	/* add margin to max_usage based on imbalance_pct */
> > > +	max_usage = max_usage * MARGIN_PCT / 100;
> > > +
> > > +	cpu = cpumask_first(policy->cpus);
> > > +
> > > +	/* freq is current utilization + 25% */
> > > +	freq = max_usage * policy->max / capacity_orig_of(cpu);
> >
> > Couldn't this be slightly simplified by using directly
> > cpu_rq(cpu)->cfs.utilization_load_avg instead of calling get_cpu_usage ?
>
> The big.LITTLE case here is confusing. Is cfs.utilization_load_avg
> already normalized against cpu capacity differences? If so then you are
> right, I
Nope. In this patchset utilization_load_avg is only freq invariant.
Thanks,
- Juri
> could use the value directly. But if not, then get_cpu_usage buys us
> that normalization by doing:
>
> 	cfs.utilization_load_avg * capacity_orig >> SCHED_LOAD_SHIFT;
>
> where capacity_orig may be different across various CPUs.
On 05/01/2015 01:49 AM, Michael Turquette wrote:
[ ... ]
> > > +	/* init per-policy kthread */
> > > +	gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
> > > +	if (IS_ERR_OR_NULL(gd->task))
> > > +		pr_err("%s: failed to create kgov_cfs_task thread\n", __func__);
> > > +
> > > +	init_irq_work(&gd->irq_work, gov_cfs_irq_work);
> >
> > It does not make sense to have a workqueue and a kthread; this is
> > duplicating what the workqueue already does.
>
> There is no traditional wq here. Just irq_work handler + kthread.
>
> > I also saw the irq_work + kthread mail you sent and I believe it is
> > the way to go. Did you think about creating a workqueue per clock line
> > instead of using the irq_work ?
>
> I am confused by how you phrased the above text. This patch uses
> irq_work + kthread, which you say is the way to go. But I don't use a
> traditional workqueue instead of irq_work.
Sorry for the confusion. I was referring to the email:

"[Eas-dev] [PATCH] cap_gov: irq_work + workqueue".
> I did implement a method that uses irq_work + wq, and it had some bugs,

Why are you using the irq_work ?

> as well as the fact that it clearly increased cfs load tracking stats.

Ah, interesting. Could you elaborate ?
Thanks
-- Daniel
Quoting Daniel Lezcano (2015-05-04 01:05:04)
> On 05/01/2015 01:49 AM, Michael Turquette wrote:
>
> [ ... ]
>
> > There is no traditional wq here. Just irq_work handler + kthread.
>
> Sorry for the confusion. I was referring to the email:
>
> "[Eas-dev] [PATCH] cap_gov: irq_work + workqueue".
>
> > I did implement a method that uses irq_work + wq, and it had some bugs,
>
> Why are you using the irq_work ?
Good question. The bulk of the Real Work is done in the kthread. We need to wake up the kthread somehow from inside enqueue_task_fair, dequeue_task_fair and task_tick_fair. These functions hold runqueue locks and disable interrupts. We cannot call any function that might sleep or call schedule().
In order to wake the kthread we use wake_up_process(). The good news is that this function does not sleep. The bad news is that calling it will re-enter the scheduler, which is fatal.
Thus we need a way to call wake_up_process AFTER we exit the critical section in the scheduler where irqs are disabled. One of the ways we handled this in previous patch sets was to hack in a callback in run_rebalance_domains, but this has two problems:
1) it is an ugly hack
2) waking up the kthread there causes undesirable periodic behavior
Juri proposed a solution to register an irq_work callback that simply calls wake_up_process (which is safe since wake_up_process will not sleep). From within the scheduler we raise an IPI. After we exit the critical section and re-enable interrupts then we handle the IPI which wakes up the kthread.
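As a rough sketch of that flow (gov_cfs_irq_work, gd->task and gd->irq_work appear in the quoted code above; the gov_cfs_kick helper and the exact struct layout are illustrative assumptions, not the patch's actual code):

#include <linux/irq_work.h>
#include <linux/kthread.h>
#include <linux/sched.h>

struct gov_data {
	struct irq_work irq_work;
	struct task_struct *task;
	/* ... other per-policy state elided ... */
};

/* irq_work handler: runs once irqs are re-enabled, outside the rq lock,
 * so waking the kthread here cannot re-enter the scheduler fatally. */
static void gov_cfs_irq_work(struct irq_work *irq_work)
{
	struct gov_data *gd = container_of(irq_work, struct gov_data, irq_work);

	wake_up_process(gd->task);
}

/* Called from the enqueue/dequeue/tick paths with the rq lock held and
 * irqs disabled: only raise the self-IPI here, never wake_up_process(). */
static inline void gov_cfs_kick(struct gov_data *gd)
{
	irq_work_queue(&gd->irq_work);
}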
The ideal solution would be to wake up the kthread from within the scheduler's critical section via some special case which does not cause reentry. This can be done but it is a bit over my head and might not be accepted upstream.
Regards, Mike
as well as the fact that it clearly increased cfs load tracking stats.
Ah, interesting. Could you elaborate?
Thanks
-- Daniel
Quoting Daniel Lezcano (2015-05-04 01:05:04)
On 05/01/2015 01:49 AM, Michael Turquette wrote:
[ ... ]
I did implement a method that uses irqwork + wq, and it had some bugs,
as well as the fact that it clearly increased cfs load tracking stats.
Ah, interesting. Could you elaborate?
Oops. Forgot to answer this in my previous mail.
The kthread is currently set to use SCHED_FIFO, making it an RT task. The main reason to do this is that it receives higher priority in the runqueue and will run BEFORE the cfs tasks.
There is also the nice side effect that the cfs governor only looks at cfs load right now. Thus the added load of doing a DVFS transition as an rt task doesn't affect the cfs load statistics, and we kind of get this behavior "for free". In other words we get to avoid the observer effect ;-)
Of course some day if we want to start basing a dvfs decision on rt tasks stats then we will lose this behavior.
To answer your question, using a workqueue puts SCHED_OTHER tasks onto the cfs runqueues. Thus we DO see an impact on cfs load stats by doing a dvfs transition in this way.
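For illustration, a minimal sketch of how the kthread could be given RT priority at init time, slotted into gov_cfs_start next to the kthread_run call quoted above; the priority value (50) is an assumption, not taken from the patch:

#include <linux/kthread.h>
#include <linux/sched.h>

	/* Sketch: run the per-policy kthread as SCHED_FIFO so the DVFS
	 * work it performs is charged to the rt class, not to the cfs
	 * load statistics the governor samples. Priority 50 is an
	 * illustrative value only. */
	struct sched_param param = { .sched_priority = 50 };

	gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
	if (!IS_ERR_OR_NULL(gd->task))
		sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);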
Regards, Mike